From bde000ae459f2829ed88e967f7fa7665b4e3afaf Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:09 +0200 Subject: net: mac802154: Follow the count of ongoing transmissions In order to create a synchronous API for MLME command purposes, we need to be able to track the end of the ongoing transmissions. Let's introduce an atomic variable which is incremented when a transmission starts and decremented when relevant so that we know at any moment whether there is an ongoing transmission. The counter gets decremented in the following situations: - The operation is asynchronous and there was a failure during the offloading process. - The operation is synchronous and the synchronous operation failed. - The operation finished, either successfully or not. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-5-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index d8d8719315fd..678ff00c7d70 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -214,6 +214,9 @@ struct wpan_phy { /* the network namespace this phy lives in currently */ possible_net_t _net; + /* Transmission monitoring */ + atomic_t ongoing_txs; + char priv[] __aligned(NETDEV_ALIGN); }; -- cgit v1.2.3 From 20a19d1df3e4079cbaa045ec89bbefb831d4705d Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:10 +0200 Subject: net: mac802154: Bring the ability to hold the transmit queue Create a hold_txs atomic variable and increment/decrement it when relevant, ie. when we want to hold the queue or release it: currently all the "stopped" situations are suitable, but very soon we will more extensively use this feature for MLME purposes. Upon release, the atomic counter is decremented and checked. If it is back to 0, then the netif queue gets woken up. This makes the whole process fully transparent, provided that all the users of ieee802154_wake/stop_queue() now call ieee802154_hold/release_queue() instead. In no situation individual drivers should call any of these helpers manually in order to avoid messing with the counters. There are other functions more suited for this purpose which have been introduced, such as the _xmit_complete() and _xmit_error() helpers which will handle all that for them. One advantage is that, as no more drivers call the stop/wake helpers directly, we can safely stop exporting them and only declare the hold/release ones in a header only accessible to the core. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-6-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 6 ++++-- include/net/mac802154.h | 27 --------------------------- 2 files changed, 4 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 678ff00c7d70..e87f1b07f20f 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include @@ -214,8 +214,10 @@ struct wpan_phy { /* the network namespace this phy lives in currently */ possible_net_t _net; - /* Transmission monitoring */ + /* Transmission monitoring and control */ + spinlock_t queue_lock; atomic_t ongoing_txs; + atomic_t hold_txs; char priv[] __aligned(NETDEV_ALIGN); }; diff --git a/include/net/mac802154.h b/include/net/mac802154.h index bdac0ddbdcdb..357d25ef627a 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -460,33 +460,6 @@ void ieee802154_unregister_hw(struct ieee802154_hw *hw); */ void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi); -/** - * ieee802154_wake_queue - wake ieee802154 queue - * @hw: pointer as obtained from ieee802154_alloc_hw(). - * - * Tranceivers usually have either one transmit framebuffer or one framebuffer - * for both transmitting and receiving. Hence, the core currently only handles - * one frame at a time for each phy, which means we had to stop the queue to - * avoid new skb to come during the transmission. The queue then needs to be - * woken up after the operation. - * - * Drivers should use this function instead of netif_wake_queue. - */ -void ieee802154_wake_queue(struct ieee802154_hw *hw); - -/** - * ieee802154_stop_queue - stop ieee802154 queue - * @hw: pointer as obtained from ieee802154_alloc_hw(). - * - * Tranceivers usually have either one transmit framebuffer or one framebuffer - * for both transmitting and receiving. Hence, the core currently only handles - * one frame at a time for each phy, which means we need to tell upper layers to - * stop giving us new skbs while we are busy with the transmitted one. The queue - * must then be stopped before transmitting. - * - * Drivers should use this function instead of netif_stop_queue. - */ -void ieee802154_stop_queue(struct ieee802154_hw *hw); /** * ieee802154_xmit_complete - frame transmission complete -- cgit v1.2.3 From f0feb34904735ffa21fe7b0c50f9f9527ec74b7a Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:13 +0200 Subject: net: mac802154: Introduce a tx queue flushing mechanism Right now we are able to stop a queue but we have no indication if a transmission is ongoing or not. Thanks to recent additions, we can track the number of ongoing transmissions so we know if the last transmission is over. Adding on top of it an internal wait queue also allows to be woken up asynchronously when this happens. If, beforehands, we marked the queue to be held and stopped it, we end up flushing and stopping the tx queue. Thanks to this feature, we will soon be able to introduce a synchronous transmit API. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-9-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index e87f1b07f20f..0804d79669a4 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -218,6 +218,7 @@ struct wpan_phy { spinlock_t queue_lock; atomic_t ongoing_txs; atomic_t hold_txs; + wait_queue_head_t sync_txq; char priv[] __aligned(NETDEV_ALIGN); }; -- cgit v1.2.3 From 2b13db13af50a5dcdb944723c828915a50f0c3b2 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:15 +0200 Subject: net: mac802154: Add a warning in the hot path We should never start a transmission after the queue has been stopped. But because it might work we don't kill the function here but rather warn loudly the user that something is wrong. Set a flag when the queue should remain stopped. Reset this flag when the queue actually gets restarded. Just check this value to know if a transmission is legitimate, warn if it is not. Turn the flags variable into an unsigned long to allow the use of atomic helpers on it. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-11-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 0804d79669a4..428cece22205 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -166,11 +166,14 @@ wpan_phy_cca_cmp(const struct wpan_phy_cca *a, const struct wpan_phy_cca *b) * level setting. * @WPAN_PHY_FLAG_CCA_MODE: Indicates that transceiver will support cca mode * setting. + * @WPAN_PHY_FLAG_STATE_QUEUE_STOPPED: Indicates that the transmit queue was + * temporarily stopped. */ enum wpan_phy_flags { WPAN_PHY_FLAG_TXPOWER = BIT(1), WPAN_PHY_FLAG_CCA_ED_LEVEL = BIT(2), WPAN_PHY_FLAG_CCA_MODE = BIT(3), + WPAN_PHY_FLAG_STATE_QUEUE_STOPPED = BIT(4), }; struct wpan_phy { @@ -182,7 +185,7 @@ struct wpan_phy { */ const void *privid; - u32 flags; + unsigned long flags; /* * This is a PIB according to 802.15.4-2011. -- cgit v1.2.3 From 8a76145a2ec2a81dfe34d7ac42e8c242f095e8c8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 5 Oct 2022 21:24:51 -0700 Subject: bpf: explicitly define BPF_FUNC_xxx integer values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Historically enum bpf_func_id's BPF_FUNC_xxx enumerators relied on implicit sequential values being assigned by compiler. This is convenient, as new BPF helpers are always added at the very end, but it also has its downsides, some of them being: - with over 200 helpers now it's very hard to know what's each helper's ID, which is often important to know when working with BPF assembly (e.g., by dumping raw bpf assembly instructions with llvm-objdump -d command). it's possible to work around this by looking into vmlinux.h, dumping /sys/btf/kernel/vmlinux, looking at libbpf-provided bpf_helper_defs.h, etc. But it always feels like an unnecessary step and one should be able to quickly figure this out from UAPI header. - when backporting and cherry-picking only some BPF helpers onto older kernels it's important to be able to skip some enum values for helpers that weren't backported, but preserve absolute integer IDs to keep BPF helper IDs stable so that BPF programs stay portable across upstream and backported kernels. While neither problem is insurmountable, they come up frequently enough and are annoying enough to warrant improving the situation. And for the backporting the problem can easily go unnoticed for a while, especially if backport is done with people not very familiar with BPF subsystem overall. Anyways, it's easy to fix this by making sure that __BPF_FUNC_MAPPER macro provides explicit helper IDs. Unfortunately that would potentially break existing users that use UAPI-exposed __BPF_FUNC_MAPPER and are expected to pass macro that accepts only symbolic helper identifier (e.g., map_lookup_elem for bpf_map_lookup_elem() helper). As such, we need to introduce a new macro (___BPF_FUNC_MAPPER) which would specify both identifier and integer ID, but in such a way as to allow existing __BPF_FUNC_MAPPER be expressed in terms of new ___BPF_FUNC_MAPPER macro. And that's what this patch is doing. To avoid duplication and allow __BPF_FUNC_MAPPER stay *exactly* the same, ___BPF_FUNC_MAPPER accepts arbitrary "context" arguments, which can be used to pass any extra macros, arguments, and whatnot. In our case we use this to pass original user-provided macro that expects single argument and __BPF_FUNC_MAPPER is using it's own three-argument __BPF_FUNC_MAPPER_APPLY intermediate macro to impedance-match new and old "callback" macros. Once we resolve this, we use new ___BPF_FUNC_MAPPER to define enum bpf_func_id with explicit values. The other users of __BPF_FUNC_MAPPER in kernel (namely in kernel/bpf/disasm.c) are kept exactly the same both as demonstration that backwards compat works, but also to avoid unnecessary code churn. Note that new ___BPF_FUNC_MAPPER() doesn't forcefully insert comma between values, as that might not be appropriate in all possible cases where ___BPF_FUNC_MAPPER might be used by users. This doesn't reduce usability, as it's trivial to insert that comma inside "callback" macro. To validate all the manually specified IDs are exactly right, we used BTF to compare before and after values: $ bpftool btf dump file ~/linux-build/default/vmlinux | rg bpf_func_id -A 211 > after.txt $ git stash # stach UAPI changes $ make -j90 ... re-building kernel without UAPI changes ... $ bpftool btf dump file ~/linux-build/default/vmlinux | rg bpf_func_id -A 211 > before.txt $ diff -u before.txt after.txt --- before.txt 2022-10-05 10:48:18.119195916 -0700 +++ after.txt 2022-10-05 10:46:49.446615025 -0700 @@ -1,4 +1,4 @@ -[14576] ENUM 'bpf_func_id' encoding=UNSIGNED size=4 vlen=211 +[9560] ENUM 'bpf_func_id' encoding=UNSIGNED size=4 vlen=211 'BPF_FUNC_unspec' val=0 'BPF_FUNC_map_lookup_elem' val=1 'BPF_FUNC_map_update_elem' val=2 As can be seen from diff above, the only thing that changed was resulting BTF type ID of ENUM bpf_func_id, not any of the enumerators, their names or integer values. The only other place that needed fixing was scripts/bpf_doc.py used to generate man pages and bpf_helper_defs.h header for libbpf and selftests. That script is tightly-coupled to exact shape of ___BPF_FUNC_MAPPER macro definition, so had to be trivially adapted. Cc: Quentin Monnet Reported-by: Andrea Terzolo Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Acked-by: Jiri Olsa Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221006042452.2089843-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 432 ++++++++++++++++++++++++----------------------- 1 file changed, 219 insertions(+), 213 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 51b9aa640ad2..17f61338f8f8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5436,225 +5436,231 @@ union bpf_attr { * larger than the size of the ring buffer, or which cannot fit * within a struct bpf_dynptr. */ -#define __BPF_FUNC_MAPPER(FN) \ - FN(unspec), \ - FN(map_lookup_elem), \ - FN(map_update_elem), \ - FN(map_delete_elem), \ - FN(probe_read), \ - FN(ktime_get_ns), \ - FN(trace_printk), \ - FN(get_prandom_u32), \ - FN(get_smp_processor_id), \ - FN(skb_store_bytes), \ - FN(l3_csum_replace), \ - FN(l4_csum_replace), \ - FN(tail_call), \ - FN(clone_redirect), \ - FN(get_current_pid_tgid), \ - FN(get_current_uid_gid), \ - FN(get_current_comm), \ - FN(get_cgroup_classid), \ - FN(skb_vlan_push), \ - FN(skb_vlan_pop), \ - FN(skb_get_tunnel_key), \ - FN(skb_set_tunnel_key), \ - FN(perf_event_read), \ - FN(redirect), \ - FN(get_route_realm), \ - FN(perf_event_output), \ - FN(skb_load_bytes), \ - FN(get_stackid), \ - FN(csum_diff), \ - FN(skb_get_tunnel_opt), \ - FN(skb_set_tunnel_opt), \ - FN(skb_change_proto), \ - FN(skb_change_type), \ - FN(skb_under_cgroup), \ - FN(get_hash_recalc), \ - FN(get_current_task), \ - FN(probe_write_user), \ - FN(current_task_under_cgroup), \ - FN(skb_change_tail), \ - FN(skb_pull_data), \ - FN(csum_update), \ - FN(set_hash_invalid), \ - FN(get_numa_node_id), \ - FN(skb_change_head), \ - FN(xdp_adjust_head), \ - FN(probe_read_str), \ - FN(get_socket_cookie), \ - FN(get_socket_uid), \ - FN(set_hash), \ - FN(setsockopt), \ - FN(skb_adjust_room), \ - FN(redirect_map), \ - FN(sk_redirect_map), \ - FN(sock_map_update), \ - FN(xdp_adjust_meta), \ - FN(perf_event_read_value), \ - FN(perf_prog_read_value), \ - FN(getsockopt), \ - FN(override_return), \ - FN(sock_ops_cb_flags_set), \ - FN(msg_redirect_map), \ - FN(msg_apply_bytes), \ - FN(msg_cork_bytes), \ - FN(msg_pull_data), \ - FN(bind), \ - FN(xdp_adjust_tail), \ - FN(skb_get_xfrm_state), \ - FN(get_stack), \ - FN(skb_load_bytes_relative), \ - FN(fib_lookup), \ - FN(sock_hash_update), \ - FN(msg_redirect_hash), \ - FN(sk_redirect_hash), \ - FN(lwt_push_encap), \ - FN(lwt_seg6_store_bytes), \ - FN(lwt_seg6_adjust_srh), \ - FN(lwt_seg6_action), \ - FN(rc_repeat), \ - FN(rc_keydown), \ - FN(skb_cgroup_id), \ - FN(get_current_cgroup_id), \ - FN(get_local_storage), \ - FN(sk_select_reuseport), \ - FN(skb_ancestor_cgroup_id), \ - FN(sk_lookup_tcp), \ - FN(sk_lookup_udp), \ - FN(sk_release), \ - FN(map_push_elem), \ - FN(map_pop_elem), \ - FN(map_peek_elem), \ - FN(msg_push_data), \ - FN(msg_pop_data), \ - FN(rc_pointer_rel), \ - FN(spin_lock), \ - FN(spin_unlock), \ - FN(sk_fullsock), \ - FN(tcp_sock), \ - FN(skb_ecn_set_ce), \ - FN(get_listener_sock), \ - FN(skc_lookup_tcp), \ - FN(tcp_check_syncookie), \ - FN(sysctl_get_name), \ - FN(sysctl_get_current_value), \ - FN(sysctl_get_new_value), \ - FN(sysctl_set_new_value), \ - FN(strtol), \ - FN(strtoul), \ - FN(sk_storage_get), \ - FN(sk_storage_delete), \ - FN(send_signal), \ - FN(tcp_gen_syncookie), \ - FN(skb_output), \ - FN(probe_read_user), \ - FN(probe_read_kernel), \ - FN(probe_read_user_str), \ - FN(probe_read_kernel_str), \ - FN(tcp_send_ack), \ - FN(send_signal_thread), \ - FN(jiffies64), \ - FN(read_branch_records), \ - FN(get_ns_current_pid_tgid), \ - FN(xdp_output), \ - FN(get_netns_cookie), \ - FN(get_current_ancestor_cgroup_id), \ - FN(sk_assign), \ - FN(ktime_get_boot_ns), \ - FN(seq_printf), \ - FN(seq_write), \ - FN(sk_cgroup_id), \ - FN(sk_ancestor_cgroup_id), \ - FN(ringbuf_output), \ - FN(ringbuf_reserve), \ - FN(ringbuf_submit), \ - FN(ringbuf_discard), \ - FN(ringbuf_query), \ - FN(csum_level), \ - FN(skc_to_tcp6_sock), \ - FN(skc_to_tcp_sock), \ - FN(skc_to_tcp_timewait_sock), \ - FN(skc_to_tcp_request_sock), \ - FN(skc_to_udp6_sock), \ - FN(get_task_stack), \ - FN(load_hdr_opt), \ - FN(store_hdr_opt), \ - FN(reserve_hdr_opt), \ - FN(inode_storage_get), \ - FN(inode_storage_delete), \ - FN(d_path), \ - FN(copy_from_user), \ - FN(snprintf_btf), \ - FN(seq_printf_btf), \ - FN(skb_cgroup_classid), \ - FN(redirect_neigh), \ - FN(per_cpu_ptr), \ - FN(this_cpu_ptr), \ - FN(redirect_peer), \ - FN(task_storage_get), \ - FN(task_storage_delete), \ - FN(get_current_task_btf), \ - FN(bprm_opts_set), \ - FN(ktime_get_coarse_ns), \ - FN(ima_inode_hash), \ - FN(sock_from_file), \ - FN(check_mtu), \ - FN(for_each_map_elem), \ - FN(snprintf), \ - FN(sys_bpf), \ - FN(btf_find_by_name_kind), \ - FN(sys_close), \ - FN(timer_init), \ - FN(timer_set_callback), \ - FN(timer_start), \ - FN(timer_cancel), \ - FN(get_func_ip), \ - FN(get_attach_cookie), \ - FN(task_pt_regs), \ - FN(get_branch_snapshot), \ - FN(trace_vprintk), \ - FN(skc_to_unix_sock), \ - FN(kallsyms_lookup_name), \ - FN(find_vma), \ - FN(loop), \ - FN(strncmp), \ - FN(get_func_arg), \ - FN(get_func_ret), \ - FN(get_func_arg_cnt), \ - FN(get_retval), \ - FN(set_retval), \ - FN(xdp_get_buff_len), \ - FN(xdp_load_bytes), \ - FN(xdp_store_bytes), \ - FN(copy_from_user_task), \ - FN(skb_set_tstamp), \ - FN(ima_file_hash), \ - FN(kptr_xchg), \ - FN(map_lookup_percpu_elem), \ - FN(skc_to_mptcp_sock), \ - FN(dynptr_from_mem), \ - FN(ringbuf_reserve_dynptr), \ - FN(ringbuf_submit_dynptr), \ - FN(ringbuf_discard_dynptr), \ - FN(dynptr_read), \ - FN(dynptr_write), \ - FN(dynptr_data), \ - FN(tcp_raw_gen_syncookie_ipv4), \ - FN(tcp_raw_gen_syncookie_ipv6), \ - FN(tcp_raw_check_syncookie_ipv4), \ - FN(tcp_raw_check_syncookie_ipv6), \ - FN(ktime_get_tai_ns), \ - FN(user_ringbuf_drain), \ +#define ___BPF_FUNC_MAPPER(FN, ctx...) \ + FN(unspec, 0, ##ctx) \ + FN(map_lookup_elem, 1, ##ctx) \ + FN(map_update_elem, 2, ##ctx) \ + FN(map_delete_elem, 3, ##ctx) \ + FN(probe_read, 4, ##ctx) \ + FN(ktime_get_ns, 5, ##ctx) \ + FN(trace_printk, 6, ##ctx) \ + FN(get_prandom_u32, 7, ##ctx) \ + FN(get_smp_processor_id, 8, ##ctx) \ + FN(skb_store_bytes, 9, ##ctx) \ + FN(l3_csum_replace, 10, ##ctx) \ + FN(l4_csum_replace, 11, ##ctx) \ + FN(tail_call, 12, ##ctx) \ + FN(clone_redirect, 13, ##ctx) \ + FN(get_current_pid_tgid, 14, ##ctx) \ + FN(get_current_uid_gid, 15, ##ctx) \ + FN(get_current_comm, 16, ##ctx) \ + FN(get_cgroup_classid, 17, ##ctx) \ + FN(skb_vlan_push, 18, ##ctx) \ + FN(skb_vlan_pop, 19, ##ctx) \ + FN(skb_get_tunnel_key, 20, ##ctx) \ + FN(skb_set_tunnel_key, 21, ##ctx) \ + FN(perf_event_read, 22, ##ctx) \ + FN(redirect, 23, ##ctx) \ + FN(get_route_realm, 24, ##ctx) \ + FN(perf_event_output, 25, ##ctx) \ + FN(skb_load_bytes, 26, ##ctx) \ + FN(get_stackid, 27, ##ctx) \ + FN(csum_diff, 28, ##ctx) \ + FN(skb_get_tunnel_opt, 29, ##ctx) \ + FN(skb_set_tunnel_opt, 30, ##ctx) \ + FN(skb_change_proto, 31, ##ctx) \ + FN(skb_change_type, 32, ##ctx) \ + FN(skb_under_cgroup, 33, ##ctx) \ + FN(get_hash_recalc, 34, ##ctx) \ + FN(get_current_task, 35, ##ctx) \ + FN(probe_write_user, 36, ##ctx) \ + FN(current_task_under_cgroup, 37, ##ctx) \ + FN(skb_change_tail, 38, ##ctx) \ + FN(skb_pull_data, 39, ##ctx) \ + FN(csum_update, 40, ##ctx) \ + FN(set_hash_invalid, 41, ##ctx) \ + FN(get_numa_node_id, 42, ##ctx) \ + FN(skb_change_head, 43, ##ctx) \ + FN(xdp_adjust_head, 44, ##ctx) \ + FN(probe_read_str, 45, ##ctx) \ + FN(get_socket_cookie, 46, ##ctx) \ + FN(get_socket_uid, 47, ##ctx) \ + FN(set_hash, 48, ##ctx) \ + FN(setsockopt, 49, ##ctx) \ + FN(skb_adjust_room, 50, ##ctx) \ + FN(redirect_map, 51, ##ctx) \ + FN(sk_redirect_map, 52, ##ctx) \ + FN(sock_map_update, 53, ##ctx) \ + FN(xdp_adjust_meta, 54, ##ctx) \ + FN(perf_event_read_value, 55, ##ctx) \ + FN(perf_prog_read_value, 56, ##ctx) \ + FN(getsockopt, 57, ##ctx) \ + FN(override_return, 58, ##ctx) \ + FN(sock_ops_cb_flags_set, 59, ##ctx) \ + FN(msg_redirect_map, 60, ##ctx) \ + FN(msg_apply_bytes, 61, ##ctx) \ + FN(msg_cork_bytes, 62, ##ctx) \ + FN(msg_pull_data, 63, ##ctx) \ + FN(bind, 64, ##ctx) \ + FN(xdp_adjust_tail, 65, ##ctx) \ + FN(skb_get_xfrm_state, 66, ##ctx) \ + FN(get_stack, 67, ##ctx) \ + FN(skb_load_bytes_relative, 68, ##ctx) \ + FN(fib_lookup, 69, ##ctx) \ + FN(sock_hash_update, 70, ##ctx) \ + FN(msg_redirect_hash, 71, ##ctx) \ + FN(sk_redirect_hash, 72, ##ctx) \ + FN(lwt_push_encap, 73, ##ctx) \ + FN(lwt_seg6_store_bytes, 74, ##ctx) \ + FN(lwt_seg6_adjust_srh, 75, ##ctx) \ + FN(lwt_seg6_action, 76, ##ctx) \ + FN(rc_repeat, 77, ##ctx) \ + FN(rc_keydown, 78, ##ctx) \ + FN(skb_cgroup_id, 79, ##ctx) \ + FN(get_current_cgroup_id, 80, ##ctx) \ + FN(get_local_storage, 81, ##ctx) \ + FN(sk_select_reuseport, 82, ##ctx) \ + FN(skb_ancestor_cgroup_id, 83, ##ctx) \ + FN(sk_lookup_tcp, 84, ##ctx) \ + FN(sk_lookup_udp, 85, ##ctx) \ + FN(sk_release, 86, ##ctx) \ + FN(map_push_elem, 87, ##ctx) \ + FN(map_pop_elem, 88, ##ctx) \ + FN(map_peek_elem, 89, ##ctx) \ + FN(msg_push_data, 90, ##ctx) \ + FN(msg_pop_data, 91, ##ctx) \ + FN(rc_pointer_rel, 92, ##ctx) \ + FN(spin_lock, 93, ##ctx) \ + FN(spin_unlock, 94, ##ctx) \ + FN(sk_fullsock, 95, ##ctx) \ + FN(tcp_sock, 96, ##ctx) \ + FN(skb_ecn_set_ce, 97, ##ctx) \ + FN(get_listener_sock, 98, ##ctx) \ + FN(skc_lookup_tcp, 99, ##ctx) \ + FN(tcp_check_syncookie, 100, ##ctx) \ + FN(sysctl_get_name, 101, ##ctx) \ + FN(sysctl_get_current_value, 102, ##ctx) \ + FN(sysctl_get_new_value, 103, ##ctx) \ + FN(sysctl_set_new_value, 104, ##ctx) \ + FN(strtol, 105, ##ctx) \ + FN(strtoul, 106, ##ctx) \ + FN(sk_storage_get, 107, ##ctx) \ + FN(sk_storage_delete, 108, ##ctx) \ + FN(send_signal, 109, ##ctx) \ + FN(tcp_gen_syncookie, 110, ##ctx) \ + FN(skb_output, 111, ##ctx) \ + FN(probe_read_user, 112, ##ctx) \ + FN(probe_read_kernel, 113, ##ctx) \ + FN(probe_read_user_str, 114, ##ctx) \ + FN(probe_read_kernel_str, 115, ##ctx) \ + FN(tcp_send_ack, 116, ##ctx) \ + FN(send_signal_thread, 117, ##ctx) \ + FN(jiffies64, 118, ##ctx) \ + FN(read_branch_records, 119, ##ctx) \ + FN(get_ns_current_pid_tgid, 120, ##ctx) \ + FN(xdp_output, 121, ##ctx) \ + FN(get_netns_cookie, 122, ##ctx) \ + FN(get_current_ancestor_cgroup_id, 123, ##ctx) \ + FN(sk_assign, 124, ##ctx) \ + FN(ktime_get_boot_ns, 125, ##ctx) \ + FN(seq_printf, 126, ##ctx) \ + FN(seq_write, 127, ##ctx) \ + FN(sk_cgroup_id, 128, ##ctx) \ + FN(sk_ancestor_cgroup_id, 129, ##ctx) \ + FN(ringbuf_output, 130, ##ctx) \ + FN(ringbuf_reserve, 131, ##ctx) \ + FN(ringbuf_submit, 132, ##ctx) \ + FN(ringbuf_discard, 133, ##ctx) \ + FN(ringbuf_query, 134, ##ctx) \ + FN(csum_level, 135, ##ctx) \ + FN(skc_to_tcp6_sock, 136, ##ctx) \ + FN(skc_to_tcp_sock, 137, ##ctx) \ + FN(skc_to_tcp_timewait_sock, 138, ##ctx) \ + FN(skc_to_tcp_request_sock, 139, ##ctx) \ + FN(skc_to_udp6_sock, 140, ##ctx) \ + FN(get_task_stack, 141, ##ctx) \ + FN(load_hdr_opt, 142, ##ctx) \ + FN(store_hdr_opt, 143, ##ctx) \ + FN(reserve_hdr_opt, 144, ##ctx) \ + FN(inode_storage_get, 145, ##ctx) \ + FN(inode_storage_delete, 146, ##ctx) \ + FN(d_path, 147, ##ctx) \ + FN(copy_from_user, 148, ##ctx) \ + FN(snprintf_btf, 149, ##ctx) \ + FN(seq_printf_btf, 150, ##ctx) \ + FN(skb_cgroup_classid, 151, ##ctx) \ + FN(redirect_neigh, 152, ##ctx) \ + FN(per_cpu_ptr, 153, ##ctx) \ + FN(this_cpu_ptr, 154, ##ctx) \ + FN(redirect_peer, 155, ##ctx) \ + FN(task_storage_get, 156, ##ctx) \ + FN(task_storage_delete, 157, ##ctx) \ + FN(get_current_task_btf, 158, ##ctx) \ + FN(bprm_opts_set, 159, ##ctx) \ + FN(ktime_get_coarse_ns, 160, ##ctx) \ + FN(ima_inode_hash, 161, ##ctx) \ + FN(sock_from_file, 162, ##ctx) \ + FN(check_mtu, 163, ##ctx) \ + FN(for_each_map_elem, 164, ##ctx) \ + FN(snprintf, 165, ##ctx) \ + FN(sys_bpf, 166, ##ctx) \ + FN(btf_find_by_name_kind, 167, ##ctx) \ + FN(sys_close, 168, ##ctx) \ + FN(timer_init, 169, ##ctx) \ + FN(timer_set_callback, 170, ##ctx) \ + FN(timer_start, 171, ##ctx) \ + FN(timer_cancel, 172, ##ctx) \ + FN(get_func_ip, 173, ##ctx) \ + FN(get_attach_cookie, 174, ##ctx) \ + FN(task_pt_regs, 175, ##ctx) \ + FN(get_branch_snapshot, 176, ##ctx) \ + FN(trace_vprintk, 177, ##ctx) \ + FN(skc_to_unix_sock, 178, ##ctx) \ + FN(kallsyms_lookup_name, 179, ##ctx) \ + FN(find_vma, 180, ##ctx) \ + FN(loop, 181, ##ctx) \ + FN(strncmp, 182, ##ctx) \ + FN(get_func_arg, 183, ##ctx) \ + FN(get_func_ret, 184, ##ctx) \ + FN(get_func_arg_cnt, 185, ##ctx) \ + FN(get_retval, 186, ##ctx) \ + FN(set_retval, 187, ##ctx) \ + FN(xdp_get_buff_len, 188, ##ctx) \ + FN(xdp_load_bytes, 189, ##ctx) \ + FN(xdp_store_bytes, 190, ##ctx) \ + FN(copy_from_user_task, 191, ##ctx) \ + FN(skb_set_tstamp, 192, ##ctx) \ + FN(ima_file_hash, 193, ##ctx) \ + FN(kptr_xchg, 194, ##ctx) \ + FN(map_lookup_percpu_elem, 195, ##ctx) \ + FN(skc_to_mptcp_sock, 196, ##ctx) \ + FN(dynptr_from_mem, 197, ##ctx) \ + FN(ringbuf_reserve_dynptr, 198, ##ctx) \ + FN(ringbuf_submit_dynptr, 199, ##ctx) \ + FN(ringbuf_discard_dynptr, 200, ##ctx) \ + FN(dynptr_read, 201, ##ctx) \ + FN(dynptr_write, 202, ##ctx) \ + FN(dynptr_data, 203, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv4, 204, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv6, 205, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv4, 206, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ + FN(ktime_get_tai_ns, 208, ##ctx) \ + FN(user_ringbuf_drain, 209, ##ctx) \ /* */ +/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't + * know or care about integer value that is now passed as second argument + */ +#define __BPF_FUNC_MAPPER_APPLY(name, value, FN) FN(name), +#define __BPF_FUNC_MAPPER(FN) ___BPF_FUNC_MAPPER(__BPF_FUNC_MAPPER_APPLY, FN) + /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call */ -#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x +#define __BPF_ENUM_FN(x, y) BPF_FUNC_ ## x = y, enum bpf_func_id { - __BPF_FUNC_MAPPER(__BPF_ENUM_FN) + ___BPF_FUNC_MAPPER(__BPF_ENUM_FN) __BPF_FUNC_MAX_ID, }; #undef __BPF_ENUM_FN -- cgit v1.2.3 From 1d9e4c91db17c9bf6f94ac234a4d4f2bffd52b97 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 6 Sep 2022 19:02:04 +0200 Subject: wifi: mac80211: add pointer from link STA to STA While often not needed, this considerably simplifies going from a link to the STA. This helps in cases such as debugfs where a single pointer should allow accessing a specific link and the STA. Signed-off-by: Benjamin Berg Signed-off-by: Johannes Berg --- include/net/mac80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ac2bad57933f..7778a92d9582 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2176,6 +2176,7 @@ struct ieee80211_sta_aggregates { * All link specific info for a STA link for a non MLD STA(single) * or a MLD STA(multiple entries) are stored here. * + * @sta: reference to owning STA * @addr: MAC address of the Link STA. For non-MLO STA this is same as the addr * in ieee80211_sta. For MLO Link STA this addr can be same or different * from addr in ieee80211_sta (representing MLD STA addr) @@ -2196,6 +2197,8 @@ struct ieee80211_sta_aggregates { * */ struct ieee80211_link_sta { + struct ieee80211_sta *sta; + u8 addr[ETH_ALEN]; u8 link_id; enum ieee80211_smps_mode smps_mode; -- cgit v1.2.3 From d2caad527c191563116809990081ab4fc0dafdb6 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 6 Sep 2022 14:26:52 +0200 Subject: wifi: mac80211: add API to show the link STAs in debugfs Create debugfs data per-link. For drivers, there is a new operation link_sta_add_debugfs which will always be called. For non-MLO, the station directory will be used directly rather than creating a corresponding subdirectory. As such, non-MLO drivers can simply continue to create the data from sta_debugfs_add. Signed-off-by: Benjamin Berg [add missing inlines if !CONFIG_MAC80211_DEBUGFS] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7778a92d9582..c413050ec8dd 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3790,6 +3790,13 @@ struct ieee80211_prep_tx_info { * should be within a CONFIG_MAC80211_DEBUGFS conditional. This * callback can sleep. * + * @link_sta_add_debugfs: Drivers can use this callback to add debugfs files + * when a link is added to a mac80211 station. This callback + * should be within a CPTCFG_MAC80211_DEBUGFS conditional. This + * callback can sleep. + * For non-MLO the callback will be called once for the deflink with the + * station's directory rather than a separate subdirectory. + * * @sta_notify: Notifies low level driver about power state transition of an * associated station, AP, IBSS/WDS/mesh peer etc. For a VIF operating * in AP mode, this callback will not be called when the flag @@ -4260,6 +4267,10 @@ struct ieee80211_ops { struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct dentry *dir); + void (*link_sta_add_debugfs)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + struct dentry *dir); #endif void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd, struct ieee80211_sta *sta); -- cgit v1.2.3 From 53ad07e9823bca10c26e71d662b58c3e80e8ff2a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 6 Sep 2022 11:27:57 +0200 Subject: wifi: cfg80211: support reporting failed links For assoc and connect result APIs, support reporting failed links; they should still come with the BSS pointer in the case of assoc, so they're released correctly. In the case of connect result, this is optional. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e09ff87146c1..4d35a4234417 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6933,6 +6933,8 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); * @ap_mld_addr: AP MLD address (in case of MLO) * @links: per-link information indexed by link ID, use links[0] for * non-MLO connections + * @links.status: Set this (along with a BSS pointer) for links that + * were rejected by the AP. */ struct cfg80211_rx_assoc_resp { const u8 *buf; @@ -6944,6 +6946,7 @@ struct cfg80211_rx_assoc_resp { struct { const u8 *addr; struct cfg80211_bss *bss; + u16 status; } links[IEEE80211_MLD_MAX_NUM_LINKS]; }; @@ -7454,6 +7457,9 @@ struct cfg80211_fils_resp_params { * if the bss is expired during the connection, esp. for those drivers * implementing connect op. Only one parameter among @bssid and @bss needs * to be specified. + * @links.status: per-link status code, to report a status code that's not + * %WLAN_STATUS_SUCCESS for a given link, it must also be in the + * @valid_links bitmap and may have a BSS pointer (which is then released) */ struct cfg80211_connect_resp_params { int status; @@ -7470,6 +7476,7 @@ struct cfg80211_connect_resp_params { const u8 *addr; const u8 *bssid; struct cfg80211_bss *bss; + u16 status; } links[IEEE80211_MLD_MAX_NUM_LINKS]; }; -- cgit v1.2.3 From 7b6f08771bf6045c32a2a1e18201c1aeeb78d72a Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 7 Sep 2022 10:34:30 +0300 Subject: wifi: ieee80211: Support validating ML station profile length Add a function to validate EHT Multi-Link per station profile length. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 79690938d9a2..bdf668f9dace 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4685,6 +4685,46 @@ struct ieee80211_mle_per_sta_profile { u8 variable[]; } __packed; +/** + * ieee80211_mle_sta_prof_size_ok - validate multi-link element sta profile size + * @data: pointer to the sub element data + * @len: length of the containing sub element + */ +static inline bool ieee80211_mle_sta_prof_size_ok(const u8 *data, size_t len) +{ + const struct ieee80211_mle_per_sta_profile *prof = (const void *)data; + u16 control; + u8 fixed = sizeof(*prof); + u8 info_len = 1; + + if (len < fixed) + return false; + + control = le16_to_cpu(prof->control); + + if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT) + info_len += 6; + if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) + info_len += 2; + if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) + info_len += 8; + if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT) + info_len += 2; + if (control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT) + info_len += 1; + + if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE && + control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) { + if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) + info_len += 2; + else + info_len += 1; + } + + return prof->sta_info_len >= info_len && + fixed + prof->sta_info_len <= len; +} + #define for_each_mle_subelement(_elem, _data, _len) \ if (ieee80211_mle_size_ok(_data, _len)) \ for_each_element(_elem, \ -- cgit v1.2.3 From 1403b109c9a5244dc6ab79154f04eecc209ef3d2 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 7 Sep 2022 17:23:09 +0300 Subject: wifi: cfg80211/mac80211: Fix ML element common size calculation The common size is part of the length in the data so don't add it again. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index bdf668f9dace..442b13333da8 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4573,18 +4573,17 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { case IEEE80211_ML_CONTROL_TYPE_BASIC: - common += sizeof(struct ieee80211_mle_basic_common_info); - break; case IEEE80211_ML_CONTROL_TYPE_PREQ: - common += sizeof(struct ieee80211_mle_preq_common_info); + case IEEE80211_ML_CONTROL_TYPE_TDLS: + /* + * The length is the first octet pointed by mle->variable so no + * need to add anything + */ break; case IEEE80211_ML_CONTROL_TYPE_RECONF: if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR) common += ETH_ALEN; return common; - case IEEE80211_ML_CONTROL_TYPE_TDLS: - common += sizeof(struct ieee80211_mle_tdls_common_info); - break; case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR) common += ETH_ALEN; -- cgit v1.2.3 From fb99c7d4d6d0fb4fe5a953e0c5f6c37a5b796b98 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Sun, 11 Sep 2022 16:55:12 +0300 Subject: wifi: cfg80211/mac80211: Fix ML element common size validation The Multi-Link element can be fragmented, thus its size can exceed 254. Thus, modify ieee80211_mle_size_ok() to use 'size_t len' instead of 'u8 len'. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 442b13333da8..b935a85b2f44 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4601,7 +4601,7 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) * @data: pointer to the element data * @len: length of the containing element */ -static inline bool ieee80211_mle_size_ok(const u8 *data, u8 len) +static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) { const struct ieee80211_multi_link_elem *mle = (const void *)data; u8 fixed = sizeof(*mle); -- cgit v1.2.3 From 45ebac4f059b92906e7e86dd1a780739f883857c Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Tue, 6 Sep 2022 11:48:56 +0300 Subject: wifi: mac80211: Parse station profile from association response When processing an association response frame for a Multi-Link connection, extract the per station profile for each additional link, and use it for parsing the link elements. As the Multi-Link element might be fragmented, add support for reassembling a fragmented element. To simplify memory management logic, extend 'struct ieee802_11_elems' to hold a scratch buffer, which is used for the defragmentation. Once an element is reconstructed in the scratch area, point the corresponding element pointer to it. Currently only defragmentation of Multi-Link element and the contained per-STA profile subelement is supported. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b935a85b2f44..f51e939f28f2 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4666,6 +4666,7 @@ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) enum ieee80211_mle_subelems { IEEE80211_MLE_SUBELEM_PER_STA_PROFILE = 0, + IEEE80211_MLE_SUBELEM_FRAGMENT = 254, }; #define IEEE80211_MLE_STA_CONTROL_LINK_ID 0x000f -- cgit v1.2.3 From 1e0f8cc96b7162075d2e3b6bef856497884a3ae8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 6 Sep 2022 22:37:03 +0200 Subject: wifi: nl80211: use link ID in NL80211_CMD_SET_BSS We clearly need the link ID here, to know the right BSS to configure. Use/require it. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4d35a4234417..659dd1bee70f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2105,6 +2105,7 @@ struct mpath_info { * * Used to change BSS parameters (mainly for AP mode). * + * @link_id: link_id or -1 for non-MLD * @use_cts_prot: Whether to use CTS protection * (0 = no, 1 = yes, -1 = do not change) * @use_short_preamble: Whether the use of short preambles is allowed @@ -2122,6 +2123,7 @@ struct mpath_info { * @p2p_opp_ps: P2P opportunistic PS (-1 = no change) */ struct bss_parameters { + int link_id; int use_cts_prot; int use_short_preamble; int use_short_slot_time; -- cgit v1.2.3 From f3630c4f82ae43682bf84e6ddcbd7e97285d4699 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 27 Sep 2022 11:39:23 +0200 Subject: wifi: mac80211: add RCU _check() link access variants We might sometimes need to use RCU and locking in the same code path, so add the two variants link_conf_dereference_check() and link_sta_dereference_check(). Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c413050ec8dd..cda4584dfd51 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1915,6 +1915,10 @@ static inline bool lockdep_vif_mutex_held(struct ieee80211_vif *vif) rcu_dereference_protected((vif)->link_conf[link_id], \ lockdep_vif_mutex_held(vif)) +#define link_conf_dereference_check(vif, link_id) \ + rcu_dereference_check((vif)->link_conf[link_id], \ + lockdep_vif_mutex_held(vif)) + /** * enum ieee80211_key_flags - key flags * @@ -2311,6 +2315,10 @@ static inline bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta) rcu_dereference_protected((sta)->link[link_id], \ lockdep_sta_mutex_held(sta)) +#define link_sta_dereference_check(sta, link_id) \ + rcu_dereference_check((sta)->link[link_id], \ + lockdep_sta_mutex_held(sta)) + #define for_each_sta_active_link(vif, sta, link_sta, link_id) \ for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) \ if ((!(vif)->active_links || \ -- cgit v1.2.3 From 1177aaa7fe9373c762cd5bf5f5de8517bac989d5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 17 Sep 2022 03:14:53 +0200 Subject: wifi: fix multi-link element subelement iteration The subelements obviously start after the common data, including the common multi-link element structure definition itself. This bug was possibly just hidden by the higher bits of the control being set to 0, so the iteration just found one bogus element and most of the code could continue anyway. Fixes: 0f48b8b88aa9 ("wifi: ieee80211: add definitions for multi-link element") Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f51e939f28f2..6252f02f38b7 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4593,7 +4593,7 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) return 0; } - return common + mle->variable[0]; + return sizeof(*mle) + common + mle->variable[0]; } /** -- cgit v1.2.3 From 0ff57171d6d225558c81a69439d5323e35b40549 Mon Sep 17 00:00:00 2001 From: Vinayak Yadawad Date: Wed, 7 Sep 2022 18:14:48 +0530 Subject: cfg80211: Update Transition Disable policy during port authorization In case of 4way handshake offload, transition disable policy updated by the AP during EAPOL 3/4 is not updated to the upper layer. This results in mismatch between transition disable policy between the upper layer and the driver. This patch addresses this issue by updating transition disable policy as part of port authorization indication. Signed-off-by: Vinayak Yadawad Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- include/uapi/linux/nl80211.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 659dd1bee70f..11a370e64143 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7683,6 +7683,8 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, * * @dev: network device * @bssid: the BSSID of the AP + * @td_bitmap: transition disable policy + * @td_bitmap_len: Length of transition disable policy * @gfp: allocation flags * * This function should be called by a driver that supports 4 way handshake @@ -7693,7 +7695,7 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, * indicate the 802.11 association. */ void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, - gfp_t gfp); + const u8* td_bitmap, u8 td_bitmap_len, gfp_t gfp); /** * cfg80211_disconnected - notify cfg80211 that connection was dropped diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c32e7616a366..c14a91bbca7c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2749,6 +2749,8 @@ enum nl80211_commands { * When used with %NL80211_CMD_FRAME_TX_STATUS, indicates the ack RX * timestamp. When used with %NL80211_CMD_FRAME RX notification, indicates * the incoming frame RX timestamp. + * @NL80211_ATTR_TD_BITMAP: Transition Disable bitmap, for subsequent + * (re)associations. * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3276,6 +3278,7 @@ enum nl80211_attrs { NL80211_ATTR_TX_HW_TIMESTAMP, NL80211_ATTR_RX_HW_TIMESTAMP, + NL80211_ATTR_TD_BITMAP, /* add attributes here, update the policy in nl80211.c */ -- cgit v1.2.3 From c850e31f79f049af5022f07cd9961605b4470d0b Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Sun, 9 Oct 2022 18:30:38 +0200 Subject: wifi: mac80211: add internal handler for wake_tx_queue Start to align the TX handling to only use internal TX queues (iTXQs): Provide a handler for drivers not having a custom wake_tx_queue callback and update the documentation. Signed-off-by: Alexander Wetzel Signed-off-by: Johannes Berg --- include/net/mac80211.h | 51 +++++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index cda4584dfd51..721c450a9ccd 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -89,15 +89,13 @@ /** * DOC: mac80211 software tx queueing * - * mac80211 provides an optional intermediate queueing implementation designed - * to allow the driver to keep hardware queues short and provide some fairness - * between different stations/interfaces. - * In this model, the driver pulls data frames from the mac80211 queue instead - * of letting mac80211 push them via drv_tx(). - * Other frames (e.g. control or management) are still pushed using drv_tx(). + * mac80211 uses an intermediate queueing implementation, designed to allow the + * driver to keep hardware queues short and to provide some fairness between + * different stations/interfaces. * - * Drivers indicate that they use this model by implementing the .wake_tx_queue - * driver operation. + * Drivers must provide the .wake_tx_queue driver operation by either + * linking it to ieee80211_handle_wake_tx_queue() or implementing a custom + * handler. * * Intermediate queues (struct ieee80211_txq) are kept per-sta per-tid, with * another per-sta for non-data/non-mgmt and bufferable management frames, and @@ -106,9 +104,12 @@ * The driver is expected to initialize its private per-queue data for stations * and interfaces in the .add_interface and .sta_add ops. * - * The driver can't access the queue directly. To dequeue a frame from a - * txq, it calls ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a - * queue, it calls the .wake_tx_queue driver op. + * The driver can't access the internal TX queues (iTXQs) directly. + * Whenever mac80211 adds a new frame to a queue, it calls the .wake_tx_queue + * driver op. + * Drivers implementing a custom .wake_tx_queue op can get them by calling + * ieee80211_tx_dequeue(). Drivers using ieee80211_handle_wake_tx_queue() will + * simply get the individual frames pushed via the .tx driver operation. * * Drivers can optionally delegate responsibility for scheduling queues to * mac80211, to take advantage of airtime fairness accounting. In this case, to @@ -1826,7 +1827,7 @@ struct ieee80211_vif_cfg { * for this interface. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void \*). - * @txq: the multicast data TX queue (if driver uses the TXQ abstraction) + * @txq: the multicast data TX queue * @txqs_stopped: per AC flag to indicate that intermediate TXQs are stopped, * protected by fq->lock. * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see @@ -2259,8 +2260,8 @@ struct ieee80211_link_sta { * For non MLO STA it will point to the deflink data. For MLO STA * ieee80211_sta_recalc_aggregates() must be called to update it. * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not. - * @txq: per-TID data TX queues (if driver uses the TXQ abstraction); note that - * the last entry (%IEEE80211_NUM_TIDS) is used for non-data frames + * @txq: per-TID data TX queues; note that the last entry (%IEEE80211_NUM_TIDS) + * is used for non-data frames * @deflink: This holds the default link STA information, for non MLO STA all link * specific STA information is accessed through @deflink or through * link[0] which points to address of @deflink. For MLO Link STA @@ -5713,7 +5714,7 @@ void ieee80211_key_replay(struct ieee80211_key_conf *keyconf); * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * - * Drivers should use this function instead of netif_wake_queue. + * Drivers must use this function instead of netif_wake_queue. */ void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue); @@ -5722,7 +5723,7 @@ void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue); * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * - * Drivers should use this function instead of netif_stop_queue. + * Drivers must use this function instead of netif_stop_queue. */ void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue); @@ -5731,7 +5732,7 @@ void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue); * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * - * Drivers should use this function instead of netif_stop_queue. + * Drivers must use this function instead of netif_queue_stopped. * * Return: %true if the queue is stopped. %false otherwise. */ @@ -5742,7 +5743,7 @@ int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue); * ieee80211_stop_queues - stop all queues * @hw: pointer as obtained from ieee80211_alloc_hw(). * - * Drivers should use this function instead of netif_stop_queue. + * Drivers must use this function instead of netif_tx_stop_all_queues. */ void ieee80211_stop_queues(struct ieee80211_hw *hw); @@ -5750,7 +5751,7 @@ void ieee80211_stop_queues(struct ieee80211_hw *hw); * ieee80211_wake_queues - wake all queues * @hw: pointer as obtained from ieee80211_alloc_hw(). * - * Drivers should use this function instead of netif_wake_queue. + * Drivers must use this function instead of netif_tx_wake_all_queues. */ void ieee80211_wake_queues(struct ieee80211_hw *hw); @@ -6971,6 +6972,18 @@ static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw, return skb; } +/** + * ieee80211_handle_wake_tx_queue - mac80211 handler for wake_tx_queue callback + * + * @hw: pointer as obtained from wake_tx_queue() callback(). + * @txq: pointer as obtained from wake_tx_queue() callback(). + * + * Drivers can use this function for the mandatory mac80211 wake_tx_queue + * callback in struct ieee80211_ops. They should not call this function. + */ +void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, + struct ieee80211_txq *txq); + /** * ieee80211_next_txq - get next tx queue to pull packets from * -- cgit v1.2.3 From a6a6163b9a934df5965792d60af1f53aa51fdd39 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 7 Oct 2022 10:53:03 +0200 Subject: mac802154: Introduce filtering levels The 802154 specification details several filtering levels in which the PHY and the MAC could be. The amount of filtering will vary if they are in promiscuous mode or in scanning mode. Otherwise they are expected to do some very basic checks, such as enforcing the frame validity. Either the PHY is able to do so, and the MAC has nothing to do, or the PHY has a lower filtering level than expected and the MAC should take over. For now we just define these levels in an enumeration. In a second time, we will add a per-PHY parameter showing the expected filtering level as well as a per device current filtering level, and will initialize all these fields. In a third time, we will use them to apply more filtering by software when the PHY is limited. Indeed, if the drivers know they cannot reach the requested level of filtering, they will overwrite the "current filtering" parameter so that it reflects what they do. Then, in the core, the expected filtering level will be used to decide whether some additional software processing is needed or not. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221007085310.503366-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/linux/ieee802154.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h index f1f9412b6ac6..0303eb84d596 100644 --- a/include/linux/ieee802154.h +++ b/include/linux/ieee802154.h @@ -276,6 +276,30 @@ enum { IEEE802154_SYSTEM_ERROR = 0xff, }; +/** + * enum ieee802154_filtering_level - Filtering levels applicable to a PHY + * + * @IEEE802154_FILTERING_NONE: No filtering at all, what is received is + * forwarded to the softMAC + * @IEEE802154_FILTERING_1_FCS: First filtering level, frames with an invalid + * FCS should be dropped + * @IEEE802154_FILTERING_2_PROMISCUOUS: Second filtering level, promiscuous + * mode as described in the spec, identical in terms of filtering to the + * level one on PHY side, but at the MAC level the frame should be + * forwarded to the upper layer directly + * @IEEE802154_FILTERING_3_SCAN: Third filtering level, scan related, where + * only beacons must be processed, all remaining traffic gets dropped + * @IEEE802154_FILTERING_4_FRAME_FIELDS: Fourth filtering level actually + * enforcing the validity of the content of the frame with various checks + */ +enum ieee802154_filtering_level { + IEEE802154_FILTERING_NONE, + IEEE802154_FILTERING_1_FCS, + IEEE802154_FILTERING_2_PROMISCUOUS, + IEEE802154_FILTERING_3_SCAN, + IEEE802154_FILTERING_4_FRAME_FIELDS, +}; + /* frame control handling */ #define IEEE802154_FCTL_FTYPE 0x0003 #define IEEE802154_FCTL_ACKREQ 0x0020 -- cgit v1.2.3 From ac8037c35bd1fb1799d39f52205f813e6585b58b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 7 Oct 2022 10:53:05 +0200 Subject: mac802154: set filter at drv_start() The current filtering level is set on the first interface up on a wpan phy. If we support scan functionality we need to change the filtering level on the fly on an operational phy and switching back again. This patch will move the receive mode parameter e.g. address filter and promiscuous mode to the drv_start() functionality to allow changing the receive mode on an operational phy not on first ifup only. In future this should be handled on driver layer because each hardware has it's own way to enter a specific filtering level. However this should offer to switch to mode IEEE802154_FILTERING_NONE and back to IEEE802154_FILTERING_4_FRAME_FIELDS. Only IEEE802154_FILTERING_4_FRAME_FIELDS and IEEE802154_FILTERING_NONE are somewhat supported by current hardware. All other filtering levels can be supported in future but will end in IEEE802154_FILTERING_NONE as the receive part can kind of "emulate" those receive paths by doing additional filtering routines. There are in total three filtering levels in the code: - the per-interface default level (should not be changed) - the required per-interface level (mac commands may play with it) - the actual per-PHY (hw) level that is currently in use Signed-off-by: Alexander Aring [ Link: https://lore.kernel.org/r/20221007085310.503366-4-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 428cece22205..e1481f9cf049 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -223,6 +223,11 @@ struct wpan_phy { atomic_t hold_txs; wait_queue_head_t sync_txq; + /* Current filtering level on reception. + * Only allowed to be changed if phy is not operational. + */ + enum ieee802154_filtering_level filtering; + char priv[] __aligned(NETDEV_ALIGN); }; @@ -374,8 +379,6 @@ struct wpan_dev { bool lbt; - bool promiscuous_mode; - /* fallback for acknowledgment bit setting */ bool ackreq; }; -- cgit v1.2.3 From ea562d8c486eebd2707bcd193974078a2a47affc Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 7 Oct 2022 10:53:07 +0200 Subject: ieee802154: hwsim: Implement address filtering We have access to the address filters being theoretically applied, we also have access to the actual filtering level applied, so let's add a proper frame validation sequence in hwsim. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221007085310.503366-6-miquel.raynal@bootlin.com [stefan@datenfreihafen.org: fixup some checkpatch warnings] Signed-off-by: Stefan Schmidt --- include/net/ieee802154_netdev.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h index d0d188c3294b..1b82bbafe8c7 100644 --- a/include/net/ieee802154_netdev.h +++ b/include/net/ieee802154_netdev.h @@ -69,6 +69,14 @@ struct ieee802154_hdr_fc { #endif }; +enum ieee802154_frame_version { + IEEE802154_2003_STD, + IEEE802154_2006_STD, + IEEE802154_STD, + IEEE802154_RESERVED_STD, + IEEE802154_MULTIPURPOSE_STD = IEEE802154_2003_STD, +}; + struct ieee802154_hdr { struct ieee802154_hdr_fc fc; u8 seq; -- cgit v1.2.3 From a4b5b4c56dd8b1dd46b2f13cb09f5f8031978f86 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 7 Oct 2022 10:53:08 +0200 Subject: mac802154: Drop IEEE802154_HW_RX_DROP_BAD_CKSUM This IEEE802154_HW_RX_DROP_BAD_CKSUM flag was only used by hwsim to reflect the fact that it would not validate the checksum (FCS). So this was only useful while the only filtering level hwsim was capable of was "NONE". Now that the driver has been improved we no longer need this flag. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221007085310.503366-7-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/mac802154.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index 357d25ef627a..4a3a9de9da73 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -111,9 +111,6 @@ struct ieee802154_hw { * promiscuous mode setting. * * @IEEE802154_HW_RX_OMIT_CKSUM: Indicates that receiver omits FCS. - * - * @IEEE802154_HW_RX_DROP_BAD_CKSUM: Indicates that receiver will not filter - * frames with bad checksum. */ enum ieee802154_hw_flags { IEEE802154_HW_TX_OMIT_CKSUM = BIT(0), @@ -123,7 +120,6 @@ enum ieee802154_hw_flags { IEEE802154_HW_AFILT = BIT(4), IEEE802154_HW_PROMISCUOUS = BIT(5), IEEE802154_HW_RX_OMIT_CKSUM = BIT(6), - IEEE802154_HW_RX_DROP_BAD_CKSUM = BIT(7), }; /* Indicates that receiver omits FCS and xmitter will add FCS on it's own. */ -- cgit v1.2.3 From e6c86c513f440bec5f1046539c7e3c6c653842da Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Oct 2022 19:39:43 +0800 Subject: rcu-tasks: Provide rcu_trace_implies_rcu_gp() As an accident of implementation, an RCU Tasks Trace grace period also acts as an RCU grace period. However, this could change at any time. This commit therefore creates an rcu_trace_implies_rcu_gp() that currently returns true to codify this accident. Code relying on this accident must call this function to verify that this accident is still happening. Reported-by: Hou Tao Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Martin KaFai Lau Link: https://lore.kernel.org/r/20221014113946.965131-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/rcupdate.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 08605ce7379d..8822f06e4b40 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -240,6 +240,18 @@ static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ +/** + * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period? + * + * As an accident of implementation, an RCU Tasks Trace grace period also + * acts as an RCU grace period. However, this could change at any time. + * Code relying on this accident must call this function to verify that + * this accident is still happening. + * + * You have been warned! + */ +static inline bool rcu_trace_implies_rcu_gp(void) { return true; } + /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * -- cgit v1.2.3 From a2fd08448f2b7591fc7ec8dec2e5e025a43f0cee Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 17 Oct 2022 14:18:26 +0200 Subject: net: remove smc911x driver This driver was used on Arm and SH machines until 2009, when the last platforms moved to the smsc911x driver for the same hardware. Time to retire this version. Link: https://lore.kernel.org/netdev/1232010482-3744-1-git-send-email-steve.glendinning@smsc.com/ Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20221017121900.3520108-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/smc911x.h | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 include/linux/smc911x.h (limited to 'include') diff --git a/include/linux/smc911x.h b/include/linux/smc911x.h deleted file mode 100644 index 8cace8189e74..000000000000 --- a/include/linux/smc911x.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __SMC911X_H__ -#define __SMC911X_H__ - -#define SMC911X_USE_16BIT (1 << 0) -#define SMC911X_USE_32BIT (1 << 1) - -struct smc911x_platdata { - unsigned long flags; - unsigned long irq_flags; /* IRQF_... */ - int irq_polarity; -}; - -#endif /* __SMC911X_H__ */ -- cgit v1.2.3 From f392a1846489720fc2e063d1210633b6cf4ec5a4 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 17 Oct 2022 16:22:35 -0400 Subject: net: phylink: provide phylink_validate_mask_caps() helper Provide a helper that restricts the link modes according to the phylink capabilities. Signed-off-by: Russell King (Oracle) [rebased on net-next/master and added documentation] Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 664dd409feb9..c29c3f174972 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -556,6 +556,9 @@ void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); unsigned long phylink_get_capabilities(phy_interface_t interface, unsigned long mac_capabilities, int rate_matching); +void phylink_validate_mask_caps(unsigned long *supported, + struct phylink_link_state *state, + unsigned long caps); void phylink_generic_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit v1.2.3 From 51c352bdbcd23d7ce46b06c1e64c82754dc44044 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 18 Oct 2022 15:37:27 +0100 Subject: netlink: add support for formatted extack messages Include an 80-byte buffer in struct netlink_ext_ack that can be used for scnprintf()ed messages. This does mean that the resulting string can't be enumerated, translated etc. in the way NL_SET_ERR_MSG() was designed to allow. Signed-off-by: Edward Cree Reviewed-by: Jakub Kicinski Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index d51e041d2242..d81bde5a5844 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -64,6 +64,7 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) /* this can be increased when necessary - don't expose to userland */ #define NETLINK_MAX_COOKIE_LEN 20 +#define NETLINK_MAX_FMTMSG_LEN 80 /** * struct netlink_ext_ack - netlink extended ACK report struct @@ -75,6 +76,8 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length + * @_msg_buf: output buffer for formatted message strings - don't access + * directly, use %NL_SET_ERR_MSG_FMT */ struct netlink_ext_ack { const char *_msg; @@ -84,13 +87,13 @@ struct netlink_ext_ack { u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; + char _msg_buf[NETLINK_MAX_FMTMSG_LEN]; }; /* Always use this macro, this allows later putting the * message into a separate section or such for things * like translation or listing all possible messages. - * Currently string formatting is not supported (due - * to the lack of an output buffer.) + * If string formatting is needed use NL_SET_ERR_MSG_FMT. */ #define NL_SET_ERR_MSG(extack, msg) do { \ static const char __msg[] = msg; \ @@ -102,9 +105,31 @@ struct netlink_ext_ack { __extack->_msg = __msg; \ } while (0) +/* We splice fmt with %s at each end even in the snprintf so that both calls + * can use the same string constant, avoiding its duplication in .ro + */ +#define NL_SET_ERR_MSG_FMT(extack, fmt, args...) do { \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (!__extack) \ + break; \ + if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ + "%s" fmt "%s", "", ##args, "") >= \ + NETLINK_MAX_FMTMSG_LEN) \ + net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ + ##args, "\n"); \ + \ + do_trace_netlink_extack(__extack->_msg_buf); \ + \ + __extack->_msg = __extack->_msg_buf; \ +} while (0) + #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) +#define NL_SET_ERR_MSG_FMT_MOD(extack, fmt, args...) \ + NL_SET_ERR_MSG_FMT((extack), KBUILD_MODNAME ": " fmt, ##args) + #define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ if ((extack)) { \ (extack)->bad_attr = (attr); \ -- cgit v1.2.3 From 6fdfdef7fdb57e6b9f768c9ca0718dcb5e727a85 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 19 Oct 2022 21:07:33 +0300 Subject: sctp: remove unnecessary NULL check in sctp_association_init() '&asoc->ulpq' passed to sctp_ulpq_init() as the first argument, then sctp_qlpq_init() initializes it and eventually returns the address of the struct member back. Therefore, in this case, the return pointer cannot be NULL. Moreover, it seems sctp_ulpq_init() has always been used only in sctp_association_init(), so there's really no need to return ulpq anymore. Detected using the static analysis tool - Svace. Signed-off-by: Alexey Kodanev Reviewed-by: Xin Long Link: https://lore.kernel.org/r/20221019180735.161388-1-aleksei.kodanev@bell-sw.com Signed-off-by: Jakub Kicinski --- include/net/sctp/ulpqueue.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h index 0eaf8650e3b2..60f6641290c3 100644 --- a/include/net/sctp/ulpqueue.h +++ b/include/net/sctp/ulpqueue.h @@ -35,8 +35,7 @@ struct sctp_ulpq { }; /* Prototypes. */ -struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *, - struct sctp_association *); +void sctp_ulpq_init(struct sctp_ulpq *ulpq, struct sctp_association *asoc); void sctp_ulpq_flush(struct sctp_ulpq *ulpq); void sctp_ulpq_free(struct sctp_ulpq *); -- cgit v1.2.3 From 1f8c4eeb945553baf868bbec7a8c59810df97a07 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 19 Oct 2022 15:36:02 -0700 Subject: inet6: Remove inet6_destroy_sock(). The last user of inet6_destroy_sock() is its wrapper inet6_cleanup_sock(). Let's rename inet6_destroy_sock() to inet6_cleanup_sock(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/transp_v6.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index b830463e3dff..d27b1caf3753 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -58,8 +58,6 @@ ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, #define LOOPBACK4_IPV6 cpu_to_be32(0x7f000006) -void inet6_destroy_sock(struct sock *sk); - #define IPV6_SEQ_DGRAM_HEADER \ " sl " \ "local_address " \ -- cgit v1.2.3 From 404c76783f322120266a4ef659abe418dc74f5c6 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Thu, 20 Oct 2022 17:20:03 +0200 Subject: ethtool: Add support for 800Gbps link modes Add support for 800Gbps speed, link modes of 100Gbps per lane. As mentioned in slide 21 in IEEE documentation [1], all adopted 802.3df copper and optical PMDs baselines using 100G/lane will be supported. Add the relevant PMDs which are mentioned in slide 5 in IEEE documentation [1] and were approved on 10-2022 [2]: BP - KR8 Cu Cable - CR8 MMF 50m - VR8 MMF 100m - SR8 SMF 500m - DR8 SMF 2km - DR8-2 [1]: https://www.ieee802.org/3/df/public/22_10/22_1004/shrikhande_3df_01a_221004.pdf [2]: https://ieee802.org/3/df/KeyMotions_3df_221005.pdf Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index dc2aa3d75b39..f341de2ae612 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1737,6 +1737,13 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, ETHTOOL_LINK_MODE_10baseT1L_Full_BIT = 92, + ETHTOOL_LINK_MODE_800000baseCR8_Full_BIT = 93, + ETHTOOL_LINK_MODE_800000baseKR8_Full_BIT = 94, + ETHTOOL_LINK_MODE_800000baseDR8_Full_BIT = 95, + ETHTOOL_LINK_MODE_800000baseDR8_2_Full_BIT = 96, + ETHTOOL_LINK_MODE_800000baseSR8_Full_BIT = 97, + ETHTOOL_LINK_MODE_800000baseVR8_Full_BIT = 98, + /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS }; @@ -1848,6 +1855,7 @@ enum ethtool_link_mode_bit_indices { #define SPEED_100000 100000 #define SPEED_200000 200000 #define SPEED_400000 400000 +#define SPEED_800000 800000 #define SPEED_UNKNOWN -1 -- cgit v1.2.3 From a5ef058dc4d9a3e60d1808a0700e18e0e37e408e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Oct 2022 19:48:51 +0200 Subject: net: introduce and use custom sockopt socket flag We will soon introduce custom setsockopt for UDP sockets, too. Instead of doing even more complex arbitrary checks inside sock_use_custom_sol_socket(), add a new socket flag and set it for the relevant socket types (currently only MPTCP). Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/net.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index 711c3593c3b8..59350fd85823 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -41,6 +41,7 @@ struct net; #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 +#define SOCK_CUSTOM_SOCKOPT 5 #ifndef ARCH_HAS_SOCKET_TYPES /** -- cgit v1.2.3 From 8a3854c7b8e4532063b14bed34115079b7d0cb36 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Oct 2022 19:48:52 +0200 Subject: udp: track the forward memory release threshold in an hot cacheline When the receiver process and the BH runs on different cores, udp_rmem_release() experience a cache miss while accessing sk_rcvbuf, as the latter shares the same cacheline with sk_forward_alloc, written by the BH. With this patch, UDP tracks the rcvbuf value and its update via custom SOL_SOCKET socket options, and copies the forward memory threshold value used by udp_rmem_release() in a different cacheline, already accessed by the above function and uncontended. Since the UDP socket init operation grown a bit, factor out the common code between v4 and v6 in a shared helper. Overall the above give a 10% peek throughput increase under UDP flood. Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/udp.h | 3 +++ include/net/udp.h | 9 +++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index e96da4157d04..5cdba00a904a 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -87,6 +87,9 @@ struct udp_sock { /* This field is dirtied by udp_recvmsg() */ int forward_deficit; + + /* This fields follows rcvbuf value, and is touched by udp_recvmsg */ + int forward_threshold; }; #define UDP_MAX_SEGMENTS (1 << 6UL) diff --git a/include/net/udp.h b/include/net/udp.h index fee053bcd17c..de4b528522bb 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -174,6 +174,15 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); +static inline void udp_lib_init_sock(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + + skb_queue_head_init(&up->reader_queue); + up->forward_threshold = sk->sk_rcvbuf >> 2; + set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); +} + /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline int udp_lib_hash(struct sock *sk) { -- cgit v1.2.3 From 0cafd77dcd032d1687efaba5598cf07bce85997f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Oct 2022 23:20:18 +0000 Subject: net: add a refcount tracker for kernel sockets Commit ffa84b5ffb37 ("net: add netns refcount tracker to struct sock") added a tracker to sockets, but did not track kernel sockets. We still have syzbot reports hinting about netns being destroyed while some kernel TCP sockets had not been dismantled. This patch tracks kernel sockets, and adds a ref_tracker_dir_print() call to net_free() right before the netns is freed. Normally, each layer is responsible for properly releasing its kernel sockets before last call to net_free(). This debugging facility is enabled with CONFIG_NET_NS_REFCNT_TRACKER=y Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Tested-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/net_namespace.h | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 8c3587d5c308..78beaa765c73 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -92,7 +92,9 @@ struct net { struct ns_common ns; struct ref_tracker_dir refcnt_tracker; - + struct ref_tracker_dir notrefcnt_tracker; /* tracker for objects not + * refcounted against netns + */ struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; @@ -320,19 +322,31 @@ static inline int check_net(const struct net *net) #endif -static inline void netns_tracker_alloc(struct net *net, - netns_tracker *tracker, gfp_t gfp) +static inline void __netns_tracker_alloc(struct net *net, + netns_tracker *tracker, + bool refcounted, + gfp_t gfp) { #ifdef CONFIG_NET_NS_REFCNT_TRACKER - ref_tracker_alloc(&net->refcnt_tracker, tracker, gfp); + ref_tracker_alloc(refcounted ? &net->refcnt_tracker : + &net->notrefcnt_tracker, + tracker, gfp); #endif } -static inline void netns_tracker_free(struct net *net, - netns_tracker *tracker) +static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker, + gfp_t gfp) +{ + __netns_tracker_alloc(net, tracker, true, gfp); +} + +static inline void __netns_tracker_free(struct net *net, + netns_tracker *tracker, + bool refcounted) { #ifdef CONFIG_NET_NS_REFCNT_TRACKER - ref_tracker_free(&net->refcnt_tracker, tracker); + ref_tracker_free(refcounted ? &net->refcnt_tracker : + &net->notrefcnt_tracker, tracker); #endif } @@ -346,7 +360,7 @@ static inline struct net *get_net_track(struct net *net, static inline void put_net_track(struct net *net, netns_tracker *tracker) { - netns_tracker_free(net, tracker); + __netns_tracker_free(net, tracker, true); put_net(net); } -- cgit v1.2.3 From 233baf9a1bc46f18ad3bec688f52ea5f818a8a25 Mon Sep 17 00:00:00 2001 From: xu xin Date: Thu, 20 Oct 2022 06:54:41 +0000 Subject: net: remove useless parameter of __sock_cmsg_send The parameter 'msg' has never been used by __sock_cmsg_send, so we can remove it safely. Reported-by: Zeal Robot Signed-off-by: xu xin Reviewed-by: Zhang Yunkai Acked-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 9e464f6409a7..b1dacc4d68c9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1901,7 +1901,7 @@ static inline void sockcm_init(struct sockcm_cookie *sockc, *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; } -int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, +int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc); -- cgit v1.2.3 From 4727bab4e9bbeafeff6acdfcb077a7a548cbde30 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 21 Oct 2022 10:58:22 +0800 Subject: net: skb: move skb_pp_recycle() to skbuff.c skb_pp_recycle() is only used by skb_free_head() in skbuff.c, so move it to skbuff.c. Signed-off-by: Yunsheng Lin Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7be5bb4c94b6..59c9fd55699d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5050,12 +5050,5 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb) } #endif -static inline bool skb_pp_recycle(struct sk_buff *skb, void *data) -{ - if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) - return false; - return page_pool_return_skb_page(virt_to_page(data)); -} - #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ -- cgit v1.2.3 From 398900498485fa9465386454681763d81efaad25 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 21 Oct 2022 16:09:59 +0100 Subject: net: sfp: provide a definition for the power level select bit Provide a named definition for the power level select bit in the extended status register, rather than using BIT(0) in the code. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/sfp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index d1f343853b6c..01ae9f1dd2ad 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -489,6 +489,8 @@ enum { SFP_WARN1_RXPWR_LOW = BIT(6), SFP_EXT_STATUS = 0x76, + SFP_EXT_STATUS_PWRLVL_SELECT = BIT(0), + SFP_VSL = 0x78, SFP_PAGE = 0x7f, }; -- cgit v1.2.3 From 4a6a676f8c16ec17d2f8d69ce3b5d680277ed0d2 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 21 Oct 2022 00:58:39 -0700 Subject: act_skbedit: skbedit queue mapping for receive queue Add support for skbedit queue mapping action on receive side. This is supported only in hardware, so the skip_sw flag is enforced. This enables offloading filters for receive queue selection in the hardware using the skbedit action. Traffic arrives on the Rx queue requested in the skbedit action parameter. A new tc action flag TCA_ACT_FLAGS_AT_INGRESS is introduced to identify the traffic direction the action queue_mapping is requested on during filter addition. This is used to disallow offloading the skbedit queue mapping action on transmit side. Example: $tc filter add dev $IFACE ingress protocol ip flower dst_ip $DST_IP\ action skbedit queue_mapping $rxq_id skip_sw Reviewed-by: Sridhar Samudrala Signed-off-by: Amritha Nambiar Signed-off-by: Paolo Abeni --- include/net/act_api.h | 1 + include/net/flow_offload.h | 2 ++ include/net/tc_act/tc_skbedit.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 61f2ceb3939e..c94ea1a306e0 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -67,6 +67,7 @@ struct tc_action { #define TCA_ACT_FLAGS_BIND (1U << (TCA_ACT_FLAGS_USER_BITS + 1)) #define TCA_ACT_FLAGS_REPLACE (1U << (TCA_ACT_FLAGS_USER_BITS + 2)) #define TCA_ACT_FLAGS_NO_RTNL (1U << (TCA_ACT_FLAGS_USER_BITS + 3)) +#define TCA_ACT_FLAGS_AT_INGRESS (1U << (TCA_ACT_FLAGS_USER_BITS + 4)) /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index e343f9f8363e..7a60bc6d72c9 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -155,6 +155,7 @@ enum flow_action_id { FLOW_ACTION_MARK, FLOW_ACTION_PTYPE, FLOW_ACTION_PRIORITY, + FLOW_ACTION_RX_QUEUE_MAPPING, FLOW_ACTION_WAKE, FLOW_ACTION_QUEUE, FLOW_ACTION_SAMPLE, @@ -247,6 +248,7 @@ struct flow_action_entry { u32 csum_flags; /* FLOW_ACTION_CSUM */ u32 mark; /* FLOW_ACTION_MARK */ u16 ptype; /* FLOW_ACTION_PTYPE */ + u16 rx_queue; /* FLOW_ACTION_RX_QUEUE_MAPPING */ u32 priority; /* FLOW_ACTION_PRIORITY */ struct { /* FLOW_ACTION_QUEUE */ u32 ctx; diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index dc1079f28e13..9649600fb3dc 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -95,12 +95,41 @@ static inline u32 tcf_skbedit_priority(const struct tc_action *a) return priority; } +static inline u16 tcf_skbedit_rx_queue_mapping(const struct tc_action *a) +{ + u16 rx_queue; + + rcu_read_lock(); + rx_queue = rcu_dereference(to_skbedit(a)->params)->queue_mapping; + rcu_read_unlock(); + + return rx_queue; +} + /* Return true iff action is queue_mapping */ static inline bool is_tcf_skbedit_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_QUEUE_MAPPING); } +/* Return true if action is on ingress traffic */ +static inline bool is_tcf_skbedit_ingress(u32 flags) +{ + return flags & TCA_ACT_FLAGS_AT_INGRESS; +} + +static inline bool is_tcf_skbedit_tx_queue_mapping(const struct tc_action *a) +{ + return is_tcf_skbedit_queue_mapping(a) && + !is_tcf_skbedit_ingress(a->tcfa_flags); +} + +static inline bool is_tcf_skbedit_rx_queue_mapping(const struct tc_action *a) +{ + return is_tcf_skbedit_queue_mapping(a) && + is_tcf_skbedit_ingress(a->tcfa_flags); +} + /* Return true iff action is inheritdsfield */ static inline bool is_tcf_skbedit_inheritdsfield(const struct tc_action *a) { -- cgit v1.2.3 From b261eda84ec136240a9ca753389853a3a1bccca2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 21 Oct 2022 13:44:34 -0700 Subject: soreuseport: Fix socket selection for SO_INCOMING_CPU. Kazuho Oku reported that setsockopt(SO_INCOMING_CPU) does not work with setsockopt(SO_REUSEPORT) since v4.6. With the combination of SO_REUSEPORT and SO_INCOMING_CPU, we could build a highly efficient server application. setsockopt(SO_INCOMING_CPU) associates a CPU with a TCP listener or UDP socket, and then incoming packets processed on the CPU will likely be distributed to the socket. Technically, a socket could even receive packets handled on another CPU if no sockets in the reuseport group have the same CPU receiving the flow. The logic exists in compute_score() so that a socket will get a higher score if it has the same CPU with the flow. However, the score gets ignored after the blamed two commits, which introduced a faster socket selection algorithm for SO_REUSEPORT. This patch introduces a counter of sockets with SO_INCOMING_CPU in a reuseport group to check if we should iterate all sockets to find a proper one. We increment the counter when * calling listen() if the socket has SO_INCOMING_CPU and SO_REUSEPORT * enabling SO_INCOMING_CPU if the socket is in a reuseport group Also, we decrement it when * detaching a socket out of the group to apply SO_INCOMING_CPU to migrated TCP requests * disabling SO_INCOMING_CPU if the socket is in a reuseport group When the counter reaches 0, we can get back to the O(1) selection algorithm. The overall changes are negligible for the non-SO_INCOMING_CPU case, and the only notable thing is that we have to update sk_incomnig_cpu under reuseport_lock. Otherwise, the race prevents transitioning to the O(n) algorithm and results in the wrong socket selection. cpu1 (setsockopt) cpu2 (listen) +-----------------+ +-------------+ lock_sock(sk1) lock_sock(sk2) reuseport_update_incoming_cpu(sk1, val) . | /* set CPU as 0 */ |- WRITE_ONCE(sk1->incoming_cpu, val) | | spin_lock_bh(&reuseport_lock) | reuseport_grow(sk2, reuse) | . | |- more_socks_size = reuse->max_socks * 2U; | |- if (more_socks_size > U16_MAX && | | reuse->num_closed_socks) | | . | | |- RCU_INIT_POINTER(sk1->sk_reuseport_cb, NULL); | | `- __reuseport_detach_closed_sock(sk1, reuse) | | . | | `- reuseport_put_incoming_cpu(sk1, reuse) | | . | | | /* Read shutdown()ed sk1's sk_incoming_cpu | | | * without lock_sock(). | | | */ | | `- if (sk1->sk_incoming_cpu >= 0) | | . | | | /* decrement not-yet-incremented | | | * count, which is never incremented. | | | */ | | `- __reuseport_put_incoming_cpu(reuse); | | | `- spin_lock_bh(&reuseport_lock) | |- spin_lock_bh(&reuseport_lock) | |- reuse = rcu_dereference_protected(sk1->sk_reuseport_cb, ...) |- if (!reuse) | . | | /* Cannot increment reuse->incoming_cpu. */ | `- goto out; | `- spin_unlock_bh(&reuseport_lock) Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Fixes: c125e80b8868 ("soreuseport: fast reuseport TCP socket selection") Reported-by: Kazuho Oku Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/sock_reuseport.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index efc9085c6892..6ec140b0a61b 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -16,6 +16,7 @@ struct sock_reuseport { u16 max_socks; /* length of socks */ u16 num_socks; /* elements in socks */ u16 num_closed_socks; /* closed elements in socks */ + u16 incoming_cpu; /* The last synq overflow event timestamp of this * reuse->socks[] group. */ @@ -58,5 +59,6 @@ static inline bool reuseport_has_conns(struct sock *sk) } void reuseport_has_conns_set(struct sock *sk); +void reuseport_update_incoming_cpu(struct sock *sk, int val); #endif /* _SOCK_REUSEPORT_H */ -- cgit v1.2.3 From ac1f8c049319847b1b4c6b387fdb2e3f7fb84ffc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 28 Sep 2022 23:55:06 +0200 Subject: netfilter: nft_payload: move struct nft_payload_set definition where it belongs Not required to expose this header in nf_tables_core.h, move it to where it is used, ie. nft_payload. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 1223af68cd9a..990c3767a350 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -66,16 +66,6 @@ struct nft_payload { u8 dreg; }; -struct nft_payload_set { - enum nft_payload_bases base:8; - u8 offset; - u8 len; - u8 sreg; - u8 csum_type; - u8 csum_offset; - u8 csum_flags; -}; - extern const struct nft_expr_ops nft_payload_fast_ops; extern const struct nft_expr_ops nft_bitwise_fast_ops; -- cgit v1.2.3 From e7a1caa67ce62765fe174cae08e537d542bb44f8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 15 Oct 2022 00:20:50 +0200 Subject: netfilter: nf_tables: reduce nft_pktinfo by 8 bytes structure is reduced from 32 to 24 bytes. While at it, also check that iphdrlen is sane, this is guaranteed for NFPROTO_IPV4 but not for ingress or bridge, so add checks for this. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- include/net/netfilter/nf_tables_ipv4.h | 4 ++++ include/net/netfilter/nf_tables_ipv6.h | 6 +++--- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index cdb7db9b0e25..f6db510689a8 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -32,8 +32,8 @@ struct nft_pktinfo { u8 flags; u8 tprot; u16 fragoff; - unsigned int thoff; - unsigned int inneroff; + u16 thoff; + u16 inneroff; }; static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index c4a6147b0ef8..112708f7a6b4 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -35,6 +35,8 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) return -1; else if (len < thoff) return -1; + else if (thoff < sizeof(*iph)) + return -1; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; @@ -69,6 +71,8 @@ static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) return -1; } else if (len < thoff) { goto inhdr_error; + } else if (thoff < sizeof(*iph)) { + return -1; } pkt->flags = NFT_PKTINFO_L4PROTO; diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index ec7eaeaf4f04..467d59b9e533 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -13,7 +13,7 @@ static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt) unsigned short frag_off; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); - if (protohdr < 0) { + if (protohdr < 0 || thoff > U16_MAX) { nft_set_pktinfo_unspec(pkt); return; } @@ -47,7 +47,7 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) return -1; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); - if (protohdr < 0) + if (protohdr < 0 || thoff > U16_MAX) return -1; pkt->flags = NFT_PKTINFO_L4PROTO; @@ -93,7 +93,7 @@ static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt) } protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); - if (protohdr < 0) + if (protohdr < 0 || thoff > U16_MAX) goto inhdr_error; pkt->flags = NFT_PKTINFO_L4PROTO; -- cgit v1.2.3 From d037abc2414b4539401e0e6aa278bedc4628ad69 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Oct 2022 16:17:53 +0200 Subject: netfilter: nft_objref: make it builtin nft_objref is needed to reference named objects, it makes no sense to disable it. Before: text data bss dec filename 4014 424 0 4438 nft_objref.o 4174 1128 0 5302 nft_objref.ko 359351 15276 864 375491 nf_tables.ko After: text data bss dec filename 3815 408 0 4223 nft_objref.o 363161 15692 864 379717 nf_tables.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 990c3767a350..83d763631f81 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -18,6 +18,7 @@ extern struct nft_expr_type nft_meta_type; extern struct nft_expr_type nft_rt_type; extern struct nft_expr_type nft_exthdr_type; extern struct nft_expr_type nft_last_type; +extern struct nft_expr_type nft_objref_type; #ifdef CONFIG_NETWORK_SECMARK extern struct nft_object_type nft_secmark_obj_type; -- cgit v1.2.3 From 3a07327d10a09379315c844c63f27941f5081e0a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 25 Oct 2022 13:48:15 +0200 Subject: netfilter: nft_inner: support for inner tunnel header matching This new expression allows you to match on the inner headers that are encapsulated by any of the existing tunneling protocols. This expression parses the inner packet to set the link, network and transport offsets, so the existing expressions (with a few updates) can be reused to match on the inner headers. The inner expression supports for different tunnel combinations such as: - ethernet frame over IPv4/IPv6 packet, eg. VxLAN. - IPv4/IPv6 packet over IPv4/IPv6 packet, eg. IPIP. - IPv4/IPv6 packet over IPv4/IPv6 + transport header, eg. GRE. - transport header (ESP or SCTP) over transport header (usually UDP) The following fields are used to describe the tunnel protocol: - flags, which describe how to parse the inner headers: NFT_PAYLOAD_CTX_INNER_TUN, the tunnel provides its own header. NFT_PAYLOAD_CTX_INNER_ETHER, the ethernet frame is available as inner header. NFT_PAYLOAD_CTX_INNER_NH, the network header is available as inner header. NFT_PAYLOAD_CTX_INNER_TH, the transport header is available as inner header. For example, VxLAN sets on all of these flags. While GRE only sets on NFT_PAYLOAD_CTX_INNER_NH and NFT_PAYLOAD_CTX_INNER_TH. Then, ESP over UDP only sets on NFT_PAYLOAD_CTX_INNER_TH. The tunnel description is composed of the following attributes: - header size: in case the tunnel comes with its own header, eg. VxLAN. - type: this provides a hint to userspace on how to delinearize the rule. This is useful for VxLAN and Geneve since they run over UDP, since transport does not provide a hint. This is also useful in case hardware offload is ever supported. The type is not currently interpreted by the kernel. - expression: currently only payload supported. Follow up patch adds also inner meta support which is required by autogenerated dependencies. The exthdr expression should be supported too at some point. There is a new inner_ops operation that needs to be set on to allow to use an existing expression from the inner expression. This patch adds a new NFT_PAYLOAD_TUN_HEADER base which allows to match on the tunnel header fields, eg. vxlan vni. The payload expression is embedded into nft_inner private area and this private data area is passed to the payload inner eval function via direct call. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ include/net/netfilter/nf_tables_core.h | 24 ++++++++++++++++++++++++ include/uapi/linux/netfilter/nf_tables.h | 26 ++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f6db510689a8..2dbfe7524a7e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -375,6 +375,10 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) return (void *)expr->data; } +struct nft_expr_info; + +int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, + struct nft_expr_info *info); int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, @@ -864,6 +868,7 @@ struct nft_expr_type { const struct nlattr * const tb[]); void (*release_ops)(const struct nft_expr_ops *ops); const struct nft_expr_ops *ops; + const struct nft_expr_ops *inner_ops; struct list_head list; const char *name; struct module *owner; diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 83d763631f81..be2b2b5d0a52 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -19,6 +19,7 @@ extern struct nft_expr_type nft_rt_type; extern struct nft_expr_type nft_exthdr_type; extern struct nft_expr_type nft_last_type; extern struct nft_expr_type nft_objref_type; +extern struct nft_expr_type nft_inner_type; #ifdef CONFIG_NETWORK_SECMARK extern struct nft_object_type nft_secmark_obj_type; @@ -139,4 +140,27 @@ void nft_rt_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); + +enum { + NFT_PAYLOAD_CTX_INNER_TUN = (1 << 0), + NFT_PAYLOAD_CTX_INNER_LL = (1 << 1), + NFT_PAYLOAD_CTX_INNER_NH = (1 << 2), + NFT_PAYLOAD_CTX_INNER_TH = (1 << 3), +}; + +struct nft_inner_tun_ctx { + u16 inner_tunoff; + u16 inner_lloff; + u16 inner_nhoff; + u16 inner_thoff; + __be16 llproto; + u8 l4proto; + u8 flags; +}; + +int nft_payload_inner_offset(const struct nft_pktinfo *pkt); +void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx); + #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 466fd3f4447c..05a15dce8271 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -760,6 +760,7 @@ enum nft_payload_bases { NFT_PAYLOAD_NETWORK_HEADER, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_PAYLOAD_INNER_HEADER, + NFT_PAYLOAD_TUN_HEADER, }; /** @@ -779,6 +780,31 @@ enum nft_payload_csum_flags { NFT_PAYLOAD_L4CSUM_PSEUDOHDR = (1 << 0), }; +enum nft_inner_type { + NFT_INNER_UNSPEC = 0, + NFT_INNER_VXLAN, +}; + +enum nft_inner_flags { + NFT_INNER_HDRSIZE = (1 << 0), + NFT_INNER_LL = (1 << 1), + NFT_INNER_NH = (1 << 2), + NFT_INNER_TH = (1 << 3), +}; +#define NFT_INNER_MASK (NFT_INNER_HDRSIZE | NFT_INNER_LL | \ + NFT_INNER_NH | NFT_INNER_TH) + +enum nft_inner_attributes { + NFTA_INNER_UNSPEC, + NFTA_INNER_NUM, + NFTA_INNER_TYPE, + NFTA_INNER_FLAGS, + NFTA_INNER_HDRSIZE, + NFTA_INNER_EXPR, + __NFTA_INNER_MAX +}; +#define NFTA_INNER_MAX (__NFTA_INNER_MAX - 1) + /** * enum nft_payload_attributes - nf_tables payload expression netlink attributes * -- cgit v1.2.3 From 0e795b37ba044893107f887b037594645a6fc584 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Oct 2022 13:03:32 +0200 Subject: netfilter: nft_inner: add percpu inner context Add NFT_PKTINFO_INNER_FULL flag to annotate that inner offsets are available. Store nft_inner_tun_ctx object in percpu area to cache existing inner offsets for this skbuff. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 + include/net/netfilter/nf_tables_core.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2dbfe7524a7e..38e2b396e38a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -24,6 +24,7 @@ struct module; enum { NFT_PKTINFO_L4PROTO = (1 << 0), NFT_PKTINFO_INNER = (1 << 1), + NFT_PKTINFO_INNER_FULL = (1 << 2), }; struct nft_pktinfo { diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index be2b2b5d0a52..3e825381ac5c 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -149,6 +149,7 @@ enum { }; struct nft_inner_tun_ctx { + u16 type; u16 inner_tunoff; u16 inner_lloff; u16 inner_nhoff; -- cgit v1.2.3 From a150d122b6bdb84df532057aa3b2faf8c6485792 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Oct 2022 13:03:33 +0200 Subject: netfilter: nft_meta: add inner match support Add support for inner meta matching on: - NFT_META_PROTOCOL: to match on the ethertype, this can be used regardless tunnel protocol provides no link layer header, in that case nft_inner sets on the ethertype based on the IP header version field. - NFT_META_L4PROTO: to match on the layer 4 protocol. These meta expression are usually autogenerated as dependencies by userspace nftables. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 9b51cc67de54..f3a5285a511c 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -46,4 +46,10 @@ int nft_meta_set_validate(const struct nft_ctx *ctx, bool nft_meta_get_reduce(struct nft_regs_track *track, const struct nft_expr *expr); + +struct nft_inner_tun_ctx; +void nft_meta_inner_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx); + #endif -- cgit v1.2.3 From 0db14b95660b63dceeb7e89f2e3ffa97d331fce0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Oct 2022 13:03:34 +0200 Subject: netfilter: nft_inner: add geneve support Geneve tunnel header may contain options, parse geneve header and update offset to point to the link layer header according to the opt_len field. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 05a15dce8271..e4b739d57480 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -783,6 +783,7 @@ enum nft_payload_csum_flags { enum nft_inner_type { NFT_INNER_UNSPEC = 0, NFT_INNER_VXLAN, + NFT_INNER_GENEVE, }; enum nft_inner_flags { -- cgit v1.2.3 From 73feb8d5fa3b755bb51077c0aabfb6aa556fd498 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Oct 2022 15:41:41 +0200 Subject: kallsyms: Make module_kallsyms_on_each_symbol generally available Making module_kallsyms_on_each_symbol generally available, so it can be used outside CONFIG_LIVEPATCH option in following changes. Rather than adding another ifdef option let's make the function generally available (when CONFIG_KALLSYMS and CONFIG_MODULES options are defined). Cc: Christoph Hellwig Acked-by: Song Liu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20221025134148.3300700-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/module.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index ec61fb53979a..35876e89eb93 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -879,8 +879,17 @@ static inline bool module_sig_ok(struct module *module) } #endif /* CONFIG_MODULE_SIG */ +#if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS) int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); +#else +static inline int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_MODULES && CONFIG_KALLSYMS */ #endif /* _LINUX_MODULE_H */ -- cgit v1.2.3 From b5f0de6df6dce8d641ef58ef7012f3304dffb9a1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Oct 2022 02:56:03 -0700 Subject: net: dev: Convert sa_data to flexible array in struct sockaddr One of the worst offenders of "fake flexible arrays" is struct sockaddr, as it is the classic example of why GCC and Clang have been traditionally forced to treat all trailing arrays as fake flexible arrays: in the distant misty past, sa_data became too small, and code started just treating it as a flexible array, even though it was fixed-size. The special case by the compiler is specifically that sizeof(sa->sa_data) and FORTIFY_SOURCE (which uses __builtin_object_size(sa->sa_data, 1)) do not agree (14 and -1 respectively), which makes FORTIFY_SOURCE treat it as a flexible array. However, the coming -fstrict-flex-arrays compiler flag will remove these special cases so that FORTIFY_SOURCE can gain coverage over all the trailing arrays in the kernel that are _not_ supposed to be treated as a flexible array. To deal with this change, convert sa_data to a true flexible array. To keep the structure size the same, move sa_data into a union with a newly introduced sa_data_min with the original size. The result is that FORTIFY_SOURCE can continue to have no idea how large sa_data may actually be, but anything using sizeof(sa->sa_data) must switch to sizeof(sa->sa_data_min). Cc: Jens Axboe Cc: Pavel Begunkov Cc: David Ahern Cc: Dylan Yudaken Cc: Yajun Deng Cc: Petr Machata Cc: Hangbin Liu Cc: Leon Romanovsky Cc: syzbot Cc: Willem de Bruijn Cc: Pablo Neira Ayuso Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221018095503.never.671-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index de3701a2a212..13c3a237b9c9 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -33,7 +33,10 @@ typedef __kernel_sa_family_t sa_family_t; struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ - char sa_data[14]; /* 14 bytes of protocol address */ + union { + char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ + DECLARE_FLEX_ARRAY(char, sa_data); + }; }; struct linger { -- cgit v1.2.3 From 271de525e1d7f564e88a9d212c50998b49a54476 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:16 -0700 Subject: bpf: Remove prog->active check for bpf_lsm and bpf_iter The commit 64696c40d03c ("bpf: Add __bpf_prog_{enter,exit}_struct_ops for struct_ops trampoline") removed prog->active check for struct_ops prog. The bpf_lsm and bpf_iter is also using trampoline. Like struct_ops, the bpf_lsm and bpf_iter have fixed hooks for the prog to attach. The kernel does not call the same hook in a recursive way. This patch also removes the prog->active check for bpf_lsm and bpf_iter. A later patch has a test to reproduce the recursion issue for a sleepable bpf_lsm program. This patch appends the '_recur' naming to the existing enter and exit functions that track the prog->active counter. New __bpf_prog_{enter,exit}[_sleepable] function are added to skip the prog->active tracking. The '_struct_ops' version is also removed. It also moves the decision on picking the enter and exit function to the new bpf_trampoline_{enter,exit}(). It returns the '_recur' ones for all tracing progs to use. For bpf_lsm, bpf_iter, struct_ops (no prog->active tracking after 64696c40d03c), and bpf_lsm_cgroup (no prog->active tracking after 69fd337a975c7), it will return the functions that don't track the prog->active. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 24 ++++++++++-------------- include/linux/bpf_verifier.h | 15 ++++++++++++++- 2 files changed, 24 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9e7d46d16032..1279e699dc98 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -854,22 +854,18 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *i const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *orig_call); -/* these two functions are called from generated trampoline */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx); -u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx); -u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); +typedef u64 (*bpf_trampoline_enter_t)(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); +bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog); +bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog); struct bpf_ksym { unsigned long start; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9e1e6965f407..1a32baa78ce2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -642,10 +642,23 @@ static inline u32 type_flag(u32 type) } /* only use after check_attach_btf_id() */ -static inline enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) +static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_EXT ? prog->aux->dst_prog->type : prog->type; } +static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) +{ + switch (resolve_prog_type(prog)) { + case BPF_PROG_TYPE_TRACING: + return prog->expected_attach_type != BPF_TRACE_ITER; + case BPF_PROG_TYPE_STRUCT_OPS: + case BPF_PROG_TYPE_LSM: + return false; + default: + return true; + } +} + #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From 0593dd34e53489557569d5e6d27371b49aa9b41f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:17 -0700 Subject: bpf: Append _recur naming to the bpf_task_storage helper proto This patch adds the "_recur" naming to the bpf_task_storage_{get,delete} proto. In a latter patch, they will only be used by the tracing programs that requires a deadlock detection because a tracing prog may use bpf_task_storage_{get,delete} recursively and cause a deadlock. Another following patch will add a different helper proto for the non tracing programs because they do not need the deadlock prevention. This patch does this rename to prepare for this future proto additions. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1279e699dc98..b04fe3f4342e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2519,8 +2519,8 @@ extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; -extern const struct bpf_func_proto bpf_task_storage_get_proto; -extern const struct bpf_func_proto bpf_task_storage_delete_proto; +extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; -- cgit v1.2.3 From 4279adb094a17132423f1271c3d11b593fc2327e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:20 -0700 Subject: bpf: Add new bpf_task_storage_get proto with no deadlock detection The bpf_lsm and bpf_iter do not recur that will cause a deadlock. The situation is similar to the bpf_pid_task_storage_lookup_elem() which is called from the syscall map_lookup_elem. It does not need deadlock detection. Otherwise, it will cause unnecessary failure when calling the bpf_task_storage_get() helper. This patch adds bpf_task_storage_get proto that does not do deadlock detection. It will be used by bpf_lsm and bpf_iter programs. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b04fe3f4342e..ef3f98afa45d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2520,6 +2520,7 @@ extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; -- cgit v1.2.3 From 8a7dac37f27a3dfbd814bf29a73d6417db2c81d9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:22 -0700 Subject: bpf: Add new bpf_task_storage_delete proto with no deadlock detection The bpf_lsm and bpf_iter do not recur that will cause a deadlock. The situation is similar to the bpf_pid_task_storage_delete_elem() which is called from the syscall map_delete_elem. It does not need deadlock detection. Otherwise, it will cause unnecessary failure when calling the bpf_task_storage_delete() helper. This patch adds bpf_task_storage_delete proto that does not do deadlock detection. It will be used by bpf_lsm and bpf_iter program. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-8-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ef3f98afa45d..a5dbac8f5aba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2522,6 +2522,7 @@ extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; -- cgit v1.2.3 From 5e67b8ef125bb6e83bf0f0442ad7ffc09e7956f9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:40 -0700 Subject: bpf: Make struct cgroup btf id global Make struct cgroup btf id global so later patch can reuse the same btf id. Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042840.672602-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 2aea877d644f..c9744efd202f 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -265,5 +265,6 @@ MAX_BTF_TRACING_TYPE, }; extern u32 btf_tracing_ids[]; +extern u32 bpf_cgroup_btf_id[]; #endif -- cgit v1.2.3 From c83597fa5dc6b322e9bdf929e5f4136a3f4aa4db Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:45 -0700 Subject: bpf: Refactor some inode/task/sk storage functions for reuse Refactor codes so that inode/task/sk storage implementation can maximally share the same code. I also added some comments in new function bpf_local_storage_unlink_nolock() to make codes easy to understand. There is no functionality change. Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042845.672944-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 7ea18d4da84b..6d37a40cd90e 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -116,21 +116,22 @@ static struct bpf_local_storage_cache name = { \ .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \ } -u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); -void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, - u16 idx); - /* Helper functions for bpf_local_storage */ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); -struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); +struct bpf_map * +bpf_local_storage_map_alloc(union bpf_attr *attr, + struct bpf_local_storage_cache *cache); struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit); -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, +bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage); + +void bpf_local_storage_map_free(struct bpf_map *map, + struct bpf_local_storage_cache *cache, int __percpu *busy_counter); int bpf_local_storage_map_check_btf(const struct bpf_map *map, @@ -141,10 +142,6 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem, - bool uncharge_omem, bool use_trace_rcu); - void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); void bpf_selem_link_map(struct bpf_local_storage_map *smap, -- cgit v1.2.3 From c4bcfb38a95edb1021a53f2d0356a78120ecfbe4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:50 -0700 Subject: bpf: Implement cgroup storage available to non-cgroup-attached bpf progs Similar to sk/inode/task storage, implement similar cgroup local storage. There already exists a local storage implementation for cgroup-attached bpf programs. See map type BPF_MAP_TYPE_CGROUP_STORAGE and helper bpf_get_local_storage(). But there are use cases such that non-cgroup attached bpf progs wants to access cgroup local storage data. For example, tc egress prog has access to sk and cgroup. It is possible to use sk local storage to emulate cgroup local storage by storing data in socket. But this is a waste as it could be lots of sockets belonging to a particular cgroup. Alternatively, a separate map can be created with cgroup id as the key. But this will introduce additional overhead to manipulate the new map. A cgroup local storage, similar to existing sk/inode/task storage, should help for this use case. The life-cycle of storage is managed with the life-cycle of the cgroup struct. i.e. the storage is destroyed along with the owning cgroup with a call to bpf_cgrp_storage_free() when cgroup itself is deleted. The userspace map operations can be done by using a cgroup fd as a key passed to the lookup, update and delete operations. Typically, the following code is used to get the current cgroup: struct task_struct *task = bpf_get_current_task_btf(); ... task->cgroups->dfl_cgrp ... and in structure task_struct definition: struct task_struct { .... struct css_set __rcu *cgroups; .... } With sleepable program, accessing task->cgroups is not protected by rcu_read_lock. So the current implementation only supports non-sleepable program and supporting sleepable program will be the next step together with adding rcu_read_lock protection for rcu tagged structures. Since map name BPF_MAP_TYPE_CGROUP_STORAGE has been used for old cgroup local storage support, the new map name BPF_MAP_TYPE_CGRP_STORAGE is used for cgroup storage available to non-cgroup-attached bpf programs. The old cgroup storage supports bpf_get_local_storage() helper to get the cgroup data. The new cgroup storage helper bpf_cgrp_storage_get() can provide similar functionality. While old cgroup storage pre-allocates storage memory, the new mechanism can also pre-allocate with a user space bpf_map_update_elem() call to avoid potential run-time memory allocation failure. Therefore, the new cgroup storage can provide all functionality w.r.t. the old one. So in uapi bpf.h, the old BPF_MAP_TYPE_CGROUP_STORAGE is alias to BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED to indicate the old cgroup storage can be deprecated since the new one can provide the same functionality. Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042850.673791-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++++ include/linux/bpf_types.h | 1 + include/linux/cgroup-defs.h | 4 ++++ include/uapi/linux/bpf.h | 50 ++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a5dbac8f5aba..9fd68b0b3e9c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2041,6 +2041,7 @@ struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); +void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, @@ -2295,6 +2296,10 @@ static inline bool has_current_bpf_ctx(void) static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) { } + +static inline void bpf_cgrp_storage_free(struct cgroup *cgroup) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, @@ -2535,6 +2540,8 @@ extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; +extern const struct bpf_func_proto bpf_cgrp_storage_get_proto; +extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2c6a4f2562a7..d4ee3ccd3753 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -86,6 +86,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PROG_ARRAY, prog_array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) #ifdef CONFIG_CGROUPS BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_CGRP_STORAGE, cgrp_storage_map_ops) #endif #ifdef CONFIG_CGROUP_BPF BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8f481d1b159a..c466fdc3a32a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -504,6 +504,10 @@ struct cgroup { /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; +#ifdef CONFIG_BPF_SYSCALL + struct bpf_local_storage __rcu *bpf_cgrp_storage; +#endif + /* All ancestors including self */ struct cgroup *ancestors[]; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 17f61338f8f8..94659f6b3395 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -922,7 +922,14 @@ enum bpf_map_type { BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, - BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching + * to a cgroup. The newer BPF_MAP_TYPE_CGRP_STORAGE is available to + * both cgroup-attached and other progs and supports all functionality + * provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark + * BPF_MAP_TYPE_CGROUP_STORAGE deprecated. + */ + BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_QUEUE, @@ -935,6 +942,7 @@ enum bpf_map_type { BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, + BPF_MAP_TYPE_CGRP_STORAGE, }; /* Note that tracing related programs such as @@ -5435,6 +5443,44 @@ union bpf_attr { * **-E2BIG** if user-space has tried to publish a sample which is * larger than the size of the ring buffer, or which cannot fit * within a struct bpf_dynptr. + * + * void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup) + * Description + * Delete a bpf_local_storage from a *cgroup*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5647,6 +5693,8 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ FN(ktime_get_tai_ns, 208, ##ctx) \ FN(user_ringbuf_drain, 209, ##ctx) \ + FN(cgrp_storage_get, 210, ##ctx) \ + FN(cgrp_storage_delete, 211, ##ctx) \ /* */ /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't -- cgit v1.2.3 From 28581b9c2c94cc912354eadc98c1146fdc7092e6 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 25 Oct 2022 13:53:00 +0300 Subject: bond: Disable TLS features indication Bond agnostically interacts with TLS device-offload requests via the .ndo_sk_get_lower_dev operation. Return value is true iff bond guarantees fixed mapping between the TLS connection and a lower netdev. Due to this nature, the bond TLS device offload features are not explicitly controllable in the bond layer. As of today, these are read-only values based on the evaluation of bond_sk_check(). However, this indication might be incorrect and misleading, when the feature bits are "fixed" by some dependency features. For example, NETIF_F_HW_TLS_TX/RX are forcefully cleared in case the corresponding checksum offload is disabled. But in fact the bond ability to still offload TLS connections to the lower device is not hurt. This means that these bits can not be trusted, and hence better become unused. This patch revives some old discussion [1] and proposes a much simpler solution: Clear the bond's TLS features bits. Everyone should stop reading them. [1] https://lore.kernel.org/netdev/20210526095747.22446-1-tariqt@nvidia.com/ Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20221025105300.4718-1-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- include/net/bonding.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index e999f851738b..ea36ab7f9e72 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -92,8 +92,6 @@ #define BOND_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) -#define BOND_TLS_FEATURES (NETIF_F_HW_TLS_TX | NETIF_F_HW_TLS_RX) - #ifdef CONFIG_NET_POLL_CONTROLLER extern atomic_t netpoll_block_tx; @@ -280,8 +278,6 @@ struct bond_vlan_tag { unsigned short vlan_id; }; -bool bond_sk_check(struct bonding *bond); - /** * Returns NULL if the net_device does not belong to any of the bond's slaves * -- cgit v1.2.3 From e753df8fbca592d36f539ed950fcdf412166c549 Mon Sep 17 00:00:00 2001 From: Michal Jaron Date: Tue, 25 Oct 2022 09:12:52 -0700 Subject: ice: Add support Flex RXD Add new VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC flag, opcode VIRTCHNL_OP_GET_SUPPORTED_RXDIDS and add member rxdid in struct virtchnl_rxq_info to support AVF Flex RXD extension. Add support to allow VF to query flexible descriptor RXDIDs supported by DDP package and configure Rx queues with selected RXDID for IAVF. Add code to allow VIRTCHNL_OP_GET_SUPPORTED_RXDIDS message to be processed. Add necessary macros for registers. Signed-off-by: Leyi Rong Signed-off-by: Xu Ting Signed-off-by: Michal Jaron Signed-off-by: Mateusz Palczewski Tested-by: Maxime Coquelin Tested-by: Konrad Jankowski Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20221025161252.1952939-1-jacob.e.keller@intel.com Signed-off-by: Paolo Abeni --- include/linux/avf/virtchnl.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 2ce27e8e4f19..d91af50ac58d 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -136,7 +136,8 @@ enum virtchnl_ops { VIRTCHNL_OP_DISABLE_CHANNELS = 31, VIRTCHNL_OP_ADD_CLOUD_FILTER = 32, VIRTCHNL_OP_DEL_CLOUD_FILTER = 33, - /* opcode 34 - 44 are reserved */ + /* opcode 34 - 43 are reserved */ + VIRTCHNL_OP_GET_SUPPORTED_RXDIDS = 44, VIRTCHNL_OP_ADD_RSS_CFG = 45, VIRTCHNL_OP_DEL_RSS_CFG = 46, VIRTCHNL_OP_ADD_FDIR_FILTER = 47, @@ -263,6 +264,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM BIT(22) #define VIRTCHNL_VF_OFFLOAD_ADQ BIT(23) #define VIRTCHNL_VF_OFFLOAD_USO BIT(25) +#define VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC BIT(26) #define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF BIT(27) #define VIRTCHNL_VF_OFFLOAD_FDIR_PF BIT(28) @@ -318,7 +320,9 @@ struct virtchnl_rxq_info { u16 splithdr_enabled; /* deprecated with AVF 1.0 */ u32 databuffer_size; u32 max_pkt_size; - u32 pad1; + u8 pad0; + u8 rxdid; + u8 pad1[2]; u64 dma_ring_addr; enum virtchnl_rx_hsplit rx_split_pos; /* deprecated with AVF 1.0 */ u32 pad2; @@ -970,6 +974,10 @@ struct virtchnl_filter { VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); +struct virtchnl_supported_rxdids { + u64 supported_rxdids; +}; + /* VIRTCHNL_OP_EVENT * PF sends this message to inform the VF driver of events that may affect it. * No direct response is expected from the VF, though it may generate other @@ -1499,6 +1507,8 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_DEL_CLOUD_FILTER: valid_len = sizeof(struct virtchnl_filter); break; + case VIRTCHNL_OP_GET_SUPPORTED_RXDIDS: + break; case VIRTCHNL_OP_ADD_RSS_CFG: case VIRTCHNL_OP_DEL_RSS_CFG: valid_len = sizeof(struct virtchnl_rss_cfg); -- cgit v1.2.3 From bd456f283b66704920fae8e655ebc769cb743420 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:11 +0000 Subject: tcp: add sysctls for TCP PLB parameters PLB (Protective Load Balancing) is a host based mechanism for load balancing across switch links. It leverages congestion signals(e.g. ECN) from transport layer to randomly change the path of the connection experiencing congestion. PLB changes the path of the connection by changing the outgoing IPv6 flow label for IPv6 connections (implemented in Linux by calling sk_rethink_txhash()). Because of this implementation mechanism, PLB can currently only work for IPv6 traffic. For more information, see the SIGCOMM 2022 paper: https://doi.org/10.1145/3544216.3544226 This commit adds new sysctl knobs and sets their default values for TCP PLB. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1b8004679445..25f90bba4889 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -183,6 +183,11 @@ struct netns_ipv4 { unsigned long tfo_active_disable_stamp; u32 tcp_challenge_timestamp; u32 tcp_challenge_count; + u8 sysctl_tcp_plb_enabled; + u8 sysctl_tcp_plb_idle_rehash_rounds; + u8 sysctl_tcp_plb_rehash_rounds; + u8 sysctl_tcp_plb_suspend_rto_sec; + int sysctl_tcp_plb_cong_thresh; int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; -- cgit v1.2.3 From 1a91bb7c3ebf95e908ec33220defbcda1ecc072f Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:12 +0000 Subject: tcp: add PLB functionality for TCP Congestion control algorithms track PLB state and cause the connection to trigger a path change when either of the 2 conditions is satisfied: - No packets are in flight and (# consecutive congested rounds >= sysctl_tcp_plb_idle_rehash_rounds) - (# consecutive congested rounds >= sysctl_tcp_plb_rehash_rounds) A round (RTT) is marked as congested when congestion signal (ECN ce_ratio) over an RTT is greater than sysctl_tcp_plb_cong_thresh. In the event of RTO, PLB (via tcp_write_timeout()) triggers a path change and disables congestion-triggered path changes for random time between (sysctl_tcp_plb_suspend_rto_sec, 2*sysctl_tcp_plb_suspend_rto_sec) to avoid hopping onto the "connectivity blackhole". RTO-triggered path changes can still happen during this cool-off period. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 14d45661a84d..6b814e788f00 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2140,6 +2140,34 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, extern void tcp_rack_reo_timeout(struct sock *sk); extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); +/* tcp_plb.c */ + +/* + * Scaling factor for fractions in PLB. For example, tcp_plb_update_state + * expects cong_ratio which represents fraction of traffic that experienced + * congestion over a single RTT. In order to avoid floating point operations, + * this fraction should be mapped to (1 << TCP_PLB_SCALE) and passed in. + */ +#define TCP_PLB_SCALE 8 + +/* State for PLB (Protective Load Balancing) for a single TCP connection. */ +struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +}; + +static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +{ + plb->consec_cong_rounds = 0; + plb->pause_until = 0; +} +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio); +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); + /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { -- cgit v1.2.3 From 29c1c44646aec5d5134f2365259a84becc1ee7d3 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:14 +0000 Subject: tcp: add u32 counter in tcp_sock and an SNMP counter for PLB A u32 counter is added to tcp_sock for counting the number of PLB triggered rehashes for a TCP connection. An SNMP counter is also added to count overall PLB triggered rehash events for a host. These counters are hooked up to PLB implementation for DCTCP. TCP_NLA_REHASH is added to SCM_TIMESTAMPING_OPT_STATS that reports the rehash attempts triggered due to PLB or timeouts. This gives a historical view of sustained congestion or timeouts experienced by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/uapi/linux/snmp.h | 1 + include/uapi/linux/tcp.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 41b1da621a45..ca7f05a130d2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -423,6 +423,7 @@ struct tcp_sock { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; + u32 plb_rehash; /* PLB-triggered rehash attempts */ u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 4d7470036a8b..6600cb0164c2 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -292,6 +292,7 @@ enum LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */ LINUX_MIB_TCPMIGRATEREQSUCCESS, /* TCPMigrateReqSuccess */ LINUX_MIB_TCPMIGRATEREQFAILURE, /* TCPMigrateReqFailure */ + LINUX_MIB_TCPPLBREHASH, /* TCPPLBRehash */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8fc09e8638b3..c9abe86eda5f 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -315,6 +315,7 @@ enum { TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ TCP_NLA_TTL, /* TTL or hop limit of a packet received */ + TCP_NLA_REHASH, /* PLB and timeout triggered rehash attempts */ }; /* for TCP_MD5SIG socket option */ -- cgit v1.2.3 From 71fc704768f601ed3fa36310822a5e03f310f781 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:15 +0000 Subject: tcp: add rcv_wnd and plb_rehash to TCP_INFO rcv_wnd can be useful to diagnose TCP performance where receiver window becomes the bottleneck. rehash reports the PLB and timeout triggered rehash attempts by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index c9abe86eda5f..879eeb0a084b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -284,6 +284,11 @@ struct tcp_info { __u32 tcpi_snd_wnd; /* peer's advertised receive window after * scaling (bytes) */ + __u32 tcpi_rcv_wnd; /* local advertised receive window after + * scaling (bytes) + */ + + __u32 tcpi_rehash; /* PLB or timeout triggered rehash attempts */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ -- cgit v1.2.3 From 9c5a170677c3c8facc83e931a57f4c99c0511ae0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Oct 2022 14:10:37 +0100 Subject: net: phylink: add phylink_get_link_timer_ns() helper Add a helper to convert the PHY interface mode to the required link timer setting as stated by the appropriate standard. Inappropriate interface modes return an error. Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 63800bf4a7ac..1df3e5e316e8 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -616,6 +616,30 @@ int phylink_speed_up(struct phylink *pl); void phylink_set_port_modes(unsigned long *bits); +/** + * phylink_get_link_timer_ns - return the PCS link timer value + * @interface: link &typedef phy_interface_t mode + * + * Return the PCS link timer setting in nanoseconds for the PHY @interface + * mode, or -EINVAL if not appropriate. + */ +static inline int phylink_get_link_timer_ns(phy_interface_t interface) +{ + switch (interface) { + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + case PHY_INTERFACE_MODE_USXGMII: + return 1600000; + + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_2500BASEX: + return 10000000; + + default: + return -EINVAL; + } +} + void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, u16 bmsr, u16 lpa); void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, -- cgit v1.2.3 From 17dd361119e5bc4cfb85aed660674a846b613817 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Oct 2022 14:21:16 +0100 Subject: net: sfp: convert register indexes from hex to decimal The register indexes in the standards are in decimal rather than hex, so lets specify them in decimal in the header file so we can easily cross-reference without converting between hex and decimal. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/sfp.h | 180 ++++++++++++++++++++++++++-------------------------- 1 file changed, 90 insertions(+), 90 deletions(-) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 01ae9f1dd2ad..4e2db155664d 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -332,37 +332,37 @@ enum { /* SFP EEPROM registers */ enum { - SFP_PHYS_ID = 0x00, - SFP_PHYS_EXT_ID = 0x01, - SFP_CONNECTOR = 0x02, - SFP_COMPLIANCE = 0x03, - SFP_ENCODING = 0x0b, - SFP_BR_NOMINAL = 0x0c, - SFP_RATE_ID = 0x0d, - SFP_LINK_LEN_SM_KM = 0x0e, - SFP_LINK_LEN_SM_100M = 0x0f, - SFP_LINK_LEN_50UM_OM2_10M = 0x10, - SFP_LINK_LEN_62_5UM_OM1_10M = 0x11, - SFP_LINK_LEN_COPPER_1M = 0x12, - SFP_LINK_LEN_50UM_OM4_10M = 0x12, - SFP_LINK_LEN_50UM_OM3_10M = 0x13, - SFP_VENDOR_NAME = 0x14, - SFP_VENDOR_OUI = 0x25, - SFP_VENDOR_PN = 0x28, - SFP_VENDOR_REV = 0x38, - SFP_OPTICAL_WAVELENGTH_MSB = 0x3c, - SFP_OPTICAL_WAVELENGTH_LSB = 0x3d, - SFP_CABLE_SPEC = 0x3c, - SFP_CC_BASE = 0x3f, - SFP_OPTIONS = 0x40, /* 2 bytes, MSB, LSB */ - SFP_BR_MAX = 0x42, - SFP_BR_MIN = 0x43, - SFP_VENDOR_SN = 0x44, - SFP_DATECODE = 0x54, - SFP_DIAGMON = 0x5c, - SFP_ENHOPTS = 0x5d, - SFP_SFF8472_COMPLIANCE = 0x5e, - SFP_CC_EXT = 0x5f, + SFP_PHYS_ID = 0, + SFP_PHYS_EXT_ID = 1, + SFP_CONNECTOR = 2, + SFP_COMPLIANCE = 3, + SFP_ENCODING = 11, + SFP_BR_NOMINAL = 12, + SFP_RATE_ID = 13, + SFP_LINK_LEN_SM_KM = 14, + SFP_LINK_LEN_SM_100M = 15, + SFP_LINK_LEN_50UM_OM2_10M = 16, + SFP_LINK_LEN_62_5UM_OM1_10M = 17, + SFP_LINK_LEN_COPPER_1M = 18, + SFP_LINK_LEN_50UM_OM4_10M = 18, + SFP_LINK_LEN_50UM_OM3_10M = 19, + SFP_VENDOR_NAME = 20, + SFP_VENDOR_OUI = 37, + SFP_VENDOR_PN = 40, + SFP_VENDOR_REV = 56, + SFP_OPTICAL_WAVELENGTH_MSB = 60, + SFP_OPTICAL_WAVELENGTH_LSB = 61, + SFP_CABLE_SPEC = 60, + SFP_CC_BASE = 63, + SFP_OPTIONS = 64, /* 2 bytes, MSB, LSB */ + SFP_BR_MAX = 66, + SFP_BR_MIN = 67, + SFP_VENDOR_SN = 68, + SFP_DATECODE = 84, + SFP_DIAGMON = 92, + SFP_ENHOPTS = 93, + SFP_SFF8472_COMPLIANCE = 94, + SFP_CC_EXT = 95, SFP_PHYS_EXT_ID_SFP = 0x04, SFP_OPTIONS_HIGH_POWER_LEVEL = BIT(13), @@ -404,63 +404,63 @@ enum { /* SFP Diagnostics */ enum { /* Alarm and warnings stored MSB at lower address then LSB */ - SFP_TEMP_HIGH_ALARM = 0x00, - SFP_TEMP_LOW_ALARM = 0x02, - SFP_TEMP_HIGH_WARN = 0x04, - SFP_TEMP_LOW_WARN = 0x06, - SFP_VOLT_HIGH_ALARM = 0x08, - SFP_VOLT_LOW_ALARM = 0x0a, - SFP_VOLT_HIGH_WARN = 0x0c, - SFP_VOLT_LOW_WARN = 0x0e, - SFP_BIAS_HIGH_ALARM = 0x10, - SFP_BIAS_LOW_ALARM = 0x12, - SFP_BIAS_HIGH_WARN = 0x14, - SFP_BIAS_LOW_WARN = 0x16, - SFP_TXPWR_HIGH_ALARM = 0x18, - SFP_TXPWR_LOW_ALARM = 0x1a, - SFP_TXPWR_HIGH_WARN = 0x1c, - SFP_TXPWR_LOW_WARN = 0x1e, - SFP_RXPWR_HIGH_ALARM = 0x20, - SFP_RXPWR_LOW_ALARM = 0x22, - SFP_RXPWR_HIGH_WARN = 0x24, - SFP_RXPWR_LOW_WARN = 0x26, - SFP_LASER_TEMP_HIGH_ALARM = 0x28, - SFP_LASER_TEMP_LOW_ALARM = 0x2a, - SFP_LASER_TEMP_HIGH_WARN = 0x2c, - SFP_LASER_TEMP_LOW_WARN = 0x2e, - SFP_TEC_CUR_HIGH_ALARM = 0x30, - SFP_TEC_CUR_LOW_ALARM = 0x32, - SFP_TEC_CUR_HIGH_WARN = 0x34, - SFP_TEC_CUR_LOW_WARN = 0x36, - SFP_CAL_RXPWR4 = 0x38, - SFP_CAL_RXPWR3 = 0x3c, - SFP_CAL_RXPWR2 = 0x40, - SFP_CAL_RXPWR1 = 0x44, - SFP_CAL_RXPWR0 = 0x48, - SFP_CAL_TXI_SLOPE = 0x4c, - SFP_CAL_TXI_OFFSET = 0x4e, - SFP_CAL_TXPWR_SLOPE = 0x50, - SFP_CAL_TXPWR_OFFSET = 0x52, - SFP_CAL_T_SLOPE = 0x54, - SFP_CAL_T_OFFSET = 0x56, - SFP_CAL_V_SLOPE = 0x58, - SFP_CAL_V_OFFSET = 0x5a, - SFP_CHKSUM = 0x5f, - - SFP_TEMP = 0x60, - SFP_VCC = 0x62, - SFP_TX_BIAS = 0x64, - SFP_TX_POWER = 0x66, - SFP_RX_POWER = 0x68, - SFP_LASER_TEMP = 0x6a, - SFP_TEC_CUR = 0x6c, - - SFP_STATUS = 0x6e, + SFP_TEMP_HIGH_ALARM = 0, + SFP_TEMP_LOW_ALARM = 2, + SFP_TEMP_HIGH_WARN = 4, + SFP_TEMP_LOW_WARN = 6, + SFP_VOLT_HIGH_ALARM = 8, + SFP_VOLT_LOW_ALARM = 10, + SFP_VOLT_HIGH_WARN = 12, + SFP_VOLT_LOW_WARN = 14, + SFP_BIAS_HIGH_ALARM = 16, + SFP_BIAS_LOW_ALARM = 18, + SFP_BIAS_HIGH_WARN = 20, + SFP_BIAS_LOW_WARN = 22, + SFP_TXPWR_HIGH_ALARM = 24, + SFP_TXPWR_LOW_ALARM = 26, + SFP_TXPWR_HIGH_WARN = 28, + SFP_TXPWR_LOW_WARN = 30, + SFP_RXPWR_HIGH_ALARM = 32, + SFP_RXPWR_LOW_ALARM = 34, + SFP_RXPWR_HIGH_WARN = 36, + SFP_RXPWR_LOW_WARN = 38, + SFP_LASER_TEMP_HIGH_ALARM = 40, + SFP_LASER_TEMP_LOW_ALARM = 42, + SFP_LASER_TEMP_HIGH_WARN = 44, + SFP_LASER_TEMP_LOW_WARN = 46, + SFP_TEC_CUR_HIGH_ALARM = 48, + SFP_TEC_CUR_LOW_ALARM = 50, + SFP_TEC_CUR_HIGH_WARN = 52, + SFP_TEC_CUR_LOW_WARN = 54, + SFP_CAL_RXPWR4 = 56, + SFP_CAL_RXPWR3 = 60, + SFP_CAL_RXPWR2 = 64, + SFP_CAL_RXPWR1 = 68, + SFP_CAL_RXPWR0 = 72, + SFP_CAL_TXI_SLOPE = 76, + SFP_CAL_TXI_OFFSET = 78, + SFP_CAL_TXPWR_SLOPE = 80, + SFP_CAL_TXPWR_OFFSET = 82, + SFP_CAL_T_SLOPE = 84, + SFP_CAL_T_OFFSET = 86, + SFP_CAL_V_SLOPE = 88, + SFP_CAL_V_OFFSET = 90, + SFP_CHKSUM = 95, + + SFP_TEMP = 96, + SFP_VCC = 98, + SFP_TX_BIAS = 100, + SFP_TX_POWER = 102, + SFP_RX_POWER = 104, + SFP_LASER_TEMP = 106, + SFP_TEC_CUR = 108, + + SFP_STATUS = 110, SFP_STATUS_TX_DISABLE = BIT(7), SFP_STATUS_TX_DISABLE_FORCE = BIT(6), SFP_STATUS_TX_FAULT = BIT(2), SFP_STATUS_RX_LOS = BIT(1), - SFP_ALARM0 = 0x70, + SFP_ALARM0 = 112, SFP_ALARM0_TEMP_HIGH = BIT(7), SFP_ALARM0_TEMP_LOW = BIT(6), SFP_ALARM0_VCC_HIGH = BIT(5), @@ -470,11 +470,11 @@ enum { SFP_ALARM0_TXPWR_HIGH = BIT(1), SFP_ALARM0_TXPWR_LOW = BIT(0), - SFP_ALARM1 = 0x71, + SFP_ALARM1 = 113, SFP_ALARM1_RXPWR_HIGH = BIT(7), SFP_ALARM1_RXPWR_LOW = BIT(6), - SFP_WARN0 = 0x74, + SFP_WARN0 = 116, SFP_WARN0_TEMP_HIGH = BIT(7), SFP_WARN0_TEMP_LOW = BIT(6), SFP_WARN0_VCC_HIGH = BIT(5), @@ -484,15 +484,15 @@ enum { SFP_WARN0_TXPWR_HIGH = BIT(1), SFP_WARN0_TXPWR_LOW = BIT(0), - SFP_WARN1 = 0x75, + SFP_WARN1 = 117, SFP_WARN1_RXPWR_HIGH = BIT(7), SFP_WARN1_RXPWR_LOW = BIT(6), - SFP_EXT_STATUS = 0x76, + SFP_EXT_STATUS = 118, SFP_EXT_STATUS_PWRLVL_SELECT = BIT(0), - SFP_VSL = 0x78, - SFP_PAGE = 0x7f, + SFP_VSL = 120, + SFP_PAGE = 127, }; struct fwnode_handle; -- cgit v1.2.3 From d83845d224a059165902ecd0e1203015c5713a78 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Oct 2022 14:21:21 +0100 Subject: net: sfp: move field definitions along side register index Just as we do for the A2h enum, arrange the A0h enum to have the field definitions next to their corresponding register index. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/sfp.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 4e2db155664d..52b98f9666a2 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -333,7 +333,10 @@ enum { /* SFP EEPROM registers */ enum { SFP_PHYS_ID = 0, + SFP_PHYS_EXT_ID = 1, + SFP_PHYS_EXT_ID_SFP = 0x04, + SFP_CONNECTOR = 2, SFP_COMPLIANCE = 3, SFP_ENCODING = 11, @@ -354,17 +357,8 @@ enum { SFP_OPTICAL_WAVELENGTH_LSB = 61, SFP_CABLE_SPEC = 60, SFP_CC_BASE = 63, - SFP_OPTIONS = 64, /* 2 bytes, MSB, LSB */ - SFP_BR_MAX = 66, - SFP_BR_MIN = 67, - SFP_VENDOR_SN = 68, - SFP_DATECODE = 84, - SFP_DIAGMON = 92, - SFP_ENHOPTS = 93, - SFP_SFF8472_COMPLIANCE = 94, - SFP_CC_EXT = 95, - SFP_PHYS_EXT_ID_SFP = 0x04, + SFP_OPTIONS = 64, /* 2 bytes, MSB, LSB */ SFP_OPTIONS_HIGH_POWER_LEVEL = BIT(13), SFP_OPTIONS_PAGING_A2 = BIT(12), SFP_OPTIONS_RETIMER = BIT(11), @@ -378,11 +372,20 @@ enum { SFP_OPTIONS_TX_FAULT = BIT(3), SFP_OPTIONS_LOS_INVERTED = BIT(2), SFP_OPTIONS_LOS_NORMAL = BIT(1), + + SFP_BR_MAX = 66, + SFP_BR_MIN = 67, + SFP_VENDOR_SN = 68, + SFP_DATECODE = 84, + + SFP_DIAGMON = 92, SFP_DIAGMON_DDM = BIT(6), SFP_DIAGMON_INT_CAL = BIT(5), SFP_DIAGMON_EXT_CAL = BIT(4), SFP_DIAGMON_RXPWR_AVG = BIT(3), SFP_DIAGMON_ADDRMODE = BIT(2), + + SFP_ENHOPTS = 93, SFP_ENHOPTS_ALARMWARN = BIT(7), SFP_ENHOPTS_SOFT_TX_DISABLE = BIT(6), SFP_ENHOPTS_SOFT_TX_FAULT = BIT(5), @@ -390,6 +393,8 @@ enum { SFP_ENHOPTS_SOFT_RATE_SELECT = BIT(3), SFP_ENHOPTS_APP_SELECT_SFF8079 = BIT(2), SFP_ENHOPTS_SOFT_RATE_SFF8431 = BIT(1), + + SFP_SFF8472_COMPLIANCE = 94, SFP_SFF8472_COMPLIANCE_NONE = 0x00, SFP_SFF8472_COMPLIANCE_REV9_3 = 0x01, SFP_SFF8472_COMPLIANCE_REV9_5 = 0x02, @@ -399,6 +404,8 @@ enum { SFP_SFF8472_COMPLIANCE_REV11_3 = 0x06, SFP_SFF8472_COMPLIANCE_REV11_4 = 0x07, SFP_SFF8472_COMPLIANCE_REV12_0 = 0x08, + + SFP_CC_EXT = 95, }; /* SFP Diagnostics */ -- cgit v1.2.3 From 58ba426388d9fe56aa638f555b01d6e63cada88c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 27 Oct 2022 17:10:14 -0400 Subject: net/packet: add PACKET_FANOUT_FLAG_IGNORE_OUTGOING Extend packet socket option PACKET_IGNORE_OUTGOING to fanout groups. The socket option sets ptype.ignore_outgoing, which makes dev_queue_xmit_nit skip the socket. When the socket joins a fanout group, the option is not reflected in the struct ptype of the group. dev_queue_xmit_nit only tests the fanout ptype, so the flag is ignored once a socket joins a fanout group. Inheriting the option from a socket would change established behavior. Different sockets in the group can set different flags, and can also change them at runtime. Testing in packet_rcv_fanout defeats the purpose of the original patch, which is to avoid skb_clone in dev_queue_xmit_nit (esp. for MSG_ZEROCOPY packets). Instead, introduce a new fanout group flag with the same behavior. Tested with https://github.com/wdebruij/kerneltools/blob/master/tests/test_psock_fanout_ignore_outgoing.c Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20221027211014.3581513-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_packet.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index c07caf7b40db..a8516b3594a4 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -70,6 +70,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_EBPF 7 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 #define PACKET_FANOUT_FLAG_UNIQUEID 0x2000 +#define PACKET_FANOUT_FLAG_IGNORE_OUTGOING 0x4000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 struct tpacket_stats { -- cgit v1.2.3 From f3fb589aeb88c5bf7a7a804d36a8fa219b72e290 Mon Sep 17 00:00:00 2001 From: Juhee Kang Date: Fri, 28 Oct 2022 01:04:24 +0900 Subject: net: remove unused netdev_unregistering() Currently, use dev->reg_state == NETREG_UNREGISTERING to check the status which is NETREG_UNREGISTERING, rather than using netdev_unregistering. Also, A helper function which is netdev_unregistering on nedevice.h is no longer used. Thus, netdev_unregistering removes from netdevice.h. Signed-off-by: Juhee Kang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eddf8ee270e7..99e58b773266 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5101,11 +5101,6 @@ static inline const char *netdev_name(const struct net_device *dev) return dev->name; } -static inline bool netdev_unregistering(const struct net_device *dev) -{ - return dev->reg_state == NETREG_UNREGISTERING; -} - static inline const char *netdev_reg_state(const struct net_device *dev) { switch (dev->reg_state) { -- cgit v1.2.3 From 738136a0e3757a8534df3ad97d6ff6d7f429f6c1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Oct 2022 14:25:53 -0700 Subject: netlink: split up copies in the ack construction Clean up the use of unsafe_memcpy() by adding a flexible array at the end of netlink message header and splitting up the header and data copies. Reviewed-by: Kees Cook Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netlink.h | 21 +++++++++++++++++++++ include/uapi/linux/netlink.h | 2 ++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 4418b1981e31..784b4688fc6f 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -931,6 +931,27 @@ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 se return __nlmsg_put(skb, portid, seq, type, payload, flags); } +/** + * nlmsg_append - Add more data to a nlmsg in a skb + * @skb: socket buffer to store message in + * @size: length of message payload + * + * Append data to an existing nlmsg, used when constructing a message + * with multiple fixed-format headers (which is rare). + * Returns NULL if the tailroom of the skb is insufficient to store + * the extra payload. + */ +static inline void *nlmsg_append(struct sk_buff *skb, u32 size) +{ + if (unlikely(skb_tailroom(skb) < NLMSG_ALIGN(size))) + return NULL; + + if (NLMSG_ALIGN(size) - size) + memset(skb_tail_pointer(skb) + size, 0, + NLMSG_ALIGN(size) - size); + return __skb_put(skb, NLMSG_ALIGN(size)); +} + /** * nlmsg_put_answer - Add a new callback based netlink message to an skb * @skb: socket buffer to store message in diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index e2ae82e3f9f7..5da0da59bf01 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -48,6 +48,7 @@ struct sockaddr_nl { * @nlmsg_flags: Additional flags * @nlmsg_seq: Sequence number * @nlmsg_pid: Sending process port ID + * @nlmsg_data: Message payload */ struct nlmsghdr { __u32 nlmsg_len; @@ -55,6 +56,7 @@ struct nlmsghdr { __u16 nlmsg_flags; __u32 nlmsg_seq; __u32 nlmsg_pid; + __u8 nlmsg_data[]; }; /* Flags values */ -- cgit v1.2.3 From 8c2a535e089b2ab82cf50c876bd10c8ed33252c9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Oct 2022 20:52:59 -0700 Subject: net: geneve: fix array of flexible structures warnings New compilers don't like flexible array of flexible structs: include/net/geneve.h:62:34: warning: array of flexible structures Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/geneve.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/geneve.h b/include/net/geneve.h index bced0b1d9fe4..5c96827a487e 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -59,7 +59,7 @@ struct genevehdr { __be16 proto_type; u8 vni[3]; u8 rsvd2; - struct geneve_opt options[]; + u8 options[]; }; static inline bool netif_is_geneve(const struct net_device *dev) -- cgit v1.2.3 From b9a61b97798ca6d0c856a217e61e2177bd8f6eae Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 28 Oct 2022 04:04:12 -0700 Subject: ptp: add missing documentation for parameters The ptp_find_pin_unlocked function and the ptp_system_timestamp structure didn't document their parameters and fields. Fix this. Signed-off-by: Jacob Keller Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 92b44161408e..ad4aaadc2f7a 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -45,6 +45,8 @@ struct system_device_crosststamp; /** * struct ptp_system_timestamp - system time corresponding to a PHC timestamp + * @pre_ts: system timestamp before capturing PHC + * @post_ts: system timestamp after capturing PHC */ struct ptp_system_timestamp { struct timespec64 pre_ts; @@ -316,6 +318,11 @@ int ptp_find_pin(struct ptp_clock *ptp, * should most likely call ptp_find_pin() directly from their * ptp_clock_info::enable() method. * +* @ptp: The clock obtained from ptp_clock_register(). +* @func: One of the ptp_pin_function enumerated values. +* @chan: The particular functional channel to find. +* Return: Pin index in the range of zero to ptp_clock_caps.n_pins - 1, +* or -1 if the auxiliary function cannot be found. */ int ptp_find_pin_unlocked(struct ptp_clock *ptp, -- cgit v1.2.3 From 1060707e380994e7c9b754b6eb74f25459b4a5b3 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 28 Oct 2022 04:04:13 -0700 Subject: ptp: introduce helpers to adjust by scaled parts per million Many drivers implement the .adjfreq or .adjfine PTP op function with the same basic logic: 1. Determine a base frequency value 2. Multiply this by the abs() of the requested adjustment, then divide by the appropriate divisor (1 billion, or 65,536 billion). 3. Add or subtract this difference from the base frequency to calculate a new adjustment. A few drivers need the difference and direction rather than the combined new increment value. I recently converted the Intel drivers to .adjfine and the scaled parts per million (65.536 parts per billion) logic. To avoid overflow with minimal loss of precision, mul_u64_u64_div_u64 was used. The basic logic used by all of these drivers is very similar, and leads to a lot of duplicate code to perform the same task. Rather than keep this duplicate code, introduce diff_by_scaled_ppm and adjust_by_scaled_ppm. These helper functions calculate the difference or adjustment necessary based on the scaled parts per million input. The diff_by_scaled_ppm function returns true if the difference should be subtracted, and false otherwise. Update the Intel drivers to use the new helper functions. Other vendor drivers will be converted to .adjfine and this helper function in the following changes. Signed-off-by: Jacob Keller Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index ad4aaadc2f7a..f4781c5766d6 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -248,6 +248,52 @@ static inline long scaled_ppm_to_ppb(long ppm) return (long)ppb; } +/** + * diff_by_scaled_ppm - Calculate difference using scaled ppm + * @base: the base increment value to adjust + * @scaled_ppm: scaled parts per million to adjust by + * @diff: on return, the absolute value of calculated diff + * + * Calculate the difference to adjust the base increment using scaled parts + * per million. + * + * Use mul_u64_u64_div_u64 to perform the difference calculation in avoid + * possible overflow. + * + * Returns: true if scaled_ppm is negative, false otherwise + */ +static inline bool diff_by_scaled_ppm(u64 base, long scaled_ppm, u64 *diff) +{ + bool negative = false; + + if (scaled_ppm < 0) { + negative = true; + scaled_ppm = -scaled_ppm; + } + + *diff = mul_u64_u64_div_u64(base, (u64)scaled_ppm, 1000000ULL << 16); + + return negative; +} + +/** + * adjust_by_scaled_ppm - Adjust a base increment by scaled parts per million + * @base: the base increment value to adjust + * @scaled_ppm: scaled parts per million frequency adjustment + * + * Helper function which calculates a new increment value based on the + * requested scaled parts per million adjustment. + */ +static inline u64 adjust_by_scaled_ppm(u64 base, long scaled_ppm) +{ + u64 diff; + + if (diff_by_scaled_ppm(base, scaled_ppm, &diff)) + return base - diff; + + return base + diff; +} + #if IS_ENABLED(CONFIG_PTP_1588_CLOCK) /** -- cgit v1.2.3 From 1d997f1013079c05b642c739901e3584a3ae558d Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 28 Oct 2022 04:42:21 -0400 Subject: rtnetlink: pass netlink message header and portid to rtnl_configure_link() This patch pass netlink message header and portid to rtnl_configure_link() All the functions in this call chain need to add the parameters so we can use them in the last call rtnl_notify(), and notify the userspace about the new link info if NLM_F_ECHO flag is set. - rtnl_configure_link() - __dev_notify_flags() - rtmsg_ifinfo() - rtmsg_ifinfo_event() - rtmsg_ifinfo_build_skb() - rtmsg_ifinfo_send() - rtnl_notify() Also move __dev_notify_flags() declaration to net/core/dev.h, as Jakub suggested. Signed-off-by: Hangbin Liu Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- include/linux/rtnetlink.h | 9 +++++---- include/net/netlink.h | 11 +++++++++++ include/net/rtnetlink.h | 3 ++- 4 files changed, 18 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 99e58b773266..4b5052db978f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3855,8 +3855,6 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); -void __dev_notify_flags(struct net_device *, unsigned int old_flags, - unsigned int gchanges); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index ae2c6a3cec5d..92ad75549e9c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -12,21 +12,22 @@ extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, - u32 group, struct nlmsghdr *nlh, gfp_t flags); + u32 group, const struct nlmsghdr *nlh, gfp_t flags); extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, + u32 portid, const struct nlmsghdr *nlh); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex); + int new_ifindex, u32 portid, u32 seq); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, - gfp_t flags); + gfp_t flags, u32 portid, const struct nlmsghdr *nlh); /* RTNL is used as a global lock for all changes to network configuration */ diff --git a/include/net/netlink.h b/include/net/netlink.h index 784b4688fc6f..464e2e026f7b 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -899,6 +899,17 @@ static inline int nlmsg_report(const struct nlmsghdr *nlh) return nlh ? !!(nlh->nlmsg_flags & NLM_F_ECHO) : 0; } +/** + * nlmsg_seq - return the seq number of netlink message + * @nlh: netlink message header + * + * Returns 0 if netlink message is NULL + */ +static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) +{ + return nlh ? nlh->nlmsg_seq : 0; +} + /** * nlmsg_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bf8bb3357825..cd94f65dc2a9 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -187,7 +187,8 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, struct nlattr *tb[], struct netlink_ext_ack *extack); int rtnl_delete_link(struct net_device *dev); -int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm); +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, + u32 portid, const struct nlmsghdr *nlh); int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, struct netlink_ext_ack *exterr); -- cgit v1.2.3 From f3a63cce1b4fbde7738395c5a2dea83f05de3407 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 28 Oct 2022 04:42:24 -0400 Subject: rtnetlink: Honour NLM_F_ECHO flag in rtnl_delete_link This patch use the new helper unregister_netdevice_many_notify() for rtnl_delete_link(), so that the kernel could reply unicast when userspace set NLM_F_ECHO flag to request the new created interface info. At the same time, the parameters of rtnl_delete_link() need to be updated since we need nlmsghdr and portid info. Suggested-by: Guillaume Nault Signed-off-by: Hangbin Liu Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index cd94f65dc2a9..d9076a7a430c 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -186,7 +186,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack); -int rtnl_delete_link(struct net_device *dev); +int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh); int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh); -- cgit v1.2.3 From 0e84afe8ebfbb9eade3f4f6de4720887bf908e26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:16 +0000 Subject: net: dropreason: add SKB_CONSUMED reason This will allow to simply use in the future: kfree_skb_reason(skb, reason); Instead of repeating sequences like: if (dropped) kfree_skb_reason(skb, reason); else consume_skb(skb); For instance, following patch in the series is adding @reason to skb_release_data() and skb_release_all(), so that we can propagate a meaningful @reason whenever consume_skb()/kfree_skb() have to take care of a potential frag_list. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index c1cbcdbaf149..0bd18c14dae0 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -80,6 +80,8 @@ enum skb_drop_reason { * @SKB_NOT_DROPPED_YET: skb is not dropped yet (used for no-drop case) */ SKB_NOT_DROPPED_YET = 0, + /** @SKB_CONSUMED: packet has been consumed */ + SKB_CONSUMED, /** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */ SKB_DROP_REASON_NOT_SPECIFIED, /** @SKB_DROP_REASON_NO_SOCKET: socket not found */ -- cgit v1.2.3 From 4ecbb1c27c363686d11a241cd682a454a8454c2b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:18 +0000 Subject: net: dropreason: add SKB_DROP_REASON_DUP_FRAG This is used to track when a duplicate segment received by various reassembly units is dropped. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 0bd18c14dae0..602d555a5f83 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -68,6 +68,7 @@ FN(IP_INADDRERRORS) \ FN(IP_INNOROUTES) \ FN(PKT_TOO_BIG) \ + FN(DUP_FRAG) \ FNe(MAX) /** @@ -300,6 +301,8 @@ enum skb_drop_reason { * MTU) */ SKB_DROP_REASON_PKT_TOO_BIG, + /** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */ + SKB_DROP_REASON_DUP_FRAG, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' -- cgit v1.2.3 From 77adfd3a1d44c4730fd2af99b497e04ddc2b5837 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:19 +0000 Subject: net: dropreason: add SKB_DROP_REASON_FRAG_REASM_TIMEOUT Used to track skbs freed after a timeout happened in a reassmbly unit. Passing a @reason argument to inet_frag_rbtree_purge() allows to use correct consumed status for frags that have been successfully re-assembled. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 3 +++ include/net/inet_frag.h | 6 +++++- include/net/ipv6_frag.h | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 602d555a5f83..1d45a74148c3 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -69,6 +69,7 @@ FN(IP_INNOROUTES) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ + FN(FRAG_REASM_TIMEOUT) \ FNe(MAX) /** @@ -303,6 +304,8 @@ enum skb_drop_reason { SKB_DROP_REASON_PKT_TOO_BIG, /** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */ SKB_DROP_REASON_DUP_FRAG, + /** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */ + SKB_DROP_REASON_FRAG_REASM_TIMEOUT, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 0b0876610553..b23ddec3cd5c 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -7,6 +7,7 @@ #include #include #include +#include /* Per netns frag queues directory */ struct fqdir { @@ -34,12 +35,14 @@ struct fqdir { * @INET_FRAG_LAST_IN: final fragment has arrived * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable + * @INET_FRAG_DROP: if skbs must be dropped (instead of being consumed) */ enum { INET_FRAG_FIRST_IN = BIT(0), INET_FRAG_LAST_IN = BIT(1), INET_FRAG_COMPLETE = BIT(2), INET_FRAG_HASH_DEAD = BIT(3), + INET_FRAG_DROP = BIT(4), }; struct frag_v4_compare_key { @@ -139,7 +142,8 @@ void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); /* Free all skbs in the queue; return the sum of their truesizes. */ -unsigned int inet_frag_rbtree_purge(struct rb_root *root); +unsigned int inet_frag_rbtree_purge(struct rb_root *root, + enum skb_drop_reason reason); static inline void inet_frag_put(struct inet_frag_queue *q) { diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 5052c66e22d2..7321ffe3a108 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -76,6 +76,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) if (fq->q.flags & INET_FRAG_COMPLETE) goto out; + fq->q.flags |= INET_FRAG_DROP; inet_frag_kill(&fq->q); dev = dev_get_by_index_rcu(net, fq->iif); @@ -101,7 +102,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) spin_unlock(&fq->q.lock); icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); - kfree_skb(head); + kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); goto out_rcu_unlock; out: -- cgit v1.2.3 From 3bdfb04f13ebdd4ae50fc5dc595663874781e48c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:20 +0000 Subject: net: dropreason: add SKB_DROP_REASON_FRAG_TOO_FAR IPv4 reassembly unit can decide to drop frags based on /proc/sys/net/ipv4/ipfrag_max_dist sysctl. Add a specific drop reason to track this specific and weird case. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 1d45a74148c3..70539288f995 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -70,6 +70,7 @@ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ + FN(FRAG_TOO_FAR) \ FNe(MAX) /** @@ -306,6 +307,11 @@ enum skb_drop_reason { SKB_DROP_REASON_DUP_FRAG, /** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */ SKB_DROP_REASON_FRAG_REASM_TIMEOUT, + /** + * @SKB_DROP_REASON_FRAG_TOO_FAR: ipv4 fragment too far. + * (/proc/sys/net/ipv4/ipfrag_max_dist) + */ + SKB_DROP_REASON_FRAG_TOO_FAR, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' -- cgit v1.2.3 From d08b0f8f46e45a274fc8c9a5bc92cb9da70d9887 Mon Sep 17 00:00:00 2001 From: Shane Parslow Date: Sat, 29 Oct 2022 02:03:56 -0700 Subject: net: wwan: iosm: add rpc interface for xmm modems Add a new iosm wwan port that connects to the modem rpc interface. This interface provides a configuration channel, and in the case of the 7360, is the only way to configure the modem (as it does not support mbim). The new interface is compatible with existing software, such as open_xdatachannel.py from the xmm7360-pci project [1]. [1] https://github.com/xmm7360/xmm7360-pci Signed-off-by: Shane Parslow Reviewed-by: Loic Poulain Signed-off-by: David S. Miller --- include/linux/wwan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 5ce2acf444fb..24d76500b1cc 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -15,6 +15,7 @@ * @WWAN_PORT_QMI: Qcom modem/MSM interface for modem control * @WWAN_PORT_QCDM: Qcom Modem diagnostic interface * @WWAN_PORT_FIREHOSE: XML based command protocol + * @WWAN_PORT_XMMRPC: Control protocol for Intel XMM modems * * @WWAN_PORT_MAX: Highest supported port types * @WWAN_PORT_UNKNOWN: Special value to indicate an unknown port type @@ -26,6 +27,7 @@ enum wwan_port_type { WWAN_PORT_QMI, WWAN_PORT_QCDM, WWAN_PORT_FIREHOSE, + WWAN_PORT_XMMRPC, /* Add new port types above this line */ -- cgit v1.2.3 From 777fa87c7682228e155cf0892ba61cb2ab1fe3ae Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 31 Oct 2022 12:44:09 +0100 Subject: bonding (gcc13): synchronize bond_{a,t}lb_xmit() types Both bond_alb_xmit() and bond_tlb_xmit() produce a valid warning with gcc-13: drivers/net/bonding/bond_alb.c:1409:13: error: conflicting types for 'bond_tlb_xmit' due to enum/integer mismatch; have 'netdev_tx_t(struct sk_buff *, struct net_device *)' ... include/net/bond_alb.h:160:5: note: previous declaration of 'bond_tlb_xmit' with type 'int(struct sk_buff *, struct net_device *)' drivers/net/bonding/bond_alb.c:1523:13: error: conflicting types for 'bond_alb_xmit' due to enum/integer mismatch; have 'netdev_tx_t(struct sk_buff *, struct net_device *)' ... include/net/bond_alb.h:159:5: note: previous declaration of 'bond_alb_xmit' with type 'int(struct sk_buff *, struct net_device *)' I.e. the return type of the declaration is int, while the definitions spell netdev_tx_t. Synchronize both of them to the latter. Cc: Martin Liska Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20221031114409.10417-1-jirislaby@kernel.org Signed-off-by: Jakub Kicinski --- include/net/bond_alb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bond_alb.h b/include/net/bond_alb.h index 191c36afa1f4..9dc082b2d543 100644 --- a/include/net/bond_alb.h +++ b/include/net/bond_alb.h @@ -156,8 +156,8 @@ int bond_alb_init_slave(struct bonding *bond, struct slave *slave); void bond_alb_deinit_slave(struct bonding *bond, struct slave *slave); void bond_alb_handle_link_change(struct bonding *bond, struct slave *slave, char link); void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave); -int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev); -int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev); +netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev); +netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev); struct slave *bond_xmit_alb_slave_get(struct bonding *bond, struct sk_buff *skb); struct slave *bond_xmit_tlb_slave_get(struct bonding *bond, -- cgit v1.2.3 From ec32c0c42d0a7208571c4c77f140bffaa4e5e304 Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Tue, 1 Nov 2022 10:48:29 +0100 Subject: net: dcb: add new pcp selector to app object Add new PCP selector for the 8021Qaz APP managed object. As the PCP selector is not part of the 8021Qaz standard, a new non-std extension attribute DCB_ATTR_DCB_APP has been introduced. Also two helper functions to translate between selector and app attribute type has been added. The new selector has been given a value of 255, to minimize the risk of future overlap of std- and non-std attributes. The new DCB_ATTR_DCB_APP is sent alongside the ieee std attribute in the app table. This means that the dcb_app struct can now both contain std- and non-std app attributes. Currently there is no overlap between the selector values of the two attributes. The purpose of adding the PCP selector, is to be able to offload PCP-based queue classification to the 8021Q Priority Code Point table, see 6.9.3 of IEEE Std 802.1Q-2018. PCP and DEI is encoded in the protocol field as 8*dei+pcp, so that a mapping of PCP 2 and DEI 1 to priority 3 is encoded as {255, 10, 3}. Signed-off-by: Daniel Machon Reviewed-by: Petr Machata Signed-off-by: Paolo Abeni --- include/uapi/linux/dcbnl.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index a791a94013a6..dc7ef96207ca 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -218,6 +218,9 @@ struct cee_pfc { #define IEEE_8021QAZ_APP_SEL_ANY 4 #define IEEE_8021QAZ_APP_SEL_DSCP 5 +/* Non-std selector values */ +#define DCB_APP_SEL_PCP 255 + /* This structure contains the IEEE 802.1Qaz APP managed object. This * object is also used for the CEE std as well. * @@ -247,6 +250,8 @@ struct dcb_app { __u16 protocol; }; +#define IEEE_8021QAZ_APP_SEL_MAX 255 + /** * struct dcb_peer_app_info - APP feature information sent by the peer * @@ -425,6 +430,7 @@ enum ieee_attrs { enum ieee_attrs_app { DCB_ATTR_IEEE_APP_UNSPEC, DCB_ATTR_IEEE_APP, + DCB_ATTR_DCB_APP, __DCB_ATTR_IEEE_APP_MAX }; #define DCB_ATTR_IEEE_APP_MAX (__DCB_ATTR_IEEE_APP_MAX - 1) -- cgit v1.2.3 From 6182d5875c330a5a611687caa05f47752455720c Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Tue, 1 Nov 2022 10:48:30 +0100 Subject: net: dcb: add new apptrust attribute Add new apptrust extension attributes to the 8021Qaz APP managed object. Two new attributes, DCB_ATTR_DCB_APP_TRUST_TABLE and DCB_ATTR_DCB_APP_TRUST, has been added. Trusted selectors are passed in the nested attribute DCB_ATTR_DCB_APP_TRUST, in order of precedence. The new attributes are meant to allow drivers, whose hw supports the notion of trust, to be able to set whether a particular app selector is trusted - and in which order. Signed-off-by: Daniel Machon Reviewed-by: Petr Machata Signed-off-by: Paolo Abeni --- include/net/dcbnl.h | 4 ++++ include/uapi/linux/dcbnl.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h index 2b2d86fb3131..8841ab6c2de7 100644 --- a/include/net/dcbnl.h +++ b/include/net/dcbnl.h @@ -109,6 +109,10 @@ struct dcbnl_rtnl_ops { /* buffer settings */ int (*dcbnl_getbuffer)(struct net_device *, struct dcbnl_buffer *); int (*dcbnl_setbuffer)(struct net_device *, struct dcbnl_buffer *); + + /* apptrust */ + int (*dcbnl_setapptrust)(struct net_device *, u8 *, int); + int (*dcbnl_getapptrust)(struct net_device *, u8 *, int *); }; #endif /* __NET_DCBNL_H__ */ diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index dc7ef96207ca..99047223ab26 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -410,6 +410,7 @@ enum dcbnl_attrs { * @DCB_ATTR_IEEE_PEER_ETS: peer ETS configuration - get only * @DCB_ATTR_IEEE_PEER_PFC: peer PFC configuration - get only * @DCB_ATTR_IEEE_PEER_APP: peer APP tlv - get only + * @DCB_ATTR_DCB_APP_TRUST_TABLE: selector trust table */ enum ieee_attrs { DCB_ATTR_IEEE_UNSPEC, @@ -423,6 +424,7 @@ enum ieee_attrs { DCB_ATTR_IEEE_QCN, DCB_ATTR_IEEE_QCN_STATS, DCB_ATTR_DCB_BUFFER, + DCB_ATTR_DCB_APP_TRUST_TABLE, __DCB_ATTR_IEEE_MAX }; #define DCB_ATTR_IEEE_MAX (__DCB_ATTR_IEEE_MAX - 1) -- cgit v1.2.3 From 23da464dd6b8935b66f4ee306ad8947fd32ccd75 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:51 +0530 Subject: bpf: Allow specifying volatile type modifier for kptrs This is useful in particular to mark the pointer as volatile, so that compiler treats each load and store to the field as a volatile access. The alternative is having to define and use READ_ONCE and WRITE_ONCE in the BPF program. Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: David Vernet Link: https://lore.kernel.org/r/20221103191013.1236066-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index f9aababc5d78..86aad9b2ce02 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -288,6 +288,11 @@ static inline bool btf_type_is_typedef(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF; } +static inline bool btf_type_is_volatile(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VOLATILE; +} + static inline bool btf_type_is_func(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; -- cgit v1.2.3 From a35ec8e38cdd1766f29924ca391a01de20163931 Mon Sep 17 00:00:00 2001 From: "Hans J. Schultz" Date: Tue, 1 Nov 2022 21:39:21 +0200 Subject: bridge: Add MAC Authentication Bypass (MAB) support Hosts that support 802.1X authentication are able to authenticate themselves by exchanging EAPOL frames with an authenticator (Ethernet bridge, in this case) and an authentication server. Access to the network is only granted by the authenticator to successfully authenticated hosts. The above is implemented in the bridge using the "locked" bridge port option. When enabled, link-local frames (e.g., EAPOL) can be locally received by the bridge, but all other frames are dropped unless the host is authenticated. That is, unless the user space control plane installed an FDB entry according to which the source address of the frame is located behind the locked ingress port. The entry can be dynamic, in which case learning needs to be enabled so that the entry will be refreshed by incoming traffic. There are deployments in which not all the devices connected to the authenticator (the bridge) support 802.1X. Such devices can include printers and cameras. One option to support such deployments is to unlock the bridge ports connecting these devices, but a slightly more secure option is to use MAB. When MAB is enabled, the MAC address of the connected device is used as the user name and password for the authentication. For MAB to work, the user space control plane needs to be notified about MAC addresses that are trying to gain access so that they will be compared against an allow list. This can be implemented via the regular learning process with the sole difference that learned FDB entries are installed with a new "locked" flag indicating that the entry cannot be used to authenticate the device. The flag cannot be set by user space, but user space can clear the flag by replacing the entry, thereby authenticating the device. Locked FDB entries implement the following semantics with regards to roaming, aging and forwarding: 1. Roaming: Locked FDB entries can roam to unlocked (authorized) ports, in which case the "locked" flag is cleared. FDB entries cannot roam to locked ports regardless of MAB being enabled or not. Therefore, locked FDB entries are only created if an FDB entry with the given {MAC, VID} does not already exist. This behavior prevents unauthenticated devices from disrupting traffic destined to already authenticated devices. 2. Aging: Locked FDB entries age and refresh by incoming traffic like regular entries. 3. Forwarding: Locked FDB entries forward traffic like regular entries. If user space detects an unauthorized MAC behind a locked port and wishes to prevent traffic with this MAC DA from reaching the host, it can do so using tc or a different mechanism. Enable the above behavior using a new bridge port option called "mab". It can only be enabled on a bridge port that is both locked and has learning enabled. Locked FDB entries are flushed from the port once MAB is disabled. A new option is added because there are pure 802.1X deployments that are not interested in notifications about locked FDB entries. Signed-off-by: Hans J. Schultz Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + include/uapi/linux/neighbour.h | 8 +++++++- 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index d62ef428e3aa..1668ac4d7adc 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -59,6 +59,7 @@ struct br_ip_list { #define BR_MRP_LOST_IN_CONT BIT(19) #define BR_TX_FWD_OFFLOAD BIT(20) #define BR_PORT_LOCKED BIT(21) +#define BR_PORT_MAB BIT(22) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 5e7a1041df3a..d92b3f79eba3 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -561,6 +561,7 @@ enum { IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, IFLA_BRPORT_LOCKED, + IFLA_BRPORT_MAB, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index a998bf761635..5e67a7eaf4a7 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -52,7 +52,8 @@ enum { #define NTF_STICKY (1 << 6) #define NTF_ROUTER (1 << 7) /* Extended flags under NDA_FLAGS_EXT: */ -#define NTF_EXT_MANAGED (1 << 0) +#define NTF_EXT_MANAGED (1 << 0) +#define NTF_EXT_LOCKED (1 << 1) /* * Neighbor Cache Entry States. @@ -86,6 +87,11 @@ enum { * NTF_EXT_MANAGED flagged neigbor entries are managed by the kernel on behalf * of a user space control plane, and automatically refreshed so that (if * possible) they remain in NUD_REACHABLE state. + * + * NTF_EXT_LOCKED flagged bridge FDB entries are entries generated by the + * bridge in response to a host trying to communicate via a locked bridge port + * with MAB enabled. Their purpose is to notify user space that a host requires + * authentication. */ struct nda_cacheinfo { -- cgit v1.2.3 From 3830c5719af66fac9849cf5fb04b03d4e4bb46ff Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:01:59 +0100 Subject: net: devlink: convert devlink port type-specific pointers to union Instead of storing type_dev as a void pointer, convert it to union and use it to store either struct net_device or struct ib_device pointer. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index ba6b8b094943..6c55aabaedf1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -121,12 +121,19 @@ struct devlink_port { struct list_head region_list; struct devlink *devlink; unsigned int index; - spinlock_t type_lock; /* Protects type and type_dev - * pointer consistency. + spinlock_t type_lock; /* Protects type and type_eth/ib + * structures consistency. */ enum devlink_port_type type; enum devlink_port_type desired_type; - void *type_dev; + union { + struct { + struct net_device *netdev; + } type_eth; + struct { + struct ib_device *ibdev; + } type_ib; + }; struct devlink_port_attrs attrs; u8 attrs_set:1, switch_port:1, -- cgit v1.2.3 From 02a68a47eadedf95748facfca6ced31fb0181d52 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:03 +0100 Subject: net: devlink: track netdev with devlink_port assigned Currently, ethernet drivers are using devlink_port_type_eth_set() and devlink_port_type_clear() to set devlink port type and link to related netdev. Instead of calling them directly, let the driver use SET_NETDEV_DEVLINK_PORT macro to assign devlink_port pointer and let devlink to track it. Note the devlink port pointer is static during the time netdevice is registered. In devlink code, use per-namespace netdev notifier to track the netdevices with devlink_port assigned and change the internal devlink_port type and related type pointer accordingly. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4b5052db978f..f048a30ea10b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1999,6 +1999,11 @@ enum netdev_ml_priv_type { * registered * @offload_xstats_l3: L3 HW stats for this netdevice. * + * @devlink_port: Pointer to related devlink port structure. + * Assigned by a driver before netdev registration using + * SET_NETDEV_DEVLINK_PORT macro. This pointer is static + * during the time netdevice is registered. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2349,9 +2354,22 @@ struct net_device { netdevice_tracker watchdog_dev_tracker; netdevice_tracker dev_registered_tracker; struct rtnl_hw_stats64 *offload_xstats_l3; + + struct devlink_port *devlink_port; }; #define to_net_dev(d) container_of(d, struct net_device, dev) +/* + * Driver should use this to assign devlink port instance to a netdevice + * before it registers the netdevice. Therefore devlink_port is static + * during the netdev lifetime after it is registered. + */ +#define SET_NETDEV_DEVLINK_PORT(dev, port) \ +({ \ + WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED); \ + ((dev)->devlink_port = (port)); \ +}) + static inline bool netif_elide_gro(const struct net_device *dev) { if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) @@ -2785,6 +2803,7 @@ enum netdev_cmd { NETDEV_PRE_TYPE_CHANGE, NETDEV_POST_TYPE_CHANGE, NETDEV_POST_INIT, + NETDEV_PRE_UNINIT, NETDEV_RELEASE, NETDEV_NOTIFY_PEERS, NETDEV_JOIN, -- cgit v1.2.3 From c80965784dbf2fd624be654c1e73c24beada7441 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:05 +0100 Subject: net: devlink: remove netdev arg from devlink_port_type_eth_set() Since devlink_port_type_eth_set() should no longer be called by any driver with netdev pointer as it should rather use SET_NETDEV_DEVLINK_PORT, remove the netdev arg. Add a warn to type_clear() Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 6c55aabaedf1..b1582b32183a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1582,8 +1582,7 @@ int devlink_port_register(struct devlink *devlink, unsigned int port_index); void devl_port_unregister(struct devlink_port *devlink_port); void devlink_port_unregister(struct devlink_port *devlink_port); -void devlink_port_type_eth_set(struct devlink_port *devlink_port, - struct net_device *netdev); +void devlink_port_type_eth_set(struct devlink_port *devlink_port); void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev); void devlink_port_type_clear(struct devlink_port *devlink_port); -- cgit v1.2.3 From 31265c1e29eb28f17df50d04ee421b5b6369fefd Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:07 +0100 Subject: net: devlink: store copy netdevice ifindex and ifname to allow port_fill() without RTNL held To avoid a need to take RTNL mutex in port_fill() function, benefit from the introduce infrastructure that tracks netdevice notifier events. Store the ifindex and ifname upon register and change name events. Remove the rtnl_held bool propagated down to port_fill() function as it is no longer needed. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index b1582b32183a..7befad57afd4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -129,6 +129,8 @@ struct devlink_port { union { struct { struct net_device *netdev; + int ifindex; + char ifname[IFNAMSIZ]; } type_eth; struct { struct ib_device *ibdev; -- cgit v1.2.3 From 77df1db80da384c565106321f5934967690da7dd Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:10 +0100 Subject: net: remove unused ndo_get_devlink_port Remove ndo_get_devlink_port which is no longer used alongside with the implementations in drivers. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f048a30ea10b..d45713a06568 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1366,10 +1366,6 @@ struct netdev_net_notifier { * queue id bound to an AF_XDP socket. The flags field specifies if * only RX, only Tx, or both should be woken up using the flags * XDP_WAKEUP_RX and XDP_WAKEUP_TX. - * struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev); - * Get devlink port instance associated with a given netdev. - * Called with a reference on the netdevice and devlink locks only, - * rtnl_lock is not held. * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. @@ -1600,7 +1596,6 @@ struct net_device_ops { struct xdp_buff *xdp); int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); - struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); -- cgit v1.2.3 From dca56c3038c34a3e5acfe0aadb1f2bc9d724ae79 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:11 +0100 Subject: net: expose devlink port over rtnetlink Expose devlink port handle related to netdev over rtnetlink. Introduce a new nested IFLA attribute to carry the info. Call into devlink code to fill-up the nest with existing devlink attributes that are used over devlink netlink. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 14 ++++++++++++++ include/uapi/linux/if_link.h | 2 ++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 7befad57afd4..fa6e936af1a5 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1873,6 +1873,9 @@ int devlink_compat_phys_port_name_get(struct net_device *dev, int devlink_compat_switch_id_get(struct net_device *dev, struct netdev_phys_item_id *ppid); +int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port); +size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port); + #else static inline struct devlink *devlink_try_get(struct devlink *devlink) @@ -1909,6 +1912,17 @@ devlink_compat_switch_id_get(struct net_device *dev, return -EOPNOTSUPP; } +static inline int +devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) +{ + return 0; +} + +static inline size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) +{ + return 0; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d92b3f79eba3..1021a7e47a86 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -372,6 +372,8 @@ enum { IFLA_TSO_MAX_SEGS, IFLA_ALLMULTI, /* Allmulti count: > 0 means acts ALLMULTI */ + IFLA_DEVLINK_PORT, + __IFLA_MAX }; -- cgit v1.2.3 From aa3496accc412b3d975e4ee5d06076d73394d8b5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:55 +0530 Subject: bpf: Refactor kptr_off_tab into btf_record To prepare the BPF verifier to handle special fields in both map values and program allocated types coming from program BTF, we need to refactor the kptr_off_tab handling code into something more generic and reusable across both cases to avoid code duplication. Later patches also require passing this data to helpers at runtime, so that they can work on user defined types, initialize them, destruct them, etc. The main observation is that both map values and such allocated types point to a type in program BTF, hence they can be handled similarly. We can prepare a field metadata table for both cases and store them in struct bpf_map or struct btf depending on the use case. Hence, refactor the code into generic btf_record and btf_field member structs. The btf_record represents the fields of a specific btf_type in user BTF. The cnt indicates the number of special fields we successfully recognized, and field_mask is a bitmask of fields that were found, to enable quick determination of availability of a certain field. Subsequently, refactor the rest of the code to work with these generic types, remove assumptions about kptr and kptr_off_tab, rename variables to more meaningful names, etc. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 125 ++++++++++++++++++++++++++++++++++------------------ include/linux/btf.h | 3 +- 2 files changed, 82 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8d948bfcb984..5f2a42033a37 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -165,35 +165,41 @@ struct bpf_map_ops { }; enum { - /* Support at most 8 pointers in a BPF map value */ - BPF_MAP_VALUE_OFF_MAX = 8, - BPF_MAP_OFF_ARR_MAX = BPF_MAP_VALUE_OFF_MAX + + /* Support at most 8 pointers in a BTF type */ + BTF_FIELDS_MAX = 8, + BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX + 1 + /* for bpf_spin_lock */ 1, /* for bpf_timer */ }; -enum bpf_kptr_type { - BPF_KPTR_UNREF, - BPF_KPTR_REF, +enum btf_field_type { + BPF_KPTR_UNREF = (1 << 2), + BPF_KPTR_REF = (1 << 3), + BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, }; -struct bpf_map_value_off_desc { +struct btf_field_kptr { + struct btf *btf; + struct module *module; + btf_dtor_kfunc_t dtor; + u32 btf_id; +}; + +struct btf_field { u32 offset; - enum bpf_kptr_type type; - struct { - struct btf *btf; - struct module *module; - btf_dtor_kfunc_t dtor; - u32 btf_id; - } kptr; + enum btf_field_type type; + union { + struct btf_field_kptr kptr; + }; }; -struct bpf_map_value_off { - u32 nr_off; - struct bpf_map_value_off_desc off[]; +struct btf_record { + u32 cnt; + u32 field_mask; + struct btf_field fields[]; }; -struct bpf_map_off_arr { +struct btf_field_offs { u32 cnt; u32 field_off[BPF_MAP_OFF_ARR_MAX]; u8 field_sz[BPF_MAP_OFF_ARR_MAX]; @@ -215,7 +221,7 @@ struct bpf_map { u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ - struct bpf_map_value_off *kptr_off_tab; + struct btf_record *record; int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; @@ -227,7 +233,7 @@ struct bpf_map { struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; - struct bpf_map_off_arr *off_arr; + struct btf_field_offs *field_offs; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ @@ -251,6 +257,37 @@ struct bpf_map { bool frozen; /* write-once; write-protected by freeze_mutex */ }; +static inline u32 btf_field_type_size(enum btf_field_type type) +{ + switch (type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return sizeof(u64); + default: + WARN_ON_ONCE(1); + return 0; + } +} + +static inline u32 btf_field_type_align(enum btf_field_type type) +{ + switch (type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return __alignof__(u64); + default: + WARN_ON_ONCE(1); + return 0; + } +} + +static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type) +{ + if (IS_ERR_OR_NULL(rec)) + return false; + return rec->field_mask & type; +} + static inline bool map_value_has_spin_lock(const struct bpf_map *map) { return map->spin_lock_off >= 0; @@ -261,23 +298,19 @@ static inline bool map_value_has_timer(const struct bpf_map *map) return map->timer_off >= 0; } -static inline bool map_value_has_kptrs(const struct bpf_map *map) -{ - return !IS_ERR_OR_NULL(map->kptr_off_tab); -} - static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); - if (unlikely(map_value_has_kptrs(map))) { - struct bpf_map_value_off *tab = map->kptr_off_tab; + if (!IS_ERR_OR_NULL(map->record)) { + struct btf_field *fields = map->record->fields; + u32 cnt = map->record->cnt; int i; - for (i = 0; i < tab->nr_off; i++) - *(u64 *)(dst + tab->off[i].offset) = 0; + for (i = 0; i < cnt; i++) + memset(dst + fields[i].offset, 0, btf_field_type_size(fields[i].type)); } } @@ -303,7 +336,7 @@ static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, b u32 curr_off = 0; int i; - if (likely(!map->off_arr)) { + if (likely(!map->field_offs)) { if (long_memcpy) bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); else @@ -311,11 +344,12 @@ static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, b return; } - for (i = 0; i < map->off_arr->cnt; i++) { - u32 next_off = map->off_arr->field_off[i]; + for (i = 0; i < map->field_offs->cnt; i++) { + u32 next_off = map->field_offs->field_off[i]; + u32 sz = next_off - curr_off; - memcpy(dst + curr_off, src + curr_off, next_off - curr_off); - curr_off += map->off_arr->field_sz[i]; + memcpy(dst + curr_off, src + curr_off, sz); + curr_off += map->field_offs->field_sz[i]; } memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } @@ -335,16 +369,17 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) u32 curr_off = 0; int i; - if (likely(!map->off_arr)) { + if (likely(!map->field_offs)) { memset(dst, 0, map->value_size); return; } - for (i = 0; i < map->off_arr->cnt; i++) { - u32 next_off = map->off_arr->field_off[i]; + for (i = 0; i < map->field_offs->cnt; i++) { + u32 next_off = map->field_offs->field_off[i]; + u32 sz = next_off - curr_off; - memset(dst + curr_off, 0, next_off - curr_off); - curr_off += map->off_arr->field_sz[i]; + memset(dst + curr_off, 0, sz); + curr_off += map->field_offs->field_sz[i]; } memset(dst + curr_off, 0, map->value_size - curr_off); } @@ -1699,11 +1734,13 @@ void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); -struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset); -void bpf_map_free_kptr_off_tab(struct bpf_map *map); -struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map); -bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b); -void bpf_map_free_kptrs(struct bpf_map *map, void *map_value); +struct btf_field *btf_record_find(const struct btf_record *rec, + u32 offset, enum btf_field_type type); +void btf_record_free(struct btf_record *rec); +void bpf_map_free_record(struct bpf_map *map); +struct btf_record *btf_record_dup(const struct btf_record *rec); +bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); +void bpf_obj_free_fields(const struct btf_record *rec, void *obj); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index 86aad9b2ce02..9e62717cdc7a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -163,8 +163,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); -struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, - const struct btf_type *t); +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit v1.2.3 From db559117828d2448fe81ada051c60bcf39f822e9 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:56 +0530 Subject: bpf: Consolidate spin_lock, timer management into btf_record Now that kptr_off_tab has been refactored into btf_record, and can hold more than one specific field type, accomodate bpf_spin_lock and bpf_timer as well. While they don't require any more metadata than offset, having all special fields in one place allows us to share the same code for allocated user defined types and handle both map values and these allocated objects in a similar fashion. As an optimization, we still keep spin_lock_off and timer_off offsets in the btf_record structure, just to avoid having to find the btf_field struct each time their offset is needed. This is mostly needed to manipulate such objects in a map value at runtime. It's ok to hardcode just one offset as more than one field is disallowed. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 53 ++++++++++++++++++++++++++++++++--------------------- include/linux/btf.h | 3 ++- 2 files changed, 34 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5f2a42033a37..aae85019abde 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -166,13 +166,13 @@ struct bpf_map_ops { enum { /* Support at most 8 pointers in a BTF type */ - BTF_FIELDS_MAX = 8, - BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX + - 1 + /* for bpf_spin_lock */ - 1, /* for bpf_timer */ + BTF_FIELDS_MAX = 10, + BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX, }; enum btf_field_type { + BPF_SPIN_LOCK = (1 << 0), + BPF_TIMER = (1 << 1), BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, @@ -196,6 +196,8 @@ struct btf_field { struct btf_record { u32 cnt; u32 field_mask; + int spin_lock_off; + int timer_off; struct btf_field fields[]; }; @@ -220,10 +222,8 @@ struct bpf_map { u32 max_entries; u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; - int spin_lock_off; /* >=0 valid offset, <0 error */ - struct btf_record *record; - int timer_off; /* >=0 valid offset, <0 error */ u32 id; + struct btf_record *record; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; @@ -257,9 +257,29 @@ struct bpf_map { bool frozen; /* write-once; write-protected by freeze_mutex */ }; +static inline const char *btf_field_type_name(enum btf_field_type type) +{ + switch (type) { + case BPF_SPIN_LOCK: + return "bpf_spin_lock"; + case BPF_TIMER: + return "bpf_timer"; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return "kptr"; + default: + WARN_ON_ONCE(1); + return "unknown"; + } +} + static inline u32 btf_field_type_size(enum btf_field_type type) { switch (type) { + case BPF_SPIN_LOCK: + return sizeof(struct bpf_spin_lock); + case BPF_TIMER: + return sizeof(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return sizeof(u64); @@ -272,6 +292,10 @@ static inline u32 btf_field_type_size(enum btf_field_type type) static inline u32 btf_field_type_align(enum btf_field_type type) { switch (type) { + case BPF_SPIN_LOCK: + return __alignof__(struct bpf_spin_lock); + case BPF_TIMER: + return __alignof__(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return __alignof__(u64); @@ -288,22 +312,8 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline bool map_value_has_spin_lock(const struct bpf_map *map) -{ - return map->spin_lock_off >= 0; -} - -static inline bool map_value_has_timer(const struct bpf_map *map) -{ - return map->timer_off >= 0; -} - static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { - if (unlikely(map_value_has_spin_lock(map))) - memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); - if (unlikely(map_value_has_timer(map))) - memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); if (!IS_ERR_OR_NULL(map->record)) { struct btf_field *fields = map->record->fields; u32 cnt = map->record->cnt; @@ -1740,6 +1750,7 @@ void btf_record_free(struct btf_record *rec); void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); +void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); struct bpf_map *bpf_map_get(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index 9e62717cdc7a..282006abd062 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -163,7 +163,8 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); -struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t); +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, + u32 field_mask, u32 value_size); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit v1.2.3 From f71b2f64177a199d5b1d2047e155d45fd98f564a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:57 +0530 Subject: bpf: Refactor map->off_arr handling Refactor map->off_arr handling into generic functions that can work on their own without hardcoding map specific code. The btf_fields_offs structure is now returned from btf_parse_field_offs, which can be reused later for types in program BTF. All functions like copy_map_value, zero_map_value call generic underlying functions so that they can also be reused later for copying to values allocated in programs which encode specific fields. Later, some helper functions will also require access to this btf_field_offs structure to be able to skip over special fields at runtime. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-9-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 41 ++++++++++++++++++++++++----------------- include/linux/btf.h | 1 + 2 files changed, 25 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aae85019abde..798aec816970 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -341,57 +341,64 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ -static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, bool long_memcpy) +static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, + void *dst, void *src, u32 size, + bool long_memcpy) { u32 curr_off = 0; int i; - if (likely(!map->field_offs)) { + if (likely(!foffs)) { if (long_memcpy) - bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); + bpf_long_memcpy(dst, src, round_up(size, 8)); else - memcpy(dst, src, map->value_size); + memcpy(dst, src, size); return; } - for (i = 0; i < map->field_offs->cnt; i++) { - u32 next_off = map->field_offs->field_off[i]; + for (i = 0; i < foffs->cnt; i++) { + u32 next_off = foffs->field_off[i]; u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); - curr_off += map->field_offs->field_sz[i]; + curr_off += foffs->field_sz[i]; } - memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); + memcpy(dst + curr_off, src + curr_off, size - curr_off); } static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { - __copy_map_value(map, dst, src, false); + bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, false); } static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) { - __copy_map_value(map, dst, src, true); + bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, true); } -static inline void zero_map_value(struct bpf_map *map, void *dst) +static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 size) { u32 curr_off = 0; int i; - if (likely(!map->field_offs)) { - memset(dst, 0, map->value_size); + if (likely(!foffs)) { + memset(dst, 0, size); return; } - for (i = 0; i < map->field_offs->cnt; i++) { - u32 next_off = map->field_offs->field_off[i]; + for (i = 0; i < foffs->cnt; i++) { + u32 next_off = foffs->field_off[i]; u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); - curr_off += map->field_offs->field_sz[i]; + curr_off += foffs->field_sz[i]; } - memset(dst + curr_off, 0, map->value_size - curr_off); + memset(dst + curr_off, 0, size - curr_off); +} + +static inline void zero_map_value(struct bpf_map *map, void *dst) +{ + bpf_obj_memzero(map->field_offs, dst, map->value_size); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, diff --git a/include/linux/btf.h b/include/linux/btf.h index 282006abd062..d80345fa566b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -165,6 +165,7 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); +struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit v1.2.3 From 2b6c0e152868c9c5939a5c5094d5b2be61cf48e6 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 28 Oct 2022 11:23:32 +0200 Subject: bcma: Use the proper gpio include The is including the legacy header to obtain struct gpio_chip. Instead, include where this struct is defined. It turns out that the brcm80211 brcmsmac depends on this to bring in the symbol gpio_is_valid(). The driver looks up the BCMA parent GPIO driver and checks that this succeeds, but then it goes on to use the deprecated GPIO call gpio_is_valid() to check the consistency of the .base member of the BCMA GPIO struct. The whole check can be dropped because the bcma_gpio is initialized in the declarations: struct gpio_chip *bcma_gpio = &cc_drv->gpio; And this can never be NULL. Cc: Jonas Gorski Acked-by: Arend van Spriel Signed-off-by: Linus Walleij Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20221028092332.238728-1-linus.walleij@linaro.org --- include/linux/bcma/bcma_driver_chipcommon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 2d94c30ed439..0cb6638b55e5 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -4,7 +4,7 @@ #include #include -#include +#include /** ChipCommon core registers. **/ #define BCMA_CC_ID 0x0000 -- cgit v1.2.3 From d08cb25556773c65efb21761b75f92c804ad0975 Mon Sep 17 00:00:00 2001 From: David Yang Date: Fri, 28 Oct 2022 10:01:01 +0800 Subject: net: mv643xx_eth: support MII/GMII/RGMII modes for Kirkwood Support mode switch properly, which is not available before. If SoC has two Ethernet controllers, by setting both of them into MII mode, the first controller enters GMII mode, while the second controller is effectively disabled. This requires configuring (and maybe enabling) the second controller in the device tree, even though it cannot be used. Signed-off-by: David Yang Signed-off-by: David S. Miller --- include/linux/mv643xx_eth.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 3682ae75c7aa..145169be2ed8 100644 --- a/include/linux/mv643xx_eth.h +++ b/include/linux/mv643xx_eth.h @@ -8,6 +8,7 @@ #include #include +#include #define MV643XX_ETH_SHARED_NAME "mv643xx_eth" #define MV643XX_ETH_NAME "mv643xx_eth_port" @@ -59,6 +60,7 @@ struct mv643xx_eth_platform_data { */ int speed; int duplex; + phy_interface_t interface; /* * How many RX/TX queues to use. -- cgit v1.2.3 From 7c3eaa022261d79d273d73ac68ff02cbe2839c10 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:32 -0700 Subject: genetlink: move the private fields in struct genl_family Move the private fields down to form a "private section". Use the kdoc "private:" label comment thing to hide them from the main kdoc comment. Signed-off-by: Jakub Kicinski Reviewed-by: Johannes Berg Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- include/net/genetlink.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 9f97f73615b6..81180fc6526a 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -23,7 +23,6 @@ struct genl_info; /** * struct genl_family - generic netlink family - * @id: protocol family identifier (private) * @hdrsize: length of user specific header in bytes * @name: name of family * @version: protocol version @@ -43,8 +42,6 @@ struct genl_info; * @resv_start_op: first operation for which reserved fields of the header * can be validated and policies are required (see below); * new families should leave this field at zero - * @mcgrp_offset: starting number of multicast group IDs in this family - * (private) * @ops: the operations supported by this family * @n_ops: number of operations supported by this family * @small_ops: the small-struct operations supported by this family @@ -58,12 +55,10 @@ struct genl_info; * if policy is not provided core will reject all TLV attributes. */ struct genl_family { - int id; /* private */ unsigned int hdrsize; char name[GENL_NAMSIZ]; unsigned int version; unsigned int maxattr; - unsigned int mcgrp_offset; /* private */ u8 netnsok:1; u8 parallel_ops:1; u8 n_ops; @@ -81,6 +76,12 @@ struct genl_family { const struct genl_small_ops *small_ops; const struct genl_multicast_group *mcgrps; struct module *module; + +/* private: internal use only */ + /* protocol family identifier */ + int id; + /* starting number of multicast group IDs in this family */ + unsigned int mcgrp_offset; }; /** -- cgit v1.2.3 From 20b0b53aca436af9fece9428ca3ab7c7b9cf4583 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:33 -0700 Subject: genetlink: introduce split op representation We currently have two forms of operations - small ops and "full" ops (or just ops). The former does not have pointers for some of the less commonly used features (namely dump start/done and policy). The "full" ops, however, still don't contain all the necessary information. In particular the policy is per command ID, while do and dump often accept different attributes. It's also not possible to define different pre_doit and post_doit callbacks for different commands within the family. At the same time a lot of commands do not support dumping and therefore all the dump-related information is wasted space. Create a new command representation which can hold info about a do implementation or a dump implementation, but not both at the same time. Use this new representation on the command execution path (genl_family_rcv_msg) as we either run a do or a dump and don't have to create a "full" op there. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- include/net/genetlink.h | 60 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 81180fc6526a..4be7989c451b 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -18,7 +18,7 @@ struct genl_multicast_group { u8 flags; }; -struct genl_ops; +struct genl_split_ops; struct genl_info; /** @@ -66,10 +66,10 @@ struct genl_family { u8 n_mcgrps; u8 resv_start_op; const struct nla_policy *policy; - int (*pre_doit)(const struct genl_ops *ops, + int (*pre_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); - void (*post_doit)(const struct genl_ops *ops, + void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); const struct genl_ops * ops; @@ -182,6 +182,58 @@ struct genl_ops { u8 validate; }; +/** + * struct genl_split_ops - generic netlink operations (do/dump split version) + * @cmd: command identifier + * @internal_flags: flags used by the family + * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) + * @validate: validation flags from enum genl_validate_flags + * @policy: netlink policy (takes precedence over family policy) + * @maxattr: maximum number of attributes supported + * + * Do callbacks: + * @pre_doit: called before an operation's @doit callback, it may + * do additional, common, filtering and return an error + * @doit: standard command callback + * @post_doit: called after an operation's @doit callback, it may + * undo operations done by pre_doit, for example release locks + * + * Dump callbacks: + * @start: start callback for dumps + * @dumpit: callback for dumpers + * @done: completion callback for dumps + * + * Do callbacks can be used if %GENL_CMD_CAP_DO is set in @flags. + * Dump callbacks can be used if %GENL_CMD_CAP_DUMP is set in @flags. + * Exactly one of those flags must be set. + */ +struct genl_split_ops { + union { + struct { + int (*pre_doit)(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info); + int (*doit)(struct sk_buff *skb, + struct genl_info *info); + void (*post_doit)(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info); + }; + struct { + int (*start)(struct netlink_callback *cb); + int (*dumpit)(struct sk_buff *skb, + struct netlink_callback *cb); + int (*done)(struct netlink_callback *cb); + }; + }; + const struct nla_policy *policy; + unsigned int maxattr; + u8 cmd; + u8 internal_flags; + u8 flags; + u8 validate; +}; + /** * struct genl_dumpit_info - info that is available during dumpit op call * @family: generic netlink family - for internal genl code usage @@ -190,7 +242,7 @@ struct genl_ops { */ struct genl_dumpit_info { const struct genl_family *family; - struct genl_ops op; + struct genl_split_ops op; struct nlattr **attrs; }; -- cgit v1.2.3 From b8fd60c36a44351f773432e24efd8bb92f8ba0c6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:42 -0700 Subject: genetlink: allow families to use split ops directly Let families to hook in the new split ops. They are more flexible and should not be much larger than full ops. Each split op is 40B while full op is 48B. Devlink for example has 54 dos and 19 dumps, 2 of the dumps do not have a do -> 56 full commands = 2688B. Split ops would have taken 2920B, so 9% more space while allowing individual per/post doit and per-type policies. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- include/net/genetlink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 4be7989c451b..d21210709f84 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -46,6 +46,9 @@ struct genl_info; * @n_ops: number of operations supported by this family * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family + * @split_ops: the split do/dump form of operation definition + * @n_split_ops: number of entries in @split_ops, not that with split do/dump + * ops the number of entries is not the same as number of commands * * Attribute policies (the combination of @policy and @maxattr fields) * can be attached at the family level or at the operation level. @@ -63,6 +66,7 @@ struct genl_family { u8 parallel_ops:1; u8 n_ops; u8 n_small_ops; + u8 n_split_ops; u8 n_mcgrps; u8 resv_start_op; const struct nla_policy *policy; @@ -74,6 +78,7 @@ struct genl_family { struct genl_info *info); const struct genl_ops * ops; const struct genl_small_ops *small_ops; + const struct genl_split_ops *split_ops; const struct genl_multicast_group *mcgrps; struct module *module; -- cgit v1.2.3 From e1f4ecab19338ec7079830e8700e4869f991fd45 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 4 Nov 2022 17:13:01 +0000 Subject: net: remove explicit phylink_generic_validate() references Virtually all conventional network drivers are now converted to use phylink_generic_validate() - only DSA drivers and fman_memac remain, so lets remove the necessity for network drivers to explicitly set this member, and default to phylink_generic_validate() when unset. This is possible as .validate must currently be set. Any remaining instances that have not been addressed by this patch can be fixed up later. Signed-off-by: Russell King (Oracle) Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/E1or0FZ-001tRa-DI@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 1df3e5e316e8..c492c26202b5 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -207,6 +207,11 @@ struct phylink_mac_ops { * * If the @state->interface mode is not supported, then the @supported * mask must be cleared. + * + * This member is optional; if not set, the generic validator will be + * used making use of @config->mac_capabilities and + * @config->supported_interfaces to determine which link modes are + * supported. */ void validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit v1.2.3 From 9a0f830f80265bd1ef816e1541ac24bee80e9a3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:01:25 -0700 Subject: ethtool: linkstate: add a statistic for PHY down events The previous attempt to augment carrier_down (see Link) was not met with much enthusiasm so let's do the simple thing of exposing what some devices already maintain. Add a common ethtool statistic for link going down. Currently users have to maintain per-driver mapping to extract the right stat from the vendor-specific ethtool -S stats. carrier_down does not fit the bill because it counts a lot of software related false positives. Add the statistic to the extended link state API to steer vendors towards implementing all of it. Implement for bnxt and all Linux-controlled PHYs. mlx5 and (possibly) enic also have a counter for this but I leave the implementation to their maintainers. Link: https://lore.kernel.org/r/20220520004500.2250674-1-kuba@kernel.org Reviewed-by: Florian Fainelli Reviewed-by: Michael Chan Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20221104190125.684910-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/linux/ethtool.h | 17 +++++++++++++++++ include/linux/phy.h | 3 +++ include/uapi/linux/ethtool_netlink.h | 1 + 3 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 99dc7bfbcd3c..5c51c7fda32a 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -125,6 +125,20 @@ struct ethtool_link_ext_state_info { }; }; +struct ethtool_link_ext_stats { + /* Custom Linux statistic for PHY level link down events. + * In a simpler world it should be equal to netdev->carrier_down_count + * unfortunately netdev also counts local reconfigurations which don't + * actually take the physical link down, not to mention NC-SI which, + * if present, keeps the link up regardless of host state. + * This statistic counts when PHY _actually_ went down, or lost link. + * + * Note that we need u64 for ethtool_stats_init() and comparisons + * to ETHTOOL_STAT_NOT_SET, but only u32 is exposed to the user. + */ + u64 link_down_events; +}; + /** * ethtool_rxfh_indir_default - get default value for RX flow hash indirection * @index: Index in RX flow hash indirection table @@ -481,6 +495,7 @@ struct ethtool_module_power_mode_params { * do not attach ext_substate attribute to netlink message). If link_ext_state * and link_ext_substate are unknown, return -ENODATA. If not implemented, * link_ext_state and link_ext_substate will not be sent to userspace. + * @get_link_ext_stats: Read extra link-related counters. * @get_eeprom_len: Read range of EEPROM addresses for validation of * @get_eeprom and @set_eeprom requests. * Returns 0 if device does not support EEPROM access. @@ -652,6 +667,8 @@ struct ethtool_ops { u32 (*get_link)(struct net_device *); int (*get_link_ext_state)(struct net_device *, struct ethtool_link_ext_state_info *); + void (*get_link_ext_stats)(struct net_device *dev, + struct ethtool_link_ext_stats *stats); int (*get_eeprom_len)(struct net_device *); int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); diff --git a/include/linux/phy.h b/include/linux/phy.h index ddf66198f751..9a3752c0c444 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -600,6 +600,7 @@ struct macsec_ops; * @psec: Pointer to Power Sourcing Equipment control struct * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine + * @link_down_events: Number of times link was lost * @shared: Pointer to private data shared by phys in one package * @priv: Pointer to driver private data * @@ -723,6 +724,8 @@ struct phy_device { int pma_extable; + unsigned int link_down_events; + void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index bb57084ac524..aaf7c6963d61 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -262,6 +262,7 @@ enum { ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ + ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, /* u32 */ /* add new constants above here */ __ETHTOOL_A_LINKSTATE_CNT, -- cgit v1.2.3 From ca71277f36e0781db663aedeb5fc1e26e7c144c4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 6 Nov 2022 15:34:14 -0500 Subject: net: move the ct helper function to nf_conntrack_helper for ovs and tc Move ovs_ct_helper from openvswitch to nf_conntrack_helper and rename as nf_ct_helper so that it can be used in TC act_ct in the next patch. Note that it also adds the checks for the family and proto, as in TC act_ct, the packets with correct family and proto are not guaranteed. Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Paolo Abeni --- include/net/netfilter/nf_conntrack_helper.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 9939c366f720..b6676249eeeb 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -115,6 +115,9 @@ struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags); +int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, u16 proto); + void nf_ct_helper_destroy(struct nf_conn *ct); static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct) -- cgit v1.2.3 From f96cba2eb923c025014fe74a50e104b7c5234feb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 6 Nov 2022 15:34:15 -0500 Subject: net: move add ct helper function to nf_conntrack_helper for ovs and tc Move ovs_ct_add_helper from openvswitch to nf_conntrack_helper and rename as nf_ct_add_helper, so that it can be used in TC act_ct in the next patch. Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Paolo Abeni --- include/net/netfilter/nf_conntrack_helper.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index b6676249eeeb..f30b1694b690 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -117,6 +117,8 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u16 proto); +int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, + u8 proto, bool nat, struct nf_conntrack_helper **hp); void nf_ct_helper_destroy(struct nf_conn *ct); -- cgit v1.2.3 From a21b06e7319129994f339ed47f512bbe57b77f5b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 6 Nov 2022 15:34:17 -0500 Subject: net: sched: add helper support in act_ct This patch is to add helper support in act_ct for OVS actions=ct(alg=xxx) offloading, which is corresponding to Commit cae3a2627520 ("openvswitch: Allow attaching helpers to ct action") in OVS kernel part. The difference is when adding TC actions family and proto cannot be got from the filter/match, other than helper name in tb[TCA_CT_HELPER_NAME], we also need to send the family in tb[TCA_CT_HELPER_FAMILY] and the proto in tb[TCA_CT_HELPER_PROTO] to kernel. Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Paolo Abeni --- include/net/tc_act/tc_ct.h | 1 + include/uapi/linux/tc_act/tc_ct.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index 8250d6f0a462..b24ea2d9400b 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -10,6 +10,7 @@ #include struct tcf_ct_params { + struct nf_conntrack_helper *helper; struct nf_conn *tmpl; u16 zone; diff --git a/include/uapi/linux/tc_act/tc_ct.h b/include/uapi/linux/tc_act/tc_ct.h index 5fb1d7ac1027..6c5200f0ed38 100644 --- a/include/uapi/linux/tc_act/tc_ct.h +++ b/include/uapi/linux/tc_act/tc_ct.h @@ -22,6 +22,9 @@ enum { TCA_CT_NAT_PORT_MIN, /* be16 */ TCA_CT_NAT_PORT_MAX, /* be16 */ TCA_CT_PAD, + TCA_CT_HELPER_NAME, /* string */ + TCA_CT_HELPER_FAMILY, /* u8 */ + TCA_CT_HELPER_PROTO, /* u8 */ __TCA_CT_MAX }; -- cgit v1.2.3 From c3d96f690a790074b508fe183a41e36a00cd7ddd Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 3 Oct 2022 07:34:21 +0100 Subject: net, proc: Provide PROC_FS=n fallback for proc_create_net_single_write() Provide a CONFIG_PROC_FS=n fallback for proc_create_net_single_write(). Also provide a fallback for proc_create_net_data_write(). Fixes: 564def71765c ("proc: Add a way to make network proc files writable") Reported-by: kernel test robot Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org --- include/linux/proc_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 81d6e4ec2294..0260f5ea98fe 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -208,8 +208,10 @@ static inline void proc_remove(struct proc_dir_entry *de) {} static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { return 0; } #define proc_create_net_data(name, mode, parent, ops, state_size, data) ({NULL;}) +#define proc_create_net_data_write(name, mode, parent, ops, write, state_size, data) ({NULL;}) #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;}) #define proc_create_net_single(name, mode, parent, show, data) ({NULL;}) +#define proc_create_net_single_write(name, mode, parent, show, write, data) ({NULL;}) static inline struct pid *tgid_pidfd_to_pid(const struct file *file) { -- cgit v1.2.3 From 4d843be56ba6a8c0e566afd58775742d9e721505 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 5 Apr 2022 21:48:48 +0100 Subject: rxrpc: Trace setting of the request-ack flag Add a tracepoint to log why the request-ack flag is set on an outgoing DATA packet, allowing debugging as to why. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d20bf4aa0204..4c501c660123 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -242,6 +242,16 @@ EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \ E_(rxrpc_tx_point_version_reply, "VerReply") +#define rxrpc_req_ack_traces \ + EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \ + EM(rxrpc_reqack_already_on, "ALREADY-ON") \ + EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \ + EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \ + EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \ + EM(rxrpc_reqack_retrans, "RETRANS ") \ + EM(rxrpc_reqack_slow_start, "SLOW-START") \ + E_(rxrpc_reqack_small_txwin, "SMALL-TXWN") + /* * Generate enums for tracing information. */ @@ -263,6 +273,7 @@ enum rxrpc_propose_ack_outcome { rxrpc_propose_ack_outcomes } __mode(byte); enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte); enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte); enum rxrpc_recvmsg_trace { rxrpc_recvmsg_traces } __mode(byte); +enum rxrpc_req_ack_trace { rxrpc_req_ack_traces } __mode(byte); enum rxrpc_rtt_rx_trace { rxrpc_rtt_rx_traces } __mode(byte); enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); @@ -290,6 +301,7 @@ rxrpc_propose_ack_outcomes; rxrpc_propose_ack_traces; rxrpc_receive_traces; rxrpc_recvmsg_traces; +rxrpc_req_ack_traces; rxrpc_rtt_rx_traces; rxrpc_rtt_tx_traces; rxrpc_skb_traces; @@ -1395,6 +1407,30 @@ TRACE_EVENT(rxrpc_rx_discard_ack, __entry->call_ackr_prev) ); +TRACE_EVENT(rxrpc_req_ack, + TP_PROTO(unsigned int call_debug_id, rxrpc_seq_t seq, + enum rxrpc_req_ack_trace why), + + TP_ARGS(call_debug_id, seq, why), + + TP_STRUCT__entry( + __field(unsigned int, call_debug_id ) + __field(rxrpc_seq_t, seq ) + __field(enum rxrpc_req_ack_trace, why ) + ), + + TP_fast_assign( + __entry->call_debug_id = call_debug_id; + __entry->seq = seq; + __entry->why = why; + ), + + TP_printk("c=%08x q=%08x REQ-%s", + __entry->call_debug_id, + __entry->seq, + __print_symbolic(__entry->why, rxrpc_req_ack_traces)) + ); + #undef EM #undef E_ #endif /* _TRACE_RXRPC_H */ -- cgit v1.2.3 From 334dfbfc5a7187c99761df2392dd4cc49c453bea Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 22 Apr 2022 00:20:49 +0100 Subject: rxrpc: Split call timer-expiration from call timer-set tracepoint Split the tracepoint for call timer-set to separate out the call timer-expiration event Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 4c501c660123..a72f04e3d264 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -133,7 +133,6 @@ #define rxrpc_timer_traces \ EM(rxrpc_timer_begin, "Begin ") \ - EM(rxrpc_timer_expired, "*EXPR*") \ EM(rxrpc_timer_exp_ack, "ExpAck") \ EM(rxrpc_timer_exp_hard, "ExpHrd") \ EM(rxrpc_timer_exp_idle, "ExpIdl") \ @@ -1019,6 +1018,47 @@ TRACE_EVENT(rxrpc_timer, __entry->timer - __entry->now) ); +TRACE_EVENT(rxrpc_timer_expired, + TP_PROTO(struct rxrpc_call *call, unsigned long now), + + TP_ARGS(call, now), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(long, now ) + __field(long, ack_at ) + __field(long, ack_lost_at ) + __field(long, resend_at ) + __field(long, ping_at ) + __field(long, expect_rx_by ) + __field(long, expect_req_by ) + __field(long, expect_term_by ) + __field(long, timer ) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->now = now; + __entry->ack_at = call->ack_at; + __entry->ack_lost_at = call->ack_lost_at; + __entry->resend_at = call->resend_at; + __entry->expect_rx_by = call->expect_rx_by; + __entry->expect_req_by = call->expect_req_by; + __entry->expect_term_by = call->expect_term_by; + __entry->timer = call->timer.expires; + ), + + TP_printk("c=%08x EXPIRED a=%ld la=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld", + __entry->call, + __entry->ack_at - __entry->now, + __entry->ack_lost_at - __entry->now, + __entry->resend_at - __entry->now, + __entry->expect_rx_by - __entry->now, + __entry->expect_req_by - __entry->now, + __entry->expect_term_by - __entry->now, + __entry->timer - __entry->now) + ); + TRACE_EVENT(rxrpc_rx_lose, TP_PROTO(struct rxrpc_skb_priv *sp), -- cgit v1.2.3 From f7fa52421f76309c574f2575701660bc3ea3a705 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 18 Aug 2022 11:52:36 +0100 Subject: rxrpc: Record stats for why the REQUEST-ACK flag is being set Record stats for why the REQUEST-ACK flag is being set. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index a72f04e3d264..794523d15321 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -250,6 +250,7 @@ EM(rxrpc_reqack_retrans, "RETRANS ") \ EM(rxrpc_reqack_slow_start, "SLOW-START") \ E_(rxrpc_reqack_small_txwin, "SMALL-TXWN") +/* ---- Must update size of stat_why_req_ack[] if more are added! */ /* * Generate enums for tracing information. -- cgit v1.2.3 From 42fb06b391ace2aec5cdb1ebb8ff668f0a34332f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Oct 2022 08:49:29 +0100 Subject: net: Change the udp encap_err_rcv to allow use of {ip,ipv6}_icmp_error() Change the udp encap_err_rcv signature to match ip_icmp_error() and ipv6_icmp_error() so that those can be used from the called function and export them. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org --- include/linux/udp.h | 3 ++- include/net/udp_tunnel.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index 5cdba00a904a..dea57aa37df6 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -70,7 +70,8 @@ struct udp_sock { * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); - void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); + void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 72394f441dad..0ca9b7a11baf 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -68,8 +68,8 @@ typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk, struct sk_buff *skb); typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk, - struct sk_buff *skb, - unsigned int udp_offset); + struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, struct list_head *head, -- cgit v1.2.3 From 27f699ccb89d65165175525254fec3d9d6b8d500 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Oct 2022 13:52:06 +0100 Subject: rxrpc: Remove the flags from the rxrpc_skb tracepoint Remove the flags from the rxrpc_skb tracepoint as we're no longer going to be using this for the transmission buffers and so marking which are transmission buffers isn't going to be necessary. Note that this also remove the rxrpc skb flag that indicates if this is a transmission buffer and so the count is not updated for the moment. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 794523d15321..484c8d032ab8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -461,14 +461,13 @@ TRACE_EVENT(rxrpc_call, TRACE_EVENT(rxrpc_skb, TP_PROTO(struct sk_buff *skb, enum rxrpc_skb_trace op, - int usage, int mod_count, u8 flags, const void *where), + int usage, int mod_count, const void *where), - TP_ARGS(skb, op, usage, mod_count, flags, where), + TP_ARGS(skb, op, usage, mod_count, where), TP_STRUCT__entry( __field(struct sk_buff *, skb ) __field(enum rxrpc_skb_trace, op ) - __field(u8, flags ) __field(int, usage ) __field(int, mod_count ) __field(const void *, where ) @@ -476,16 +475,14 @@ TRACE_EVENT(rxrpc_skb, TP_fast_assign( __entry->skb = skb; - __entry->flags = flags; __entry->op = op; __entry->usage = usage; __entry->mod_count = mod_count; __entry->where = where; ), - TP_printk("s=%p %cx %s u=%d m=%d p=%pSR", + TP_printk("s=%p Rx %s u=%d m=%d p=%pSR", __entry->skb, - __entry->flags & RXRPC_SKB_TX_BUFFER ? 'T' : 'R', __print_symbolic(__entry->op, rxrpc_skb_traces), __entry->usage, __entry->mod_count, -- cgit v1.2.3 From 02a1935640f8f8539b8f2dbd6eeb539de93b2ce4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 5 Apr 2022 21:16:32 +0100 Subject: rxrpc: Define rxrpc_txbuf struct to carry data to be transmitted Define a struct, rxrpc_txbuf, to carry data to be transmitted instead of a socket buffer so that it can be placed onto multiple queues at once. This also allows the data buffer to be in the same allocation as the internal data. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 45 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 484c8d032ab8..47b157b1d32b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -252,6 +252,18 @@ E_(rxrpc_reqack_small_txwin, "SMALL-TXWN") /* ---- Must update size of stat_why_req_ack[] if more are added! */ +#define rxrpc_txbuf_traces \ + EM(rxrpc_txbuf_alloc_ack, "ALLOC ACK ") \ + EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ + EM(rxrpc_txbuf_free, "FREE ") \ + EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ + EM(rxrpc_txbuf_get_retrans, "GET RETRANS") \ + EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ + EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ + EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ + EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ + E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") + /* * Generate enums for tracing information. */ @@ -280,6 +292,7 @@ enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); enum rxrpc_transmit_trace { rxrpc_transmit_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); +enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); #endif /* end __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY */ @@ -308,6 +321,7 @@ rxrpc_skb_traces; rxrpc_timer_traces; rxrpc_transmit_traces; rxrpc_tx_points; +rxrpc_txbuf_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -1469,6 +1483,37 @@ TRACE_EVENT(rxrpc_req_ack, __print_symbolic(__entry->why, rxrpc_req_ack_traces)) ); +TRACE_EVENT(rxrpc_txbuf, + TP_PROTO(unsigned int debug_id, + unsigned int call_debug_id, rxrpc_seq_t seq, + int ref, enum rxrpc_txbuf_trace what), + + TP_ARGS(debug_id, call_debug_id, seq, ref, what), + + TP_STRUCT__entry( + __field(unsigned int, debug_id ) + __field(unsigned int, call_debug_id ) + __field(rxrpc_seq_t, seq ) + __field(int, ref ) + __field(enum rxrpc_txbuf_trace, what ) + ), + + TP_fast_assign( + __entry->debug_id = debug_id; + __entry->call_debug_id = call_debug_id; + __entry->seq = seq; + __entry->ref = ref; + __entry->what = what; + ), + + TP_printk("B=%08x c=%08x q=%08x %s r=%d", + __entry->debug_id, + __entry->call_debug_id, + __entry->seq, + __print_symbolic(__entry->what, rxrpc_txbuf_traces), + __entry->ref) + ); + #undef EM #undef E_ #endif /* _TRACE_RXRPC_H */ -- cgit v1.2.3 From 72f0c6fb057971864fe4d42b289b8e6ede836ef1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 30 Jan 2020 21:48:13 +0000 Subject: rxrpc: Allocate ACK records at proposal and queue for transmission Allocate rxrpc_txbuf records for ACKs and put onto a queue for the transmitter thread to dispatch. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 47 +++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 47b157b1d32b..1597ff7ad97e 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -34,7 +34,8 @@ EM(rxrpc_local_new, "NEW") \ EM(rxrpc_local_processing, "PRO") \ EM(rxrpc_local_put, "PUT") \ - E_(rxrpc_local_queued, "QUE") + EM(rxrpc_local_queued, "QUE") \ + E_(rxrpc_local_tx_ack, "TAK") #define rxrpc_peer_traces \ EM(rxrpc_peer_got, "GOT") \ @@ -258,7 +259,9 @@ EM(rxrpc_txbuf_free, "FREE ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ EM(rxrpc_txbuf_get_retrans, "GET RETRANS") \ + EM(rxrpc_txbuf_put_ack_tx, "PUT ACK TX ") \ EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ + EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ @@ -1095,19 +1098,16 @@ TRACE_EVENT(rxrpc_rx_lose, TRACE_EVENT(rxrpc_propose_ack, TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, - u8 ack_reason, rxrpc_serial_t serial, bool immediate, - bool background, enum rxrpc_propose_ack_outcome outcome), + u8 ack_reason, rxrpc_serial_t serial, + enum rxrpc_propose_ack_outcome outcome), - TP_ARGS(call, why, ack_reason, serial, immediate, background, - outcome), + TP_ARGS(call, why, ack_reason, serial, outcome), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum rxrpc_propose_ack_trace, why ) __field(rxrpc_serial_t, serial ) __field(u8, ack_reason ) - __field(bool, immediate ) - __field(bool, background ) __field(enum rxrpc_propose_ack_outcome, outcome ) ), @@ -1116,21 +1116,44 @@ TRACE_EVENT(rxrpc_propose_ack, __entry->why = why; __entry->serial = serial; __entry->ack_reason = ack_reason; - __entry->immediate = immediate; - __entry->background = background; __entry->outcome = outcome; ), - TP_printk("c=%08x %s %s r=%08x i=%u b=%u%s", + TP_printk("c=%08x %s %s r=%08x%s", __entry->call, __print_symbolic(__entry->why, rxrpc_propose_ack_traces), __print_symbolic(__entry->ack_reason, rxrpc_ack_names), __entry->serial, - __entry->immediate, - __entry->background, __print_symbolic(__entry->outcome, rxrpc_propose_ack_outcomes)) ); +TRACE_EVENT(rxrpc_send_ack, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, + u8 ack_reason, rxrpc_serial_t serial), + + TP_ARGS(call, why, ack_reason, serial), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(enum rxrpc_propose_ack_trace, why ) + __field(rxrpc_serial_t, serial ) + __field(u8, ack_reason ) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->why = why; + __entry->serial = serial; + __entry->ack_reason = ack_reason; + ), + + TP_printk("c=%08x %s %s r=%08x", + __entry->call, + __print_symbolic(__entry->why, rxrpc_propose_ack_traces), + __print_symbolic(__entry->ack_reason, rxrpc_ack_names), + __entry->serial) + ); + TRACE_EVENT(rxrpc_retransmit, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, u8 annotation, s64 expiry), -- cgit v1.2.3 From 530403d9ba1c3f51c721a394f642e56309072295 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 30 Jan 2020 21:48:14 +0000 Subject: rxrpc: Clean up ACK handling Clean up the rxrpc_propose_ACK() function. If deferred PING ACK proposal is split out, it's only really needed for deferred DELAY ACKs. All other ACKs, bar terminal IDLE ACK are sent immediately. The deferred IDLE ACK submission can be handled by conversion of a DELAY ACK into an IDLE ACK if there's nothing to be SACK'd. Also, because there's a delay between an ACK being generated and being transmitted, it's possible that other ACKs of the same type will be generated during that interval. Apart from the ACK time and the serial number responded to, most of the ACK body, including window and SACK parameters, are not filled out till the point of transmission - so we can avoid generating a new ACK if there's one pending that will cover the SACK data we need to convey. Therefore, don't propose a new DELAY or IDLE ACK for a call if there's one already pending. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 52 ++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 1597ff7ad97e..d32e9858c682 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -158,6 +158,7 @@ #define rxrpc_propose_ack_traces \ EM(rxrpc_propose_ack_client_tx_end, "ClTxEnd") \ EM(rxrpc_propose_ack_input_data, "DataIn ") \ + EM(rxrpc_propose_ack_input_data_hole, "DataInH") \ EM(rxrpc_propose_ack_ping_for_check_life, "ChkLife") \ EM(rxrpc_propose_ack_ping_for_keepalive, "KeepAlv") \ EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \ @@ -170,11 +171,6 @@ EM(rxrpc_propose_ack_rotate_rx, "RxAck ") \ E_(rxrpc_propose_ack_terminal_ack, "ClTerm ") -#define rxrpc_propose_ack_outcomes \ - EM(rxrpc_propose_ack_subsume, " Subsume") \ - EM(rxrpc_propose_ack_update, " Update") \ - E_(rxrpc_propose_ack_use, " New") - #define rxrpc_congest_modes \ EM(RXRPC_CALL_CONGEST_AVOIDANCE, "CongAvoid") \ EM(RXRPC_CALL_FAST_RETRANSMIT, "FastReTx ") \ @@ -313,7 +309,6 @@ rxrpc_congest_changes; rxrpc_congest_modes; rxrpc_conn_traces; rxrpc_local_traces; -rxrpc_propose_ack_outcomes; rxrpc_propose_ack_traces; rxrpc_receive_traces; rxrpc_recvmsg_traces; @@ -1012,7 +1007,7 @@ TRACE_EVENT(rxrpc_timer, __entry->call = call->debug_id; __entry->why = why; __entry->now = now; - __entry->ack_at = call->ack_at; + __entry->ack_at = call->delay_ack_at; __entry->ack_lost_at = call->ack_lost_at; __entry->resend_at = call->resend_at; __entry->expect_rx_by = call->expect_rx_by; @@ -1054,7 +1049,7 @@ TRACE_EVENT(rxrpc_timer_expired, TP_fast_assign( __entry->call = call->debug_id; __entry->now = now; - __entry->ack_at = call->ack_at; + __entry->ack_at = call->delay_ack_at; __entry->ack_lost_at = call->ack_lost_at; __entry->resend_at = call->resend_at; __entry->expect_rx_by = call->expect_rx_by; @@ -1098,17 +1093,15 @@ TRACE_EVENT(rxrpc_rx_lose, TRACE_EVENT(rxrpc_propose_ack, TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, - u8 ack_reason, rxrpc_serial_t serial, - enum rxrpc_propose_ack_outcome outcome), + u8 ack_reason, rxrpc_serial_t serial), - TP_ARGS(call, why, ack_reason, serial, outcome), + TP_ARGS(call, why, ack_reason, serial), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum rxrpc_propose_ack_trace, why ) __field(rxrpc_serial_t, serial ) __field(u8, ack_reason ) - __field(enum rxrpc_propose_ack_outcome, outcome ) ), TP_fast_assign( @@ -1116,15 +1109,13 @@ TRACE_EVENT(rxrpc_propose_ack, __entry->why = why; __entry->serial = serial; __entry->ack_reason = ack_reason; - __entry->outcome = outcome; ), - TP_printk("c=%08x %s %s r=%08x%s", + TP_printk("c=%08x %s %s r=%08x", __entry->call, __print_symbolic(__entry->why, rxrpc_propose_ack_traces), __print_symbolic(__entry->ack_reason, rxrpc_ack_names), - __entry->serial, - __print_symbolic(__entry->outcome, rxrpc_propose_ack_outcomes)) + __entry->serial) ); TRACE_EVENT(rxrpc_send_ack, @@ -1154,6 +1145,35 @@ TRACE_EVENT(rxrpc_send_ack, __entry->serial) ); +TRACE_EVENT(rxrpc_drop_ack, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, + u8 ack_reason, rxrpc_serial_t serial, bool nobuf), + + TP_ARGS(call, why, ack_reason, serial, nobuf), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(enum rxrpc_propose_ack_trace, why ) + __field(rxrpc_serial_t, serial ) + __field(u8, ack_reason ) + __field(bool, nobuf ) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->why = why; + __entry->serial = serial; + __entry->ack_reason = ack_reason; + __entry->nobuf = nobuf; + ), + + TP_printk("c=%08x %s %s r=%08x nbf=%u", + __entry->call, + __print_symbolic(__entry->why, rxrpc_propose_ack_traces), + __print_symbolic(__entry->ack_reason, rxrpc_ack_names), + __entry->serial, __entry->nobuf) + ); + TRACE_EVENT(rxrpc_retransmit, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, u8 annotation, s64 expiry), -- cgit v1.2.3 From faf92e8d53f5f03842da25af971a3f0ef88ffba2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Oct 2022 17:22:40 +0100 Subject: rxrpc: Split the rxrpc_recvmsg tracepoint Split the rxrpc_recvmsg tracepoint so that the tracepoints that are about data packet processing (and which have extra pieces of information) are separate from the tracepoint that shows the general flow of recvmsg(). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d32e9858c682..84464b29e54a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -885,6 +885,30 @@ TRACE_EVENT(rxrpc_receive, ); TRACE_EVENT(rxrpc_recvmsg, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_recvmsg_trace why, + int ret), + + TP_ARGS(call, why, ret), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(enum rxrpc_recvmsg_trace, why ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->call = call ? call->debug_id : 0; + __entry->why = why; + __entry->ret = ret; + ), + + TP_printk("c=%08x %s ret=%d", + __entry->call, + __print_symbolic(__entry->why, rxrpc_recvmsg_traces), + __entry->ret) + ); + +TRACE_EVENT(rxrpc_recvdata, TP_PROTO(struct rxrpc_call *call, enum rxrpc_recvmsg_trace why, rxrpc_seq_t seq, unsigned int offset, unsigned int len, int ret), -- cgit v1.2.3 From d4d02d8bb5c412d977af7ea7c7ea91977a6a64dc Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Oct 2022 17:44:39 +0100 Subject: rxrpc: Clone received jumbo subpackets and queue separately Split up received jumbo packets into separate skbuffs by cloning the original skbuff for each subpacket and setting the offset and length of the data in that subpacket in the skbuff's private data. The subpackets are then placed on the recvmsg queue separately. The security class then gets to revise the offset and length to remove its metadata. If we fail to clone a packet, we just drop it and let the peer resend it. The original packet gets used for the final subpacket. This should make it easier to handle parallel decryption of the subpackets. It also simplifies the handling of lost or misordered packets in the queuing/buffering loop as the possibility of overlapping jumbo packets no longer needs to be considered. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 84464b29e54a..03a984e661bc 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -18,6 +18,7 @@ */ #define rxrpc_skb_traces \ EM(rxrpc_skb_cleaned, "CLN") \ + EM(rxrpc_skb_cloned_jumbo, "CLJ") \ EM(rxrpc_skb_freed, "FRE") \ EM(rxrpc_skb_got, "GOT") \ EM(rxrpc_skb_lost, "*L*") \ @@ -630,16 +631,15 @@ TRACE_EVENT(rxrpc_transmit, TRACE_EVENT(rxrpc_rx_data, TP_PROTO(unsigned int call, rxrpc_seq_t seq, - rxrpc_serial_t serial, u8 flags, u8 anno), + rxrpc_serial_t serial, u8 flags), - TP_ARGS(call, seq, serial, flags, anno), + TP_ARGS(call, seq, serial, flags), TP_STRUCT__entry( __field(unsigned int, call ) __field(rxrpc_seq_t, seq ) __field(rxrpc_serial_t, serial ) __field(u8, flags ) - __field(u8, anno ) ), TP_fast_assign( @@ -647,15 +647,13 @@ TRACE_EVENT(rxrpc_rx_data, __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; - __entry->anno = anno; ), - TP_printk("c=%08x DATA %08x q=%08x fl=%02x a=%02x", + TP_printk("c=%08x DATA %08x q=%08x fl=%02x", __entry->call, __entry->serial, __entry->seq, - __entry->flags, - __entry->anno) + __entry->flags) ); TRACE_EVENT(rxrpc_rx_ack, -- cgit v1.2.3 From 5d7edbc9231ec6b60f9c5b7e7980e9a1cd92e6bb Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 27 Aug 2022 14:27:56 +0100 Subject: rxrpc: Get rid of the Rx ring Get rid of the Rx ring and replace it with a pair of queues instead. One queue gets the packets that are in-sequence and are ready for processing by recvmsg(); the other queue gets the out-of-sequence packets for addition to the first queue as the holes get filled. The annotation ring is removed and replaced with a SACK table. The SACK table has the bits set that correspond exactly to the sequence number of the packet being acked. The SACK ring is copied when an ACK packet is being assembled and rotated so that the first ACK is in byte 0. Flow control handling is altered so that packets that are moved to the in-sequence queue are hard-ACK'd even before they're consumed - and then the Rx window size in the ACK packet (rsize) is shrunk down to compensate (even going to 0 if the window is full). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 03a984e661bc..284a1560b0a8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -104,7 +104,12 @@ EM(rxrpc_receive_incoming, "INC") \ EM(rxrpc_receive_queue, "QUE") \ EM(rxrpc_receive_queue_last, "QLS") \ - E_(rxrpc_receive_rotate, "ROT") + EM(rxrpc_receive_queue_oos, "QUO") \ + EM(rxrpc_receive_queue_oos_last, "QOL") \ + EM(rxrpc_receive_oos, "OOS") \ + EM(rxrpc_receive_oos_last, "OSL") \ + EM(rxrpc_receive_rotate, "ROT") \ + E_(rxrpc_receive_rotate_last, "RLS") #define rxrpc_recvmsg_traces \ EM(rxrpc_recvmsg_cont, "CONT") \ @@ -860,8 +865,7 @@ TRACE_EVENT(rxrpc_receive, __field(enum rxrpc_receive_trace, why ) __field(rxrpc_serial_t, serial ) __field(rxrpc_seq_t, seq ) - __field(rxrpc_seq_t, hard_ack ) - __field(rxrpc_seq_t, top ) + __field(u64, window ) ), TP_fast_assign( @@ -869,8 +873,7 @@ TRACE_EVENT(rxrpc_receive, __entry->why = why; __entry->serial = serial; __entry->seq = seq; - __entry->hard_ack = call->rx_hard_ack; - __entry->top = call->rx_top; + __entry->window = atomic64_read(&call->ackr_window); ), TP_printk("c=%08x %s r=%08x q=%08x w=%08x-%08x", @@ -878,8 +881,8 @@ TRACE_EVENT(rxrpc_receive, __print_symbolic(__entry->why, rxrpc_receive_traces), __entry->serial, __entry->seq, - __entry->hard_ack, - __entry->top) + lower_32_bits(__entry->window), + upper_32_bits(__entry->window)) ); TRACE_EVENT(rxrpc_recvmsg, @@ -1459,7 +1462,7 @@ TRACE_EVENT(rxrpc_call_reset, __entry->call_serial = call->rx_serial; __entry->conn_serial = call->conn->hi_serial; __entry->tx_seq = call->tx_hard_ack; - __entry->rx_seq = call->rx_hard_ack; + __entry->rx_seq = call->rx_highest_seq; ), TP_printk("c=%08x %08x:%08x r=%08x/%08x tx=%08x rx=%08x", -- cgit v1.2.3 From a4ea4c47761943d90cd5d1688b3c3c65922ff2b1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 31 Mar 2022 23:55:08 +0100 Subject: rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 78 +++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 284a1560b0a8..71ca74e40ec8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -75,6 +75,7 @@ EM(rxrpc_call_got, "GOT") \ EM(rxrpc_call_got_kernel, "Gke") \ EM(rxrpc_call_got_timer, "GTM") \ + EM(rxrpc_call_got_tx, "Gtx") \ EM(rxrpc_call_got_userid, "Gus") \ EM(rxrpc_call_new_client, "NWc") \ EM(rxrpc_call_new_service, "NWs") \ @@ -83,20 +84,22 @@ EM(rxrpc_call_put_noqueue, "PnQ") \ EM(rxrpc_call_put_notimer, "PnT") \ EM(rxrpc_call_put_timer, "PTM") \ + EM(rxrpc_call_put_tx, "Ptx") \ EM(rxrpc_call_put_userid, "Pus") \ EM(rxrpc_call_queued, "QUE") \ EM(rxrpc_call_queued_ref, "QUR") \ EM(rxrpc_call_release, "RLS") \ E_(rxrpc_call_seen, "SEE") -#define rxrpc_transmit_traces \ - EM(rxrpc_transmit_await_reply, "AWR") \ - EM(rxrpc_transmit_end, "END") \ - EM(rxrpc_transmit_queue, "QUE") \ - EM(rxrpc_transmit_queue_last, "QLS") \ - EM(rxrpc_transmit_rotate, "ROT") \ - EM(rxrpc_transmit_rotate_last, "RLS") \ - E_(rxrpc_transmit_wait, "WAI") +#define rxrpc_txqueue_traces \ + EM(rxrpc_txqueue_await_reply, "AWR") \ + EM(rxrpc_txqueue_dequeue, "DEQ") \ + EM(rxrpc_txqueue_end, "END") \ + EM(rxrpc_txqueue_queue, "QUE") \ + EM(rxrpc_txqueue_queue_last, "QLS") \ + EM(rxrpc_txqueue_rotate, "ROT") \ + EM(rxrpc_txqueue_rotate_last, "RLS") \ + E_(rxrpc_txqueue_wait, "WAI") #define rxrpc_receive_traces \ EM(rxrpc_receive_end, "END") \ @@ -259,6 +262,7 @@ EM(rxrpc_txbuf_alloc_ack, "ALLOC ACK ") \ EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ EM(rxrpc_txbuf_free, "FREE ") \ + EM(rxrpc_txbuf_get_buffer, "GET BUFFER ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ EM(rxrpc_txbuf_get_retrans, "GET RETRANS") \ EM(rxrpc_txbuf_put_ack_tx, "PUT ACK TX ") \ @@ -266,6 +270,7 @@ EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ + EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") @@ -295,9 +300,9 @@ enum rxrpc_rtt_rx_trace { rxrpc_rtt_rx_traces } __mode(byte); enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); -enum rxrpc_transmit_trace { rxrpc_transmit_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); +enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); #endif /* end __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY */ @@ -323,9 +328,9 @@ rxrpc_rtt_rx_traces; rxrpc_rtt_tx_traces; rxrpc_skb_traces; rxrpc_timer_traces; -rxrpc_transmit_traces; rxrpc_tx_points; rxrpc_txbuf_traces; +rxrpc_txqueue_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -605,15 +610,16 @@ TRACE_EVENT(rxrpc_call_complete, __entry->abort_code) ); -TRACE_EVENT(rxrpc_transmit, - TP_PROTO(struct rxrpc_call *call, enum rxrpc_transmit_trace why), +TRACE_EVENT(rxrpc_txqueue, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_txqueue_trace why), TP_ARGS(call, why), TP_STRUCT__entry( __field(unsigned int, call ) - __field(enum rxrpc_transmit_trace, why ) - __field(rxrpc_seq_t, tx_hard_ack ) + __field(enum rxrpc_txqueue_trace, why ) + __field(rxrpc_seq_t, acks_hard_ack ) + __field(rxrpc_seq_t, tx_bottom ) __field(rxrpc_seq_t, tx_top ) __field(int, tx_winsize ) ), @@ -621,16 +627,19 @@ TRACE_EVENT(rxrpc_transmit, TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; - __entry->tx_hard_ack = call->tx_hard_ack; + __entry->acks_hard_ack = call->acks_hard_ack; + __entry->tx_bottom = call->tx_bottom; __entry->tx_top = call->tx_top; __entry->tx_winsize = call->tx_winsize; ), - TP_printk("c=%08x %s f=%08x n=%u/%u", + TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u", __entry->call, - __print_symbolic(__entry->why, rxrpc_transmit_traces), - __entry->tx_hard_ack + 1, - __entry->tx_top - __entry->tx_hard_ack, + __print_symbolic(__entry->why, rxrpc_txqueue_traces), + __entry->tx_bottom, + __entry->acks_hard_ack, + __entry->tx_top - __entry->tx_bottom, + __entry->tx_top - __entry->acks_hard_ack, __entry->tx_winsize) ); @@ -1200,29 +1209,25 @@ TRACE_EVENT(rxrpc_drop_ack, ); TRACE_EVENT(rxrpc_retransmit, - TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, u8 annotation, - s64 expiry), + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, s64 expiry), - TP_ARGS(call, seq, annotation, expiry), + TP_ARGS(call, seq, expiry), TP_STRUCT__entry( __field(unsigned int, call ) __field(rxrpc_seq_t, seq ) - __field(u8, annotation ) __field(s64, expiry ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->seq = seq; - __entry->annotation = annotation; __entry->expiry = expiry; ), - TP_printk("c=%08x q=%x a=%02x xp=%lld", + TP_printk("c=%08x q=%x xp=%lld", __entry->call, __entry->seq, - __entry->annotation, __entry->expiry) ); @@ -1245,14 +1250,14 @@ TRACE_EVENT(rxrpc_congest, TP_fast_assign( __entry->call = call->debug_id; __entry->change = change; - __entry->hard_ack = call->tx_hard_ack; + __entry->hard_ack = call->acks_hard_ack; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; __entry->ack_serial = ack_serial; memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), - TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nr=%u,%u nw=%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s", + TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, __entry->ack_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), @@ -1362,26 +1367,23 @@ TRACE_EVENT(rxrpc_connect_call, ); TRACE_EVENT(rxrpc_resend, - TP_PROTO(struct rxrpc_call *call, int ix), + TP_PROTO(struct rxrpc_call *call), - TP_ARGS(call, ix), + TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call ) - __field(int, ix ) - __array(u8, anno, 64 ) + __field(rxrpc_seq_t, seq ) ), TP_fast_assign( __entry->call = call->debug_id; - __entry->ix = ix; - memcpy(__entry->anno, call->rxtx_annotations, 64); + __entry->seq = call->acks_hard_ack; ), - TP_printk("c=%08x ix=%u a=%64phN", + TP_printk("c=%08x q=%x", __entry->call, - __entry->ix, - __entry->anno) + __entry->seq) ); TRACE_EVENT(rxrpc_rx_icmp, @@ -1461,7 +1463,7 @@ TRACE_EVENT(rxrpc_call_reset, __entry->call_id = call->call_id; __entry->call_serial = call->rx_serial; __entry->conn_serial = call->conn->hi_serial; - __entry->tx_seq = call->tx_hard_ack; + __entry->tx_seq = call->acks_hard_ack; __entry->rx_seq = call->rx_highest_seq; ), -- cgit v1.2.3 From d57a3a151660902091491ac2633134e1be92557f Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 7 May 2022 10:06:13 +0100 Subject: rxrpc: Save last ACK's SACK table rather than marking txbufs Improve the tracking of which packets need to be transmitted by saving the last ACK packet that we receive that has a populated soft-ACK table rather than marking packets. Then we can step through the soft-ACK table and look at the packets we've transmitted beyond that to determine which packets we might want to retransmit. We also look at the highest serial number that has been acked to try and guess which packets we've transmitted the peer is likely to have seen. If necessary, we send a ping to retrieve that number. One downside that might be a problem is that we can't then compare the previous acked/unacked state so easily in rxrpc_input_soft_acks() - which is a potential problem for the slow-start algorithm. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 71ca74e40ec8..a11de55c3c14 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -17,6 +17,7 @@ * Declare tracing information enums and their string mappings for display. */ #define rxrpc_skb_traces \ + EM(rxrpc_skb_ack, "ACK") \ EM(rxrpc_skb_cleaned, "CLN") \ EM(rxrpc_skb_cloned_jumbo, "CLJ") \ EM(rxrpc_skb_freed, "FRE") \ @@ -1257,7 +1258,7 @@ TRACE_EVENT(rxrpc_congest, memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), - TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s", + TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u r=%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, __entry->ack_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), @@ -1265,8 +1266,8 @@ TRACE_EVENT(rxrpc_congest, __print_symbolic(__entry->sum.mode, rxrpc_congest_modes), __entry->sum.cwnd, __entry->sum.ssthresh, - __entry->sum.nr_acks, __entry->sum.nr_nacks, - __entry->sum.nr_new_acks, __entry->sum.nr_new_nacks, + __entry->sum.nr_acks, __entry->sum.saw_nacks, + __entry->sum.nr_new_acks, __entry->sum.nr_rot_new_acks, __entry->top - __entry->hard_ack, __entry->sum.cumulative_acks, -- cgit v1.2.3 From 1fc4fa2ac93dcf3542f2dc6f7ff88fb022da5116 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 3 Oct 2022 18:49:11 +0100 Subject: rxrpc: Fix congestion management rxrpc has a problem in its congestion management in that it saves the congestion window size (cwnd) from one call to another, but if this is 0 at the time is saved, then the next call may not actually manage to ever transmit anything. To this end: (1) Don't save cwnd between calls, but rather reset back down to the initial cwnd and re-enter slow-start if data transmission is idle for more than an RTT. (2) Preserve ssthresh instead, as that is a handy estimate of pipe capacity. Knowing roughly when to stop slow start and enter congestion avoidance can reduce the tendency to overshoot and drop larger amounts of packets when probing. In future, cwind growth also needs to be constrained when the window isn't being filled due to being application limited. Reported-by: Simon Wilkinson cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index a11de55c3c14..b9886d1df825 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -193,6 +193,7 @@ EM(rxrpc_cong_new_low_nack, " NewLowN") \ EM(rxrpc_cong_no_change, " -") \ EM(rxrpc_cong_progress, " Progres") \ + EM(rxrpc_cong_idle_reset, " IdleRes") \ EM(rxrpc_cong_retransmit_again, " ReTxAgn") \ EM(rxrpc_cong_rtt_window_end, " RttWinE") \ E_(rxrpc_cong_saw_nack, " SawNack") -- cgit v1.2.3 From bd039b5ea2a91ea707ee8539df26456bd5be80af Mon Sep 17 00:00:00 2001 From: Andy Ren Date: Mon, 7 Nov 2022 09:42:42 -0800 Subject: net/core: Allow live renaming when an interface is up Allow a network interface to be renamed when the interface is up. As described in the netconsole documentation [1], when netconsole is used as a built-in, it will bring up the specified interface as soon as possible. As a result, user space will not be able to rename the interface since the kernel disallows renaming of interfaces that are administratively up unless the 'IFF_LIVE_RENAME_OK' private flag was set by the kernel. The original solution [2] to this problem was to add a new parameter to the netconsole configuration parameters that allows renaming of the interface used by netconsole while it is administratively up. However, during the discussion that followed, it became apparent that we have no reason to keep the current restriction and instead we should allow user space to rename interfaces regardless of their administrative state: 1. The restriction was put in place over 20 years ago when renaming was only possible via IOCTL and before rtnetlink started notifying user space about such changes like it does today. 2. The 'IFF_LIVE_RENAME_OK' flag was added over 3 years ago in version 5.2 and no regressions were reported. 3. In-kernel listeners to 'NETDEV_CHANGENAME' do not seem to care about the administrative state of interface. Therefore, allow user space to rename running interfaces by removing the restriction and the associated 'IFF_LIVE_RENAME_OK' flag. Help in possible triage by emitting a message to the kernel log that an interface was renamed while UP. [1] https://www.kernel.org/doc/Documentation/networking/netconsole.rst [2] https://lore.kernel.org/netdev/20221102002420.2613004-1-andy.ren@getcruise.com/ Signed-off-by: Andy Ren Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d45713a06568..4be87b89e481 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1650,7 +1650,6 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device - * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN @@ -1686,7 +1685,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, - IFF_LIVE_RENAME_OK = 1<<30, + /* was IFF_LIVE_RENAME_OK */ IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; @@ -1721,7 +1720,6 @@ enum netdev_priv_flags { #define IFF_FAILOVER IFF_FAILOVER #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER -#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK #define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR /* Specifies the type of the struct net_device::ml_priv pointer */ -- cgit v1.2.3 From 3e52fba03a20234abc65a656cef063a1045d9723 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 8 Nov 2022 14:22:06 +0100 Subject: net: introduce a helper to move notifier block to different namespace Currently, net_dev() netdev notifier variant follows the netdev with per-net notifier from namespace to namespace. This is implemented by move_netdevice_notifiers_dev_net() helper. For devlink it is needed to re-register per-net notifier during devlink reload. Introduce a new helper called move_netdevice_notifier_net() and share the unregister/register code with existing move_netdevice_notifiers_dev_net() helper. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4be87b89e481..02a2318da7c7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2826,6 +2826,8 @@ int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); +void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, + struct notifier_block *nb); int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); -- cgit v1.2.3 From 27fabd02abf30a9df9899f92d467591c7eabb1ba Mon Sep 17 00:00:00 2001 From: "Hans J. Schultz" Date: Tue, 8 Nov 2022 11:47:08 +0100 Subject: bridge: switchdev: Allow device drivers to install locked FDB entries When the bridge is offloaded to hardware, FDB entries are learned and aged-out by the hardware. Some device drivers synchronize the hardware and software FDBs by generating switchdev events towards the bridge. When a port is locked, the hardware must not learn autonomously, as otherwise any host will blindly gain authorization. Instead, the hardware should generate events regarding hosts that are trying to gain authorization and their MAC addresses should be notified by the device driver as locked FDB entries towards the bridge driver. Allow device drivers to notify the bridge driver about such entries by extending the 'switchdev_notifier_fdb_info' structure with the 'locked' bit. The bit can only be set by device drivers and not by the bridge driver. Prevent a locked entry from being installed if MAB is not enabled on the bridge port. If an entry already exists in the bridge driver, reject the locked entry if the current entry does not have the "locked" flag set or if it points to a different port. The same semantics are implemented in the software data path. Signed-off-by: Hans J. Schultz Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Reviewed-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/net/switchdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 7dcdc97c0bc3..ca0312b78294 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -248,6 +248,7 @@ struct switchdev_notifier_fdb_info { u16 vid; u8 added_by_user:1, is_local:1, + locked:1, offloaded:1; }; -- cgit v1.2.3 From 2640a82bbc08393c846c7b55178079bb8ca31a8c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Nov 2022 11:47:10 +0100 Subject: devlink: Add packet traps for 802.1X operation Add packet traps for 802.1X operation. The "eapol" control trap is used to trap EAPOL packets and is required for the correct operation of the control plane. The "locked_port" drop trap can be enabled to gain visibility into packets that were dropped by the device due to the locked bridge port check. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index fa6e936af1a5..611a23a3deb2 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -894,6 +894,8 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_ESP_PARSING, DEVLINK_TRAP_GENERIC_ID_BLACKHOLE_NEXTHOP, DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER, + DEVLINK_TRAP_GENERIC_ID_EAPOL, + DEVLINK_TRAP_GENERIC_ID_LOCKED_PORT, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -930,6 +932,7 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_SAMPLE, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_TRAP, DEVLINK_TRAP_GROUP_GENERIC_ID_PARSER_ERROR_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_EAPOL, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -1121,6 +1124,10 @@ enum devlink_trap_group_generic_id { "blackhole_nexthop" #define DEVLINK_TRAP_GENERIC_NAME_DMAC_FILTER \ "dmac_filter" +#define DEVLINK_TRAP_GENERIC_NAME_EAPOL \ + "eapol" +#define DEVLINK_TRAP_GENERIC_NAME_LOCKED_PORT \ + "locked_port" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -1174,6 +1181,8 @@ enum devlink_trap_group_generic_id { "acl_trap" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_PARSER_ERROR_DROPS \ "parser_error_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_EAPOL \ + "eapol" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ -- cgit v1.2.3 From f6479ea4e5990e17b9690d66474198dcdba1b2c1 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 8 Nov 2022 14:25:56 +0000 Subject: net: mdio: add mdiodev_c45_(read|write) Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 00177567cfef..f7fbbf3069e7 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -488,6 +488,19 @@ static inline int mdiobus_c45_write(struct mii_bus *bus, int prtad, int devad, return mdiobus_write(bus, prtad, mdiobus_c45_addr(devad, regnum), val); } +static inline int mdiodev_c45_read(struct mdio_device *mdiodev, int devad, + u16 regnum) +{ + return mdiobus_c45_read(mdiodev->bus, mdiodev->addr, devad, regnum); +} + +static inline int mdiodev_c45_write(struct mdio_device *mdiodev, u32 devad, + u16 regnum, u16 val) +{ + return mdiobus_c45_write(mdiodev->bus, mdiodev->addr, devad, regnum, + val); +} + int mdiobus_register_device(struct mdio_device *mdiodev); int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); -- cgit v1.2.3 From fd325cd648f15eb9a8b32a68de3bafc72bcfe753 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 3 Nov 2022 12:16:25 -0700 Subject: net: mana: Move header files to a common location In preparation to add MANA RDMA driver, move all the required header files to a common location for use by both Ethernet and RDMA drivers. Reviewed-by: Dexuan Cui Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-8-git-send-email-longli@linuxonhyperv.com Acked-by: Haiyang Zhang Signed-off-by: Leon Romanovsky --- include/net/mana/gdma.h | 689 ++++++++++++++++++++++++++++++++++++++ include/net/mana/hw_channel.h | 195 +++++++++++ include/net/mana/mana.h | 650 +++++++++++++++++++++++++++++++++++ include/net/mana/mana_auxiliary.h | 10 + include/net/mana/shm_channel.h | 21 ++ 5 files changed, 1565 insertions(+) create mode 100644 include/net/mana/gdma.h create mode 100644 include/net/mana/hw_channel.h create mode 100644 include/net/mana/mana.h create mode 100644 include/net/mana/mana_auxiliary.h create mode 100644 include/net/mana/shm_channel.h (limited to 'include') diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h new file mode 100644 index 000000000000..72eaec2470c0 --- /dev/null +++ b/include/net/mana/gdma.h @@ -0,0 +1,689 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2021, Microsoft Corporation. */ + +#ifndef _GDMA_H +#define _GDMA_H + +#include +#include + +#include "shm_channel.h" + +/* Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. + */ + +enum gdma_request_type { + GDMA_VERIFY_VF_DRIVER_VERSION = 1, + GDMA_QUERY_MAX_RESOURCES = 2, + GDMA_LIST_DEVICES = 3, + GDMA_REGISTER_DEVICE = 4, + GDMA_DEREGISTER_DEVICE = 5, + GDMA_GENERATE_TEST_EQE = 10, + GDMA_CREATE_QUEUE = 12, + GDMA_DISABLE_QUEUE = 13, + GDMA_CREATE_DMA_REGION = 25, + GDMA_DMA_REGION_ADD_PAGES = 26, + GDMA_DESTROY_DMA_REGION = 27, +}; + +enum gdma_queue_type { + GDMA_INVALID_QUEUE, + GDMA_SQ, + GDMA_RQ, + GDMA_CQ, + GDMA_EQ, +}; + +enum gdma_work_request_flags { + GDMA_WR_NONE = 0, + GDMA_WR_OOB_IN_SGL = BIT(0), + GDMA_WR_PAD_BY_SGE0 = BIT(1), +}; + +enum gdma_eqe_type { + GDMA_EQE_COMPLETION = 3, + GDMA_EQE_TEST_EVENT = 64, + GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, + GDMA_EQE_HWC_INIT_DATA = 130, + GDMA_EQE_HWC_INIT_DONE = 131, +}; + +enum { + GDMA_DEVICE_NONE = 0, + GDMA_DEVICE_HWC = 1, + GDMA_DEVICE_MANA = 2, +}; + +struct gdma_resource { + /* Protect the bitmap */ + spinlock_t lock; + + /* The bitmap size in bits. */ + u32 size; + + /* The bitmap tracks the resources. */ + unsigned long *map; +}; + +union gdma_doorbell_entry { + u64 as_uint64; + + struct { + u64 id : 24; + u64 reserved : 8; + u64 tail_ptr : 31; + u64 arm : 1; + } cq; + + struct { + u64 id : 24; + u64 wqe_cnt : 8; + u64 tail_ptr : 32; + } rq; + + struct { + u64 id : 24; + u64 reserved : 8; + u64 tail_ptr : 32; + } sq; + + struct { + u64 id : 16; + u64 reserved : 16; + u64 tail_ptr : 31; + u64 arm : 1; + } eq; +}; /* HW DATA */ + +struct gdma_msg_hdr { + u32 hdr_type; + u32 msg_type; + u16 msg_version; + u16 hwc_msg_id; + u32 msg_size; +}; /* HW DATA */ + +struct gdma_dev_id { + union { + struct { + u16 type; + u16 instance; + }; + + u32 as_uint32; + }; +}; /* HW DATA */ + +struct gdma_req_hdr { + struct gdma_msg_hdr req; + struct gdma_msg_hdr resp; /* The expected response */ + struct gdma_dev_id dev_id; + u32 activity_id; +}; /* HW DATA */ + +struct gdma_resp_hdr { + struct gdma_msg_hdr response; + struct gdma_dev_id dev_id; + u32 activity_id; + u32 status; + u32 reserved; +}; /* HW DATA */ + +struct gdma_general_req { + struct gdma_req_hdr hdr; +}; /* HW DATA */ + +#define GDMA_MESSAGE_V1 1 + +struct gdma_general_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +#define GDMA_STANDARD_HEADER_TYPE 0 + +static inline void mana_gd_init_req_hdr(struct gdma_req_hdr *hdr, u32 code, + u32 req_size, u32 resp_size) +{ + hdr->req.hdr_type = GDMA_STANDARD_HEADER_TYPE; + hdr->req.msg_type = code; + hdr->req.msg_version = GDMA_MESSAGE_V1; + hdr->req.msg_size = req_size; + + hdr->resp.hdr_type = GDMA_STANDARD_HEADER_TYPE; + hdr->resp.msg_type = code; + hdr->resp.msg_version = GDMA_MESSAGE_V1; + hdr->resp.msg_size = resp_size; +} + +/* The 16-byte struct is part of the GDMA work queue entry (WQE). */ +struct gdma_sge { + u64 address; + u32 mem_key; + u32 size; +}; /* HW DATA */ + +struct gdma_wqe_request { + struct gdma_sge *sgl; + u32 num_sge; + + u32 inline_oob_size; + const void *inline_oob_data; + + u32 flags; + u32 client_data_unit; +}; + +enum gdma_page_type { + GDMA_PAGE_TYPE_4K, +}; + +#define GDMA_INVALID_DMA_REGION 0 + +struct gdma_mem_info { + struct device *dev; + + dma_addr_t dma_handle; + void *virt_addr; + u64 length; + + /* Allocated by the PF driver */ + u64 gdma_region; +}; + +#define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8 + +struct gdma_dev { + struct gdma_context *gdma_context; + + struct gdma_dev_id dev_id; + + u32 pdid; + u32 doorbell; + u32 gpa_mkey; + + /* GDMA driver specific pointer */ + void *driver_data; + + struct auxiliary_device *adev; +}; + +#define MINIMUM_SUPPORTED_PAGE_SIZE PAGE_SIZE + +#define GDMA_CQE_SIZE 64 +#define GDMA_EQE_SIZE 16 +#define GDMA_MAX_SQE_SIZE 512 +#define GDMA_MAX_RQE_SIZE 256 + +#define GDMA_COMP_DATA_SIZE 0x3C + +#define GDMA_EVENT_DATA_SIZE 0xC + +/* The WQE size must be a multiple of the Basic Unit, which is 32 bytes. */ +#define GDMA_WQE_BU_SIZE 32 + +#define INVALID_PDID UINT_MAX +#define INVALID_DOORBELL UINT_MAX +#define INVALID_MEM_KEY UINT_MAX +#define INVALID_QUEUE_ID UINT_MAX +#define INVALID_PCI_MSIX_INDEX UINT_MAX + +struct gdma_comp { + u32 cqe_data[GDMA_COMP_DATA_SIZE / 4]; + u32 wq_num; + bool is_sq; +}; + +struct gdma_event { + u32 details[GDMA_EVENT_DATA_SIZE / 4]; + u8 type; +}; + +struct gdma_queue; + +struct mana_eq { + struct gdma_queue *eq; +}; + +typedef void gdma_eq_callback(void *context, struct gdma_queue *q, + struct gdma_event *e); + +typedef void gdma_cq_callback(void *context, struct gdma_queue *q); + +/* The 'head' is the producer index. For SQ/RQ, when the driver posts a WQE + * (Note: the WQE size must be a multiple of the 32-byte Basic Unit), the + * driver increases the 'head' in BUs rather than in bytes, and notifies + * the HW of the updated head. For EQ/CQ, the driver uses the 'head' to track + * the HW head, and increases the 'head' by 1 for every processed EQE/CQE. + * + * The 'tail' is the consumer index for SQ/RQ. After the CQE of the SQ/RQ is + * processed, the driver increases the 'tail' to indicate that WQEs have + * been consumed by the HW, so the driver can post new WQEs into the SQ/RQ. + * + * The driver doesn't use the 'tail' for EQ/CQ, because the driver ensures + * that the EQ/CQ is big enough so they can't overflow, and the driver uses + * the owner bits mechanism to detect if the queue has become empty. + */ +struct gdma_queue { + struct gdma_dev *gdma_dev; + + enum gdma_queue_type type; + u32 id; + + struct gdma_mem_info mem_info; + + void *queue_mem_ptr; + u32 queue_size; + + bool monitor_avl_buf; + + u32 head; + u32 tail; + + /* Extra fields specific to EQ/CQ. */ + union { + struct { + bool disable_needed; + + gdma_eq_callback *callback; + void *context; + + unsigned int msix_index; + + u32 log2_throttle_limit; + } eq; + + struct { + gdma_cq_callback *callback; + void *context; + + struct gdma_queue *parent; /* For CQ/EQ relationship */ + } cq; + }; +}; + +struct gdma_queue_spec { + enum gdma_queue_type type; + bool monitor_avl_buf; + unsigned int queue_size; + + /* Extra fields specific to EQ/CQ. */ + union { + struct { + gdma_eq_callback *callback; + void *context; + + unsigned long log2_throttle_limit; + } eq; + + struct { + gdma_cq_callback *callback; + void *context; + + struct gdma_queue *parent_eq; + + } cq; + }; +}; + +struct gdma_irq_context { + void (*handler)(void *arg); + void *arg; +}; + +struct gdma_context { + struct device *dev; + + /* Per-vPort max number of queues */ + unsigned int max_num_queues; + unsigned int max_num_msix; + unsigned int num_msix_usable; + struct gdma_resource msix_resource; + struct gdma_irq_context *irq_contexts; + + /* This maps a CQ index to the queue structure. */ + unsigned int max_num_cqs; + struct gdma_queue **cq_table; + + /* Protect eq_test_event and test_event_eq_id */ + struct mutex eq_test_event_mutex; + struct completion eq_test_event; + u32 test_event_eq_id; + + bool is_pf; + phys_addr_t bar0_pa; + void __iomem *bar0_va; + void __iomem *shm_base; + void __iomem *db_page_base; + phys_addr_t phys_db_page_base; + u32 db_page_size; + + /* Shared memory chanenl (used to bootstrap HWC) */ + struct shm_channel shm_channel; + + /* Hardware communication channel (HWC) */ + struct gdma_dev hwc; + + /* Azure network adapter */ + struct gdma_dev mana; +}; + +#define MAX_NUM_GDMA_DEVICES 4 + +static inline bool mana_gd_is_mana(struct gdma_dev *gd) +{ + return gd->dev_id.type == GDMA_DEVICE_MANA; +} + +static inline bool mana_gd_is_hwc(struct gdma_dev *gd) +{ + return gd->dev_id.type == GDMA_DEVICE_HWC; +} + +u8 *mana_gd_get_wqe_ptr(const struct gdma_queue *wq, u32 wqe_offset); +u32 mana_gd_wq_avail_space(struct gdma_queue *wq); + +int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq); + +int mana_gd_create_hwc_queue(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +int mana_gd_create_mana_eq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue); + +int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe); + +void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit); + +struct gdma_wqe { + u32 reserved :24; + u32 last_vbytes :8; + + union { + u32 flags; + + struct { + u32 num_sge :8; + u32 inline_oob_size_div4:3; + u32 client_oob_in_sgl :1; + u32 reserved1 :4; + u32 client_data_unit :14; + u32 reserved2 :2; + }; + }; +}; /* HW DATA */ + +#define INLINE_OOB_SMALL_SIZE 8 +#define INLINE_OOB_LARGE_SIZE 24 + +#define MAX_TX_WQE_SIZE 512 +#define MAX_RX_WQE_SIZE 256 + +struct gdma_cqe { + u32 cqe_data[GDMA_COMP_DATA_SIZE / 4]; + + union { + u32 as_uint32; + + struct { + u32 wq_num : 24; + u32 is_sq : 1; + u32 reserved : 4; + u32 owner_bits : 3; + }; + } cqe_info; +}; /* HW DATA */ + +#define GDMA_CQE_OWNER_BITS 3 + +#define GDMA_CQE_OWNER_MASK ((1 << GDMA_CQE_OWNER_BITS) - 1) + +#define SET_ARM_BIT 1 + +#define GDMA_EQE_OWNER_BITS 3 + +union gdma_eqe_info { + u32 as_uint32; + + struct { + u32 type : 8; + u32 reserved1 : 8; + u32 client_id : 2; + u32 reserved2 : 11; + u32 owner_bits : 3; + }; +}; /* HW DATA */ + +#define GDMA_EQE_OWNER_MASK ((1 << GDMA_EQE_OWNER_BITS) - 1) +#define INITIALIZED_OWNER_BIT(log2_num_entries) (1UL << (log2_num_entries)) + +struct gdma_eqe { + u32 details[GDMA_EVENT_DATA_SIZE / 4]; + u32 eqe_info; +}; /* HW DATA */ + +#define GDMA_REG_DB_PAGE_OFFSET 8 +#define GDMA_REG_DB_PAGE_SIZE 0x10 +#define GDMA_REG_SHM_OFFSET 0x18 + +#define GDMA_PF_REG_DB_PAGE_SIZE 0xD0 +#define GDMA_PF_REG_DB_PAGE_OFF 0xC8 +#define GDMA_PF_REG_SHM_OFF 0x70 + +#define GDMA_SRIOV_REG_CFG_BASE_OFF 0x108 + +#define MANA_PF_DEVICE_ID 0x00B9 +#define MANA_VF_DEVICE_ID 0x00BA + +struct gdma_posted_wqe_info { + u32 wqe_size_in_bu; +}; + +/* GDMA_GENERATE_TEST_EQE */ +struct gdma_generate_test_event_req { + struct gdma_req_hdr hdr; + u32 queue_index; +}; /* HW DATA */ + +/* GDMA_VERIFY_VF_DRIVER_VERSION */ +enum { + GDMA_PROTOCOL_V1 = 1, + GDMA_PROTOCOL_FIRST = GDMA_PROTOCOL_V1, + GDMA_PROTOCOL_LAST = GDMA_PROTOCOL_V1, +}; + +#define GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT BIT(0) + +#define GDMA_DRV_CAP_FLAGS1 GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT + +#define GDMA_DRV_CAP_FLAGS2 0 + +#define GDMA_DRV_CAP_FLAGS3 0 + +#define GDMA_DRV_CAP_FLAGS4 0 + +struct gdma_verify_ver_req { + struct gdma_req_hdr hdr; + + /* Mandatory fields required for protocol establishment */ + u64 protocol_ver_min; + u64 protocol_ver_max; + + /* Gdma Driver Capability Flags */ + u64 gd_drv_cap_flags1; + u64 gd_drv_cap_flags2; + u64 gd_drv_cap_flags3; + u64 gd_drv_cap_flags4; + + /* Advisory fields */ + u64 drv_ver; + u32 os_type; /* Linux = 0x10; Windows = 0x20; Other = 0x30 */ + u32 reserved; + u32 os_ver_major; + u32 os_ver_minor; + u32 os_ver_build; + u32 os_ver_platform; + u64 reserved_2; + u8 os_ver_str1[128]; + u8 os_ver_str2[128]; + u8 os_ver_str3[128]; + u8 os_ver_str4[128]; +}; /* HW DATA */ + +struct gdma_verify_ver_resp { + struct gdma_resp_hdr hdr; + u64 gdma_protocol_ver; + u64 pf_cap_flags1; + u64 pf_cap_flags2; + u64 pf_cap_flags3; + u64 pf_cap_flags4; +}; /* HW DATA */ + +/* GDMA_QUERY_MAX_RESOURCES */ +struct gdma_query_max_resources_resp { + struct gdma_resp_hdr hdr; + u32 status; + u32 max_sq; + u32 max_rq; + u32 max_cq; + u32 max_eq; + u32 max_db; + u32 max_mst; + u32 max_cq_mod_ctx; + u32 max_mod_cq; + u32 max_msix; +}; /* HW DATA */ + +/* GDMA_LIST_DEVICES */ +struct gdma_list_devices_resp { + struct gdma_resp_hdr hdr; + u32 num_of_devs; + u32 reserved; + struct gdma_dev_id devs[64]; +}; /* HW DATA */ + +/* GDMA_REGISTER_DEVICE */ +struct gdma_register_device_resp { + struct gdma_resp_hdr hdr; + u32 pdid; + u32 gpa_mkey; + u32 db_id; +}; /* HW DATA */ + +/* GDMA_CREATE_QUEUE */ +struct gdma_create_queue_req { + struct gdma_req_hdr hdr; + u32 type; + u32 reserved1; + u32 pdid; + u32 doolbell_id; + u64 gdma_region; + u32 reserved2; + u32 queue_size; + u32 log2_throttle_limit; + u32 eq_pci_msix_index; + u32 cq_mod_ctx_id; + u32 cq_parent_eq_id; + u8 rq_drop_on_overrun; + u8 rq_err_on_wqe_overflow; + u8 rq_chain_rec_wqes; + u8 sq_hw_db; + u32 reserved3; +}; /* HW DATA */ + +struct gdma_create_queue_resp { + struct gdma_resp_hdr hdr; + u32 queue_index; +}; /* HW DATA */ + +/* GDMA_DISABLE_QUEUE */ +struct gdma_disable_queue_req { + struct gdma_req_hdr hdr; + u32 type; + u32 queue_index; + u32 alloc_res_id_on_creation; +}; /* HW DATA */ + +/* GDMA_CREATE_DMA_REGION */ +struct gdma_create_dma_region_req { + struct gdma_req_hdr hdr; + + /* The total size of the DMA region */ + u64 length; + + /* The offset in the first page */ + u32 offset_in_page; + + /* enum gdma_page_type */ + u32 gdma_page_type; + + /* The total number of pages */ + u32 page_count; + + /* If page_addr_list_len is smaller than page_count, + * the remaining page addresses will be added via the + * message GDMA_DMA_REGION_ADD_PAGES. + */ + u32 page_addr_list_len; + u64 page_addr_list[]; +}; /* HW DATA */ + +struct gdma_create_dma_region_resp { + struct gdma_resp_hdr hdr; + u64 gdma_region; +}; /* HW DATA */ + +/* GDMA_DMA_REGION_ADD_PAGES */ +struct gdma_dma_region_add_pages_req { + struct gdma_req_hdr hdr; + + u64 gdma_region; + + u32 page_addr_list_len; + u32 reserved3; + + u64 page_addr_list[]; +}; /* HW DATA */ + +/* GDMA_DESTROY_DMA_REGION */ +struct gdma_destroy_dma_region_req { + struct gdma_req_hdr hdr; + + u64 gdma_region; +}; /* HW DATA */ + +int mana_gd_verify_vf_version(struct pci_dev *pdev); + +int mana_gd_register_device(struct gdma_dev *gd); +int mana_gd_deregister_device(struct gdma_dev *gd); + +int mana_gd_post_work_request(struct gdma_queue *wq, + const struct gdma_wqe_request *wqe_req, + struct gdma_posted_wqe_info *wqe_info); + +int mana_gd_post_and_ring(struct gdma_queue *queue, + const struct gdma_wqe_request *wqe, + struct gdma_posted_wqe_info *wqe_info); + +int mana_gd_alloc_res_map(u32 res_avail, struct gdma_resource *r); +void mana_gd_free_res_map(struct gdma_resource *r); + +void mana_gd_wq_ring_doorbell(struct gdma_context *gc, + struct gdma_queue *queue); + +int mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length, + struct gdma_mem_info *gmi); + +void mana_gd_free_memory(struct gdma_mem_info *gmi); + +int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, + u32 resp_len, void *resp); +#endif /* _GDMA_H */ diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h new file mode 100644 index 000000000000..6a757a6e2732 --- /dev/null +++ b/include/net/mana/hw_channel.h @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2021, Microsoft Corporation. */ + +#ifndef _HW_CHANNEL_H +#define _HW_CHANNEL_H + +#define DEFAULT_LOG2_THROTTLING_FOR_ERROR_EQ 4 + +#define HW_CHANNEL_MAX_REQUEST_SIZE 0x1000 +#define HW_CHANNEL_MAX_RESPONSE_SIZE 0x1000 + +#define HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH 1 + +#define HWC_INIT_DATA_CQID 1 +#define HWC_INIT_DATA_RQID 2 +#define HWC_INIT_DATA_SQID 3 +#define HWC_INIT_DATA_QUEUE_DEPTH 4 +#define HWC_INIT_DATA_MAX_REQUEST 5 +#define HWC_INIT_DATA_MAX_RESPONSE 6 +#define HWC_INIT_DATA_MAX_NUM_CQS 7 +#define HWC_INIT_DATA_PDID 8 +#define HWC_INIT_DATA_GPA_MKEY 9 +#define HWC_INIT_DATA_PF_DEST_RQ_ID 10 +#define HWC_INIT_DATA_PF_DEST_CQ_ID 11 + +/* Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. + */ + +union hwc_init_eq_id_db { + u32 as_uint32; + + struct { + u32 eq_id : 16; + u32 doorbell : 16; + }; +}; /* HW DATA */ + +union hwc_init_type_data { + u32 as_uint32; + + struct { + u32 value : 24; + u32 type : 8; + }; +}; /* HW DATA */ + +struct hwc_rx_oob { + u32 type : 6; + u32 eom : 1; + u32 som : 1; + u32 vendor_err : 8; + u32 reserved1 : 16; + + u32 src_virt_wq : 24; + u32 src_vfid : 8; + + u32 reserved2; + + union { + u32 wqe_addr_low; + u32 wqe_offset; + }; + + u32 wqe_addr_high; + + u32 client_data_unit : 14; + u32 reserved3 : 18; + + u32 tx_oob_data_size; + + u32 chunk_offset : 21; + u32 reserved4 : 11; +}; /* HW DATA */ + +struct hwc_tx_oob { + u32 reserved1; + + u32 reserved2; + + u32 vrq_id : 24; + u32 dest_vfid : 8; + + u32 vrcq_id : 24; + u32 reserved3 : 8; + + u32 vscq_id : 24; + u32 loopback : 1; + u32 lso_override: 1; + u32 dest_pf : 1; + u32 reserved4 : 5; + + u32 vsq_id : 24; + u32 reserved5 : 8; +}; /* HW DATA */ + +struct hwc_work_request { + void *buf_va; + void *buf_sge_addr; + u32 buf_len; + u32 msg_size; + + struct gdma_wqe_request wqe_req; + struct hwc_tx_oob tx_oob; + + struct gdma_sge sge; +}; + +/* hwc_dma_buf represents the array of in-flight WQEs. + * mem_info as know as the GDMA mapped memory is partitioned and used by + * in-flight WQEs. + * The number of WQEs is determined by the number of in-flight messages. + */ +struct hwc_dma_buf { + struct gdma_mem_info mem_info; + + u32 gpa_mkey; + + u32 num_reqs; + struct hwc_work_request reqs[]; +}; + +typedef void hwc_rx_event_handler_t(void *ctx, u32 gdma_rxq_id, + const struct hwc_rx_oob *rx_oob); + +typedef void hwc_tx_event_handler_t(void *ctx, u32 gdma_txq_id, + const struct hwc_rx_oob *rx_oob); + +struct hwc_cq { + struct hw_channel_context *hwc; + + struct gdma_queue *gdma_cq; + struct gdma_queue *gdma_eq; + struct gdma_comp *comp_buf; + u16 queue_depth; + + hwc_rx_event_handler_t *rx_event_handler; + void *rx_event_ctx; + + hwc_tx_event_handler_t *tx_event_handler; + void *tx_event_ctx; +}; + +struct hwc_wq { + struct hw_channel_context *hwc; + + struct gdma_queue *gdma_wq; + struct hwc_dma_buf *msg_buf; + u16 queue_depth; + + struct hwc_cq *hwc_cq; +}; + +struct hwc_caller_ctx { + struct completion comp_event; + void *output_buf; + u32 output_buflen; + + u32 error; /* Linux error code */ + u32 status_code; +}; + +struct hw_channel_context { + struct gdma_dev *gdma_dev; + struct device *dev; + + u16 num_inflight_msg; + u32 max_req_msg_size; + + u16 hwc_init_q_depth_max; + u32 hwc_init_max_req_msg_size; + u32 hwc_init_max_resp_msg_size; + + struct completion hwc_init_eqe_comp; + + struct hwc_wq *rxq; + struct hwc_wq *txq; + struct hwc_cq *cq; + + struct semaphore sema; + struct gdma_resource inflight_msg_res; + + u32 pf_dest_vrq_id; + u32 pf_dest_vrcq_id; + + struct hwc_caller_ctx *caller_ctx; +}; + +int mana_hwc_create_channel(struct gdma_context *gc); +void mana_hwc_destroy_channel(struct gdma_context *gc); + +int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len, + const void *req, u32 resp_len, void *resp); + +#endif /* _HW_CHANNEL_H */ diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h new file mode 100644 index 000000000000..6e9e86fb4c02 --- /dev/null +++ b/include/net/mana/mana.h @@ -0,0 +1,650 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2021, Microsoft Corporation. */ + +#ifndef _MANA_H +#define _MANA_H + +#include "gdma.h" +#include "hw_channel.h" + +/* Microsoft Azure Network Adapter (MANA)'s definitions + * + * Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. + */ + +/* MANA protocol version */ +#define MANA_MAJOR_VERSION 0 +#define MANA_MINOR_VERSION 1 +#define MANA_MICRO_VERSION 1 + +typedef u64 mana_handle_t; +#define INVALID_MANA_HANDLE ((mana_handle_t)-1) + +enum TRI_STATE { + TRI_STATE_UNKNOWN = -1, + TRI_STATE_FALSE = 0, + TRI_STATE_TRUE = 1 +}; + +/* Number of entries for hardware indirection table must be in power of 2 */ +#define MANA_INDIRECT_TABLE_SIZE 64 +#define MANA_INDIRECT_TABLE_MASK (MANA_INDIRECT_TABLE_SIZE - 1) + +/* The Toeplitz hash key's length in bytes: should be multiple of 8 */ +#define MANA_HASH_KEY_SIZE 40 + +#define COMP_ENTRY_SIZE 64 + +#define ADAPTER_MTU_SIZE 1500 +#define MAX_FRAME_SIZE (ADAPTER_MTU_SIZE + 14) + +#define RX_BUFFERS_PER_QUEUE 512 + +#define MAX_SEND_BUFFERS_PER_QUEUE 256 + +#define EQ_SIZE (8 * PAGE_SIZE) +#define LOG2_EQ_THROTTLE 3 + +#define MAX_PORTS_IN_MANA_DEV 256 + +struct mana_stats_rx { + u64 packets; + u64 bytes; + u64 xdp_drop; + u64 xdp_tx; + u64 xdp_redirect; + struct u64_stats_sync syncp; +}; + +struct mana_stats_tx { + u64 packets; + u64 bytes; + u64 xdp_xmit; + struct u64_stats_sync syncp; +}; + +struct mana_txq { + struct gdma_queue *gdma_sq; + + union { + u32 gdma_txq_id; + struct { + u32 reserved1 : 10; + u32 vsq_frame : 14; + u32 reserved2 : 8; + }; + }; + + u16 vp_offset; + + struct net_device *ndev; + + /* The SKBs are sent to the HW and we are waiting for the CQEs. */ + struct sk_buff_head pending_skbs; + struct netdev_queue *net_txq; + + atomic_t pending_sends; + + struct mana_stats_tx stats; +}; + +/* skb data and frags dma mappings */ +struct mana_skb_head { + dma_addr_t dma_handle[MAX_SKB_FRAGS + 1]; + + u32 size[MAX_SKB_FRAGS + 1]; +}; + +#define MANA_HEADROOM sizeof(struct mana_skb_head) + +enum mana_tx_pkt_format { + MANA_SHORT_PKT_FMT = 0, + MANA_LONG_PKT_FMT = 1, +}; + +struct mana_tx_short_oob { + u32 pkt_fmt : 2; + u32 is_outer_ipv4 : 1; + u32 is_outer_ipv6 : 1; + u32 comp_iphdr_csum : 1; + u32 comp_tcp_csum : 1; + u32 comp_udp_csum : 1; + u32 supress_txcqe_gen : 1; + u32 vcq_num : 24; + + u32 trans_off : 10; /* Transport header offset */ + u32 vsq_frame : 14; + u32 short_vp_offset : 8; +}; /* HW DATA */ + +struct mana_tx_long_oob { + u32 is_encap : 1; + u32 inner_is_ipv6 : 1; + u32 inner_tcp_opt : 1; + u32 inject_vlan_pri_tag : 1; + u32 reserved1 : 12; + u32 pcp : 3; /* 802.1Q */ + u32 dei : 1; /* 802.1Q */ + u32 vlan_id : 12; /* 802.1Q */ + + u32 inner_frame_offset : 10; + u32 inner_ip_rel_offset : 6; + u32 long_vp_offset : 12; + u32 reserved2 : 4; + + u32 reserved3; + u32 reserved4; +}; /* HW DATA */ + +struct mana_tx_oob { + struct mana_tx_short_oob s_oob; + struct mana_tx_long_oob l_oob; +}; /* HW DATA */ + +enum mana_cq_type { + MANA_CQ_TYPE_RX, + MANA_CQ_TYPE_TX, +}; + +enum mana_cqe_type { + CQE_INVALID = 0, + CQE_RX_OKAY = 1, + CQE_RX_COALESCED_4 = 2, + CQE_RX_OBJECT_FENCE = 3, + CQE_RX_TRUNCATED = 4, + + CQE_TX_OKAY = 32, + CQE_TX_SA_DROP = 33, + CQE_TX_MTU_DROP = 34, + CQE_TX_INVALID_OOB = 35, + CQE_TX_INVALID_ETH_TYPE = 36, + CQE_TX_HDR_PROCESSING_ERROR = 37, + CQE_TX_VF_DISABLED = 38, + CQE_TX_VPORT_IDX_OUT_OF_RANGE = 39, + CQE_TX_VPORT_DISABLED = 40, + CQE_TX_VLAN_TAGGING_VIOLATION = 41, +}; + +#define MANA_CQE_COMPLETION 1 + +struct mana_cqe_header { + u32 cqe_type : 6; + u32 client_type : 2; + u32 vendor_err : 24; +}; /* HW DATA */ + +/* NDIS HASH Types */ +#define NDIS_HASH_IPV4 BIT(0) +#define NDIS_HASH_TCP_IPV4 BIT(1) +#define NDIS_HASH_UDP_IPV4 BIT(2) +#define NDIS_HASH_IPV6 BIT(3) +#define NDIS_HASH_TCP_IPV6 BIT(4) +#define NDIS_HASH_UDP_IPV6 BIT(5) +#define NDIS_HASH_IPV6_EX BIT(6) +#define NDIS_HASH_TCP_IPV6_EX BIT(7) +#define NDIS_HASH_UDP_IPV6_EX BIT(8) + +#define MANA_HASH_L3 (NDIS_HASH_IPV4 | NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX) +#define MANA_HASH_L4 \ + (NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4 | NDIS_HASH_TCP_IPV6 | \ + NDIS_HASH_UDP_IPV6 | NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX) + +struct mana_rxcomp_perpkt_info { + u32 pkt_len : 16; + u32 reserved1 : 16; + u32 reserved2; + u32 pkt_hash; +}; /* HW DATA */ + +#define MANA_RXCOMP_OOB_NUM_PPI 4 + +/* Receive completion OOB */ +struct mana_rxcomp_oob { + struct mana_cqe_header cqe_hdr; + + u32 rx_vlan_id : 12; + u32 rx_vlantag_present : 1; + u32 rx_outer_iphdr_csum_succeed : 1; + u32 rx_outer_iphdr_csum_fail : 1; + u32 reserved1 : 1; + u32 rx_hashtype : 9; + u32 rx_iphdr_csum_succeed : 1; + u32 rx_iphdr_csum_fail : 1; + u32 rx_tcp_csum_succeed : 1; + u32 rx_tcp_csum_fail : 1; + u32 rx_udp_csum_succeed : 1; + u32 rx_udp_csum_fail : 1; + u32 reserved2 : 1; + + struct mana_rxcomp_perpkt_info ppi[MANA_RXCOMP_OOB_NUM_PPI]; + + u32 rx_wqe_offset; +}; /* HW DATA */ + +struct mana_tx_comp_oob { + struct mana_cqe_header cqe_hdr; + + u32 tx_data_offset; + + u32 tx_sgl_offset : 5; + u32 tx_wqe_offset : 27; + + u32 reserved[12]; +}; /* HW DATA */ + +struct mana_rxq; + +#define CQE_POLLING_BUFFER 512 + +struct mana_cq { + struct gdma_queue *gdma_cq; + + /* Cache the CQ id (used to verify if each CQE comes to the right CQ. */ + u32 gdma_id; + + /* Type of the CQ: TX or RX */ + enum mana_cq_type type; + + /* Pointer to the mana_rxq that is pushing RX CQEs to the queue. + * Only and must be non-NULL if type is MANA_CQ_TYPE_RX. + */ + struct mana_rxq *rxq; + + /* Pointer to the mana_txq that is pushing TX CQEs to the queue. + * Only and must be non-NULL if type is MANA_CQ_TYPE_TX. + */ + struct mana_txq *txq; + + /* Buffer which the CQ handler can copy the CQE's into. */ + struct gdma_comp gdma_comp_buf[CQE_POLLING_BUFFER]; + + /* NAPI data */ + struct napi_struct napi; + int work_done; + int budget; +}; + +#define GDMA_MAX_RQE_SGES 15 + +struct mana_recv_buf_oob { + /* A valid GDMA work request representing the data buffer. */ + struct gdma_wqe_request wqe_req; + + void *buf_va; + dma_addr_t buf_dma_addr; + + /* SGL of the buffer going to be sent has part of the work request. */ + u32 num_sge; + struct gdma_sge sgl[GDMA_MAX_RQE_SGES]; + + /* Required to store the result of mana_gd_post_work_request. + * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the + * work queue when the WQE is consumed. + */ + struct gdma_posted_wqe_info wqe_inf; +}; + +struct mana_rxq { + struct gdma_queue *gdma_rq; + /* Cache the gdma receive queue id */ + u32 gdma_id; + + /* Index of RQ in the vPort, not gdma receive queue id */ + u32 rxq_idx; + + u32 datasize; + + mana_handle_t rxobj; + + struct mana_cq rx_cq; + + struct completion fence_event; + + struct net_device *ndev; + + /* Total number of receive buffers to be allocated */ + u32 num_rx_buf; + + u32 buf_index; + + struct mana_stats_rx stats; + + struct bpf_prog __rcu *bpf_prog; + struct xdp_rxq_info xdp_rxq; + struct page *xdp_save_page; + bool xdp_flush; + int xdp_rc; /* XDP redirect return code */ + + /* MUST BE THE LAST MEMBER: + * Each receive buffer has an associated mana_recv_buf_oob. + */ + struct mana_recv_buf_oob rx_oobs[]; +}; + +struct mana_tx_qp { + struct mana_txq txq; + + struct mana_cq tx_cq; + + mana_handle_t tx_object; +}; + +struct mana_ethtool_stats { + u64 stop_queue; + u64 wake_queue; +}; + +struct mana_context { + struct gdma_dev *gdma_dev; + + u16 num_ports; + + struct mana_eq *eqs; + + struct net_device *ports[MAX_PORTS_IN_MANA_DEV]; +}; + +struct mana_port_context { + struct mana_context *ac; + struct net_device *ndev; + + u8 mac_addr[ETH_ALEN]; + + enum TRI_STATE rss_state; + + mana_handle_t default_rxobj; + bool tx_shortform_allowed; + u16 tx_vp_offset; + + struct mana_tx_qp *tx_qp; + + /* Indirection Table for RX & TX. The values are queue indexes */ + u32 indir_table[MANA_INDIRECT_TABLE_SIZE]; + + /* Indirection table containing RxObject Handles */ + mana_handle_t rxobj_table[MANA_INDIRECT_TABLE_SIZE]; + + /* Hash key used by the NIC */ + u8 hashkey[MANA_HASH_KEY_SIZE]; + + /* This points to an array of num_queues of RQ pointers. */ + struct mana_rxq **rxqs; + + struct bpf_prog *bpf_prog; + + /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */ + unsigned int max_queues; + unsigned int num_queues; + + mana_handle_t port_handle; + mana_handle_t pf_filter_handle; + + /* Mutex for sharing access to vport_use_count */ + struct mutex vport_mutex; + int vport_use_count; + + u16 port_idx; + + bool port_is_up; + bool port_st_save; /* Saved port state */ + + struct mana_ethtool_stats eth_stats; +}; + +int mana_start_xmit(struct sk_buff *skb, struct net_device *ndev); +int mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx, + bool update_hash, bool update_tab); + +int mana_alloc_queues(struct net_device *ndev); +int mana_attach(struct net_device *ndev); +int mana_detach(struct net_device *ndev, bool from_close); + +int mana_probe(struct gdma_dev *gd, bool resuming); +void mana_remove(struct gdma_dev *gd, bool suspending); + +void mana_xdp_tx(struct sk_buff *skb, struct net_device *ndev); +int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames, + u32 flags); +u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq, + struct xdp_buff *xdp, void *buf_va, uint pkt_len); +struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); +void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); +int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); + +extern const struct ethtool_ops mana_ethtool_ops; + +struct mana_obj_spec { + u32 queue_index; + u64 gdma_region; + u32 queue_size; + u32 attached_eq; + u32 modr_ctx_id; +}; + +enum mana_command_code { + MANA_QUERY_DEV_CONFIG = 0x20001, + MANA_QUERY_GF_STAT = 0x20002, + MANA_CONFIG_VPORT_TX = 0x20003, + MANA_CREATE_WQ_OBJ = 0x20004, + MANA_DESTROY_WQ_OBJ = 0x20005, + MANA_FENCE_RQ = 0x20006, + MANA_CONFIG_VPORT_RX = 0x20007, + MANA_QUERY_VPORT_CONFIG = 0x20008, + + /* Privileged commands for the PF mode */ + MANA_REGISTER_FILTER = 0x28000, + MANA_DEREGISTER_FILTER = 0x28001, + MANA_REGISTER_HW_PORT = 0x28003, + MANA_DEREGISTER_HW_PORT = 0x28004, +}; + +/* Query Device Configuration */ +struct mana_query_device_cfg_req { + struct gdma_req_hdr hdr; + + /* MANA Nic Driver Capability flags */ + u64 mn_drv_cap_flags1; + u64 mn_drv_cap_flags2; + u64 mn_drv_cap_flags3; + u64 mn_drv_cap_flags4; + + u32 proto_major_ver; + u32 proto_minor_ver; + u32 proto_micro_ver; + + u32 reserved; +}; /* HW DATA */ + +struct mana_query_device_cfg_resp { + struct gdma_resp_hdr hdr; + + u64 pf_cap_flags1; + u64 pf_cap_flags2; + u64 pf_cap_flags3; + u64 pf_cap_flags4; + + u16 max_num_vports; + u16 reserved; + u32 max_num_eqs; +}; /* HW DATA */ + +/* Query vPort Configuration */ +struct mana_query_vport_cfg_req { + struct gdma_req_hdr hdr; + u32 vport_index; +}; /* HW DATA */ + +struct mana_query_vport_cfg_resp { + struct gdma_resp_hdr hdr; + u32 max_num_sq; + u32 max_num_rq; + u32 num_indirection_ent; + u32 reserved1; + u8 mac_addr[6]; + u8 reserved2[2]; + mana_handle_t vport; +}; /* HW DATA */ + +/* Configure vPort */ +struct mana_config_vport_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + u32 pdid; + u32 doorbell_pageid; +}; /* HW DATA */ + +struct mana_config_vport_resp { + struct gdma_resp_hdr hdr; + u16 tx_vport_offset; + u8 short_form_allowed; + u8 reserved; +}; /* HW DATA */ + +/* Create WQ Object */ +struct mana_create_wqobj_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + u32 wq_type; + u32 reserved; + u64 wq_gdma_region; + u64 cq_gdma_region; + u32 wq_size; + u32 cq_size; + u32 cq_moderation_ctx_id; + u32 cq_parent_qid; +}; /* HW DATA */ + +struct mana_create_wqobj_resp { + struct gdma_resp_hdr hdr; + u32 wq_id; + u32 cq_id; + mana_handle_t wq_obj; +}; /* HW DATA */ + +/* Destroy WQ Object */ +struct mana_destroy_wqobj_req { + struct gdma_req_hdr hdr; + u32 wq_type; + u32 reserved; + mana_handle_t wq_obj_handle; +}; /* HW DATA */ + +struct mana_destroy_wqobj_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Fence RQ */ +struct mana_fence_rq_req { + struct gdma_req_hdr hdr; + mana_handle_t wq_obj_handle; +}; /* HW DATA */ + +struct mana_fence_rq_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Configure vPort Rx Steering */ +struct mana_cfg_rx_steer_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + u16 num_indir_entries; + u16 indir_tab_offset; + u32 rx_enable; + u32 rss_enable; + u8 update_default_rxobj; + u8 update_hashkey; + u8 update_indir_tab; + u8 reserved; + mana_handle_t default_rxobj; + u8 hashkey[MANA_HASH_KEY_SIZE]; +}; /* HW DATA */ + +struct mana_cfg_rx_steer_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Register HW vPort */ +struct mana_register_hw_vport_req { + struct gdma_req_hdr hdr; + u16 attached_gfid; + u8 is_pf_default_vport; + u8 reserved1; + u8 allow_all_ether_types; + u8 reserved2; + u8 reserved3; + u8 reserved4; +}; /* HW DATA */ + +struct mana_register_hw_vport_resp { + struct gdma_resp_hdr hdr; + mana_handle_t hw_vport_handle; +}; /* HW DATA */ + +/* Deregister HW vPort */ +struct mana_deregister_hw_vport_req { + struct gdma_req_hdr hdr; + mana_handle_t hw_vport_handle; +}; /* HW DATA */ + +struct mana_deregister_hw_vport_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Register filter */ +struct mana_register_filter_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + u8 mac_addr[6]; + u8 reserved1; + u8 reserved2; + u8 reserved3; + u8 reserved4; + u16 reserved5; + u32 reserved6; + u32 reserved7; + u32 reserved8; +}; /* HW DATA */ + +struct mana_register_filter_resp { + struct gdma_resp_hdr hdr; + mana_handle_t filter_handle; +}; /* HW DATA */ + +/* Deregister filter */ +struct mana_deregister_filter_req { + struct gdma_req_hdr hdr; + mana_handle_t filter_handle; +}; /* HW DATA */ + +struct mana_deregister_filter_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +#define MANA_MAX_NUM_QUEUES 64 + +#define MANA_SHORT_VPORT_OFFSET_MAX ((1U << 8) - 1) + +struct mana_tx_package { + struct gdma_wqe_request wqe_req; + struct gdma_sge sgl_array[5]; + struct gdma_sge *sgl_ptr; + + struct mana_tx_oob tx_oob; + + struct gdma_posted_wqe_info wqe_info; +}; + +int mana_create_wq_obj(struct mana_port_context *apc, + mana_handle_t vport, + u32 wq_type, struct mana_obj_spec *wq_spec, + struct mana_obj_spec *cq_spec, + mana_handle_t *wq_obj); + +void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, + mana_handle_t wq_obj); + +int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id, + u32 doorbell_pg_id); +void mana_uncfg_vport(struct mana_port_context *apc); +#endif /* _MANA_H */ diff --git a/include/net/mana/mana_auxiliary.h b/include/net/mana/mana_auxiliary.h new file mode 100644 index 000000000000..373d59756846 --- /dev/null +++ b/include/net/mana/mana_auxiliary.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2022, Microsoft Corporation. */ + +#include "mana.h" +#include + +struct mana_adev { + struct auxiliary_device adev; + struct gdma_dev *mdev; +}; diff --git a/include/net/mana/shm_channel.h b/include/net/mana/shm_channel.h new file mode 100644 index 000000000000..5199b41497ff --- /dev/null +++ b/include/net/mana/shm_channel.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2021, Microsoft Corporation. */ + +#ifndef _SHM_CHANNEL_H +#define _SHM_CHANNEL_H + +struct shm_channel { + struct device *dev; + void __iomem *base; +}; + +void mana_smc_init(struct shm_channel *sc, struct device *dev, + void __iomem *base); + +int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, + u64 cq_addr, u64 rq_addr, u64 sq_addr, + u32 eq_msix_index); + +int mana_smc_teardown_hwc(struct shm_channel *sc, bool reset_vf); + +#endif /* _SHM_CHANNEL_H */ -- cgit v1.2.3 From aa56549792fb348892fbbae67f6f0c71bb750b65 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 3 Nov 2022 12:16:26 -0700 Subject: net: mana: Define max values for SGL entries The number of maximum SGl entries should be computed from the maximum WQE size for the intended queue type and the corresponding OOB data size. This guarantees the hardware queue can successfully queue requests up to the queue depth exposed to the upper layer. Reviewed-by: Dexuan Cui Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-9-git-send-email-longli@linuxonhyperv.com Acked-by: Haiyang Zhang Signed-off-by: Leon Romanovsky --- include/net/mana/gdma.h | 7 +++++++ include/net/mana/mana.h | 4 +--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 72eaec2470c0..0b0c0cd0b6bd 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -427,6 +427,13 @@ struct gdma_wqe { #define MAX_TX_WQE_SIZE 512 #define MAX_RX_WQE_SIZE 256 +#define MAX_TX_WQE_SGL_ENTRIES ((GDMA_MAX_SQE_SIZE - \ + sizeof(struct gdma_sge) - INLINE_OOB_SMALL_SIZE) / \ + sizeof(struct gdma_sge)) + +#define MAX_RX_WQE_SGL_ENTRIES ((GDMA_MAX_RQE_SIZE - \ + sizeof(struct gdma_sge)) / sizeof(struct gdma_sge)) + struct gdma_cqe { u32 cqe_data[GDMA_COMP_DATA_SIZE / 4]; diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 6e9e86fb4c02..713a8f8cca9a 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -265,8 +265,6 @@ struct mana_cq { int budget; }; -#define GDMA_MAX_RQE_SGES 15 - struct mana_recv_buf_oob { /* A valid GDMA work request representing the data buffer. */ struct gdma_wqe_request wqe_req; @@ -276,7 +274,7 @@ struct mana_recv_buf_oob { /* SGL of the buffer going to be sent has part of the work request. */ u32 num_sge; - struct gdma_sge sgl[GDMA_MAX_RQE_SGES]; + struct gdma_sge sgl[MAX_RX_WQE_SGL_ENTRIES]; /* Required to store the result of mana_gd_post_work_request. * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the -- cgit v1.2.3 From de372f2a9ca7ada2698ecac7df8f02407cd98fa0 Mon Sep 17 00:00:00 2001 From: Ajay Sharma Date: Thu, 3 Nov 2022 12:16:27 -0700 Subject: net: mana: Define and process GDMA response code GDMA_STATUS_MORE_ENTRIES When doing memory registration, the PF may respond with GDMA_STATUS_MORE_ENTRIES to indicate a follow request is needed. This is not an error and should be processed as expected. Signed-off-by: Ajay Sharma Reviewed-by: Dexuan Cui Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-10-git-send-email-longli@linuxonhyperv.com Acked-by: Haiyang Zhang Signed-off-by: Leon Romanovsky --- include/net/mana/gdma.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 0b0c0cd0b6bd..a9b7930dfbf8 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -9,6 +9,8 @@ #include "shm_channel.h" +#define GDMA_STATUS_MORE_ENTRIES 0x00000105 + /* Structures labeled with "HW DATA" are exchanged with the hardware. All of * them are naturally aligned and hence don't need __packed. */ -- cgit v1.2.3 From f72ececfc197e9b0bbb5595294908a950cf444fa Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 3 Nov 2022 12:16:28 -0700 Subject: net: mana: Define data structures for allocating doorbell page from GDMA The RDMA device needs to allocate doorbell pages for each user context. Define the GDMA data structures for use by the RDMA driver. Reviewed-by: Dexuan Cui Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-11-git-send-email-longli@linuxonhyperv.com Acked-by: Haiyang Zhang Signed-off-by: Leon Romanovsky --- include/net/mana/gdma.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index a9b7930dfbf8..055408a5baf3 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -24,11 +24,15 @@ enum gdma_request_type { GDMA_GENERATE_TEST_EQE = 10, GDMA_CREATE_QUEUE = 12, GDMA_DISABLE_QUEUE = 13, + GDMA_ALLOCATE_RESOURCE_RANGE = 22, + GDMA_DESTROY_RESOURCE_RANGE = 24, GDMA_CREATE_DMA_REGION = 25, GDMA_DMA_REGION_ADD_PAGES = 26, GDMA_DESTROY_DMA_REGION = 27, }; +#define GDMA_RESOURCE_DOORBELL_PAGE 27 + enum gdma_queue_type { GDMA_INVALID_QUEUE, GDMA_SQ, @@ -587,6 +591,26 @@ struct gdma_register_device_resp { u32 db_id; }; /* HW DATA */ +struct gdma_allocate_resource_range_req { + struct gdma_req_hdr hdr; + u32 resource_type; + u32 num_resources; + u32 alignment; + u32 allocated_resources; +}; + +struct gdma_allocate_resource_range_resp { + struct gdma_resp_hdr hdr; + u32 allocated_resources; +}; + +struct gdma_destroy_resource_range_req { + struct gdma_req_hdr hdr; + u32 resource_type; + u32 num_resources; + u32 allocated_resources; +}; + /* GDMA_CREATE_QUEUE */ struct gdma_create_queue_req { struct gdma_req_hdr hdr; -- cgit v1.2.3 From 28c66cfa45388af1126985d1114e0ed762eb2abd Mon Sep 17 00:00:00 2001 From: Ajay Sharma Date: Thu, 3 Nov 2022 12:16:29 -0700 Subject: net: mana: Define data structures for protection domain and memory registration The MANA hardware support protection domain and memory registration for use in RDMA environment. Add those definitions and expose them for use by the RDMA driver. Signed-off-by: Ajay Sharma Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-12-git-send-email-longli@linuxonhyperv.com Reviewed-by: Dexuan Cui Acked-by: Haiyang Zhang Signed-off-by: Leon Romanovsky --- include/net/mana/gdma.h | 121 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 116 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 055408a5baf3..221adc96340c 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -29,6 +29,10 @@ enum gdma_request_type { GDMA_CREATE_DMA_REGION = 25, GDMA_DMA_REGION_ADD_PAGES = 26, GDMA_DESTROY_DMA_REGION = 27, + GDMA_CREATE_PD = 29, + GDMA_DESTROY_PD = 30, + GDMA_CREATE_MR = 31, + GDMA_DESTROY_MR = 32, }; #define GDMA_RESOURCE_DOORBELL_PAGE 27 @@ -61,6 +65,8 @@ enum { GDMA_DEVICE_MANA = 2, }; +typedef u64 gdma_obj_handle_t; + struct gdma_resource { /* Protect the bitmap */ spinlock_t lock; @@ -194,7 +200,7 @@ struct gdma_mem_info { u64 length; /* Allocated by the PF driver */ - u64 gdma_region; + gdma_obj_handle_t dma_region_handle; }; #define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8 @@ -618,7 +624,7 @@ struct gdma_create_queue_req { u32 reserved1; u32 pdid; u32 doolbell_id; - u64 gdma_region; + gdma_obj_handle_t gdma_region; u32 reserved2; u32 queue_size; u32 log2_throttle_limit; @@ -645,6 +651,28 @@ struct gdma_disable_queue_req { u32 alloc_res_id_on_creation; }; /* HW DATA */ +enum atb_page_size { + ATB_PAGE_SIZE_4K, + ATB_PAGE_SIZE_8K, + ATB_PAGE_SIZE_16K, + ATB_PAGE_SIZE_32K, + ATB_PAGE_SIZE_64K, + ATB_PAGE_SIZE_128K, + ATB_PAGE_SIZE_256K, + ATB_PAGE_SIZE_512K, + ATB_PAGE_SIZE_1M, + ATB_PAGE_SIZE_2M, + ATB_PAGE_SIZE_MAX, +}; + +enum gdma_mr_access_flags { + GDMA_ACCESS_FLAG_LOCAL_READ = BIT_ULL(0), + GDMA_ACCESS_FLAG_LOCAL_WRITE = BIT_ULL(1), + GDMA_ACCESS_FLAG_REMOTE_READ = BIT_ULL(2), + GDMA_ACCESS_FLAG_REMOTE_WRITE = BIT_ULL(3), + GDMA_ACCESS_FLAG_REMOTE_ATOMIC = BIT_ULL(4), +}; + /* GDMA_CREATE_DMA_REGION */ struct gdma_create_dma_region_req { struct gdma_req_hdr hdr; @@ -671,14 +699,14 @@ struct gdma_create_dma_region_req { struct gdma_create_dma_region_resp { struct gdma_resp_hdr hdr; - u64 gdma_region; + gdma_obj_handle_t dma_region_handle; }; /* HW DATA */ /* GDMA_DMA_REGION_ADD_PAGES */ struct gdma_dma_region_add_pages_req { struct gdma_req_hdr hdr; - u64 gdma_region; + gdma_obj_handle_t dma_region_handle; u32 page_addr_list_len; u32 reserved3; @@ -690,9 +718,88 @@ struct gdma_dma_region_add_pages_req { struct gdma_destroy_dma_region_req { struct gdma_req_hdr hdr; - u64 gdma_region; + gdma_obj_handle_t dma_region_handle; }; /* HW DATA */ +enum gdma_pd_flags { + GDMA_PD_FLAG_INVALID = 0, +}; + +struct gdma_create_pd_req { + struct gdma_req_hdr hdr; + enum gdma_pd_flags flags; + u32 reserved; +};/* HW DATA */ + +struct gdma_create_pd_resp { + struct gdma_resp_hdr hdr; + gdma_obj_handle_t pd_handle; + u32 pd_id; + u32 reserved; +};/* HW DATA */ + +struct gdma_destroy_pd_req { + struct gdma_req_hdr hdr; + gdma_obj_handle_t pd_handle; +};/* HW DATA */ + +struct gdma_destory_pd_resp { + struct gdma_resp_hdr hdr; +};/* HW DATA */ + +enum gdma_mr_type { + /* Guest Virtual Address - MRs of this type allow access + * to memory mapped by PTEs associated with this MR using a virtual + * address that is set up in the MST + */ + GDMA_MR_TYPE_GVA = 2, +}; + +struct gdma_create_mr_params { + gdma_obj_handle_t pd_handle; + enum gdma_mr_type mr_type; + union { + struct { + gdma_obj_handle_t dma_region_handle; + u64 virtual_address; + enum gdma_mr_access_flags access_flags; + } gva; + }; +}; + +struct gdma_create_mr_request { + struct gdma_req_hdr hdr; + gdma_obj_handle_t pd_handle; + enum gdma_mr_type mr_type; + u32 reserved_1; + + union { + struct { + gdma_obj_handle_t dma_region_handle; + u64 virtual_address; + enum gdma_mr_access_flags access_flags; + } gva; + + }; + u32 reserved_2; +};/* HW DATA */ + +struct gdma_create_mr_response { + struct gdma_resp_hdr hdr; + gdma_obj_handle_t mr_handle; + u32 lkey; + u32 rkey; +};/* HW DATA */ + +struct gdma_destroy_mr_request { + struct gdma_req_hdr hdr; + gdma_obj_handle_t mr_handle; +};/* HW DATA */ + +struct gdma_destroy_mr_response { + struct gdma_resp_hdr hdr; +};/* HW DATA */ + int mana_gd_verify_vf_version(struct pci_dev *pdev); int mana_gd_register_device(struct gdma_dev *gd); @@ -719,4 +826,8 @@ void mana_gd_free_memory(struct gdma_mem_info *gmi); int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, u32 resp_len, void *resp); + +int mana_gd_destroy_dma_region(struct gdma_context *gc, + gdma_obj_handle_t dma_region_handle); + #endif /* _GDMA_H */ -- cgit v1.2.3 From cc514101a97e3fb48f8617c8f291db798d10d831 Mon Sep 17 00:00:00 2001 From: Sujuan Chen Date: Sat, 5 Nov 2022 23:36:18 +0100 Subject: net: ethernet: mtk_wed: introduce wed mcu support Introduce WED mcu support used to configure WED WO chip. This is a preliminary patch in order to add RX Wireless Ethernet Dispatch available on MT7986 SoC. Tested-by: Daniel Golle Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Sujuan Chen Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 4450c8b7a1cb..2cc2f1e43ba9 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -11,6 +11,35 @@ struct mtk_wed_hw; struct mtk_wdma_desc; +enum mtk_wed_wo_cmd { + MTK_WED_WO_CMD_WED_CFG, + MTK_WED_WO_CMD_WED_RX_STAT, + MTK_WED_WO_CMD_RRO_SER, + MTK_WED_WO_CMD_DBG_INFO, + MTK_WED_WO_CMD_DEV_INFO, + MTK_WED_WO_CMD_BSS_INFO, + MTK_WED_WO_CMD_STA_REC, + MTK_WED_WO_CMD_DEV_INFO_DUMP, + MTK_WED_WO_CMD_BSS_INFO_DUMP, + MTK_WED_WO_CMD_STA_REC_DUMP, + MTK_WED_WO_CMD_BA_INFO_DUMP, + MTK_WED_WO_CMD_FBCMD_Q_DUMP, + MTK_WED_WO_CMD_FW_LOG_CTRL, + MTK_WED_WO_CMD_LOG_FLUSH, + MTK_WED_WO_CMD_CHANGE_STATE, + MTK_WED_WO_CMD_CPU_STATS_ENABLE, + MTK_WED_WO_CMD_CPU_STATS_DUMP, + MTK_WED_WO_CMD_EXCEPTION_INIT, + MTK_WED_WO_CMD_PROF_CTRL, + MTK_WED_WO_CMD_STA_BA_DUMP, + MTK_WED_WO_CMD_BA_CTRL_DUMP, + MTK_WED_WO_CMD_RXCNT_CTRL, + MTK_WED_WO_CMD_RXCNT_INFO, + MTK_WED_WO_CMD_SET_CAP, + MTK_WED_WO_CMD_CCIF_RING_DUMP, + MTK_WED_WO_CMD_WED_END +}; + enum mtk_wed_bus_tye { MTK_WED_BUS_PCIE, MTK_WED_BUS_AXI, -- cgit v1.2.3 From 084d60ce0c6cef96024d53a58b92d7ff2d8b9318 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 5 Nov 2022 23:36:20 +0100 Subject: net: ethernet: mtk_wed: rename tx_wdma array in rx_wdma Rename tx_wdma queue array in rx_wdma since this is rx side of wdma soc. Moreover rename mtk_wed_wdma_ring_setup routine in mtk_wed_wdma_rx_ring_setup() Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 2cc2f1e43ba9..956978320f8b 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -7,6 +7,7 @@ #include #define MTK_WED_TX_QUEUES 2 +#define MTK_WED_RX_QUEUES 2 struct mtk_wed_hw; struct mtk_wdma_desc; @@ -66,7 +67,7 @@ struct mtk_wed_device { struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; struct mtk_wed_ring txfree_ring; - struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES]; + struct mtk_wed_ring rx_wdma[MTK_WED_RX_QUEUES]; struct { int size; -- cgit v1.2.3 From 4c5de09eb0d05fc9e73ff8f0e052622f1f3df6d8 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 5 Nov 2022 23:36:21 +0100 Subject: net: ethernet: mtk_wed: add configure wed wo support Enable RX Wireless Ethernet Dispatch available on MT7986 Soc. Tested-by: Daniel Golle Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 76 +++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 956978320f8b..8294978f4bca 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -5,10 +5,13 @@ #include #include #include +#include #define MTK_WED_TX_QUEUES 2 #define MTK_WED_RX_QUEUES 2 +#define WED_WO_STA_REC 0x6 + struct mtk_wed_hw; struct mtk_wdma_desc; @@ -41,21 +44,37 @@ enum mtk_wed_wo_cmd { MTK_WED_WO_CMD_WED_END }; +struct mtk_rxbm_desc { + __le32 buf0; + __le32 token; +} __packed __aligned(4); + enum mtk_wed_bus_tye { MTK_WED_BUS_PCIE, MTK_WED_BUS_AXI, }; +#define MTK_WED_RING_CONFIGURED BIT(0) struct mtk_wed_ring { struct mtk_wdma_desc *desc; dma_addr_t desc_phys; u32 desc_size; int size; + u32 flags; u32 reg_base; void __iomem *wpdma; }; +struct mtk_wed_wo_rx_stats { + __le16 wlan_idx; + __le16 tid; + __le32 rx_pkt_cnt; + __le32 rx_byte_cnt; + __le32 rx_err_cnt; + __le32 rx_drop_cnt; +}; + struct mtk_wed_device { #ifdef CONFIG_NET_MEDIATEK_SOC_WED const struct mtk_wed_ops *ops; @@ -64,9 +83,12 @@ struct mtk_wed_device { bool init_done, running; int wdma_idx; int irq; + u8 version; struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; + struct mtk_wed_ring rx_ring[MTK_WED_RX_QUEUES]; struct mtk_wed_ring txfree_ring; + struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES]; struct mtk_wed_ring rx_wdma[MTK_WED_RX_QUEUES]; struct { @@ -74,7 +96,20 @@ struct mtk_wed_device { void **pages; struct mtk_wdma_desc *desc; dma_addr_t desc_phys; - } buf_ring; + } tx_buf_ring; + + struct { + int size; + struct page_frag_cache rx_page; + struct mtk_rxbm_desc *desc; + dma_addr_t desc_phys; + } rx_buf_ring; + + struct { + struct mtk_wed_ring ring; + dma_addr_t miod_phys; + dma_addr_t fdbk_phys; + } rro; /* filled by driver: */ struct { @@ -83,22 +118,36 @@ struct mtk_wed_device { struct pci_dev *pci_dev; }; enum mtk_wed_bus_tye bus_type; + void __iomem *base; + u32 phy_base; u32 wpdma_phys; u32 wpdma_int; u32 wpdma_mask; u32 wpdma_tx; u32 wpdma_txfree; + u32 wpdma_rx_glo; + u32 wpdma_rx; + + bool wcid_512; u16 token_start; unsigned int nbuf; + unsigned int rx_nbuf; + unsigned int rx_npkt; + unsigned int rx_size; u8 tx_tbit[MTK_WED_TX_QUEUES]; + u8 rx_tbit[MTK_WED_RX_QUEUES]; u8 txfree_tbit; u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id); int (*offload_enable)(struct mtk_wed_device *wed); void (*offload_disable)(struct mtk_wed_device *wed); + u32 (*init_rx_buf)(struct mtk_wed_device *wed, int size); + void (*release_rx_buf)(struct mtk_wed_device *wed); + void (*update_wo_rx_stats)(struct mtk_wed_device *wed, + struct mtk_wed_wo_rx_stats *stats); } wlan; #endif }; @@ -107,9 +156,15 @@ struct mtk_wed_ops { int (*attach)(struct mtk_wed_device *dev); int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs); + int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, + void __iomem *regs); int (*txfree_ring_setup)(struct mtk_wed_device *dev, void __iomem *regs); + int (*msg_update)(struct mtk_wed_device *dev, int cmd_id, + void *data, int len); void (*detach)(struct mtk_wed_device *dev); + void (*ppe_check)(struct mtk_wed_device *dev, struct sk_buff *skb, + u32 reason, u32 hash); void (*stop)(struct mtk_wed_device *dev); void (*start)(struct mtk_wed_device *dev, u32 irq_mask); @@ -144,6 +199,16 @@ mtk_wed_device_attach(struct mtk_wed_device *dev) return ret; } +static inline bool +mtk_wed_get_rx_capa(struct mtk_wed_device *dev) +{ +#ifdef CONFIG_NET_MEDIATEK_SOC_WED + return dev->version != 1; +#else + return false; +#endif +} + #ifdef CONFIG_NET_MEDIATEK_SOC_WED #define mtk_wed_device_active(_dev) !!(_dev)->ops #define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev) @@ -160,6 +225,12 @@ mtk_wed_device_attach(struct mtk_wed_device *dev) (_dev)->ops->irq_get(_dev, _mask) #define mtk_wed_device_irq_set_mask(_dev, _mask) \ (_dev)->ops->irq_set_mask(_dev, _mask) +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) \ + (_dev)->ops->rx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) \ + (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash) +#define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \ + (_dev)->ops->msg_update(_dev, _id, _msg, _len) #else static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) { @@ -173,6 +244,9 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) #define mtk_wed_device_irq_get(_dev, _mask) 0 #define mtk_wed_device_irq_set_mask(_dev, _mask) do {} while (0) +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) do {} while (0) +#define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV #endif #endif -- cgit v1.2.3 From 75ab70ec5cefade915a999886e4863fe6a38f8b5 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 9 Nov 2022 15:09:45 -0800 Subject: ptp: remove the .adjfreq interface function Now that all drivers have been converted to .adjfine, we can remove the .adjfreq from the interface structure. Signed-off-by: Jacob Keller Cc: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index f4781c5766d6..fdffa6a98d79 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -77,12 +77,6 @@ struct ptp_system_timestamp { * nominal frequency in parts per million, but with a * 16 bit binary fractional field. * - * @adjfreq: Adjusts the frequency of the hardware clock. - * This method is deprecated. New drivers should implement - * the @adjfine method instead. - * parameter delta: Desired frequency offset from nominal frequency - * in parts per billion - * * @adjphase: Adjusts the phase offset of the hardware clock. * parameter delta: Desired change in nanoseconds. * @@ -174,7 +168,6 @@ struct ptp_clock_info { int pps; struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); - int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta); int (*adjphase)(struct ptp_clock_info *ptp, s32 phase); int (*adjtime)(struct ptp_clock_info *ptp, s64 delta); int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts); -- cgit v1.2.3 From 9bb053490f1a5a0914eb9f7b4116a0e4a95d4f8e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 7 Nov 2022 15:04:18 -0800 Subject: bpf: Add hwtstamp field for the sockops prog The bpf-tc prog has already been able to access the skb_hwtstamps(skb)->hwtstamp. This patch extends the same hwtstamp access to the sockops prog. In sockops, the skb is also available to the bpf prog during the BPF_SOCK_OPS_PARSE_HDR_OPT_CB event. There is a use case that the hwtstamp will be useful to the sockops prog to better measure the one-way-delay when the sender has put the tx timestamp in the tcp header option. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221107230420.4192307-2-martin.lau@linux.dev --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 94659f6b3395..fb4c911d2a03 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6445,6 +6445,7 @@ struct bpf_sock_ops { * the outgoing header has not * been written yet. */ + __u64 skb_hwtstamp; }; /* Definitions for bpf_sock_ops_cb_flags */ -- cgit v1.2.3 From 354259fa73e2aac92ae5e19522adb69a92c15b49 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Nov 2022 09:57:58 +0000 Subject: net: remove skb->vlan_present skb->vlan_present seems redundant. We can instead derive it from this boolean expression: vlan_present = skb->vlan_proto != 0 || skb->vlan_tci != 0 Add a new union, to access both fields in a single load/store when possible. union { u32 vlan_all; struct { __be16 vlan_proto; __u16 vlan_tci; }; }; This allows following patch to remove a conditional test in GRO stack. Note: We move remcsum_offload to keep TC_AT_INGRESS_MASK and SKB_MONO_DELIVERY_TIME_MASK unchanged. Signed-off-by: Eric Dumazet Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 9 +++------ include/linux/skbuff.h | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index e00c4ee81ff7..6864b89ef868 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -76,7 +76,7 @@ static inline bool is_vlan_dev(const struct net_device *dev) return dev->priv_flags & IFF_802_1Q_VLAN; } -#define skb_vlan_tag_present(__skb) ((__skb)->vlan_present) +#define skb_vlan_tag_present(__skb) (!!(__skb)->vlan_all) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_cfi(__skb) (!!((__skb)->vlan_tci & VLAN_CFI_MASK)) @@ -471,7 +471,7 @@ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, */ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { - skb->vlan_present = 0; + skb->vlan_all = 0; } /** @@ -483,9 +483,7 @@ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) */ static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) { - dst->vlan_present = src->vlan_present; - dst->vlan_proto = src->vlan_proto; - dst->vlan_tci = src->vlan_tci; + dst->vlan_all = src->vlan_all; } /* @@ -519,7 +517,6 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, { skb->vlan_proto = vlan_proto; skb->vlan_tci = vlan_tci; - skb->vlan_present = 1; } /** diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 59c9fd55699d..4e464a27adaf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -818,7 +818,7 @@ typedef unsigned char *sk_buff_data_t; * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available * at the tail of an sk_buff - * @vlan_present: VLAN tag is present + * @vlan_all: vlan fields (proto & tci) * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) @@ -951,7 +951,7 @@ struct sk_buff { /* private: */ __u8 __pkt_vlan_present_offset[0]; /* public: */ - __u8 vlan_present:1; /* See PKT_VLAN_PRESENT_BIT */ + __u8 remcsum_offload:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 dst_pending_confirm:1; @@ -966,7 +966,6 @@ struct sk_buff { __u8 ipvs_property:1; __u8 inner_protocol_type:1; - __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; @@ -999,8 +998,13 @@ struct sk_buff { __u32 priority; int skb_iif; __u32 hash; - __be16 vlan_proto; - __u16 vlan_tci; + union { + u32 vlan_all; + struct { + __be16 vlan_proto; + __u16 vlan_tci; + }; + }; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) union { unsigned int napi_id; @@ -1059,15 +1063,13 @@ struct sk_buff { #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) -/* if you move pkt_vlan_present, tc_at_ingress, or mono_delivery_time +/* if you move tc_at_ingress or mono_delivery_time * around, you also must adapt these constants. */ #ifdef __BIG_ENDIAN_BITFIELD -#define PKT_VLAN_PRESENT_BIT 7 #define TC_AT_INGRESS_MASK (1 << 0) #define SKB_MONO_DELIVERY_TIME_MASK (1 << 2) #else -#define PKT_VLAN_PRESENT_BIT 0 #define TC_AT_INGRESS_MASK (1 << 7) #define SKB_MONO_DELIVERY_TIME_MASK (1 << 5) #endif -- cgit v1.2.3 From 2c925db0a7d69b404d6bfe4c037935c2d367913d Mon Sep 17 00:00:00 2001 From: Ofer Levi Date: Tue, 9 Feb 2021 17:48:11 +0200 Subject: net/mlx5e: Support enhanced CQE compression CQE compression feature improves performance by reducing PCI bandwidth bottleneck on CQEs write. Enhanced CQE compression introduced in ConnectX-6 and it aims to reduce CPU utilization of SW side packets decompression by eliminating the need to rewrite ownership bit, which is likely to cost a cache-miss, is replaced by validity byte handled solely by HW. Another advantage of the enhanced feature is that session packets are available to SW as soon as a single CQE slot is filled, instead of waiting for session to close, this improves packet latency from NIC to host. Performance: Following are tested scenarios and reults comparing basic and enahnced CQE compression. setup: IXIA 100GbE connected directly to port 0 and port 1 of ConnectX-6 Dx 100GbE dual port. Case #1 RX only, single flow goes to single queue: IRQ rate reduced by ~ 30%, CPU utilization improved by 2%. Case #2 IP forwarding from port 1 to port 0 single flow goes to single queue: Avg latency improved from 60us to 21us, frame loss improved from 0.5% to 0.0%. Case #3 IP forwarding from port 1 to port 0 Max Throughput IXIA sends 100%, 8192 UDP flows, goes to 24 queues: Enhanced is equal or slightly better than basic. Testing the basic compression feature with this patch shows there is no perfrormance degradation of the basic compression feature. Signed-off-by: Ofer Levi Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 1ff91cb79ded..eb3fac30488b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -882,6 +882,12 @@ static inline u8 get_cqe_opcode(struct mlx5_cqe64 *cqe) return cqe->op_own >> 4; } +static inline u8 get_cqe_enhanced_num_mini_cqes(struct mlx5_cqe64 *cqe) +{ + /* num_of_mini_cqes is zero based */ + return get_cqe_opcode(cqe) + 1; +} + static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) { return (cqe->lro.tcppsh_abort_dupack >> 6) & 1; -- cgit v1.2.3 From 92125c3a602454824d70edb1b2abb382811cab4f Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 10 Nov 2022 15:32:17 -0600 Subject: ibmvnic: Add hotpluggable CPU callbacks to reassign affinity hints When CPU's are added and removed, ibmvnic devices will reassign hint values. Introduce a new cpu hotplug state CPUHP_IBMVNIC_DEAD to signal to ibmvnic devices that the CPU has been removed and it is time to reset affinity hint assignments. On the other hand, when CPU's are being added, add a state instance to CPUHP_AP_ONLINE_DYN which will trigger a reassignment of affinity hints once the new CPU's are online. This implementation is based on the virtio_net driver. Signed-off-by: Thomas Falcon Signed-off-by: Dany Madden Signed-off-by: Nick Child Reviewed-by: Rick Lindsley Reviewed-by: Haren Myneni Signed-off-by: David S. Miller --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f61447913db9..c8bc85a87b1e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -69,6 +69,7 @@ enum cpuhp_state { CPUHP_X86_APB_DEAD, CPUHP_X86_MCE_DEAD, CPUHP_VIRT_NET_DEAD, + CPUHP_IBMVNIC_DEAD, CPUHP_SLUB_DEAD, CPUHP_DEBUG_OBJ_DEAD, CPUHP_MM_WRITEBACK_DEAD, -- cgit v1.2.3 From 70ea86a0dfed10e00ee2666dadeb563bab00efea Mon Sep 17 00:00:00 2001 From: Steen Hegelund Date: Fri, 11 Nov 2022 14:05:14 +0100 Subject: net: flow_offload: add support for ARP frame matching This adds a new flow_rule_match_arp function that allows drivers to be able to dissect ARP frames. Signed-off-by: Steen Hegelund Signed-off-by: David S. Miller --- include/net/flow_offload.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 7a60bc6d72c9..0400a0ac8a29 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -32,6 +32,10 @@ struct flow_match_vlan { struct flow_dissector_key_vlan *key, *mask; }; +struct flow_match_arp { + struct flow_dissector_key_arp *key, *mask; +}; + struct flow_match_ipv4_addrs { struct flow_dissector_key_ipv4_addrs *key, *mask; }; @@ -98,6 +102,8 @@ void flow_rule_match_vlan(const struct flow_rule *rule, struct flow_match_vlan *out); void flow_rule_match_cvlan(const struct flow_rule *rule, struct flow_match_vlan *out); +void flow_rule_match_arp(const struct flow_rule *rule, + struct flow_match_arp *out); void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out); void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, -- cgit v1.2.3 From 2d577252579b3efb9e934b68948a2edfa9920110 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:23 +0530 Subject: bpf: Remove BPF_MAP_OFF_ARR_MAX In f71b2f64177a ("bpf: Refactor map->off_arr handling"), map->off_arr was refactored to be btf_field_offs. The number of field offsets is equal to maximum possible fields limited by BTF_FIELDS_MAX. Hence, reuse BTF_FIELDS_MAX as spin_lock and timer no longer are to be handled specially for offset sorting, fix the comment, and remove incorrect WARN_ON as its rec->cnt can never exceed this value. The reason to keep separate constant was the it was always more 2 more than total kptrs. This is no longer the case. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 798aec816970..1a66a1df1af1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -165,9 +165,8 @@ struct bpf_map_ops { }; enum { - /* Support at most 8 pointers in a BTF type */ - BTF_FIELDS_MAX = 10, - BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX, + /* Support at most 10 fields in a BTF type */ + BTF_FIELDS_MAX = 10, }; enum btf_field_type { @@ -203,8 +202,8 @@ struct btf_record { struct btf_field_offs { u32 cnt; - u32 field_off[BPF_MAP_OFF_ARR_MAX]; - u8 field_sz[BPF_MAP_OFF_ARR_MAX]; + u32 field_off[BTF_FIELDS_MAX]; + u8 field_sz[BTF_FIELDS_MAX]; }; struct bpf_map { -- cgit v1.2.3 From e5feed0f64f73e167ef70755d3dc2db959d8fd5c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:24 +0530 Subject: bpf: Fix copy_map_value, zero_map_value The current offset needs to also skip over the already copied region in addition to the size of the next field. This case manifests where there are gaps between adjacent special fields. It was observed that for a map value with size 48, having fields at: off: 0, 16, 32 size: 4, 16, 16 The current code does: memcpy(dst + 0, src + 0, 0) memcpy(dst + 4, src + 4, 12) memcpy(dst + 20, src + 20, 12) memcpy(dst + 36, src + 36, 12) With the fix, it is done correctly as: memcpy(dst + 0, src + 0, 0) memcpy(dst + 4, src + 4, 12) memcpy(dst + 32, src + 32, 0) memcpy(dst + 48, src + 48, 0) Fixes: 4d7d7f69f4b1 ("bpf: Adapt copy_map_value for multiple offset case") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1a66a1df1af1..f08eb2d27de0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -360,7 +360,7 @@ static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); - curr_off += foffs->field_sz[i]; + curr_off += foffs->field_sz[i] + sz; } memcpy(dst + curr_off, src + curr_off, size - curr_off); } @@ -390,7 +390,7 @@ static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); - curr_off += foffs->field_sz[i]; + curr_off += foffs->field_sz[i] + sz; } memset(dst + curr_off, 0, size - curr_off); } -- cgit v1.2.3 From f0c5941ff5b255413d31425bb327c2aec3625673 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:25 +0530 Subject: bpf: Support bpf_list_head in map values Add the support on the map side to parse, recognize, verify, and build metadata table for a new special field of the type struct bpf_list_head. To parameterize the bpf_list_head for a certain value type and the list_node member it will accept in that value type, we use BTF declaration tags. The definition of bpf_list_head in a map value will be done as follows: struct foo { struct bpf_list_node node; int data; }; struct map_value { struct bpf_list_head head __contains(foo, node); }; Then, the bpf_list_head only allows adding to the list 'head' using the bpf_list_node 'node' for the type struct foo. The 'contains' annotation is a BTF declaration tag composed of four parts, "contains:name:node" where the name is then used to look up the type in the map BTF, with its kind hardcoded to BTF_KIND_STRUCT during the lookup. The node defines name of the member in this type that has the type struct bpf_list_node, which is actually used for linking into the linked list. For now, 'kind' part is hardcoded as struct. This allows building intrusive linked lists in BPF, using container_of to obtain pointer to entry, while being completely type safe from the perspective of the verifier. The verifier knows exactly the type of the nodes, and knows that list helpers return that type at some fixed offset where the bpf_list_node member used for this list exists. The verifier also uses this information to disallow adding types that are not accepted by a certain list. For now, no elements can be added to such lists. Support for that is coming in future patches, hence draining and freeing items is done with a TODO that will be resolved in a future patch. Note that the bpf_list_head_free function moves the list out to a local variable under the lock and releases it, doing the actual draining of the list items outside the lock. While this helps with not holding the lock for too long pessimizing other concurrent list operations, it is also necessary for deadlock prevention: unless every function called in the critical section would be notrace, a fentry/fexit program could attach and call bpf_map_update_elem again on the map, leading to the same lock being acquired if the key matches and lead to a deadlock. While this requires some special effort on part of the BPF programmer to trigger and is highly unlikely to occur in practice, it is always better if we can avoid such a condition. While notrace would prevent this, doing the draining outside the lock has advantages of its own, hence it is used to also fix the deadlock related problem. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 17 +++++++++++++++++ include/uapi/linux/bpf.h | 10 ++++++++++ 2 files changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f08eb2d27de0..05f98e9e5c48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -175,6 +175,7 @@ enum btf_field_type { BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, + BPF_LIST_HEAD = (1 << 4), }; struct btf_field_kptr { @@ -184,11 +185,18 @@ struct btf_field_kptr { u32 btf_id; }; +struct btf_field_list_head { + struct btf *btf; + u32 value_btf_id; + u32 node_offset; +}; + struct btf_field { u32 offset; enum btf_field_type type; union { struct btf_field_kptr kptr; + struct btf_field_list_head list_head; }; }; @@ -266,6 +274,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; + case BPF_LIST_HEAD: + return "bpf_list_head"; default: WARN_ON_ONCE(1); return "unknown"; @@ -282,6 +292,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return sizeof(u64); + case BPF_LIST_HEAD: + return sizeof(struct bpf_list_head); default: WARN_ON_ONCE(1); return 0; @@ -298,6 +310,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return __alignof__(u64); + case BPF_LIST_HEAD: + return __alignof__(struct bpf_list_head); default: WARN_ON_ONCE(1); return 0; @@ -403,6 +417,9 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); +void bpf_list_head_free(const struct btf_field *field, void *list_head, + struct bpf_spin_lock *spin_lock); + int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fb4c911d2a03..6580448e9f77 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6888,6 +6888,16 @@ struct bpf_dynptr { __u64 :64; } __attribute__((aligned(8))); +struct bpf_list_head { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_node { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. -- cgit v1.2.3 From 2de2669b4e52b2ae2f118bfc310004f50b47f0f5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:26 +0530 Subject: bpf: Rename RET_PTR_TO_ALLOC_MEM Currently, the verifier has two return types, RET_PTR_TO_ALLOC_MEM, and RET_PTR_TO_ALLOC_MEM_OR_NULL, however the former is confusingly named to imply that it carries MEM_ALLOC, while only the latter does. This causes confusion during code review leading to conclusions like that the return value of RET_PTR_TO_DYNPTR_MEM_OR_NULL (which is RET_PTR_TO_ALLOC_MEM | PTR_MAYBE_NULL) may be consumable by bpf_ringbuf_{submit,commit}. Rename it to make it clear MEM_ALLOC needs to be tacked on top of RET_PTR_TO_MEM. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 05f98e9e5c48..2fe3ec620d54 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -607,7 +607,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ - RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ + RET_PTR_TO_MEM, /* returns a pointer to memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, @@ -617,8 +617,8 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, - RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_MEM, + RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is -- cgit v1.2.3 From 894f2a8b1673a355a1a7507a4dfa6a3c836d07c1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:27 +0530 Subject: bpf: Rename MEM_ALLOC to MEM_RINGBUF Currently, verifier uses MEM_ALLOC type tag to specially tag memory returned from bpf_ringbuf_reserve helper. However, this is currently only used for this purpose and there is an implicit assumption that it only refers to ringbuf memory (e.g. the check for ARG_PTR_TO_ALLOC_MEM in check_func_arg_reg_off). Hence, rename MEM_ALLOC to MEM_RINGBUF to indicate this special relationship and instead open the use of MEM_ALLOC for more generic allocations made for user types. Also, since ARG_PTR_TO_ALLOC_MEM_OR_NULL is unused, simply drop it. Finally, update selftests using 'alloc_' verifier string to 'ringbuf_'. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2fe3ec620d54..afc1c51b59ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,10 +488,8 @@ enum bpf_type_flag { */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), - /* MEM was "allocated" from a different helper, and cannot be mixed - * with regular non-MEM_ALLOC'ed MEM types. - */ - MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), + /* MEM points to BPF ring buffer reservation. */ + MEM_RINGBUF = BIT(2 + BPF_BASE_TYPE_BITS), /* MEM is in user address space. */ MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), @@ -565,7 +563,7 @@ enum bpf_arg_type { ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ - ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ + ARG_PTR_TO_RINGBUF_MEM, /* pointer to dynamically reserved ringbuf memory */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ @@ -582,7 +580,6 @@ enum bpf_arg_type { ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, - ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* pointer to memory does not need to be initialized, helper function must fill @@ -617,7 +614,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_MEM, + RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, -- cgit v1.2.3 From 6728aea7216c0c06c98e2e58d753a5e8b2ae1c6f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:28 +0530 Subject: bpf: Refactor btf_struct_access Instead of having to pass multiple arguments that describe the register, pass the bpf_reg_state into the btf_struct_access callback. Currently, all call sites simply reuse the btf and btf_id of the reg they want to check the access of. The only exception to this pattern is the callsite in check_ptr_to_map_access, hence for that case create a dummy reg to simulate PTR_TO_BTF_ID access. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 17 ++++++++--------- include/linux/filter.h | 8 ++++---- 2 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index afc1c51b59ff..49f9d2bec401 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -771,6 +771,7 @@ struct bpf_prog_ops { union bpf_attr __user *uattr); }; +struct bpf_reg_state; struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto * @@ -792,9 +793,8 @@ struct bpf_verifier_ops { struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag); }; @@ -2080,9 +2080,9 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size, return btf_ctx_access(off, size, type, prog, info); } -int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, +int btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, @@ -2333,9 +2333,8 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) } static inline int btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag) { return -EACCES; diff --git a/include/linux/filter.h b/include/linux/filter.h index efc42a6e3aed..787d35dbf5b0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -568,10 +568,10 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; -extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); +extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, -- cgit v1.2.3 From 7d34aa3e03b6a56306296bd98b26c6a1710cd57b Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 14 Oct 2022 23:45:58 +0200 Subject: netfilter: nf_tables: Extend nft_expr_ops::dump callback parameters Add a 'reset' flag just like with nft_object_ops::dump. This will be useful to reset "anonymous stateful objects", e.g. simple rule counters. No functional change intended. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- include/net/netfilter/nft_fib.h | 2 +- include/net/netfilter/nft_meta.h | 4 ++-- include/net/netfilter/nft_reject.h | 3 ++- 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 38e2b396e38a..c557a57fb0f1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -927,7 +927,8 @@ struct nft_expr_ops { void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, + bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index eed099eae672..167640b843ef 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -18,7 +18,7 @@ nft_fib_is_loopback(const struct sk_buff *skb, const struct net_device *in) return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } -int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index f3a5285a511c..ba1238f12a48 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -24,10 +24,10 @@ int nft_meta_set_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]); int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 56b123a42220..6d9ba62efd75 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -22,7 +22,8 @@ int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); -int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_reject_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset); int nft_reject_icmp_code(u8 code); int nft_reject_icmpv6_code(u8 code); -- cgit v1.2.3 From 8daa8fde3fc3f069ff0b5c87079a5c1df7743113 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 14 Oct 2022 23:45:59 +0200 Subject: netfilter: nf_tables: Introduce NFT_MSG_GETRULE_RESET Analogous to NFT_MSG_GETOBJ_RESET, but for rules: Reset stateful expressions like counters or quotas. The latter two are the only consumers, adjust their 'dump' callbacks to respect the parameter introduced earlier. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c557a57fb0f1..e69ce23566ea 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -383,7 +383,7 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); bool nft_expr_reduce_bitwise(struct nft_regs_track *track, const struct nft_expr *expr); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e4b739d57480..cfa844da1ce6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -97,6 +97,7 @@ enum nft_verdicts { * @NFT_MSG_NEWFLOWTABLE: add new flow table (enum nft_flowtable_attributes) * @NFT_MSG_GETFLOWTABLE: get flow table (enum nft_flowtable_attributes) * @NFT_MSG_DELFLOWTABLE: delete flow table (enum nft_flowtable_attributes) + * @NFT_MSG_GETRULE_RESET: get rules and reset stateful expressions (enum nft_obj_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -124,6 +125,7 @@ enum nf_tables_msg_types { NFT_MSG_NEWFLOWTABLE, NFT_MSG_GETFLOWTABLE, NFT_MSG_DELFLOWTABLE, + NFT_MSG_GETRULE_RESET, NFT_MSG_MAX, }; -- cgit v1.2.3 From 14d898f3c1b3bf9c4375ee3255ec9e9b89a35578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 8 Nov 2022 15:05:59 +0100 Subject: dev: Move received_rps counter next to RPS members in softnet data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the received_rps counter value next to the other RPS-related members in softnet_data. This closes two four-byte holes in the structure, making room for another pointer in the first two cache lines without bumping the xmit struct to its own line. Acked-by: Song Liu Reviewed-by: Stanislav Fomichev Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221108140601.149971-2-toke@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 02a2318da7c7..ddc59ef98500 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3128,7 +3128,6 @@ struct softnet_data { /* stats */ unsigned int processed; unsigned int time_squeeze; - unsigned int received_rps; #ifdef CONFIG_RPS struct softnet_data *rps_ipi_list; #endif @@ -3161,6 +3160,7 @@ struct softnet_data { unsigned int cpu; unsigned int input_queue_tail; #endif + unsigned int received_rps; unsigned int dropped; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; -- cgit v1.2.3 From 32637e33003f36e75e9147788cc0e2f21706ef99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 8 Nov 2022 15:06:00 +0100 Subject: bpf: Expand map key argument of bpf_redirect_map to u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For queueing packets in XDP we want to add a new redirect map type with support for 64-bit indexes. To prepare fore this, expand the width of the 'key' argument to the bpf_redirect_map() helper. Since BPF registers are always 64-bit, this should be safe to do after the fact. Acked-by: Song Liu Reviewed-by: Stanislav Fomichev Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221108140601.149971-3-toke@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- include/linux/filter.h | 12 ++++++------ include/uapi/linux/bpf.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 49f9d2bec401..54462dd28824 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -135,7 +135,7 @@ struct bpf_map_ops { struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ - int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags); + int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure diff --git a/include/linux/filter.h b/include/linux/filter.h index 787d35dbf5b0..bf701976056e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -643,13 +643,13 @@ struct bpf_nh_params { }; struct bpf_redirect_info { - u32 flags; - u32 tgt_index; + u64 tgt_index; void *tgt_value; struct bpf_map *map; + u32 flags; + u32 kern_flags; u32 map_id; enum bpf_map_type map_type; - u32 kern_flags; struct bpf_nh_params nh; }; @@ -1504,7 +1504,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* IS_ENABLED(CONFIG_IPV6) */ -static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { @@ -1515,7 +1515,7 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; - ri->tgt_value = lookup_elem(map, ifindex); + ri->tgt_value = lookup_elem(map, index); if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program @@ -1527,7 +1527,7 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind return flags & action_mask; } - ri->tgt_index = ifindex; + ri->tgt_index = index; ri->map_id = map->id; ri->map_type = map->map_type; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6580448e9f77..ab86145df760 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2647,7 +2647,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u64 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain -- cgit v1.2.3 From 53d04b9811107633f25be02a5d981a6070d09e6e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 14 Nov 2022 19:07:30 +0200 Subject: net: dsa: remove phylink_validate() method As of now, no DSA driver uses a custom link mode validation procedure anymore. So remove this DSA operation and let phylink determine what is supported based on config->mac_capabilities (if provided by the driver). Leave a comment why we left the code that we did, and that there is more work to do. Signed-off-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index ee369670e20e..dde364688739 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -880,9 +880,6 @@ struct dsa_switch_ops { */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); - void (*phylink_validate)(struct dsa_switch *ds, int port, - unsigned long *supported, - struct phylink_link_state *state); struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds, int port, phy_interface_t iface); -- cgit v1.2.3 From 02ae6a7034d7b2e3d89e33d73da10a1f156789a0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 8 Nov 2022 14:23:55 -0600 Subject: wifi: cfg80211: Avoid clashing function prototypes When built with Control Flow Integrity, function prototypes between caller and function declaration must match. These mismatches are visible at compile time with the new -Wcast-function-type-strict in Clang[1]. Fix a total of 73 warnings like these: drivers/net/wireless/intersil/orinoco/wext.c:1379:27: warning: cast from 'int (*)(struct net_device *, struct iw_request_info *, struct iw_param *, char *)' to 'iw_handler' (aka 'int (*)(struct net_device *, struct iw_request_info *, union iwreq_data *, char *)') converts to incompatible function type [-Wcast-function-type-strict] IW_HANDLER(SIOCGIWPOWER, (iw_handler)orinoco_ioctl_getpower), ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../net/wireless/wext-compat.c:1607:33: warning: cast from 'int (*)(struct net_device *, struct iw_request_info *, struct iw_point *, char *)' to 'iw_handler' (aka 'int (*)(struct net_device *, struct iw_request_info *, union iwreq_data *, char *)') converts to incompatible function type [-Wcast-function-type-strict] [IW_IOCTL_IDX(SIOCSIWGENIE)] = (iw_handler) cfg80211_wext_siwgenie, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/wireless/intersil/orinoco/wext.c:1390:27: error: incompatible function pointer types initializing 'const iw_handler' (aka 'int (*const)(struct net_device *, struct iw_request_info *, union iwreq_data *, char *)') with an expression of type 'int (struct net_device *, struct iw_request_info *, struct iw_param *, char *)' [-Wincompatible-function-pointer-types] IW_HANDLER(SIOCGIWRETRY, cfg80211_wext_giwretry), ^~~~~~~~~~~~~~~~~~~~~~ The cfg80211 Wireless Extension handler callbacks (iw_handler) use a union for the data argument. Actually use the union and perform explicit member selection in the function body instead of having a function prototype mismatch. There are no resulting binary differences before/after changes. These changes were made partly manually and partly with the help of Coccinelle. Link: https://github.com/KSPP/linux/issues/234 Link: https://reviews.llvm.org/D134831 [1] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/a68822bf8dd587988131bb6a295280cb4293f05d.1667934775.git.gustavoars@kernel.org --- include/net/cfg80211-wext.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211-wext.h b/include/net/cfg80211-wext.h index ad77caf2ffde..0ee36d97e068 100644 --- a/include/net/cfg80211-wext.h +++ b/include/net/cfg80211-wext.h @@ -19,34 +19,34 @@ */ int cfg80211_wext_giwname(struct net_device *dev, struct iw_request_info *info, - char *name, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, - u32 *mode, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, - u32 *mode, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwscan(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwscan(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwrange(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwrts(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rts, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwrts(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rts, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwfrag(struct net_device *dev, struct iw_request_info *info, - struct iw_param *frag, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwfrag(struct net_device *dev, struct iw_request_info *info, - struct iw_param *frag, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwretry(struct net_device *dev, struct iw_request_info *info, - struct iw_param *retry, char *extra); + union iwreq_data *wrqu, char *extra); #endif /* __NET_CFG80211_WEXT_H */ -- cgit v1.2.3 From 06463f6e98df34908c26aa8e7a31a279646b1f51 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 9 Nov 2022 14:42:49 -0800 Subject: wifi: wl1251: drop support for platform data Remove support for configuring the device via platform data because there are no users of wl1251_platform_data left in the mainline kernel. Signed-off-by: Dmitry Torokhov Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20221109224250.2885119-2-dmitry.torokhov@gmail.com --- include/linux/wl12xx.h | 44 -------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 include/linux/wl12xx.h (limited to 'include') diff --git a/include/linux/wl12xx.h b/include/linux/wl12xx.h deleted file mode 100644 index 03d61f1d23ab..000000000000 --- a/include/linux/wl12xx.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * This file is part of wl12xx - * - * Copyright (C) 2009 Nokia Corporation - * - * Contact: Luciano Coelho - */ - -#ifndef _LINUX_WL12XX_H -#define _LINUX_WL12XX_H - -#include - -struct wl1251_platform_data { - int power_gpio; - /* SDIO only: IRQ number if WLAN_IRQ line is used, 0 for SDIO IRQs */ - int irq; - bool use_eeprom; -}; - -#ifdef CONFIG_WILINK_PLATFORM_DATA - -int wl1251_set_platform_data(const struct wl1251_platform_data *data); - -struct wl1251_platform_data *wl1251_get_platform_data(void); - -#else - -static inline -int wl1251_set_platform_data(const struct wl1251_platform_data *data) -{ - return -ENOSYS; -} - -static inline -struct wl1251_platform_data *wl1251_get_platform_data(void) -{ - return ERR_PTR(-ENODATA); -} - -#endif - -#endif -- cgit v1.2.3 From 67fb43308f4b354f13aabcc66dd5d99bfbb7e838 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Nov 2022 13:57:54 -0800 Subject: udp: Set NULL to sk->sk_prot->h.udp_table. We will soon introduce an optional per-netns hash table for UDP. This means we cannot use the global sk->sk_prot->h.udp_table to fetch a UDP hash table. Instead, set NULL to sk->sk_prot->h.udp_table for UDP and get a proper table from net->ipv4.udp_table. Note that we still need sk->sk_prot->h.udp_table for UDP LITE. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 25f90bba4889..e4cc4d3cacc4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -43,6 +43,7 @@ struct tcp_fastopen_context; struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; + struct udp_table *udp_table; #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; -- cgit v1.2.3 From 9804985bf27f8fbcf0d96c7435b5ad94a2a6ea20 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Nov 2022 13:57:57 -0800 Subject: udp: Introduce optional per-netns hash table. The maximum hash table size is 64K due to the nature of the protocol. [0] It's smaller than TCP, and fewer sockets can cause a performance drop. On an EC2 c5.24xlarge instance (192 GiB memory), after running iperf3 in different netns, creating 32Mi sockets without data transfer in the root netns causes regression for the iperf3's connection. uhash_entries sockets length Gbps 64K 1 1 5.69 1Mi 16 5.27 2Mi 32 4.90 4Mi 64 4.09 8Mi 128 2.96 16Mi 256 2.06 32Mi 512 1.12 The per-netns hash table breaks the lengthy lists into shorter ones. It is useful on a multi-tenant system with thousands of netns. With smaller hash tables, we can look up sockets faster, isolate noisy neighbours, and reduce lock contention. The max size of the per-netns table is 64K as well. This is because the possible hash range by udp_hashfn() always fits in 64K within the same netns and we cannot make full use of the whole buckets larger than 64K. /* 0 < num < 64K -> X < hash < X + 64K */ (num + net_hash_mix(net)) & mask; Also, the min size is 128. We use a bitmap to search for an available port in udp_lib_get_port(). To keep the bitmap on the stack and not fire the CONFIG_FRAME_WARN error at build time, we round up the table size to 128. The sysctl usage is the same with TCP: $ dmesg | cut -d ' ' -f 6- | grep "UDP hash" UDP hash table entries: 65536 (order: 9, 2097152 bytes, vmalloc) # sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = 65536 # can be changed by uhash_entries # sysctl net.ipv4.udp_child_hash_entries net.ipv4.udp_child_hash_entries = 0 # disabled by default # ip netns add test1 # ip netns exec test1 sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = -65536 # share the global table # sysctl -w net.ipv4.udp_child_hash_entries=100 net.ipv4.udp_child_hash_entries = 100 # ip netns add test2 # ip netns exec test2 sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = 128 # own a per-netns table with 2^n buckets We could optimise the hash table lookup/iteration further by removing the netns comparison for the per-netns one in the future. Also, we could optimise the sparse udp_hslot layout by putting it in udp_table. [0]: https://lore.kernel.org/netdev/4ACC2815.7010101@gmail.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/udp.h | 2 ++ include/net/netns/ipv4.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index dea57aa37df6..a2892e151644 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -23,7 +23,9 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb) return (struct udphdr *)skb_transport_header(skb); } +#define UDP_HTABLE_SIZE_MIN_PERNET 128 #define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256) +#define UDP_HTABLE_SIZE_MAX 65536 static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) { diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e4cc4d3cacc4..db762e35aca9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -208,6 +208,8 @@ struct netns_ipv4 { atomic_t dev_addr_genid; + unsigned int sysctl_udp_child_hash_entries; + #ifdef CONFIG_SYSCTL unsigned long *sysctl_local_reserved_ports; int sysctl_ip_prot_sock; -- cgit v1.2.3 From 6c1c5097781f563b70a81683ea6fdac21637573b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 08:53:55 +0000 Subject: net: add atomic_long_t to net_device_stats fields Long standing KCSAN issues are caused by data-race around some dev->stats changes. Most performance critical paths already use per-cpu variables, or per-queue ones. It is reasonable (and more correct) to use atomic operations for the slow paths. This patch adds an union for each field of net_device_stats, so that we can convert paths that are not yet protected by a spinlock or a mutex. netdev_stats_to_stats64() no longer has an #if BITS_PER_LONG==64 Note that the memcpy() we were using on 64bit arches had no provision to avoid load-tearing, while atomic_long_read() is providing the needed protection at no cost. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 58 ++++++++++++++++++++++++++++------------------- include/net/dst.h | 5 ++-- 2 files changed, 37 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 02a2318da7c7..23b3903b0678 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -171,31 +171,38 @@ static inline bool dev_xmit_complete(int rc) * (unsigned long) so they can be read and written atomically. */ +#define NET_DEV_STAT(FIELD) \ + union { \ + unsigned long FIELD; \ + atomic_long_t __##FIELD; \ + } + struct net_device_stats { - unsigned long rx_packets; - unsigned long tx_packets; - unsigned long rx_bytes; - unsigned long tx_bytes; - unsigned long rx_errors; - unsigned long tx_errors; - unsigned long rx_dropped; - unsigned long tx_dropped; - unsigned long multicast; - unsigned long collisions; - unsigned long rx_length_errors; - unsigned long rx_over_errors; - unsigned long rx_crc_errors; - unsigned long rx_frame_errors; - unsigned long rx_fifo_errors; - unsigned long rx_missed_errors; - unsigned long tx_aborted_errors; - unsigned long tx_carrier_errors; - unsigned long tx_fifo_errors; - unsigned long tx_heartbeat_errors; - unsigned long tx_window_errors; - unsigned long rx_compressed; - unsigned long tx_compressed; + NET_DEV_STAT(rx_packets); + NET_DEV_STAT(tx_packets); + NET_DEV_STAT(rx_bytes); + NET_DEV_STAT(tx_bytes); + NET_DEV_STAT(rx_errors); + NET_DEV_STAT(tx_errors); + NET_DEV_STAT(rx_dropped); + NET_DEV_STAT(tx_dropped); + NET_DEV_STAT(multicast); + NET_DEV_STAT(collisions); + NET_DEV_STAT(rx_length_errors); + NET_DEV_STAT(rx_over_errors); + NET_DEV_STAT(rx_crc_errors); + NET_DEV_STAT(rx_frame_errors); + NET_DEV_STAT(rx_fifo_errors); + NET_DEV_STAT(rx_missed_errors); + NET_DEV_STAT(tx_aborted_errors); + NET_DEV_STAT(tx_carrier_errors); + NET_DEV_STAT(tx_fifo_errors); + NET_DEV_STAT(tx_heartbeat_errors); + NET_DEV_STAT(tx_window_errors); + NET_DEV_STAT(rx_compressed); + NET_DEV_STAT(tx_compressed); }; +#undef NET_DEV_STAT /* per-cpu stats, allocated on demand. * Try to fit them in a single cache line, for dev_get_stats() sake. @@ -5171,4 +5178,9 @@ extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; extern struct net_device *blackhole_netdev; +/* Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters. */ +#define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD) +#define DEV_STATS_ADD(DEV, FIELD, VAL) \ + atomic_long_add((VAL), &(DEV)->stats.__##FIELD) + #endif /* _LINUX_NETDEVICE_H */ diff --git a/include/net/dst.h b/include/net/dst.h index 00b479ce6b99..d67fda89cd0f 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -356,9 +356,8 @@ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { - /* TODO : stats should be SMP safe */ - dev->stats.rx_packets++; - dev->stats.rx_bytes += skb->len; + DEV_STATS_INC(dev, rx_packets); + DEV_STATS_ADD(dev, rx_bytes, skb->len); __skb_tunnel_rx(skb, dev, net); } -- cgit v1.2.3 From 3af43ba4c6019b29c048921eb8147eb010165329 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 16 Nov 2022 15:50:58 +0800 Subject: bpf: Pass map file to .map_update_batch directly Currently bpf_map_do_batch() first invokes fdget(batch.map_fd) to get the target map file, then it invokes generic_map_update_batch() to do batch update. generic_map_update_batch() will get the target map file by using fdget(batch.map_fd) again and pass it to bpf_map_update_value(). The problem is map file returned by the second fdget() may be NULL or a totally different file compared by map file in bpf_map_do_batch(). The reason is that the first fdget() only guarantees the liveness of struct file instead of file descriptor and the file description may be released by concurrent close() through pick_file(). It doesn't incur any problem as for now, because maps with batch update support don't use map file in .map_fd_get_ptr() ops. But it is better to fix the potential access of an invalid map file. Using __bpf_map_get() again in generic_map_update_batch() can not fix the problem, because batch.map_fd may be closed and reopened, and the returned map file may be different with map file got in bpf_map_do_batch(), so just passing the map file directly to .map_update_batch() in bpf_map_do_batch(). Signed-off-by: Hou Tao Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221116075059.1551277-1-houtao@huaweicloud.com --- include/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 54462dd28824..e60a5c052473 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -85,7 +85,8 @@ struct bpf_map_ops { int (*map_lookup_and_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); - int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr, + int (*map_update_batch)(struct bpf_map *map, struct file *map_file, + const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -1789,7 +1790,7 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); -int generic_map_update_batch(struct bpf_map *map, +int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr); int generic_map_delete_batch(struct bpf_map *map, -- cgit v1.2.3 From 282de143ead96a5d53331e946f31c977b4610a74 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:55 +0530 Subject: bpf: Introduce allocated objects support Introduce support for representing pointers to objects allocated by the BPF program, i.e. PTR_TO_BTF_ID that point to a type in program BTF. This is indicated by the presence of MEM_ALLOC type flag in reg->type to avoid having to check btf_is_kernel when trying to match argument types in helpers. Whenever walking such types, any pointers being walked will always yield a SCALAR instead of pointer. In the future we might permit kptr inside such allocated objects (either kernel or program allocated), and it will then form a PTR_TO_BTF_ID of the respective type. For now, such allocated objects will always be referenced in verifier context, hence ref_obj_id == 0 for them is a bug. It is allowed to write to such objects, as long fields that are special are not touched (support for which will be added in subsequent patches). Note that once such a pointer is marked PTR_UNTRUSTED, it is no longer allowed to write to it. No PROBE_MEM handling is therefore done for loads into this type unless PTR_UNTRUSTED is part of the register type, since they can never be in an undefined state, and their lifetime will always be valid. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e60a5c052473..7440c20c4192 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -525,6 +525,11 @@ enum bpf_type_flag { /* Size is known at compile time. */ MEM_FIXED_SIZE = BIT(10 + BPF_BASE_TYPE_BITS), + /* MEM is of an allocated object of type in program BTF. This is used to + * tag PTR_TO_BTF_ID allocated using bpf_obj_new. + */ + MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -2792,4 +2797,10 @@ struct bpf_key { bool has_ref; }; #endif /* CONFIG_KEYS */ + +static inline bool type_is_alloc(u32 type) +{ + return type & MEM_ALLOC; +} + #endif /* _LINUX_BPF_H */ -- cgit v1.2.3 From 8ffa5cc142137a59d6a10eb5273fa2ba5dcd4947 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:56 +0530 Subject: bpf: Recognize lock and list fields in allocated objects Allow specifying bpf_spin_lock, bpf_list_head, bpf_list_node fields in a allocated object. Also update btf_struct_access to reject direct access to these special fields. A bpf_list_head allows implementing map-in-map style use cases, where an allocated object with bpf_list_head is linked into a list in a map value. This would require embedding a bpf_list_node, support for which is also included. The bpf_spin_lock is used to protect the bpf_list_head and other data. While we strictly don't require to hold a bpf_spin_lock while touching the bpf_list_head in such objects, as when have access to it, we have complete ownership of the object, the locking constraint is still kept and may be conditionally lifted in the future. Note that the specification of such types can be done just like map values, e.g.: struct bar { struct bpf_list_node node; }; struct foo { struct bpf_spin_lock lock; struct bpf_list_head head __contains(bar, node); struct bpf_list_node node; }; struct map_value { struct bpf_spin_lock lock; struct bpf_list_head head __contains(foo, node); }; To recognize such types in user BTF, we build a btf_struct_metas array of metadata items corresponding to each BTF ID. This is done once during the btf_parse stage to avoid having to do it each time during the verification process's requirement to inspect the metadata. Moreover, the computed metadata needs to be passed to some helpers in future patches which requires allocating them and storing them in the BTF that is pinned by the program itself, so that valid access can be assumed to such data during program runtime. A key thing to note is that once a btf_struct_meta is available for a type, both the btf_record and btf_field_offs should be available. It is critical that btf_field_offs is available in case special fields are present, as we extensively rely on special fields being zeroed out in map values and allocated objects in later patches. The code ensures that by bailing out in case of errors and ensuring both are available together. If the record is not available, the special fields won't be recognized, so not having both is also fine (in terms of being a verification error and not a runtime bug). Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++++ include/linux/btf.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7440c20c4192..eb6ea53fa5a2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -177,6 +177,7 @@ enum btf_field_type { BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, BPF_LIST_HEAD = (1 << 4), + BPF_LIST_NODE = (1 << 5), }; struct btf_field_kptr { @@ -277,6 +278,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "kptr"; case BPF_LIST_HEAD: return "bpf_list_head"; + case BPF_LIST_NODE: + return "bpf_list_node"; default: WARN_ON_ONCE(1); return "unknown"; @@ -295,6 +298,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(u64); case BPF_LIST_HEAD: return sizeof(struct bpf_list_head); + case BPF_LIST_NODE: + return sizeof(struct bpf_list_node); default: WARN_ON_ONCE(1); return 0; @@ -313,6 +318,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); + case BPF_LIST_NODE: + return __alignof__(struct bpf_list_node); default: WARN_ON_ONCE(1); return 0; diff --git a/include/linux/btf.h b/include/linux/btf.h index d80345fa566b..a01a8da20021 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include @@ -78,6 +80,17 @@ struct btf_id_dtor_kfunc { u32 kfunc_btf_id; }; +struct btf_struct_meta { + u32 btf_id; + struct btf_record *record; + struct btf_field_offs *field_offs; +}; + +struct btf_struct_metas { + u32 cnt; + struct btf_struct_meta types[]; +}; + typedef void (*btf_dtor_kfunc_t)(void *); extern const struct file_operations btf_fops; @@ -408,6 +421,23 @@ static inline struct btf_param *btf_params(const struct btf_type *t) return (struct btf_param *)(t + 1); } +static inline int btf_id_cmp_func(const void *a, const void *b) +{ + const int *pa = a, *pb = b; + + return *pa - *pb; +} + +static inline bool btf_id_set_contains(const struct btf_id_set *set, u32 id) +{ + return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; +} + +static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) +{ + return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); +} + #ifdef CONFIG_BPF_SYSCALL struct bpf_prog; @@ -423,6 +453,7 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); +struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -454,6 +485,10 @@ static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dt { return 0; } +static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id) +{ + return NULL; +} #endif static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) -- cgit v1.2.3 From 865ce09a49d79d2b2c1d980f4c05ffc0b3517bdc Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:57 +0530 Subject: bpf: Verify ownership relationships for user BTF types Ensure that there can be no ownership cycles among different types by way of having owning objects that can hold some other type as their element. For instance, a map value can only hold allocated objects, but these are allowed to have another bpf_list_head. To prevent unbounded recursion while freeing resources, elements of bpf_list_head in local kptrs can never have a bpf_list_head which are part of list in a map value. Later patches will verify this by having dedicated BTF selftests. Also, to make runtime destruction easier, once btf_struct_metas is fully populated, we can stash the metadata of the value type directly in the metadata of the list_head fields, as that allows easier access to the value type's layout to destruct it at runtime from the btf_field entry of the list head itself. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/btf.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb6ea53fa5a2..323985a39ece 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -191,6 +191,7 @@ struct btf_field_list_head { struct btf *btf; u32 value_btf_id; u32 node_offset; + struct btf_record *value_rec; }; struct btf_field { diff --git a/include/linux/btf.h b/include/linux/btf.h index a01a8da20021..42d8f3730a8d 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -178,6 +178,7 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); +int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec); struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); -- cgit v1.2.3 From d0d78c1df9b1dbfb5e172de473561ce09d5e9d39 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:59 +0530 Subject: bpf: Allow locking bpf_spin_lock global variables Global variables reside in maps accessible using direct_value_addr callbacks, so giving each load instruction's rewrite a unique reg->id disallows us from holding locks which are global. The reason for preserving reg->id as a unique value for registers that may point to spin lock is that two separate lookups are treated as two separate memory regions, and any possible aliasing is ignored for the purposes of spin lock correctness. This is not great especially for the global variable case, which are served from maps that have max_entries == 1, i.e. they always lead to map values pointing into the same map value. So refactor the active_spin_lock into a 'active_lock' structure which represents the lock identity, and instead of the reg->id, remember two fields, a pointer and the reg->id. The pointer will store reg->map_ptr or reg->btf. It's only necessary to distinguish for the id == 0 case of global variables, but always setting the pointer to a non-NULL value and using the pointer to check whether the lock is held simplifies code in the verifier. This is generic enough to allow it for global variables, map lookups, and allocated objects at the same time. Note that while whether a lock is held can be answered by just comparing active_lock.ptr to NULL, to determine whether the register is pointing to the same held lock requires comparing _both_ ptr and id. Finally, as a result of this refactoring, pseudo load instructions are not given a unique reg->id, as they are doing lookup for the same map value (max_entries is never greater than 1). Essentially, we consider that the tuple of (ptr, id) will always be unique for any kind of argument to bpf_spin_{lock,unlock}. Note that this can be extended in the future to also remember offset used for locking, so that we can introduce multiple bpf_spin_lock fields in the same allocation. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-10-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1a32baa78ce2..1db2b4dc7009 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -323,7 +323,21 @@ struct bpf_verifier_state { u32 branches; u32 insn_idx; u32 curframe; - u32 active_spin_lock; + /* For every reg representing a map value or allocated object pointer, + * we consider the tuple of (ptr, id) for them to be unique in verifier + * context and conside them to not alias each other for the purposes of + * tracking lock state. + */ + struct { + /* This can either be reg->map_ptr or reg->btf. If ptr is NULL, + * there's no active lock held, and other fields have no + * meaning. If non-NULL, it indicates that a lock is held and + * id member has the reg->id of the register which can be >= 0. + */ + void *ptr; + /* This will be reg->id */ + u32 id; + } active_lock; bool speculative; /* first and last insn idx of this verifier state */ -- cgit v1.2.3 From 00b85860feb809852af9a88cb4ca8766d7dff6a3 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:26:01 +0530 Subject: bpf: Rewrite kfunc argument handling As we continue to add more features, argument types, kfunc flags, and different extensions to kfuncs, the code to verify the correctness of the kfunc prototype wrt the passed in registers has become ad-hoc and ugly to read. To make life easier, and make a very clear split between different stages of argument processing, move all the code into verifier.c and refactor into easier to read helpers and functions. This also makes sharing code within the verifier easier with kfunc argument processing. This will be more and more useful in later patches as we are now moving to implement very core BPF helpers as kfuncs, to keep them experimental before baking into UAPI. Remove all kfunc related bits now from btf_check_func_arg_match, as users have been converted away to refactored kfunc argument handling. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-12-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ----------- include/linux/bpf_verifier.h | 2 -- include/linux/btf.h | 31 ++++++++++++++++++++++++++++++- 3 files changed, 30 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 323985a39ece..0a74df731eb8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2109,22 +2109,11 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); -struct bpf_kfunc_arg_meta { - u64 r0_size; - bool r0_rdonly; - int ref_obj_id; - u32 flags; -}; - struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); -int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, - const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs, - struct bpf_kfunc_arg_meta *meta); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1db2b4dc7009..fb146b0ce006 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -603,8 +603,6 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type); -int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, diff --git a/include/linux/btf.h b/include/linux/btf.h index 42d8f3730a8d..d5b26380a60f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -338,6 +338,16 @@ static inline bool btf_type_is_struct(const struct btf_type *t) return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; } +static inline bool __btf_type_is_struct(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; +} + +static inline bool btf_type_is_array(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; +} + static inline u16 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); @@ -439,9 +449,10 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); } -#ifdef CONFIG_BPF_SYSCALL struct bpf_prog; +struct bpf_verifier_log; +#ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); @@ -455,6 +466,12 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); +const struct btf_member * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg); +bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -490,6 +507,18 @@ static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf { return NULL; } +static inline const struct btf_member * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) +{ + return NULL; +} +static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2) +{ + return false; +} #endif static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) -- cgit v1.2.3 From 958cf2e273f0929c66169e0788031310e8118722 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:26:03 +0530 Subject: bpf: Introduce bpf_obj_new Introduce type safe memory allocator bpf_obj_new for BPF programs. The kernel side kfunc is named bpf_obj_new_impl, as passing hidden arguments to kfuncs still requires having them in prototype, unlike BPF helpers which always take 5 arguments and have them checked using bpf_func_proto in verifier, ignoring unset argument types. Introduce __ign suffix to ignore a specific kfunc argument during type checks, then use this to introduce support for passing type metadata to the bpf_obj_new_impl kfunc. The user passes BTF ID of the type it wants to allocates in program BTF, the verifier then rewrites the first argument as the size of this type, after performing some sanity checks (to ensure it exists and it is a struct type). The second argument is also fixed up and passed by the verifier. This is the btf_struct_meta for the type being allocated. It would be needed mostly for the offset array which is required for zero initializing special fields while leaving the rest of storage in unitialized state. It would also be needed in the next patch to perform proper destruction of the object's special fields. Under the hood, bpf_obj_new will call bpf_mem_alloc and bpf_mem_free, using the any context BPF memory allocator introduced recently. To this end, a global instance of the BPF memory allocator is initialized on boot to be used for this purpose. This 'bpf_global_ma' serves all allocations for bpf_obj_new. In the future, bpf_obj_new variants will allow specifying a custom allocator. Note that now that bpf_obj_new can be used to allocate objects that can be linked to BPF linked list (when future linked list helpers are available), we need to also free the elements using bpf_mem_free. However, since the draining of elements is done outside the bpf_spin_lock, we need to do migrate_disable around the call since bpf_list_head_free can be called from map free path where migration is enabled. Otherwise, when called from BPF programs migration is already disabled. A convenience macro is included in the bpf_experimental.h header to hide over the ugly details of the implementation, leading to user code looking similar to a language level extension which allocates and constructs fields of a user type. struct bar { struct bpf_list_node node; }; struct foo { struct bpf_spin_lock lock; struct bpf_list_head head __contains(bar, node); }; void prog(void) { struct foo *f; f = bpf_obj_new(typeof(*f)); if (!f) return; ... } A key piece of this story is still missing, i.e. the free function, which will come in the next patch. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-14-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 21 +++++++++++++-------- include/linux/bpf_verifier.h | 2 ++ 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0a74df731eb8..8b32376ce746 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -54,6 +54,8 @@ struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; +extern struct bpf_mem_alloc bpf_global_ma; +extern bool bpf_global_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, @@ -334,16 +336,19 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline void check_and_init_map_value(struct bpf_map *map, void *dst) +static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj) { - if (!IS_ERR_OR_NULL(map->record)) { - struct btf_field *fields = map->record->fields; - u32 cnt = map->record->cnt; - int i; + int i; - for (i = 0; i < cnt; i++) - memset(dst + fields[i].offset, 0, btf_field_type_size(fields[i].type)); - } + if (!foffs) + return; + for (i = 0; i < foffs->cnt; i++) + memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]); +} + +static inline void check_and_init_map_value(struct bpf_map *map, void *dst) +{ + bpf_obj_init(map->field_offs, dst); } /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index fb146b0ce006..3dc72d396dfc 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -433,6 +433,8 @@ struct bpf_insn_aux_data { */ struct bpf_loop_inline_state loop_inline_state; }; + u64 obj_new_size; /* remember the size of type passed to bpf_obj_new to rewrite R1 */ + struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ -- cgit v1.2.3 From 534e86bc6c66e1e0c798a1c0a6a680bb231c08db Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:26:07 +0530 Subject: bpf: Add 'release on unlock' logic for bpf_list_push_{front,back} This commit implements the delayed release logic for bpf_list_push_front and bpf_list_push_back. Once a node has been added to the list, it's pointer changes to PTR_UNTRUSTED. However, it is only released once the lock protecting the list is unlocked. For such PTR_TO_BTF_ID | MEM_ALLOC with PTR_UNTRUSTED set but an active ref_obj_id, it is still permitted to read them as long as the lock is held. Writing to them is not allowed. This allows having read access to push items we no longer own until we release the lock guarding the list, allowing a little more flexibility when working with these APIs. Note that enabling write support has fairly tricky interactions with what happens inside the critical section. Just as an example, currently, bpf_obj_drop is not permitted, but if it were, being able to write to the PTR_UNTRUSTED pointer while the object gets released back to the memory allocator would violate safety properties we wish to guarantee (i.e. not crashing the kernel). The memory could be reused for a different type in the BPF program or even in the kernel as it gets eventually kfree'd. Not enabling bpf_obj_drop inside the critical section would appear to prevent all of the above, but that is more of an artifical limitation right now. Since the write support is tangled with how we handle potential aliasing of nodes inside the critical section that may or may not be part of the list anymore, it has been deferred to a future patch. Acked-by: Dave Marchevsky Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-18-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3dc72d396dfc..23f30c685f28 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -223,6 +223,11 @@ struct bpf_reference_state { * exiting a callback function. */ int callback_ref; + /* Mark the reference state to release the registers sharing the same id + * on bpf_spin_unlock (for nodes that we will lose ownership to but are + * safe to access inside the critical section). + */ + bool release_on_unlock; }; /* state of the program: -- cgit v1.2.3 From f20a0a0519f35a3a34236ed2d38b067c5cb22b9f Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 17 Nov 2022 02:18:28 +0900 Subject: ethtool: doc: clarify what drivers can implement in their get_drvinfo() Many of the drivers which implement ethtool_ops::get_drvinfo() will prints the .driver, .version or .bus_info of struct ethtool_drvinfo. To have a glance of current state, do: $ git grep -W "get_drvinfo(struct" Printing in those three fields is useless because: - since [1], the driver version should be the kernel version (at least for upstream drivers). Arguably, out of tree drivers might still want to set a custom version, but out of tree is not our focus. - since [2], the core is able to provide default values for .driver and .bus_info. In summary, drivers may provide .fw_version and .erom_version, the rest is expected to be done by the core. In struct ethtool_ops doc from linux/ethtool: rephrase field get_drvinfo() doc to discourage developers from implementing this callback. In struct ethtool_drvinfo doc from uapi/linux/ethtool.h: remove the paragraph mentioning what drivers should do. Rationale: no need to repeat what is already written in struct ethtool_ops doc. But add a note that .fw_version and .erom_version are driver defined. Also update the dummy driver and simply remove the callback in order not to confuse the newcomers: most of the drivers will not need this callback function any more. [1] commit 6a7e25c7fb48 ("net/core: Replace driver version to be kernel version") Link: https://git.kernel.org/torvalds/linux/c/6a7e25c7fb48 [2] commit edaf5df22cb8 ("ethtool: ethtool_get_drvinfo: populate drvinfo fields even if callback exits") Link: https://git.kernel.org/netdev/net-next/c/edaf5df22cb8 Reviewed-by: Leon Romanovsky Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20221116171828.4093-1-mailhol.vincent@wanadoo.fr Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 8 ++++---- include/uapi/linux/ethtool.h | 10 ++++------ 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 5c51c7fda32a..9e0a76fc7de9 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -473,10 +473,10 @@ struct ethtool_module_power_mode_params { * parameter. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. - * @get_drvinfo: Report driver/device information. Should only set the - * @driver, @version, @fw_version and @bus_info fields. If not - * implemented, the @driver and @bus_info fields will be filled in - * according to the netdev's parent device. + * @get_drvinfo: Report driver/device information. Modern drivers no + * longer have to implement this callback. Most fields are + * correctly filled in by the core using system information, or + * populated using other driver operations. * @get_regs_len: Get buffer length required for @get_regs * @get_regs: Get device registers * @get_wol: Report whether Wake-on-Lan is enabled diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f341de2ae612..58e587ba0450 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -159,8 +159,10 @@ static inline __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep) * in its bus driver structure (e.g. pci_driver::name). Must * not be an empty string. * @version: Driver version string; may be an empty string - * @fw_version: Firmware version string; may be an empty string - * @erom_version: Expansion ROM version string; may be an empty string + * @fw_version: Firmware version string; driver defined; may be an + * empty string + * @erom_version: Expansion ROM version string; driver defined; may be + * an empty string * @bus_info: Device bus address. This should match the dev_name() * string for the underlying bus device, if there is one. May be * an empty string. @@ -179,10 +181,6 @@ static inline __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep) * * Users can use the %ETHTOOL_GSSET_INFO command to get the number of * strings in any string set (from Linux 2.6.34). - * - * Drivers should set at most @driver, @version, @fw_version and - * @bus_info in their get_drvinfo() implementation. The ethtool - * core fills in the other fields using other driver operations. */ struct ethtool_drvinfo { __u32 cmd; -- cgit v1.2.3 From 9999f85ba34651726018e0f50d4afdf6c8cc8096 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Nov 2022 03:18:42 +0200 Subject: net: dsa: stop exposing tag proto module helpers to the world The DSA tagging protocol driver macros are in the public include/net/dsa.h probably because that's also where the DSA_TAG_PROTO_*_VALUE macros are (MODULE_ALIAS_DSA_TAG_DRIVER hinges on those macro definitions). But there is no reason to expose these helpers to . That header is shared between switch drivers (drivers/net/dsa/), tagging protocol drivers (net/dsa/tag_*.c), the DSA core (net/dsa/ sans tag_*.c), and the rest of the world (DSA master drivers, network stack, etc). Too much exposure. On the other hand, net/dsa/dsa_priv.h is included only by the DSA core and by DSA tagging protocol drivers (or IOW, "friend" modules). Also a bit too much exposure - I've contemplated creating a new header which is only included by tagging protocol drivers, but completely separating a new dsa_tag_proto.h from dsa_priv.h is not immediately trivial - for example dsa_slave_to_port() is used both from the fast path and from the control path. So for now, move these definitions to dsa_priv.h which at least hides them from the world. Signed-off-by: Vladimir Oltean Tested-by: Michael Walle Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 70 ------------------------------------------------------- 1 file changed, 70 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index dde364688739..82da44561f4c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -118,10 +118,6 @@ struct dsa_netdevice_ops { int cmd); }; -#define DSA_TAG_DRIVER_ALIAS "dsa_tag-" -#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ - MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) - struct dsa_lag { struct net_device *dev; unsigned int id; @@ -1400,70 +1396,4 @@ static inline bool dsa_slave_dev_check(const struct net_device *dev) netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); -struct dsa_tag_driver { - const struct dsa_device_ops *ops; - struct list_head list; - struct module *owner; -}; - -void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], - unsigned int count, - struct module *owner); -void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], - unsigned int count); - -#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ -static int __init dsa_tag_driver_module_init(void) \ -{ \ - dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ - THIS_MODULE); \ - return 0; \ -} \ -module_init(dsa_tag_driver_module_init); \ - \ -static void __exit dsa_tag_driver_module_exit(void) \ -{ \ - dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ -} \ -module_exit(dsa_tag_driver_module_exit) - -/** - * module_dsa_tag_drivers() - Helper macro for registering DSA tag - * drivers - * @__ops_array: Array of tag driver structures - * - * Helper macro for DSA tag drivers which do not do anything special - * in module init/exit. Each module may only use this macro once, and - * calling it replaces module_init() and module_exit(). - */ -#define module_dsa_tag_drivers(__ops_array) \ -dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) - -#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops - -/* Create a static structure we can build a linked list of dsa_tag - * drivers - */ -#define DSA_TAG_DRIVER(__ops) \ -static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ - .ops = &__ops, \ -} - -/** - * module_dsa_tag_driver() - Helper macro for registering a single DSA tag - * driver - * @__ops: Single tag driver structures - * - * Helper macro for DSA tag drivers which do not do anything special - * in module init/exit. Each module may only use this macro once, and - * calling it replaces module_init() and module_exit(). - */ -#define module_dsa_tag_driver(__ops) \ -DSA_TAG_DRIVER(__ops); \ - \ -static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ - &DSA_TAG_DRIVER_NAME(__ops) \ -}; \ -module_dsa_tag_drivers(dsa_tag_driver_array) #endif - -- cgit v1.2.3 From cd502236835b678738810ecd501c85a3a7a11150 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:15 +0100 Subject: devlink: Introduce new attribute 'tx_priority' to devlink-rate To fully utilize offload capabilities of Intel 100G card QoS capabilities new attribute 'tx_priority' needs to be introduced. This attribute allows for usage of strict priority arbiter among siblings. This arbitration scheme attempts to schedule nodes based on their priority as long as the nodes remain within their bandwidth limit. Introduce new attribute in devlink-rate that will allow for configuration of strict priority. New attribute is optional. Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 6 ++++++ include/uapi/linux/devlink.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 611a23a3deb2..90d59d673cb1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -114,6 +114,8 @@ struct devlink_rate { refcount_t refcnt; }; }; + + u32 tx_priority; }; struct devlink_port { @@ -1511,10 +1513,14 @@ struct devlink_ops { u64 tx_share, struct netlink_ext_ack *extack); int (*rate_leaf_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); + int (*rate_leaf_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); + int (*rate_node_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 2f24b53a87a5..1a9214d35ef5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -607,6 +607,7 @@ enum devlink_attr { DEVLINK_ATTR_SELFTESTS, /* nested */ + DEVLINK_ATTR_RATE_TX_PRIORITY, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From 6e2d7e84fcfee62d70e62aa9e469d10b8b6a7dc7 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:16 +0100 Subject: devlink: Introduce new attribute 'tx_weight' to devlink-rate To fully utilize offload capabilities of Intel 100G card QoS capabilities new attribute 'tx_weight' needs to be introduced. This attribute allows for usage of Weighted Fair Queuing arbitration scheme among siblings. This arbitration scheme can be used simultaneously with the strict priority. Introduce new attribute in devlink-rate that will allow for configuration of Weighted Fair Queueing. New attribute is optional. Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 5 +++++ include/uapi/linux/devlink.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 90d59d673cb1..366b23d3f973 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -116,6 +116,7 @@ struct devlink_rate { }; u32 tx_priority; + u32 tx_weight; }; struct devlink_port { @@ -1515,12 +1516,16 @@ struct devlink_ops { u64 tx_max, struct netlink_ext_ack *extack); int (*rate_leaf_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_priority, struct netlink_ext_ack *extack); + int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_weight, struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); int (*rate_node_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_priority, struct netlink_ext_ack *extack); + int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_weight, struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1a9214d35ef5..498d0d5d0957 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -608,6 +608,8 @@ enum devlink_attr { DEVLINK_ATTR_SELFTESTS, /* nested */ DEVLINK_ATTR_RATE_TX_PRIORITY, /* u32 */ + DEVLINK_ATTR_RATE_TX_WEIGHT, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From caba177d7f4d7693a9157ece8c9a30944c949e34 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:17 +0100 Subject: devlink: Enable creation of the devlink-rate nodes from the driver Intel 100G card internal firmware hierarchy for Hierarchicial QoS is very rigid and can't be easily removed. This requires an ability to export default hierarchy to allow user to modify it. Currently the driver is only able to create the 'leaf' nodes, which usually represent the vport. This is not enough for HQoS implemented in Intel hardware. Introduce new function devl_rate_node_create() that allows for creation of the devlink-rate nodes from the driver. Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 366b23d3f973..339a2ed02d36 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1618,6 +1618,9 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external); int devl_rate_leaf_create(struct devlink_port *port, void *priv); +struct devlink_rate * +devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, + struct devlink_rate *parent); void devl_rate_leaf_destroy(struct devlink_port *devlink_port); void devl_rate_nodes_destroy(struct devlink *devlink); void devlink_port_linecard_set(struct devlink_port *devlink_port, -- cgit v1.2.3 From f2fc15e271f2d17f2bee2c5a3b3e50252a7ba91f Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:19 +0100 Subject: devlink: Allow to set up parent in devl_rate_leaf_create() Currently the driver is able to create leaf nodes for the devlink-rate, but is unable to set parent for them. This wasn't as issue before the possibility to export hierarchy from the driver. After adding the export feature, in order for the driver to supply correct hierarchy, it's necessary for it to be able to supply a parent name to devl_rate_leaf_create(). Introduce a new parameter 'parent_name' in devl_rate_leaf_create(). Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 339a2ed02d36..074a79b8933f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1617,10 +1617,12 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external); -int devl_rate_leaf_create(struct devlink_port *port, void *priv); struct devlink_rate * devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, struct devlink_rate *parent); +int +devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, + struct devlink_rate *parent); void devl_rate_leaf_destroy(struct devlink_port *devlink_port); void devl_rate_nodes_destroy(struct devlink *devlink); void devlink_port_linecard_set(struct devlink_port *devlink_port, -- cgit v1.2.3 From b78c4162823dddc621649edae704b14c5973298c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 15 Nov 2022 10:39:53 -0500 Subject: sctp: change to include linux/sctp.h in net/sctp/checksum.h Currently "net/sctp/checksum.h" including "net/sctp/sctp.h" is included in quite some places in netfilter and openswitch and net/sched. It's not necessary to include "net/sctp/sctp.h" if a module does not have dependence on SCTP, "linux/sctp.h" is the right one to include. Signed-off-by: Xin Long Reviewed-by: Saeed Mahameed Link: https://lore.kernel.org/r/ca7ea96d62a26732f0491153c3979dc1c0d8d34a.1668526793.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sctp/checksum.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index 5a9bb09f32b6..f514a0aa849e 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -24,7 +24,7 @@ #define __sctp_checksum_h__ #include -#include +#include #include #include -- cgit v1.2.3 From 647541ea06a7bbbcf941e501c726b3e26328c102 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 15 Nov 2022 10:40:21 -0500 Subject: sctp: move SCTP_PAD4 and SCTP_TRUNC4 to linux/sctp.h Move these two macros from net/sctp/sctp.h to linux/sctp.h, so that it will be enough to include only linux/sctp.h in nft_exthdr.c and xt_sctp.c. It should not include "net/sctp/sctp.h" if a module does not have a dependence on SCTP module. Signed-off-by: Xin Long Reviewed-by: Saeed Mahameed Link: https://lore.kernel.org/r/ef6468a687f36da06f575c2131cd4612f6b7be88.1668526821.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/sctp.h | 5 +++++ include/net/sctp/sctp.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index a86e852507b3..358dc08e0831 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -820,4 +820,9 @@ struct sctp_new_encap_port_hdr { __be16 new_port; }; +/* Round an int up to the next multiple of 4. */ +#define SCTP_PAD4(s) (((s)+3)&~3) +/* Truncate to the previous multiple of 4. */ +#define SCTP_TRUNC4(s) ((s)&~3) + #endif /* __LINUX_SCTP_H__ */ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index a04999ee99b0..01d904b34cf0 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -67,11 +67,6 @@ #define SCTP_PROTOSW_FLAG INET_PROTOSW_PERMANENT #endif -/* Round an int up to the next multiple of 4. */ -#define SCTP_PAD4(s) (((s)+3)&~3) -/* Truncate to the previous multiple of 4. */ -#define SCTP_TRUNC4(s) ((s)&~3) - /* * Function declarations. */ -- cgit v1.2.3 From 33e93ed2209d5971043bed41dd194bc583b57ef3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 16 Nov 2022 15:01:19 -0500 Subject: sctp: add skb_sdif in struct sctp_af Add skb_sdif function in struct sctp_af to get the enslaved device for both ipv4 and ipv6 when adding SCTP VRF support in sctp_rcv in the next patch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 350f250b0dc7..7b4884c63b26 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -477,6 +477,7 @@ struct sctp_af { int (*available) (union sctp_addr *, struct sctp_sock *); int (*skb_iif) (const struct sk_buff *sk); + int (*skb_sdif)(const struct sk_buff *sk); int (*is_ce) (const struct sk_buff *sk); void (*seq_dump_addr)(struct seq_file *seq, union sctp_addr *addr); -- cgit v1.2.3 From 0af03170637f47fb5cc6501d4b2dcbf1c14772a9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 16 Nov 2022 15:01:20 -0500 Subject: sctp: add dif and sdif check in asoc and ep lookup This patch at first adds a pernet global l3mdev_accept to decide if it accepts the packets from a l3mdev when a SCTP socket doesn't bind to any interface. It's set to 1 to avoid any possible incompatible issue, and in next patch, a sysctl will be introduced to allow to change it. Then similar to inet/udp_sk_bound_dev_eq(), sctp_sk_bound_dev_eq() is added to check either dif or sdif is equal to sk_bound_dev_if, and to check sid is 0 or l3mdev_accept is 1 if sk_bound_dev_if is not set. This function is used to match a association or a endpoint, namely called by sctp_addrs_lookup_transport() and sctp_endpoint_is_match(). All functions that needs updating are: sctp_rcv(): asoc: __sctp_rcv_lookup() __sctp_lookup_association() -> sctp_addrs_lookup_transport() __sctp_rcv_lookup_harder() __sctp_rcv_init_lookup() __sctp_lookup_association() -> sctp_addrs_lookup_transport() __sctp_rcv_walk_lookup() __sctp_rcv_asconf_lookup() __sctp_lookup_association() -> sctp_addrs_lookup_transport() ep: __sctp_rcv_lookup_endpoint() -> sctp_endpoint_is_match() sctp_connect(): sctp_endpoint_is_peeled_off() __sctp_lookup_association() sctp_has_association() sctp_lookup_association() __sctp_lookup_association() -> sctp_addrs_lookup_transport() sctp_diag_dump_one(): sctp_transport_lookup_process() -> sctp_addrs_lookup_transport() Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/netns/sctp.h | 4 ++++ include/net/sctp/sctp.h | 6 ++++-- include/net/sctp/structs.h | 8 +++++--- 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index a681147aecd8..7eff3d981b89 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -175,6 +175,10 @@ struct netns_sctp { /* Threshold for autoclose timeout, in seconds. */ unsigned long max_autoclose; + +#ifdef CONFIG_NET_L3_MASTER_DEV + int l3mdev_accept; +#endif }; #endif /* __NETNS_SCTP_H__ */ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 01d904b34cf0..c335dd01a597 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -109,7 +109,7 @@ struct sctp_transport *sctp_transport_get_idx(struct net *net, struct rhashtable_iter *iter, int pos); int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr, void *p); + const union sctp_addr *paddr, void *p, int dif); int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done, struct net *net, int *pos, void *p); int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p); @@ -157,10 +157,12 @@ void sctp_unhash_transport(struct sctp_transport *t); struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr); + const union sctp_addr *paddr, + int dif, int sdif); struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr); +bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif); /* * sctp/proc.c diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 7b4884c63b26..afa3781e3ca2 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1379,10 +1379,12 @@ struct sctp_association *sctp_endpoint_lookup_assoc( struct sctp_transport **); bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr); -struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *, - struct net *, const union sctp_addr *); +struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, + struct net *net, + const union sctp_addr *laddr, + int dif, int sdif); bool sctp_has_association(struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr); + const union sctp_addr *paddr, int dif, int sdif); int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, -- cgit v1.2.3 From ab0377803dafc58f1e22296708c1c28e309414d6 Mon Sep 17 00:00:00 2001 From: Schspa Shi Date: Wed, 16 Nov 2022 19:45:11 +0800 Subject: mrp: introduce active flags to prevent UAF when applicant uninit The caller of del_timer_sync must prevent restarting of the timer, If we have no this synchronization, there is a small probability that the cancellation will not be successful. And syzbot report the fellowing crash: ================================================================== BUG: KASAN: use-after-free in hlist_add_head include/linux/list.h:929 [inline] BUG: KASAN: use-after-free in enqueue_timer+0x18/0xa4 kernel/time/timer.c:605 Write at addr f9ff000024df6058 by task syz-fuzzer/2256 Pointer tag: [f9], memory tag: [fe] CPU: 1 PID: 2256 Comm: syz-fuzzer Not tainted 6.1.0-rc5-syzkaller-00008- ge01d50cbd6ee #0 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace.part.0+0xe0/0xf0 arch/arm64/kernel/stacktrace.c:156 dump_backtrace arch/arm64/kernel/stacktrace.c:162 [inline] show_stack+0x18/0x40 arch/arm64/kernel/stacktrace.c:163 __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x68/0x84 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:284 [inline] print_report+0x1a8/0x4a0 mm/kasan/report.c:395 kasan_report+0x94/0xb4 mm/kasan/report.c:495 __do_kernel_fault+0x164/0x1e0 arch/arm64/mm/fault.c:320 do_bad_area arch/arm64/mm/fault.c:473 [inline] do_tag_check_fault+0x78/0x8c arch/arm64/mm/fault.c:749 do_mem_abort+0x44/0x94 arch/arm64/mm/fault.c:825 el1_abort+0x40/0x60 arch/arm64/kernel/entry-common.c:367 el1h_64_sync_handler+0xd8/0xe4 arch/arm64/kernel/entry-common.c:427 el1h_64_sync+0x64/0x68 arch/arm64/kernel/entry.S:576 hlist_add_head include/linux/list.h:929 [inline] enqueue_timer+0x18/0xa4 kernel/time/timer.c:605 mod_timer+0x14/0x20 kernel/time/timer.c:1161 mrp_periodic_timer_arm net/802/mrp.c:614 [inline] mrp_periodic_timer+0xa0/0xc0 net/802/mrp.c:627 call_timer_fn.constprop.0+0x24/0x80 kernel/time/timer.c:1474 expire_timers+0x98/0xc4 kernel/time/timer.c:1519 To fix it, we can introduce a new active flags to make sure the timer will not restart. Reported-by: syzbot+6fd64001c20aa99e34a4@syzkaller.appspotmail.com Signed-off-by: Schspa Shi Signed-off-by: David S. Miller --- include/net/mrp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/mrp.h b/include/net/mrp.h index 92cd3fb6cf9d..b28915ffea28 100644 --- a/include/net/mrp.h +++ b/include/net/mrp.h @@ -124,6 +124,7 @@ struct mrp_applicant { struct sk_buff *pdu; struct rb_root mad; struct rcu_head rcu; + bool active; }; struct mrp_port { -- cgit v1.2.3 From c73a72f4cbb47672c8cc7f7d7aba52f1cb15baca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Nov 2022 19:39:03 -0800 Subject: netlink: remove the flex array from struct nlmsghdr I've added a flex array to struct nlmsghdr in commit 738136a0e375 ("netlink: split up copies in the ack construction") to allow accessing the data easily. It leads to warnings with clang, if user space wraps this structure into another struct and the flex array is not at the end of the container. Reviewed-by: Kees Cook Reviewed-by: David Ahern Link: https://lore.kernel.org/all/20221114023927.GA685@u2004-local/ Link: https://lore.kernel.org/r/20221118033903.1651026-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/netlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 5da0da59bf01..e2ae82e3f9f7 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -48,7 +48,6 @@ struct sockaddr_nl { * @nlmsg_flags: Additional flags * @nlmsg_seq: Sequence number * @nlmsg_pid: Sending process port ID - * @nlmsg_data: Message payload */ struct nlmsghdr { __u32 nlmsg_len; @@ -56,7 +55,6 @@ struct nlmsghdr { __u16 nlmsg_flags; __u32 nlmsg_seq; __u32 nlmsg_pid; - __u8 nlmsg_data[]; }; /* Flags values */ -- cgit v1.2.3 From ef66c5475d7fb864c2418d3bdd19dee46324624b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 19 Nov 2022 23:10:01 -0600 Subject: bpf: Allow multiple modifiers in reg_type_str() prefix reg_type_str() in the verifier currently only allows a single register type modifier to be present in the 'prefix' string which is eventually stored in the env type_str_buf. This currently works fine because there are no overlapping type modifiers, but once PTR_TRUSTED is added, that will no longer be the case. This patch updates reg_type_str() to support having multiple modifiers in the prefix string, and updates the size of type_str_buf to be 128 bytes. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20221120051004.3605026-2-void@manifault.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 23f30c685f28..608dde740fef 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -19,7 +19,7 @@ */ #define BPF_MAX_VAR_SIZ (1 << 29) /* size of type_str_buf in bpf_verifier. */ -#define TYPE_STR_BUF_LEN 64 +#define TYPE_STR_BUF_LEN 128 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that -- cgit v1.2.3 From 3f00c52393445ed49aadc1a567aa502c6333b1a1 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 19 Nov 2022 23:10:02 -0600 Subject: bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs Kfuncs currently support specifying the KF_TRUSTED_ARGS flag to signal to the verifier that it should enforce that a BPF program passes it a "safe", trusted pointer. Currently, "safe" means that the pointer is either PTR_TO_CTX, or is refcounted. There may be cases, however, where the kernel passes a BPF program a safe / trusted pointer to an object that the BPF program wishes to use as a kptr, but because the object does not yet have a ref_obj_id from the perspective of the verifier, the program would be unable to pass it to a KF_ACQUIRE | KF_TRUSTED_ARGS kfunc. The solution is to expand the set of pointers that are considered trusted according to KF_TRUSTED_ARGS, so that programs can invoke kfuncs with these pointers without getting rejected by the verifier. There is already a PTR_UNTRUSTED flag that is set in some scenarios, such as when a BPF program reads a kptr directly from a map without performing a bpf_kptr_xchg() call. These pointers of course can and should be rejected by the verifier. Unfortunately, however, PTR_UNTRUSTED does not cover all the cases for safety that need to be addressed to adequately protect kfuncs. Specifically, pointers obtained by a BPF program "walking" a struct are _not_ considered PTR_UNTRUSTED according to BPF. For example, say that we were to add a kfunc called bpf_task_acquire(), with KF_ACQUIRE | KF_TRUSTED_ARGS, to acquire a struct task_struct *. If we only used PTR_UNTRUSTED to signal that a task was unsafe to pass to a kfunc, the verifier would mistakenly allow the following unsafe BPF program to be loaded: SEC("tp_btf/task_newtask") int BPF_PROG(unsafe_acquire_task, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired, *nested; nested = task->last_wakee; /* Would not be rejected by the verifier. */ acquired = bpf_task_acquire(nested); if (!acquired) return 0; bpf_task_release(acquired); return 0; } To address this, this patch defines a new type flag called PTR_TRUSTED which tracks whether a PTR_TO_BTF_ID pointer is safe to pass to a KF_TRUSTED_ARGS kfunc or a BPF helper function. PTR_TRUSTED pointers are passed directly from the kernel as a tracepoint or struct_ops callback argument. Any nested pointer that is obtained from walking a PTR_TRUSTED pointer is no longer PTR_TRUSTED. From the example above, the struct task_struct *task argument is PTR_TRUSTED, but the 'nested' pointer obtained from 'task->last_wakee' is not PTR_TRUSTED. A subsequent patch will add kfuncs for storing a task kfunc as a kptr, and then another patch will add selftests to validate. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20221120051004.3605026-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 30 ++++++++++++++++++++ include/linux/bpf_verifier.h | 7 +++++ include/linux/btf.h | 65 ++++++++++++++++++++++++++++---------------- 3 files changed, 78 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8b32376ce746..c9eafa67f2a2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -543,6 +543,35 @@ enum bpf_type_flag { */ MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), + /* PTR was passed from the kernel in a trusted context, and may be + * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. + * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. + * PTR_UNTRUSTED refers to a kptr that was read directly from a map + * without invoking bpf_kptr_xchg(). What we really need to know is + * whether a pointer is safe to pass to a kfunc or BPF helper function. + * While PTR_UNTRUSTED pointers are unsafe to pass to kfuncs and BPF + * helpers, they do not cover all possible instances of unsafe + * pointers. For example, a pointer that was obtained from walking a + * struct will _not_ get the PTR_UNTRUSTED type modifier, despite the + * fact that it may be NULL, invalid, etc. This is due to backwards + * compatibility requirements, as this was the behavior that was first + * introduced when kptrs were added. The behavior is now considered + * deprecated, and PTR_UNTRUSTED will eventually be removed. + * + * PTR_TRUSTED, on the other hand, is a pointer that the kernel + * guarantees to be valid and safe to pass to kfuncs and BPF helpers. + * For example, pointers passed to tracepoint arguments are considered + * PTR_TRUSTED, as are pointers that are passed to struct_ops + * callbacks. As alluded to above, pointers that are obtained from + * walking PTR_TRUSTED pointers are _not_ trusted. For example, if a + * struct task_struct *task is PTR_TRUSTED, then accessing + * task->last_wakee will lose the PTR_TRUSTED modifier when it's stored + * in a BPF register. Similarly, pointers passed to certain programs + * types such as kretprobes are not guaranteed to be valid, as they may + * for example contain an object that was recently freed. + */ + PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -636,6 +665,7 @@ enum bpf_return_type { RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, + RET_PTR_TO_BTF_ID_TRUSTED = PTR_TRUSTED | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 608dde740fef..545152ac136c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -680,4 +680,11 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) + +static inline bool bpf_type_has_unsafe_modifiers(u32 type) +{ + return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; +} + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/btf.h b/include/linux/btf.h index d5b26380a60f..d38aa4251c28 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -19,36 +19,53 @@ #define KF_RELEASE (1 << 1) /* kfunc is a release function */ #define KF_RET_NULL (1 << 2) /* kfunc returns a pointer that may be NULL */ #define KF_KPTR_GET (1 << 3) /* kfunc returns reference to a kptr */ -/* Trusted arguments are those which are meant to be referenced arguments with - * unchanged offset. It is used to enforce that pointers obtained from acquire - * kfuncs remain unmodified when being passed to helpers taking trusted args. +/* Trusted arguments are those which are guaranteed to be valid when passed to + * the kfunc. It is used to enforce that pointers obtained from either acquire + * kfuncs, or from the main kernel on a tracepoint or struct_ops callback + * invocation, remain unmodified when being passed to helpers taking trusted + * args. * - * Consider - * struct foo { - * int data; - * struct foo *next; - * }; + * Consider, for example, the following new task tracepoint: * - * struct bar { - * int data; - * struct foo f; - * }; + * SEC("tp_btf/task_newtask") + * int BPF_PROG(new_task_tp, struct task_struct *task, u64 clone_flags) + * { + * ... + * } * - * struct foo *f = alloc_foo(); // Acquire kfunc - * struct bar *b = alloc_bar(); // Acquire kfunc + * And the following kfunc: * - * If a kfunc set_foo_data() wants to operate only on the allocated object, it - * will set the KF_TRUSTED_ARGS flag, which will prevent unsafe usage like: + * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) * - * set_foo_data(f, 42); // Allowed - * set_foo_data(f->next, 42); // Rejected, non-referenced pointer - * set_foo_data(&f->next, 42);// Rejected, referenced, but wrong type - * set_foo_data(&b->f, 42); // Rejected, referenced, but bad offset + * All invocations to the kfunc must pass the unmodified, unwalked task: * - * In the final case, usually for the purposes of type matching, it is deduced - * by looking at the type of the member at the offset, but due to the - * requirement of trusted argument, this deduction will be strict and not done - * for this case. + * bpf_task_acquire(task); // Allowed + * bpf_task_acquire(task->last_wakee); // Rejected, walked task + * + * Programs may also pass referenced tasks directly to the kfunc: + * + * struct task_struct *acquired; + * + * acquired = bpf_task_acquire(task); // Allowed, same as above + * bpf_task_acquire(acquired); // Allowed + * bpf_task_acquire(task); // Allowed + * bpf_task_acquire(acquired->last_wakee); // Rejected, walked task + * + * Programs may _not_, however, pass a task from an arbitrary fentry/fexit, or + * kprobe/kretprobe to the kfunc, as BPF cannot guarantee that all of these + * pointers are guaranteed to be safe. For example, the following BPF program + * would be rejected: + * + * SEC("kretprobe/free_task") + * int BPF_PROG(free_task_probe, struct task_struct *tsk) + * { + * struct task_struct *acquired; + * + * acquired = bpf_task_acquire(acquired); // Rejected, not a trusted pointer + * bpf_task_release(acquired); + * + * return 0; + * } */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ -- cgit v1.2.3 From fd264ca020948a743e4c36731dfdecc4a812153c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 20 Nov 2022 11:54:32 -0800 Subject: bpf: Add a kfunc to type cast from bpf uapi ctx to kernel ctx Implement bpf_cast_to_kern_ctx() kfunc which does a type cast of a uapi ctx object to the corresponding kernel ctx. Previously if users want to access some data available in kctx but not in uapi ctx, bpf_probe_read_kernel() helper is needed. The introduction of bpf_cast_to_kern_ctx() allows direct memory access which makes code simpler and easier to understand. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221120195432.3113982-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index d38aa4251c28..9ed00077db6e 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -487,6 +487,7 @@ const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg); +int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type); bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2); #else @@ -531,6 +532,10 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, { return NULL; } +static inline int get_kern_ctx_btf_id(struct bpf_verifier_log *log, + enum bpf_prog_type prog_type) { + return -EINVAL; +} static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2) { -- cgit v1.2.3 From a3400e8746b626531099e4d9fd8eac41be066683 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Nov 2022 10:46:08 -0800 Subject: mptcp: more detailed error reporting on endpoint creation Endpoint creation can fail for a number of reasons; in case of failure append the error number to the extended ack message, using a newly introduced generic helper. Additionally let mptcp_pm_nl_append_new_local_addr() report different error reasons. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/genetlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index d21210709f84..ed4622dd4828 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -125,6 +125,9 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net) #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) +#define GENL_SET_ERR_MSG_FMT(info, msg, args...) \ + NL_SET_ERR_MSG_FMT((info)->extack, msg, ##args) + /* Report that a root attribute is missing */ #define GENL_REQ_ATTR_CHECK(info, attr) ({ \ struct genl_info *__info = (info); \ -- cgit v1.2.3 From 33d5eeb9a684a2e962ca0be3fe557a2c9f48d8e2 Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Sat, 19 Nov 2022 15:14:04 -0800 Subject: net: mscc: ocelot: remove redundant stats_layout pointers Ever since commit 4d1d157fb6a4 ("net: mscc: ocelot: share the common stat definitions between all drivers") the stats_layout entry in ocelot and felix drivers have become redundant. Remove the unnecessary code. Suggested-by: Vladimir Oltean Signed-off-by: Colin Foster Reviewed-by: Vladimir Oltean Signed-off-by: Paolo Abeni --- include/soc/mscc/ocelot.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 967ba30ea636..995b5950afe6 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -967,7 +967,6 @@ struct ocelot { struct regmap *targets[TARGET_MAX]; struct regmap_field *regfields[REGFIELD_MAX]; const u32 *const *map; - const struct ocelot_stat_layout *stats_layout; struct list_head stats_regions; u32 pool_size[OCELOT_SB_NUM][OCELOT_SB_POOL_NUM]; -- cgit v1.2.3 From a3bb8f521fd8703cdb429d3e52f3e6802706c87d Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Sat, 19 Nov 2022 15:14:05 -0800 Subject: net: mscc: ocelot: remove unnecessary exposure of stats structures Since commit 4d1d157fb6a4 ("net: mscc: ocelot: share the common stat definitions between all drivers") there is no longer a need to share the stats structures to the world. Relocate these definitions to inside ocelot_stats.c instead of a global include header. Signed-off-by: Colin Foster Reviewed-by: Vladimir Oltean Signed-off-by: Paolo Abeni --- include/soc/mscc/ocelot.h | 215 ---------------------------------------------- 1 file changed, 215 deletions(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 995b5950afe6..df62be80a193 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -596,221 +596,6 @@ enum ocelot_ptp_pins { TOD_ACC_PIN }; -enum ocelot_stat { - OCELOT_STAT_RX_OCTETS, - OCELOT_STAT_RX_UNICAST, - OCELOT_STAT_RX_MULTICAST, - OCELOT_STAT_RX_BROADCAST, - OCELOT_STAT_RX_SHORTS, - OCELOT_STAT_RX_FRAGMENTS, - OCELOT_STAT_RX_JABBERS, - OCELOT_STAT_RX_CRC_ALIGN_ERRS, - OCELOT_STAT_RX_SYM_ERRS, - OCELOT_STAT_RX_64, - OCELOT_STAT_RX_65_127, - OCELOT_STAT_RX_128_255, - OCELOT_STAT_RX_256_511, - OCELOT_STAT_RX_512_1023, - OCELOT_STAT_RX_1024_1526, - OCELOT_STAT_RX_1527_MAX, - OCELOT_STAT_RX_PAUSE, - OCELOT_STAT_RX_CONTROL, - OCELOT_STAT_RX_LONGS, - OCELOT_STAT_RX_CLASSIFIED_DROPS, - OCELOT_STAT_RX_RED_PRIO_0, - OCELOT_STAT_RX_RED_PRIO_1, - OCELOT_STAT_RX_RED_PRIO_2, - OCELOT_STAT_RX_RED_PRIO_3, - OCELOT_STAT_RX_RED_PRIO_4, - OCELOT_STAT_RX_RED_PRIO_5, - OCELOT_STAT_RX_RED_PRIO_6, - OCELOT_STAT_RX_RED_PRIO_7, - OCELOT_STAT_RX_YELLOW_PRIO_0, - OCELOT_STAT_RX_YELLOW_PRIO_1, - OCELOT_STAT_RX_YELLOW_PRIO_2, - OCELOT_STAT_RX_YELLOW_PRIO_3, - OCELOT_STAT_RX_YELLOW_PRIO_4, - OCELOT_STAT_RX_YELLOW_PRIO_5, - OCELOT_STAT_RX_YELLOW_PRIO_6, - OCELOT_STAT_RX_YELLOW_PRIO_7, - OCELOT_STAT_RX_GREEN_PRIO_0, - OCELOT_STAT_RX_GREEN_PRIO_1, - OCELOT_STAT_RX_GREEN_PRIO_2, - OCELOT_STAT_RX_GREEN_PRIO_3, - OCELOT_STAT_RX_GREEN_PRIO_4, - OCELOT_STAT_RX_GREEN_PRIO_5, - OCELOT_STAT_RX_GREEN_PRIO_6, - OCELOT_STAT_RX_GREEN_PRIO_7, - OCELOT_STAT_TX_OCTETS, - OCELOT_STAT_TX_UNICAST, - OCELOT_STAT_TX_MULTICAST, - OCELOT_STAT_TX_BROADCAST, - OCELOT_STAT_TX_COLLISION, - OCELOT_STAT_TX_DROPS, - OCELOT_STAT_TX_PAUSE, - OCELOT_STAT_TX_64, - OCELOT_STAT_TX_65_127, - OCELOT_STAT_TX_128_255, - OCELOT_STAT_TX_256_511, - OCELOT_STAT_TX_512_1023, - OCELOT_STAT_TX_1024_1526, - OCELOT_STAT_TX_1527_MAX, - OCELOT_STAT_TX_YELLOW_PRIO_0, - OCELOT_STAT_TX_YELLOW_PRIO_1, - OCELOT_STAT_TX_YELLOW_PRIO_2, - OCELOT_STAT_TX_YELLOW_PRIO_3, - OCELOT_STAT_TX_YELLOW_PRIO_4, - OCELOT_STAT_TX_YELLOW_PRIO_5, - OCELOT_STAT_TX_YELLOW_PRIO_6, - OCELOT_STAT_TX_YELLOW_PRIO_7, - OCELOT_STAT_TX_GREEN_PRIO_0, - OCELOT_STAT_TX_GREEN_PRIO_1, - OCELOT_STAT_TX_GREEN_PRIO_2, - OCELOT_STAT_TX_GREEN_PRIO_3, - OCELOT_STAT_TX_GREEN_PRIO_4, - OCELOT_STAT_TX_GREEN_PRIO_5, - OCELOT_STAT_TX_GREEN_PRIO_6, - OCELOT_STAT_TX_GREEN_PRIO_7, - OCELOT_STAT_TX_AGED, - OCELOT_STAT_DROP_LOCAL, - OCELOT_STAT_DROP_TAIL, - OCELOT_STAT_DROP_YELLOW_PRIO_0, - OCELOT_STAT_DROP_YELLOW_PRIO_1, - OCELOT_STAT_DROP_YELLOW_PRIO_2, - OCELOT_STAT_DROP_YELLOW_PRIO_3, - OCELOT_STAT_DROP_YELLOW_PRIO_4, - OCELOT_STAT_DROP_YELLOW_PRIO_5, - OCELOT_STAT_DROP_YELLOW_PRIO_6, - OCELOT_STAT_DROP_YELLOW_PRIO_7, - OCELOT_STAT_DROP_GREEN_PRIO_0, - OCELOT_STAT_DROP_GREEN_PRIO_1, - OCELOT_STAT_DROP_GREEN_PRIO_2, - OCELOT_STAT_DROP_GREEN_PRIO_3, - OCELOT_STAT_DROP_GREEN_PRIO_4, - OCELOT_STAT_DROP_GREEN_PRIO_5, - OCELOT_STAT_DROP_GREEN_PRIO_6, - OCELOT_STAT_DROP_GREEN_PRIO_7, - OCELOT_NUM_STATS, -}; - -struct ocelot_stat_layout { - u32 reg; - char name[ETH_GSTRING_LEN]; -}; - -/* 32-bit counter checked for wraparound by ocelot_port_update_stats() - * and copied to ocelot->stats. - */ -#define OCELOT_STAT(kind) \ - [OCELOT_STAT_ ## kind] = { .reg = SYS_COUNT_ ## kind } -/* Same as above, except also exported to ethtool -S. Standard counters should - * only be exposed to more specific interfaces rather than by their string name. - */ -#define OCELOT_STAT_ETHTOOL(kind, ethtool_name) \ - [OCELOT_STAT_ ## kind] = { .reg = SYS_COUNT_ ## kind, .name = ethtool_name } - -#define OCELOT_COMMON_STATS \ - OCELOT_STAT_ETHTOOL(RX_OCTETS, "rx_octets"), \ - OCELOT_STAT_ETHTOOL(RX_UNICAST, "rx_unicast"), \ - OCELOT_STAT_ETHTOOL(RX_MULTICAST, "rx_multicast"), \ - OCELOT_STAT_ETHTOOL(RX_BROADCAST, "rx_broadcast"), \ - OCELOT_STAT_ETHTOOL(RX_SHORTS, "rx_shorts"), \ - OCELOT_STAT_ETHTOOL(RX_FRAGMENTS, "rx_fragments"), \ - OCELOT_STAT_ETHTOOL(RX_JABBERS, "rx_jabbers"), \ - OCELOT_STAT_ETHTOOL(RX_CRC_ALIGN_ERRS, "rx_crc_align_errs"), \ - OCELOT_STAT_ETHTOOL(RX_SYM_ERRS, "rx_sym_errs"), \ - OCELOT_STAT_ETHTOOL(RX_64, "rx_frames_below_65_octets"), \ - OCELOT_STAT_ETHTOOL(RX_65_127, "rx_frames_65_to_127_octets"), \ - OCELOT_STAT_ETHTOOL(RX_128_255, "rx_frames_128_to_255_octets"), \ - OCELOT_STAT_ETHTOOL(RX_256_511, "rx_frames_256_to_511_octets"), \ - OCELOT_STAT_ETHTOOL(RX_512_1023, "rx_frames_512_to_1023_octets"), \ - OCELOT_STAT_ETHTOOL(RX_1024_1526, "rx_frames_1024_to_1526_octets"), \ - OCELOT_STAT_ETHTOOL(RX_1527_MAX, "rx_frames_over_1526_octets"), \ - OCELOT_STAT_ETHTOOL(RX_PAUSE, "rx_pause"), \ - OCELOT_STAT_ETHTOOL(RX_CONTROL, "rx_control"), \ - OCELOT_STAT_ETHTOOL(RX_LONGS, "rx_longs"), \ - OCELOT_STAT_ETHTOOL(RX_CLASSIFIED_DROPS, "rx_classified_drops"), \ - OCELOT_STAT_ETHTOOL(RX_RED_PRIO_0, "rx_red_prio_0"), \ - OCELOT_STAT_ETHTOOL(RX_RED_PRIO_1, "rx_red_prio_1"), \ - OCELOT_STAT_ETHTOOL(RX_RED_PRIO_2, "rx_red_prio_2"), \ - OCELOT_STAT_ETHTOOL(RX_RED_PRIO_3, "rx_red_prio_3"), \ - OCELOT_STAT_ETHTOOL(RX_RED_PRIO_4, "rx_red_prio_4"), \ - OCELOT_STAT_ETHTOOL(RX_RED_PRIO_5, "rx_red_prio_5"), \ - OCELOT_STAT_ETHTOOL(RX_RED_PRIO_6, "rx_red_prio_6"), \ - OCELOT_STAT_ETHTOOL(RX_RED_PRIO_7, "rx_red_prio_7"), \ - OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_0, "rx_yellow_prio_0"), \ - OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_1, "rx_yellow_prio_1"), \ - OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_2, "rx_yellow_prio_2"), \ - OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_3, "rx_yellow_prio_3"), \ - OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_4, "rx_yellow_prio_4"), \ - OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_5, "rx_yellow_prio_5"), \ - OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_6, "rx_yellow_prio_6"), \ - OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_7, "rx_yellow_prio_7"), \ - OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_0, "rx_green_prio_0"), \ - OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_1, "rx_green_prio_1"), \ - OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_2, "rx_green_prio_2"), \ - OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_3, "rx_green_prio_3"), \ - OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_4, "rx_green_prio_4"), \ - OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_5, "rx_green_prio_5"), \ - OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_6, "rx_green_prio_6"), \ - OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_7, "rx_green_prio_7"), \ - OCELOT_STAT_ETHTOOL(TX_OCTETS, "tx_octets"), \ - OCELOT_STAT_ETHTOOL(TX_UNICAST, "tx_unicast"), \ - OCELOT_STAT_ETHTOOL(TX_MULTICAST, "tx_multicast"), \ - OCELOT_STAT_ETHTOOL(TX_BROADCAST, "tx_broadcast"), \ - OCELOT_STAT_ETHTOOL(TX_COLLISION, "tx_collision"), \ - OCELOT_STAT_ETHTOOL(TX_DROPS, "tx_drops"), \ - OCELOT_STAT_ETHTOOL(TX_PAUSE, "tx_pause"), \ - OCELOT_STAT_ETHTOOL(TX_64, "tx_frames_below_65_octets"), \ - OCELOT_STAT_ETHTOOL(TX_65_127, "tx_frames_65_to_127_octets"), \ - OCELOT_STAT_ETHTOOL(TX_128_255, "tx_frames_128_255_octets"), \ - OCELOT_STAT_ETHTOOL(TX_256_511, "tx_frames_256_511_octets"), \ - OCELOT_STAT_ETHTOOL(TX_512_1023, "tx_frames_512_1023_octets"), \ - OCELOT_STAT_ETHTOOL(TX_1024_1526, "tx_frames_1024_1526_octets"), \ - OCELOT_STAT_ETHTOOL(TX_1527_MAX, "tx_frames_over_1526_octets"), \ - OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_0, "tx_yellow_prio_0"), \ - OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_1, "tx_yellow_prio_1"), \ - OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_2, "tx_yellow_prio_2"), \ - OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_3, "tx_yellow_prio_3"), \ - OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_4, "tx_yellow_prio_4"), \ - OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_5, "tx_yellow_prio_5"), \ - OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_6, "tx_yellow_prio_6"), \ - OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_7, "tx_yellow_prio_7"), \ - OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_0, "tx_green_prio_0"), \ - OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_1, "tx_green_prio_1"), \ - OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_2, "tx_green_prio_2"), \ - OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_3, "tx_green_prio_3"), \ - OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_4, "tx_green_prio_4"), \ - OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_5, "tx_green_prio_5"), \ - OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_6, "tx_green_prio_6"), \ - OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_7, "tx_green_prio_7"), \ - OCELOT_STAT_ETHTOOL(TX_AGED, "tx_aged"), \ - OCELOT_STAT_ETHTOOL(DROP_LOCAL, "drop_local"), \ - OCELOT_STAT_ETHTOOL(DROP_TAIL, "drop_tail"), \ - OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_0, "drop_yellow_prio_0"), \ - OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_1, "drop_yellow_prio_1"), \ - OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_2, "drop_yellow_prio_2"), \ - OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_3, "drop_yellow_prio_3"), \ - OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_4, "drop_yellow_prio_4"), \ - OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_5, "drop_yellow_prio_5"), \ - OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_6, "drop_yellow_prio_6"), \ - OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_7, "drop_yellow_prio_7"), \ - OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_0, "drop_green_prio_0"), \ - OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_1, "drop_green_prio_1"), \ - OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_2, "drop_green_prio_2"), \ - OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_3, "drop_green_prio_3"), \ - OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_4, "drop_green_prio_4"), \ - OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_5, "drop_green_prio_5"), \ - OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_6, "drop_green_prio_6"), \ - OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_7, "drop_green_prio_7") - -struct ocelot_stats_region { - struct list_head node; - u32 base; - int count; - u32 *buf; -}; - enum ocelot_tag_prefix { OCELOT_TAG_PREFIX_DISABLED = 0, OCELOT_TAG_PREFIX_NONE, -- cgit v1.2.3 From c5fb8ead3283955dc68671f853017b181f96fdc1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:39 +0200 Subject: net: dsa: unexport dsa_dev_to_net_device() dsa.o and dsa2.o are linked into the same dsa_core.o, there is no reason to export this symbol when its only caller is local. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 82da44561f4c..d5bfcb63d4c2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1285,8 +1285,6 @@ struct dsa_switch_driver { const struct dsa_switch_ops *ops; }; -struct net_device *dsa_dev_to_net_device(struct device *dev); - bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); -- cgit v1.2.3 From 19d05ea712ecbbb67d302664da5ec58b37b9aece Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:54 +0200 Subject: net: dsa: move tag_8021q headers to their proper place tag_8021q definitions are all over the place. Some are exported to linux/dsa/8021q.h (visible by DSA core, taggers, switch drivers and everyone else), and some are in dsa_priv.h. Move the structures that don't need external visibility into tag_8021q.c, and the ones which don't need the world or switch drivers to see them into tag_8021q.h. We also have the tag_8021q.h inclusion from switch.c, which is basically the entire reason why tag_8021q.c was built into DSA in commit 8b6e638b4be2 ("net: dsa: build tag_8021q.c as part of DSA core"). I still don't know how to better deal with that, so leave it alone. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 31 +------------------------------ include/net/dsa.h | 1 + 2 files changed, 2 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 3ed117e299ec..f3664ee12170 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -5,28 +5,8 @@ #ifndef _NET_DSA_8021Q_H #define _NET_DSA_8021Q_H -#include -#include #include - -struct dsa_switch; -struct dsa_port; -struct sk_buff; -struct net_device; - -struct dsa_tag_8021q_vlan { - struct list_head list; - int port; - u16 vid; - refcount_t refcount; -}; - -struct dsa_8021q_context { - struct dsa_switch *ds; - struct list_head vlans; - /* EtherType of RX VID, used for filtering on master interface */ - __be16 proto; -}; +#include int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); @@ -38,15 +18,6 @@ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge); -struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, - u16 tpid, u16 tci); - -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, - int *vbid); - -struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, - int vbid); - u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num); u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp); diff --git a/include/net/dsa.h b/include/net/dsa.h index d5bfcb63d4c2..96086289aa9b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -22,6 +22,7 @@ #include #include +struct dsa_8021q_context; struct tc_action; struct phy_device; struct fixed_phy_status; -- cgit v1.2.3 From beb3d47d1d3d7185bb401af628ad32ee204a9526 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 07:57:59 -0800 Subject: bpf: Fix a BTF_ID_LIST bug with CONFIG_DEBUG_INFO_BTF not set With CONFIG_DEBUG_INFO_BTF not set, we hit the following compilation error, /.../kernel/bpf/verifier.c:8196:23: error: array index 6 is past the end of the array (that has type 'u32[5]' (aka 'unsigned int[5]')) [-Werror,-Warray-bounds] if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) ^ ~~~~~~~~~~~~~~~~~~~~~~~ /.../kernel/bpf/verifier.c:8174:1: note: array 'special_kfunc_list' declared here BTF_ID_LIST(special_kfunc_list) ^ /.../include/linux/btf_ids.h:207:27: note: expanded from macro 'BTF_ID_LIST' #define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; ^ /.../kernel/bpf/verifier.c:8443:19: error: array index 5 is past the end of the array (that has type 'u32[5]' (aka 'unsigned int[5]')) [-Werror,-Warray-bounds] btf_id == special_kfunc_list[KF_bpf_list_pop_back]; ^ ~~~~~~~~~~~~~~~~~~~~ /.../kernel/bpf/verifier.c:8174:1: note: array 'special_kfunc_list' declared here BTF_ID_LIST(special_kfunc_list) ^ /.../include/linux/btf_ids.h:207:27: note: expanded from macro 'BTF_ID_LIST' #define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; ... Fix the problem by increase the size of BTF_ID_LIST to 16 to avoid compilation error and also prevent potentially unintended issue due to out-of-bound access. Reported-by: kernel test robot Reported-by: Dan Carpenter Reported-by: Nathan Chancellor Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221123155759.2669749-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index c9744efd202f..93397711a68c 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -204,7 +204,7 @@ extern struct btf_id_set8 name; #else -#define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; +#define BTF_ID_LIST(name) static u32 __maybe_unused name[16]; #define BTF_ID(prefix, name) #define BTF_ID_FLAGS(prefix, name, ...) #define BTF_ID_UNUSED -- cgit v1.2.3 From 72b43bde38de4aa05e6a7fa12d7965f48180deb6 Mon Sep 17 00:00:00 2001 From: Ji Rongfeng Date: Fri, 18 Nov 2022 16:18:18 +0800 Subject: bpf: Update bpf_{g,s}etsockopt() documentation * append missing optnames to the end * simplify bpf_getsockopt()'s doc Signed-off-by: Ji Rongfeng Link: https://lore.kernel.org/r/DU0P192MB15479B86200B1216EC90E162D6099@DU0P192MB1547.EURP192.PROD.OUTLOOK.COM Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ab86145df760..f89de51a45db 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2584,14 +2584,19 @@ union bpf_attr { * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, - * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**, + * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**. * * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, + * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, + * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, + * **TCP_BPF_RTO_MIN**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. - * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * * **IPPROTO_IPV6**, which supports the following *optname*\ s: + * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. * Return * 0 on success, or a negative error in case of failure. * @@ -2808,12 +2813,10 @@ union bpf_attr { * and **BPF_CGROUP_INET6_CONNECT**. * * This helper actually implements a subset of **getsockopt()**. - * It supports the following *level*\ s: - * - * * **IPPROTO_TCP**, which supports *optname* - * **TCP_CONGESTION**. - * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. - * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * It supports the same set of *optname*\ s that is supported by + * the **bpf_setsockopt**\ () helper. The exceptions are + * **TCP_BPF_*** is **bpf_setsockopt**\ () only and + * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only. * Return * 0 on success, or a negative error in case of failure. * -- cgit v1.2.3 From 14e5f71e31ff3925cc970fa7907393ee7f4b748d Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 22 Nov 2022 20:09:47 -0800 Subject: net: use %pS for kfree_skb tracing event location For the cases where 'reason' doesn't give any clue, it's still nice to be able to track the kfree_skb caller location. %p doesn't help much so let's use %pS which prints the symbol+offset. Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20221123040947.1015721-1-sdf@google.com Signed-off-by: Paolo Abeni --- include/trace/events/skb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 50a974f7dfb4..25ab1ff9423d 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -42,7 +42,7 @@ TRACE_EVENT(kfree_skb, __entry->reason = reason; ), - TP_printk("skbaddr=%p protocol=%u location=%p reason: %s", + TP_printk("skbaddr=%p protocol=%u location=%pS reason: %s", __entry->skbaddr, __entry->protocol, __entry->location, __print_symbolic(__entry->reason, DEFINE_DROP_REASON(FN, FNe))) -- cgit v1.2.3 From 5a0f663f0189cf9e031e444f97c029717a99548d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 21:32:06 -0800 Subject: compiler_types: Define __rcu as __attribute__((btf_type_tag("rcu"))) Currently, without rcu attribute info in BTF, the verifier treats rcu tagged pointer as a normal pointer. This might be a problem for sleepable program where rcu_read_lock()/unlock() is not available. For example, for a sleepable fentry program, if rcu protected memory access is interleaved with a sleepable helper/kfunc, it is possible the memory access after the sleepable helper/kfunc might be invalid since the object might have been freed then. To prevent such cases, introducing rcu tagging for memory accesses in verifier can help to reject such programs. To enable rcu tagging in BTF, during kernel compilation, define __rcu as attribute btf_type_tag("rcu") so __rcu information can be preserved in dwarf and btf, and later can be used for bpf prog verification. Acked-by: KP Singh Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221124053206.2373141-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/compiler_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index eb0466236661..7c1afe0f4129 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -49,7 +49,8 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # endif # define __iomem # define __percpu BTF_TYPE_TAG(percpu) -# define __rcu +# define __rcu BTF_TYPE_TAG(rcu) + # define __chk_user_ptr(x) (void)0 # define __chk_io_ptr(x) (void)0 /* context/locking */ -- cgit v1.2.3 From 01685c5bddaa6df3d662c8afed5e5289fcc68e5a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 21:32:11 -0800 Subject: bpf: Introduce might_sleep field in bpf_func_proto Introduce bpf_func_proto->might_sleep to indicate a particular helper might sleep. This will make later check whether a helper might be sleepable or not easier. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221124053211.2373553-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c9eafa67f2a2..43fd7eeeeabb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -682,6 +682,7 @@ struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; bool pkt_access; + bool might_sleep; enum bpf_return_type ret_type; union { struct { -- cgit v1.2.3 From 9bb00b2895cbfe0ad410457b605d0a72524168c1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 21:32:17 -0800 Subject: bpf: Add kfunc bpf_rcu_read_lock/unlock() Add two kfunc's bpf_rcu_read_lock() and bpf_rcu_read_unlock(). These two kfunc's can be used for all program types. The following is an example about how rcu pointer are used w.r.t. bpf_rcu_read_lock()/bpf_rcu_read_unlock(). struct task_struct { ... struct task_struct *last_wakee; struct task_struct __rcu *real_parent; ... }; Let us say prog does 'task = bpf_get_current_task_btf()' to get a 'task' pointer. The basic rules are: - 'real_parent = task->real_parent' should be inside bpf_rcu_read_lock region. This is to simulate rcu_dereference() operation. The 'real_parent' is marked as MEM_RCU only if (1). task->real_parent is inside bpf_rcu_read_lock region, and (2). task is a trusted ptr. So MEM_RCU marked ptr can be 'trusted' inside the bpf_rcu_read_lock region. - 'last_wakee = real_parent->last_wakee' should be inside bpf_rcu_read_lock region since it tries to access rcu protected memory. - the ptr 'last_wakee' will be marked as PTR_UNTRUSTED since in general it is not clear whether the object pointed by 'last_wakee' is valid or not even inside bpf_rcu_read_lock region. The verifier will reset all rcu pointer register states to untrusted at bpf_rcu_read_unlock() kfunc call site, so any such rcu pointer won't be trusted any more outside the bpf_rcu_read_lock() region. The current implementation does not support nested rcu read lock region in the prog. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221124053217.2373910-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/linux/bpf_verifier.h | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 43fd7eeeeabb..c6aa6912ea16 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -572,6 +572,9 @@ enum bpf_type_flag { */ PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), + /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */ + MEM_RCU = BIT(13 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 545152ac136c..c05aa6e1f6f5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -344,6 +344,7 @@ struct bpf_verifier_state { u32 id; } active_lock; bool speculative; + bool active_rcu_lock; /* first and last insn idx of this verifier state */ u32 first_insn_idx; @@ -445,6 +446,7 @@ struct bpf_insn_aux_data { u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ + bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ u8 alu_state; /* used in combination with alu_limit */ /* below fields are initialized once */ @@ -534,6 +536,7 @@ struct bpf_verifier_env { bool bypass_spec_v1; bool bypass_spec_v4; bool seen_direct_write; + bool rcu_tag_supported; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; @@ -680,7 +683,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } -#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | MEM_RCU | PTR_TRUSTED) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { -- cgit v1.2.3 From bd12240337f43522b99c43f8976af34c712b5f57 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 24 Nov 2022 15:43:42 +0100 Subject: xfrm: add extack to xfrm_do_migrate Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index dbc81f5eb553..576566bd0be9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1703,7 +1703,8 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap, u32 if_id); + struct xfrm_encap_tmpl *encap, u32 if_id, + struct netlink_ext_ack *extack); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); -- cgit v1.2.3 From c2dad11e0466a27d40041845cf63cdfb4fbd991f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 24 Nov 2022 15:43:43 +0100 Subject: xfrm: add extack to xfrm_alloc_userspi Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 576566bd0be9..e0cc6791c001 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1681,8 +1681,9 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); -int verify_spi_info(u8 proto, u32 min, u32 max); -int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); +int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack); +int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, + struct netlink_ext_ack *extack); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, -- cgit v1.2.3 From a66d79ee0bd5140a64b72cde588f8c83a55a1eb9 Mon Sep 17 00:00:00 2001 From: Sujuan Chen Date: Thu, 24 Nov 2022 11:18:14 +0800 Subject: net: ethernet: mtk_wed: add wcid overwritten support for wed v1 All wed versions should enable the wcid overwritten feature, since the wcid size is controlled by the wlan driver. Tested-by: Sujuan Chen Co-developed-by: Bo Jiao Signed-off-by: Bo Jiao Signed-off-by: Sujuan Chen Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 8294978f4bca..1b1ef57609f7 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -85,6 +85,9 @@ struct mtk_wed_device { int irq; u8 version; + /* used by wlan driver */ + u32 rev_id; + struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; struct mtk_wed_ring rx_ring[MTK_WED_RX_QUEUES]; struct mtk_wed_ring txfree_ring; -- cgit v1.2.3 From 4c47867bc789bdc722f3bb760355c2c246fbe9af Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 24 Nov 2022 12:15:54 +0100 Subject: of: net: export of_get_mac_address_nvmem() Export of_get_mac_addr_nvmem() and rename it to of_get_mac_address_nvmem() in order to fit the convention followed by the existing exported helpers of the same kind. This way, OF compatible drivers using eg. fwnode_get_mac_address() can do a direct call to it instead of calling of_get_mac_address() just for the nvmem step, avoiding to repeat an expensive DT lookup which has already been done once. Eventually, fwnode_get_mac_address() should probably be updated to perform the nvmem lookup directly, but as of today, nvmem cells seem not to be supported by ACPI yet which would defeat this kind of extension. Suggested-by: Marcin Wojtas Signed-off-by: Miquel Raynal Signed-off-by: Paolo Abeni --- include/linux/of_net.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index 0484b613ca64..d88715a0b3a5 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -14,6 +14,7 @@ struct net_device; extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface); extern int of_get_mac_address(struct device_node *np, u8 *mac); +extern int of_get_mac_address_nvmem(struct device_node *np, u8 *mac); int of_get_ethdev_address(struct device_node *np, struct net_device *dev); extern struct net_device *of_find_net_device_by_node(struct device_node *np); #else @@ -28,6 +29,11 @@ static inline int of_get_mac_address(struct device_node *np, u8 *mac) return -ENODEV; } +static inline int of_get_mac_address_nvmem(struct device_node *np, u8 *mac) +{ + return -ENODEV; +} + static inline int of_get_ethdev_address(struct device_node *np, struct net_device *dev) { return -ENODEV; -- cgit v1.2.3 From f78cd9c783e09a0fe454b0fc8b39c22025d7869e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 24 Nov 2022 16:22:53 +0100 Subject: net: ethernet: mtk_wed: update mtk_wed_stop Update mtk_wed_stop routine and rename old mtk_wed_stop() to mtk_wed_deinit(). This is a preliminary patch to add Wireless Ethernet Dispatcher reset support. Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- include/linux/soc/mediatek/mtk_wed.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 1b1ef57609f7..c43510e541df 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -234,6 +234,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \ (_dev)->ops->msg_update(_dev, _id, _msg, _len) +#define mtk_wed_device_stop(_dev) (_dev)->ops->stop(_dev) +#define mtk_wed_device_dma_reset(_dev) (_dev)->ops->reset_dma(_dev) #else static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) { @@ -250,6 +252,8 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) do {} while (0) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV +#define mtk_wed_device_stop(_dev) do {} while (0) +#define mtk_wed_device_dma_reset(_dev) do {} while (0) #endif #endif -- cgit v1.2.3 From 23dca7a90017ff2512c501f7da4c7ca7a95c2d6e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 24 Nov 2022 16:22:55 +0100 Subject: net: ethernet: mtk_wed: add reset to tx_ring_setup callback Introduce reset parameter to mtk_wed_tx_ring_setup signature. This is a preliminary patch to add Wireless Ethernet Dispatcher reset support. Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- include/linux/soc/mediatek/mtk_wed.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index c43510e541df..beb190449704 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -158,7 +158,7 @@ struct mtk_wed_device { struct mtk_wed_ops { int (*attach)(struct mtk_wed_device *dev); int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, - void __iomem *regs); + void __iomem *regs, bool reset); int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs); int (*txfree_ring_setup)(struct mtk_wed_device *dev, @@ -216,8 +216,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) #define mtk_wed_device_active(_dev) !!(_dev)->ops #define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev) #define mtk_wed_device_start(_dev, _mask) (_dev)->ops->start(_dev, _mask) -#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) \ - (_dev)->ops->tx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs, _reset) \ + (_dev)->ops->tx_ring_setup(_dev, _ring, _regs, _reset) #define mtk_wed_device_txfree_ring_setup(_dev, _regs) \ (_dev)->ops->txfree_ring_setup(_dev, _regs) #define mtk_wed_device_reg_read(_dev, _reg) \ @@ -243,7 +243,7 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) } #define mtk_wed_device_detach(_dev) do {} while (0) #define mtk_wed_device_start(_dev, _mask) do {} while (0) -#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs, _reset) -ENODEV #define mtk_wed_device_txfree_ring_setup(_dev, _ring, _regs) -ENODEV #define mtk_wed_device_reg_read(_dev, _reg) 0 #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) -- cgit v1.2.3 From 51147284eb7d685a689a5d1b7772faec278a2338 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 29 Nov 2022 14:55:34 +0100 Subject: ieee802154: Advertize coordinators discovery Let's introduce the basics for advertizing discovered PANs and coordinators, which is: - A new "scan" netlink message group. - A couple of netlink command/attribute. - The main netlink helper to send a netlink message with all the necessary information to forward the main information to the user. Two netlink attributes are proactively added to support future UWB complex channels, but are not actually used yet. Co-developed-by: David Girault Signed-off-by: David Girault Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221129135535.532513-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 18 ++++++++++++++++++ include/net/nl802154.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index e1481f9cf049..d09c393d229f 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -260,6 +260,24 @@ struct ieee802154_addr { }; }; +/** + * struct ieee802154_coord_desc - Coordinator descriptor + * @addr: PAN ID and coordinator address + * @page: page this coordinator is using + * @channel: channel this coordinator is using + * @superframe_spec: SuperFrame specification as received + * @link_quality: link quality indicator at which the beacon was received + * @gts_permit: the coordinator accepts GTS requests + */ +struct ieee802154_coord_desc { + struct ieee802154_addr addr; + u8 page; + u8 channel; + u16 superframe_spec; + u8 link_quality; + bool gts_permit; +}; + struct ieee802154_llsec_key_id { u8 mode; u8 id; diff --git a/include/net/nl802154.h b/include/net/nl802154.h index f5850b569c52..b79a89d5207c 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -72,6 +72,8 @@ enum nl802154_commands { NL802154_CMD_NEW_SEC_LEVEL, NL802154_CMD_DEL_SEC_LEVEL, + NL802154_CMD_SCAN_EVENT, + /* add new commands above here */ /* used to define NL802154_CMD_MAX below */ @@ -131,6 +133,8 @@ enum nl802154_attrs { NL802154_ATTR_PID, NL802154_ATTR_NETNS_FD, + NL802154_ATTR_COORDINATOR, + /* add attributes here, update the policy in nl802154.c */ #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL @@ -216,6 +220,45 @@ enum nl802154_wpan_phy_capability_attr { NL802154_CAP_ATTR_MAX = __NL802154_CAP_ATTR_AFTER_LAST - 1 }; +/** + * enum nl802154_coord - Netlink attributes for a coord + * + * @__NL802154_COORD_INVALID: invalid + * @NL802154_COORD_PANID: PANID of the coordinator (2 bytes) + * @NL802154_COORD_ADDR: coordinator address, (8 bytes or 2 bytes) + * @NL802154_COORD_CHANNEL: channel number, related to @NL802154_COORD_PAGE (u8) + * @NL802154_COORD_PAGE: channel page, related to @NL802154_COORD_CHANNEL (u8) + * @NL802154_COORD_PREAMBLE_CODE: Preamble code used when the beacon was received, + * this is PHY dependent and optional (u8) + * @NL802154_COORD_MEAN_PRF: Mean PRF used when the beacon was received, + * this is PHY dependent and optional (u8) + * @NL802154_COORD_SUPERFRAME_SPEC: superframe specification of the PAN (u16) + * @NL802154_COORD_LINK_QUALITY: signal quality of beacon in unspecified units, + * scaled to 0..255 (u8) + * @NL802154_COORD_GTS_PERMIT: set to true if GTS is permitted on this PAN + * @NL802154_COORD_PAYLOAD_DATA: binary data containing the raw data from the + * frame payload, (only if beacon or probe response had data) + * @NL802154_COORD_PAD: attribute used for padding for 64-bit alignment + * @NL802154_COORD_MAX: highest coordinator attribute + */ +enum nl802154_coord { + __NL802154_COORD_INVALID, + NL802154_COORD_PANID, + NL802154_COORD_ADDR, + NL802154_COORD_CHANNEL, + NL802154_COORD_PAGE, + NL802154_COORD_PREAMBLE_CODE, + NL802154_COORD_MEAN_PRF, + NL802154_COORD_SUPERFRAME_SPEC, + NL802154_COORD_LINK_QUALITY, + NL802154_COORD_GTS_PERMIT, + NL802154_COORD_PAYLOAD_DATA, + NL802154_COORD_PAD, + + /* keep last */ + NL802154_COORD_MAX, +}; + /** * enum nl802154_cca_modes - cca modes * -- cgit v1.2.3 From 12eb0f84a601cec8920b56d7ffd4182a6bfd6521 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Thu, 29 Sep 2022 13:45:08 +0200 Subject: net/mlx5: Remove unused ctx variables Remove mlx5_priv.ctx_list and ctx_lock which are no longer used after commit 601c10c89cbb ("net/mlx5: Delete custom device management logic"). Signed-off-by: Petr Pavlu Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 06cbad166225..d476255c9a3f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -606,8 +606,6 @@ struct mlx5_priv { struct list_head pgdir_list; /* end: alloc staff */ - struct list_head ctx_list; - spinlock_t ctx_lock; struct mlx5_adev **adev; int adev_idx; int sw_vhca_id; -- cgit v1.2.3 From b146658f2ed90768b769222ca418617274242b32 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:18:22 +0200 Subject: net/mlx5e: Add padding when needed in UMR WQEs Per the device spec, MTTs/KLMs list in a UMR WQE must be aligned to 64B. Per our SW design, the MTT/KLMs list would need alignment only if it's too small, for example on PPC when PAGE_SIZE is 64KB, and only 4 pages are needed to cover a MPWQE of size 256KB. Padding, if needed, is taken into account when calculating the UMR WQE fields (ds_cnt and xlt_octowords), however no entries are provided, instead garbage is passed. No real harm though, as these parts act as gaps between the RX MPWQEs and not used by any of them. Hence, in practice, device does not try to write any incoming packet to them. Still, prefer providing clean padding marking the end of the list, and do not map garbage into the RQ memory region. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index eb3fac30488b..97275965f156 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -294,6 +294,7 @@ enum { #define MLX5_UMR_MTT_ALIGNMENT 0x40 #define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) #define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT +#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit v1.2.3 From 683d78a0d46235edfd3c50ddbc4af1065b422670 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:30:23 +0200 Subject: net/mlx5: Remove unused UMR MTT definitions Defines MLX5_UMR_MTT_MASK and MLX5_UMR_MTT_MIN_CHUNK_SIZE are not in use. Remove them. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 97275965f156..ecf840e2989b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -292,8 +292,6 @@ enum { #define MLX5_UMR_KLM_ALIGNMENT 4 #define MLX5_UMR_MTT_ALIGNMENT 0x40 -#define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) -#define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT #define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit v1.2.3 From 02648b4b09d506bd4df2e17bf109c229fc728640 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:44:57 +0200 Subject: net/mlx5: Generalize name of UMR alignment definition Per the device spec, MLX5_UMR_MTT_ALIGNMENT is good not only for UMR MTT entries, but for all other entries as well, like KLMs and KSMs. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ecf840e2989b..a02f779f5c5b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -291,8 +291,8 @@ enum { }; #define MLX5_UMR_KLM_ALIGNMENT 4 -#define MLX5_UMR_MTT_ALIGNMENT 0x40 -#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) +#define MLX5_UMR_FLEX_ALIGNMENT 0x40 +#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit v1.2.3 From daab2e9c54a52dea8dbd1af516e6f986eeed2556 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:24:02 +0200 Subject: net/mlx5: Use generic definition for UMR KLM alignment MLX5_UMR_KLM_ALIGNMENT is in units of number of entries, while MLX5_UMR_MTT_ALIGNMENT (generalized and renamed to MLX5_UMR_FLEX_ALIGNMENT) is in byte units. This is misleading and confusing. Replace this KLM definition with one based on the generic definition. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index a02f779f5c5b..5fe5d198b57a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -290,9 +290,9 @@ enum { MLX5_UMR_INLINE = (1 << 7), }; -#define MLX5_UMR_KLM_ALIGNMENT 4 #define MLX5_UMR_FLEX_ALIGNMENT 0x40 #define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt)) +#define MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_klm)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit v1.2.3 From bff3d0534804452e19c097ae6b4eb4b4d846d67f Mon Sep 17 00:00:00 2001 From: Sriram Yagnaraman Date: Fri, 4 Nov 2022 18:18:35 +0100 Subject: netfilter: conntrack: add sctp DATA_SENT state SCTP conntrack currently assumes that the SCTP endpoints will probe secondary paths using HEARTBEAT before sending traffic. But, according to RFC 9260, SCTP endpoints can send any traffic on any of the confirmed paths after SCTP association is up. SCTP endpoints that sends INIT will confirm all peer addresses that upper layer configures, and the SCTP endpoint that receives COOKIE_ECHO will only confirm the address it sent the INIT_ACK to. So, we can have a situation where the INIT sender can start to use secondary paths without the need to send HEARTBEAT. This patch allows DATA/SACK packets to create new connection tracking entry. A new state has been added to indicate that a DATA/SACK chunk has been seen in the original direction - SCTP_CONNTRACK_DATA_SENT. State transitions mostly follows the HEARTBEAT_SENT, except on receiving HEARTBEAT/HEARTBEAT_ACK/DATA/SACK in the reply direction. State transitions in original direction: - DATA_SENT behaves similar to HEARTBEAT_SENT for all chunks, except that it remains in DATA_SENT on receving HEARTBEAT, HEARTBEAT_ACK/DATA/SACK chunks State transitions in reply direction: - DATA_SENT behaves similar to HEARTBEAT_SENT for all chunks, except that it moves to HEARTBEAT_ACKED on receiving HEARTBEAT/HEARTBEAT_ACK/DATA/SACK chunks Note: This patch still doesn't solve the problem when the SCTP endpoint decides to use primary paths for association establishment but uses a secondary path for association shutdown. We still have to depend on timeout for connections to expire in such a case. Signed-off-by: Sriram Yagnaraman Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_sctp.h | 1 + include/uapi/linux/netfilter/nfnetlink_cttimeout.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h index edc6ddab0de6..c742469afe21 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h +++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h @@ -16,6 +16,7 @@ enum sctp_conntrack { SCTP_CONNTRACK_SHUTDOWN_ACK_SENT, SCTP_CONNTRACK_HEARTBEAT_SENT, SCTP_CONNTRACK_HEARTBEAT_ACKED, + SCTP_CONNTRACK_DATA_SENT, SCTP_CONNTRACK_MAX }; diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h index 6b20fb22717b..94e74034706d 100644 --- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h +++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h @@ -95,6 +95,7 @@ enum ctattr_timeout_sctp { CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, CTA_TIMEOUT_SCTP_HEARTBEAT_SENT, CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED, + CTA_TIMEOUT_SCTP_DATA_SENT, __CTA_TIMEOUT_SCTP_MAX }; #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1) -- cgit v1.2.3 From a70e483460d58e64504dd679fd127e9549385c86 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Nov 2022 12:21:58 +0100 Subject: netfilter: conntrack: merge ipv4+ipv6 confirm functions No need to have distinct functions. After merge, ipv6 can avoid protooff computation if the connection neither needs sequence adjustment nor helper invocation -- this is the normal case. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index b2b9de70d9f4..71d1269fe4d4 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -71,8 +71,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) return ret; } -unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo); +unsigned int nf_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *proto); -- cgit v1.2.3 From e9374524950512a1769f610a868fcdf89ea59b8e Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Tue, 22 Nov 2022 20:30:57 +0100 Subject: netfilter: ipset: Add support for new bitmask parameter Add a new parameter to complement the existing 'netmask' option. The main difference between netmask and bitmask is that bitmask takes any arbitrary ip address as input, it does not have to be a valid netmask. The name of the new parameter is 'bitmask'. This lets us mask out arbitrary bits in the ip address, for example: ipset create set1 hash:ip bitmask 255.128.255.0 ipset create set2 hash:ip,port family inet6 bitmask ffff::ff80 Signed-off-by: Vishwanath Pai Signed-off-by: Joshua Hunt Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 10 ++++++++++ include/uapi/linux/netfilter/ipset/ip_set.h | 2 ++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index ada1296c87d5..ab934ad951a8 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -515,6 +515,16 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, *skbinfo = ext->skbinfo; } +static inline void +nf_inet_addr_mask_inplace(union nf_inet_addr *a1, + const union nf_inet_addr *mask) +{ + a1->all[0] &= mask->all[0]; + a1->all[1] &= mask->all[1]; + a1->all[2] &= mask->all[2]; + a1->all[3] &= mask->all[3]; +} + #define IP_SET_INIT_KEXT(skb, opt, set) \ { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 79e5d68b87af..333807efd32b 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -85,6 +85,7 @@ enum { IPSET_ATTR_CADT_LINENO = IPSET_ATTR_LINENO, /* 9 */ IPSET_ATTR_MARK, /* 10 */ IPSET_ATTR_MARKMASK, /* 11 */ + IPSET_ATTR_BITMASK, /* 12 */ /* Reserve empty slots */ IPSET_ATTR_CADT_MAX = 16, /* Create-only specific attributes */ @@ -153,6 +154,7 @@ enum ipset_errno { IPSET_ERR_COMMENT, IPSET_ERR_INVALID_MARKMASK, IPSET_ERR_SKBINFO, + IPSET_ERR_BITMASK_NETMASK_EXCL, /* Type specific error codes */ IPSET_ERR_TYPE_SPECIFIC = 4352, -- cgit v1.2.3 From c67cae551f0df80421b5703ee56ff5e2fe9c4de6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 25 Nov 2022 14:06:17 -0800 Subject: bpf: Tighten ptr_to_btf_id checks. The networking programs typically don't require CAP_PERFMON, but through kfuncs like bpf_cast_to_kern_ctx() they can access memory through PTR_TO_BTF_ID. In such case enforce CAP_PERFMON. Also make sure that only GPL programs can access kernel data structures. All kfuncs require GPL already. Also remove allow_ptr_to_map_access. It's the same as allow_ptr_leaks and different name for the same check only causes confusion. Fixes: fd264ca02094 ("bpf: Add a kfunc to type cast from bpf uapi ctx to kernel ctx") Fixes: 50c6b8a9aea2 ("selftests/bpf: Add a test for btf_type_tag "percpu"") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221125220617.26846-1-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 5 ----- include/linux/bpf_verifier.h | 1 - 2 files changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 67452103bb86..4920ac252754 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1909,11 +1909,6 @@ static inline bool bpf_allow_uninit_stack(void) return perfmon_capable(); } -static inline bool bpf_allow_ptr_to_map_access(void) -{ - return perfmon_capable(); -} - static inline bool bpf_bypass_spec_v1(void) { return perfmon_capable(); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c05aa6e1f6f5..b5090e89cb3f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -531,7 +531,6 @@ struct bpf_verifier_env { bool explore_alu_limits; bool allow_ptr_leaks; bool allow_uninit_stack; - bool allow_ptr_to_map_access; bool bpf_capable; bool bypass_spec_v1; bool bypass_spec_v4; -- cgit v1.2.3 From a351d6087bf7d3d8440d58d3bf244ec64b89394a Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 29 Nov 2022 18:40:39 +0800 Subject: bpf, sockmap: Fix missing BPF_F_INGRESS flag when using apply_bytes When redirecting, we use sk_msg_to_ingress() to get the BPF_F_INGRESS flag from the msg->flags. If apply_bytes is used and it is larger than the current data being processed, sk_psock_msg_verdict() will not be called when sendmsg() is called again. At this time, the msg->flags is 0, and we lost the BPF_F_INGRESS flag. So we need to save the BPF_F_INGRESS flag in sk_psock and use it when redirection. Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-3-git-send-email-yangpc@wangsu.com --- include/linux/skmsg.h | 1 + include/net/tcp.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 70d6cb94e580..84f787416a54 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -82,6 +82,7 @@ struct sk_psock { u32 apply_bytes; u32 cork_bytes; u32 eval; + bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6b814e788f00..b87e7381bddf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2319,8 +2319,8 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, - int flags); +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, + struct sk_msg *msg, u32 bytes, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) -- cgit v1.2.3 From 3144bfa5078e0df7507a4de72061501e6a0e56be Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 29 Nov 2022 21:21:47 -0800 Subject: bpf: Fix a compilation failure with clang lto build When building the kernel with clang lto (CONFIG_LTO_CLANG_FULL=y), the following compilation error will appear: $ make LLVM=1 LLVM_IAS=1 -j ... ld.lld: error: ld-temp.o :26889:1: symbol 'cgroup_storage_map_btf_ids' is already defined cgroup_storage_map_btf_ids:; ^ make[1]: *** [/.../bpf-next/scripts/Makefile.vmlinux_o:61: vmlinux.o] Error 1 In local_storage.c, we have BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map) Commit c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") added the above identical BTF_ID_LIST_SINGLE definition in bpf_cgrp_storage.c. With duplicated definitions, llvm linker complains with lto build. Also, extracting btf_id of 'struct bpf_local_storage_map' is defined four times for sk, inode, task and cgrp local storages. Let us define a single global one with a different name than cgroup_storage_map_btf_ids, which also fixed the lto compilation error. Fixes: c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221130052147.1591625-1-yhs@fb.com --- include/linux/btf_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 93397711a68c..3a4f7cd882ca 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -266,5 +266,6 @@ MAX_BTF_TRACING_TYPE, extern u32 btf_tracing_ids[]; extern u32 bpf_cgroup_btf_id[]; +extern u32 bpf_local_storage_map_btf_id[]; #endif -- cgit v1.2.3 From af6397c9ee2b42988c912dcad2fca1f43d5c1c99 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 28 Nov 2022 12:36:44 -0800 Subject: devlink: support directly reading from region memory To read from a region, user space must currently request a new snapshot of the region and then read from that snapshot. This can sometimes be overkill if user space only reads a tiny portion. They first create the snapshot, then request a read, then destroy the snapshot. For regions which have a single underlying "contents", it makes sense to allow supporting direct reading of the region data. Extend the DEVLINK_CMD_REGION_READ to allow direct reading from a region if requested via the new DEVLINK_ATTR_REGION_DIRECT. If this attribute is set, then perform a direct read instead of using a snapshot. Direct read is mutually exclusive with DEVLINK_ATTR_REGION_SNAPSHOT_ID, and care is taken to ensure that we reject commands which provide incorrect attributes. Regions must enable support for direct read by implementing the .read() callback function. If a region does not support such direct reads, a suitable extended error message is reported. Signed-off-by: Jacob Keller Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 16 ++++++++++++++++ include/uapi/linux/devlink.h | 2 ++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 074a79b8933f..02528f736f65 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -650,6 +650,10 @@ struct devlink_info_req; * the data variable must be updated to point to the snapshot data. * The function will be called while the devlink instance lock is * held. + * @read: callback to directly read a portion of the region. On success, + * the data pointer will be updated with the contents of the + * requested portion of the region. The function will be called + * while the devlink instance lock is held. * @priv: Pointer to driver private data for the region operation */ struct devlink_region_ops { @@ -659,6 +663,10 @@ struct devlink_region_ops { const struct devlink_region_ops *ops, struct netlink_ext_ack *extack, u8 **data); + int (*read)(struct devlink *devlink, + const struct devlink_region_ops *ops, + struct netlink_ext_ack *extack, + u64 offset, u32 size, u8 *data); void *priv; }; @@ -670,6 +678,10 @@ struct devlink_region_ops { * the data variable must be updated to point to the snapshot data. * The function will be called while the devlink instance lock is * held. + * @read: callback to directly read a portion of the region. On success, + * the data pointer will be updated with the contents of the + * requested portion of the region. The function will be called + * while the devlink instance lock is held. * @priv: Pointer to driver private data for the region operation */ struct devlink_port_region_ops { @@ -679,6 +691,10 @@ struct devlink_port_region_ops { const struct devlink_port_region_ops *ops, struct netlink_ext_ack *extack, u8 **data); + int (*read)(struct devlink_port *port, + const struct devlink_port_region_ops *ops, + struct netlink_ext_ack *extack, + u64 offset, u32 size, u8 *data); void *priv; }; diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 498d0d5d0957..70191d96af89 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -610,6 +610,8 @@ enum devlink_attr { DEVLINK_ATTR_RATE_TX_PRIORITY, /* u32 */ DEVLINK_ATTR_RATE_TX_WEIGHT, /* u32 */ + DEVLINK_ATTR_REGION_DIRECT, /* flag */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From 226bf980550627c88549b112ac6c8fb40873afb4 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 29 Nov 2022 18:51:38 +0900 Subject: net: devlink: let the core report the driver name instead of the drivers The driver name is available in device_driver::name. Right now, drivers still have to report this piece of information themselves in their devlink_ops::info_get callback function. In order to factorize code, make devlink_nl_info_fill() add the driver name attribute. Now that the core sets the driver name attribute, drivers are not supposed to call devlink_info_driver_name_put() anymore. Remove devlink_info_driver_name_put() and clean-up all the drivers using this function in their callback. Signed-off-by: Vincent Mailhol Tested-by: Ido Schimmel # mlxsw Reviewed-by: Jacob Keller Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 02528f736f65..5f6eca5e4a40 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1762,8 +1762,6 @@ int devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id); int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn); -int devlink_info_driver_name_put(struct devlink_info_req *req, - const char *name); int devlink_info_board_serial_number_put(struct devlink_info_req *req, const char *bsn); -- cgit v1.2.3 From bc66fa87d4fda9053a8145e5718fc278c2b88253 Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Wed, 30 Nov 2022 10:12:16 +0800 Subject: net: phy: Add link between phy dev and mac dev If the external phy used by current mac interface is managed by another mac interface, it means that this network port cannot work independently, especially when the system suspends and resumes, the following trace may appear, so we should create a device link between phy dev and mac dev. WARNING: CPU: 0 PID: 24 at drivers/net/phy/phy.c:983 phy_error+0x20/0x68 Modules linked in: CPU: 0 PID: 24 Comm: kworker/0:2 Not tainted 6.1.0-rc3-00011-g5aaef24b5c6d-dirty #34 Hardware name: Freescale i.MX6 SoloX (Device Tree) Workqueue: events_power_efficient phy_state_machine unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x68/0x90 dump_stack_lvl from __warn+0xb4/0x24c __warn from warn_slowpath_fmt+0x5c/0xd8 warn_slowpath_fmt from phy_error+0x20/0x68 phy_error from phy_state_machine+0x22c/0x23c phy_state_machine from process_one_work+0x288/0x744 process_one_work from worker_thread+0x3c/0x500 worker_thread from kthread+0xf0/0x114 kthread from ret_from_fork+0x14/0x28 Exception stack(0xf0951fb0 to 0xf0951ff8) Signed-off-by: Xiaolei Wang Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20221130021216.1052230-1-xiaolei.wang@windriver.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9a3752c0c444..71eeb4e3b1fd 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -529,6 +529,8 @@ struct macsec_ops; * * @mdio: MDIO bus this PHY is on * @drv: Pointer to the driver for this PHY instance + * @devlink: Create a link between phy dev and mac dev, if the external phy + * used by current mac interface is managed by another mac interface. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. @@ -618,6 +620,8 @@ struct phy_device { /* And management functions */ struct phy_driver *drv; + struct device_link *devlink; + u32 phy_id; struct phy_c45_device_ids c45_ids; -- cgit v1.2.3 From 278ab9793116a8531ffaa52b4e61644971131e35 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 30 Nov 2022 13:26:45 -0800 Subject: wifi: ieee80211: Do not open-code qos address offsets When building with -Wstringop-overflow, GCC's KASAN implementation does not correctly perform bounds checking within some complex structures when faced with literal offsets, and can get very confused. For example, this warning is seen due to literal offsets into sturct ieee80211_hdr that may or may not be large enough: drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c: In function 'iwl_mvm_rx_mpdu_mq': drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:2022:29: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] 2022 | *qc &= ~IEEE80211_QOS_CTL_A_MSDU_PRESENT; In file included from drivers/net/wireless/intel/iwlwifi/mvm/fw-api.h:32, from drivers/net/wireless/intel/iwlwifi/mvm/sta.h:15, from drivers/net/wireless/intel/iwlwifi/mvm/mvm.h:27, from drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:10: drivers/net/wireless/intel/iwlwifi/mvm/../fw/api/rx.h:559:16: note: at offset [78, 166] into destination object 'mpdu_len' of size 2 559 | __le16 mpdu_len; | ^~~~~~~~ Refactor ieee80211_get_qos_ctl() to avoid using literal offsets, requiring the creation of the actual structure that is described in the comments. Explicitly choose the desired offset, making the code more human-readable too. This is one of the last remaining warning to fix before enabling -Wstringop-overflow globally. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97490 Link: https://github.com/KSPP/linux/issues/181 Cc: Johannes Berg Cc: Kalle Valo Cc: Gregory Greenman Cc: "Gustavo A. R. Silva" Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221130212641.never.627-kees@kernel.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 6252f02f38b7..80d6308dea06 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -338,6 +338,17 @@ struct ieee80211_qos_hdr { __le16 qos_ctrl; } __packed __aligned(2); +struct ieee80211_qos_hdr_4addr { + __le16 frame_control; + __le16 duration_id; + u8 addr1[ETH_ALEN]; + u8 addr2[ETH_ALEN]; + u8 addr3[ETH_ALEN]; + __le16 seq_ctrl; + u8 addr4[ETH_ALEN]; + __le16 qos_ctrl; +} __packed __aligned(2); + struct ieee80211_trigger { __le16 frame_control; __le16 duration; @@ -4060,16 +4071,21 @@ struct ieee80211_he_6ghz_capa { * @hdr: the frame * * The qos ctrl bytes come after the frame_control, duration, seq_num - * and 3 or 4 addresses of length ETH_ALEN. - * 3 addr: 2 + 2 + 2 + 3*6 = 24 - * 4 addr: 2 + 2 + 2 + 4*6 = 30 + * and 3 or 4 addresses of length ETH_ALEN. Checks frame_control to choose + * between struct ieee80211_qos_hdr_4addr and struct ieee80211_qos_hdr. */ static inline u8 *ieee80211_get_qos_ctl(struct ieee80211_hdr *hdr) { - if (ieee80211_has_a4(hdr->frame_control)) - return (u8 *)hdr + 30; + union { + struct ieee80211_qos_hdr addr3; + struct ieee80211_qos_hdr_4addr addr4; + } *qos; + + qos = (void *)hdr; + if (ieee80211_has_a4(qos->addr3.frame_control)) + return (u8 *)&qos->addr4.qos_ctrl; else - return (u8 *)hdr + 24; + return (u8 *)&qos->addr3.qos_ctrl; } /** -- cgit v1.2.3 From 4d371d6e3746d84056f150fdaa66c85f65f60124 Mon Sep 17 00:00:00 2001 From: Philipp Hortmann Date: Mon, 14 Nov 2022 21:01:35 +0100 Subject: wifi: cfg80211: Correct example of ieee80211_iface_limit Correct wrong closing bracket. Signed-off-by: Philipp Hortmann Link: https://lore.kernel.org/r/20221114200135.GA100176@matrix-ESPRIMO-P710 Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 11a370e64143..03d4f4deadae 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4742,7 +4742,7 @@ struct ieee80211_iface_limit { * * struct ieee80211_iface_limit limits1[] = { * { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), }, - * { .max = 1, .types = BIT(NL80211_IFTYPE_AP}, }, + * { .max = 1, .types = BIT(NL80211_IFTYPE_AP), }, * }; * struct ieee80211_iface_combination combination1 = { * .limits = limits1, -- cgit v1.2.3 From 75bfdbf2fca372e2709bcaa43e8cf1147766ae96 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 3 Nov 2022 22:27:52 +0000 Subject: rxrpc: Implement an in-kernel rxperf server for testing purposes Implement an in-kernel rxperf server to allow kernel-based rxrpc services to be tested directly, unlike with AFS where they're accessed by the fileserver when the latter decides it wants to. This is implemented as a module that, if loaded, opens UDP port 7009 (afs3-rmtsys) and listens on it for incoming calls. Calls can be generated using the rxperf command shipped with OpenAFS, for example. Changes ======= ver #2) - Use min_t() instead of min(). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: Jakub Kicinski --- include/net/af_rxrpc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index b69ca695935c..dc033f08191e 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -71,5 +71,6 @@ void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, unsigned long); int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); +int rxrpc_sock_set_security_keyring(struct sock *, struct key *); #endif /* _NET_RXRPC_H */ -- cgit v1.2.3 From 2ed83ed2be1b2395f11a95f0fec2b87ed71aebd8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 17 Nov 2022 10:04:15 +0000 Subject: rxrpc: Remove decl for rxrpc_kernel_call_is_complete() rxrpc_kernel_call_is_complete() has been removed, so remove its declaration too. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/net/af_rxrpc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index dc033f08191e..d5a5ae926380 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -66,7 +66,6 @@ int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); -bool rxrpc_kernel_call_is_complete(struct rxrpc_call *); void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, unsigned long); -- cgit v1.2.3 From 2ebdb26e6abd2a773ab5f009ac38a6de973a2bcf Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Oct 2022 11:51:06 +0100 Subject: rxrpc: Remove the [k_]proto() debugging macros Remove the kproto() and _proto() debugging macros in preference to using tracepoints for this. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 60 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index b9886d1df825..2b77f9a75bf7 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -733,6 +733,66 @@ TRACE_EVENT(rxrpc_rx_abort, __entry->abort_code) ); +TRACE_EVENT(rxrpc_rx_challenge, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, + u32 version, u32 nonce, u32 min_level), + + TP_ARGS(conn, serial, version, nonce, min_level), + + TP_STRUCT__entry( + __field(unsigned int, conn ) + __field(rxrpc_serial_t, serial ) + __field(u32, version ) + __field(u32, nonce ) + __field(u32, min_level ) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = serial; + __entry->version = version; + __entry->nonce = nonce; + __entry->min_level = min_level; + ), + + TP_printk("C=%08x CHALLENGE %08x v=%x n=%x ml=%x", + __entry->conn, + __entry->serial, + __entry->version, + __entry->nonce, + __entry->min_level) + ); + +TRACE_EVENT(rxrpc_rx_response, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, + u32 version, u32 kvno, u32 ticket_len), + + TP_ARGS(conn, serial, version, kvno, ticket_len), + + TP_STRUCT__entry( + __field(unsigned int, conn ) + __field(rxrpc_serial_t, serial ) + __field(u32, version ) + __field(u32, kvno ) + __field(u32, ticket_len ) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = serial; + __entry->version = version; + __entry->kvno = kvno; + __entry->ticket_len = ticket_len; + ), + + TP_printk("C=%08x RESPONSE %08x v=%x kvno=%x tl=%x", + __entry->conn, + __entry->serial, + __entry->version, + __entry->kvno, + __entry->ticket_len) + ); + TRACE_EVENT(rxrpc_rx_rwind_change, TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, u32 rwind, bool wake), -- cgit v1.2.3 From 0fde882fc9ee9cc2e66e8c5a5a93c83932d7ca95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Oct 2022 13:00:34 +0100 Subject: rxrpc: trace: Don't use __builtin_return_address for rxrpc_local tracing In rxrpc tracing, use enums to generate lists of points of interest rather than __builtin_return_address() for the rxrpc_local tracepoint Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 49 ++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 2b77f9a75bf7..015569845b1d 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -32,12 +32,35 @@ E_(rxrpc_skb_unshared_nomem, "US0") #define rxrpc_local_traces \ - EM(rxrpc_local_got, "GOT") \ - EM(rxrpc_local_new, "NEW") \ - EM(rxrpc_local_processing, "PRO") \ - EM(rxrpc_local_put, "PUT") \ - EM(rxrpc_local_queued, "QUE") \ - E_(rxrpc_local_tx_ack, "TAK") + EM(rxrpc_local_free, "FREE ") \ + EM(rxrpc_local_get_client_conn, "GET conn-cln") \ + EM(rxrpc_local_get_for_use, "GET for-use ") \ + EM(rxrpc_local_get_peer, "GET peer ") \ + EM(rxrpc_local_get_prealloc_conn, "GET conn-pre") \ + EM(rxrpc_local_get_queue, "GET queue ") \ + EM(rxrpc_local_new, "NEW ") \ + EM(rxrpc_local_processing, "PROCESSING ") \ + EM(rxrpc_local_put_already_queued, "PUT alreadyq") \ + EM(rxrpc_local_put_bind, "PUT bind ") \ + EM(rxrpc_local_put_for_use, "PUT for-use ") \ + EM(rxrpc_local_put_kill_conn, "PUT conn-kil") \ + EM(rxrpc_local_put_peer, "PUT peer ") \ + EM(rxrpc_local_put_prealloc_conn, "PUT conn-pre") \ + EM(rxrpc_local_put_release_sock, "PUT rel-sock") \ + EM(rxrpc_local_put_queue, "PUT queue ") \ + EM(rxrpc_local_queued, "QUEUED ") \ + EM(rxrpc_local_see_tx_ack, "SEE tx-ack ") \ + EM(rxrpc_local_stop, "STOP ") \ + EM(rxrpc_local_stopped, "STOPPED ") \ + EM(rxrpc_local_unuse_bind, "UNU bind ") \ + EM(rxrpc_local_unuse_conn_work, "UNU conn-wrk") \ + EM(rxrpc_local_unuse_peer_keepalive, "UNU peer-kpa") \ + EM(rxrpc_local_unuse_release_sock, "UNU rel-sock") \ + EM(rxrpc_local_unuse_work, "UNU work ") \ + EM(rxrpc_local_use_conn_work, "USE conn-wrk") \ + EM(rxrpc_local_use_lookup, "USE lookup ") \ + EM(rxrpc_local_use_peer_keepalive, "USE peer-kpa") \ + E_(rxrpc_local_use_work, "USE work ") #define rxrpc_peer_traces \ EM(rxrpc_peer_got, "GOT") \ @@ -345,29 +368,29 @@ rxrpc_txqueue_traces; TRACE_EVENT(rxrpc_local, TP_PROTO(unsigned int local_debug_id, enum rxrpc_local_trace op, - int usage, const void *where), + int ref, int usage), - TP_ARGS(local_debug_id, op, usage, where), + TP_ARGS(local_debug_id, op, ref, usage), TP_STRUCT__entry( __field(unsigned int, local ) __field(int, op ) + __field(int, ref ) __field(int, usage ) - __field(const void *, where ) ), TP_fast_assign( __entry->local = local_debug_id; __entry->op = op; + __entry->ref = ref; __entry->usage = usage; - __entry->where = where; ), - TP_printk("L=%08x %s u=%d sp=%pSR", + TP_printk("L=%08x %s r=%d u=%d", __entry->local, __print_symbolic(__entry->op, rxrpc_local_traces), - __entry->usage, - __entry->where) + __entry->ref, + __entry->usage) ); TRACE_EVENT(rxrpc_peer, -- cgit v1.2.3 From 47c810a79844462d3468d831edc00971757693e0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Oct 2022 13:39:34 +0100 Subject: rxrpc: trace: Don't use __builtin_return_address for rxrpc_peer tracing In rxrpc tracing, use enums to generate lists of points of interest rather than __builtin_return_address() for the rxrpc_peer tracepoint Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 015569845b1d..1c74143a51c1 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -63,10 +63,23 @@ E_(rxrpc_local_use_work, "USE work ") #define rxrpc_peer_traces \ - EM(rxrpc_peer_got, "GOT") \ - EM(rxrpc_peer_new, "NEW") \ - EM(rxrpc_peer_processing, "PRO") \ - E_(rxrpc_peer_put, "PUT") + EM(rxrpc_peer_free, "FREE ") \ + EM(rxrpc_peer_get_accept, "GET accept ") \ + EM(rxrpc_peer_get_activate_call, "GET act-call") \ + EM(rxrpc_peer_get_bundle, "GET bundle ") \ + EM(rxrpc_peer_get_client_conn, "GET cln-conn") \ + EM(rxrpc_peer_get_input_error, "GET inpt-err") \ + EM(rxrpc_peer_get_keepalive, "GET keepaliv") \ + EM(rxrpc_peer_get_lookup_client, "GET look-cln") \ + EM(rxrpc_peer_get_service_conn, "GET srv-conn") \ + EM(rxrpc_peer_new_client, "NEW client ") \ + EM(rxrpc_peer_new_prealloc, "NEW prealloc") \ + EM(rxrpc_peer_put_bundle, "PUT bundle ") \ + EM(rxrpc_peer_put_call, "PUT call ") \ + EM(rxrpc_peer_put_conn, "PUT conn ") \ + EM(rxrpc_peer_put_discard_tmp, "PUT disc-tmp") \ + EM(rxrpc_peer_put_input_error, "PUT inpt-err") \ + E_(rxrpc_peer_put_keepalive, "PUT keepaliv") #define rxrpc_conn_traces \ EM(rxrpc_conn_got, "GOT") \ @@ -394,30 +407,26 @@ TRACE_EVENT(rxrpc_local, ); TRACE_EVENT(rxrpc_peer, - TP_PROTO(unsigned int peer_debug_id, enum rxrpc_peer_trace op, - int usage, const void *where), + TP_PROTO(unsigned int peer_debug_id, int ref, enum rxrpc_peer_trace why), - TP_ARGS(peer_debug_id, op, usage, where), + TP_ARGS(peer_debug_id, ref, why), TP_STRUCT__entry( __field(unsigned int, peer ) - __field(int, op ) - __field(int, usage ) - __field(const void *, where ) + __field(int, ref ) + __field(int, why ) ), TP_fast_assign( __entry->peer = peer_debug_id; - __entry->op = op; - __entry->usage = usage; - __entry->where = where; + __entry->ref = ref; + __entry->why = why; ), - TP_printk("P=%08x %s u=%d sp=%pSR", + TP_printk("P=%08x %s r=%d", __entry->peer, - __print_symbolic(__entry->op, rxrpc_peer_traces), - __entry->usage, - __entry->where) + __print_symbolic(__entry->why, rxrpc_peer_traces), + __entry->ref) ); TRACE_EVENT(rxrpc_conn, -- cgit v1.2.3 From 7fa25105b2d32fcb0f38668bc20d0adf6508322f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Oct 2022 14:06:16 +0100 Subject: rxrpc: trace: Don't use __builtin_return_address for rxrpc_conn tracing In rxrpc tracing, use enums to generate lists of points of interest rather than __builtin_return_address() for the rxrpc_conn tracepoint Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 58 ++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 1c74143a51c1..e09568a8c173 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -82,14 +82,34 @@ E_(rxrpc_peer_put_keepalive, "PUT keepaliv") #define rxrpc_conn_traces \ - EM(rxrpc_conn_got, "GOT") \ - EM(rxrpc_conn_new_client, "NWc") \ - EM(rxrpc_conn_new_service, "NWs") \ - EM(rxrpc_conn_put_client, "PTc") \ - EM(rxrpc_conn_put_service, "PTs") \ - EM(rxrpc_conn_queued, "QUE") \ - EM(rxrpc_conn_reap_service, "RPs") \ - E_(rxrpc_conn_seen, "SEE") + EM(rxrpc_conn_free, "FREE ") \ + EM(rxrpc_conn_get_activate_call, "GET act-call") \ + EM(rxrpc_conn_get_call_input, "GET inp-call") \ + EM(rxrpc_conn_get_conn_input, "GET inp-conn") \ + EM(rxrpc_conn_get_idle, "GET idle ") \ + EM(rxrpc_conn_get_poke, "GET poke ") \ + EM(rxrpc_conn_get_service_conn, "GET svc-conn") \ + EM(rxrpc_conn_new_client, "NEW client ") \ + EM(rxrpc_conn_new_service, "NEW service ") \ + EM(rxrpc_conn_put_already_queued, "PUT alreadyq") \ + EM(rxrpc_conn_put_call, "PUT call ") \ + EM(rxrpc_conn_put_call_input, "PUT inp-call") \ + EM(rxrpc_conn_put_conn_input, "PUT inp-conn") \ + EM(rxrpc_conn_put_discard, "PUT discard ") \ + EM(rxrpc_conn_put_discard_idle, "PUT disc-idl") \ + EM(rxrpc_conn_put_local_dead, "PUT loc-dead") \ + EM(rxrpc_conn_put_noreuse, "PUT noreuse ") \ + EM(rxrpc_conn_put_poke, "PUT poke ") \ + EM(rxrpc_conn_put_unbundle, "PUT unbundle") \ + EM(rxrpc_conn_put_unidle, "PUT unidle ") \ + EM(rxrpc_conn_put_work, "PUT work ") \ + EM(rxrpc_conn_queue_challenge, "GQ chall ") \ + EM(rxrpc_conn_queue_retry_work, "GQ retry-wk") \ + EM(rxrpc_conn_queue_rx_work, "GQ rx-work ") \ + EM(rxrpc_conn_queue_timer, "GQ timer ") \ + EM(rxrpc_conn_see_new_service_conn, "SEE new-svc ") \ + EM(rxrpc_conn_see_reap_service, "SEE reap-svc") \ + E_(rxrpc_conn_see_work, "SEE work ") #define rxrpc_client_traces \ EM(rxrpc_client_activate_chans, "Activa") \ @@ -430,30 +450,26 @@ TRACE_EVENT(rxrpc_peer, ); TRACE_EVENT(rxrpc_conn, - TP_PROTO(unsigned int conn_debug_id, enum rxrpc_conn_trace op, - int usage, const void *where), + TP_PROTO(unsigned int conn_debug_id, int ref, enum rxrpc_conn_trace why), - TP_ARGS(conn_debug_id, op, usage, where), + TP_ARGS(conn_debug_id, ref, why), TP_STRUCT__entry( __field(unsigned int, conn ) - __field(int, op ) - __field(int, usage ) - __field(const void *, where ) + __field(int, ref ) + __field(int, why ) ), TP_fast_assign( __entry->conn = conn_debug_id; - __entry->op = op; - __entry->usage = usage; - __entry->where = where; + __entry->ref = ref; + __entry->why = why; ), - TP_printk("C=%08x %s u=%d sp=%pSR", + TP_printk("C=%08x %s r=%d", __entry->conn, - __print_symbolic(__entry->op, rxrpc_conn_traces), - __entry->usage, - __entry->where) + __print_symbolic(__entry->why, rxrpc_conn_traces), + __entry->ref) ); TRACE_EVENT(rxrpc_client, -- cgit v1.2.3 From cb0fc0c9722c0c001510e2a6d9b0a78b80421487 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Oct 2022 14:39:26 +0100 Subject: rxrpc: trace: Don't use __builtin_return_address for rxrpc_call tracing In rxrpc tracing, use enums to generate lists of points of interest rather than __builtin_return_address() for the rxrpc_call tracepoint Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 83 ++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index e09568a8c173..3f6de4294148 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -127,26 +127,44 @@ E_(rxrpc_client_to_idle, "->Idle") #define rxrpc_call_traces \ - EM(rxrpc_call_connected, "CON") \ - EM(rxrpc_call_error, "*E*") \ - EM(rxrpc_call_got, "GOT") \ - EM(rxrpc_call_got_kernel, "Gke") \ - EM(rxrpc_call_got_timer, "GTM") \ - EM(rxrpc_call_got_tx, "Gtx") \ - EM(rxrpc_call_got_userid, "Gus") \ - EM(rxrpc_call_new_client, "NWc") \ - EM(rxrpc_call_new_service, "NWs") \ - EM(rxrpc_call_put, "PUT") \ - EM(rxrpc_call_put_kernel, "Pke") \ - EM(rxrpc_call_put_noqueue, "PnQ") \ - EM(rxrpc_call_put_notimer, "PnT") \ - EM(rxrpc_call_put_timer, "PTM") \ - EM(rxrpc_call_put_tx, "Ptx") \ - EM(rxrpc_call_put_userid, "Pus") \ - EM(rxrpc_call_queued, "QUE") \ - EM(rxrpc_call_queued_ref, "QUR") \ - EM(rxrpc_call_release, "RLS") \ - E_(rxrpc_call_seen, "SEE") + EM(rxrpc_call_get_input, "GET input ") \ + EM(rxrpc_call_get_kernel_service, "GET krnl-srv") \ + EM(rxrpc_call_get_notify_socket, "GET notify ") \ + EM(rxrpc_call_get_recvmsg, "GET recvmsg ") \ + EM(rxrpc_call_get_release_sock, "GET rel-sock") \ + EM(rxrpc_call_get_sendmsg, "GET sendmsg ") \ + EM(rxrpc_call_get_send_ack, "GET send-ack") \ + EM(rxrpc_call_get_timer, "GET timer ") \ + EM(rxrpc_call_get_userid, "GET user-id ") \ + EM(rxrpc_call_new_client, "NEW client ") \ + EM(rxrpc_call_new_prealloc_service, "NEW prealloc") \ + EM(rxrpc_call_put_already_queued, "PUT alreadyq") \ + EM(rxrpc_call_put_discard_prealloc, "PUT disc-pre") \ + EM(rxrpc_call_put_input, "PUT input ") \ + EM(rxrpc_call_put_kernel, "PUT kernel ") \ + EM(rxrpc_call_put_recvmsg, "PUT recvmsg ") \ + EM(rxrpc_call_put_release_sock, "PUT rls-sock") \ + EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ + EM(rxrpc_call_put_send_ack, "PUT send-ack") \ + EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \ + EM(rxrpc_call_put_timer, "PUT timer ") \ + EM(rxrpc_call_put_timer_already, "PUT timer-al") \ + EM(rxrpc_call_put_unnotify, "PUT unnotify") \ + EM(rxrpc_call_put_userid_exists, "PUT u-exists") \ + EM(rxrpc_call_put_work, "PUT work ") \ + EM(rxrpc_call_queue_abort, "QUE abort ") \ + EM(rxrpc_call_queue_requeue, "QUE requeue ") \ + EM(rxrpc_call_queue_resend, "QUE resend ") \ + EM(rxrpc_call_queue_timer, "QUE timer ") \ + EM(rxrpc_call_see_accept, "SEE accept ") \ + EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ + EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ + EM(rxrpc_call_see_connected, "SEE connect ") \ + EM(rxrpc_call_see_distribute_error, "SEE dist-err") \ + EM(rxrpc_call_see_input, "SEE input ") \ + EM(rxrpc_call_see_release, "SEE release ") \ + EM(rxrpc_call_see_userid_exists, "SEE u-exists") \ + E_(rxrpc_call_see_zap, "SEE zap ") #define rxrpc_txqueue_traces \ EM(rxrpc_txqueue_await_reply, "AWR") \ @@ -503,32 +521,29 @@ TRACE_EVENT(rxrpc_client, ); TRACE_EVENT(rxrpc_call, - TP_PROTO(unsigned int call_debug_id, enum rxrpc_call_trace op, - int usage, const void *where, const void *aux), + TP_PROTO(unsigned int call_debug_id, int ref, unsigned long aux, + enum rxrpc_call_trace why), - TP_ARGS(call_debug_id, op, usage, where, aux), + TP_ARGS(call_debug_id, ref, aux, why), TP_STRUCT__entry( __field(unsigned int, call ) - __field(int, op ) - __field(int, usage ) - __field(const void *, where ) - __field(const void *, aux ) + __field(int, ref ) + __field(int, why ) + __field(unsigned long, aux ) ), TP_fast_assign( __entry->call = call_debug_id; - __entry->op = op; - __entry->usage = usage; - __entry->where = where; + __entry->ref = ref; + __entry->why = why; __entry->aux = aux; ), - TP_printk("c=%08x %s u=%d sp=%pSR a=%p", + TP_printk("c=%08x %s r=%d a=%lx", __entry->call, - __print_symbolic(__entry->op, rxrpc_call_traces), - __entry->usage, - __entry->where, + __print_symbolic(__entry->why, rxrpc_call_traces), + __entry->ref, __entry->aux) ); -- cgit v1.2.3 From fa3492abb64b93b2b5d6fdf7a5b687a1fa810d74 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 2 Nov 2022 21:54:46 +0000 Subject: rxrpc: Trace rxrpc_bundle refcount Add a tracepoint for the rxrpc_bundle refcounting. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 3f6de4294148..6f5be7ac7f6b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -81,6 +81,15 @@ EM(rxrpc_peer_put_input_error, "PUT inpt-err") \ E_(rxrpc_peer_put_keepalive, "PUT keepaliv") +#define rxrpc_bundle_traces \ + EM(rxrpc_bundle_free, "FREE ") \ + EM(rxrpc_bundle_get_client_call, "GET clt-call") \ + EM(rxrpc_bundle_get_client_conn, "GET clt-conn") \ + EM(rxrpc_bundle_get_service_conn, "GET svc-conn") \ + EM(rxrpc_bundle_put_conn, "PUT conn ") \ + EM(rxrpc_bundle_put_discard, "PUT discard ") \ + E_(rxrpc_bundle_new, "NEW ") + #define rxrpc_conn_traces \ EM(rxrpc_conn_free, "FREE ") \ EM(rxrpc_conn_get_activate_call, "GET act-call") \ @@ -361,6 +370,7 @@ #define EM(a, b) a, #define E_(a, b) a +enum rxrpc_bundle_trace { rxrpc_bundle_traces } __mode(byte); enum rxrpc_call_trace { rxrpc_call_traces } __mode(byte); enum rxrpc_client_trace { rxrpc_client_traces } __mode(byte); enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte); @@ -390,6 +400,7 @@ enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); #define EM(a, b) TRACE_DEFINE_ENUM(a); #define E_(a, b) TRACE_DEFINE_ENUM(a); +rxrpc_bundle_traces; rxrpc_call_traces; rxrpc_client_traces; rxrpc_congest_changes; @@ -467,6 +478,29 @@ TRACE_EVENT(rxrpc_peer, __entry->ref) ); +TRACE_EVENT(rxrpc_bundle, + TP_PROTO(unsigned int bundle_debug_id, int ref, enum rxrpc_bundle_trace why), + + TP_ARGS(bundle_debug_id, ref, why), + + TP_STRUCT__entry( + __field(unsigned int, bundle ) + __field(int, ref ) + __field(int, why ) + ), + + TP_fast_assign( + __entry->bundle = bundle_debug_id; + __entry->ref = ref; + __entry->why = why; + ), + + TP_printk("CB=%08x %s r=%d", + __entry->bundle, + __print_symbolic(__entry->why, rxrpc_bundle_traces), + __entry->ref) + ); + TRACE_EVENT(rxrpc_conn, TP_PROTO(unsigned int conn_debug_id, int ref, enum rxrpc_conn_trace why), -- cgit v1.2.3 From 9a36a6bc22ca1c0a9d82228171e05dc785fa1154 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Oct 2022 15:31:21 +0100 Subject: rxrpc: trace: Don't use __builtin_return_address for sk_buff tracing In rxrpc tracing, use enums to generate lists of points of interest rather than __builtin_return_address() for the sk_buff tracepoint. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 57 +++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 6f5be7ac7f6b..5a2292baffc8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -17,19 +17,31 @@ * Declare tracing information enums and their string mappings for display. */ #define rxrpc_skb_traces \ - EM(rxrpc_skb_ack, "ACK") \ - EM(rxrpc_skb_cleaned, "CLN") \ - EM(rxrpc_skb_cloned_jumbo, "CLJ") \ - EM(rxrpc_skb_freed, "FRE") \ - EM(rxrpc_skb_got, "GOT") \ - EM(rxrpc_skb_lost, "*L*") \ - EM(rxrpc_skb_new, "NEW") \ - EM(rxrpc_skb_purged, "PUR") \ - EM(rxrpc_skb_received, "RCV") \ - EM(rxrpc_skb_rotated, "ROT") \ - EM(rxrpc_skb_seen, "SEE") \ - EM(rxrpc_skb_unshared, "UNS") \ - E_(rxrpc_skb_unshared_nomem, "US0") + EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \ + EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ + EM(rxrpc_skb_get_ack, "GET ack ") \ + EM(rxrpc_skb_get_conn_work, "GET conn-work") \ + EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ + EM(rxrpc_skb_get_to_recvmsg_oos, "GET to-recv-o") \ + EM(rxrpc_skb_new_encap_rcv, "NEW encap-rcv") \ + EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ + EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ + EM(rxrpc_skb_new_unshared, "NEW unshared ") \ + EM(rxrpc_skb_put_ack, "PUT ack ") \ + EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ + EM(rxrpc_skb_put_error_report, "PUT error-rep") \ + EM(rxrpc_skb_put_input, "PUT input ") \ + EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ + EM(rxrpc_skb_put_lose, "PUT lose ") \ + EM(rxrpc_skb_put_purge, "PUT purge ") \ + EM(rxrpc_skb_put_rotate, "PUT rotate ") \ + EM(rxrpc_skb_put_unknown, "PUT unknown ") \ + EM(rxrpc_skb_see_conn_work, "SEE conn-work") \ + EM(rxrpc_skb_see_local_work, "SEE locl-work") \ + EM(rxrpc_skb_see_recvmsg, "SEE recvmsg ") \ + EM(rxrpc_skb_see_reject, "SEE reject ") \ + EM(rxrpc_skb_see_rotate, "SEE rotate ") \ + E_(rxrpc_skb_see_version, "SEE version ") #define rxrpc_local_traces \ EM(rxrpc_local_free, "FREE ") \ @@ -582,33 +594,30 @@ TRACE_EVENT(rxrpc_call, ); TRACE_EVENT(rxrpc_skb, - TP_PROTO(struct sk_buff *skb, enum rxrpc_skb_trace op, - int usage, int mod_count, const void *where), + TP_PROTO(struct sk_buff *skb, int usage, int mod_count, + enum rxrpc_skb_trace why), - TP_ARGS(skb, op, usage, mod_count, where), + TP_ARGS(skb, usage, mod_count, why), TP_STRUCT__entry( __field(struct sk_buff *, skb ) - __field(enum rxrpc_skb_trace, op ) __field(int, usage ) __field(int, mod_count ) - __field(const void *, where ) + __field(enum rxrpc_skb_trace, why ) ), TP_fast_assign( __entry->skb = skb; - __entry->op = op; __entry->usage = usage; __entry->mod_count = mod_count; - __entry->where = where; + __entry->why = why; ), - TP_printk("s=%p Rx %s u=%d m=%d p=%pSR", + TP_printk("s=%p Rx %s u=%d m=%d", __entry->skb, - __print_symbolic(__entry->op, rxrpc_skb_traces), + __print_symbolic(__entry->why, rxrpc_skb_traces), __entry->usage, - __entry->mod_count, - __entry->where) + __entry->mod_count) ); TRACE_EVENT(rxrpc_rx_packet, -- cgit v1.2.3 From 3feda9d69c83983b530cea6287ba4fea0e5c3b87 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 25 Nov 2022 09:00:55 +0000 Subject: rxrpc: Don't hold a ref for call timer or workqueue Currently, rxrpc gives the call timer a ref on the call when it starts it and this is passed along to the workqueue by the timer expiration function. The problem comes when queue_work() fails (ie. the work item is already queued): the timer routine must put the ref - but this may cause the cleanup code to run. This has the unfortunate effect that the cleanup code may then be run in softirq context - which means that any spinlocks it might need to touch have to be guarded to disable softirqs (ie. they need a "_bh" suffix). Fix this by: (1) Don't give a ref to the timer. (2) Making the expiration function not do anything if the refcount is 0. Note that this is more of an optimisation. (3) Make sure that the cleanup routine waits for timer to complete. However, this has a consequence that timer cannot give a ref to the work item. Therefore the following fixes are also necessary: (4) Don't give a ref to the work item. (5) Make the work item return asap if it sees the ref count is 0. (6) Make sure that the cleanup routine waits for the work item to complete. Unfortunately, neither the timer nor the work item can simply get around the problem by just using refcount_inc_not_zero() as the waits would still have to be done, and there would still be the possibility of having to put the ref in the expiration function. Note the call work item is going to go away with the work being transferred to the I/O thread, so the wait in (6) will become obsolete. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 5a2292baffc8..4538de0079a5 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -155,11 +155,9 @@ EM(rxrpc_call_get_release_sock, "GET rel-sock") \ EM(rxrpc_call_get_sendmsg, "GET sendmsg ") \ EM(rxrpc_call_get_send_ack, "GET send-ack") \ - EM(rxrpc_call_get_timer, "GET timer ") \ EM(rxrpc_call_get_userid, "GET user-id ") \ EM(rxrpc_call_new_client, "NEW client ") \ EM(rxrpc_call_new_prealloc_service, "NEW prealloc") \ - EM(rxrpc_call_put_already_queued, "PUT alreadyq") \ EM(rxrpc_call_put_discard_prealloc, "PUT disc-pre") \ EM(rxrpc_call_put_input, "PUT input ") \ EM(rxrpc_call_put_kernel, "PUT kernel ") \ @@ -168,11 +166,8 @@ EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ EM(rxrpc_call_put_send_ack, "PUT send-ack") \ EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \ - EM(rxrpc_call_put_timer, "PUT timer ") \ - EM(rxrpc_call_put_timer_already, "PUT timer-al") \ EM(rxrpc_call_put_unnotify, "PUT unnotify") \ EM(rxrpc_call_put_userid_exists, "PUT u-exists") \ - EM(rxrpc_call_put_work, "PUT work ") \ EM(rxrpc_call_queue_abort, "QUE abort ") \ EM(rxrpc_call_queue_requeue, "QUE requeue ") \ EM(rxrpc_call_queue_resend, "QUE resend ") \ @@ -368,6 +363,7 @@ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ + EM(rxrpc_txbuf_see_out_of_step, "OUT-OF-STEP") \ EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") -- cgit v1.2.3 From 3cec055c56958c5498eeb3ed9fb2aef2d28c030f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 25 Nov 2022 12:43:50 +0000 Subject: rxrpc: Don't hold a ref for connection workqueue Currently, rxrpc gives the connection's work item a ref on the connection when it queues it - and this is called from the timer expiration function. The problem comes when queue_work() fails (ie. the work item is already queued): the timer routine must put the ref - but this may cause the cleanup code to run. This has the unfortunate effect that the cleanup code may then be run in softirq context - which means that any spinlocks it might need to touch have to be guarded to disable softirqs (ie. they need a "_bh" suffix). (1) Don't give a ref to the work item. (2) Simplify handling of service connections by adding a separate active count so that the refcount isn't also used for this. (3) Connection destruction for both client and service connections can then be cleaned up by putting rxrpc_put_connection() out of line and making a tidy progression through the destruction code (offloaded to a workqueue if put from softirq or processor function context). The RCU part of the cleanup then only deals with the freeing at the end. (4) Make rxrpc_queue_conn() return immediately if it sees the active count is -1 rather then queuing the connection. (5) Make sure that the cleanup routine waits for the work item to complete. (6) Stash the rxrpc_net pointer in the conn struct so that the rcu free routine can use it, even if the local endpoint has been freed. Unfortunately, neither the timer nor the work item can simply get around the problem by just using refcount_inc_not_zero() as the waits would still have to be done, and there would still be the possibility of having to put the ref in the expiration function. Note the connection work item is mostly going to go away with the main event work being transferred to the I/O thread, so the wait in (6) will become obsolete. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 4538de0079a5..44a9be9836f9 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -112,7 +112,6 @@ EM(rxrpc_conn_get_service_conn, "GET svc-conn") \ EM(rxrpc_conn_new_client, "NEW client ") \ EM(rxrpc_conn_new_service, "NEW service ") \ - EM(rxrpc_conn_put_already_queued, "PUT alreadyq") \ EM(rxrpc_conn_put_call, "PUT call ") \ EM(rxrpc_conn_put_call_input, "PUT inp-call") \ EM(rxrpc_conn_put_conn_input, "PUT inp-conn") \ @@ -121,13 +120,13 @@ EM(rxrpc_conn_put_local_dead, "PUT loc-dead") \ EM(rxrpc_conn_put_noreuse, "PUT noreuse ") \ EM(rxrpc_conn_put_poke, "PUT poke ") \ + EM(rxrpc_conn_put_service_reaped, "PUT svc-reap") \ EM(rxrpc_conn_put_unbundle, "PUT unbundle") \ EM(rxrpc_conn_put_unidle, "PUT unidle ") \ - EM(rxrpc_conn_put_work, "PUT work ") \ - EM(rxrpc_conn_queue_challenge, "GQ chall ") \ - EM(rxrpc_conn_queue_retry_work, "GQ retry-wk") \ - EM(rxrpc_conn_queue_rx_work, "GQ rx-work ") \ - EM(rxrpc_conn_queue_timer, "GQ timer ") \ + EM(rxrpc_conn_queue_challenge, "QUE chall ") \ + EM(rxrpc_conn_queue_retry_work, "QUE retry-wk") \ + EM(rxrpc_conn_queue_rx_work, "QUE rx-work ") \ + EM(rxrpc_conn_queue_timer, "QUE timer ") \ EM(rxrpc_conn_see_new_service_conn, "SEE new-svc ") \ EM(rxrpc_conn_see_reap_service, "SEE reap-svc") \ E_(rxrpc_conn_see_work, "SEE work ") -- cgit v1.2.3 From 15f661dc95daec9b38e8e4cc931c95afe0ae0cef Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Oct 2022 15:51:39 +0100 Subject: rxrpc: Implement a mechanism to send an event notification to a call Provide a means by which an event notification can be sent to a call such that the I/O thread can process it rather than it being done in a separate workqueue. This will allow a lot of locking to be removed. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 52 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 44a9be9836f9..0b12d96c7921 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -16,6 +16,13 @@ /* * Declare tracing information enums and their string mappings for display. */ +#define rxrpc_call_poke_traces \ + EM(rxrpc_call_poke_error, "Error") \ + EM(rxrpc_call_poke_idle, "Idle") \ + EM(rxrpc_call_poke_start, "Start") \ + EM(rxrpc_call_poke_timer, "Timer") \ + E_(rxrpc_call_poke_timer_now, "Timer-now") + #define rxrpc_skb_traces \ EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \ EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ @@ -150,6 +157,7 @@ EM(rxrpc_call_get_input, "GET input ") \ EM(rxrpc_call_get_kernel_service, "GET krnl-srv") \ EM(rxrpc_call_get_notify_socket, "GET notify ") \ + EM(rxrpc_call_get_poke, "GET poke ") \ EM(rxrpc_call_get_recvmsg, "GET recvmsg ") \ EM(rxrpc_call_get_release_sock, "GET rel-sock") \ EM(rxrpc_call_get_sendmsg, "GET sendmsg ") \ @@ -160,6 +168,7 @@ EM(rxrpc_call_put_discard_prealloc, "PUT disc-pre") \ EM(rxrpc_call_put_input, "PUT input ") \ EM(rxrpc_call_put_kernel, "PUT kernel ") \ + EM(rxrpc_call_put_poke, "PUT poke ") \ EM(rxrpc_call_put_recvmsg, "PUT recvmsg ") \ EM(rxrpc_call_put_release_sock, "PUT rls-sock") \ EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ @@ -378,6 +387,7 @@ #define E_(a, b) a enum rxrpc_bundle_trace { rxrpc_bundle_traces } __mode(byte); +enum rxrpc_call_poke_trace { rxrpc_call_poke_traces } __mode(byte); enum rxrpc_call_trace { rxrpc_call_traces } __mode(byte); enum rxrpc_client_trace { rxrpc_client_traces } __mode(byte); enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte); @@ -408,6 +418,7 @@ enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); #define E_(a, b) TRACE_DEFINE_ENUM(a); rxrpc_bundle_traces; +rxrpc_call_poke_traces; rxrpc_call_traces; rxrpc_client_traces; rxrpc_congest_changes; @@ -1747,6 +1758,47 @@ TRACE_EVENT(rxrpc_txbuf, __entry->ref) ); +TRACE_EVENT(rxrpc_poke_call, + TP_PROTO(struct rxrpc_call *call, bool busy, + enum rxrpc_call_poke_trace what), + + TP_ARGS(call, busy, what), + + TP_STRUCT__entry( + __field(unsigned int, call_debug_id ) + __field(bool, busy ) + __field(enum rxrpc_call_poke_trace, what ) + ), + + TP_fast_assign( + __entry->call_debug_id = call->debug_id; + __entry->busy = busy; + __entry->what = what; + ), + + TP_printk("c=%08x %s%s", + __entry->call_debug_id, + __print_symbolic(__entry->what, rxrpc_call_poke_traces), + __entry->busy ? "!" : "") + ); + +TRACE_EVENT(rxrpc_call_poked, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, call_debug_id ) + ), + + TP_fast_assign( + __entry->call_debug_id = call->debug_id; + ), + + TP_printk("c=%08x", + __entry->call_debug_id) + ); + #undef EM #undef E_ #endif /* _TRACE_RXRPC_H */ -- cgit v1.2.3 From f3441d4125fc98995858550a5521b8d7daf0504a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Oct 2022 21:58:36 +0100 Subject: rxrpc: Copy client call parameters into rxrpc_call earlier Copy client call parameters into rxrpc_call earlier so that that can be used to convey them to the connection code - which can then be offloaded to the I/O thread. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 0b12d96c7921..8bd48358f757 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -52,6 +52,7 @@ #define rxrpc_local_traces \ EM(rxrpc_local_free, "FREE ") \ + EM(rxrpc_local_get_call, "GET call ") \ EM(rxrpc_local_get_client_conn, "GET conn-cln") \ EM(rxrpc_local_get_for_use, "GET for-use ") \ EM(rxrpc_local_get_peer, "GET peer ") \ @@ -61,6 +62,7 @@ EM(rxrpc_local_processing, "PROCESSING ") \ EM(rxrpc_local_put_already_queued, "PUT alreadyq") \ EM(rxrpc_local_put_bind, "PUT bind ") \ + EM(rxrpc_local_put_call, "PUT call ") \ EM(rxrpc_local_put_for_use, "PUT for-use ") \ EM(rxrpc_local_put_kill_conn, "PUT conn-kil") \ EM(rxrpc_local_put_peer, "PUT peer ") \ @@ -166,6 +168,7 @@ EM(rxrpc_call_new_client, "NEW client ") \ EM(rxrpc_call_new_prealloc_service, "NEW prealloc") \ EM(rxrpc_call_put_discard_prealloc, "PUT disc-pre") \ + EM(rxrpc_call_put_discard_error, "PUT disc-err") \ EM(rxrpc_call_put_input, "PUT input ") \ EM(rxrpc_call_put_kernel, "PUT kernel ") \ EM(rxrpc_call_put_poke, "PUT poke ") \ -- cgit v1.2.3 From cf37b5987508878647161ec3cdba0bb00a1b607a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 31 Mar 2022 23:55:08 +0100 Subject: rxrpc: Move DATA transmission into call processor work item Move DATA transmission into the call processor work item. In a future patch, this will be called from the I/O thread rather than being itsown work item. This will allow DATA transmission to be driven directly by incoming ACKs, pokes and timers as those are processed. The Tx queue is also split: The queue of packets prepared by sendmsg is now places in call->tx_sendmsg and the packet dispatcher decants the packets into call->tx_buffer as space becomes available in the transmission window. This allows sendmsg to run ahead of the available space to try and prevent an underflow in transmission. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 8bd48358f757..c3043fbea0e6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -183,6 +183,7 @@ EM(rxrpc_call_queue_requeue, "QUE requeue ") \ EM(rxrpc_call_queue_resend, "QUE resend ") \ EM(rxrpc_call_queue_timer, "QUE timer ") \ + EM(rxrpc_call_queue_tx_data, "QUE tx-data ") \ EM(rxrpc_call_see_accept, "SEE accept ") \ EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ @@ -738,6 +739,7 @@ TRACE_EVENT(rxrpc_txqueue, __field(rxrpc_seq_t, acks_hard_ack ) __field(rxrpc_seq_t, tx_bottom ) __field(rxrpc_seq_t, tx_top ) + __field(rxrpc_seq_t, tx_prepared ) __field(int, tx_winsize ) ), @@ -747,16 +749,18 @@ TRACE_EVENT(rxrpc_txqueue, __entry->acks_hard_ack = call->acks_hard_ack; __entry->tx_bottom = call->tx_bottom; __entry->tx_top = call->tx_top; + __entry->tx_prepared = call->tx_prepared; __entry->tx_winsize = call->tx_winsize; ), - TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u", + TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u/%u", __entry->call, __print_symbolic(__entry->why, rxrpc_txqueue_traces), __entry->tx_bottom, __entry->acks_hard_ack, __entry->tx_top - __entry->tx_bottom, __entry->tx_top - __entry->acks_hard_ack, + __entry->tx_prepared - __entry->tx_bottom, __entry->tx_winsize) ); -- cgit v1.2.3 From 2d1faf7a0ca3c0b327cf064c80e4e775532c9319 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 6 Oct 2022 15:43:51 +0100 Subject: rxrpc: Simplify skbuff accounting in receive path A received skbuff needs a ref when it gets put on a call data queue or conn packet queue, and rxrpc_input_packet() and co. jump through a lot of hoops to avoid double-dropping the skbuff ref so that we can avoid getting a ref when we queue the packet. Change this so that the skbuff ref is unconditionally dropped by the caller of rxrpc_input_packet(). An additional ref is then taken on the packet if it is pushed onto a queue. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index c3043fbea0e6..82b1327c2ba6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -28,6 +28,8 @@ EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ EM(rxrpc_skb_get_ack, "GET ack ") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ + EM(rxrpc_skb_get_local_work, "GET locl-work") \ + EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ EM(rxrpc_skb_get_to_recvmsg_oos, "GET to-recv-o") \ EM(rxrpc_skb_new_encap_rcv, "NEW encap-rcv") \ @@ -39,7 +41,6 @@ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ - EM(rxrpc_skb_put_lose, "PUT lose ") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ -- cgit v1.2.3 From 5e6ef4f1017c7f844e305283bbd8875af475e2fc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 23 Jan 2020 13:13:41 +0000 Subject: rxrpc: Make the I/O thread take over the call and local processor work Move the functions from the call->processor and local->processor work items into the domain of the I/O thread. The call event processor, now called from the I/O thread, then takes over the job of cranking the call state machine, processing incoming packets and transmitting DATA, ACK and ABORT packets. In a future patch, rxrpc_send_ACK() will transmit the ACK on the spot rather than queuing it for later transmission. The call event processor becomes purely received-skb driven. It only transmits things in response to events. We use "pokes" to queue a dummy skb to make it do things like start/resume transmitting data. Timer expiry also results in pokes. The connection event processor, becomes similar, though crypto events, such as dealing with CHALLENGE and RESPONSE packets is offloaded to a work item to avoid doing crypto in the I/O thread. The local event processor is removed and VERSION response packets are generated directly from the packet parser. Similarly, ABORTs generated in response to protocol errors will be transmitted immediately rather than being pushed onto a queue for later transmission. Changes: ======== ver #2) - Fix a couple of introduced lock context imbalances. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 82b1327c2ba6..c49b0c233594 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -26,7 +26,6 @@ #define rxrpc_skb_traces \ EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \ EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ - EM(rxrpc_skb_get_ack, "GET ack ") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ EM(rxrpc_skb_get_local_work, "GET locl-work") \ EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ @@ -36,7 +35,6 @@ EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ EM(rxrpc_skb_new_unshared, "NEW unshared ") \ - EM(rxrpc_skb_put_ack, "PUT ack ") \ EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ @@ -45,7 +43,6 @@ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ EM(rxrpc_skb_see_conn_work, "SEE conn-work") \ - EM(rxrpc_skb_see_local_work, "SEE locl-work") \ EM(rxrpc_skb_see_recvmsg, "SEE recvmsg ") \ EM(rxrpc_skb_see_reject, "SEE reject ") \ EM(rxrpc_skb_see_rotate, "SEE rotate ") \ @@ -58,10 +55,7 @@ EM(rxrpc_local_get_for_use, "GET for-use ") \ EM(rxrpc_local_get_peer, "GET peer ") \ EM(rxrpc_local_get_prealloc_conn, "GET conn-pre") \ - EM(rxrpc_local_get_queue, "GET queue ") \ EM(rxrpc_local_new, "NEW ") \ - EM(rxrpc_local_processing, "PROCESSING ") \ - EM(rxrpc_local_put_already_queued, "PUT alreadyq") \ EM(rxrpc_local_put_bind, "PUT bind ") \ EM(rxrpc_local_put_call, "PUT call ") \ EM(rxrpc_local_put_for_use, "PUT for-use ") \ @@ -69,8 +63,6 @@ EM(rxrpc_local_put_peer, "PUT peer ") \ EM(rxrpc_local_put_prealloc_conn, "PUT conn-pre") \ EM(rxrpc_local_put_release_sock, "PUT rel-sock") \ - EM(rxrpc_local_put_queue, "PUT queue ") \ - EM(rxrpc_local_queued, "QUEUED ") \ EM(rxrpc_local_see_tx_ack, "SEE tx-ack ") \ EM(rxrpc_local_stop, "STOP ") \ EM(rxrpc_local_stopped, "STOPPED ") \ @@ -78,11 +70,9 @@ EM(rxrpc_local_unuse_conn_work, "UNU conn-wrk") \ EM(rxrpc_local_unuse_peer_keepalive, "UNU peer-kpa") \ EM(rxrpc_local_unuse_release_sock, "UNU rel-sock") \ - EM(rxrpc_local_unuse_work, "UNU work ") \ EM(rxrpc_local_use_conn_work, "USE conn-wrk") \ EM(rxrpc_local_use_lookup, "USE lookup ") \ - EM(rxrpc_local_use_peer_keepalive, "USE peer-kpa") \ - E_(rxrpc_local_use_work, "USE work ") + E_(rxrpc_local_use_peer_keepalive, "USE peer-kpa") #define rxrpc_peer_traces \ EM(rxrpc_peer_free, "FREE ") \ @@ -90,6 +80,7 @@ EM(rxrpc_peer_get_activate_call, "GET act-call") \ EM(rxrpc_peer_get_bundle, "GET bundle ") \ EM(rxrpc_peer_get_client_conn, "GET cln-conn") \ + EM(rxrpc_peer_get_input, "GET input ") \ EM(rxrpc_peer_get_input_error, "GET inpt-err") \ EM(rxrpc_peer_get_keepalive, "GET keepaliv") \ EM(rxrpc_peer_get_lookup_client, "GET look-cln") \ @@ -100,6 +91,7 @@ EM(rxrpc_peer_put_call, "PUT call ") \ EM(rxrpc_peer_put_conn, "PUT conn ") \ EM(rxrpc_peer_put_discard_tmp, "PUT disc-tmp") \ + EM(rxrpc_peer_put_input, "PUT input ") \ EM(rxrpc_peer_put_input_error, "PUT inpt-err") \ E_(rxrpc_peer_put_keepalive, "PUT keepaliv") @@ -180,11 +172,6 @@ EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \ EM(rxrpc_call_put_unnotify, "PUT unnotify") \ EM(rxrpc_call_put_userid_exists, "PUT u-exists") \ - EM(rxrpc_call_queue_abort, "QUE abort ") \ - EM(rxrpc_call_queue_requeue, "QUE requeue ") \ - EM(rxrpc_call_queue_resend, "QUE resend ") \ - EM(rxrpc_call_queue_timer, "QUE timer ") \ - EM(rxrpc_call_queue_tx_data, "QUE tx-data ") \ EM(rxrpc_call_see_accept, "SEE accept ") \ EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ @@ -282,6 +269,7 @@ EM(rxrpc_propose_ack_respond_to_ping, "Rsp2Png") \ EM(rxrpc_propose_ack_retry_tx, "RetryTx") \ EM(rxrpc_propose_ack_rotate_rx, "RxAck ") \ + EM(rxrpc_propose_ack_rx_idle, "RxIdle ") \ E_(rxrpc_propose_ack_terminal_ack, "ClTerm ") #define rxrpc_congest_modes \ @@ -1532,6 +1520,7 @@ TRACE_EVENT(rxrpc_connect_call, __field(unsigned long, user_call_ID ) __field(u32, cid ) __field(u32, call_id ) + __field_struct(struct sockaddr_rxrpc, srx ) ), TP_fast_assign( @@ -1539,33 +1528,42 @@ TRACE_EVENT(rxrpc_connect_call, __entry->user_call_ID = call->user_call_ID; __entry->cid = call->cid; __entry->call_id = call->call_id; + __entry->srx = call->dest_srx; ), - TP_printk("c=%08x u=%p %08x:%08x", + TP_printk("c=%08x u=%p %08x:%08x dst=%pISp", __entry->call, (void *)__entry->user_call_ID, __entry->cid, - __entry->call_id) + __entry->call_id, + &__entry->srx.transport) ); TRACE_EVENT(rxrpc_resend, - TP_PROTO(struct rxrpc_call *call), + TP_PROTO(struct rxrpc_call *call, struct sk_buff *ack), - TP_ARGS(call), + TP_ARGS(call, ack), TP_STRUCT__entry( __field(unsigned int, call ) __field(rxrpc_seq_t, seq ) + __field(rxrpc_seq_t, transmitted ) + __field(rxrpc_serial_t, ack_serial ) ), TP_fast_assign( + struct rxrpc_skb_priv *sp = ack ? rxrpc_skb(ack) : NULL; __entry->call = call->debug_id; __entry->seq = call->acks_hard_ack; + __entry->transmitted = call->tx_transmitted; + __entry->ack_serial = sp ? sp->hdr.serial : 0; ), - TP_printk("c=%08x q=%x", + TP_printk("c=%08x r=%x q=%x tq=%x", __entry->call, - __entry->seq) + __entry->ack_serial, + __entry->seq, + __entry->transmitted) ); TRACE_EVENT(rxrpc_rx_icmp, -- cgit v1.2.3 From 32cf8edb079a6a687a2b5dba39a813a0bbd0ddf9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Nov 2022 13:47:35 +0000 Subject: rxrpc: Trace/count transmission underflows and cwnd resets Add a tracepoint to log when a cwnd reset occurs due to lack of transmission on a call. Add stat counters to count transmission underflows (ie. when we have tx window space, but sendmsg doesn't manage to keep up), cwnd resets and transmission failures. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index c49b0c233594..b41e913ae78a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1446,6 +1446,44 @@ TRACE_EVENT(rxrpc_congest, __entry->sum.retrans_timeo ? " rTxTo" : "") ); +TRACE_EVENT(rxrpc_reset_cwnd, + TP_PROTO(struct rxrpc_call *call, ktime_t now), + + TP_ARGS(call, now), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(enum rxrpc_congest_mode, mode ) + __field(unsigned short, cwnd ) + __field(unsigned short, extra ) + __field(rxrpc_seq_t, hard_ack ) + __field(rxrpc_seq_t, prepared ) + __field(ktime_t, since_last_tx ) + __field(bool, has_data ) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->mode = call->cong_mode; + __entry->cwnd = call->cong_cwnd; + __entry->extra = call->cong_extra; + __entry->hard_ack = call->acks_hard_ack; + __entry->prepared = call->tx_prepared - call->tx_bottom; + __entry->since_last_tx = ktime_sub(now, call->tx_last_sent); + __entry->has_data = !list_empty(&call->tx_sendmsg); + ), + + TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u", + __entry->call, + __entry->hard_ack, + __print_symbolic(__entry->mode, rxrpc_congest_modes), + __entry->cwnd, + __entry->extra, + __entry->prepared, + ktime_to_ns(__entry->since_last_tx), + __entry->has_data) + ); + TRACE_EVENT(rxrpc_disconnect_call, TP_PROTO(struct rxrpc_call *call), -- cgit v1.2.3 From b0346843b1076b34a0278ff601f8f287535cb064 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 30 Jan 2020 21:48:13 +0000 Subject: rxrpc: Transmit ACKs at the point of generation For ACKs generated inside the I/O thread, transmit the ACK at the point of generation. Where the ACK is generated outside of the I/O thread, it's offloaded to the I/O thread to transmit it. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index b41e913ae78a..049b52e7aa6a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -63,7 +63,6 @@ EM(rxrpc_local_put_peer, "PUT peer ") \ EM(rxrpc_local_put_prealloc_conn, "PUT conn-pre") \ EM(rxrpc_local_put_release_sock, "PUT rel-sock") \ - EM(rxrpc_local_see_tx_ack, "SEE tx-ack ") \ EM(rxrpc_local_stop, "STOP ") \ EM(rxrpc_local_stopped, "STOPPED ") \ EM(rxrpc_local_unuse_bind, "UNU bind ") \ @@ -156,7 +155,6 @@ EM(rxrpc_call_get_recvmsg, "GET recvmsg ") \ EM(rxrpc_call_get_release_sock, "GET rel-sock") \ EM(rxrpc_call_get_sendmsg, "GET sendmsg ") \ - EM(rxrpc_call_get_send_ack, "GET send-ack") \ EM(rxrpc_call_get_userid, "GET user-id ") \ EM(rxrpc_call_new_client, "NEW client ") \ EM(rxrpc_call_new_prealloc_service, "NEW prealloc") \ @@ -168,7 +166,6 @@ EM(rxrpc_call_put_recvmsg, "PUT recvmsg ") \ EM(rxrpc_call_put_release_sock, "PUT rls-sock") \ EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ - EM(rxrpc_call_put_send_ack, "PUT send-ack") \ EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \ EM(rxrpc_call_put_unnotify, "PUT unnotify") \ EM(rxrpc_call_put_userid_exists, "PUT u-exists") \ -- cgit v1.2.3 From 7d360f6061db01830adfdb1eaa3977b19db0c30b Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 10 Oct 2022 11:43:38 +0200 Subject: wifi: mac80211: add support for restricting netdev features per vif This can be used to selectively disable feature flags for checksum offload, scatter/gather or GSO by changing vif->netdev_features. Removing features from vif->netdev_features does not affect the netdev features themselves, but instead fixes up skbs in the tx path so that the offloads are not needed in the driver. Aside from making it easier to deal with vif type based hardware limitations, this also makes it possible to optimize performance on hardware without native GSO support by declaring GSO support in hw->netdev_features and removing it from vif->netdev_features. This allows mac80211 to handle GSO segmentation after the sta lookup, but before itxq enqueue, thus reducing the number of unnecessary sta lookups, as well as some other per-packet processing. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20221010094338.78070-1-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/fq_impl.h | 16 ++++++++++------ include/net/mac80211.h | 5 +++++ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index 524b510f1c68..9467e33dfb36 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -200,6 +200,7 @@ static void fq_tin_enqueue(struct fq *fq, fq_skb_free_t free_func) { struct fq_flow *flow; + struct sk_buff *next; bool oom; lockdep_assert_held(&fq->lock); @@ -214,11 +215,15 @@ static void fq_tin_enqueue(struct fq *fq, } flow->tin = tin; - flow->backlog += skb->len; - tin->backlog_bytes += skb->len; - tin->backlog_packets++; - fq->memory_usage += skb->truesize; - fq->backlog++; + skb_list_walk_safe(skb, skb, next) { + skb_mark_not_on_list(skb); + flow->backlog += skb->len; + tin->backlog_bytes += skb->len; + tin->backlog_packets++; + fq->memory_usage += skb->truesize; + fq->backlog++; + __skb_queue_tail(&flow->queue, skb); + } if (list_empty(&flow->flowchain)) { flow->deficit = fq->quantum; @@ -226,7 +231,6 @@ static void fq_tin_enqueue(struct fq *fq, &tin->new_flows); } - __skb_queue_tail(&flow->queue, skb); oom = (fq->memory_usage > fq->memory_limit); while (fq->backlog > fq->limit || oom) { flow = fq_find_fattest_flow(fq); diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 721c450a9ccd..689da327ce2e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1807,6 +1807,10 @@ struct ieee80211_vif_cfg { * @addr: address of this interface * @p2p: indicates whether this AP or STA interface is a p2p * interface, i.e. a GO or p2p-sta respectively + * @netdev_features: tx netdev features supported by the hardware for this + * vif. mac80211 initializes this to hw->netdev_features, and the driver + * can mask out specific tx features. mac80211 will handle software fixup + * for masked offloads (GSO, CSUM) * @driver_flags: flags/capabilities the driver has for this interface, * these need to be set (or cleared) when the interface is added * or, if supported by the driver, the interface type is changed @@ -1848,6 +1852,7 @@ struct ieee80211_vif { struct ieee80211_txq *txq; + netdev_features_t netdev_features; u32 driver_flags; u32 offload_flags; -- cgit v1.2.3 From eb8c507296f6038d46010396d91b42a05c3b64d9 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Nov 2022 17:38:55 +0000 Subject: jump_label: Prevent key->enabled int overflow 1. With CONFIG_JUMP_LABEL=n static_key_slow_inc() doesn't have any protection against key->enabled refcounter overflow. 2. With CONFIG_JUMP_LABEL=y static_key_slow_inc_cpuslocked() still may turn the refcounter negative as (v + 1) may overflow. key->enabled is indeed a ref-counter as it's documented in multiple places: top comment in jump_label.h, Documentation/staging/static-keys.rst, etc. As -1 is reserved for static key that's in process of being enabled, functions would break with negative key->enabled refcount: - for CONFIG_JUMP_LABEL=n negative return of static_key_count() breaks static_key_false(), static_key_true() - the ref counter may become 0 from negative side by too many static_key_slow_inc() calls and lead to use-after-free issues. These flaws result in that some users have to introduce an additional mutex and prevent the reference counter from overflowing themselves, see bpf_enable_runtime_stats() checking the counter against INT_MAX / 2. Prevent the reference counter overflow by checking if (v + 1) > 0. Change functions API to return whether the increment was successful. Signed-off-by: Dmitry Safonov Acked-by: Jakub Kicinski Acked-by: Peter Zijlstra (Intel) Signed-off-by: Jakub Kicinski --- include/linux/jump_label.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 570831ca9951..4e968ebadce6 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -224,9 +224,10 @@ extern bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); -extern void static_key_slow_inc(struct static_key *key); +extern bool static_key_slow_inc(struct static_key *key); +extern bool static_key_fast_inc_not_disabled(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); -extern void static_key_slow_inc_cpuslocked(struct static_key *key); +extern bool static_key_slow_inc_cpuslocked(struct static_key *key); extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); @@ -278,11 +279,23 @@ static __always_inline bool static_key_true(struct static_key *key) return false; } -static inline void static_key_slow_inc(struct static_key *key) +static inline bool static_key_fast_inc_not_disabled(struct static_key *key) { + int v; + STATIC_KEY_CHECK_USE(key); - atomic_inc(&key->enabled); + /* + * Prevent key->enabled getting negative to follow the same semantics + * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment. + */ + v = atomic_read(&key->enabled); + do { + if (v < 0 || (v + 1) < 0) + return false; + } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); + return true; } +#define static_key_slow_inc(key) static_key_fast_inc_not_disabled(key) static inline void static_key_slow_dec(struct static_key *key) { -- cgit v1.2.3 From 459837b522f7dff3b6681f534d8fff4eca19b7d1 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Nov 2022 17:38:57 +0000 Subject: net/tcp: Disable TCP-MD5 static key on tcp_md5sig_info destruction To do that, separate two scenarios: - where it's the first MD5 key on the system, which means that enabling of the static key may need to sleep; - copying of an existing key from a listening socket to the request socket upon receiving a signed TCP segment, where static key was already enabled (when the key was added to the listening socket). Now the life-time of the static branch for TCP-MD5 is until: - last tcp_md5sig_info is destroyed - last socket in time-wait state with MD5 key is closed. Which means that after all sockets with TCP-MD5 keys are gone, the system gets back the performance of disabled md5-key static branch. While at here, provide static_key_fast_inc() helper that does ref counter increment in atomic fashion (without grabbing cpus_read_lock() on CONFIG_JUMP_LABEL=y). This is needed to add a new user for a static_key when the caller controls the lifetime of another user. Signed-off-by: Dmitry Safonov Acked-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6b814e788f00..f925377066fe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1675,7 +1675,11 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, - const u8 *newkey, u8 newkeylen, gfp_t gfp); + const u8 *newkey, u8 newkeylen); +int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, + int family, u8 prefixlen, int l3index, + struct tcp_md5sig_key *key); + int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, @@ -1683,7 +1687,7 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, #ifdef CONFIG_TCP_MD5SIG #include -extern struct static_key_false tcp_md5_needed; +extern struct static_key_false_deferred tcp_md5_needed; struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family); @@ -1691,7 +1695,7 @@ static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { - if (!static_branch_unlikely(&tcp_md5_needed)) + if (!static_branch_unlikely(&tcp_md5_needed.key)) return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family); } -- cgit v1.2.3 From f8c9dfbd875b17fee59c7f1aa35a4944d4e6d810 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 30 Nov 2022 15:06:28 +0100 Subject: mptcp: add pm listener events This patch adds two new MPTCP netlink event types for PM listening socket create and close, named MPTCP_EVENT_LISTENER_CREATED and MPTCP_EVENT_LISTENER_CLOSED. Add a new function mptcp_event_pm_listener() to push the new events with family, port and addr to userspace. Invoke mptcp_event_pm_listener() with MPTCP_EVENT_LISTENER_CREATED in mptcp_listen() and mptcp_pm_nl_create_listen_socket(), invoke it with MPTCP_EVENT_LISTENER_CLOSED in __mptcp_close_ssk(). Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index dfe19bf13f4c..32af2d278cb4 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -160,6 +160,12 @@ struct mptcp_info { * daddr4 | daddr6, sport, dport, backup, if_idx * [, error] * The priority of a subflow has changed. 'error' should not be set. + * + * MPTCP_EVENT_LISTENER_CREATED: family, sport, saddr4 | saddr6 + * A new PM listener is created. + * + * MPTCP_EVENT_LISTENER_CLOSED: family, sport, saddr4 | saddr6 + * A PM listener is closed. */ enum mptcp_event_type { MPTCP_EVENT_UNSPEC = 0, @@ -174,6 +180,9 @@ enum mptcp_event_type { MPTCP_EVENT_SUB_CLOSED = 11, MPTCP_EVENT_SUB_PRIORITY = 13, + + MPTCP_EVENT_LISTENER_CREATED = 15, + MPTCP_EVENT_LISTENER_CLOSED = 16, }; enum mptcp_event_attr { -- cgit v1.2.3 From 7d802c8098c50fb7dcf5dfcb6466482e1f2b15e4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 30 Nov 2022 18:04:31 -0500 Subject: sctp: delete free member from struct sctp_sched_ops After commit 9ed7bfc79542 ("sctp: fix memory leak in sctp_stream_outq_migrate()"), sctp_sched_set_sched() is the only place calling sched->free(), and it can actually be replaced by sched->free_sid() on each stream, and yet there's already a loop to traverse all streams in sctp_sched_set_sched(). This patch adds a function sctp_sched_free_sched() where it calls sched->free_sid() for each stream to replace sched->free() calls in sctp_sched_set_sched() and then deletes the unused free member from struct sctp_sched_ops. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Link: https://lore.kernel.org/r/e10aac150aca2686cb0bd0570299ec716da5a5c0.1669849471.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sctp/stream_sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h index 65058faea4db..fa00dc20a0d7 100644 --- a/include/net/sctp/stream_sched.h +++ b/include/net/sctp/stream_sched.h @@ -28,8 +28,6 @@ struct sctp_sched_ops { int (*init_sid)(struct sctp_stream *stream, __u16 sid, gfp_t gfp); /* free a stream */ void (*free_sid)(struct sctp_stream *stream, __u16 sid); - /* Frees the entire thing */ - void (*free)(struct sctp_stream *stream); /* Enqueue a chunk */ void (*enqueue)(struct sctp_outq *q, struct sctp_datamsg *msg); -- cgit v1.2.3 From d93607082e982223cf92750f2d9039ff365b9d24 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 30 Nov 2022 23:28:26 +0100 Subject: net: add netdev_sw_irq_coalesce_default_on() Add a helper for drivers wanting to set SW IRQ coalescing by default. The related sysfs attributes can be used to override the default values. Follow Jakub's suggestion and put this functionality into net core so that drivers wanting to use software interrupt coalescing per default don't have to open-code it. Note that this function needs to be called before the netdevice is registered. Suggested-by: Jakub Kicinski Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5aa35c58c342..f78db610ada5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -78,6 +78,7 @@ struct xdp_buff; void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); +void netdev_sw_irq_coalesce_default_on(struct net_device *dev); /* Backlog congestion levels */ #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ -- cgit v1.2.3 From fca1aa75518c03b04c3c249e9a9134faf9ca18c5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 3 Dec 2022 10:46:02 -0800 Subject: bpf: Handle MEM_RCU type properly Commit 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()") introduced MEM_RCU and bpf_rcu_read_lock/unlock() support. In that commit, a rcu pointer is tagged with both MEM_RCU and PTR_TRUSTED so that it can be passed into kfuncs or helpers as an argument. Martin raised a good question in [1] such that the rcu pointer, although being able to accessing the object, might have reference count of 0. This might cause a problem if the rcu pointer is passed to a kfunc which expects trusted arguments where ref count should be greater than 0. This patch makes the following changes related to MEM_RCU pointer: - MEM_RCU pointer might be NULL (PTR_MAYBE_NULL). - Introduce KF_RCU so MEM_RCU ptr can be acquired with a KF_RCU tagged kfunc which assumes ref count of rcu ptr could be zero. - For mem access 'b = ptr->a', say 'ptr' is a MEM_RCU ptr, and 'a' is tagged with __rcu as well. Let us mark 'b' as MEM_RCU | PTR_MAYBE_NULL. [1] https://lore.kernel.org/bpf/ac70f574-4023-664e-b711-e0d3b18117fd@linux.dev/ Fixes: 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221203184602.477272-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- include/linux/btf.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b5090e89cb3f..c07b351a5bc7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -682,7 +682,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } -#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | MEM_RCU | PTR_TRUSTED) +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { diff --git a/include/linux/btf.h b/include/linux/btf.h index 9ed00077db6e..cbd6e4096f8c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -70,6 +70,7 @@ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ +#define KF_RCU (1 << 7) /* kfunc only takes rcu pointer arguments */ /* * Return the name of the passed struct, if exists, or halt the build if for -- cgit v1.2.3 From c0c852dd1876dc1db4600ce951a92aadd3073b1c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 3 Dec 2022 12:49:54 -0800 Subject: bpf: Do not mark certain LSM hook arguments as trusted Martin mentioned that the verifier cannot assume arguments from LSM hook sk_alloc_security being trusted since after the hook is called, the sk ref_count is set to 1. This will overwrite the ref_count changed by the bpf program and may cause ref_count underflow later on. I then further checked some other hooks. For example, for bpf_lsm_file_alloc() hook in fs/file_table.c, f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { file_free_rcu(&f->f_rcuhead); return ERR_PTR(error); } atomic_long_set(&f->f_count, 1); The input parameter 'f' to security_file_alloc() cannot be trusted as well. Specifically, I investiaged bpf_map/bpf_prog/file/sk/task alloc/free lsm hooks. Except bpf_map_alloc and task_alloc, arguments for all other hooks should not be considered as trusted. This may not be a complete list, but it covers common usage for sk and task. Fixes: 3f00c5239344 ("bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221203204954.2043348-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_lsm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 4bcf76a9bb06..1de7ece5d36d 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -28,6 +28,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); bool bpf_lsm_is_sleepable_hook(u32 btf_id); +bool bpf_lsm_is_trusted(const struct bpf_prog *prog); static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) @@ -51,6 +52,11 @@ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) return false; } +static inline bool bpf_lsm_is_trusted(const struct bpf_prog *prog) +{ + return false; +} + static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { -- cgit v1.2.3 From d14f28b8c1de668bab863bf5892a49c824cb110d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:27 +0200 Subject: xfrm: add new packet offload flag In the next patches, the xfrm core code will be extended to support new type of offload - packet offload. In that mode, both policy and state should be specially configured in order to perform whole offloaded data path. Full offload takes care of encryption, decryption, encapsulation and other operations with headers. As this mode is new for XFRM policy flow, we can "start fresh" with flag bits and release first and second bit for future use. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 7 +++++++ include/uapi/linux/xfrm.h | 6 ++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e0cc6791c001..b39d24fa2ef0 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -131,12 +131,19 @@ enum { XFRM_DEV_OFFLOAD_OUT, }; +enum { + XFRM_DEV_OFFLOAD_UNSPECIFIED, + XFRM_DEV_OFFLOAD_CRYPTO, + XFRM_DEV_OFFLOAD_PACKET, +}; + struct xfrm_dev_offload { struct net_device *dev; netdevice_tracker dev_tracker; struct net_device *real_dev; unsigned long offload_handle; u8 dir : 2; + u8 type : 2; }; struct xfrm_mode { diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 4f84ea7ee14c..23543c33fee8 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -519,6 +519,12 @@ struct xfrm_user_offload { */ #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 +/* Two bits above are relevant for state path only, while + * offload is used for both policy and state flows. + * + * In policy offload mode, they are free and can be safely reused. + */ +#define XFRM_OFFLOAD_PACKET 4 struct xfrm_userpolicy_default { #define XFRM_USERPOLICY_UNSPEC 0 -- cgit v1.2.3 From 919e43fad5163a8ceb39826ecdee897a9f799351 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:29 +0200 Subject: xfrm: add an interface to offload policy Extend netlink interface to add and delete XFRM policy from the device. This functionality is a first step to implement packet IPsec offload solution. Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 3 +++ include/net/xfrm.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5aa35c58c342..4096e3fe8e4a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1040,6 +1040,9 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + int (*xdo_dev_policy_add) (struct xfrm_policy *x); + void (*xdo_dev_policy_delete) (struct xfrm_policy *x); + void (*xdo_dev_policy_free) (struct xfrm_policy *x); }; #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b39d24fa2ef0..6fea34cbdf48 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -129,6 +129,7 @@ struct xfrm_state_walk { enum { XFRM_DEV_OFFLOAD_IN = 1, XFRM_DEV_OFFLOAD_OUT, + XFRM_DEV_OFFLOAD_FWD, }; enum { @@ -541,6 +542,8 @@ struct xfrm_policy { struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; struct hlist_node bydst_inexact_list; struct rcu_head rcu; + + struct xfrm_dev_offload xdo; }; static inline struct net *xp_net(const struct xfrm_policy *xp) @@ -1585,6 +1588,8 @@ struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); +int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, + bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); @@ -1899,6 +1904,9 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack); +int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) @@ -1947,6 +1955,28 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) netdev_put(dev, &xso->dev_tracker); } } + +static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) +{ + struct xfrm_dev_offload *xdo = &x->xdo; + struct net_device *dev = xdo->dev; + + if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_policy_delete) + dev->xfrmdev_ops->xdo_dev_policy_delete(x); +} + +static inline void xfrm_dev_policy_free(struct xfrm_policy *x) +{ + struct xfrm_dev_offload *xdo = &x->xdo; + struct net_device *dev = xdo->dev; + + if (dev && dev->xfrmdev_ops) { + if (dev->xfrmdev_ops->xdo_dev_policy_free) + dev->xfrmdev_ops->xdo_dev_policy_free(x); + xdo->dev = NULL; + netdev_put(dev, &xdo->dev_tracker); + } +} #else static inline void xfrm_dev_resume(struct sk_buff *skb) { @@ -1974,6 +2004,21 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) { } +static inline int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) +{ +} + +static inline void xfrm_dev_policy_free(struct xfrm_policy *x) +{ +} + static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { return false; -- cgit v1.2.3 From 5958372ddf628fe6f4c3e49425734ad32fcfb13c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:31 +0200 Subject: xfrm: add RX datapath protection for IPsec packet offload mode Traffic received by device with enabled IPsec packet offload should be forwarded to the stack only after decryption, packet headers and trailers removed. Such packets are expected to be seen as normal (non-XFRM) ones, while not-supported packets should be dropped by the HW. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 55 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6fea34cbdf48..b6ee14991a32 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1102,6 +1102,29 @@ xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, un return !0; } +#ifdef CONFIG_XFRM +static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) +{ + struct sec_path *sp = skb_sec_path(skb); + + return sp->xvec[sp->len - 1]; +} +#endif + +static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) +{ +#ifdef CONFIG_XFRM + struct sec_path *sp = skb_sec_path(skb); + + if (!sp || !sp->olen || sp->len != sp->olen) + return NULL; + + return &sp->ovec[sp->olen - 1]; +#else + return NULL; +#endif +} + #ifdef CONFIG_XFRM int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb, unsigned short family); @@ -1133,10 +1156,19 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, { struct net *net = dev_net(skb->dev); int ndir = dir | (reverse ? XFRM_POLICY_MASK + 1 : 0); + struct xfrm_offload *xo = xfrm_offload(skb); + struct xfrm_state *x; if (sk && sk->sk_policy[XFRM_POLICY_IN]) return __xfrm_policy_check(sk, ndir, skb, family); + if (xo) { + x = xfrm_input_state(skb); + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + return (xo->flags & CRYPTO_DONE) && + (xo->status & CRYPTO_SUCCESS); + } + return __xfrm_check_nopolicy(net, skb, dir) || __xfrm_check_dev_nopolicy(skb, dir, family) || __xfrm_policy_check(sk, ndir, skb, family); @@ -1872,29 +1904,6 @@ static inline void xfrm_states_delete(struct xfrm_state **states, int n) } #endif -#ifdef CONFIG_XFRM -static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) -{ - struct sec_path *sp = skb_sec_path(skb); - - return sp->xvec[sp->len - 1]; -} -#endif - -static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) -{ -#ifdef CONFIG_XFRM - struct sec_path *sp = skb_sec_path(skb); - - if (!sp || !sp->olen || sp->len != sp->olen) - return NULL; - - return &sp->ovec[sp->olen - 1]; -#else - return NULL; -#endif -} - void __init xfrm_dev_init(void); #ifdef CONFIG_XFRM_OFFLOAD -- cgit v1.2.3 From f3da86dc2c8c9004445cfbb15ac086773622d853 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:33 +0200 Subject: xfrm: add support to HW update soft and hard limits Both in RX and TX, the traffic that performs IPsec packet offload transformation is accounted by HW. It is needed to properly handle hard limits that require to drop the packet. It means that XFRM core needs to update internal counters with the one that accounted by the HW, so new callbacks are introduced in this patch. In case of soft or hard limit is occurred, the driver should call to xfrm_state_check_expire() that will perform key rekeying exactly as done by XFRM core. Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 1 + include/net/xfrm.h | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4096e3fe8e4a..29ae964e3b89 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1040,6 +1040,7 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + void (*xdo_dev_state_update_curlft) (struct xfrm_state *x); int (*xdo_dev_policy_add) (struct xfrm_policy *x); void (*xdo_dev_policy_delete) (struct xfrm_policy *x); void (*xdo_dev_policy_free) (struct xfrm_policy *x); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b6ee14991a32..5413cdd5ad62 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1571,6 +1571,23 @@ struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, unsigned short family); int xfrm_state_check_expire(struct xfrm_state *x); +#ifdef CONFIG_XFRM_OFFLOAD +static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) +{ + struct xfrm_dev_offload *xdo = &x->xso; + struct net_device *dev = xdo->dev; + + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + return; + + if (dev && dev->xfrmdev_ops && + dev->xfrmdev_ops->xdo_dev_state_update_curlft) + dev->xfrmdev_ops->xdo_dev_state_update_curlft(x); + +} +#else +static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) {} +#endif void xfrm_state_insert(struct xfrm_state *x); int xfrm_state_add(struct xfrm_state *x); int xfrm_state_update(struct xfrm_state *x); -- cgit v1.2.3 From a46e9010124256f5bf5fc2c241a45cf1944b768e Mon Sep 17 00:00:00 2001 From: Revanth Kumar Uppala Date: Thu, 1 Dec 2022 15:58:43 +0000 Subject: net: stmmac: Power up SERDES after the PHY link The Tegra MGBE ethernet controller requires that the SERDES link is powered-up after the PHY link is up, otherwise the link fails to become ready following a resume from suspend. Add a variable to indicate that the SERDES link must be powered-up after the PHY link. Signed-off-by: Revanth Kumar Uppala Signed-off-by: Jon Hunter Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index fb2e88614f5d..83ca2e8eb6b5 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -271,5 +271,6 @@ struct plat_stmmacenet_data { int msi_tx_base_vec; bool use_phy_wol; bool sph_disable; + bool serdes_up_after_phy_linkup; }; #endif -- cgit v1.2.3 From 7112a04664bfc10ae4709b2079fe3991cbd1fe18 Mon Sep 17 00:00:00 2001 From: Sudheer Mogilappagari Date: Thu, 1 Dec 2022 16:25:55 -0800 Subject: ethtool: add netlink based get rss support Add netlink based support for "ethtool -x [context x]" command by implementing ETHTOOL_MSG_RSS_GET netlink message. This is equivalent to functionality provided via ETHTOOL_GRSSH in ioctl path. It sends RSS table, hash key and hash function of an interface to user space. This patch implements existing functionality available in ioctl path and enables addition of new RSS context based parameters in future. Signed-off-by: Sudheer Mogilappagari Link: https://lore.kernel.org/r/20221202002555.241580-1-sudheer.mogilappagari@intel.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index aaf7c6963d61..5799a9db034e 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -51,6 +51,7 @@ enum { ETHTOOL_MSG_MODULE_SET, ETHTOOL_MSG_PSE_GET, ETHTOOL_MSG_PSE_SET, + ETHTOOL_MSG_RSS_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -97,6 +98,7 @@ enum { ETHTOOL_MSG_MODULE_GET_REPLY, ETHTOOL_MSG_MODULE_NTF, ETHTOOL_MSG_PSE_GET_REPLY, + ETHTOOL_MSG_RSS_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -880,6 +882,18 @@ enum { ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) }; +enum { + ETHTOOL_A_RSS_UNSPEC, + ETHTOOL_A_RSS_HEADER, + ETHTOOL_A_RSS_CONTEXT, /* u32 */ + ETHTOOL_A_RSS_HFUNC, /* u32 */ + ETHTOOL_A_RSS_INDIR, /* binary */ + ETHTOOL_A_RSS_HKEY, /* binary */ + + __ETHTOOL_A_RSS_CNT, + ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1), +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 -- cgit v1.2.3 From 94151f5aa9667c562281abeaaa5e89b9d5c17729 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sat, 3 Dec 2022 10:46:57 +0200 Subject: xfrm: interface: Add unstable helpers for setting/getting XFRM metadata from TC-BPF This change adds xfrm metadata helpers using the unstable kfunc call interface for the TC-BPF hooks. This allows steering traffic towards different IPsec connections based on logic implemented in bpf programs. This object is built based on the availability of BTF debug info. When setting the xfrm metadata, percpu metadata dsts are used in order to avoid allocating a metadata dst per packet. In order to guarantee safe module unload, the percpu dsts are allocated on first use and never freed. The percpu pointer is stored in net/core/filter.c so that it can be reused on module reload. The metadata percpu dsts take ownership of the original skb dsts so that they may be used as part of the xfrm transmission logic - e.g. for MTU calculations. Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20221203084659.1837829-3-eyal.birger@gmail.com Signed-off-by: Martin KaFai Lau --- include/net/dst_metadata.h | 1 + include/net/xfrm.h | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index a454cf4327fe..1b7fae4c6b24 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -26,6 +26,7 @@ struct macsec_info { struct xfrm_md_info { u32 if_id; int link; + struct dst_entry *dst_orig; }; struct metadata_dst { diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e0cc6791c001..3707e6b34e67 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2086,4 +2086,21 @@ static inline bool xfrm6_local_dontfrag(const struct sock *sk) return false; } #endif + +#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +extern struct metadata_dst __percpu *xfrm_bpf_md_dst; + +int register_xfrm_interface_bpf(void); + +#else + +static inline int register_xfrm_interface_bpf(void) +{ + return 0; +} + +#endif + #endif /* _NET_XFRM_H */ -- cgit v1.2.3 From 3afee4ed336ed7a6cff78d23339879ffb654e02e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 22:10:23 +0200 Subject: net/mlx5: Add HW definitions for IPsec packet offload Add all needed bits to support IPsec packet offload mode. Reviewed-by: Raed Salem Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/mlx5/mlx5_ifc.h | 53 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5a4e914e2a6f..300b56ea5ff4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -445,7 +445,10 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 max_modify_header_actions[0x8]; u8 max_ft_level[0x8]; - u8 reserved_at_40[0x6]; + u8 reformat_add_esp_trasport[0x1]; + u8 reserved_at_41[0x2]; + u8 reformat_del_esp_trasport[0x1]; + u8 reserved_at_44[0x2]; u8 execute_aso[0x1]; u8 reserved_at_47[0x19]; @@ -638,8 +641,10 @@ struct mlx5_ifc_fte_match_set_misc2_bits { u8 reserved_at_1a0[0x8]; u8 macsec_syndrome[0x8]; + u8 ipsec_syndrome[0x8]; + u8 reserved_at_1b8[0x8]; - u8 reserved_at_1b0[0x50]; + u8 reserved_at_1c0[0x40]; }; struct mlx5_ifc_fte_match_set_misc3_bits { @@ -6384,6 +6389,9 @@ enum mlx5_reformat_ctx_type { MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x2, MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3, MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4, + MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV4 = 0x5, + MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT = 0x8, + MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 = 0xb, MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf, MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10, MLX5_REFORMAT_TYPE_ADD_MACSEC = 0x11, @@ -11563,6 +11571,41 @@ enum { MLX5_IPSEC_OBJECT_ICV_LEN_16B, }; +enum { + MLX5_IPSEC_ASO_REG_C_0_1 = 0x0, + MLX5_IPSEC_ASO_REG_C_2_3 = 0x1, + MLX5_IPSEC_ASO_REG_C_4_5 = 0x2, + MLX5_IPSEC_ASO_REG_C_6_7 = 0x3, +}; + +enum { + MLX5_IPSEC_ASO_MODE = 0x0, + MLX5_IPSEC_ASO_REPLAY_PROTECTION = 0x1, + MLX5_IPSEC_ASO_INC_SN = 0x2, +}; + +struct mlx5_ifc_ipsec_aso_bits { + u8 valid[0x1]; + u8 reserved_at_201[0x1]; + u8 mode[0x2]; + u8 window_sz[0x2]; + u8 soft_lft_arm[0x1]; + u8 hard_lft_arm[0x1]; + u8 remove_flow_enable[0x1]; + u8 esn_event_arm[0x1]; + u8 reserved_at_20a[0x16]; + + u8 remove_flow_pkt_cnt[0x20]; + + u8 remove_flow_soft_lft[0x20]; + + u8 reserved_at_260[0x80]; + + u8 mode_parameter[0x20]; + + u8 replay_protection_window[0x100]; +}; + struct mlx5_ifc_ipsec_obj_bits { u8 modify_field_select[0x40]; u8 full_offload[0x1]; @@ -11584,7 +11627,11 @@ struct mlx5_ifc_ipsec_obj_bits { u8 implicit_iv[0x40]; - u8 reserved_at_100[0x700]; + u8 reserved_at_100[0x8]; + u8 ipsec_aso_access_pd[0x18]; + u8 reserved_at_120[0xe0]; + + struct mlx5_ifc_ipsec_aso_bits ipsec_aso; }; struct mlx5_ifc_create_ipsec_obj_in_bits { -- cgit v1.2.3 From bffdeaa8a5af7200b0e74c9d5a41167f86626a36 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Dec 2022 15:33:43 -0800 Subject: bpf: decouple prune and jump points BPF verifier marks some instructions as prune points. Currently these prune points serve two purposes. It's a point where verifier tries to find previously verified state and check current state's equivalence to short circuit verification for current code path. But also currently it's a point where jump history, used for precision backtracking, is updated. This is done so that non-linear flow of execution could be properly backtracked. Such coupling is coincidental and unnecessary. Some prune points are not part of some non-linear jump path, so don't need update of jump history. On the other hand, not all instructions which have to be recorded in jump history necessarily are good prune points. This patch splits prune and jump points into independent flags. Currently all prune points are marked as jump points to minimize amount of changes in this patch, but next patch will perform some optimization of prune vs jmp point placement. No functional changes are intended. Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221206233345.438540-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c07b351a5bc7..70d06a99f0b8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -452,6 +452,7 @@ struct bpf_insn_aux_data { /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ bool prune_point; + bool jmp_point; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ -- cgit v1.2.3 From e9b4aeed56699b469206d05e706ddf2db95700a9 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 3 Dec 2022 17:51:04 +0100 Subject: net: xsk: Don't include There is no need to include here. Prefer the less invasive which is needed for 'hlist_head'. Signed-off-by: Christophe JAILLET Acked-by: John Fastabend Link: https://lore.kernel.org/r/88d6a1d88764cca328610854f890a9ca1f4b029e.1670086246.git.christophe.jaillet@wanadoo.fr Signed-off-by: Alexei Starovoitov --- include/net/netns/xdp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h index e5734261ba0a..21a4f25a187a 100644 --- a/include/net/netns/xdp.h +++ b/include/net/netns/xdp.h @@ -2,8 +2,8 @@ #ifndef __NETNS_XDP_H__ #define __NETNS_XDP_H__ -#include #include +#include struct netns_xdp { struct mutex lock; -- cgit v1.2.3 From ed883bec679b027b198d57a336715f8298fb88b4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 5 Dec 2022 12:34:42 +0100 Subject: net: ethernet: mtk_wed: add reset to rx_ring_setup callback This patch adds reset parameter to mtk_wed_rx_ring_setup signature in order to align rx_ring_setup callback to tx_ring_setup one introduced in 'commit 23dca7a90017 ("net: ethernet: mtk_wed: add reset to tx_ring_setup callback")' Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/29c6e7a5469e784406cf3e2920351d1207713d05.1670239984.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/soc/mediatek/mtk_wed.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index beb190449704..a0746d4aec20 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -160,7 +160,7 @@ struct mtk_wed_ops { int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs, bool reset); int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, - void __iomem *regs); + void __iomem *regs, bool reset); int (*txfree_ring_setup)(struct mtk_wed_device *dev, void __iomem *regs); int (*msg_update)(struct mtk_wed_device *dev, int cmd_id, @@ -228,8 +228,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) (_dev)->ops->irq_get(_dev, _mask) #define mtk_wed_device_irq_set_mask(_dev, _mask) \ (_dev)->ops->irq_set_mask(_dev, _mask) -#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) \ - (_dev)->ops->rx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs, _reset) \ + (_dev)->ops->rx_ring_setup(_dev, _ring, _regs, _reset) #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) \ (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \ @@ -249,7 +249,7 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) #define mtk_wed_device_irq_get(_dev, _mask) 0 #define mtk_wed_device_irq_set_mask(_dev, _mask) do {} while (0) -#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs, _reset) -ENODEV #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) do {} while (0) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV #define mtk_wed_device_stop(_dev) do {} while (0) -- cgit v1.2.3 From 5b481acab4ce017fda8166fa9428511da41109e5 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Dec 2022 15:59:32 +0100 Subject: bpf: do not rely on ALLOW_ERROR_INJECTION for fmod_ret The current way of expressing that a non-bpf kernel component is willing to accept that bpf programs can be attached to it and that they can change the return value is to abuse ALLOW_ERROR_INJECTION. This is debated in the link below, and the result is that it is not a reasonable thing to do. Reuse the kfunc declaration structure to also tag the kernel functions we want to be fmodret. This way we can control from any subsystem which functions are being modified by bpf without touching the verifier. Link: https://lore.kernel.org/all/20221121104403.1545f9b5@gandalf.local.home/ Suggested-by: Alexei Starovoitov Signed-off-by: Benjamin Tissoires Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20221206145936.922196-2-benjamin.tissoires@redhat.com --- include/linux/btf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index f9aababc5d78..6a0808e7845c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -412,8 +412,10 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); u32 *btf_kfunc_id_set_contains(const struct btf *btf, enum bpf_prog_type prog_type, u32 kfunc_btf_id); +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s); +int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset); s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); -- cgit v1.2.3 From df268f6ca7da1088d7ecff4250d1478d3fa2406b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 20:51:12 +0200 Subject: net/mlx5: Introduce IFC bits for migratable Introduce IFC related capabilities to enable setting VF to be able to perform live migration. e.g.: to be migratable. Signed-off-by: Yishai Hadas Reviewed-by: Mark Bloch Acked-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5a4e914e2a6f..2093131483c7 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -68,6 +68,7 @@ enum { MLX5_SET_HCA_CAP_OP_MOD_ODP = 0x2, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE2 = 0x20, MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION = 0x25, }; @@ -1875,7 +1876,10 @@ struct mlx5_ifc_cmd_hca_cap_bits { }; struct mlx5_ifc_cmd_hca_cap_2_bits { - u8 reserved_at_0[0xa0]; + u8 reserved_at_0[0x80]; + + u8 migratable[0x1]; + u8 reserved_at_81[0x1f]; u8 max_reformat_insert_size[0x8]; u8 max_reformat_insert_offset[0x8]; -- cgit v1.2.3 From da65e9ff3bf614d2836e38e1d405c7073e6ba3b7 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:15 +0200 Subject: devlink: Expose port function commands to control RoCE Expose port function commands to enable / disable RoCE, this is used to control the port RoCE device capabilities. When RoCE is disabled for a function of the port, function cannot create any RoCE specific resources (e.g GID table). It also saves system memory utilization. For example disabling RoCE enable a VF/SF saves 1 Mbytes of system memory per function. Example of a PCI VF port which supports function configuration: Set RoCE of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 roce enable $ devlink port function set pci/0000:06:00.0/2 roce disable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 roce disable Signed-off-by: Shay Drory Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 18 ++++++++++++++++++ include/uapi/linux/devlink.h | 10 ++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 5f6eca5e4a40..ce4c65d2f2e7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1451,6 +1451,24 @@ struct devlink_ops { int (*port_function_hw_addr_set)(struct devlink_port *port, const u8 *hw_addr, int hw_addr_len, struct netlink_ext_ack *extack); + /** + * @port_fn_roce_get: Port function's roce get function. + * + * Query RoCE state of a function managed by the devlink port. + * Return -EOPNOTSUPP if port function RoCE handling is not supported. + */ + int (*port_fn_roce_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + /** + * @port_fn_roce_set: Port function's roce set function. + * + * Enable/Disable the RoCE state of a function managed by the devlink + * port. + * Return -EOPNOTSUPP if port function RoCE handling is not supported. + */ + int (*port_fn_roce_set)(struct devlink_port *devlink_port, + bool enable, struct netlink_ext_ack *extack); /** * port_new() - Add a new port function of a specified flavor * @devlink: Devlink instance diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 70191d96af89..6cc2925bd478 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -658,11 +658,21 @@ enum devlink_resource_unit { DEVLINK_RESOURCE_UNIT_ENTRY, }; +enum devlink_port_fn_attr_cap { + DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT, + + /* Add new caps above */ + __DEVLINK_PORT_FN_ATTR_CAPS_MAX, +}; + +#define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT) + enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, /* binary */ DEVLINK_PORT_FN_ATTR_STATE, /* u8 */ DEVLINK_PORT_FN_ATTR_OPSTATE, /* u8 */ + DEVLINK_PORT_FN_ATTR_CAPS, /* bitfield32 */ __DEVLINK_PORT_FUNCTION_ATTR_MAX, DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1 -- cgit v1.2.3 From 47d0c500d76c7b1d220c47cd875763fef04eba2e Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:16 +0200 Subject: net/mlx5: Add generic getters for other functions caps Downstream patch requires to get other function GENERAL2 caps while mlx5_vport_get_other_func_cap() gets only one type of caps (general). Rename it to represent this and introduce a generic implementation of mlx5_vport_get_other_func_cap(). Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Acked-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/vport.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index aad53cb72f17..7f31432f44c2 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -132,4 +132,6 @@ int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev); u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev); +int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out, + u16 opmod); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From a8ce7b26a51efc4d7753b23d639ae092878a6193 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:18 +0200 Subject: devlink: Expose port function commands to control migratable Expose port function commands to enable / disable migratable capability, this is used to set the port function as migratable. Live migration is the process of transferring a live virtual machine from one physical host to another without disrupting its normal operation. In order for a VM to be able to perform LM, all the VM components must be able to perform migration. e.g.: to be migratable. In order for VF to be migratable, VF must be bound to VFIO driver with migration support. When migratable capability is enabled for a function of the port, the device is making the necessary preparations for the function to be migratable, which might include disabling features which cannot be migrated. Example of LM with migratable function configuration: Set migratable of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable disable $ devlink port function set pci/0000:06:00.0/2 migratable enable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable enable Bind VF to VFIO driver with migration support: $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/unbind $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/bind Attach VF to the VM. Start the VM. Perform LM. Signed-off-by: Shay Drory Reviewed-by: Jiri Pirko Acked-by: Shannon Nelson Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 21 +++++++++++++++++++++ include/uapi/linux/devlink.h | 3 +++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index ce4c65d2f2e7..0f376a28b9c4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1469,6 +1469,27 @@ struct devlink_ops { */ int (*port_fn_roce_set)(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack); + /** + * @port_fn_migratable_get: Port function's migratable get function. + * + * Query migratable state of a function managed by the devlink port. + * Return -EOPNOTSUPP if port function migratable handling is not + * supported. + */ + int (*port_fn_migratable_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + /** + * @port_fn_migratable_set: Port function's migratable set function. + * + * Enable/Disable migratable state of a function managed by the devlink + * port. + * Return -EOPNOTSUPP if port function migratable handling is not + * supported. + */ + int (*port_fn_migratable_set)(struct devlink_port *devlink_port, + bool enable, + struct netlink_ext_ack *extack); /** * port_new() - Add a new port function of a specified flavor * @devlink: Devlink instance diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 6cc2925bd478..3782d4219ac9 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -660,12 +660,15 @@ enum devlink_resource_unit { enum devlink_port_fn_attr_cap { DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT, + DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT, /* Add new caps above */ __DEVLINK_PORT_FN_ATTR_CAPS_MAX, }; #define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT) +#define DEVLINK_PORT_FN_CAP_MIGRATABLE \ + _BITUL(DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT) enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, -- cgit v1.2.3 From f1543c7abab25d93bc8e9fae79b4cb3153ed6669 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 31 May 2022 13:55:28 +0300 Subject: net/mlx5: mlx5_ifc updates for MATCH_DEFINER general object Update full structure of match definer and add an ID of the SELECT match definer type. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 68 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2093131483c7..294cfe175c4b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -6108,6 +6108,38 @@ struct mlx5_ifc_match_definer_format_32_bits { u8 inner_dmac_15_0[0x10]; }; +enum { + MLX5_IFC_DEFINER_FORMAT_ID_SELECT = 61, +}; + +#define MLX5_IFC_DEFINER_FORMAT_OFFSET_UNUSED 0x0 +#define MLX5_IFC_DEFINER_FORMAT_OFFSET_OUTER_ETH_PKT_LEN 0x48 +#define MLX5_IFC_DEFINER_DW_SELECTORS_NUM 9 +#define MLX5_IFC_DEFINER_BYTE_SELECTORS_NUM 8 + +struct mlx5_ifc_match_definer_match_mask_bits { + u8 reserved_at_1c0[5][0x20]; + u8 match_dw_8[0x20]; + u8 match_dw_7[0x20]; + u8 match_dw_6[0x20]; + u8 match_dw_5[0x20]; + u8 match_dw_4[0x20]; + u8 match_dw_3[0x20]; + u8 match_dw_2[0x20]; + u8 match_dw_1[0x20]; + u8 match_dw_0[0x20]; + + u8 match_byte_7[0x8]; + u8 match_byte_6[0x8]; + u8 match_byte_5[0x8]; + u8 match_byte_4[0x8]; + + u8 match_byte_3[0x8]; + u8 match_byte_2[0x8]; + u8 match_byte_1[0x8]; + u8 match_byte_0[0x8]; +}; + struct mlx5_ifc_match_definer_bits { u8 modify_field_select[0x40]; @@ -6116,9 +6148,41 @@ struct mlx5_ifc_match_definer_bits { u8 reserved_at_80[0x10]; u8 format_id[0x10]; - u8 reserved_at_a0[0x160]; + u8 reserved_at_a0[0x60]; - u8 match_mask[16][0x20]; + u8 format_select_dw3[0x8]; + u8 format_select_dw2[0x8]; + u8 format_select_dw1[0x8]; + u8 format_select_dw0[0x8]; + + u8 format_select_dw7[0x8]; + u8 format_select_dw6[0x8]; + u8 format_select_dw5[0x8]; + u8 format_select_dw4[0x8]; + + u8 reserved_at_100[0x18]; + u8 format_select_dw8[0x8]; + + u8 reserved_at_120[0x20]; + + u8 format_select_byte3[0x8]; + u8 format_select_byte2[0x8]; + u8 format_select_byte1[0x8]; + u8 format_select_byte0[0x8]; + + u8 format_select_byte7[0x8]; + u8 format_select_byte6[0x8]; + u8 format_select_byte5[0x8]; + u8 format_select_byte4[0x8]; + + u8 reserved_at_180[0x40]; + + union { + struct { + u8 match_mask[16][0x20]; + }; + struct mlx5_ifc_match_definer_match_mask_bits match_mask_format; + }; }; struct mlx5_ifc_general_obj_in_cmd_hdr_bits { -- cgit v1.2.3 From 38bf24c38d19dc4ba5ec684ac2afa2f8e256db1e Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 15 Aug 2022 12:45:28 +0300 Subject: net/mlx5: fs, add match on ranges API Range is a new flow destination type which allows matching on a range of values instead of matching on a specific value. Range flow destination has the following fields: - hit_ft: flow table to forward the traffic in case of hit - miss_ft: flow table to forward the traffic in case of miss - field: which packet characteristic to match on - min: minimal value for the selected field - max: maximal value for the selected field Note: - In order to match, the value in the packet should meet the following criteria: min <= value < max - Currently, the only supported field type is L2 packet length Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index c7a91981cd5a..ba6958b49a8e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -50,6 +50,7 @@ enum mlx5_flow_destination_type { MLX5_FLOW_DESTINATION_TYPE_PORT, MLX5_FLOW_DESTINATION_TYPE_COUNTER, MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM, + MLX5_FLOW_DESTINATION_TYPE_RANGE, }; enum { @@ -143,6 +144,10 @@ enum { MLX5_FLOW_DEST_VPORT_REFORMAT_ID = BIT(1), }; +enum mlx5_flow_dest_range_field { + MLX5_FLOW_DEST_RANGE_FIELD_PKT_LEN = 0, +}; + struct mlx5_flow_destination { enum mlx5_flow_destination_type type; union { @@ -156,6 +161,13 @@ struct mlx5_flow_destination { struct mlx5_pkt_reformat *pkt_reformat; u8 flags; } vport; + struct { + struct mlx5_flow_table *hit_ft; + struct mlx5_flow_table *miss_ft; + enum mlx5_flow_dest_range_field field; + u32 min; + u32 max; + } range; u32 sampler_id; }; }; -- cgit v1.2.3 From 6b75bd3d036745b9be30917909f03602099adbdb Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:35 +0530 Subject: bpf: Refactor ARG_PTR_TO_DYNPTR checks into process_dynptr_func ARG_PTR_TO_DYNPTR is akin to ARG_PTR_TO_TIMER, ARG_PTR_TO_KPTR, where the underlying register type is subjected to more special checks to determine the type of object represented by the pointer and its state consistency. Move dynptr checks to their own 'process_dynptr_func' function so that is consistent and in-line with existing code. This also makes it easier to reuse this code for kfunc handling. Then, reuse this consolidated function in kfunc dynptr handling too. Note that for kfuncs, the arg_type constraint of DYNPTR_TYPE_LOCAL has been lifted. Acked-by: David Vernet Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 70d06a99f0b8..df0cb825e0e3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -615,11 +615,9 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, enum bpf_arg_type arg_type); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg); -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type); +struct bpf_call_arg_meta; +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, -- cgit v1.2.3 From 270605317366e4535d8d9fc3d9da1ad0fb3c9d45 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:37 +0530 Subject: bpf: Rework process_dynptr_func Recently, user ringbuf support introduced a PTR_TO_DYNPTR register type for use in callback state, because in case of user ringbuf helpers, there is no dynptr on the stack that is passed into the callback. To reflect such a state, a special register type was created. However, some checks have been bypassed incorrectly during the addition of this feature. First, for arg_type with MEM_UNINIT flag which initialize a dynptr, they must be rejected for such register type. Secondly, in the future, there are plans to add dynptr helpers that operate on the dynptr itself and may change its offset and other properties. In all of these cases, PTR_TO_DYNPTR shouldn't be allowed to be passed to such helpers, however the current code simply returns 0. The rejection for helpers that release the dynptr is already handled. For fixing this, we take a step back and rework existing code in a way that will allow fitting in all classes of helpers and have a coherent model for dealing with the variety of use cases in which dynptr is used. First, for ARG_PTR_TO_DYNPTR, it can either be set alone or together with a DYNPTR_TYPE_* constant that denotes the only type it accepts. Next, helpers which initialize a dynptr use MEM_UNINIT to indicate this fact. To make the distinction clear, use MEM_RDONLY flag to indicate that the helper only operates on the memory pointed to by the dynptr, not the dynptr itself. In C parlance, it would be equivalent to taking the dynptr as a point to const argument. When either of these flags are not present, the helper is allowed to mutate both the dynptr itself and also the memory it points to. Currently, the read only status of the memory is not tracked in the dynptr, but it would be trivial to add this support inside dynptr state of the register. With these changes and renaming PTR_TO_DYNPTR to CONST_PTR_TO_DYNPTR to better reflect its usage, it can no longer be passed to helpers that initialize a dynptr, i.e. bpf_dynptr_from_mem, bpf_ringbuf_reserve_dynptr. A note to reviewers is that in code that does mark_stack_slots_dynptr, and unmark_stack_slots_dynptr, we implicitly rely on the fact that PTR_TO_STACK reg is the only case that can reach that code path, as one cannot pass CONST_PTR_TO_DYNPTR to helpers that don't set MEM_RDONLY. In both cases such helpers won't be setting that flag. The next patch will add a couple of selftest cases to make sure this doesn't break. Fixes: 205715673844 ("bpf: Add bpf_user_ringbuf_drain() helper") Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- include/uapi/linux/bpf.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4920ac252754..3de24cfb7a3d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -775,7 +775,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ - PTR_TO_DYNPTR, /* reg points to a dynptr */ + CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -2828,7 +2828,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f89de51a45db..464ca3f01fe7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5293,7 +5293,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5303,7 +5303,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5313,7 +5313,7 @@ union bpf_attr { * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr or if *flags* is not 0. * - * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description * Get a pointer to the underlying dynptr data. * @@ -5414,7 +5414,7 @@ union bpf_attr { * Drain samples from the specified user ring buffer, and invoke * the provided callback for each such sample: * - * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); * * If **callback_fn** returns 0, the helper will continue to try * and drain the next sample, up to a maximum of -- cgit v1.2.3 From b534dc46c8ae0165b1b2509be24dbea4fa9c4011 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 7 Dec 2022 09:37:01 -0500 Subject: net_tstamp: add SOF_TIMESTAMPING_OPT_ID_TCP Add an option to initialize SOF_TIMESTAMPING_OPT_ID for TCP from write_seq sockets instead of snd_una. This should have been the behavior from the start. Because processes may now exist that rely on the established behavior, do not change behavior of the existing option, but add the right behavior with a new flag. It is encouraged to always set SOF_TIMESTAMPING_OPT_ID_TCP on stream sockets along with the existing SOF_TIMESTAMPING_OPT_ID. Intuitively the contract is that the counter is zero after the setsockopt, so that the next write N results in a notification for the last byte N - 1. On idle sockets snd_una == write_seq and this holds for both. But on sockets with data in transmission, snd_una records the unacked offset in the stream. This depends on the ACK response from the peer. A process cannot learn this in a race free manner (ioctl SIOCOUTQ is one racy approach). write_seq records the offset at the last byte written by the process. This is a better starting point. It matches the intuitive contract in all circumstances, unaffected by external behavior. The new timestamp flag necessitates increasing sk_tsflags to 32 bits. Move the field in struct sock to avoid growing the socket (for some common CONFIG variants). The UAPI interface so_timestamping.flags is already int, so 32 bits wide. Reported-by: Sotirios Delimanolis Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20221207143701.29861-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 6 +++--- include/uapi/linux/net_tstamp.h | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 6d207e7c4ad0..ecea3dcc2217 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -503,10 +503,10 @@ struct sock { #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif - u16 sk_tsflags; - u8 sk_shutdown; atomic_t sk_tskey; atomic_t sk_zckey; + u32 sk_tsflags; + u8 sk_shutdown; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, @@ -1899,7 +1899,7 @@ static inline void sock_replace_proto(struct sock *sk, struct proto *proto) struct sockcm_cookie { u64 transmit_time; u32 mark; - u16 tsflags; + u32 tsflags; }; static inline void sockcm_init(struct sockcm_cookie *sockc, diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 55501e5e7ac8..a2c66b3d7f0f 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -31,8 +31,9 @@ enum { SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13), SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14), SOF_TIMESTAMPING_BIND_PHC = (1 << 15), + SOF_TIMESTAMPING_OPT_ID_TCP = (1 << 16), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_BIND_PHC, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_ID_TCP, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; -- cgit v1.2.3 From 2a7d228f1ae78b6eabef5f18bd1a8d2280555628 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 6 Dec 2022 10:55:10 -0300 Subject: net/sched: move struct action_ops definition out of ifdef The type definition should be visible even in configurations not using CONFIG_NET_CLS_ACT. Signed-off-by: Pedro Tammela Reviewed-by: Jamal Hadi Salim Reviewed-by: Victor Nogueira Signed-off-by: David S. Miller --- include/net/act_api.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index c94ea1a306e0..2a6f443f0ef6 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -101,11 +101,6 @@ static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats) return hw_stats; } -#ifdef CONFIG_NET_CLS_ACT - -#define ACT_P_CREATED 1 -#define ACT_P_DELETED 1 - typedef void (*tc_action_priv_destructor)(void *priv); struct tc_action_ops { @@ -140,6 +135,11 @@ struct tc_action_ops { struct netlink_ext_ack *extack); }; +#ifdef CONFIG_NET_CLS_ACT + +#define ACT_P_CREATED 1 +#define ACT_P_DELETED 1 + struct tc_action_net { struct tcf_idrinfo *idrinfo; const struct tc_action_ops *ops; -- cgit v1.2.3 From 7f0e810220e2d985338ecdd907c1598404db251d Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 6 Dec 2022 10:55:11 -0300 Subject: net/sched: add retpoline wrapper for tc On kernels using retpoline as a spectrev2 mitigation, optimize actions and filters that are compiled as built-ins into a direct call. On subsequent patches we expose the classifiers and actions functions and wire up the wrapper into tc. Signed-off-by: Pedro Tammela Reviewed-by: Jamal Hadi Salim Reviewed-by: Victor Nogueira Signed-off-by: David S. Miller --- include/net/tc_wrapper.h | 251 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 include/net/tc_wrapper.h (limited to 'include') diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h new file mode 100644 index 000000000000..ceed2fc089ff --- /dev/null +++ b/include/net/tc_wrapper.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_TC_WRAPPER_H +#define __NET_TC_WRAPPER_H + +#include + +#if IS_ENABLED(CONFIG_RETPOLINE) + +#include +#include +#include + +#define TC_INDIRECT_SCOPE + +extern struct static_key_false tc_skip_wrapper; + +/* TC Actions */ +#ifdef CONFIG_NET_CLS_ACT + +#define TC_INDIRECT_ACTION_DECLARE(fname) \ + INDIRECT_CALLABLE_DECLARE(int fname(struct sk_buff *skb, \ + const struct tc_action *a, \ + struct tcf_result *res)) + +TC_INDIRECT_ACTION_DECLARE(tcf_bpf_act); +TC_INDIRECT_ACTION_DECLARE(tcf_connmark_act); +TC_INDIRECT_ACTION_DECLARE(tcf_csum_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ct_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ctinfo_act); +TC_INDIRECT_ACTION_DECLARE(tcf_gact_act); +TC_INDIRECT_ACTION_DECLARE(tcf_gate_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ife_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ipt_act); +TC_INDIRECT_ACTION_DECLARE(tcf_mirred_act); +TC_INDIRECT_ACTION_DECLARE(tcf_mpls_act); +TC_INDIRECT_ACTION_DECLARE(tcf_nat_act); +TC_INDIRECT_ACTION_DECLARE(tcf_pedit_act); +TC_INDIRECT_ACTION_DECLARE(tcf_police_act); +TC_INDIRECT_ACTION_DECLARE(tcf_sample_act); +TC_INDIRECT_ACTION_DECLARE(tcf_simp_act); +TC_INDIRECT_ACTION_DECLARE(tcf_skbedit_act); +TC_INDIRECT_ACTION_DECLARE(tcf_skbmod_act); +TC_INDIRECT_ACTION_DECLARE(tcf_vlan_act); +TC_INDIRECT_ACTION_DECLARE(tunnel_key_act); + +static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + if (static_branch_likely(&tc_skip_wrapper)) + goto skip; + +#if IS_BUILTIN(CONFIG_NET_ACT_GACT) + if (a->ops->act == tcf_gact_act) + return tcf_gact_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_MIRRED) + if (a->ops->act == tcf_mirred_act) + return tcf_mirred_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_PEDIT) + if (a->ops->act == tcf_pedit_act) + return tcf_pedit_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SKBEDIT) + if (a->ops->act == tcf_skbedit_act) + return tcf_skbedit_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SKBMOD) + if (a->ops->act == tcf_skbmod_act) + return tcf_skbmod_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_POLICE) + if (a->ops->act == tcf_police_act) + return tcf_police_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_BPF) + if (a->ops->act == tcf_bpf_act) + return tcf_bpf_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CONNMARK) + if (a->ops->act == tcf_connmark_act) + return tcf_connmark_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CSUM) + if (a->ops->act == tcf_csum_act) + return tcf_csum_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CT) + if (a->ops->act == tcf_ct_act) + return tcf_ct_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CTINFO) + if (a->ops->act == tcf_ctinfo_act) + return tcf_ctinfo_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_GATE) + if (a->ops->act == tcf_gate_act) + return tcf_gate_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_MPLS) + if (a->ops->act == tcf_mpls_act) + return tcf_mpls_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_NAT) + if (a->ops->act == tcf_nat_act) + return tcf_nat_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_TUNNEL_KEY) + if (a->ops->act == tunnel_key_act) + return tunnel_key_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_VLAN) + if (a->ops->act == tcf_vlan_act) + return tcf_vlan_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_IFE) + if (a->ops->act == tcf_ife_act) + return tcf_ife_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_IPT) + if (a->ops->act == tcf_ipt_act) + return tcf_ipt_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SIMP) + if (a->ops->act == tcf_simp_act) + return tcf_simp_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SAMPLE) + if (a->ops->act == tcf_sample_act) + return tcf_sample_act(skb, a, res); +#endif + +skip: + return a->ops->act(skb, a, res); +} + +#endif /* CONFIG_NET_CLS_ACT */ + +/* TC Filters */ +#ifdef CONFIG_NET_CLS + +#define TC_INDIRECT_FILTER_DECLARE(fname) \ + INDIRECT_CALLABLE_DECLARE(int fname(struct sk_buff *skb, \ + const struct tcf_proto *tp, \ + struct tcf_result *res)) + +TC_INDIRECT_FILTER_DECLARE(basic_classify); +TC_INDIRECT_FILTER_DECLARE(cls_bpf_classify); +TC_INDIRECT_FILTER_DECLARE(cls_cgroup_classify); +TC_INDIRECT_FILTER_DECLARE(fl_classify); +TC_INDIRECT_FILTER_DECLARE(flow_classify); +TC_INDIRECT_FILTER_DECLARE(fw_classify); +TC_INDIRECT_FILTER_DECLARE(mall_classify); +TC_INDIRECT_FILTER_DECLARE(route4_classify); +TC_INDIRECT_FILTER_DECLARE(rsvp_classify); +TC_INDIRECT_FILTER_DECLARE(rsvp6_classify); +TC_INDIRECT_FILTER_DECLARE(tcindex_classify); +TC_INDIRECT_FILTER_DECLARE(u32_classify); + +static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + if (static_branch_likely(&tc_skip_wrapper)) + goto skip; + +#if IS_BUILTIN(CONFIG_NET_CLS_BPF) + if (tp->classify == cls_bpf_classify) + return cls_bpf_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_U32) + if (tp->classify == u32_classify) + return u32_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_FLOWER) + if (tp->classify == fl_classify) + return fl_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_FW) + if (tp->classify == fw_classify) + return fw_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_MATCHALL) + if (tp->classify == mall_classify) + return mall_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_BASIC) + if (tp->classify == basic_classify) + return basic_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_CGROUP) + if (tp->classify == cls_cgroup_classify) + return cls_cgroup_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_FLOW) + if (tp->classify == flow_classify) + return flow_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_ROUTE4) + if (tp->classify == route4_classify) + return route4_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_RSVP) + if (tp->classify == rsvp_classify) + return rsvp_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_RSVP6) + if (tp->classify == rsvp6_classify) + return rsvp6_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_TCINDEX) + if (tp->classify == tcindex_classify) + return tcindex_classify(skb, tp, res); +#endif + +skip: + return tp->classify(skb, tp, res); +} + +static inline void tc_wrapper_init(void) +{ +#ifdef CONFIG_X86 + if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + static_branch_enable(&tc_skip_wrapper); +#endif +} + +#endif /* CONFIG_NET_CLS */ + +#else + +#define TC_INDIRECT_SCOPE static + +static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + return a->ops->act(skb, a, res); +} + +static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + return tp->classify(skb, tp, res); +} + +static inline void tc_wrapper_init(void) +{ +} + +#endif + +#endif /* __NET_TC_WRAPPER_H */ -- cgit v1.2.3 From e47877c7aa821c413b45e05f804819579bdfa1a3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Dec 2022 11:36:32 -1000 Subject: rhashtable: Allow rhashtable to be used from irq-safe contexts rhashtable currently only does bh-safe synchronization making it impossible to use from irq-safe contexts. Switch it to use irq-safe synchronization to remove the restriction. v2: Update the lock functions to return the ulong flags value and unlock functions to take the value directly instead of passing around the pointer. Suggested by Linus. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Linus Torvalds Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 61 +++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 68dab3e08aad..5b5357c0bd8c 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -323,29 +323,36 @@ static inline struct rhash_lock_head __rcu **rht_bucket_insert( * When we write to a bucket without unlocking, we use rht_assign_locked(). */ -static inline void rht_lock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt) +static inline unsigned long rht_lock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt) { - local_bh_disable(); + unsigned long flags; + + local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); + return flags; } -static inline void rht_lock_nested(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bucket, - unsigned int subclass) +static inline unsigned long rht_lock_nested(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bucket, + unsigned int subclass) { - local_bh_disable(); + unsigned long flags; + + local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); + return flags; } static inline void rht_unlock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt) + struct rhash_lock_head __rcu **bkt, + unsigned long flags) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); - local_bh_enable(); + local_irq_restore(flags); } static inline struct rhash_head *__rht_ptr( @@ -393,7 +400,8 @@ static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, static inline void rht_assign_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, - struct rhash_head *obj) + struct rhash_head *obj, + unsigned long flags) { if (rht_is_a_nulls(obj)) obj = NULL; @@ -401,7 +409,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(bitlock); - local_bh_enable(); + local_irq_restore(flags); } /** @@ -706,6 +714,7 @@ static inline void *__rhashtable_insert_fast( struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; + unsigned long flags; unsigned int hash; int elasticity; void *data; @@ -720,11 +729,11 @@ static inline void *__rhashtable_insert_fast( if (!bkt) goto out; pprev = NULL; - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } @@ -756,9 +765,9 @@ slow_path: RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } else - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, flags); data = NULL; goto out; } @@ -785,7 +794,7 @@ slow_path: } atomic_inc(&ht->nelems); - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); @@ -797,7 +806,7 @@ out: return data; out_unlock: - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); goto out; } @@ -991,6 +1000,7 @@ static inline int __rhashtable_remove_fast_one( struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; + unsigned long flags; unsigned int hash; int err = -ENOENT; @@ -999,7 +1009,7 @@ static inline int __rhashtable_remove_fast_one( if (!bkt) return -ENOENT; pprev = NULL; - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; @@ -1043,14 +1053,14 @@ static inline int __rhashtable_remove_fast_one( if (pprev) { rcu_assign_pointer(*pprev, obj); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } else { - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, flags); } goto unlocked; } - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); unlocked: if (err > 0) { atomic_dec(&ht->nelems); @@ -1143,6 +1153,7 @@ static inline int __rhashtable_replace_fast( struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; + unsigned long flags; unsigned int hash; int err = -ENOENT; @@ -1158,7 +1169,7 @@ static inline int __rhashtable_replace_fast( return -ENOENT; pprev = NULL; - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { @@ -1169,15 +1180,15 @@ static inline int __rhashtable_replace_fast( rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } else { - rht_assign_unlock(tbl, bkt, obj_new); + rht_assign_unlock(tbl, bkt, obj_new, flags); } err = 0; goto unlocked; } - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); unlocked: return err; -- cgit v1.2.3 From 1933ea365aa7a48ce26bea2ea09c9f7cc48cc668 Mon Sep 17 00:00:00 2001 From: wangchuanlei Date: Tue, 6 Dec 2022 20:38:57 -0500 Subject: net: openvswitch: Add support to count upcall packets Add support to count upall packets, when kmod of openvswitch upcall to count the number of packets for upcall succeed and failed, which is a better way to see how many packets upcalled on every interfaces. Signed-off-by: wangchuanlei Acked-by: Eelco Chaudron Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 94066f87e9ee..c5d62ee82567 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -277,11 +277,25 @@ enum ovs_vport_attr { OVS_VPORT_ATTR_PAD, OVS_VPORT_ATTR_IFINDEX, OVS_VPORT_ATTR_NETNSID, + OVS_VPORT_ATTR_UPCALL_STATS, __OVS_VPORT_ATTR_MAX }; #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1) +/** + * enum ovs_vport_upcall_attr - attributes for %OVS_VPORT_UPCALL* commands + * @OVS_VPORT_UPCALL_SUCCESS: 64-bit upcall success packets. + * @OVS_VPORT_UPCALL_FAIL: 64-bit upcall fail packets. + */ +enum ovs_vport_upcall_attr { + OVS_VPORT_UPCALL_ATTR_SUCCESS, + OVS_VPORT_UPCALL_ATTR_FAIL, + __OVS_VPORT_UPCALL_ATTR_MAX +}; + +#define OVS_VPORT_UPCALL_ATTR_MAX (__OVS_VPORT_UPCALL_ATTR_MAX - 1) + enum { OVS_VXLAN_EXT_UNSPEC, OVS_VXLAN_EXT_GBP, /* Flag or __u32 */ -- cgit v1.2.3 From ce098da1497c6dee9589fce2c61d1910f4fcf0e7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Dec 2022 22:02:59 -0800 Subject: skbuff: Introduce slab_build_skb() syzkaller reported: BUG: KASAN: slab-out-of-bounds in __build_skb_around+0x235/0x340 net/core/skbuff.c:294 Write of size 32 at addr ffff88802aa172c0 by task syz-executor413/5295 For bpf_prog_test_run_skb(), which uses a kmalloc()ed buffer passed to build_skb(). When build_skb() is passed a frag_size of 0, it means the buffer came from kmalloc. In these cases, ksize() is used to find its actual size, but since the allocation may not have been made to that size, actually perform the krealloc() call so that all the associated buffer size checking will be correctly notified (and use the "new" pointer so that compiler hinting works correctly). Split this logic out into a new interface, slab_build_skb(), but leave the original 0 checking for now to catch any stragglers. Reported-by: syzbot+fda18eaa8c12534ccb3b@syzkaller.appspotmail.com Link: https://groups.google.com/g/syzkaller-bugs/c/UnIKxTtU5-0/m/-wbXinkgAQAJ Fixes: 38931d8989b5 ("mm: Make ksize() a reporting-only function") Cc: Pavel Begunkov Cc: pepsipu Cc: syzbot+fda18eaa8c12534ccb3b@syzkaller.appspotmail.com Cc: Vlastimil Babka Cc: kasan-dev Cc: Andrii Nakryiko Cc: ast@kernel.org Cc: Daniel Borkmann Cc: Hao Luo Cc: Jesper Dangaard Brouer Cc: John Fastabend Cc: jolsa@kernel.org Cc: KP Singh Cc: martin.lau@linux.dev Cc: Stanislav Fomichev Cc: song@kernel.org Cc: Yonghong Song Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221208060256.give.994-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4e464a27adaf..4c8492401a10 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1255,6 +1255,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); +struct sk_buff *slab_build_skb(void *data); /** * alloc_skb - allocate a network buffer -- cgit v1.2.3 From 5dd9cdbc9dec3e99b19e483767e247d15ca8cc0d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 9 Dec 2022 15:57:29 +0200 Subject: bpf: states_equal() must build idmap for all function frames verifier.c:states_equal() must maintain register ID mapping across all function frames. Otherwise the following example might be erroneously marked as safe: main: fp[-24] = map_lookup_elem(...) ; frame[0].fp[-24].id == 1 fp[-32] = map_lookup_elem(...) ; frame[0].fp[-32].id == 2 r1 = &fp[-24] r2 = &fp[-32] call foo() r0 = 0 exit foo: 0: r9 = r1 1: r8 = r2 2: r7 = ktime_get_ns() 3: r6 = ktime_get_ns() 4: if (r6 > r7) goto skip_assign 5: r9 = r8 skip_assign: ; <--- checkpoint 6: r9 = *r9 ; (a) frame[1].r9.id == 2 ; (b) frame[1].r9.id == 1 7: if r9 == 0 goto exit: ; mark_ptr_or_null_regs() transfers != 0 info ; for all regs sharing ID: ; (a) r9 != 0 => &frame[0].fp[-32] != 0 ; (b) r9 != 0 => &frame[0].fp[-24] != 0 8: r8 = *r8 ; (a) r8 == &frame[0].fp[-32] ; (b) r8 == &frame[0].fp[-32] 9: r0 = *r8 ; (a) safe ; (b) unsafe exit: 10: exit While processing call to foo() verifier considers the following execution paths: (a) 0-10 (b) 0-4,6-10 (There is also path 0-7,10 but it is not interesting for the issue at hand. (a) is verified first.) Suppose that checkpoint is created at (6) when path (a) is verified, next path (b) is verified and (6) is reached. If states_equal() maintains separate 'idmap' for each frame the mapping at (6) for frame[1] would be empty and regsafe(r9)::check_ids() would add a pair 2->1 and return true, which is an error. If states_equal() maintains single 'idmap' for all frames the mapping at (6) would be { 1->1, 2->2 } and regsafe(r9)::check_ids() would return false when trying to add a pair 2->1. This issue was suggested in the following discussion: https://lore.kernel.org/bpf/CAEf4BzbFB5g4oUfyxk9rHy-PJSLQ3h8q9mV=rVoXfr_JVm8+1Q@mail.gmail.com/ Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20221209135733.28851-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index df0cb825e0e3..53d175cbaa02 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -273,9 +273,9 @@ struct bpf_id_pair { u32 cur; }; -/* Maximum number of register states that can exist at once */ -#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) #define MAX_CALL_FRAMES 8 +/* Maximum number of register states that can exist at once */ +#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; -- cgit v1.2.3 From 5df7d714d8cbcce7642936cc0f6532f0c4c3d197 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:45:59 +0200 Subject: ipvs: add rcu protection to stats In preparation to using RCU locking for the list with estimators, make sure the struct ip_vs_stats are released after RCU grace period by using RCU callbacks. This affects ipvs->tot_stats where we can not use RCU callbacks for ipvs, so we use allocated struct ip_vs_stats_rcu. For services and dests we force RCU callbacks for all cases. Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ff1804a0c469..bd8ae137e43b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -405,6 +405,11 @@ struct ip_vs_stats { struct ip_vs_kstats kstats0; /* reset values */ }; +struct ip_vs_stats_rcu { + struct ip_vs_stats s; + struct rcu_head rcu_head; +}; + struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -688,6 +693,7 @@ struct ip_vs_dest { union nf_inet_addr vaddr; /* virtual IP address */ __u32 vfwmark; /* firewall mark of service */ + struct rcu_head rcu_head; struct list_head t_list; /* in dest_trash */ unsigned int in_rs_table:1; /* we are in rs_table */ }; @@ -869,7 +875,7 @@ struct netns_ipvs { atomic_t conn_count; /* connection counter */ /* ip_vs_ctl */ - struct ip_vs_stats tot_stats; /* Statistics & est. */ + struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */ int num_services; /* no of virtual services */ int num_services6; /* IPv6 virtual services */ -- cgit v1.2.3 From de39afb3d811ba2c028de8662adafedb4899327b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:00 +0200 Subject: ipvs: use common functions for stats allocation Move alloc_percpu/free_percpu logic in new functions Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index bd8ae137e43b..e5582c01a4a3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -410,6 +410,11 @@ struct ip_vs_stats_rcu { struct rcu_head rcu_head; }; +int ip_vs_stats_init_alloc(struct ip_vs_stats *s); +struct ip_vs_stats *ip_vs_stats_alloc(void); +void ip_vs_stats_release(struct ip_vs_stats *stats); +void ip_vs_stats_free(struct ip_vs_stats *stats); + struct dst_entry; struct iphdr; struct ip_vs_conn; -- cgit v1.2.3 From 1dbd8d9a82e3f26b9d063292d47ece673f48fce2 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:01 +0200 Subject: ipvs: use u64_stats_t for the per-cpu counters Use the provided u64_stats_t type to avoid load/store tearing. Fixes: 316580b69d0a ("u64_stats: provide u64_stats_t type") Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Tested-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index e5582c01a4a3..a4d44138c2a8 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -351,11 +351,11 @@ struct ip_vs_seq { /* counters per cpu */ struct ip_vs_counters { - __u64 conns; /* connections scheduled */ - __u64 inpkts; /* incoming packets */ - __u64 outpkts; /* outgoing packets */ - __u64 inbytes; /* incoming bytes */ - __u64 outbytes; /* outgoing bytes */ + u64_stats_t conns; /* connections scheduled */ + u64_stats_t inpkts; /* incoming packets */ + u64_stats_t outpkts; /* outgoing packets */ + u64_stats_t inbytes; /* incoming bytes */ + u64_stats_t outbytes; /* outgoing bytes */ }; /* Stats per cpu */ struct ip_vs_cpu_stats { -- cgit v1.2.3 From 705dd34440812735ece298eb5bc153fde9544d42 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:02 +0200 Subject: ipvs: use kthreads for stats estimation Estimating all entries in single list in timer context by single CPU causes large latency with multiple IPVS rules as reported in [1], [2], [3]. Spread the estimator structures in multiple chains and use kthread(s) for the estimation. The chains are processed in multiple (50) timer ticks to ensure the 2-second interval between estimations with some accuracy. Every chain is processed under RCU lock. Every kthread works over its own data structure and all such contexts are attached to array. The contexts can be preserved while the kthread tasks are stopped or restarted. When estimators are removed, unused kthread contexts are released and the slots in array are left empty. First kthread determines parameters to use, eg. maximum number of estimators to process per kthread based on chain's length (chain_max), allowing sub-100us cond_resched rate and estimation taking up to 1/8 of the CPU capacity to avoid any problems if chain_max is not correctly calculated. chain_max is calculated taking into account factors such as CPU speed and memory/cache speed where the cache_factor (4) is selected from real tests with current generation of CPU/NUMA configurations to correct the difference in CPU usage between cached (during calc phase) and non-cached (working) state of the estimated per-cpu data. First kthread also plays the role of distributor of added estimators to all kthreads, keeping low the time to add estimators. The optimization is based on the fact that newly added estimator should be estimated after 2 seconds, so we have the time to offload the adding to chain from controlling process to kthread 0. The allocated kthread context may grow from 1 to 50 allocated structures for timer ticks which saves memory for setups with small number of estimators. We also add delayed work est_reload_work that will make sure the kthread tasks are properly started/stopped. ip_vs_start_estimator() is changed to report errors which allows to safely store the estimators in allocated structures. Many thanks to Jiri Wiesner for his valuable comments and for spending a lot of time reviewing and testing the changes on different platforms with 48-256 CPUs and 1-8 NUMA nodes under different cpufreq governors. [1] Report from Yunhong Jiang: https://lore.kernel.org/netdev/D25792C1-1B89-45DE-9F10-EC350DC04ADC@gmail.com/ [2] https://marc.info/?l=linux-virtual-server&m=159679809118027&w=2 [3] Report from Dust: https://archive.linuxvirtualserver.org/html/lvs-devel/2020-12/msg00000.html Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Tested-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a4d44138c2a8..04960dc6228f 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -42,6 +42,8 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; +extern struct mutex __ip_vs_mutex; + struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ @@ -365,7 +367,7 @@ struct ip_vs_cpu_stats { /* IPVS statistics objects */ struct ip_vs_estimator { - struct list_head list; + struct hlist_node list; u64 last_inbytes; u64 last_outbytes; @@ -378,6 +380,10 @@ struct ip_vs_estimator { u64 outpps; u64 inbps; u64 outbps; + + s32 ktid:16, /* kthread ID, -1=temp list */ + ktrow:8, /* row/tick ID for kthread */ + ktcid:8; /* chain ID for kthread tick */ }; /* @@ -415,6 +421,66 @@ struct ip_vs_stats *ip_vs_stats_alloc(void); void ip_vs_stats_release(struct ip_vs_stats *stats); void ip_vs_stats_free(struct ip_vs_stats *stats); +/* Process estimators in multiple timer ticks (20/50/100, see ktrow) */ +#define IPVS_EST_NTICKS 50 +/* Estimation uses a 2-second period containing ticks (in jiffies) */ +#define IPVS_EST_TICK ((2 * HZ) / IPVS_EST_NTICKS) + +/* Limit of CPU load per kthread (8 for 12.5%), ratio of CPU capacity (1/C). + * Value of 4 and above ensures kthreads will take work without exceeding + * the CPU capacity under different circumstances. + */ +#define IPVS_EST_LOAD_DIVISOR 8 + +/* Kthreads should not have work that exceeds the CPU load above 50% */ +#define IPVS_EST_CPU_KTHREADS (IPVS_EST_LOAD_DIVISOR / 2) + +/* Desired number of chains per timer tick (chain load factor in 100us units), + * 48=4.8ms of 40ms tick (12% CPU usage): + * 2 sec * 1000 ms in sec * 10 (100us in ms) / 8 (12.5%) / 50 + */ +#define IPVS_EST_CHAIN_FACTOR \ + ALIGN_DOWN(2 * 1000 * 10 / IPVS_EST_LOAD_DIVISOR / IPVS_EST_NTICKS, 8) + +/* Compiled number of chains per tick + * The defines should match cond_resched_rcu + */ +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) +#define IPVS_EST_TICK_CHAINS IPVS_EST_CHAIN_FACTOR +#else +#define IPVS_EST_TICK_CHAINS 1 +#endif + +#if IPVS_EST_NTICKS > 127 +#error Too many timer ticks for ktrow +#endif + +/* Multiple chains processed in same tick */ +struct ip_vs_est_tick_data { + struct hlist_head chains[IPVS_EST_TICK_CHAINS]; + DECLARE_BITMAP(present, IPVS_EST_TICK_CHAINS); + DECLARE_BITMAP(full, IPVS_EST_TICK_CHAINS); + int chain_len[IPVS_EST_TICK_CHAINS]; +}; + +/* Context for estimation kthread */ +struct ip_vs_est_kt_data { + struct netns_ipvs *ipvs; + struct task_struct *task; /* task if running */ + struct ip_vs_est_tick_data __rcu *ticks[IPVS_EST_NTICKS]; + DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */ + unsigned long est_timer; /* estimation timer (jiffies) */ + struct ip_vs_stats *calc_stats; /* Used for calculation */ + int tick_len[IPVS_EST_NTICKS]; /* est count */ + int id; /* ktid per netns */ + int chain_max; /* max ests per tick chain */ + int tick_max; /* max ests per tick */ + int est_count; /* attached ests to kthread */ + int est_max_count; /* max ests per kthread */ + int add_row; /* row for new ests */ + int est_row; /* estimated row */ +}; + struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -953,9 +1019,17 @@ struct netns_ipvs { struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; /* ip_vs_est */ - struct list_head est_list; /* estimator list */ - spinlock_t est_lock; - struct timer_list est_timer; /* Estimation timer */ + struct delayed_work est_reload_work;/* Reload kthread tasks */ + struct mutex est_mutex; /* protect kthread tasks */ + struct hlist_head est_temp_list; /* Ests during calc phase */ + struct ip_vs_est_kt_data **est_kt_arr; /* Array of kthread data ptrs */ + unsigned long est_max_threads;/* Hard limit of kthreads */ + int est_calc_phase; /* Calculation phase */ + int est_chain_max; /* Calculated chain_max */ + int est_kt_count; /* Allocated ptrs */ + int est_add_ktid; /* ktid where to add ests */ + atomic_t est_genid; /* kthreads reload genid */ + atomic_t est_genid_done; /* applied genid */ /* ip_vs_sync */ spinlock_t sync_lock; struct ipvs_master_sync_state *ms; @@ -1486,10 +1560,14 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state); void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts); /* IPVS rate estimator prototypes (from ip_vs_est.c) */ -void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); +int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); +void ip_vs_est_reload_start(struct netns_ipvs *ipvs); +int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd); +void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, -- cgit v1.2.3 From f0be83d5421718ead31707b6ece34cf77d411c00 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:03 +0200 Subject: ipvs: add est_cpulist and est_nice sysctl vars Allow the kthreads for stats to be configured for specific cpulist (isolation) and niceness (scheduling priority). Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 04960dc6228f..dc51b5497cf7 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -29,6 +29,7 @@ #include #endif #include /* Netw namespace */ +#include #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 @@ -365,6 +366,9 @@ struct ip_vs_cpu_stats { struct u64_stats_sync syncp; }; +/* Default nice for estimator kthreads */ +#define IPVS_EST_NICE 0 + /* IPVS statistics objects */ struct ip_vs_estimator { struct hlist_node list; @@ -1009,6 +1013,12 @@ struct netns_ipvs { int sysctl_schedule_icmp; int sysctl_ignore_tunneled; int sysctl_run_estimation; +#ifdef CONFIG_SYSCTL + cpumask_var_t sysctl_est_cpulist; /* kthread cpumask */ + int est_cpulist_valid; /* cpulist set */ + int sysctl_est_nice; /* kthread nice */ + int est_stopped; /* stop tasks */ +#endif /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -1162,6 +1172,19 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return ipvs->sysctl_run_estimation; } +static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +{ + if (ipvs->est_cpulist_valid) + return ipvs->sysctl_est_cpulist; + else + return housekeeping_cpumask(HK_TYPE_KTHREAD); +} + +static inline int sysctl_est_nice(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_est_nice; +} + #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) @@ -1259,6 +1282,16 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return 1; } +static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +{ + return housekeeping_cpumask(HK_TYPE_KTHREAD); +} + +static inline int sysctl_est_nice(struct netns_ipvs *ipvs) +{ + return IPVS_EST_NICE; +} + #endif /* IPVS core functions @@ -1569,6 +1602,31 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd); void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); +static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + ipvs->est_stopped = ipvs->est_cpulist_valid && + cpumask_empty(sysctl_est_cpulist(ipvs)); +#endif +} + +static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + return ipvs->est_stopped; +#else + return false; +#endif +} + +static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs) +{ + unsigned int limit = IPVS_EST_CPU_KTHREADS * + cpumask_weight(sysctl_est_cpulist(ipvs)); + + return max(1U, limit); +} + /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); -- cgit v1.2.3 From 144361c1949f227df9244302da02c258a363b674 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:04 +0200 Subject: ipvs: run_estimation should control the kthread tasks Change the run_estimation flag to start/stop the kthread tasks. Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index dc51b5497cf7..c6c61100d244 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1605,8 +1605,10 @@ void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL - ipvs->est_stopped = ipvs->est_cpulist_valid && - cpumask_empty(sysctl_est_cpulist(ipvs)); + /* Stop tasks while cpulist is empty or if disabled with flag */ + ipvs->est_stopped = !sysctl_run_estimation(ipvs) || + (ipvs->est_cpulist_valid && + cpumask_empty(sysctl_est_cpulist(ipvs))); #endif } -- cgit v1.2.3 From b22bbdd17a5af2cc99cf42048970b9355dcbec8b Mon Sep 17 00:00:00 2001 From: Andrew Melnychenko Date: Wed, 7 Dec 2022 13:35:54 +0200 Subject: uapi/linux/if_tun.h: Added new offload types for USO4/6. Added 2 additional offlloads for USO(IPv4 & IPv6). Separate offloads are required for Windows VM guests, g.e. Windows may set USO rx only for IPv4. Signed-off-by: Andrew Melnychenko Signed-off-by: David S. Miller --- include/uapi/linux/if_tun.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index b6d7b868f290..287cdc81c939 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -90,6 +90,8 @@ #define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */ #define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */ #define TUN_F_UFO 0x10 /* I can handle UFO packets */ +#define TUN_F_USO4 0x20 /* I can handle USO for IPv4 packets */ +#define TUN_F_USO6 0x40 /* I can handle USO for IPv6 packets */ /* Protocol info prepended to the packets (when IFF_NO_PI is not set) */ #define TUN_PKT_STRIP 0x0001 -- cgit v1.2.3 From 34061b348ae912b9783fae7fabae69d46977a036 Mon Sep 17 00:00:00 2001 From: Andrew Melnychenko Date: Wed, 7 Dec 2022 13:35:56 +0200 Subject: uapi/linux/virtio_net.h: Added USO types. Added new GSO type for USO: VIRTIO_NET_HDR_GSO_UDP_L4. Feature VIRTIO_NET_F_HOST_USO allows to enable NETIF_F_GSO_UDP_L4. Separated VIRTIO_NET_F_GUEST_USO4 & VIRTIO_NET_F_GUEST_USO6 features required for Windows guests. Signed-off-by: Andrew Melnychenko Signed-off-by: David S. Miller --- include/uapi/linux/virtio_net.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 6cb842ea8979..b4062bed186a 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -57,6 +57,9 @@ * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ #define VIRTIO_NET_F_NOTF_COAL 53 /* Device supports notifications coalescing */ +#define VIRTIO_NET_F_GUEST_USO4 54 /* Guest can handle USOv4 in. */ +#define VIRTIO_NET_F_GUEST_USO6 55 /* Guest can handle USOv6 in. */ +#define VIRTIO_NET_F_HOST_USO 56 /* Host can handle USO in. */ #define VIRTIO_NET_F_HASH_REPORT 57 /* Supports hash report */ #define VIRTIO_NET_F_RSS 60 /* Supports RSS RX steering */ #define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ @@ -130,6 +133,7 @@ struct virtio_net_hdr_v1 { #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ #define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ #define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ +#define VIRTIO_NET_HDR_GSO_UDP_L4 5 /* GSO frame, IPv4& IPv6 UDP (USO) */ #define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ __u8 gso_type; __virtio16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ -- cgit v1.2.3 From 860b7f27b8f78564ca5a2f607e0820b2d352a562 Mon Sep 17 00:00:00 2001 From: Andrew Melnychenko Date: Wed, 7 Dec 2022 13:35:57 +0200 Subject: linux/virtio_net.h: Support USO offload in vnet header. Now, it's possible to convert USO vnet packets from/to skb. Added support for GSO_UDP_L4 offload. Signed-off-by: Andrew Melnychenko Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index a960de68ac69..bdf8de2cdd93 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -15,6 +15,7 @@ static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type) case VIRTIO_NET_HDR_GSO_TCPV6: return protocol == cpu_to_be16(ETH_P_IPV6); case VIRTIO_NET_HDR_GSO_UDP: + case VIRTIO_NET_HDR_GSO_UDP_L4: return protocol == cpu_to_be16(ETH_P_IP) || protocol == cpu_to_be16(ETH_P_IPV6); default: @@ -31,6 +32,7 @@ static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_UDP: + case VIRTIO_NET_HDR_GSO_UDP_L4: skb->protocol = cpu_to_be16(ETH_P_IP); break; case VIRTIO_NET_HDR_GSO_TCPV6: @@ -69,6 +71,11 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, ip_proto = IPPROTO_UDP; thlen = sizeof(struct udphdr); break; + case VIRTIO_NET_HDR_GSO_UDP_L4: + gso_type = SKB_GSO_UDP_L4; + ip_proto = IPPROTO_UDP; + thlen = sizeof(struct udphdr); + break; default: return -EINVAL; } @@ -182,6 +189,8 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (sinfo->gso_type & SKB_GSO_UDP_L4) + hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4; else return -EINVAL; if (sinfo->gso_type & SKB_GSO_TCP_ECN) -- cgit v1.2.3 From ebddb1404900657b7f03a56ee4c34a9d218c4030 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 8 Dec 2022 11:56:12 -0500 Subject: net: move the nat function to nf_nat_ovs for ovs and tc There are two nat functions are nearly the same in both OVS and TC code, (ovs_)ct_nat_execute() and ovs_ct_nat/tcf_ct_act_nat(). This patch creates nf_nat_ovs.c under netfilter and moves them there then exports nf_ct_nat() so that it can be shared by both OVS and TC, and keeps the nat (type) check and nat flag update in OVS and TC's own place, as these parts are different between OVS and TC. Note that in OVS nat function it was using skb->protocol to get the proto as it already skips vlans in key_extract(), while it doesn't in TC, and TC has to call skb_protocol() to get proto. So in nf_ct_nat_execute(), we keep using skb_protocol() which works for both OVS and TC contrack. Signed-off-by: Xin Long Acked-by: Aaron Conole Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_nat.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index e9eb01e99d2f..9877f064548a 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -104,6 +104,10 @@ unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); +int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int *action, + const struct nf_nat_range2 *range, bool commit); + static inline int nf_nat_initialized(const struct nf_conn *ct, enum nf_nat_manip_type manip) { -- cgit v1.2.3 From 983055bf839745ec2812fc55cacd96888aa0d7a6 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 1 Dec 2022 02:46:54 +0900 Subject: USB: core: export usb_cache_string() usb_cache_string() can also be useful for the drivers so export it. Signed-off-by: Vincent Mailhol Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/all/20221130174658.29282-4-mailhol.vincent@wanadoo.fr Signed-off-by: Marc Kleine-Budde --- include/linux/usb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/usb.h b/include/linux/usb.h index 9ff1ad4dfad1..d2d2f41052c0 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1829,6 +1829,7 @@ static inline int usb_get_ptm_status(struct usb_device *dev, void *data) extern int usb_string(struct usb_device *dev, int index, char *buf, size_t size); +extern char *usb_cache_string(struct usb_device *udev, int index); /* wrappers that also update important state inside usbcore */ extern int usb_clear_halt(struct usb_device *dev, int pipe); -- cgit v1.2.3 From 01d80532295cd359dc43e6bd71860d5515f84372 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 1 Dec 2022 02:46:55 +0900 Subject: net: devlink: add DEVLINK_INFO_VERSION_GENERIC_FW_BOOTLOADER As discussed in [1], abbreviating the bootloader to "bl" might not be well understood. Instead, a bootloader technically being a firmware, name it "fw.bootloader". Add a new macro to devlink.h to formalize this new info attribute name and update the documentation. [1] https://lore.kernel.org/netdev/20221128142723.2f826d20@kernel.org/ Suggested-by: Jakub Kicinski Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/all/20221130174658.29282-5-mailhol.vincent@wanadoo.fr Signed-off-by: Marc Kleine-Budde --- include/net/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 0f376a28b9c4..6a2e4f21779f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -621,6 +621,8 @@ enum devlink_param_generic_id { #define DEVLINK_INFO_VERSION_GENERIC_FW_ROCE "fw.roce" /* Firmware bundle identifier */ #define DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID "fw.bundle_id" +/* Bootloader */ +#define DEVLINK_INFO_VERSION_GENERIC_FW_BOOTLOADER "fw.bootloader" /** * struct devlink_flash_update_params - Flash Update parameters -- cgit v1.2.3 From 3fff88186f047627bb128d65155f42517f8e448f Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 9 Dec 2022 16:28:08 -0800 Subject: mptcp: remove MPTCP 'ifdef' in TCP SYN cookies To ease the maintenance, it is often recommended to avoid having #ifdef preprocessor conditions. Here the section related to CONFIG_MPTCP was quite short but the next commit needs to add more code around. It is then cleaner to move specific MPTCP code to functions located in net/mptcp directory. Now that mptcp_subflow_request_sock_ops structure can be static, it can also be marked as "read only after init". Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 412479ebf5ad..3c5c68618fcc 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -97,8 +97,6 @@ struct mptcp_out_options { }; #ifdef CONFIG_MPTCP -extern struct request_sock_ops mptcp_subflow_request_sock_ops; - void mptcp_init(void); static inline bool sk_is_mptcp(const struct sock *sk) @@ -188,6 +186,9 @@ void mptcp_seq_show(struct seq_file *seq); int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); +struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener); __be32 mptcp_get_reset_option(const struct sk_buff *skb); @@ -274,6 +275,13 @@ static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, return 0; /* TCP fallback */ } +static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener) +{ + return NULL; +} + static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } #endif /* CONFIG_MPTCP */ -- cgit v1.2.3 From c9209b269afd29c55b8be49916725ed469f8c5d9 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Thu, 6 Oct 2022 17:09:31 +0800 Subject: Bluetooth: btusb: Introduce generic USB reset On cmd_timeout with no reset_gpio, reset the USB port as a last resort. This patch changes the behavior of btusb_intel_cmd_timeout and btusb_rtl_cmd_timeout. Signed-off-by: Archie Pusaka Reviewed-by: Abhishek Pandit-Subedi Reviewed-by: Ying Hsu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c54bc71254af..55a40f5606c3 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -659,6 +659,7 @@ struct hci_dev { int (*set_diag)(struct hci_dev *hdev, bool enable); int (*set_bdaddr)(struct hci_dev *hdev, const bdaddr_t *bdaddr); void (*cmd_timeout)(struct hci_dev *hdev); + void (*reset)(struct hci_dev *hdev); bool (*wakeup)(struct hci_dev *hdev); int (*set_quality_report)(struct hci_dev *hdev, bool enable); int (*get_data_path_id)(struct hci_dev *hdev, __u8 *data_path); -- cgit v1.2.3 From 47c50853bb9c5c35bf5c3eb00b712310ec2969e2 Mon Sep 17 00:00:00 2001 From: Igor Skalkin Date: Mon, 24 Oct 2022 15:40:33 +0200 Subject: virtio_bt: Fix alignment in configuration struct The current version of the configuration structure has unaligned 16-bit fields, but according to the specification [1], access to the configuration space must be aligned. Add a second, aligned version of the configuration structure and a new feature bit indicating that this version is being used. [1] https://docs.oasis-open.org/virtio/virtio/v1.1/virtio-v1.1.pdf Signed-off-by: Igor Skalkin Signed-off-by: Luiz Augusto von Dentz --- include/uapi/linux/virtio_bt.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/virtio_bt.h b/include/uapi/linux/virtio_bt.h index a7bd48daa9a9..af798f4c9680 100644 --- a/include/uapi/linux/virtio_bt.h +++ b/include/uapi/linux/virtio_bt.h @@ -9,6 +9,7 @@ #define VIRTIO_BT_F_VND_HCI 0 /* Indicates vendor command support */ #define VIRTIO_BT_F_MSFT_EXT 1 /* Indicates MSFT vendor support */ #define VIRTIO_BT_F_AOSP_EXT 2 /* Indicates AOSP vendor support */ +#define VIRTIO_BT_F_CONFIG_V2 3 /* Use second version configuration */ enum virtio_bt_config_type { VIRTIO_BT_CONFIG_TYPE_PRIMARY = 0, @@ -28,4 +29,11 @@ struct virtio_bt_config { __u16 msft_opcode; } __attribute__((packed)); +struct virtio_bt_config_v2 { + __u8 type; + __u8 alignment; + __u16 vendor; + __u16 msft_opcode; +}; + #endif /* _UAPI_LINUX_VIRTIO_BT_H */ -- cgit v1.2.3 From ad38e55e1c89384aecee1bb0425bf1bf21ec86fd Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Fri, 4 Nov 2022 22:13:00 +0100 Subject: Bluetooth: hci_event: Ignore reserved bits in LE Extended Adv Report Broadcom controllers present on Apple Silicon devices use the upper 8 bits of the event type in the LE Extended Advertising Report for the channel on which the frame has been received. These bits are reserved according to the Bluetooth spec anyway such that we can just drop them to ensure that the advertising results are parsed correctly. The following excerpt from a btmon trace shows a report received on channel 37 by these controllers: > HCI Event: LE Meta Event (0x3e) plen 55 LE Extended Advertising Report (0x0d) Num reports: 1 Entry 0 Event type: 0x2513 Props: 0x0013 Connectable Scannable Use legacy advertising PDUs Data status: Complete Reserved (0x2500) Legacy PDU Type: Reserved (0x2513) Address type: Public (0x00) Address: XX:XX:XX:XX:XX:XX (Shenzhen Jingxun Software [...]) Primary PHY: LE 1M Secondary PHY: No packets SID: no ADI field (0xff) TX power: 127 dBm RSSI: -76 dBm (0xb4) Periodic advertising interval: 0.00 msec (0x0000) Direct address type: Public (0x00) Direct address: 00:00:00:00:00:00 (OUI 00-00-00) Data length: 0x1d [...] Flags: 0x18 Simultaneous LE and BR/EDR (Controller) Simultaneous LE and BR/EDR (Host) Company: Harman International Industries, Inc. (87) Data: [...] Service Data (UUID 0xfddf): Name (complete): JBL Flip 5 Signed-off-by: Sven Peter Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 684f1cd28730..a035ff6055da 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2590,6 +2590,7 @@ struct hci_ev_le_conn_complete { #define LE_EXT_ADV_DIRECT_IND 0x0004 #define LE_EXT_ADV_SCAN_RSP 0x0008 #define LE_EXT_ADV_LEGACY_PDU 0x0010 +#define LE_EXT_ADV_EVT_TYPE_MASK 0x007f #define ADDR_LE_DEV_PUBLIC 0x00 #define ADDR_LE_DEV_RANDOM 0x01 -- cgit v1.2.3 From 392fca352c7a95e2828d49e7500e26d0c87ca265 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Fri, 4 Nov 2022 22:13:01 +0100 Subject: Bluetooth: Add quirk to disable extended scanning Broadcom 4377 controllers found in Apple x86 Macs with the T2 chip claim to support extended scanning when querying supported states, < HCI Command: LE Read Supported St.. (0x08|0x001c) plen 0 > HCI Event: Command Complete (0x0e) plen 12 LE Read Supported States (0x08|0x001c) ncmd 1 Status: Success (0x00) States: 0x000003ffffffffff [...] LE Set Extended Scan Parameters (Octet 37 - Bit 5) LE Set Extended Scan Enable (Octet 37 - Bit 6) [...] , but then fail to actually implement the extended scanning: < HCI Command: LE Set Extended Sca.. (0x08|0x0041) plen 8 Own address type: Random (0x01) Filter policy: Accept all advertisement (0x00) PHYs: 0x01 Entry 0: LE 1M Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1 Status: Unknown HCI Command (0x01) Signed-off-by: Sven Peter Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 10 ++++++++++ include/net/bluetooth/hci_core.h | 4 +++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index a035ff6055da..4cf6bc363e4f 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -274,6 +274,16 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, + + /* + * When this quirk is set, the HCI_OP_LE_SET_EXT_SCAN_ENABLE command is + * disabled. This is required for some Broadcom controllers which + * erroneously claim to support extended scanning. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BROKEN_EXT_SCAN, }; /* HCI device flags */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 55a40f5606c3..1113d74e1f9f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1690,7 +1690,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ - ((dev)->commands[37] & 0x40)) + ((dev)->commands[37] & 0x40) && \ + !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks)) + /* Use ext create connection if command is supported */ #define use_ext_conn(dev) ((dev)->commands[37] & 0x80) -- cgit v1.2.3 From ffcb0a445ec2d5753751437706aa0a7ea8351099 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Fri, 4 Nov 2022 22:13:02 +0100 Subject: Bluetooth: Add quirk to disable MWS Transport Configuration Broadcom 4378/4387 controllers found in Apple Silicon Macs claim to support getting MWS Transport Layer Configuration, < HCI Command: Read Local Supported... (0x04|0x0002) plen 0 > HCI Event: Command Complete (0x0e) plen 68 Read Local Supported Commands (0x04|0x0002) ncmd 1 Status: Success (0x00) [...] Get MWS Transport Layer Configuration (Octet 30 - Bit 3)] [...] , but then don't actually allow the required command: > HCI Event: Command Complete (0x0e) plen 15 Get MWS Transport Layer Configuration (0x05|0x000c) ncmd 1 Status: Command Disallowed (0x0c) Number of transports: 0 Baud rate list: 0 entries 00 00 00 00 00 00 00 00 00 00 Signed-off-by: Sven Peter Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 10 ++++++++++ include/net/bluetooth/hci_core.h | 3 +++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4cf6bc363e4f..8d773b042c85 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -284,6 +284,16 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_EXT_SCAN, + + /* + * When this quirk is set, the HCI_OP_GET_MWS_TRANSPORT_CONFIG command is + * disabled. This is required for some Broadcom controllers which + * erroneously claim to support MWS Transport Layer Configuration. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, }; /* HCI device flags */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 1113d74e1f9f..7254edfba4c9 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1720,6 +1720,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) +#define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ + (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks))) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 -- cgit v1.2.3 From d7b061b80ee6f91aa0b89daa3069802d7ea4c57f Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Mon, 12 Dec 2022 11:24:26 +0800 Subject: net: tso: inline tso_count_descs() tso_count_descs() is a small function doing simple calculation, and tso_count_descs() is used in fast path, so inline it to reduce the overhead of calls. Signed-off-by: Yunsheng Lin Link: https://lore.kernel.org/r/20221212032426.16050-1-linyunsheng@huawei.com Signed-off-by: Jakub Kicinski --- include/net/tso.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tso.h b/include/net/tso.h index 62c98a9c60f1..e7e157ae0526 100644 --- a/include/net/tso.h +++ b/include/net/tso.h @@ -2,6 +2,7 @@ #ifndef _TSO_H #define _TSO_H +#include #include #define TSO_HEADER_SIZE 256 @@ -16,7 +17,12 @@ struct tso_t { u32 tcp_seq; }; -int tso_count_descs(const struct sk_buff *skb); +/* Calculate the worst case buffer count */ +static inline int tso_count_descs(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; +} + void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last); void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size); -- cgit v1.2.3 From 8a321cf7becc6c065ae595b837b826a2a81036b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 9 Dec 2022 10:21:38 -0500 Subject: net: add IFF_NO_ADDRCONF and use it in bonding to prevent ipv6 addrconf Currently, in bonding it reused the IFF_SLAVE flag and checked it in ipv6 addrconf to prevent ipv6 addrconf. However, it is not a proper flag to use for no ipv6 addrconf, for bonding it has to move IFF_SLAVE flag setting ahead of dev_open() in bond_enslave(). Also, IFF_MASTER/SLAVE are historical flags used in bonding and eql, as Jiri mentioned, the new devices like Team, Failover do not use this flag. So as Jiri suggested, this patch adds IFF_NO_ADDRCONF in priv_flags of the device to indicate no ipv6 addconf, and uses it in bonding and moves IFF_SLAVE flag setting back to its original place. Signed-off-by: Xin Long Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2287cb8eb9e4..aad12a179e54 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1662,6 +1662,7 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device + * @IFF_NO_ADDRCONF: prevent ipv6 addrconf * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN @@ -1697,7 +1698,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, - /* was IFF_LIVE_RENAME_OK */ + IFF_NO_ADDRCONF = BIT_ULL(30), IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; -- cgit v1.2.3 From 6afaae6d12f54accf194e73ece617d1f456e438d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:29 +0200 Subject: bridge: mcast: Allow user space to add (*, G) with a source list and filter mode Add new netlink attributes to the RTM_NEWMDB request that allow user space to add (*, G) with a source list and filter mode. The RTM_NEWMDB message can already dump such entries (created by the kernel) so there is no need to add dump support. However, the message contains a different set of attributes depending if it is a request or a response. The naming and structure of the new attributes try to follow the existing ones used in the response. Request: [ struct nlmsghdr ] [ struct br_port_msg ] [ MDBA_SET_ENTRY ] struct br_mdb_entry [ MDBA_SET_ENTRY_ATTRS ] [ MDBE_ATTR_SOURCE ] struct in_addr / struct in6_addr [ MDBE_ATTR_SRC_LIST ] // new [ MDBE_SRC_LIST_ENTRY ] [ MDBE_SRCATTR_ADDRESS ] struct in_addr / struct in6_addr [ ...] [ MDBE_ATTR_GROUP_MODE ] // new u8 Response: [ struct nlmsghdr ] [ struct br_port_msg ] [ MDBA_MDB ] [ MDBA_MDB_ENTRY ] [ MDBA_MDB_ENTRY_INFO ] struct br_mdb_entry [ MDBA_MDB_EATTR_TIMER ] u32 [ MDBA_MDB_EATTR_SOURCE ] struct in_addr / struct in6_addr [ MDBA_MDB_EATTR_RTPROT ] u8 [ MDBA_MDB_EATTR_SRC_LIST ] [ MDBA_MDB_SRCLIST_ENTRY ] [ MDBA_MDB_SRCATTR_ADDRESS ] struct in_addr / struct in6_addr [ MDBA_MDB_SRCATTR_TIMER ] u8 [...] [ MDBA_MDB_EATTR_GROUP_MODE ] u8 Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index a86a7e7b811f..0d9fe73fc48c 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -723,10 +723,30 @@ enum { enum { MDBE_ATTR_UNSPEC, MDBE_ATTR_SOURCE, + MDBE_ATTR_SRC_LIST, + MDBE_ATTR_GROUP_MODE, __MDBE_ATTR_MAX, }; #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) +/* per mdb entry source */ +enum { + MDBE_SRC_LIST_UNSPEC, + MDBE_SRC_LIST_ENTRY, + __MDBE_SRC_LIST_MAX, +}; +#define MDBE_SRC_LIST_MAX (__MDBE_SRC_LIST_MAX - 1) + +/* per mdb entry per source attributes + * these are embedded in MDBE_SRC_LIST_ENTRY + */ +enum { + MDBE_SRCATTR_UNSPEC, + MDBE_SRCATTR_ADDRESS, + __MDBE_SRCATTR_MAX, +}; +#define MDBE_SRCATTR_MAX (__MDBE_SRCATTR_MAX - 1) + /* Embedded inside LINK_XSTATS_TYPE_BRIDGE */ enum { BRIDGE_XSTATS_UNSPEC, -- cgit v1.2.3 From 1d7b66a7d9754c229d12c53ad6a1effe88c16ca4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:30 +0200 Subject: bridge: mcast: Allow user space to specify MDB entry routing protocol Add the 'MDBE_ATTR_RTPORT' attribute to allow user space to specify the routing protocol of the MDB port group entry. Enforce a minimum value of 'RTPROT_STATIC' to prevent user space from using protocol values that should only be set by the kernel (e.g., 'RTPROT_KERNEL'). Maintain backward compatibility by defaulting to 'RTPROT_STATIC'. The protocol is already visible to user space in RTM_NEWMDB responses and notifications via the 'MDBA_MDB_EATTR_RTPROT' attribute. The routing protocol allows a routing daemon to distinguish between entries configured by it and those configured by the administrator. Once MDB flush is supported, the protocol can be used as a criterion according to which the flush is performed. Examples: # bridge mdb add dev br0 port dummy10 grp 239.1.1.1 permanent proto kernel Error: integer out of range. # bridge mdb add dev br0 port dummy10 grp 239.1.1.1 permanent proto static # bridge mdb add dev br0 port dummy10 grp 239.1.1.1 src 192.0.2.1 permanent proto zebra # bridge mdb add dev br0 port dummy10 grp 239.1.1.2 permanent source_list 198.51.100.1,198.51.100.2 filter_mode include proto 250 # bridge -d mdb show dev br0 port dummy10 grp 239.1.1.2 src 198.51.100.2 permanent filter_mode include proto 250 dev br0 port dummy10 grp 239.1.1.2 src 198.51.100.1 permanent filter_mode include proto 250 dev br0 port dummy10 grp 239.1.1.2 permanent filter_mode include source_list 198.51.100.2/0.00,198.51.100.1/0.00 proto 250 dev br0 port dummy10 grp 239.1.1.1 src 192.0.2.1 permanent filter_mode include proto zebra dev br0 port dummy10 grp 239.1.1.1 permanent filter_mode exclude proto static Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 0d9fe73fc48c..d9de241d90f9 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -725,6 +725,7 @@ enum { MDBE_ATTR_SOURCE, MDBE_ATTR_SRC_LIST, MDBE_ATTR_GROUP_MODE, + MDBE_ATTR_RTPROT, __MDBE_ATTR_MAX, }; #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) -- cgit v1.2.3 From 89300468e2b2ec216c7827ba04ac45c129794403 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Sat, 10 Dec 2022 04:16:45 +0000 Subject: IPv6/GRO: generic helper to remove temporary HBH/jumbo header in driver IPv6/TCP and GRO stacks can build big TCP packets with an added temporary Hop By Hop header. Is GSO is not involved, then the temporary header needs to be removed in the driver. This patch provides a generic helper for drivers that need to modify their headers in place. Tested: Compiled and ran with ethtool -K eth1 tso off Could send Big TCP packets Signed-off-by: Coco Li Link: https://lore.kernel.org/r/20221210041646.3587757-1-lixiaoyan@google.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d383c895592a..03f3af02a9a6 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -500,6 +500,39 @@ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) return jhdr->nexthdr; } +/* Return 0 if HBH header is successfully removed + * Or if HBH removal is unnecessary (packet is not big TCP) + * Return error to indicate dropping the packet + */ +static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb) +{ + const int hophdr_len = sizeof(struct hop_jumbo_hdr); + int nexthdr = ipv6_has_hopopt_jumbo(skb); + struct ipv6hdr *h6; + + if (!nexthdr) + return 0; + + if (skb_cow_head(skb, 0)) + return -1; + + /* Remove the HBH header. + * Layout: [Ethernet header][IPv6 header][HBH][L4 Header] + */ + memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb), + skb_network_header(skb) - skb_mac_header(skb) + + sizeof(struct ipv6hdr)); + + __skb_pull(skb, hophdr_len); + skb->network_header += hophdr_len; + skb->mac_header += hophdr_len; + + h6 = ipv6_hdr(skb); + h6->nexthdr = nexthdr; + + return 0; +} + static inline bool ipv6_accept_ra(struct inet6_dev *idev) { /* If forwarding is enabled, RA are not accepted unless the special -- cgit v1.2.3