From 198bc90e0e734e5f98c3d2833e8390cac3df61b2 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 18 Jan 2024 09:20:19 +0800 Subject: tcp: make sure init the accept_queue's spinlocks once When I run syz's reproduction C program locally, it causes the following issue: pvqspinlock: lock 0xffff9d181cd5c660 has corrupted value 0x0! WARNING: CPU: 19 PID: 21160 at __pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508) Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:__pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508) Code: 73 56 3a ff 90 c3 cc cc cc cc 8b 05 bb 1f 48 01 85 c0 74 05 c3 cc cc cc cc 8b 17 48 89 fe 48 c7 c7 30 20 ce 8f e8 ad 56 42 ff <0f> 0b c3 cc cc cc cc 0f 0b 0f 1f 40 00 90 90 90 90 90 90 90 90 90 RSP: 0018:ffffa8d200604cb8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff9d1ef60e0908 RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9d1ef60e0900 RBP: ffff9d181cd5c280 R08: 0000000000000000 R09: 00000000ffff7fff R10: ffffa8d200604b68 R11: ffffffff907dcdc8 R12: 0000000000000000 R13: ffff9d181cd5c660 R14: ffff9d1813a3f330 R15: 0000000000001000 FS: 00007fa110184640(0000) GS:ffff9d1ef60c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000000 CR3: 000000011f65e000 CR4: 00000000000006f0 Call Trace: _raw_spin_unlock (kernel/locking/spinlock.c:186) inet_csk_reqsk_queue_add (net/ipv4/inet_connection_sock.c:1321) inet_csk_complete_hashdance (net/ipv4/inet_connection_sock.c:1358) tcp_check_req (net/ipv4/tcp_minisocks.c:868) tcp_v4_rcv (net/ipv4/tcp_ipv4.c:2260) ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205) ip_local_deliver_finish (net/ipv4/ip_input.c:234) __netif_receive_skb_one_core (net/core/dev.c:5529) process_backlog (./include/linux/rcupdate.h:779) __napi_poll (net/core/dev.c:6533) net_rx_action (net/core/dev.c:6604) __do_softirq (./arch/x86/include/asm/jump_label.h:27) do_softirq (kernel/softirq.c:454 kernel/softirq.c:441) __local_bh_enable_ip (kernel/softirq.c:381) __dev_queue_xmit (net/core/dev.c:4374) ip_finish_output2 (./include/net/neighbour.h:540 net/ipv4/ip_output.c:235) __ip_queue_xmit (net/ipv4/ip_output.c:535) __tcp_transmit_skb (net/ipv4/tcp_output.c:1462) tcp_rcv_synsent_state_process (net/ipv4/tcp_input.c:6469) tcp_rcv_state_process (net/ipv4/tcp_input.c:6657) tcp_v4_do_rcv (net/ipv4/tcp_ipv4.c:1929) __release_sock (./include/net/sock.h:1121 net/core/sock.c:2968) release_sock (net/core/sock.c:3536) inet_wait_for_connect (net/ipv4/af_inet.c:609) __inet_stream_connect (net/ipv4/af_inet.c:702) inet_stream_connect (net/ipv4/af_inet.c:748) __sys_connect (./include/linux/file.h:45 net/socket.c:2064) __x64_sys_connect (net/socket.c:2073 net/socket.c:2070 net/socket.c:2070) do_syscall_64 (arch/x86/entry/common.c:51 arch/x86/entry/common.c:82) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) RIP: 0033:0x7fa10ff05a3d Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48 RSP: 002b:00007fa110183de8 EFLAGS: 00000202 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 0000000020000054 RCX: 00007fa10ff05a3d RDX: 000000000000001c RSI: 0000000020000040 RDI: 0000000000000003 RBP: 00007fa110183e20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000202 R12: 00007fa110184640 R13: 0000000000000000 R14: 00007fa10fe8b060 R15: 00007fff73e23b20 The issue triggering process is analyzed as follows: Thread A Thread B tcp_v4_rcv //receive ack TCP packet inet_shutdown tcp_check_req tcp_disconnect //disconnect sock ... tcp_set_state(sk, TCP_CLOSE) inet_csk_complete_hashdance ... inet_csk_reqsk_queue_add inet_listen //start listen spin_lock(&queue->rskq_lock) inet_csk_listen_start ... reqsk_queue_alloc ... spin_lock_init spin_unlock(&queue->rskq_lock) //warning When the socket receives the ACK packet during the three-way handshake, it will hold spinlock. And then the user actively shutdowns the socket and listens to the socket immediately, the spinlock will be initialized. When the socket is going to release the spinlock, a warning is generated. Also the same issue to fastopenq.lock. Move init spinlock to inet_create and inet_accept to make sure init the accept_queue's spinlocks once. Fixes: fff1f3001cc5 ("tcp: add a spinlock to protect struct request_sock_queue") Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") Reported-by: Ming Shu Signed-off-by: Zhengchao Shao Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240118012019.1751966-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index d0a2f827d5f2..9ab4bf704e86 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -357,4 +357,12 @@ static inline bool inet_csk_has_ulp(const struct sock *sk) return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops; } +static inline void inet_init_csk_locks(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + spin_lock_init(&icsk->icsk_accept_queue.rskq_lock); + spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock); +} + #endif /* _INET_CONNECTION_SOCK_H */ -- cgit v1.2.3 From e3f9bed9bee261e3347131764e42aeedf1ffea61 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 18 Jan 2024 17:55:15 -0800 Subject: llc: Drop support for ETH_P_TR_802_2. syzbot reported an uninit-value bug below. [0] llc supports ETH_P_802_2 (0x0004) and used to support ETH_P_TR_802_2 (0x0011), and syzbot abused the latter to trigger the bug. write$tun(r0, &(0x7f0000000040)={@val={0x0, 0x11}, @val, @mpls={[], @llc={@snap={0xaa, 0x1, ')', "90e5dd"}}}}, 0x16) llc_conn_handler() initialises local variables {saddr,daddr}.mac based on skb in llc_pdu_decode_sa()/llc_pdu_decode_da() and passes them to __llc_lookup(). However, the initialisation is done only when skb->protocol is htons(ETH_P_802_2), otherwise, __llc_lookup_established() and __llc_lookup_listener() will read garbage. The missing initialisation existed prior to commit 211ed865108e ("net: delete all instances of special processing for token ring"). It removed the part to kick out the token ring stuff but forgot to close the door allowing ETH_P_TR_802_2 packets to sneak into llc_rcv(). Let's remove llc_tr_packet_type and complete the deprecation. [0]: BUG: KMSAN: uninit-value in __llc_lookup_established+0xe9d/0xf90 __llc_lookup_established+0xe9d/0xf90 __llc_lookup net/llc/llc_conn.c:611 [inline] llc_conn_handler+0x4bd/0x1360 net/llc/llc_conn.c:791 llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206 __netif_receive_skb_one_core net/core/dev.c:5527 [inline] __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5641 netif_receive_skb_internal net/core/dev.c:5727 [inline] netif_receive_skb+0x58/0x660 net/core/dev.c:5786 tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1555 tun_get_user+0x53af/0x66d0 drivers/net/tun.c:2002 tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2020 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x8ef/0x1490 fs/read_write.c:584 ksys_write+0x20f/0x4c0 fs/read_write.c:637 __do_sys_write fs/read_write.c:649 [inline] __se_sys_write fs/read_write.c:646 [inline] __x64_sys_write+0x93/0xd0 fs/read_write.c:646 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Local variable daddr created at: llc_conn_handler+0x53/0x1360 net/llc/llc_conn.c:783 llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206 CPU: 1 PID: 5004 Comm: syz-executor994 Not tainted 6.6.0-syzkaller-14500-g1c41041124bd #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Fixes: 211ed865108e ("net: delete all instances of special processing for token ring") Reported-by: syzbot+b5ad66046b913bc04c6f@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b5ad66046b913bc04c6f Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240119015515.61898-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/llc_pdu.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h index 7e73f8e5e497..1d55ba7c45be 100644 --- a/include/net/llc_pdu.h +++ b/include/net/llc_pdu.h @@ -262,8 +262,7 @@ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, */ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) { - if (skb->protocol == htons(ETH_P_802_2)) - memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); + memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); } /** @@ -275,8 +274,7 @@ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) */ static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da) { - if (skb->protocol == htons(ETH_P_802_2)) - memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); + memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); } /** -- cgit v1.2.3 From a54d51fb2dfb846aedf3751af501e9688db447f5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jan 2024 20:17:49 +0000 Subject: udp: fix busy polling Generic sk_busy_loop_end() only looks at sk->sk_receive_queue for presence of packets. Problem is that for UDP sockets after blamed commit, some packets could be present in another queue: udp_sk(sk)->reader_queue In some cases, a busy poller could spin until timeout expiration, even if some packets are available in udp_sk(sk)->reader_queue. v3: - make sk_busy_loop_end() nicer (Willem) v2: - add a READ_ONCE(sk->sk_family) in sk_is_inet() to avoid KCSAN splats. - add a sk_is_inet() check in sk_is_udp() (Willem feedback) - add a sk_is_inet() check in sk_is_tcp(). Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception") Signed-off-by: Eric Dumazet Reviewed-by: Paolo Abeni Reviewed-by: Willem de Bruijn Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/skmsg.h | 6 ------ include/net/inet_sock.h | 5 ----- include/net/sock.h | 18 +++++++++++++++++- 3 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 888a4b217829..e65ec3fd2799 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -505,12 +505,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } -static inline bool sk_is_udp(const struct sock *sk) -{ - return sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP; -} - #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index aa86453f6b9b..d94c242eb3ed 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -307,11 +307,6 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) #define inet_assign_bit(nr, sk, val) \ assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) -static inline bool sk_is_inet(struct sock *sk) -{ - return sk->sk_family == AF_INET || sk->sk_family == AF_INET6; -} - /** * sk_to_full_sk - Access to a full socket * @sk: pointer to a socket diff --git a/include/net/sock.h b/include/net/sock.h index a7f815c7cfdf..54ca8dcbfb43 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2765,9 +2765,25 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) &skb_shinfo(skb)->tskey); } +static inline bool sk_is_inet(const struct sock *sk) +{ + int family = READ_ONCE(sk->sk_family); + + return family == AF_INET || family == AF_INET6; +} + static inline bool sk_is_tcp(const struct sock *sk) { - return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; + return sk_is_inet(sk) && + sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static inline bool sk_is_udp(const struct sock *sk) +{ + return sk_is_inet(sk) && + sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP; } static inline bool sk_is_stream_unix(const struct sock *sk) -- cgit v1.2.3 From 32f2a0afa95fae0d1ceec2ff06e0e816939964b8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 22 Jan 2024 15:28:43 +0200 Subject: net/sched: flower: Fix chain template offload When a qdisc is deleted from a net device the stack instructs the underlying driver to remove its flow offload callback from the associated filter block using the 'FLOW_BLOCK_UNBIND' command. The stack then continues to replay the removal of the filters in the block for this driver by iterating over the chains in the block and invoking the 'reoffload' operation of the classifier being used. In turn, the classifier in its 'reoffload' operation prepares and emits a 'FLOW_CLS_DESTROY' command for each filter. However, the stack does not do the same for chain templates and the underlying driver never receives a 'FLOW_CLS_TMPLT_DESTROY' command when a qdisc is deleted. This results in a memory leak [1] which can be reproduced using [2]. Fix by introducing a 'tmplt_reoffload' operation and have the stack invoke it with the appropriate arguments as part of the replay. Implement the operation in the sole classifier that supports chain templates (flower) by emitting the 'FLOW_CLS_TMPLT_{CREATE,DESTROY}' command based on whether a flow offload callback is being bound to a filter block or being unbound from one. As far as I can tell, the issue happens since cited commit which reordered tcf_block_offload_unbind() before tcf_block_flush_all_chains() in __tcf_block_put(). The order cannot be reversed as the filter block is expected to be freed after flushing all the chains. [1] unreferenced object 0xffff888107e28800 (size 2048): comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s) hex dump (first 32 bytes): b1 a6 7c 11 81 88 ff ff e0 5b b3 10 81 88 ff ff ..|......[...... 01 00 00 00 00 00 00 00 e0 aa b0 84 ff ff ff ff ................ backtrace: [] __kmem_cache_alloc_node+0x1e8/0x320 [] __kmalloc+0x4e/0x90 [] mlxsw_sp_acl_ruleset_get+0x34d/0x7a0 [] mlxsw_sp_flower_tmplt_create+0x145/0x180 [] mlxsw_sp_flow_block_cb+0x1ea/0x280 [] tc_setup_cb_call+0x183/0x340 [] fl_tmplt_create+0x3da/0x4c0 [] tc_ctl_chain+0xa15/0x1170 [] rtnetlink_rcv_msg+0x3cc/0xed0 [] netlink_rcv_skb+0x170/0x440 [] netlink_unicast+0x540/0x820 [] netlink_sendmsg+0x8d8/0xda0 [] ____sys_sendmsg+0x30f/0xa80 [] ___sys_sendmsg+0x13a/0x1e0 [] __sys_sendmsg+0x11c/0x1f0 [] do_syscall_64+0x40/0xe0 unreferenced object 0xffff88816d2c0400 (size 1024): comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s) hex dump (first 32 bytes): 40 00 00 00 00 00 00 00 57 f6 38 be 00 00 00 00 @.......W.8..... 10 04 2c 6d 81 88 ff ff 10 04 2c 6d 81 88 ff ff ..,m......,m.... backtrace: [] __kmem_cache_alloc_node+0x1e8/0x320 [] __kmalloc_node+0x51/0x90 [] kvmalloc_node+0xa6/0x1f0 [] bucket_table_alloc.isra.0+0x83/0x460 [] rhashtable_init+0x43b/0x7c0 [] mlxsw_sp_acl_ruleset_get+0x428/0x7a0 [] mlxsw_sp_flower_tmplt_create+0x145/0x180 [] mlxsw_sp_flow_block_cb+0x1ea/0x280 [] tc_setup_cb_call+0x183/0x340 [] fl_tmplt_create+0x3da/0x4c0 [] tc_ctl_chain+0xa15/0x1170 [] rtnetlink_rcv_msg+0x3cc/0xed0 [] netlink_rcv_skb+0x170/0x440 [] netlink_unicast+0x540/0x820 [] netlink_sendmsg+0x8d8/0xda0 [] ____sys_sendmsg+0x30f/0xa80 [2] # tc qdisc add dev swp1 clsact # tc chain add dev swp1 ingress proto ip chain 1 flower dst_ip 0.0.0.0/32 # tc qdisc del dev swp1 clsact # devlink dev reload pci/0000:06:00.0 Fixes: bbf73830cd48 ("net: sched: traverse chains in block with tcf_get_next_chain()") Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/sch_generic.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index ba3e1b315de8..934fdb977551 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -375,6 +375,10 @@ struct tcf_proto_ops { struct nlattr **tca, struct netlink_ext_ack *extack); void (*tmplt_destroy)(void *tmplt_priv); + void (*tmplt_reoffload)(struct tcf_chain *chain, + bool add, + flow_setup_cb_t *cb, + void *cb_priv); struct tcf_exts * (*get_exts)(const struct tcf_proto *tp, u32 handle); -- cgit v1.2.3 From 25461ce8b3d28528f2c55f5e737e99d2906eda83 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 15 Dec 2023 19:31:14 -0800 Subject: net/mlx5e: Use the correct lag ports number when creating TISes The cited commit moved the code of mlx5e_create_tises() and changed the loop to create TISes over MLX5_MAX_PORTS constant value, instead of getting the correct lag ports supported by the device, which can cause FW errors on devices with less than MLX5_MAX_PORTS ports. Change that back to mlx5e_get_num_lag_ports(mdev). Also IPoIB interfaces create there own TISes, they don't use the eth TISes, pass a flag to indicate that. This fixes the following errors that might appear in kernel log: mlx5_cmd_out_err:808:(pid 650): CREATE_TIS(0x912) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x595b5d), err(-22) mlx5e_create_mdev_resources:174:(pid 650): alloc tises failed, -22 Fixes: b25bd37c859f ("net/mlx5: Move TISes from priv to mdev HW resources") Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8c55ff351e5f..41f03b352401 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -681,6 +681,7 @@ struct mlx5e_resources { struct mlx5_sq_bfreg bfreg; #define MLX5_MAX_NUM_TC 8 u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; + bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; struct mutex uplink_netdev_lock; -- cgit v1.2.3 From cfbc3608a8c69b48bf238bd68f768192f0238e0d Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 19 Dec 2023 14:46:20 +0200 Subject: net/mlx5: Fix query of sd_group field The sd_group field moved in the HW spec from the MPIR register to the vport context. Align the query accordingly. Fixes: f5e956329960 ("net/mlx5: Expose Management PCIe Index Register (MPIR)") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 10 +++++++--- include/linux/mlx5/vport.h | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index bf5320b28b8b..37230253f9f1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4036,8 +4036,13 @@ struct mlx5_ifc_nic_vport_context_bits { u8 affiliation_criteria[0x4]; u8 affiliated_vhca_id[0x10]; - u8 reserved_at_60[0xd0]; + u8 reserved_at_60[0xa0]; + u8 reserved_at_100[0x1]; + u8 sd_group[0x3]; + u8 reserved_at_104[0x1c]; + + u8 reserved_at_120[0x10]; u8 mtu[0x10]; u8 system_image_guid[0x40]; @@ -10122,8 +10127,7 @@ struct mlx5_ifc_mpir_reg_bits { u8 reserved_at_20[0x20]; u8 local_port[0x8]; - u8 reserved_at_28[0x15]; - u8 sd_group[0x3]; + u8 reserved_at_28[0x18]; u8 reserved_at_60[0x20]; }; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index fbb9bf447889..c36cc6d82926 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -72,6 +72,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu); int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); +int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group); int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, u16 vport, u64 node_guid); -- cgit v1.2.3 From ec7cc38ef9f83553102e84c82536971a81630739 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sat, 30 Dec 2023 22:40:37 +0200 Subject: net/mlx5: Bridge, fix multicast packets sent to uplink To enable multicast packets which are offloaded in bridge multicast offload mode to be sent also to uplink, FTE bit uplink_hairpin_en should be set. Add this bit to FTE for the bridge multicast offload rules. Fixes: 18c2916cee12 ("net/mlx5: Bridge, snoop igmp/mld packets") Signed-off-by: Moshe Shemesh Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 1 + include/linux/mlx5/mlx5_ifc.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 6f7725238abc..3fb428ce7d1c 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -132,6 +132,7 @@ struct mlx5_flow_handle; enum { FLOW_CONTEXT_HAS_TAG = BIT(0), + FLOW_CONTEXT_UPLINK_HAIRPIN_EN = BIT(1), }; struct mlx5_flow_context { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 37230253f9f1..c726f90ab752 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3576,7 +3576,7 @@ struct mlx5_ifc_flow_context_bits { u8 action[0x10]; u8 extended_destination[0x1]; - u8 reserved_at_81[0x1]; + u8 uplink_hairpin_en[0x1]; u8 flow_source[0x2]; u8 encrypt_decrypt_type[0x4]; u8 destination_list_size[0x18]; -- cgit v1.2.3 From b253d87fd78bf8d3e7efc5d149147765f044e89d Mon Sep 17 00:00:00 2001 From: George Guo Date: Tue, 26 Dec 2023 17:42:42 +0800 Subject: netfilter: nf_tables: cleanup documentation - Correct comments for nlpid, family, udlen and udata in struct nft_table, and afinfo is no longer a member of enum nft_set_class. - Add comment for data in struct nft_set_elem. - Add comment for flags in struct nft_ctx. - Add comments for timeout in struct nft_set_iter, and flags is not a member of struct nft_set_iter, remove the comment for it. - Add comments for commit, abort, estimate and gc_init in struct nft_set_ops. - Add comments for pending_update, num_exprs, exprs and catchall_list in struct nft_set. - Add comment for ext_len in struct nft_set_ext_tmpl. - Add comment for inner_ops in struct nft_expr_type. - Add comments for clone, destroy_clone, reduce, gc, offload, offload_action, offload_stats in struct nft_expr_ops. - Add comments for blob_gen_0, blob_gen_1, bound, genmask, udlen, udata, blob_next in struct nft_chain. - Add comment for flags in struct nft_base_chain. - Add comments for udlen, udata in struct nft_object. - Add comment for type in struct nft_object_ops. - Add comment for hook_list in struct nft_flowtable, and remove comments for dev_name and ops which are not members of struct nft_flowtable. Signed-off-by: George Guo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 49 +++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index b157c5cafd14..4e1ea18eb5f0 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -205,6 +205,7 @@ static inline void nft_data_copy(u32 *dst, const struct nft_data *src, * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number + * @flags: modifiers to new request * @family: protocol family * @level: depth of the chains * @report: notify via unicast netlink message @@ -282,6 +283,7 @@ struct nft_elem_priv { }; * * @key: element key * @key_end: closing element key + * @data: element data * @priv: element private data and extensions */ struct nft_set_elem { @@ -325,10 +327,10 @@ struct nft_set_iter { * @dtype: data type * @dlen: data length * @objtype: object type - * @flags: flags * @size: number of set elements * @policy: set policy * @gc_int: garbage collector interval + * @timeout: element timeout * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions @@ -351,9 +353,9 @@ struct nft_set_desc { /** * enum nft_set_class - performance class * - * @NFT_LOOKUP_O_1: constant, O(1) - * @NFT_LOOKUP_O_LOG_N: logarithmic, O(log N) - * @NFT_LOOKUP_O_N: linear, O(N) + * @NFT_SET_CLASS_O_1: constant, O(1) + * @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N) + * @NFT_SET_CLASS_O_N: linear, O(N) */ enum nft_set_class { NFT_SET_CLASS_O_1, @@ -422,9 +424,13 @@ struct nft_set_ext; * @remove: remove element from set * @walk: iterate over all set elements * @get: get set elements + * @commit: commit set elements + * @abort: abort set elements * @privsize: function to return size of set private data + * @estimate: estimate the required memory size and the lookup complexity class * @init: initialize private data of new set instance * @destroy: destroy private data of set instance + * @gc_init: initialize garbage collection * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster @@ -540,13 +546,16 @@ struct nft_set_elem_expr { * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data - * @expr: stateful expression + * @pending_update: list of pending update set element * @ops: set ops * @flags: set flags * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length + * @num_exprs: numbers of exprs + * @exprs: stateful expression + * @catchall_list: list of catch-all set element * @data: private set data */ struct nft_set { @@ -692,6 +701,7 @@ extern const struct nft_set_ext_type nft_set_ext_types[]; * * @len: length of extension area * @offset: offsets of individual extension types + * @ext_len: length of the expected extension(used to sanity check) */ struct nft_set_ext_tmpl { u16 len; @@ -840,6 +850,7 @@ struct nft_expr_ops; * @select_ops: function to select nft_expr_ops * @release_ops: release nft_expr_ops * @ops: default ops, used when no select_ops functions is present + * @inner_ops: inner ops, used for inner packet operation * @list: used internally * @name: Identifier * @owner: module reference @@ -881,14 +892,22 @@ struct nft_offload_ctx; * struct nft_expr_ops - nf_tables expression operations * * @eval: Expression evaluation function + * @clone: Expression clone function * @size: full expression size, including private data size * @init: initialization function * @activate: activate expression in the next generation * @deactivate: deactivate expression in next generation * @destroy: destruction function, called after synchronize_rcu + * @destroy_clone: destruction clone function * @dump: function to dump parameters - * @type: expression type * @validate: validate expression, called during loop detection + * @reduce: reduce expression + * @gc: garbage collection expression + * @offload: hardware offload expression + * @offload_action: function to report true/false to allocate one slot or not in the flow + * offload array + * @offload_stats: function to synchronize hardware stats via updating the counter expression + * @type: expression type * @data: extra data to attach to this expression operation */ struct nft_expr_ops { @@ -1041,14 +1060,21 @@ struct nft_rule_blob { /** * struct nft_chain - nf_tables chain * + * @blob_gen_0: rule blob pointer to the current generation + * @blob_gen_1: rule blob pointer to the future generation * @rules: list of rules in the chain * @list: used internally * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain - * @flags: bitmask of enum nft_chain_flags + * @flags: bitmask of enum NFTA_CHAIN_FLAGS + * @bound: bind or not + * @genmask: generation mask * @name: name of the chain + * @udlen: user data length + * @udata: user data in the chain + * @blob_next: rule blob pointer to the next in the chain */ struct nft_chain { struct nft_rule_blob __rcu *blob_gen_0; @@ -1146,6 +1172,7 @@ struct nft_hook { * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy + * @flags: indicate the base chain disabled or not * @stats: per-cpu chain stats * @chain: the chain * @flow_block: flow block (for hardware offload) @@ -1274,11 +1301,13 @@ struct nft_object_hash_key { * struct nft_object - nf_tables stateful object * * @list: table stateful object list node - * @key: keys that identify this object * @rhlhead: nft_objname_ht node + * @key: keys that identify this object * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle + * @udlen: length of user data + * @udata: user data * @ops: object operations * @data: object data, layout depends on type */ @@ -1344,6 +1373,7 @@ struct nft_object_type { * @destroy: release existing stateful object * @dump: netlink dump stateful object * @update: update stateful object + * @type: pointer to object type */ struct nft_object_ops { void (*eval)(struct nft_object *obj, @@ -1379,9 +1409,8 @@ void nft_unregister_obj(struct nft_object_type *obj_type); * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle - * @dev_name: array of device names + * @hook_list: hook list for hooks per net_device in flowtables * @data: rhashtable and garbage collector - * @ops: array of hooks */ struct nft_flowtable { struct list_head list; -- cgit v1.2.3 From f7f6aa8e24383fbb11ac55942e66da9660110f80 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 24 Jan 2024 20:15:53 +0100 Subject: xsk: make xsk_buff_pool responsible for clearing xdp_buff::flags XDP multi-buffer support introduced XDP_FLAGS_HAS_FRAGS flag that is used by drivers to notify data path whether xdp_buff contains fragments or not. Data path looks up mentioned flag on first buffer that occupies the linear part of xdp_buff, so drivers only modify it there. This is sufficient for SKB and XDP_DRV modes as usually xdp_buff is allocated on stack or it resides within struct representing driver's queue and fragments are carried via skb_frag_t structs. IOW, we are dealing with only one xdp_buff. ZC mode though relies on list of xdp_buff structs that is carried via xsk_buff_pool::xskb_list, so ZC data path has to make sure that fragments do *not* have XDP_FLAGS_HAS_FRAGS set. Otherwise, xsk_buff_free() could misbehave if it would be executed against xdp_buff that carries a frag with XDP_FLAGS_HAS_FRAGS flag set. Such scenario can take place when within supplied XDP program bpf_xdp_adjust_tail() is used with negative offset that would in turn release the tail fragment from multi-buffer frame. Calling xsk_buff_free() on tail fragment with XDP_FLAGS_HAS_FRAGS would result in releasing all the nodes from xskb_list that were produced by driver before XDP program execution, which is not what is intended - only tail fragment should be deleted from xskb_list and then it should be put onto xsk_buff_pool::free_list. Such multi-buffer frame will never make it up to user space, so from AF_XDP application POV there would be no traffic running, however due to free_list getting constantly new nodes, driver will be able to feed HW Rx queue with recycled buffers. Bottom line is that instead of traffic being redirected to user space, it would be continuously dropped. To fix this, let us clear the mentioned flag on xsk_buff_pool side during xdp_buff initialization, which is what should have been done right from the start of XSK multi-buffer support. Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support") Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support") Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20240124191602.566724-3-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock_drv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 526c1e7f505e..9819e2af0378 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -164,6 +164,7 @@ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; xdp->data_meta = xdp->data; xdp->data_end = xdp->data + size; + xdp->flags = 0; } static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, -- cgit v1.2.3 From c5114710c8ce86b8317e9b448f4fd15c711c2a82 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 24 Jan 2024 20:15:54 +0100 Subject: xsk: fix usage of multi-buffer BPF helpers for ZC XDP Currently when packet is shrunk via bpf_xdp_adjust_tail() and memory type is set to MEM_TYPE_XSK_BUFF_POOL, null ptr dereference happens: [1136314.192256] BUG: kernel NULL pointer dereference, address: 0000000000000034 [1136314.203943] #PF: supervisor read access in kernel mode [1136314.213768] #PF: error_code(0x0000) - not-present page [1136314.223550] PGD 0 P4D 0 [1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI [1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257 [1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019 [1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210 [1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86 [1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246 [1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX: 0000000000000000 [1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI: ffffc9003168c000 [1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09: 0000000000010000 [1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12: 0000000000000001 [1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15: 0000000000000001 [1136314.373298] FS: 00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000) knlGS:0000000000000000 [1136314.386105] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4: 00000000007706f0 [1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [1136314.431890] PKRU: 55555554 [1136314.439143] Call Trace: [1136314.446058] [1136314.452465] ? __die+0x20/0x70 [1136314.459881] ? page_fault_oops+0x15b/0x440 [1136314.468305] ? exc_page_fault+0x6a/0x150 [1136314.476491] ? asm_exc_page_fault+0x22/0x30 [1136314.484927] ? __xdp_return+0x6c/0x210 [1136314.492863] bpf_xdp_adjust_tail+0x155/0x1d0 [1136314.501269] bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60 [1136314.511263] ice_clean_rx_irq_zc+0x206/0xc60 [ice] [1136314.520222] ? ice_xmit_zc+0x6e/0x150 [ice] [1136314.528506] ice_napi_poll+0x467/0x670 [ice] [1136314.536858] ? ttwu_do_activate.constprop.0+0x8f/0x1a0 [1136314.546010] __napi_poll+0x29/0x1b0 [1136314.553462] net_rx_action+0x133/0x270 [1136314.561619] __do_softirq+0xbe/0x28e [1136314.569303] do_softirq+0x3f/0x60 This comes from __xdp_return() call with xdp_buff argument passed as NULL which is supposed to be consumed by xsk_buff_free() call. To address this properly, in ZC case, a node that represents the frag being removed has to be pulled out of xskb_list. Introduce appropriate xsk helpers to do such node operation and use them accordingly within bpf_xdp_adjust_tail(). Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Acked-by: Magnus Karlsson # For the xsk header part Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20240124191602.566724-4-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock_drv.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 9819e2af0378..c9aec9ab6191 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -159,6 +159,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) return ret; } +static inline void xsk_buff_del_tail(struct xdp_buff *tail) +{ + struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); + + list_del(&xskb->xskb_list_node); +} + +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + xskb_list_node); + return &frag->xdp; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; @@ -351,6 +368,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) return NULL; } +static inline void xsk_buff_del_tail(struct xdp_buff *tail) +{ +} + +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) +{ + return NULL; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { } -- cgit v1.2.3