tools/testing/selftests/net/tcp_ecmp_failover.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216

#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Copyright 2026 Google LLC.
#
# This test verifies TCP flow failover between ECMP routes
# upon carrier loss on the active device.
#
#   socat  ----------------------------->  socat
#                        |
#           .-- veth-c1 -|- veth-s1 --.
#   dummy0 -|            |            |-- dummy0
#           '-- veth-c2 -|- veth-s2 --'
#                        |
#

# These are assigned before forwarding/lib.sh is sourced and are not
# referenced anywhere else in this file.
# NOTE(review): presumably lib.sh reads them to skip its jq / mausezahn
# dependency checks and to expect no pre-created test interfaces --
# confirm against lib.sh.
REQUIRE_JQ=no
REQUIRE_MZ=no
NUM_NETIFS=0

source forwarding/lib.sh

# Host addresses placed on dummy0 in the client and server namespaces
# (see setup_client / setup_server).  The ECMP routes installed there
# target these addresses as /32 (IPv4) and /128 (IPv6) destinations.
CLIENT_IP="10.0.59.1"
SERVER_IP="10.0.92.1"
CLIENT_IP6="2001:db8:5a9a::1"
SERVER_IP6="2001:db8:9292::1"

# Configure the server namespace.
#
# Creates dummy0 carrying the server's service addresses, brings up the
# two veth devices (which must already exist in the namespace), gives
# them transit addresses, and installs two-nexthop ECMP routes (IPv4 and
# IPv6) towards the client's addresses.
#
# Globals read: server, SERVER_IP, SERVER_IP6, CLIENT_IP, CLIENT_IP6
# Arguments:    none
setup_server()
{
	# A local array keeps the command prefix out of the global
	# namespace and avoids relying on word splitting of a string.
	local -a ip_ns=(ip -n "$server")

	"${ip_ns[@]}" link add dummy0 type dummy
	"${ip_ns[@]}" link set dummy0 up

	"${ip_ns[@]}" -4 addr add "$SERVER_IP/32" dev dummy0
	"${ip_ns[@]}" -6 addr add "$SERVER_IP6/128" dev dummy0 nodad

	"${ip_ns[@]}" link set veth-s1 up
	"${ip_ns[@]}" link set veth-s2 up

	"${ip_ns[@]}" -4 addr add 192.168.1.2/24 dev veth-s1
	"${ip_ns[@]}" -4 addr add 192.168.2.2/24 dev veth-s2

	# Equal-weight IPv4 ECMP route back to the client.
	"${ip_ns[@]}" -4 route add "$CLIENT_IP/32" \
		nexthop via 192.168.1.1 dev veth-s1 weight 1 \
		nexthop via 192.168.2.1 dev veth-s2 weight 1

	"${ip_ns[@]}" -6 addr add 2001:db8:1::2/64 dev veth-s1 nodad
	"${ip_ns[@]}" -6 addr add 2001:db8:2::2/64 dev veth-s2 nodad

	# Equal-weight IPv6 ECMP route back to the client.
	"${ip_ns[@]}" -6 route add "$CLIENT_IP6/128" \
		nexthop via 2001:db8:1::1 dev veth-s1 weight 1 \
		nexthop via 2001:db8:2::1 dev veth-s2 weight 1
}

# Configure the client namespace.
#
# Creates dummy0 carrying the client's source addresses, brings up the
# two veth devices (which must already exist in the namespace), gives
# them transit addresses, installs two-nexthop ECMP routes (IPv4 and
# IPv6) towards the server's addresses, and tunes the sysctls the
# failover test depends on.
#
# Globals read: client, CLIENT_IP, CLIENT_IP6, SERVER_IP, SERVER_IP6
# Arguments:    none
setup_client()
{
	# Local arrays keep the command prefixes out of the global
	# namespace and avoid relying on word splitting of a string.
	local -a ip_ns=(ip -n "$client")
	local -a ns_exec=(ip netns exec "$client")
	local family dev

	"${ip_ns[@]}" link add dummy0 type dummy
	"${ip_ns[@]}" link set dummy0 up

	"${ip_ns[@]}" -4 addr add "$CLIENT_IP/32" dev dummy0
	"${ip_ns[@]}" -6 addr add "$CLIENT_IP6/128" dev dummy0 nodad

	"${ip_ns[@]}" link set veth-c1 up
	"${ip_ns[@]}" link set veth-c2 up

	"${ip_ns[@]}" -4 addr add 192.168.1.1/24 dev veth-c1
	"${ip_ns[@]}" -4 addr add 192.168.2.1/24 dev veth-c2

	# Equal-weight IPv4 ECMP route to the server.
	"${ip_ns[@]}" -4 route add "$SERVER_IP/32" \
		nexthop via 192.168.1.2 dev veth-c1 weight 1 \
		nexthop via 192.168.2.2 dev veth-c2 weight 1

	"${ip_ns[@]}" -6 addr add 2001:db8:1::1/64 dev veth-c1 nodad
	"${ip_ns[@]}" -6 addr add 2001:db8:2::1/64 dev veth-c2 nodad

	# Equal-weight IPv6 ECMP route to the server.
	"${ip_ns[@]}" -6 route add "$SERVER_IP6/128" \
		nexthop via 2001:db8:1::2 dev veth-c1 weight 1 \
		nexthop via 2001:db8:2::2 dev veth-c2 weight 1

	# By default, tcp_retries1=3 triggers a route refresh
	# after 3 retransmits (~5s).  Ensure this never occurs
	# for test stability.
	"${ns_exec[@]}" sysctl -qw net.ipv4.tcp_retries1=100

	# When NETDEV_CHANGE is issued for a dev tied to an ECMP
	# route, RTNH_F_LINKDOWN is flagged and the sernum is
	# bumped to invalidate the route via sk_dst_check().
	#
	# Without ignore_routes_with_linkdown=1, subsequent
	# lookups may still select the same RTNH_F_LINKDOWN route.
	for family in ipv4 ipv6; do
		for dev in veth-c1 veth-c2; do
			"${ns_exec[@]}" sysctl -qw \
				"net.$family.conf.$dev.ignore_routes_with_linkdown=1"
		done
	done
}

# Build the whole topology: create the client and server namespaces,
# connect them with two veth pairs (veth-cN in the client, veth-sN in
# the server), then configure each side.
setup()
{
	local link_id

	setup_ns client server

	for link_id in 1 2; do
		ip -n "$client" link add "veth-c${link_id}" type veth \
			peer "veth-s${link_id}" netns "$server"
	done

	setup_server
	setup_client
}

# Tear down the test namespaces via cleanup_all_ns, discarding anything
# it prints on stdout or stderr.  The helper's exit status is returned.
cleanup()
{
	cleanup_all_ns &> /dev/null
}

# Run one bulk TCP flow from the client to the server across the ECMP
# routes, take down the path the flow is using, and check that the flow
# continues over the remaining path.
#
# Globals:
#   client, server - namespace names (read)
#   ksft_fail      - failure code (read)
#   RET            - reset to 0; set to $ksft_fail when fewer than 1000
#                    packets are seen on the surviving path (written)
# Arguments:
#   $1 - value for socat's "pf" option (IPv4 or IPv6)
#   $2 - server address to listen on and connect to
#   $3 - client address to bind the connecting socket to
#
# Packet counts are taken as the number of lines printed by
# tcpdump_show.
# NOTE(review): assumes tcpdump_show prints one line per captured
# packet -- confirm in lib.sh.
tcp_ecmp_failover()
{
	local pf=$1; shift
	local server_ip=$1; shift
	local client_ip=$1; shift
	# Minimum packets expected on the surviving path after failover.
	local -r min_pkts=1000
	local server_pid client_pid
	local pkts_s1 pkts_s2 pkts
	local veth_down veth_up

	RET=0

	tcpdump_start veth-s1 "$server"
	tcpdump_start veth-s2 "$server"

	ip netns exec "$server" \
		socat -u TCP-LISTEN:8080,pf="$pf",bind="$server_ip",reuseaddr /dev/null &
	server_pid=$!

	# Wait for server to start listening.
	# Sometimes client fails without this sleep.
	sleep 1

	ip netns exec "$client" \
		socat -u /dev/zero TCP:"$server_ip":8080,pf="$pf",bind="$client_ip" &
	client_pid=$!

	# To capture enough packets.
	sleep 3

	tcpdump_stop veth-s1
	tcpdump_stop veth-s2

	pkts_s1=$(tcpdump_show veth-s1 | wc -l)
	pkts_s2=$(tcpdump_show veth-s2 | wc -l)

	tcpdump_cleanup veth-s1
	tcpdump_cleanup veth-s2

	# Detect the device chosen by the client: the busier server-side
	# veth is the one carrying the flow.  A tie falls back to veth-s2.
	if [ "$pkts_s1" -gt "$pkts_s2" ]; then
		veth_down=veth-s1
		veth_up=veth-s2
	else
		veth_down=veth-s2
		veth_up=veth-s1
	fi

	# Taking down $veth_down causes its peer to lose carrier,
	# triggering NETDEV_CHANGE.  This flags RTNH_F_LINKDOWN
	# and bumps the sernum for the route associated with that
	# peer, invalidating the cached dst in the TCP socket.
	#
	# Consequently, sk_dst_check() fails, forcing the subsequent
	# lookup to select the remaining healthy route via $veth_up.
	ip -n "$server" link set "$veth_down" down

	tcpdump_start "$veth_up" "$server"

	# To capture enough packets.
	sleep 3

	tcpdump_stop "$veth_up"

	# The socat pair streams /dev/zero forever, so it has to be
	# killed; reap both jobs quietly before reading the capture.
	kill -9 "$client_pid" > /dev/null 2>&1
	kill -9 "$server_pid" > /dev/null 2>&1
	wait 2> /dev/null

	pkts=$(tcpdump_show "$veth_up" | wc -l)

	tcpdump_cleanup "$veth_up"

	if [ "$pkts" -lt "$min_pkts" ]; then
		RET=$ksft_fail
	fi
}

# IPv4 test case: build the topology, run the failover scenario between
# SERVER_IP and CLIENT_IP, log the result, and tear everything down.
test_ipv4()
{
	setup
	# Quoted like the IPv6 variant so each address is always passed
	# as exactly one argument.
	tcp_ecmp_failover IPv4 "$SERVER_IP" "$CLIENT_IP"
	log_test "TCP IPv4 failover"
	cleanup
}

# IPv6 test case: build the topology, run the failover scenario between
# SERVER_IP6 and CLIENT_IP6, log the result, and tear everything down.
# Both addresses are handed over wrapped in square brackets.
test_ipv6()
{
	local bracketed_server_ip6="[$SERVER_IP6]"
	local bracketed_client_ip6="[$CLIENT_IP6]"

	setup
	tcp_ecmp_failover IPv6 "$bracketed_server_ip6" "$bracketed_client_ip6"
	log_test "TCP IPv6 failover"
	cleanup
}

# socat is invoked directly by tcp_ecmp_failover().
# NOTE(review): tcpdump is presumably used by the tcpdump_* helpers
# from lib.sh -- confirm there.
require_command socat
require_command tcpdump

# Each test case already calls cleanup itself; the trap additionally
# runs it whenever the script exits, e.g. if a test aborts part-way.
trap cleanup EXIT

test_ipv4
test_ipv6

# EXIT_STATUS is never assigned in this file.
# NOTE(review): presumably maintained by log_test in lib.sh -- confirm.
exit "$EXIT_STATUS"