From a9e050f4e7f9d36afe0dcc0bddba864ee442715e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Aug 2012 22:34:50 +0000 Subject: net: tcp: GRO should be ECN friendly While doing TCP ECN tests, I discovered GRO was reordering packets if it receives one packet with CE set, while previous packets in same NAPI run have ECT(0) for the same flow : 09:25:25.857620 IP (tos 0x2,ECT(0), ttl 64, id 27893, offset 0, flags [DF], proto TCP (6), length 4396) 172.30.42.19.54550 > 172.30.42.13.44139: Flags [.], seq 233801:238145, ack 1, win 115, options [nop,nop,TS val 3397779 ecr 1990627], length 4344 09:25:25.857626 IP (tos 0x3,CE, ttl 64, id 27892, offset 0, flags [DF], proto TCP (6), length 1500) 172.30.42.19.54550 > 172.30.42.13.44139: Flags [.], seq 232353:233801, ack 1, win 115, options [nop,nop,TS val 3397779 ecr 1990627], length 1448 09:25:25.857638 IP (tos 0x0, ttl 64, id 34581, offset 0, flags [DF], proto TCP (6), length 64) 172.30.42.13.44139 > 172.30.42.19.54550: Flags [.], cksum 0xac8f (incorrect -> 0xca69), ack 232353, win 1271, options [nop,nop,TS val 1990627 ecr 3397779,nop,nop,sack 1 {233801:238145}], length 0 We have two problems here : 1) GRO reorders packets If NIC gave packet1, then packet2, which happen to be from "different flows" GRO feeds stack with packet2, then packet1. I have yet to understand how to solve this problem. 2) GRO is not ECN friendly Delivering packets out of order makes TCP stack not as fast as it could be. In this patch I suggest we make the tos test not part of the 'same_flow' determination, but part of the 'should flush' logic Signed-off-by: Eric Dumazet Cc: Herbert Xu Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fe4582ca969a..6681ccf5c3ee 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1364,7 +1364,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (*(u8 *)iph != 0x45) goto out_unlock; - if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + if (unlikely(ip_fast_csum((u8 *)iph, 5))) goto out_unlock; id = ntohl(*(__be32 *)&iph->id); @@ -1380,7 +1380,6 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, iph2 = ip_hdr(p); if ((iph->protocol ^ iph2->protocol) | - (iph->tos ^ iph2->tos) | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { NAPI_GRO_CB(p)->same_flow = 0; @@ -1390,6 +1389,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, /* All fields must match except length and checksum. */ NAPI_GRO_CB(p)->flush |= (iph->ttl ^ iph2->ttl) | + (iph->tos ^ iph2->tos) | ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); NAPI_GRO_CB(p)->flush |= flush; -- cgit v1.2.3 From 8336886f786fdacbc19b719c1f7ea91eb70706d4 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Fri, 31 Aug 2012 12:29:12 +0000 Subject: tcp: TCP Fast Open Server - support TFO listeners This patch builds on top of the previous patch to add the support for TFO listeners. This includes - 1. allocating, properly initializing, and managing the per listener fastopen_queue structure when TFO is enabled 2. changes to the inet_csk_accept code to support TFO. E.g., the request_sock can no longer be freed upon accept(), not until 3WHS finishes 3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg() if it's a TFO socket 4. properly closing a TFO listener, and a TFO socket before 3WHS finishes 5. supporting TCP_FASTOPEN socket option 6. modifying tcp_check_req() to use to check a TFO socket as well as request_sock 7. supporting TCP's TFO cookie option 8. adding a new SYN-ACK retransmit handler to use the timer directly off the TFO socket rather than the listener socket. Note that TFO server side will not retransmit anything other than SYN-ACK until the 3WHS is completed. The patch also contains an important function "reqsk_fastopen_remove()" to manage the somewhat complex relation between a listener, its request_sock, and the corresponding child socket. See the comment above the function for the detail. Signed-off-by: H.K. Jerry Chu Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Eric Dumazet Cc: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'net/ipv4/af_inet.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6681ccf5c3ee..4f70ef0b946d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -149,6 +149,11 @@ void inet_sock_destruct(struct sock *sk) pr_err("Attempt to release alive inet socket %p\n", sk); return; } + if (sk->sk_type == SOCK_STREAM) { + struct fastopen_queue *fastopenq = + inet_csk(sk)->icsk_accept_queue.fastopenq; + kfree(fastopenq); + } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(atomic_read(&sk->sk_wmem_alloc)); @@ -212,6 +217,26 @@ int inet_listen(struct socket *sock, int backlog) * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { + /* Check special setups for testing purpose to enable TFO w/o + * requiring TCP_FASTOPEN sockopt. + * Note that only TCP sockets (SOCK_STREAM) will reach here. + * Also fastopenq may already been allocated because this + * socket was in TCP_LISTEN state previously but was + * shutdown() (rather than close()). + */ + if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && + inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) { + if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) + err = fastopen_init_queue(sk, backlog); + else if ((sysctl_tcp_fastopen & + TFO_SERVER_WO_SOCKOPT2) != 0) + err = fastopen_init_queue(sk, + ((uint)sysctl_tcp_fastopen) >> 16); + else + err = 0; + if (err) + goto out; + } err = inet_csk_listen_start(sk, backlog); if (err) goto out; @@ -701,7 +726,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) sock_rps_record_flow(sk2); WARN_ON(!((1 << sk2->sk_state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); + (TCPF_ESTABLISHED | TCPF_SYN_RECV | + TCPF_CLOSE_WAIT | TCPF_CLOSE))); sock_graft(sk2, newsock); -- cgit v1.2.3 From 7ab4551f3b391818e29263279031dca1e26417c6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Sep 2012 08:07:13 +0000 Subject: tcp: fix TFO regression Fengguang Wu reported various panics and bisected to commit 8336886f786fdac (tcp: TCP Fast Open Server - support TFO listeners) Fix this by making sure socket is a TCP socket before accessing TFO data structures. [ 233.046014] kfree_debugcheck: out of range ptr ea6000000bb8h. [ 233.047399] ------------[ cut here ]------------ [ 233.048393] kernel BUG at /c/kernel-tests/src/stable/mm/slab.c:3074! [ 233.048393] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC [ 233.048393] Modules linked in: [ 233.048393] CPU 0 [ 233.048393] Pid: 3929, comm: trinity-watchdo Not tainted 3.6.0-rc3+ #4192 Bochs Bochs [ 233.048393] RIP: 0010:[] [] kfree_debugcheck+0x27/0x2d [ 233.048393] RSP: 0018:ffff88000facbca8 EFLAGS: 00010092 [ 233.048393] RAX: 0000000000000031 RBX: 0000ea6000000bb8 RCX: 00000000a189a188 [ 233.048393] RDX: 000000000000a189 RSI: ffffffff8108ad32 RDI: ffffffff810d30f9 [ 233.048393] RBP: ffff88000facbcb8 R08: 0000000000000002 R09: ffffffff843846f0 [ 233.048393] R10: ffffffff810ae37c R11: 0000000000000908 R12: 0000000000000202 [ 233.048393] R13: ffffffff823dbd5a R14: ffff88000ec5bea8 R15: ffffffff8363c780 [ 233.048393] FS: 00007faa6899c700(0000) GS:ffff88001f200000(0000) knlGS:0000000000000000 [ 233.048393] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 233.048393] CR2: 00007faa6841019c CR3: 0000000012c82000 CR4: 00000000000006f0 [ 233.048393] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 233.048393] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 233.048393] Process trinity-watchdo (pid: 3929, threadinfo ffff88000faca000, task ffff88000faec600) [ 233.048393] Stack: [ 233.048393] 0000000000000000 0000ea6000000bb8 ffff88000facbce8 ffffffff8116ad81 [ 233.048393] ffff88000ff588a0 ffff88000ff58850 ffff88000ff588a0 0000000000000000 [ 233.048393] ffff88000facbd08 ffffffff823dbd5a ffffffff823dbcb0 ffff88000ff58850 [ 233.048393] Call Trace: [ 233.048393] [] kfree+0x5f/0xca [ 233.048393] [] inet_sock_destruct+0xaa/0x13c [ 233.048393] [] ? inet_sk_rebuild_header +0x319/0x319 [ 233.048393] [] __sk_free+0x21/0x14b [ 233.048393] [] sk_free+0x26/0x2a [ 233.048393] [] sctp_close+0x215/0x224 [ 233.048393] [] ? lock_release+0x16f/0x1b9 [ 233.048393] [] inet_release+0x7e/0x85 [ 233.048393] [] sock_release+0x1f/0x77 [ 233.048393] [] sock_close+0x27/0x2b [ 233.048393] [] __fput+0x101/0x20a [ 233.048393] [] ____fput+0xe/0x10 [ 233.048393] [] task_work_run+0x5d/0x75 [ 233.048393] [] do_exit+0x290/0x7f5 [ 233.048393] [] ? retint_swapgs+0x13/0x1b [ 233.048393] [] do_group_exit+0x7b/0xba [ 233.048393] [] sys_exit_group+0x17/0x17 [ 233.048393] [] tracesys+0xdd/0xe2 [ 233.048393] Code: 59 01 5d c3 55 48 89 e5 53 41 50 0f 1f 44 00 00 48 89 fb e8 d4 b0 f0 ff 84 c0 75 11 48 89 de 48 c7 c7 fc fa f7 82 e8 0d 0f 57 01 <0f> 0b 5f 5b 5d c3 55 48 89 e5 0f 1f 44 00 00 48 63 87 d8 00 00 [ 233.048393] RIP [] kfree_debugcheck+0x27/0x2d [ 233.048393] RSP Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Eric Dumazet Cc: "H.K. Jerry Chu" Acked-by: Neal Cardwell Acked-by: H.K. Jerry Chu Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4f70ef0b946d..845372b025f6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -149,11 +149,8 @@ void inet_sock_destruct(struct sock *sk) pr_err("Attempt to release alive inet socket %p\n", sk); return; } - if (sk->sk_type == SOCK_STREAM) { - struct fastopen_queue *fastopenq = - inet_csk(sk)->icsk_accept_queue.fastopenq; - kfree(fastopenq); - } + if (sk->sk_protocol == IPPROTO_TCP) + kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(atomic_read(&sk->sk_wmem_alloc)); -- cgit v1.2.3 From bb68b64724a4fd6b93d83b39aeffa4aadb2562fc Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 18 Sep 2012 14:19:23 +0000 Subject: ipv4: Don't add TCP-code in inet_sock_destruct Signed-off-by: Christoph Paasch Acked-by: H.K. Jerry Chu Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 845372b025f6..766c59658563 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -149,8 +149,6 @@ void inet_sock_destruct(struct sock *sk) pr_err("Attempt to release alive inet socket %p\n", sk); return; } - if (sk->sk_protocol == IPPROTO_TCP) - kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(atomic_read(&sk->sk_wmem_alloc)); -- cgit v1.2.3