From ab185d7b2551e9b8d946a657ddccd86d192bebd1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 19 Apr 2012 02:24:36 +0000
Subject: ipv6: tcp: dont drop packet but consume it

When we need to clone the skb, we don't drop a packet.
Call consume_skb() so we do not confuse dropwatch.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 86cfe6005f40..050c55186bc4 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1353,7 +1353,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newnp->pktoptions = NULL;
 		if (treq->pktopts != NULL) {
 			newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC);
-			kfree_skb(treq->pktopts);
+			consume_skb(treq->pktopts);
 			treq->pktopts = NULL;
 			if (newnp->pktoptions)
 				skb_set_owner_r(newnp->pktoptions, newsk);
--
cgit v1.2.3

From 900f65d361d333c949ef76a828343075f4fdf523 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Thu, 19 Apr 2012 09:55:21 +0000
Subject: tcp: move duplicate code from tcp_v4_init_sock()/tcp_v6_init_sock()

This commit moves the (substantial) common code shared between
tcp_v4_init_sock() and tcp_v6_init_sock() to a new address-family
independent function, tcp_init_sock().

Centralizing this functionality should help avoid drift issues,
e.g. where the IPv4 side is updated without a corresponding update to
IPv6. There was already some drift: IPv4 initialized snd_cwnd to
TCP_INIT_CWND, while the IPv6 side was still initializing snd_cwnd
to 2 (in this case it should not matter, since snd_cwnd is also
initialized in tcp_init_metrics(), but the general risks and
maintenance overhead remain).

When diffing the old and new code, note that the new tcp_init_sock()
function uses the order of steps from the tcp_v4_init_sock()
implementation (the order is slightly different in
tcp_v6_init_sock()).

Signed-off-by: Neal Cardwell
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 50 +-------------------------------------------------
 1 file changed, 1 insertion(+), 49 deletions(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 050c55186bc4..24dac6b83fb9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1831,62 +1831,14 @@ static int tcp_v6_init_sock(struct sock *sk)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	skb_queue_head_init(&tp->out_of_order_queue);
-	tcp_init_xmit_timers(sk);
-	tcp_prequeue_init(tp);
-
-	icsk->icsk_rto = TCP_TIMEOUT_INIT;
-	tp->mdev = TCP_TIMEOUT_INIT;
-
-	/* So many TCP implementations out there (incorrectly) count the
-	 * initial SYN frame in their delayed-ACK and congestion control
-	 * algorithms that we must have the following bandaid to talk
-	 * efficiently to them.  -DaveM
-	 */
-	tp->snd_cwnd = 2;
-
-	/* See draft-stevens-tcpca-spec-01 for discussion of the
-	 * initialization of these values.
-	 */
-	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
-	tp->snd_cwnd_clamp = ~0;
-	tp->mss_cache = TCP_MSS_DEFAULT;
-
-	tp->reordering = sysctl_tcp_reordering;
-
-	sk->sk_state = TCP_CLOSE;
+	tcp_init_sock(sk);
 
 	icsk->icsk_af_ops = &ipv6_specific;
 
-	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
-	icsk->icsk_sync_mss = tcp_sync_mss;
-	sk->sk_write_space = sk_stream_write_space;
-	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
-
 #ifdef CONFIG_TCP_MD5SIG
 	tp->af_specific = &tcp_sock_ipv6_specific;
 #endif
 
-	/* TCP Cookie Transactions */
-	if (sysctl_tcp_cookie_size > 0) {
-		/* Default, cookies without s_data_payload. */
-		tp->cookie_values =
-			kzalloc(sizeof(*tp->cookie_values),
-				sk->sk_allocation);
-		if (tp->cookie_values != NULL)
-			kref_init(&tp->cookie_values->kref);
-	}
-	/* Presumed zeroed, in order of appearance:
-	 *	cookie_in_always, cookie_out_never,
-	 *	s_data_constant, s_data_in, s_data_out
-	 */
-	sk->sk_sndbuf = sysctl_tcp_wmem[1];
-	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
-
-	local_bh_disable();
-	sock_update_memcg(sk);
-	sk_sockets_allocated_inc(sk);
-	local_bh_enable();
-
 	return 0;
 }
--
cgit v1.2.3
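The consolidation pattern above is worth spelling out: hoist every
address-family-independent assignment into one shared helper, and leave
each per-family init function with nothing but its own ops wiring. A
minimal stand-alone sketch of that shape, using simplified stand-in
types and values rather than the kernel's real sock structures:

#define INITIAL_RTO	1000	/* placeholder tunables, not kernel values */
#define INITIAL_CWND	10
#define STATE_CLOSED	0

struct af_ops { int family; };	/* family-specific operations table */

static const struct af_ops ipv4_specific_ops = { .family = 4 };
static const struct af_ops ipv6_specific_ops = { .family = 6 };

struct proto_sock {
	int rto;
	int snd_cwnd;
	int state;
	const struct af_ops *af_ops;
};

/* AF-independent setup lives in exactly one place, so a tuning change
 * cannot land in the v4 path without also reaching v6. */
static void proto_init_sock(struct proto_sock *sk)
{
	sk->rto = INITIAL_RTO;
	sk->snd_cwnd = INITIAL_CWND;
	sk->state = STATE_CLOSED;
}

static int proto_v4_init_sock(struct proto_sock *sk)
{
	proto_init_sock(sk);
	sk->af_ops = &ipv4_specific_ops;	/* only AF-specific setup remains */
	return 0;
}

static int proto_v6_init_sock(struct proto_sock *sk)
{
	proto_init_sock(sk);
	sk->af_ops = &ipv6_specific_ops;
	return 0;
}

int main(void)
{
	struct proto_sock s4, s6;

	proto_v4_init_sock(&s4);
	proto_v6_init_sock(&s6);
	return s4.snd_cwnd == s6.snd_cwnd ? 0 : 1;	/* always equal now */
}

With this shape, a change like the snd_cwnd update that previously
drifted between families is made once, in the shared helper.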
From ac807fa8e625aff58060876d70298c93a59c4252 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Mon, 23 Apr 2012 03:21:58 -0400
Subject: tcp: Fix build warning after tcp_{v4,v6}_init_sock consolidation.

net/ipv4/tcp_ipv4.c: In function 'tcp_v4_init_sock':
net/ipv4/tcp_ipv4.c:1891:19: warning: unused variable 'tp' [-Wunused-variable]
net/ipv6/tcp_ipv6.c: In function 'tcp_v6_init_sock':
net/ipv6/tcp_ipv6.c:1836:19: warning: unused variable 'tp' [-Wunused-variable]

Reported-by: Stephen Rothwell
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 24dac6b83fb9..8044f6ac1301 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1829,14 +1829,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
 static int tcp_v6_init_sock(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
 
 	tcp_init_sock(sk);
 
 	icsk->icsk_af_ops = &ipv6_specific;
 
 #ifdef CONFIG_TCP_MD5SIG
-	tp->af_specific = &tcp_sock_ipv6_specific;
+	tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
 #endif
 
 	return 0;
--
cgit v1.2.3

From f545a38f74584cc7424cb74f792a00c6d2589485 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 22 Apr 2012 23:34:26 +0000
Subject: net: add a limit parameter to sk_add_backlog()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sk_add_backlog() & sk_rcvqueues_full() hard-coded sk_rcvbuf as the
memory limit. We need to make this limit a parameter for TCP use.

No functional change is expected in this patch: all callers still use
the old sk_rcvbuf limit.

Signed-off-by: Eric Dumazet
Cc: Neal Cardwell
Cc: Tom Herbert
Cc: Maciej Żenczykowski
Cc: Yuchung Cheng
Cc: Ilpo Järvinen
Cc: Rick Jones
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8044f6ac1301..b04e6d8a8371 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1654,7 +1654,7 @@ process:
 			if (!tcp_prequeue(sk, skb))
 				ret = tcp_v6_do_rcv(sk, skb);
 		}
-	} else if (unlikely(sk_add_backlog(sk, skb))) {
+	} else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf))) {
 		bh_unlock_sock(sk);
 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
 		goto discard_and_relse;
--
cgit v1.2.3
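The mechanics of the new parameter are easy to see in miniature: the
fullness test and the enqueue helper both take the memory budget as an
argument, and every existing caller passes the receive-buffer size so
behaviour is unchanged. A simplified sketch with stand-in types, not
the kernel's sock/sk_buff API:

#include <stdbool.h>

struct pkt { unsigned int truesize; struct pkt *next; };

struct sock_lite {
	unsigned int backlog_len;	/* bytes currently queued */
	unsigned int rcvbuf;
	struct pkt *backlog_head, *backlog_tail;
};

/* The fullness test now takes the limit as a parameter... */
static bool backlog_full(const struct sock_lite *sk,
			 const struct pkt *p, unsigned int limit)
{
	return sk->backlog_len + p->truesize > limit;
}

/* ...and so does the enqueue helper. Returns 0 on success, -1 if the
 * packet must be dropped because the budget is exhausted. */
static int add_backlog(struct sock_lite *sk, struct pkt *p,
		       unsigned int limit)
{
	if (backlog_full(sk, p, limit))
		return -1;
	p->next = NULL;
	if (sk->backlog_tail)
		sk->backlog_tail->next = p;
	else
		sk->backlog_head = p;
	sk->backlog_tail = p;
	sk->backlog_len += p->truesize;
	return 0;
}

int main(void)
{
	struct sock_lite sk = { .rcvbuf = 4096 };
	struct pkt p = { .truesize = 512 };

	/* Old behaviour preserved: pass the rcvbuf size explicitly,
	 * exactly what this patch does at every call site. */
	return add_backlog(&sk, &p, sk.rcvbuf);
}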
From da882c1f2ecadb0ed582628ec1585e36b137c0f0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 22 Apr 2012 23:38:54 +0000
Subject: tcp: sk_add_backlog() is too agressive for TCP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While investigating TCP performance problems on 10Gb+ links, we found
that a TCP sender was dropping a lot of incoming ACKs because of the
sk_rcvbuf limit in sk_add_backlog(), especially if the receiver doesn't
use GRO/LRO and sends one ACK every two MSS segments.

A sender usually tweaks sk_sndbuf, but sk_rcvbuf stays at its default
value (87380), allowing too small a backlog.

A TCP ACK, even though small, can consume nearly the same truesize
space as an outgoing packet. Using sk_rcvbuf + sk_sndbuf as the limit
makes sense and is fast to compute.

Performance results on netperf, single flow, receiver with GRO/LRO
disabled: 7500 Mbits instead of 6050 Mbits, and no more TCPBacklogDrop
increments at the sender.

Signed-off-by: Eric Dumazet
Cc: Neal Cardwell
Cc: Tom Herbert
Cc: Maciej Żenczykowski
Cc: Yuchung Cheng
Cc: Ilpo Järvinen
Cc: Rick Jones
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b04e6d8a8371..5fb19d345cfd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1654,7 +1654,8 @@ process:
 			if (!tcp_prequeue(sk, skb))
 				ret = tcp_v6_do_rcv(sk, skb);
 		}
-	} else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf))) {
+	} else if (unlikely(sk_add_backlog(sk, skb,
+					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
 		bh_unlock_sock(sk);
 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
 		goto discard_and_relse;
--
cgit v1.2.3

From 67469601406c12ced3db9956aeb0ef0854e2952f Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 24 Apr 2012 07:37:38 +0000
Subject: ipv6: RTAX_FEATURE_ALLFRAG causes inefficient TCP segment sizing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Quoting Tore Anderson from
https://bugzilla.kernel.org/show_bug.cgi?id=42572:

When RTAX_FEATURE_ALLFRAG is set on a route, the effective TCP segment
size does not take into account the size of the IPv6 Fragmentation
header that needs to be included in outbound packets, causing every
transmitted TCP segment to be fragmented across two IPv6 packets, the
latter of which will only contain 8 bytes of actual payload.

RTAX_FEATURE_ALLFRAG is typically set on a route in response to
receiving an ICMPv6 Packet Too Big message indicating a Path MTU of
less than 1280 bytes. 1280 bytes is the minimum IPv6 MTU; however,
ICMPv6 PTBs with MTU < 1280 are still valid, in particular when an
IPv6 packet is sent to an IPv4 destination through a stateless
translator. Any ICMPv4 Need To Fragment packets originating from the
IPv4 part of the path will be translated to ICMPv6 PTBs, which may then
indicate an MTU of less than 1280.

The Linux kernel refuses to reduce the effective MTU to anything below
1280 bytes; instead it sets it to exactly 1280 bytes, and
RTAX_FEATURE_ALLFRAG is also set. However, the TCP segment size appears
to be set to 1240 bytes (1280 Path MTU - 40 bytes of IPv6 header),
instead of 1232 (additionally taking into account the 8 bytes required
by the IPv6 Fragmentation extension header).

This in turn results in rather inefficient transmission, as every
transmitted TCP segment is now split into two fragments containing
1232+8 bytes of payload.

After this patch, all outgoing packets that include a Fragmentation
header are "atomic" or "non-fragmented" fragments, i.e., they have both
Offset=0 and More Fragments=0.

With help from David S. Miller

Reported-by: Tore Anderson
Signed-off-by: Eric Dumazet
Cc: Maciej Żenczykowski
Cc: Tom Herbert
Tested-by: Tore Anderson
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index cdbf292ad208..57b210969834 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1778,6 +1778,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
 	.syn_recv_sock	= tcp_v6_syn_recv_sock,
 	.get_peer	= tcp_v6_get_peer,
 	.net_header_len	= sizeof(struct ipv6hdr),
+	.net_frag_header_len = sizeof(struct frag_hdr),
 	.setsockopt	= ipv6_setsockopt,
 	.getsockopt	= ipv6_getsockopt,
 	.addr2sockaddr	= inet6_csk_addr2sockaddr,
--
cgit v1.2.3
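The numbers in the report fall straight out of the header arithmetic:
once ALLFRAG is in effect, the 8-byte Fragmentation header must come
out of the same budget as the fixed IPv6 header. A small illustrative
helper (not the kernel's tcp_sync_mss()) showing where 1240 and 1232
come from:

#include <stdio.h>

#define IPV6_HDR_LEN	40	/* fixed IPv6 header */
#define FRAG_HDR_LEN	8	/* IPv6 Fragmentation extension header */

/* Room left for the TCP segment (header + payload) at a given path
 * MTU. With allfrag set, every packet carries a Fragmentation header,
 * so that header must be reserved up front. */
static int tcp_segment_size(int pmtu, int allfrag)
{
	int overhead = IPV6_HDR_LEN;

	if (allfrag)
		overhead += FRAG_HDR_LEN;
	return pmtu - overhead;
}

int main(void)
{
	/* Before the fix the stack sized segments as if no fragment
	 * header existed, so each 1240-byte segment overflowed the
	 * 1232-byte budget and was split into 1232+8-byte fragments. */
	printf("without allfrag: %d\n", tcp_segment_size(1280, 0)); /* 1240 */
	printf("with allfrag:    %d\n", tcp_segment_size(1280, 1)); /* 1232 */
	return 0;
}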
From bd14b1b2e29bd6812597f896dde06eaf7c6d2f24 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 4 May 2012 05:14:02 +0000
Subject: tcp: be more strict before accepting ECN negociation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It appears some networks play bad games with the two bits reserved for
ECN. This can trigger false congestion notifications and very slow
transfers.

Since RFC 3168 (6.1.1) forbids SYN packets to carry CT bits, we can
disable TCP ECN negotiation if we happen to receive mangled CT bits in
the SYN packet.

Signed-off-by: Eric Dumazet
Cc: Perry Lorier
Cc: Matt Mathis
Cc: Yuchung Cheng
Cc: Neal Cardwell
Cc: Wilmer van der Gaast
Cc: Ankur Jain
Cc: Tom Herbert
Cc: Dave Täht
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 57b210969834..078d039e8fd2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1140,7 +1140,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	treq->rmt_addr = ipv6_hdr(skb)->saddr;
 	treq->loc_addr = ipv6_hdr(skb)->daddr;
 	if (!want_cookie || tmp_opt.tstamp_ok)
-		TCP_ECN_create_request(req, tcp_hdr(skb));
+		TCP_ECN_create_request(req, skb);
 
 	treq->iif = sk->sk_bound_dev_if;
--
cgit v1.2.3

From e87cc4728f0e2fb663e592a1141742b1d6c63256 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Sun, 13 May 2012 21:56:26 +0000
Subject: net: Convert net_ratelimit uses to net_<level>_ratelimited

Standardize the net core ratelimited logging functions.

Coalesce formats, align arguments.
Change a printk then vprintk sequence to use printf extension %pV.

Signed-off-by: Joe Perches
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 078d039e8fd2..4cf55ae7bf80 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -723,12 +723,10 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
 			      NULL, NULL, skb);
 
 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
-		if (net_ratelimit()) {
-			printk(KERN_INFO "MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n",
-			       genhash ? "failed" : "mismatch",
-			       &ip6h->saddr, ntohs(th->source),
-			       &ip6h->daddr, ntohs(th->dest));
-		}
+		net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n",
+				     genhash ? "failed" : "mismatch",
+				     &ip6h->saddr, ntohs(th->source),
+				     &ip6h->daddr, ntohs(th->dest));
 		return 1;
 	}
 	return 0;
--
cgit v1.2.3
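The pattern behind the conversion: fold the rate-limit test and the
leveled printk into a single per-level helper, so every call site
shrinks to one statement. A stand-alone userspace sketch of that shape,
with a crude once-per-second counter standing in for the kernel's
net_ratelimit() state:

#include <stdarg.h>
#include <stdio.h>
#include <time.h>

/* Stand-in for the kernel's ratelimit state: allow at most BURST
 * messages per one-second window, silently drop the rest. */
static int log_ratelimit(void)
{
	enum { BURST = 10 };
	static time_t window;
	static int used;
	time_t now = time(NULL);

	if (now != window) {
		window = now;
		used = 0;
	}
	return used++ < BURST;
}

/* One helper per level, so call sites collapse to a single line. */
static void log_info_ratelimited(const char *fmt, ...)
{
	va_list ap;

	if (!log_ratelimit())
		return;
	va_start(ap, fmt);
	fprintf(stderr, "info: ");
	vfprintf(stderr, fmt, ap);
	va_end(ap);
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		log_info_ratelimited("event %d\n", i);	/* only ~10 appear */
	return 0;
}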
From a2a385d627e1549da4b43a8b3dfe370589766e1c Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 16 May 2012 23:15:34 +0000
Subject: tcp: bool conversions

bool conversions where possible.

__inline__ -> inline

space cleanups

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4cf55ae7bf80..554d5999abc4 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1055,7 +1055,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 isn = TCP_SKB_CB(skb)->when;
 	struct dst_entry *dst = NULL;
-	int want_cookie = 0;
+	bool want_cookie = false;
 
 	if (skb->protocol == htons(ETH_P_IP))
 		return tcp_v4_conn_request(sk, skb);
@@ -1116,7 +1116,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 			while (l-- > 0)
 				*c++ ^= *hash_location++;
 
-			want_cookie = 0;	/* not our kind of cookie */
+			want_cookie = false;	/* not our kind of cookie */
 			tmp_ext.cookie_out_never = 0; /* false */
 			tmp_ext.cookie_plus = tmp_opt.cookie_plus;
 		} else if (!tp->rx_opt.cookie_in_always) {
--
cgit v1.2.3
From fff3269907897ee91406ece125795f53e722677e Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 1 Jun 2012 01:47:50 +0000
Subject: tcp: reflect SYN queue_mapping into SYNACK packets

While testing how Linux behaves under a SYNFLOOD attack on a multiqueue
device (ixgbe), I found that SYNACK messages were dropped at the Qdisc
level because we send them all on a single queue.

The obvious choice is to reflect the incoming SYN packet's
@queue_mapping onto the SYNACK packet.

Under stress, my machine could only send 25.000 SYNACK per second (for
200.000 incoming SYN per second). NIC: ixgbe with 16 rx/tx queues.

After the patch, not a single SYNACK is dropped.

Signed-off-by: Eric Dumazet
Cc: Hans Schillstrom
Cc: Jesper Dangaard Brouer
Cc: Neal Cardwell
Cc: Tom Herbert
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 554d5999abc4..3a9aec29581a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -476,7 +476,8 @@ out:
 
 
 static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
-			      struct request_values *rvp)
+			      struct request_values *rvp,
+			      u16 queue_mapping)
 {
 	struct inet6_request_sock *treq = inet6_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -513,6 +514,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 		__tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
 
 		fl6.daddr = treq->rmt_addr;
+		skb_set_queue_mapping(skb, queue_mapping);
 		err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
 		err = net_xmit_eval(err);
 	}
@@ -528,7 +530,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req,
 			     struct request_values *rvp)
 {
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-	return tcp_v6_send_synack(sk, req, rvp);
+	return tcp_v6_send_synack(sk, req, rvp, 0);
 }
 
 static void tcp_v6_reqsk_destructor(struct request_sock *req)
@@ -1213,7 +1215,8 @@ have_isn:
 	security_inet_conn_request(sk, skb, req);
 
 	if (tcp_v6_send_synack(sk, req,
-			       (struct request_values *)&tmp_ext) ||
+			       (struct request_values *)&tmp_ext,
+			       skb_get_queue_mapping(skb)) ||
 	    want_cookie)
 		goto drop_and_free;
--
cgit v1.2.3
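The underlying idea generalizes: a reply should inherit the queue of
the packet that triggered it, so response traffic spreads across TX
queues the same way the incoming load spreads across RX queues, instead
of funneling through one easily-overrun qdisc. A minimal sketch with
hypothetical stand-in types (queue_mapping mirrors skb->queue_mapping;
send_reply() is illustrative only):

#include <stdint.h>

/* Stand-in for the few sk_buff fields this pattern touches. */
struct pkt {
	uint16_t queue_mapping;	/* RX queue it arrived on / TX queue to use */
};

static uint16_t pkt_get_queue_mapping(const struct pkt *p)
{
	return p->queue_mapping;
}

static void pkt_set_queue_mapping(struct pkt *p, uint16_t q)
{
	p->queue_mapping = q;
}

/* Build the reply on the same queue the request came in on. A reply
 * generated by a timer (a retransmit) has no triggering packet, so
 * queue 0 is the fallback -- mirroring the tcp_v6_rtx_synack() case,
 * which passes 0 above. */
static void send_reply(struct pkt *reply, const struct pkt *request)
{
	pkt_set_queue_mapping(reply,
			      request ? pkt_get_queue_mapping(request) : 0);
	/* ...then hand reply to the driver on that TX queue */
}

int main(void)
{
	struct pkt syn = { .queue_mapping = 7 }, synack = { 0 };

	send_reply(&synack, &syn);	/* synack now targets queue 7 */
	return synack.queue_mapping == 7 ? 0 : 1;
}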