From 4aea39c11c610e411768649fdc04777903ebfe07 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 Jun 2012 20:33:21 +0000 Subject: tcp: tcp_make_synack() consumes dst parameter tcp_make_synack() clones the dst, and callers release it. We can avoid two atomic operations per SYNACK if tcp_make_synack() consumes dst instead of cloning it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3a9aec29581a..80758255556c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -522,7 +522,6 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, done: if (opt && opt != np->opt) sock_kfree_s(sk, opt, opt->tot_len); - dst_release(dst); return err; } -- cgit v1.2.3 From 54db0cc2ba0d38166acc2d6bae21721405305537 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Fri, 8 Jun 2012 01:21:40 +0000 Subject: inetpeer: add parameter net for inet_getpeer_v4,v6 add struct net as a parameter of inet_getpeer_v[4,6], use net to replace &init_net. and modify some places to provide net for inet_getpeer_v[4,6] Signed-off-by: Gao feng Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 80758255556c..1a9cdd09f11c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1736,11 +1736,12 @@ static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it) { struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk); struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); struct inet_peer *peer; if (!rt || !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) { - peer = inet_getpeer_v6(&np->daddr, 1); + peer = inet_getpeer_v6(net, &np->daddr, 1); *release_it = true; } else { if (!rt->rt6i_peer) @@ -1756,11 +1757,12 @@ static void *tcp_v6_tw_get_peer(struct sock *sk) { const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); const struct inet_timewait_sock *tw = inet_twsk(sk); + struct net *net = sock_net(sk); if (tw->tw_family == AF_INET) return tcp_v4_tw_get_peer(sk); - return inet_getpeer_v6(&tw6->tw_v6_daddr, 1); + return inet_getpeer_v6(net, &tw6->tw_v6_daddr, 1); } static struct timewait_sock_ops tcp6_timewait_sock_ops = { -- cgit v1.2.3 From fbfe95a42e90b3dd079cc9019ba7d7700feee0f6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 8 Jun 2012 23:24:18 -0700 Subject: inet: Create and use rt{,6}_get_peer_create(). There's a lot of places that open-code rt{,6}_get_peer() only because they want to set 'create' to one. So add an rt{,6}_get_peer_create() for their sake. There were also a few spots open-coding plain rt{,6}_get_peer() and those are transformed here as well. Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1a9cdd09f11c..218433cb9928 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1744,9 +1744,7 @@ static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it) peer = inet_getpeer_v6(net, &np->daddr, 1); *release_it = true; } else { - if (!rt->rt6i_peer) - rt6_bind_peer(rt, 1); - peer = rt->rt6i_peer; + peer = rt6_get_peer_create(rt); *release_it = false; } -- cgit v1.2.3 From 4670fd819e7f47392c7c6fc6168ea2857c66d163 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 9 Jun 2012 01:25:47 -0700 Subject: tcp: Get rid of inetpeer special cases. The get_peer method TCP uses is full of special cases that make no sense accommodating, and it also gets in the way of doing more reasonable things here. First of all, if the socket doesn't have a usable cached route, there is no sense in trying to optimize timewait recycling. Likewise for the case where we have IP options, such as SRR enabled, that make the IP header destination address (and thus the destination address of the route key) differ from that of the connection's destination address. Just return a NULL peer in these cases, and thus we're also able to get rid of the clumsy inetpeer release logic. Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 218433cb9928..b5ecf37b61a6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1732,23 +1732,18 @@ do_time_wait: goto discard_it; } -static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it) +static struct inet_peer *tcp_v6_get_peer(struct sock *sk) { struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct net *net = sock_net(sk); - struct inet_peer *peer; - - if (!rt || - !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) { - peer = inet_getpeer_v6(net, &np->daddr, 1); - *release_it = true; - } else { - peer = rt6_get_peer_create(rt); - *release_it = false; - } - return peer; + /* If we don't have a valid cached route, or we're doing IP + * options which make the IPv6 header destination address + * different from our peer's, do not bother with this. + */ + if (!rt || !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) + return NULL; + return rt6_get_peer_create(rt); } static void *tcp_v6_tw_get_peer(struct sock *sk) -- cgit v1.2.3 From 2397849baa7c44c242e5d5142d5d16d1e7ed53d0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 9 Jun 2012 14:56:12 -0700 Subject: [PATCH] tcp: Cache inetpeer in timewait socket, and only when necessary. Since it's guaranteed that we will access the inetpeer if we're trying to do timewait recycling and TCP options were enabled on the connection, just cache the peer in the timewait socket. In the future, inetpeer lookups will be context dependent (per routing realm), and this helps facilitate that as well. Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b5ecf37b61a6..f91b0bfd12d5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1746,23 +1746,10 @@ static struct inet_peer *tcp_v6_get_peer(struct sock *sk) return rt6_get_peer_create(rt); } -static void *tcp_v6_tw_get_peer(struct sock *sk) -{ - const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); - const struct inet_timewait_sock *tw = inet_twsk(sk); - struct net *net = sock_net(sk); - - if (tw->tw_family == AF_INET) - return tcp_v4_tw_get_peer(sk); - - return inet_getpeer_v6(net, &tw6->tw_v6_daddr, 1); -} - static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_unique = tcp_twsk_unique, .twsk_destructor= tcp_twsk_destructor, - .twsk_getpeer = tcp_v6_tw_get_peer, }; static const struct inet_connection_sock_af_ops ipv6_specific = { -- cgit v1.2.3 From 81aded24675ebda5de8a68843250ad15584ac38a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 15 Jun 2012 14:54:11 -0700 Subject: ipv6: Handle PMTU in ICMP error handlers. One tricky issue on the ipv6 side vs. ipv4 is that the ICMP callouts to handle the error pass the 32-bit info cookie in network byte order whereas ipv4 passes it around in host byte order. Like the ipv4 side, we have two helper functions. One for when we have a socket context and one for when we do not. ip6ip6 tunnels are not handled here, because they handle PMTU events by essentially relaying another ICMP packet-too-big message back to the original sender. This patch allows us to get rid of rt6_do_pmtu_disc(). It handles all kinds of situations that simply cannot happen when we do the PMTU update directly using a fully resolved route. In fact, the "plen == 128" check in ip6_rt_update_pmtu() can very likely be removed or changed into a BUG_ON() check. 
We should never have a prefixed ipv6 route when we get there. Another piece of strange history here is that TCP and DCCP, unlike in ipv4, never invoke the update_pmtu() method from their ICMP error handlers. This is incredibly astonishing since this is the context where we have the most accurate context in which to make a PMTU update, namely we have a fully connected socket and associated cached socket route. Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f91b0bfd12d5..26a88623940b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -415,6 +415,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else dst_hold(dst); + dst->ops->update_pmtu(dst, ntohl(info)); + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); -- cgit v1.2.3 From 3840a06e6046aaee95f33a120499d2dc8c054b9d Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 28 Jun 2012 12:34:19 +0000 Subject: tcp: pass fl6 to inet6_csk_route_req() This commit changes inet6_csk_route_req() so that it uses a pointer to a struct flowi6, rather than allocating its own on the stack. This brings its behavior in line with its IPv4 cousin, inet_csk_route_req(), and allows a follow-on patch to fix a dst leak. Signed-off-by: Neal Cardwell Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fc0b96bf9051..4e5fa5f6ec68 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -477,7 +477,8 @@ out: } -static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, +static int tcp_v6_send_synack(struct sock *sk, + struct request_sock *req, struct request_values *rvp, u16 queue_mapping) { @@ -1058,6 +1059,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); __u32 isn = TCP_SKB_CB(skb)->when; struct dst_entry *dst = NULL; + struct flowi6 fl6; bool want_cookie = false; if (skb->protocol == htons(ETH_P_IP)) @@ -1177,7 +1179,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && - (dst = inet6_csk_route_req(sk, req)) != NULL && + (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL && (peer = rt6_get_peer((struct rt6_info *)dst)) != NULL && ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6, &treq->rmt_addr)) { @@ -1247,6 +1249,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; #endif + struct flowi6 fl6; if (skb->protocol == htons(ETH_P_IP)) { /* @@ -1309,7 +1312,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, goto out_overflow; if (!dst) { - dst = inet6_csk_route_req(sk, req); + dst = inet6_csk_route_req(sk, &fl6, req); if (!dst) goto out; } -- cgit v1.2.3 From 9494218fbae2f88bd3f9b887714734abfdf38bab Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 28 Jun 2012 12:34:20 +0000 Subject: tcp: use inet6_csk_route_req() in tcp_v6_send_synack() With the recent change (earlier in this patch series) to set flowi6_oif to treq->iif in inet6_csk_route_req(), the dst lookup in these two functions is now 
identical, so tcp_v6_send_synack() can now just call inet6_csk_route_req(), to reduce code duplication and keep things closer to the IPv4 side, which is structured this way. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4e5fa5f6ec68..d1db0caefdcd 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -485,34 +485,17 @@ static int tcp_v6_send_synack(struct sock *sk, struct inet6_request_sock *treq = inet6_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff * skb; - struct ipv6_txoptions *opt = NULL; - struct in6_addr * final_p, final; + struct ipv6_txoptions *opt = np->opt; struct flowi6 fl6; struct dst_entry *dst; - int err; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_TCP; - fl6.daddr = treq->rmt_addr; - fl6.saddr = treq->loc_addr; - fl6.flowlabel = 0; - fl6.flowi6_oif = treq->iif; - fl6.flowi6_mark = sk->sk_mark; - fl6.fl6_dport = inet_rsk(req)->rmt_port; - fl6.fl6_sport = inet_rsk(req)->loc_port; - security_req_classify_flow(req, flowi6_to_flowi(&fl6)); - - opt = np->opt; - final_p = fl6_update_dst(&fl6, opt, &final); + int err = -ENOMEM; - dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - dst = NULL; + dst = inet6_csk_route_req(sk, &fl6, req); + if (!dst) goto done; - } + skb = tcp_make_synack(sk, dst, req, rvp); - err = -ENOMEM; + if (skb) { __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr); -- cgit v1.2.3 From 9f10d3f6f966ef6f6a8d025a4b1d341923d04607 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 28 Jun 2012 12:34:21 +0000 Subject: tcp: plug dst leak in tcp_v6_conn_request() The code in tcp_v6_conn_request() was implicitly assuming that tcp_v6_send_synack() would take care of dst_release(), much as tcp_v4_send_synack() already does. 
This resulted in tcp_v6_conn_request() leaking a dst if sysctl_tw_recycle is enabled. This commit restructures tcp_v6_send_synack() so that it accepts a dst pointer and takes care of releasing the dst that is passed in, to plug the leak and avoid future surprises by bringing the IPv6 behavior in line with the IPv4 side. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d1db0caefdcd..9c06eafaf695 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -477,7 +477,8 @@ out: } -static int tcp_v6_send_synack(struct sock *sk, +static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, + struct flowi6 *fl6, struct request_sock *req, struct request_values *rvp, u16 queue_mapping) @@ -486,12 +487,10 @@ static int tcp_v6_send_synack(struct sock *sk, struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff * skb; struct ipv6_txoptions *opt = np->opt; - struct flowi6 fl6; - struct dst_entry *dst; int err = -ENOMEM; - dst = inet6_csk_route_req(sk, &fl6, req); - if (!dst) + /* First, grab a route. 
*/ + if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL) goto done; skb = tcp_make_synack(sk, dst, req, rvp); @@ -499,9 +498,9 @@ static int tcp_v6_send_synack(struct sock *sk, if (skb) { __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr); - fl6.daddr = treq->rmt_addr; + fl6->daddr = treq->rmt_addr; skb_set_queue_mapping(skb, queue_mapping); - err = ip6_xmit(sk, skb, &fl6, opt, np->tclass); + err = ip6_xmit(sk, skb, fl6, opt, np->tclass); err = net_xmit_eval(err); } @@ -514,8 +513,10 @@ done: static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req, struct request_values *rvp) { + struct flowi6 fl6; + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - return tcp_v6_send_synack(sk, req, rvp, 0); + return tcp_v6_send_synack(sk, NULL, &fl6, req, rvp, 0); } static void tcp_v6_reqsk_destructor(struct request_sock *req) @@ -1201,7 +1202,7 @@ have_isn: if (security_inet_conn_request(sk, skb, req)) goto drop_and_release; - if (tcp_v6_send_synack(sk, req, + if (tcp_v6_send_synack(sk, dst, &fl6, req, (struct request_values *)&tmp_ext, skb_get_queue_mapping(skb)) || want_cookie) -- cgit v1.2.3 From 43264e0bd96304092062c013b0612cc944508288 Mon Sep 17 00:00:00 2001 From: "RongQing.Li" Date: Sun, 1 Jul 2012 17:18:59 +0000 Subject: ipv6: remove unnecessary codes in tcp_ipv6.c opt always equals np->opts, so it is meaningless to define opt, and check if opt does not equal np->opts and then try to free opt. Signed-off-by: RongQing.Li Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9c06eafaf695..6cc67ed6c2e6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -486,7 +486,6 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, struct inet6_request_sock *treq = inet6_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff * skb; - struct ipv6_txoptions *opt = np->opt; int err = -ENOMEM; /* First, grab a route. */ @@ -500,13 +499,11 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, fl6->daddr = treq->rmt_addr; skb_set_queue_mapping(skb, queue_mapping); - err = ip6_xmit(sk, skb, fl6, opt, np->tclass); + err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); err = net_xmit_eval(err); } done: - if (opt && opt != np->opt) - sock_kfree_s(sk, opt, opt->tot_len); return err; } @@ -1229,7 +1226,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct inet_sock *newinet; struct tcp_sock *newtp; struct sock *newsk; - struct ipv6_txoptions *opt; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; #endif @@ -1290,7 +1286,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } treq = inet6_rsk(req); - opt = np->opt; if (sk_acceptq_is_full(sk)) goto out_overflow; @@ -1359,11 +1354,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, but we make one more one thing there: reattach optmem to newsk. 
*/ - if (opt) { - newnp->opt = ipv6_dup_options(newsk, opt); - if (opt != np->opt) - sock_kfree_s(sk, opt, opt->tot_len); - } + if (np->opt) + newnp->opt = ipv6_dup_options(newsk, np->opt); inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newnp->opt) @@ -1410,8 +1402,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, out_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); out_nonewsk: - if (opt && opt != np->opt) - sock_kfree_s(sk, opt, opt->tot_len); dst_release(dst); out: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); -- cgit v1.2.3 From ab92bb2f679d66c7e12a6b1c0cdd76fe308f6546 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 9 Jul 2012 16:19:30 -0700 Subject: tcp: Abstract back handling peer aliveness test into helper function. Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6cc67ed6c2e6..75d179555c28 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1177,7 +1177,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && (!peer || !peer->tcp_ts_stamp) && - (!dst || !dst_metric(dst, RTAX_RTT))) { + !tcp_peer_is_proven(req, dst)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. -- cgit v1.2.3 From 81166dd6fa8eb780b2132d32fbc77eb6ac04e44e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 10 Jul 2012 03:14:24 -0700 Subject: tcp: Move timestamps from inetpeer to metrics cache. With help from Lin Ming. Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 75d179555c28..9e96b5f21d2a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -277,22 +277,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, rt = (struct rt6_info *) dst; if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && - ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) { - struct inet_peer *peer = rt6_get_peer(rt); - /* - * VJ's idea. We save last timestamp seen from - * the destination in peer table, when entering state - * TIME-WAIT * and initialize rx_opt.ts_recent from it, - * when trying new connection. - */ - if (peer) { - inet_peer_refcheck(peer); - if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { - tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; - tp->rx_opt.ts_recent = peer->tcp_ts; - } - } - } + ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) + tcp_fetch_timewait_stamp(sk, dst); icsk->icsk_ext_hdr_len = 0; if (np->opt) @@ -1134,8 +1120,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) treq->iif = inet6_iif(skb); if (!isn) { - struct inet_peer *peer = NULL; - if (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { @@ -1160,14 +1144,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && - (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL && - (peer = rt6_get_peer((struct rt6_info *)dst)) != NULL && - ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6, - &treq->rmt_addr)) { - inet_peer_refcheck(peer); - if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && - (s32)(peer->tcp_ts - req->ts_recent) > - TCP_PAWS_WINDOW) { + (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) { + if 
(!tcp_peer_is_proven(req, dst, true)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } @@ -1176,8 +1154,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) else if (!sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && - (!peer || !peer->tcp_ts_stamp) && - !tcp_peer_is_proven(req, dst)) { + !tcp_peer_is_proven(req, dst, false)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. -- cgit v1.2.3 From 16d1839907e695387654901995f9286b65fbbc6a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 10 Jul 2012 03:32:59 -0700 Subject: inet: Remove ->get_peer() method. No longer used. Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9e96b5f21d2a..61175cb2478f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1689,20 +1689,6 @@ do_time_wait: goto discard_it; } -static struct inet_peer *tcp_v6_get_peer(struct sock *sk) -{ - struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - - /* If we don't have a valid cached route, or we're doing IP - * options which make the IPv6 header destination address - * different from our peer's, do not bother with this. 
- */ - if (!rt || !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) - return NULL; - return rt6_get_peer_create(rt); -} - static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_unique = tcp_twsk_unique, @@ -1715,7 +1701,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = { .rebuild_header = inet6_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, - .get_peer = tcp_v6_get_peer, .net_header_len = sizeof(struct ipv6hdr), .net_frag_header_len = sizeof(struct frag_hdr), .setsockopt = ipv6_setsockopt, @@ -1747,7 +1732,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, - .get_peer = tcp_v4_get_peer, .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, -- cgit v1.2.3 From 46d3ceabd8d98ed0ad10f20c595ca784e34786c5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Jul 2012 05:50:31 +0000 Subject: tcp: TCP Small Queues This introduce TSQ (TCP Small Queues) TSQ goal is to reduce number of TCP packets in xmit queues (qdisc & device queues), to reduce RTT and cwnd bias, part of the bufferbloat problem. sk->sk_wmem_alloc not allowed to grow above a given limit, allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a given time. TSO packets are sized/capped to half the limit, so that we have two TSO packets in flight, allowing better bandwidth use. As a side effect, setting the limit to 40000 automatically reduces the standard gso max limit (65536) to 40000/2 : It can help to reduce latencies of high prio packets, having smaller TSO packets. This means we divert sock_wfree() to a tcp_wfree() handler, to queue/send following frames when skb_orphan() [2] is called for the already queued skbs. 
Results on my dev machines (tg3/ixgbe nics) are really impressive, using standard pfifo_fast, and with or without TSO/GSO. Without reduction of nominal bandwidth, we have reduction of buffering per bulk sender : < 1ms on Gbit (instead of 50ms with TSO) < 8ms on 100Mbit (instead of 132 ms) I no longer have 4 MBytes backlogged in qdisc by a single netperf session, and both side socket autotuning no longer use 4 Mbytes. As skb destructor cannot restart xmit itself ( as qdisc lock might be taken at this point ), we delegate the work to a tasklet. We use one tasklet per cpu for performance reasons. If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag. This flag is tested in a new protocol method called from release_sock(), to eventually send new segments. [1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable [2] skb_orphan() is usually called at TX completion time, but some drivers call it in their start_xmit() handler. These drivers should at least use BQL, or else a single TCP session can still fill the whole NIC TX ring, since TSQ will have no effect. Signed-off-by: Eric Dumazet Cc: Dave Taht Cc: Tom Herbert Cc: Matt Mathis Cc: Yuchung Cheng Cc: Nandita Dukkipati Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 61175cb2478f..70458a9cd837 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1970,6 +1970,7 @@ struct proto tcpv6_prot = { .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, + .release_cb = tcp_release_cb, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, -- cgit v1.2.3 From ec18d9a2691d69cd14b48f9b919fddcef28b7f5c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 12 Jul 2012 00:25:15 -0700 Subject: ipv6: Add redirect support to all protocol icmp error handlers. Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 70458a9cd837..7249e4bb9b8a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -363,6 +363,13 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, np = inet6_sk(sk); + if (type == NDISC_REDIRECT) { + struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst && dst->ops->redirect) + dst->ops->redirect(dst,skb); + } + if (type == ICMPV6_PKT_TOOBIG) { struct dst_entry *dst; -- cgit v1.2.3 From 1ed5c48f231cd00eac0b3d2350ac61e3c825063e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 12 Jul 2012 00:41:25 -0700 Subject: net: Remove checks for dst_ops->redirect being NULL. No longer necessary. Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7249e4bb9b8a..3071f377145c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -366,7 +366,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) { struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); - if (dst && dst->ops->redirect) + if (dst) dst->ops->redirect(dst,skb); } -- cgit v1.2.3 From 35ad9b9cf7d8a2e6259a0d24022e910adb6f3489 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 16 Jul 2012 03:44:56 -0700 Subject: ipv6: Add helper inet6_csk_update_pmtu(). This is the ipv6 version of inet_csk_update_pmtu(). Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 37 ++++--------------------------------- 1 file changed, 4 insertions(+), 33 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3071f377145c..ecdf241cad02 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -378,43 +378,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) goto out; - /* icmp should have updated the destination cache entry */ - dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst == NULL) { - struct inet_sock *inet = inet_sk(sk); - struct flowi6 fl6; - - /* BUGGG_FUTURE: Again, it is not clear how - to handle rthdr case. Ignore this complexity - for now. - */ - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_TCP; - fl6.daddr = np->daddr; - fl6.saddr = np->saddr; - fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.flowi6_mark = sk->sk_mark; - fl6.fl6_dport = inet->inet_dport; - fl6.fl6_sport = inet->inet_sport; - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); - - dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false); - if (IS_ERR(dst)) { - sk->sk_err_soft = -PTR_ERR(dst); - goto out; - } - - } else - dst_hold(dst); - - dst->ops->update_pmtu(dst, ntohl(info)); + dst = inet6_csk_update_pmtu(sk, ntohl(info)); + if (!dst) + goto out; if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); - } /* else let the usual retransmit timer handle it */ - dst_release(dst); + } goto out; } -- cgit v1.2.3 From 6700c2709c08d74ae2c3c29b84a30da012dbc7f1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 03:29:28 -0700 Subject: net: Pass optional SKB and SK arguments to dst_ops->{update_pmtu,redirect}() This will be used so that we can compose a full flow key. Even though we have a route in this context, we need more. In the future the routes will be without destination address, source address, etc. keying. 
One ipv4 route will cover entire subnets, etc. In this environment we have to have a way to possess persistent storage for redirects and PMTU information. This persistent storage will exist in the FIB tables, and that's why we'll need to be able to rebuild a full lookup flow key here. Using that flow key will do a fib_lookup() and create/update the persistent entry. Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ecdf241cad02..c9dabdd832d7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -367,7 +367,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); if (dst) - dst->ops->redirect(dst,skb); + dst->ops->redirect(dst, sk, skb); } if (type == ICMPV6_PKT_TOOBIG) { -- cgit v1.2.3 From 2100c8d2d9db23c0a09901a782bb4e3b21bee298 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:05 +0000 Subject: net-tcp: Fast Open base This patch implements the common code for both the client and server. 1. TCP Fast Open option processing. Since Fast Open does not have an option number assigned by IANA yet, it shares the experiment option code 254 by implementing draft-ietf-tcpm-experimental-options with a 16 bits magic number 0xF989. This enables global experiments without clashing the scarce(2) experimental options available for TCP. When the draft status becomes standard (maybe), the client should switch to the new option number assigned while the server supports both numbers for transition. 2. The new sysctl tcp_fastopen 3. A place holder init function Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c9dabdd832d7..0302ec3fecfc 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1033,7 +1033,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.cookie_plus > 0 && tmp_opt.saw_tstamp && -- cgit v1.2.3 From 563d34d05786263893ba4a1042eb9b9374127cf5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Jul 2012 09:48:52 +0200 Subject: tcp: dont drop MTU reduction indications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ICMP messages generated in output path if frame length is bigger than mtu are actually lost because socket is owned by user (doing the xmit) One example is the ipgre_tunnel_xmit() calling icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); We had a similar case fixed in commit a34a101e1e6 (ipv6: disable GSO on sockets hitting dst_allfrag). Problem of such fix is that it relied on retransmit timers, so short tcp sessions paid a too big latency increase price. This patch uses the tcp_release_cb() infrastructure so that MTU reduction messages (ICMP messages) are not lost, and no extra delay is added in TCP transmits. Reported-by: Maciej Żenczykowski Diagnosed-by: Neal Cardwell Signed-off-by: Eric Dumazet Cc: Nandita Dukkipati Cc: Tom Herbert Cc: Tore Anderson Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0302ec3fecfc..f49476e2d884 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -315,6 +315,23 @@ failure: return err; } +static void tcp_v6_mtu_reduced(struct sock *sk) +{ + struct dst_entry *dst; + + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + return; + + dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info); + if (!dst) + return; + + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { + tcp_sync_mss(sk, dst_mtu(dst)); + tcp_simple_retransmit(sk); + } +} + static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { @@ -342,7 +359,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } bh_lock_sock(sk); - if (sock_owned_by_user(sk)) + if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) @@ -371,21 +388,11 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (type == ICMPV6_PKT_TOOBIG) { - struct dst_entry *dst; - - if (sock_owned_by_user(sk)) - goto out; - if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) - goto out; - - dst = inet6_csk_update_pmtu(sk, ntohl(info)); - if (!dst) - goto out; - - if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { - tcp_sync_mss(sk, dst_mtu(dst)); - tcp_simple_retransmit(sk); - } + tp->mtu_info = ntohl(info); + if (!sock_owned_by_user(sk)) + tcp_v6_mtu_reduced(sk); + else + set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); goto out; } @@ -1949,6 +1956,7 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, + .mtu_reduced = tcp_v6_mtu_reduced, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, -- cgit v1.2.3