From fd4f2cead6983735a4e6283126b9276873d7ff09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Apr 2012 19:48:40 +0000 Subject: tcp: RFC6298 supersedes RFC2988bis Updates some comments to track RFC6298 Signed-off-by: Eric Dumazet Cc: H.K. Jerry Chu Cc: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9944c1d9a218..dc1e0be30b77 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -936,7 +936,7 @@ static void tcp_init_metrics(struct sock *sk) tcp_set_rto(sk); reset: if (tp->srtt == 0) { - /* RFC2988bis: We've failed to get a valid RTT sample from + /* RFC6298: 5.7 We've failed to get a valid RTT sample from * 3WHS. This is most likely due to retransmission, * including spurious one. Reset the RTO back to 3secs * from the more aggressive 1sec to avoid more spurious @@ -946,7 +946,7 @@ reset: inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; } /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been - * retransmitted. In light of RFC2988bis' more aggressive 1sec + * retransmitted. In light of RFC6298 more aggressive 1sec * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK * retransmission has occurred. */ -- cgit v1.2.3 From a8cb05b238e0edfd7c63f496d9ce3e238ea68a16 Mon Sep 17 00:00:00 2001 From: Vijay Subramanian Date: Fri, 13 Apr 2012 13:23:59 +0000 Subject: tcp: Remove redundant code entering quickack mode tcp_enter_quickack_mode() already calls tcp_incr_quickack() and sets icsk->icsk_ack.ato to TCP_ATO_MIN. This patch removes the duplication. Signed-off-by: Vijay Subramanian Reviewed-by: Flavio Leitner Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dc1e0be30b77..3dc94febc9ee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5734,8 +5734,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, */ inet_csk_schedule_ack(sk); icsk->icsk_ack.lrcvtime = tcp_time_stamp; - icsk->icsk_ack.ato = TCP_ATO_MIN; - tcp_incr_quickack(sk); tcp_enter_quickack_mode(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX); -- cgit v1.2.3 From 95c961747284a6b83a5e2d81240e214b0fa3464d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Apr 2012 05:58:06 +0000 Subject: net: cleanup unsigned to unsigned int Use of "unsigned int" is preferred to bare "unsigned" in net tree. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3dc94febc9ee..99448f06a42a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -175,7 +175,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) static void tcp_incr_quickack(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); + unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); if (quickacks == 0) quickacks = 2; -- cgit v1.2.3 From 370816aef0c5436c2adbec3966038f36ca326933 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 19 Apr 2012 03:40:01 +0000 Subject: tcp: Move code around This is just the preparation patch, which makes the needed for TCP repair code ready for use. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 81 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 33 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 99448f06a42a..37e1c5cd2c01 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5325,6 +5325,14 @@ discard: return 0; } +void tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen) +{ + __skb_pull(skb, hdrlen); + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; +} + /* * TCP receive function for the ESTABLISHED state. * @@ -5490,10 +5498,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - __skb_pull(skb, tcp_header_len); - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_queue_rcv(sk, skb, tcp_header_len); } tcp_event_data_recv(sk, skb); @@ -5559,6 +5564,44 @@ discard: } EXPORT_SYMBOL(tcp_rcv_established); +void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_set_state(sk, TCP_ESTABLISHED); + + if (skb != NULL) + security_inet_conn_established(sk, skb); + + /* Make sure socket is routed, for correct metrics. */ + icsk->icsk_af_ops->rebuild_header(sk); + + tcp_init_metrics(sk); + + tcp_init_congestion_control(sk); + + /* Prevent spurious tcp_cwnd_restart() on first data + * packet. + */ + tp->lsndtime = tcp_time_stamp; + + tcp_init_buffer_space(sk); + + if (sock_flag(sk, SOCK_KEEPOPEN)) + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + if (!tp->rx_opt.snd_wscale) + __tcp_fast_path_on(tp, tp->snd_wnd); + else + tp->pred_flags = 0; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + } +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { @@ -5691,36 +5734,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, } smp_mb(); - tcp_set_state(sk, TCP_ESTABLISHED); - - security_inet_conn_established(sk, skb); - - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - tcp_init_metrics(sk); - tcp_init_congestion_control(sk); - - /* Prevent spurious tcp_cwnd_restart() on first data - * packet. 
- */ - tp->lsndtime = tcp_time_stamp; - - tcp_init_buffer_space(sk); - - if (sock_flag(sk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); - - if (!tp->rx_opt.snd_wscale) - __tcp_fast_path_on(tp, tp->snd_wnd); - else - tp->pred_flags = 0; - - if (!sock_flag(sk, SOCK_DEAD)) { - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - } + tcp_finish_connect(sk, skb); if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || -- cgit v1.2.3 From 1402d366019fedaa2b024f2bac06b7cc9a8782e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Apr 2012 07:11:42 +0000 Subject: tcp: introduce tcp_try_coalesce MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c8628155ece3 (tcp: reduce out_of_order memory use) took care of coalescing tcp segments provided by legacy devices (linear skbs) We extend this idea to fragged skbs, as their truesize can be heavy. ixgbe for example uses 256+1024+PAGE_SIZE/2 = 3328 bytes per segment. Use this coalescing strategy for receive queue too. This contributes to reduce number of tcp collapses, at minimal cost, and reduces memory overhead and packets drops. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Tom Herbert Cc: Maciej Żenczykowski Cc: Ilpo Järvinen Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 79 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 62 insertions(+), 17 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 37e1c5cd2c01..bd7aef59c385 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4449,6 +4449,58 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) return 0; } +/** + * tcp_try_coalesce - try to merge skb to prior one + * @sk: socket + * @to: prior buffer + * @from: buffer to add in queue + * + * Before queueing skb @from after @to, try to merge them + * to reduce overall memory use and queue lengths, if cost is small. + * Packets in ofo or receive queues can stay a long time. + * Better try to coalesce them right now to avoid future collapses. 
+ * Returns > 0 value if caller should free @from instead of queueing it + */ +static int tcp_try_coalesce(struct sock *sk, + struct sk_buff *to, + struct sk_buff *from) +{ + int len = from->len; + + if (tcp_hdr(from)->fin) + return 0; + if (len <= skb_tailroom(to)) { + BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); +merge: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); + TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; + TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; + return 1; + } + if (skb_headlen(from) == 0 && + !skb_has_frag_list(to) && + !skb_has_frag_list(from) && + (skb_shinfo(to)->nr_frags + + skb_shinfo(from)->nr_frags <= MAX_SKB_FRAGS)) { + int delta = from->truesize - ksize(from->head) - + SKB_DATA_ALIGN(sizeof(struct sk_buff)); + + WARN_ON_ONCE(delta < len); + memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, + skb_shinfo(from)->frags, + skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); + skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; + skb_shinfo(from)->nr_frags = 0; + to->truesize += delta; + atomic_add(delta, &sk->sk_rmem_alloc); + sk_mem_charge(sk, delta); + to->len += len; + to->data_len += len; + goto merge; + } + return 0; +} + static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -4487,23 +4539,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq = TCP_SKB_CB(skb)->end_seq; if (seq == TCP_SKB_CB(skb1)->end_seq) { - /* Packets in ofo can stay in queue a long time. - * Better try to coalesce them right now - * to avoid future tcp_collapse_ofo_queue(), - * probably the most expensive function in tcp stack. - */ - if (skb->len <= skb_tailroom(skb1) && !tcp_hdr(skb)->fin) { - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPRCVCOALESCE); - BUG_ON(skb_copy_bits(skb, 0, - skb_put(skb1, skb->len), - skb->len)); - TCP_SKB_CB(skb1)->end_seq = end_seq; - TCP_SKB_CB(skb1)->ack_seq = TCP_SKB_CB(skb)->ack_seq; + if (tcp_try_coalesce(sk, skb1, skb) <= 0) { + __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + } else { __kfree_skb(skb); skb = NULL; - } else { - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); } if (!tp->rx_opt.num_sacks || @@ -4624,13 +4664,18 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } if (eaten <= 0) { + struct sk_buff *tail; queue_and_out: if (eaten < 0 && tcp_try_rmem_schedule(sk, skb->truesize)) goto drop; - skb_set_owner_r(skb, sk); - __skb_queue_tail(&sk->sk_receive_queue, skb); + tail = skb_peek_tail(&sk->sk_receive_queue); + eaten = tail ? tcp_try_coalesce(sk, tail, skb) : -1; + if (eaten <= 0) { + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); + } } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (skb->len) -- cgit v1.2.3 From 783c175f902b1ae011f12de45770e7912638ea1a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Apr 2012 17:34:36 +0000 Subject: tcp: tcp_try_coalesce returns a boolean This clarifies code intention, as suggested by David. Suggested-by: David Miller Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c1c611b385a7..c93b0cbb7fc1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4460,23 +4460,23 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) * to reduce overall memory use and queue lengths, if cost is small. * Packets in ofo or receive queues can stay a long time. * Better try to coalesce them right now to avoid future collapses. - * Returns > 0 value if caller should free @from instead of queueing it + * Returns true if caller should free @from instead of queueing it */ -static int tcp_try_coalesce(struct sock *sk, - struct sk_buff *to, - struct sk_buff *from) +static bool tcp_try_coalesce(struct sock *sk, + struct sk_buff *to, + struct sk_buff *from) { int len = from->len; if (tcp_hdr(from)->fin) - return 0; + return false; if (len <= skb_tailroom(to)) { BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); merge: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; - return 1; + return true; } if (skb_headlen(from) == 0 && !skb_has_frag_list(to) && @@ -4499,7 +4499,7 @@ merge: to->data_len += len; goto merge; } - return 0; + return false; } static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) @@ -4540,7 +4540,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq = TCP_SKB_CB(skb)->end_seq; if (seq == TCP_SKB_CB(skb1)->end_seq) { - if (tcp_try_coalesce(sk, skb1, skb) <= 0) { + if (!tcp_try_coalesce(sk, skb1, skb)) { __skb_queue_after(&tp->out_of_order_queue, skb1, skb); } else { __kfree_skb(skb); @@ -4672,7 +4672,7 @@ queue_and_out: goto drop; tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = tail ? tcp_try_coalesce(sk, tail, skb) : -1; + eaten = (tail && tcp_try_coalesce(sk, tail, skb)) ? 1 : 0; if (eaten <= 0) { skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); -- cgit v1.2.3 From 329033f645d93b5f9160b9b972dbc5431ad22a33 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Apr 2012 00:38:33 +0000 Subject: tcp: makes tcp_try_coalesce aware of skb->head_frag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TCP coalesce can check if skb to be merged has its skb->head mapped to a page fragment, instead of a kmalloc() area. We had to disable coalescing in this case, for performance reasons. We 'upgrade' skb->head as a fragment in itself. This reduces number of cache misses when user makes its copies, since a less sk_buff are fetched. This makes receive and ofo queues shorter and thus reduce cache line misses in TCP stack. This is a followup of patch "net: allow skb->head to be a page fragment" Tested with tg3 nic, with GRO on or off. We can see "TCPRcvCoalesce" counter being incremented. Signed-off-by: Eric Dumazet Cc: Ilpo Järvinen Cc: Herbert Xu Cc: Maciej Żenczykowski Cc: Neal Cardwell Cc: Tom Herbert Cc: Jeff Kirsher Cc: Ben Hutchings Cc: Matt Carlson Cc: Michael Chan Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 55 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 12 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c93b0cbb7fc1..96a631deb4e6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4464,10 +4464,12 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) */ static bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, - struct sk_buff *from) + struct sk_buff *from, + bool *fragstolen) { - int len = from->len; + int delta, len = from->len; + *fragstolen = false; if (tcp_hdr(from)->fin) return false; if (len <= skb_tailroom(to)) { @@ -4478,15 +4480,19 @@ merge: TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; return true; } + + if (skb_has_frag_list(to) || skb_has_frag_list(from)) + return false; + if (skb_headlen(from) == 0 && - !skb_has_frag_list(to) && - !skb_has_frag_list(from) && (skb_shinfo(to)->nr_frags + skb_shinfo(from)->nr_frags <= MAX_SKB_FRAGS)) { - int delta = from->truesize - ksize(from->head) - - SKB_DATA_ALIGN(sizeof(struct sk_buff)); + WARN_ON_ONCE(from->head_frag); + delta = from->truesize - ksize(from->head) - + SKB_DATA_ALIGN(sizeof(struct sk_buff)); WARN_ON_ONCE(delta < len); +copyfrags: memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, skb_shinfo(from)->frags, skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); @@ -4499,6 +4505,20 @@ merge: to->data_len += len; goto merge; } + if (from->head_frag) { + struct page *page; + unsigned int offset; + + if (skb_shinfo(to)->nr_frags + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + return false; + page = virt_to_head_page(from->head); + offset = from->data - (unsigned char *)page_address(page); + skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, + page, offset, skb_headlen(from)); + *fragstolen = true; + delta = len; /* we dont know real truesize... */ + goto copyfrags; + } return false; } @@ -4540,10 +4560,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq = TCP_SKB_CB(skb)->end_seq; if (seq == TCP_SKB_CB(skb1)->end_seq) { - if (!tcp_try_coalesce(sk, skb1, skb)) { + bool fragstolen; + + if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { __skb_queue_after(&tp->out_of_order_queue, skb1, skb); } else { - __kfree_skb(skb); + if (fragstolen) + kmem_cache_free(skbuff_head_cache, skb); + else + __kfree_skb(skb); skb = NULL; } @@ -4626,6 +4651,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; + bool fragstolen = false; if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) goto drop; @@ -4672,7 +4698,9 @@ queue_and_out: goto drop; tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = (tail && tcp_try_coalesce(sk, tail, skb)) ? 1 : 0; + eaten = (tail && + tcp_try_coalesce(sk, tail, skb, + &fragstolen)) ? 
1 : 0; if (eaten <= 0) { skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); @@ -4699,9 +4727,12 @@ queue_and_out: tcp_fast_path_check(sk); - if (eaten > 0) - __kfree_skb(skb); - else if (!sock_flag(sk, SOCK_DEAD)) + if (eaten > 0) { + if (fragstolen) + kmem_cache_free(skbuff_head_cache, skb); + else + __kfree_skb(skb); + } else if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, 0); return; } -- cgit v1.2.3 From 1fbc340514fc3003514bd681b372e1f47ae6183f Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 2 May 2012 13:30:02 +0000 Subject: tcp: early retransmit: tcp_enter_recovery() This a prepartion patch that refactors the code to enter recovery into a new function tcp_enter_recovery(). It's needed to implement the delayed fast retransmit in ER. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 61 +++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 27 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 96a631deb4e6..be8e09d2c6b1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3022,6 +3022,38 @@ static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked, tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; } +static void tcp_enter_recovery(struct sock *sk, bool ece_ack) +{ + struct tcp_sock *tp = tcp_sk(sk); + int mib_idx; + + if (tcp_is_reno(tp)) + mib_idx = LINUX_MIB_TCPRENORECOVERY; + else + mib_idx = LINUX_MIB_TCPSACKRECOVERY; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); + + tp->high_seq = tp->snd_nxt; + tp->prior_ssthresh = 0; + tp->undo_marker = tp->snd_una; + tp->undo_retrans = tp->retrans_out; + + if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + if (!ece_ack) + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + TCP_ECN_queue_cwr(tp); + } + + tp->bytes_acked = 0; + tp->snd_cwnd_cnt = 0; + tp->prior_cwnd = tp->snd_cwnd; + tp->prr_delivered = 0; + tp->prr_out = 0; + tcp_set_ca_state(sk, TCP_CA_Recovery); +} + /* Process an event, which can update packets-in-flight not trivially. 
* Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -3041,7 +3073,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, struct tcp_sock *tp = tcp_sk(sk); int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - int fast_rexmit = 0, mib_idx; + int fast_rexmit = 0; if (WARN_ON(!tp->packets_out && tp->sacked_out)) tp->sacked_out = 0; @@ -3142,32 +3174,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, } /* Otherwise enter Recovery state */ - - if (tcp_is_reno(tp)) - mib_idx = LINUX_MIB_TCPRENORECOVERY; - else - mib_idx = LINUX_MIB_TCPSACKRECOVERY; - - NET_INC_STATS_BH(sock_net(sk), mib_idx); - - tp->high_seq = tp->snd_nxt; - tp->prior_ssthresh = 0; - tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out; - - if (icsk->icsk_ca_state < TCP_CA_CWR) { - if (!(flag & FLAG_ECE)) - tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - TCP_ECN_queue_cwr(tp); - } - - tp->bytes_acked = 0; - tp->snd_cwnd_cnt = 0; - tp->prior_cwnd = tp->snd_cwnd; - tp->prr_delivered = 0; - tp->prr_out = 0; - tcp_set_ca_state(sk, TCP_CA_Recovery); + tcp_enter_recovery(sk, (flag & FLAG_ECE)); fast_rexmit = 1; } -- cgit v1.2.3 From eed530b6c67624db3f2cf477bac7c4d005d8f7ba Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 2 May 2012 13:30:03 +0000 Subject: tcp: early retransmit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements RFC 5827 early retransmit (ER) for TCP. It reduces DUPACK threshold (dupthresh) if outstanding packets are less than 4 to recover losses by fast recovery instead of timeout. While the algorithm is simple, small but frequent network reordering makes this feature dangerous: the connection repeatedly enter false recovery and degrade performance. Therefore we implement a mitigation suggested in the appendix of the RFC that delays entering fast recovery by a small interval, i.e., RTT/4. Currently ER is conservative and is disabled for the rest of the connection after the first reordering event. A large scale web server experiment on the performance impact of ER is summarized in section 6 of the paper "Proportional Rate Reduction for TCP”, IMC 2011. http://conferences.sigcomm.org/imc/2011/docs/p155.pdf Note that Linux has a similar feature called THIN_DUPACK. The differences are THIN_DUPACK do not mitigate reorderings and is only used after slow start. Currently ER is disabled if THIN_DUPACK is enabled. I would be happy to merge THIN_DUPACK feature with ER if people think it's a good idea. ER is enabled by sysctl_tcp_early_retrans: 0: Disables ER 1: Reduce dupthresh to packets_out - 1 when outstanding packets < 4. 2: (Default) reduce dupthresh like mode 1. In addition, delay entering fast recovery by RTT/4. Note: mode 2 is implemented in the third part of this patch series. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index be8e09d2c6b1..e042cabb695e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -99,6 +99,7 @@ int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_abc __read_mostly; +int sysctl_tcp_early_retrans __read_mostly = 2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -906,6 +907,7 @@ static void tcp_init_metrics(struct sock *sk) if (dst_metric(dst, RTAX_REORDERING) && tp->reordering != dst_metric(dst, RTAX_REORDERING)) { tcp_disable_fack(tp); + tcp_disable_early_retrans(tp); tp->reordering = dst_metric(dst, RTAX_REORDERING); } @@ -988,6 +990,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, #endif tcp_disable_fack(tp); } + + if (metric > 0) + tcp_disable_early_retrans(tp); } /* This must be called before lost_out is incremented */ @@ -2492,6 +2497,16 @@ static int tcp_time_to_recover(struct sock *sk) tcp_is_sack(tp) && !tcp_send_head(sk)) return 1; + /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious + * retransmissions due to small network reorderings, we implement + * Mitigation A.3 in the RFC and delay the retransmission for a short + * interval if appropriate. + */ + if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && + (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && + !tcp_may_send_now(sk)) + return 1; + return 0; } -- cgit v1.2.3 From 750ea2bafa55aaed208b2583470ecd7122225634 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 2 May 2012 13:30:04 +0000 Subject: tcp: early retransmit: delayed fast retransmit Implementing the advanced early retransmit (sysctl_tcp_early_retrans==2). Delays the fast retransmit by an interval of RTT/4. We borrow the RTO timer to implement the delay. If we receive another ACK or send a new packet, the timer is cancelled and restored to original RTO value offset by time elapsed. When the delayed-ER timer fires, we enter fast recovery and perform fast retransmit. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e042cabb695e..7096790e06bf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2344,6 +2344,27 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } +static bool tcp_pause_early_retransmit(struct sock *sk, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned long delay; + + /* Delay early retransmit and entering fast recovery for + * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples + * available, or RTO is scheduled to fire first. 
+ */ + if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) + return false; + + delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); + if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) + return false; + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); + tp->early_retrans_delayed = 1; + return true; +} + static inline int tcp_skb_timedout(const struct sock *sk, const struct sk_buff *skb) { @@ -2451,7 +2472,7 @@ static inline int tcp_head_timedout(const struct sock *sk) * Main question: may we further continue forward transmission * with the same cwnd? */ -static int tcp_time_to_recover(struct sock *sk) +static int tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; @@ -2505,7 +2526,7 @@ static int tcp_time_to_recover(struct sock *sk) if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && !tcp_may_send_now(sk)) - return 1; + return !tcp_pause_early_retransmit(sk, flag); return 0; } @@ -3172,7 +3193,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); - if (!tcp_time_to_recover(sk)) { + if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; } @@ -3271,16 +3292,47 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ -static void tcp_rearm_rto(struct sock *sk) +void tcp_rearm_rto(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + u32 rto = inet_csk(sk)->icsk_rto; + /* Offset the time elapsed after installing regular RTO */ + if (tp->early_retrans_delayed) { + struct sk_buff *skb = tcp_write_queue_head(sk); + const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; + s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); + /* delta may not be positive if the socket is locked + * when the delayed ER timer fires and is rescheduled. + */ + if (delta > 0) + rto = delta; + } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, + TCP_RTO_MAX); } + tp->early_retrans_delayed = 0; +} + +/* This function is called when the delayed ER timer fires. TCP enters + * fast recovery and performs fast-retransmit. + */ +void tcp_resume_early_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_rearm_rto(sk); + + /* Stop if ER is disabled after the delayed ER timer is scheduled */ + if (!tp->do_early_retrans) + return; + + tcp_enter_recovery(sk, false); + tcp_update_scoreboard(sk, 1); + tcp_xmit_retransmit_queue(sk); } /* If we get here, the whole TSO packet has not been acked. */ @@ -3729,6 +3781,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; + if (tp->early_retrans_delayed) + tcp_rearm_rto(sk); + if (after(ack, prior_snd_una)) flag |= FLAG_SND_UNA_ADVANCED; -- cgit v1.2.3 From 923dd347b8904c24bcac89bf038ed4da87f8aa90 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2012 07:55:58 +0000 Subject: net: take care of cloned skbs in tcp_try_coalesce() Before stealing fragments or skb head, we must make sure skbs are not cloned. 
Alexander was worried about destination skb being cloned : In bridge setups, a driver could be fooled if skb->data_len would not match skb nr_frags. If source skb is cloned, we must take references on pages instead. Bug happened using tcpdump (if not using mmap()) Introduce kfree_skb_partial() helper to cleanup code. Reported-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7096790e06bf..a8829370f712 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4532,6 +4532,7 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) * @sk: socket * @to: prior buffer * @from: buffer to add in queue + * @fragstolen: pointer to boolean * * Before queueing skb @from after @to, try to merge them * to reduce overall memory use and queue lengths, if cost is small. @@ -4544,10 +4545,10 @@ static bool tcp_try_coalesce(struct sock *sk, struct sk_buff *from, bool *fragstolen) { - int delta, len = from->len; + int i, delta, len = from->len; *fragstolen = false; - if (tcp_hdr(from)->fin) + if (tcp_hdr(from)->fin || skb_cloned(to)) return false; if (len <= skb_tailroom(to)) { BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); @@ -4574,7 +4575,13 @@ copyfrags: skb_shinfo(from)->frags, skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; - skb_shinfo(from)->nr_frags = 0; + + if (skb_cloned(from)) + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) + skb_frag_ref(from, i); + else + skb_shinfo(from)->nr_frags = 0; + to->truesize += delta; atomic_add(delta, &sk->sk_rmem_alloc); sk_mem_charge(sk, delta); @@ -4592,13 +4599,26 @@ copyfrags: offset = from->data - (unsigned char *)page_address(page); skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, page, offset, skb_headlen(from)); - *fragstolen = true; + + if (skb_cloned(from)) + get_page(page); + else + *fragstolen = true; + delta = len; /* we dont know real truesize... */ goto copyfrags; } return false; } +static void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) +{ + if (head_stolen) + kmem_cache_free(skbuff_head_cache, skb); + else + __kfree_skb(skb); +} + static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -4642,10 +4662,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { __skb_queue_after(&tp->out_of_order_queue, skb1, skb); } else { - if (fragstolen) - kmem_cache_free(skbuff_head_cache, skb); - else - __kfree_skb(skb); + kfree_skb_partial(skb, fragstolen); skb = NULL; } @@ -4804,12 +4821,9 @@ queue_and_out: tcp_fast_path_check(sk); - if (eaten > 0) { - if (fragstolen) - kmem_cache_free(skbuff_head_cache, skb); - else - __kfree_skb(skb); - } else if (!sock_flag(sk, SOCK_DEAD)) + if (eaten > 0) + kfree_skb_partial(skb, fragstolen); + else if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, 0); return; } -- cgit v1.2.3 From b081f85c2977b1cbb6e635d53d9512f1ef985972 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2012 09:58:29 +0000 Subject: net: implement tcp coalescing in tcp_queue_rcv() Extend tcp coalescing implementing it from tcp_queue_rcv(), the main receiver function when application is not blocked in recvmsg(). 
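Concretely, the reworked helper boils down to "coalesce with the tail of sk_receive_queue when possible, otherwise append, and always advance rcv_nxt"; the toy model below sketches only that control flow. All identifiers in it are invented stand-ins (not kernel symbols), socket locking, memory accounting and header stripping are omitted, and the real function appears in the diff that follows.

#include <stdbool.h>
#include <stddef.h>

/* Toy singly-linked receive queue; every name here is an invented
 * stand-in used for illustration, not a kernel identifier. */
struct pkt  { int len; unsigned end_seq; struct pkt *next; };
struct rcvq { struct pkt *head, *tail; unsigned rcv_nxt; };

/* Stand-in for tcp_try_coalesce(): pretend small packets merge into the tail. */
static bool toy_coalesce(struct pkt *to, const struct pkt *from)
{
        if (!to || from->len > 128)        /* arbitrary illustrative limit */
                return false;
        to->len += from->len;
        to->end_seq = from->end_seq;
        return true;                       /* caller should free @from */
}

/* Shape of tcp_queue_rcv() after this patch: try the queue tail first,
 * append only when coalescing was not possible, always advance rcv_nxt. */
static bool toy_queue_rcv(struct rcvq *q, struct pkt *skb)
{
        bool eaten = toy_coalesce(q->tail, skb);

        q->rcv_nxt = skb->end_seq;
        if (!eaten) {
                skb->next = NULL;
                if (q->tail)
                        q->tail->next = skb;
                else
                        q->head = skb;
                q->tail = skb;
        }
        return eaten;                      /* true: caller frees skb (kfree_skb_partial) */
}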
Function tcp_queue_rcv() is moved a bit to allow its call from tcp_data_queue() This gives good results especially if GRO could not kick, and if skb head is a fragment. Signed-off-by: Eric Dumazet Cc: Alexander Duyck Cc: Neal Cardwell Cc: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a8829370f712..2f696ef13dcd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4739,6 +4739,22 @@ end: skb_set_owner_r(skb, sk); } +int tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, + bool *fragstolen) +{ + int eaten; + struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); + + __skb_pull(skb, hdrlen); + eaten = (tail && + tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; + tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (!eaten) { + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + } + return eaten; +} static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { @@ -4785,20 +4801,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } if (eaten <= 0) { - struct sk_buff *tail; queue_and_out: if (eaten < 0 && tcp_try_rmem_schedule(sk, skb->truesize)) goto drop; - tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = (tail && - tcp_try_coalesce(sk, tail, skb, - &fragstolen)) ? 1 : 0; - if (eaten <= 0) { - skb_set_owner_r(skb, sk); - __skb_queue_tail(&sk->sk_receive_queue, skb); - } + eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (skb->len) @@ -5493,14 +5501,6 @@ discard: return 0; } -void tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen) -{ - __skb_pull(skb, hdrlen); - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; -} - /* * TCP receive function for the ESTABLISHED state. * @@ -5609,6 +5609,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } else { int eaten = 0; int copied_early = 0; + bool fragstolen = false; if (tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len) { @@ -5666,7 +5667,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - tcp_queue_rcv(sk, skb, tcp_header_len); + eaten = tcp_queue_rcv(sk, skb, tcp_header_len, + &fragstolen); } tcp_event_data_recv(sk, skb); @@ -5688,7 +5690,7 @@ no_ack: else #endif if (eaten) - __kfree_skb(skb); + kfree_skb_partial(skb, fragstolen); else sk->sk_data_ready(sk, 0); return 0; -- cgit v1.2.3 From 2996d31f9f292cce67cd9105dc0a4a5ee43d2f14 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 2 May 2012 18:18:42 +0000 Subject: net: Stop decapitating clones that have a head_frag This change is meant ot prevent stealing the skb->head to use as a page in the event that the skb->head was cloned. This allows the other clones to track each other via shinfo->dataref. Without this we break down to two methods for tracking the reference count, one being dataref, the other being the page count. As a result it becomes difficult to track how many references there are to skb->head. Signed-off-by: Alexander Duyck Cc: Eric Dumazet Cc: Jeff Kirsher Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2f696ef13dcd..c6f78e2b590f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4589,7 +4589,7 @@ copyfrags: to->data_len += len; goto merge; } - if (from->head_frag) { + if (from->head_frag && !skb_cloned(from)) { struct page *page; unsigned int offset; @@ -4599,12 +4599,7 @@ copyfrags: offset = from->data - (unsigned char *)page_address(page); skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, page, offset, skb_headlen(from)); - - if (skb_cloned(from)) - get_page(page); - else - *fragstolen = true; - + *fragstolen = true; delta = len; /* we dont know real truesize... */ goto copyfrags; } -- cgit v1.2.3 From c73c3d9c49daae8acbac4cd14bcd81626887c0e6 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 2 May 2012 21:18:59 +0000 Subject: tcp: Fix truesize accounting in tcp_try_coalesce This patch addresses several issues in the way we were tracking the truesize in tcp_try_coalesce. First it was using ksize which prevents us from having a 0 sized head frag and getting a usable result. To resolve that this patch uses the end pointer which is set based off either ksize, or the frag_size supplied in build_skb. This allows us to compute the original truesize of the entire buffer and remove that value leaving us with just what was added as pages. The second issue was the use of skb->len if there is a mergeable head frag. We should only need to remove the size of an data aligned sk_buff from our current skb->truesize to compute the delta for a buffer with a reused head. By using skb->len the value of truesize was being artificially reduced which means that head frags could use more memory than buffers using standard allocations. Signed-off-by: Alexander Duyck Cc: Eric Dumazet Cc: Jeff Kirsher Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c6f78e2b590f..3cb273ac8821 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4565,12 +4565,10 @@ merge: if (skb_headlen(from) == 0 && (skb_shinfo(to)->nr_frags + skb_shinfo(from)->nr_frags <= MAX_SKB_FRAGS)) { - WARN_ON_ONCE(from->head_frag); - delta = from->truesize - ksize(from->head) - - SKB_DATA_ALIGN(sizeof(struct sk_buff)); - - WARN_ON_ONCE(delta < len); + delta = from->truesize - + SKB_TRUESIZE(skb_end_pointer(from) - from->head); copyfrags: + WARN_ON_ONCE(delta < len); memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, skb_shinfo(from)->frags, skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); @@ -4600,7 +4598,7 @@ copyfrags: skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, page, offset, skb_headlen(from)); *fragstolen = true; - delta = len; /* we dont know real truesize... */ + delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); goto copyfrags; } return false; -- cgit v1.2.3 From 57b55a7ec684d8b846d6d5e67f4982363a83db7e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 2 May 2012 21:19:04 +0000 Subject: tcp: Move code related to head frag in tcp_try_coalesce This change reorders the code related to the use of an skb->head_frag so it is placed before we check the rest of the frags. This allows the code to read more linearly instead of like some sort of loop. 
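For readers skimming the series, the resulting decision ladder can be modelled by the self-contained sketch below. struct skb_model, can_coalesce() and MODEL_MAX_FRAGS are invented stand-ins rather than kernel identifiers, and statistics, memory accounting and the actual fragment copy are omitted; only the order of the checks is of interest here.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for struct sk_buff: just the fields the decision
 * ladder looks at. This is an illustration, not the kernel structure. */
struct skb_model {
        bool fin;            /* segment carries FIN                  */
        bool cloned;         /* skb_cloned()                         */
        bool has_frag_list;  /* skb_has_frag_list()                  */
        bool head_frag;      /* linear head lives in a page fragment */
        int  headlen;        /* skb_headlen(): linear data bytes     */
        int  nr_frags;       /* number of page fragments             */
        int  tailroom;       /* skb_tailroom()                       */
        int  len;
};

#define MODEL_MAX_FRAGS 17   /* stands in for MAX_SKB_FRAGS */

/* Check order after the reordering: the head-fragment case is decided
 * before the generic "move the page fragments across" case. */
static bool can_coalesce(const struct skb_model *to, const struct skb_model *from)
{
        if (from->fin || to->cloned)
                return false;                  /* never merge past a FIN or into a clone */
        if (from->len <= to->tailroom)
                return true;                   /* cheap memcpy into existing tailroom */
        if (to->has_frag_list || from->has_frag_list)
                return false;
        if (from->headlen != 0)                /* linear data present: steal the head frag */
                return from->head_frag && !from->cloned &&
                       to->nr_frags + from->nr_frags < MODEL_MAX_FRAGS;
        return to->nr_frags + from->nr_frags <= MODEL_MAX_FRAGS;
}

int main(void)
{
        struct skb_model tail = { .tailroom = 0, .nr_frags = 3 };
        struct skb_model in   = { .len = 1448, .headlen = 0, .nr_frags = 1 };

        printf("coalesce? %s\n", can_coalesce(&tail, &in) ? "yes" : "no");
        return 0;
}

The point of the reordering is simply that the head-fragment case is settled before the generic page-fragment move, so the function reads top to bottom instead of jumping back through labels.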
Signed-off-by: Alexander Duyck Cc: Eric Dumazet Cc: Jeff Kirsher Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3cb273ac8821..41fa5df9abbc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4562,9 +4562,31 @@ merge: if (skb_has_frag_list(to) || skb_has_frag_list(from)) return false; - if (skb_headlen(from) == 0 && - (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags <= MAX_SKB_FRAGS)) { + if (skb_headlen(from) != 0) { + struct page *page; + unsigned int offset; + + if (skb_shinfo(to)->nr_frags + + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + return false; + + if (!from->head_frag || skb_cloned(from)) + return false; + + delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); + + page = virt_to_head_page(from->head); + offset = from->data - (unsigned char *)page_address(page); + + skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, + page, offset, skb_headlen(from)); + *fragstolen = true; + goto copyfrags; + } else { + if (skb_shinfo(to)->nr_frags + + skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) + return false; + delta = from->truesize - SKB_TRUESIZE(skb_end_pointer(from) - from->head); copyfrags: @@ -4587,20 +4609,6 @@ copyfrags: to->data_len += len; goto merge; } - if (from->head_frag && !skb_cloned(from)) { - struct page *page; - unsigned int offset; - - if (skb_shinfo(to)->nr_frags + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) - return false; - page = virt_to_head_page(from->head); - offset = from->data - (unsigned char *)page_address(page); - skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, - page, offset, skb_headlen(from)); - *fragstolen = true; - delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); - goto copyfrags; - } return false; } -- cgit v1.2.3 From 34a802a5b9c4c4efb61828781230805be6dd0f8e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 2 May 2012 21:19:09 +0000 Subject: tcp: move stats merge to the end of tcp_try_coalesce This change cleans up the last bits of tcp_try_coalesce so that we only need one goto which jumps to the end of the function. The idea is to make the code more readable by putting things in a linear order so that we start execution at the top of the function, and end it at the bottom. I also made a slight tweak to the code for handling frags when we are a clone. Instead of making it an if (clone) loop else nr_frags = 0 I changed the logic so that if (!clone) we just set the number of frags to 0 which disables the for loop anyway. Signed-off-by: Alexander Duyck Cc: Eric Dumazet Cc: Jeff Kirsher Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 55 +++++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 26 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 41fa5df9abbc..84e69e02fe20 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4548,15 +4548,13 @@ static bool tcp_try_coalesce(struct sock *sk, int i, delta, len = from->len; *fragstolen = false; + if (tcp_hdr(from)->fin || skb_cloned(to)) return false; + if (len <= skb_tailroom(to)) { BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); -merge: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); - TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; - TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; - return true; + goto merge; } if (skb_has_frag_list(to) || skb_has_frag_list(from)) @@ -4581,7 +4579,6 @@ merge: skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, page, offset, skb_headlen(from)); *fragstolen = true; - goto copyfrags; } else { if (skb_shinfo(to)->nr_frags + skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) @@ -4589,27 +4586,33 @@ merge: delta = from->truesize - SKB_TRUESIZE(skb_end_pointer(from) - from->head); -copyfrags: - WARN_ON_ONCE(delta < len); - memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, - skb_shinfo(from)->frags, - skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); - skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; - - if (skb_cloned(from)) - for (i = 0; i < skb_shinfo(from)->nr_frags; i++) - skb_frag_ref(from, i); - else - skb_shinfo(from)->nr_frags = 0; - - to->truesize += delta; - atomic_add(delta, &sk->sk_rmem_alloc); - sk_mem_charge(sk, delta); - to->len += len; - to->data_len += len; - goto merge; } - return false; + + WARN_ON_ONCE(delta < len); + + memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, + skb_shinfo(from)->frags, + skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); + skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; + + if (!skb_cloned(from)) + skb_shinfo(from)->nr_frags = 0; + + /* if the skb is cloned this does nothing since we set nr_frags to 0 */ + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) + skb_frag_ref(from, i); + + to->truesize += delta; + atomic_add(delta, &sk->sk_rmem_alloc); + sk_mem_charge(sk, delta); + to->len += len; + to->data_len += len; + +merge: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); + TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; + TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; + return true; } static void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) -- cgit v1.2.3 From 3a7c1ee4ab89f9250b8f82656a7be0ae14aa3691 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 3 May 2012 01:09:42 +0000 Subject: skb: Add skb_head_is_locked helper function This patch adds support for a skb_head_is_locked helper function. It is meant to be used any time we are considering transferring the head from skb->head to a paged frag. If the head is locked it means we cannot remove the head from the skb so it must be copied or we must take the skb as a whole. Signed-off-by: Alexander Duyck Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 84e69e02fe20..7b2d351f24db 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4568,7 +4568,7 @@ static bool tcp_try_coalesce(struct sock *sk, skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) return false; - if (!from->head_frag || skb_cloned(from)) + if (skb_head_is_locked(from)) return false; delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); -- cgit v1.2.3 From 292e8d8c853889140ed77b7b37c66979b13080ae Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 May 2012 01:49:41 +0000 Subject: tcp: Move rcvq sending to tcp_input.c It actually works on the input queue and will use its read mem routines, thus it's better to have in in the tcp_input.c file. Signed-off-by: Pavel Emelyanov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb58b94301ec..7c6c99dcc962 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4746,7 +4746,7 @@ end: skb_set_owner_r(skb, sk); } -int tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, +static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, bool *fragstolen) { int eaten; @@ -4763,6 +4763,39 @@ int tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, return eaten; } +int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) +{ + struct sk_buff *skb; + struct tcphdr *th; + bool fragstolen; + + skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); + if (!skb) + goto err; + + th = (struct tcphdr *)skb_put(skb, sizeof(*th)); + skb_reset_transport_header(skb); + memset(th, 0, sizeof(*th)); + + if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) + goto err_free; + + TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; + TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; + + if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { + WARN_ON_ONCE(fragstolen); /* should not happen */ + __kfree_skb(skb); + } + return size; + +err_free: + kfree_skb(skb); +err: + return -ENOMEM; +} + static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); -- cgit v1.2.3 From 3c961afed4d4e766b66092e7af8c8e8005053505 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 May 2012 01:50:01 +0000 Subject: tcp: Schedule rmem for rcvq repair send As noted by Eric, no checks are performed on the data size we're putting in the read queue during repair. Thus, validate the given data size with the common rmem management routine. Signed-off-by: Pavel Emelyanov Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7c6c99dcc962..164659f2d636 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4769,6 +4769,9 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) struct tcphdr *th; bool fragstolen; + if (tcp_try_rmem_schedule(sk, size + sizeof(*th))) + goto err; + skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); if (!skb) goto err; -- cgit v1.2.3 From 1070b1b831404455d79d15fe94ae9216fb5f8ab4 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 May 2012 01:50:20 +0000 Subject: tcp: Out-line tcp_try_rmem_schedule As proposed by Eric, make the tcp_input.o thinner. add/remove: 1/1 grow/shrink: 1/4 up/down: 868/-1329 (-461) function old new delta tcp_try_rmem_schedule - 864 +864 tcp_ack 4811 4815 +4 tcp_validate_incoming 817 815 -2 tcp_collapse 860 858 -2 tcp_send_rcvq 555 353 -202 tcp_data_queue 3435 3033 -402 tcp_prune_queue 721 - -721 Signed-off-by: Pavel Emelyanov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 164659f2d636..b99ada27a136 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4511,7 +4511,7 @@ static void tcp_ofo_queue(struct sock *sk) static int tcp_prune_ofo_queue(struct sock *sk); static int tcp_prune_queue(struct sock *sk); -static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) +static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || !sk_rmem_schedule(sk, size)) { -- cgit v1.2.3 From e87cc4728f0e2fb663e592a1141742b1d6c63256 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 13 May 2012 21:56:26 +0000 Subject: net: Convert net_ratelimit uses to net__ratelimited Standardize the net core ratelimited logging functions. Coalesce formats, align arguments. Change a printk then vprintk sequence to use printf extension %pV. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b99ada27a136..100b242135b1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3952,10 +3952,9 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; if (snd_wscale > 14) { - if (net_ratelimit()) - pr_info("%s: Illegal window scaling value %d >14 received\n", - __func__, - snd_wscale); + net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n", + __func__, + snd_wscale); snd_wscale = 14; } opt_rx->snd_wscale = snd_wscale; -- cgit v1.2.3 From 91df42bedccb919902c7cf7eb876c982ae7f1b1d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 15 May 2012 14:11:54 +0000 Subject: net: ipv4 and ipv6: Convert printk(KERN_DEBUG to pr_debug Use the current debugging style and enable dynamic_debug. Signed-off-by: Joe Perches Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 100b242135b1..eb97787be757 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -981,12 +981,12 @@ static void tcp_update_reordering(struct sock *sk, const int metric, NET_INC_STATS_BH(sock_net(sk), mib_idx); #if FASTRETRANS_DEBUG > 1 - printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", - tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, - tp->reordering, - tp->fackets_out, - tp->sacked_out, - tp->undo_marker ? tp->undo_retrans : 0); + pr_debug("Disorder%d %d %u f%u s%u rr%d\n", + tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, + tp->reordering, + tp->fackets_out, + tp->sacked_out, + tp->undo_marker ? tp->undo_retrans : 0); #endif tcp_disable_fack(tp); } @@ -2716,22 +2716,22 @@ static void DBGUNDO(struct sock *sk, const char *msg) struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { - printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", - msg, - &inet->inet_daddr, ntohs(inet->inet_dport), - tp->snd_cwnd, tcp_left_out(tp), - tp->snd_ssthresh, tp->prior_ssthresh, - tp->packets_out); + pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", + msg, + &inet->inet_daddr, ntohs(inet->inet_dport), + tp->snd_cwnd, tcp_left_out(tp), + tp->snd_ssthresh, tp->prior_ssthresh, + tp->packets_out); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", - msg, - &np->daddr, ntohs(inet->inet_dport), - tp->snd_cwnd, tcp_left_out(tp), - tp->snd_ssthresh, tp->prior_ssthresh, - tp->packets_out); + pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", + msg, + &np->daddr, ntohs(inet->inet_dport), + tp->snd_cwnd, tcp_left_out(tp), + tp->snd_ssthresh, tp->prior_ssthresh, + tp->packets_out); } #endif } @@ -3511,18 +3511,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!tp->packets_out && tcp_is_sack(tp)) { icsk = inet_csk(sk); if (tp->lost_out) { - printk(KERN_DEBUG "Leak l=%u %d\n", - tp->lost_out, icsk->icsk_ca_state); + pr_debug("Leak l=%u %d\n", + tp->lost_out, icsk->icsk_ca_state); tp->lost_out = 0; } if (tp->sacked_out) { - printk(KERN_DEBUG "Leak s=%u %d\n", - tp->sacked_out, icsk->icsk_ca_state); + pr_debug("Leak s=%u %d\n", + tp->sacked_out, icsk->icsk_ca_state); tp->sacked_out = 0; } if (tp->retrans_out) { - printk(KERN_DEBUG "Leak r=%u %d\n", - tp->retrans_out, icsk->icsk_ca_state); + pr_debug("Leak r=%u %d\n", + tp->retrans_out, icsk->icsk_ca_state); tp->retrans_out = 0; } } -- cgit v1.2.3 From a2a385d627e1549da4b43a8b3dfe370589766e1c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 May 2012 23:15:34 +0000 Subject: tcp: bool conversions bool conversions where possible. __inline__ -> inline space cleanups Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 214 ++++++++++++++++++++++++++------------------------- 1 file changed, 108 insertions(+), 106 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb97787be757..b961ef54b17d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -196,9 +196,10 @@ static void tcp_enter_quickack_mode(struct sock *sk) * and the session is not interactive. 
*/ -static inline int tcp_in_quickack_mode(const struct sock *sk) +static inline bool tcp_in_quickack_mode(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } @@ -253,11 +254,11 @@ static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) tp->ecn_flags &= ~TCP_ECN_OK; } -static inline int TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) +static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) - return 1; - return 0; + return true; + return false; } /* Buffer size and advertised window tuning. @@ -1123,36 +1124,36 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, * the exact amount is rather hard to quantify. However, tp->max_window can * be used as an exaggerated estimate. */ -static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, - u32 start_seq, u32 end_seq) +static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, + u32 start_seq, u32 end_seq) { /* Too far in future, or reversed (interpretation is ambiguous) */ if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) - return 0; + return false; /* Nasty start_seq wrap-around check (see comments above) */ if (!before(start_seq, tp->snd_nxt)) - return 0; + return false; /* In outstanding window? ...This is valid exit for D-SACKs too. * start_seq == snd_una is non-sensical (see comments above) */ if (after(start_seq, tp->snd_una)) - return 1; + return true; if (!is_dsack || !tp->undo_marker) - return 0; + return false; /* ...Then it's D-SACK, and must reside below snd_una completely */ if (after(end_seq, tp->snd_una)) - return 0; + return false; if (!before(start_seq, tp->undo_marker)) - return 1; + return true; /* Too old */ if (!after(end_seq, tp->undo_marker)) - return 0; + return false; /* Undo_marker boundary crossing (overestimates a lot). Known already: * start_seq < undo_marker and end_seq >= undo_marker. 
@@ -1224,17 +1225,17 @@ static void tcp_mark_lost_retrans(struct sock *sk) tp->lost_retrans_low = new_low_seq; } -static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, - struct tcp_sack_block_wire *sp, int num_sacks, - u32 prior_snd_una) +static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, + struct tcp_sack_block_wire *sp, int num_sacks, + u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); - int dup_sack = 0; + bool dup_sack = false; if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { - dup_sack = 1; + dup_sack = true; tcp_dsack_seen(tp); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { @@ -1243,7 +1244,7 @@ static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, if (!after(end_seq_0, end_seq_1) && !before(start_seq_0, start_seq_1)) { - dup_sack = 1; + dup_sack = true; tcp_dsack_seen(tp); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); @@ -1274,9 +1275,10 @@ struct tcp_sacktag_state { * FIXME: this could be merged to shift decision code */ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, - u32 start_seq, u32 end_seq) + u32 start_seq, u32 end_seq) { - int in_sack, err; + int err; + bool in_sack; unsigned int pkt_len; unsigned int mss; @@ -1322,7 +1324,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - int dup_sack, int pcount) + bool dup_sack, int pcount) { struct tcp_sock *tp = tcp_sk(sk); int fack_count = state->fack_count; @@ -1402,10 +1404,10 @@ static u8 tcp_sacktag_one(struct sock *sk, /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. 
*/ -static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, - struct tcp_sacktag_state *state, - unsigned int pcount, int shifted, int mss, - int dup_sack) +static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, + struct tcp_sacktag_state *state, + unsigned int pcount, int shifted, int mss, + bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev = tcp_write_queue_prev(sk, skb); @@ -1455,7 +1457,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (skb->len > 0) { BUG_ON(!tcp_skb_pcount(skb)); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); - return 0; + return false; } /* Whole SKB was eaten :-) */ @@ -1478,7 +1480,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); - return 1; + return true; } /* I wish gso_size would have a bit more sane initialization than @@ -1501,7 +1503,7 @@ static int skb_can_shift(const struct sk_buff *skb) static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, - int dup_sack) + bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev; @@ -1640,14 +1642,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, - int dup_sack_in) + bool dup_sack_in) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; tcp_for_write_queue_from(skb, sk) { int in_sack = 0; - int dup_sack = dup_sack_in; + bool dup_sack = dup_sack_in; if (skb == tcp_send_head(sk)) break; @@ -1662,7 +1664,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, next_dup->start_seq, next_dup->end_seq); if (in_sack > 0) - dup_sack = 1; + dup_sack = true; } /* skb reference here is a bit tricky to get right, since @@ -1767,7 +1769,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, struct sk_buff *skb; int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; - int found_dup_sack = 0; + bool found_dup_sack = false; int i, j; int first_sack_index; @@ -1798,7 +1800,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, used_sacks = 0; first_sack_index = 0; for (i = 0; i < num_sacks; i++) { - int dup_sack = !i && found_dup_sack; + bool dup_sack = !i && found_dup_sack; sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); @@ -1865,7 +1867,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, while (i < used_sacks) { u32 start_seq = sp[i].start_seq; u32 end_seq = sp[i].end_seq; - int dup_sack = (found_dup_sack && (i == first_sack_index)); + bool dup_sack = (found_dup_sack && (i == first_sack_index)); struct tcp_sack_block *next_dup = NULL; if (found_dup_sack && ((i + 1) == first_sack_index)) @@ -1967,9 +1969,9 @@ out: } /* Limits sacked_out so that sum with lost_out isn't ever larger than - * packets_out. Returns zero if sacked_out adjustement wasn't necessary. + * packets_out. Returns false if sacked_out adjustement wasn't necessary. 
*/ -static int tcp_limit_reno_sacked(struct tcp_sock *tp) +static bool tcp_limit_reno_sacked(struct tcp_sock *tp) { u32 holes; @@ -1978,9 +1980,9 @@ static int tcp_limit_reno_sacked(struct tcp_sock *tp) if ((tp->sacked_out + holes) > tp->packets_out) { tp->sacked_out = tp->packets_out - holes; - return 1; + return true; } - return 0; + return false; } /* If we receive more dupacks than we expected counting segments @@ -2034,40 +2036,40 @@ static int tcp_is_sackfrto(const struct tcp_sock *tp) /* F-RTO can only be used if TCP has never retransmitted anything other than * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ -int tcp_use_frto(struct sock *sk) +bool tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; if (!sysctl_tcp_frto) - return 0; + return false; /* MTU probe and F-RTO won't really play nicely along currently */ if (icsk->icsk_mtup.probe_size) - return 0; + return false; if (tcp_is_sackfrto(tp)) - return 1; + return true; /* Avoid expensive walking of rexmit queue if possible */ if (tp->retrans_out > 1) - return 0; + return false; skb = tcp_write_queue_head(sk); if (tcp_skb_is_last(sk, skb)) - return 1; + return true; skb = tcp_write_queue_next(sk, skb); /* Skips head */ tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) break; if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) - return 0; + return false; /* Short-circuit when first non-SACKed skb has been checked */ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) break; } - return 1; + return true; } /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO @@ -2303,7 +2305,7 @@ void tcp_enter_loss(struct sock *sk, int how) * * Do processing similar to RTO timeout. */ -static int tcp_check_sack_reneging(struct sock *sk, int flag) +static bool tcp_check_sack_reneging(struct sock *sk, int flag) { if (flag & FLAG_SACK_RENEGING) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -2314,9 +2316,9 @@ static int tcp_check_sack_reneging(struct sock *sk, int flag) tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - return 1; + return true; } - return 0; + return false; } static inline int tcp_fackets_out(const struct tcp_sock *tp) @@ -2472,28 +2474,28 @@ static inline int tcp_head_timedout(const struct sock *sk) * Main question: may we further continue forward transmission * with the same cwnd? */ -static int tcp_time_to_recover(struct sock *sk, int flag) +static bool tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; /* Do not perform any recovery during F-RTO algorithm */ if (tp->frto_counter) - return 0; + return false; /* Trick#1: The loss is proven. */ if (tp->lost_out) - return 1; + return true; /* Not-A-Trick#2 : Classic rule... */ if (tcp_dupack_heuristics(tp) > tp->reordering) - return 1; + return true; /* Trick#3 : when we use RFC2988 timer restart, fast * retransmit can be triggered by timeout of queue head. */ if (tcp_is_fack(tp) && tcp_head_timedout(sk)) - return 1; + return true; /* Trick#4: It is still not OK... But will it be useful to delay * recovery more? @@ -2505,7 +2507,7 @@ static int tcp_time_to_recover(struct sock *sk, int flag) /* We have nothing to send. This connection is limited * either by receiver window or by application. 
*/ - return 1; + return true; } /* If a thin stream is detected, retransmit after first @@ -2516,7 +2518,7 @@ static int tcp_time_to_recover(struct sock *sk, int flag) if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && tcp_is_sack(tp) && !tcp_send_head(sk)) - return 1; + return true; /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious * retransmissions due to small network reorderings, we implement @@ -2528,7 +2530,7 @@ static int tcp_time_to_recover(struct sock *sk, int flag) !tcp_may_send_now(sk)) return !tcp_pause_early_retransmit(sk, flag); - return 0; + return false; } /* New heuristics: it is possible only after we switched to restart timer @@ -2767,7 +2769,7 @@ static inline int tcp_may_undo(const struct tcp_sock *tp) } /* People celebrate: "We love our President!" */ -static int tcp_try_undo_recovery(struct sock *sk) +static bool tcp_try_undo_recovery(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2792,10 +2794,10 @@ static int tcp_try_undo_recovery(struct sock *sk) * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ tcp_moderate_cwnd(tp); - return 1; + return true; } tcp_set_ca_state(sk, TCP_CA_Open); - return 0; + return false; } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ @@ -2825,19 +2827,19 @@ static void tcp_try_undo_dsack(struct sock *sk) * that successive retransmissions of a segment must not advance * retrans_stamp under any conditions. */ -static int tcp_any_retrans_done(const struct sock *sk) +static bool tcp_any_retrans_done(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; if (tp->retrans_out) - return 1; + return true; skb = tcp_write_queue_head(sk); if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) - return 1; + return true; - return 0; + return false; } /* Undo during fast recovery after partial ACK. */ @@ -2871,7 +2873,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) } /* Undo during loss recovery after partial ACK. */ -static int tcp_try_undo_loss(struct sock *sk) +static bool tcp_try_undo_loss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2893,9 +2895,9 @@ static int tcp_try_undo_loss(struct sock *sk) tp->undo_marker = 0; if (tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); - return 1; + return true; } - return 0; + return false; } static inline void tcp_complete_cwr(struct sock *sk) @@ -3370,7 +3372,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; u32 now = tcp_time_stamp; - int fully_acked = 1; + int fully_acked = true; int flag = 0; u32 pkts_acked = 0; u32 reord = tp->packets_out; @@ -3394,7 +3396,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!acked_pcount) break; - fully_acked = 0; + fully_acked = false; } else { acked_pcount = tcp_skb_pcount(skb); } @@ -3673,7 +3675,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) * to prove that the RTO is indeed spurious. It transfers the control * from F-RTO to the conventional RTO recovery */ -static int tcp_process_frto(struct sock *sk, int flag) +static bool tcp_process_frto(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -3689,7 +3691,7 @@ static int tcp_process_frto(struct sock *sk, int flag) if (!before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 
2 : 3), flag); - return 1; + return true; } if (!tcp_is_sackfrto(tp)) { @@ -3698,19 +3700,19 @@ static int tcp_process_frto(struct sock *sk, int flag) * data, winupdate */ if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) - return 1; + return true; if (!(flag & FLAG_DATA_ACKED)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), flag); - return 1; + return true; } } else { if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { /* Prevent sending of new data. */ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)); - return 1; + return true; } if ((tp->frto_counter >= 2) && @@ -3720,10 +3722,10 @@ static int tcp_process_frto(struct sock *sk, int flag) /* RFC4138 shortcoming (see comment above) */ if (!(flag & FLAG_FORWARD_PROGRESS) && (flag & FLAG_NOT_DUP)) - return 1; + return true; tcp_enter_frto_loss(sk, 3, flag); - return 1; + return true; } } @@ -3735,7 +3737,7 @@ static int tcp_process_frto(struct sock *sk, int flag) if (!tcp_may_send_now(sk)) tcp_enter_frto_loss(sk, 2, flag); - return 1; + return true; } else { switch (sysctl_tcp_frto_response) { case 2: @@ -3752,7 +3754,7 @@ static int tcp_process_frto(struct sock *sk, int flag) tp->undo_marker = 0; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); } - return 0; + return false; } /* This routine deals with incoming acks, but not outgoing ones. */ @@ -3770,7 +3772,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_sacked = tp->sacked_out; int pkts_acked = 0; int newly_acked_sacked = 0; - int frto_cwnd = 0; + bool frto_cwnd = false; /* If the ack is older than previous acks * then we can probably ignore it. @@ -4025,7 +4027,7 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o } EXPORT_SYMBOL(tcp_parse_options); -static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) +static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) { const __be32 *ptr = (const __be32 *)(th + 1); @@ -4036,31 +4038,31 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr tp->rx_opt.rcv_tsval = ntohl(*ptr); ++ptr; tp->rx_opt.rcv_tsecr = ntohl(*ptr); - return 1; + return true; } - return 0; + return false; } /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ -static int tcp_fast_parse_options(const struct sk_buff *skb, - const struct tcphdr *th, - struct tcp_sock *tp, const u8 **hvpp) +static bool tcp_fast_parse_options(const struct sk_buff *skb, + const struct tcphdr *th, + struct tcp_sock *tp, const u8 **hvpp) { /* In the spirit of fast parsing, compare doff directly to constant * values. Because equality is used, short doff can be ignored here. 
*/ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; - return 0; + return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { if (tcp_parse_aligned_timestamp(tp, th)) - return 1; + return true; } tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); - return 1; + return true; } #ifdef CONFIG_TCP_MD5SIG @@ -4301,7 +4303,7 @@ static void tcp_fin(struct sock *sk) } } -static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, +static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) { if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { @@ -4309,9 +4311,9 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, sp->start_seq = seq; if (after(end_seq, sp->end_seq)) sp->end_seq = end_seq; - return 1; + return true; } - return 0; + return false; } static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) @@ -4507,7 +4509,7 @@ static void tcp_ofo_queue(struct sock *sk) } } -static int tcp_prune_ofo_queue(struct sock *sk); +static bool tcp_prune_ofo_queue(struct sock *sk); static int tcp_prune_queue(struct sock *sk); static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) @@ -5092,10 +5094,10 @@ static void tcp_collapse_ofo_queue(struct sock *sk) * Purge the out-of-order queue. * Return true if queue was pruned. */ -static int tcp_prune_ofo_queue(struct sock *sk) +static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - int res = 0; + bool res = false; if (!skb_queue_empty(&tp->out_of_order_queue)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); @@ -5109,7 +5111,7 @@ static int tcp_prune_ofo_queue(struct sock *sk) if (tp->rx_opt.sack_ok) tcp_sack_reset(&tp->rx_opt); sk_mem_reclaim(sk); - res = 1; + res = true; } return res; } @@ -5186,7 +5188,7 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static int tcp_should_expand_sndbuf(const struct sock *sk) +static bool tcp_should_expand_sndbuf(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -5194,21 +5196,21 @@ static int tcp_should_expand_sndbuf(const struct sock *sk) * not modify it. */ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) - return 0; + return false; /* If we are under global TCP memory pressure, do not expand. */ if (sk_under_memory_pressure(sk)) - return 0; + return false; /* If we are under soft global TCP memory pressure, do not expand. */ if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) - return 0; + return false; /* If we filled the congestion window, do not expand. 
*/ if (tp->packets_out >= tp->snd_cwnd) - return 0; + return false; - return 1; + return true; } /* When incoming ACK allowed to free some skb from write_queue, @@ -5434,16 +5436,16 @@ static inline int tcp_checksum_complete_user(struct sock *sk, } #ifdef CONFIG_NET_DMA -static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, +static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) { struct tcp_sock *tp = tcp_sk(sk); int chunk = skb->len - hlen; int dma_cookie; - int copied_early = 0; + bool copied_early = false; if (tp->ucopy.wakeup) - return 0; + return false; if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) tp->ucopy.dma_chan = net_dma_find_channel(); @@ -5459,7 +5461,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, goto out; tp->ucopy.dma_cookie = dma_cookie; - copied_early = 1; + copied_early = true; tp->ucopy.len -= chunk; tp->copied_seq += chunk; -- cgit v1.2.3 From bad43ca8325f493dcaa0896c2f036276af059c7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 May 2012 03:02:02 +0000 Subject: net: introduce skb_try_coalesce() Move the protocol-independent part of tcp_try_coalesce() to skb_try_coalesce(). skb_try_coalesce() can be used in IPv4 defrag and IPv6 reassembly, to build optimized skbs (fewer sk_buffs, and possibly fewer 'headers'). skb_try_coalesce() is zero copy, unless the copy can fit in the destination header (it's a rare case). kfree_skb_partial() is also moved to net/core/skbuff.c and exported, because IPv6 will need it in a later patch (ipv6: use skb coalescing in reassembly). Signed-off-by: Eric Dumazet Cc: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 67 +++------------------------------------------------- 1 file changed, 3 insertions(+), 64 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b961ef54b17d..cfa2aa128342 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4549,84 +4549,23 @@ static bool tcp_try_coalesce(struct sock *sk, struct sk_buff *from, bool *fragstolen) { - int i, delta, len = from->len; + int delta; *fragstolen = false; - if (tcp_hdr(from)->fin || skb_cloned(to)) + if (tcp_hdr(from)->fin) return false; - - if (len <= skb_tailroom(to)) { - BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); - goto merge; - } - - if (skb_has_frag_list(to) || skb_has_frag_list(from)) + if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; - if (skb_headlen(from) != 0) { struct page *page; - unsigned int offset; - - if (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) - return false; - - if (skb_head_is_locked(from)) - return false; - - delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); - - page = virt_to_head_page(from->head); - offset = from->data - (unsigned char *)page_address(page); - - skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, - page, offset, skb_headlen(from)); - *fragstolen = true; - } else { - if (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) - return false; - - delta = from->truesize - - SKB_TRUESIZE(skb_end_pointer(from) - from->head); - } - - WARN_ON_ONCE(delta < len); - - memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, - skb_shinfo(from)->frags, - skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); - skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; - - if (!skb_cloned(from)) - skb_shinfo(from)->nr_frags = 0; - - /* if the skb is cloned this does nothing since we set nr_frags to 0 */ - for (i = 0; i < 
skb_shinfo(from)->nr_frags; i++) - skb_frag_ref(from, i); - - to->truesize += delta; atomic_add(delta, &sk->sk_rmem_alloc); sk_mem_charge(sk, delta); - to->len += len; - to->data_len += len; - -merge: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; return true; } -static void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) -{ - if (head_stolen) - kmem_cache_free(skbuff_head_cache, skb); - else - __kfree_skb(skb); -} - static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From 1ca7ee30630e1022dbcf1b51be20580815ffab73 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 May 2012 17:51:37 +0000 Subject: tcp: take care of overlaps in tcp_try_coalesce() Sergio Correia reported the following warning: WARNING: at net/ipv4/tcp.c:1301 tcp_cleanup_rbuf+0x4f/0x110() WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); It appears TCP coalescing, and more specifically commit b081f85c297 (net: implement tcp coalescing in tcp_queue_rcv()), should take care of possible segment overlaps in the receive queue. This was properly done in the case of the out_of_order_queue by the caller. For example, the segment at the tail of the queue has sequence 1000-2000, and we add a segment with sequence 1500-2500. This can happen in the case of retransmits. In this case, just don't do the coalescing. Reported-by: Sergio Correia Signed-off-by: Eric Dumazet Tested-by: Sergio Correia Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cfa2aa128342..b224eb8bce8b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4555,6 +4555,11 @@ static bool tcp_try_coalesce(struct sock *sk, if (tcp_hdr(from)->fin) return false; + + /* Its possible this segment overlaps with prior segment in queue */ + if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) + return false; + if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; -- cgit v1.2.3
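
Editorial note on the skb_try_coalesce() changelog above: the policy it describes is simple: when a new buffer is merged into an existing one, copy the payload only if it fits into space the destination already owns, otherwise take a zero-copy reference to the source memory. The user-space sketch below is only a loose analogy of that decision under stated assumptions; struct toybuf, toy_try_coalesce() and the field names are invented for illustration and do not correspond to any kernel API.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Toy destination buffer: a small linear area (think "tailroom") plus an
 * optional zero-copy reference taken when a source is too big to copy.
 */
struct toybuf {
        char linear[64];        /* space the destination already owns */
        size_t len;             /* bytes used in 'linear' */
        const char *ref;        /* zero-copy reference to a large source */
        size_t ref_len;
};

/* Merge 'src' into 'dst': copy when it fits (the rare small case),
 * otherwise keep a reference instead of copying.  Returns false once the
 * destination already holds a reference and cannot absorb more.
 */
static bool toy_try_coalesce(struct toybuf *dst, const char *src, size_t len)
{
        if (dst->ref)
                return false;

        if (len <= sizeof(dst->linear) - dst->len) {
                memcpy(dst->linear + dst->len, src, len);       /* copy path */
                dst->len += len;
        } else {
                dst->ref = src;                                 /* zero-copy path */
                dst->ref_len = len;
        }
        return true;
}

int main(void)
{
        static const char big[256];     /* too large to copy into 'linear' */
        struct toybuf dst = { .len = 0, .ref = NULL, .ref_len = 0 };

        printf("small source merged by copy:      %d\n",
               toy_try_coalesce(&dst, "abc", 3));
        printf("large source merged by reference: %d\n",
               toy_try_coalesce(&dst, big, sizeof(big)));
        return 0;
}

In the patch above, tcp_try_coalesce() keeps only the TCP-specific bookkeeping (rmem/truesize accounting and sequence-number updates) and delegates the copy-versus-steal decision to skb_try_coalesce().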
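
To make the overlap guard added by the last patch concrete, here is a small, self-contained sketch in plain C that replays the example from the changelog: a merge is attempted only when the new segment starts exactly where the tail of the queue ends, so a retransmitted 1500-2500 segment is kept separate from a 1000-2000 tail, while an in-order 2000-3000 segment would coalesce. This is illustration only; struct seg and can_coalesce() are invented names, not kernel code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* A TCP segment covers [seq, end_seq): end_seq is one past the last byte. */
struct seg {
        uint32_t seq;
        uint32_t end_seq;
};

/* Mirrors the check the patch adds to tcp_try_coalesce(): merge into the
 * tail skb only when the new segment is exactly adjacent to it.  Equality
 * on u32 sequence numbers is safe across wrap-around, so no before()/after()
 * style helpers are needed for this particular test.
 */
static bool can_coalesce(const struct seg *tail, const struct seg *from)
{
        return from->seq == tail->end_seq;
}

int main(void)
{
        struct seg tail    = { .seq = 1000, .end_seq = 2000 };
        struct seg retrans = { .seq = 1500, .end_seq = 2500 }; /* overlaps the tail */
        struct seg next    = { .seq = 2000, .end_seq = 3000 }; /* exactly adjacent */

        printf("1500-2500 after 1000-2000: %s\n",
               can_coalesce(&tail, &retrans) ? "coalesce" : "queue separately");
        printf("2000-3000 after 1000-2000: %s\n",
               can_coalesce(&tail, &next) ? "coalesce" : "queue separately");
        return 0;
}

Rejecting the overlapping segment lets the normal queueing path handle it, which avoids the copied_seq/end_seq inconsistency behind the warning quoted in the changelog.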