From b5da623ae9be680ea59f268eeb339f0acb2d88c4 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 10 Aug 2005 18:32:36 -0700 Subject: [TCP]: Adjust {p,f}ackets_out correctly in tcp_retransmit_skb() Well I've only found one potential cause for the assertion failure in tcp_mark_head_lost. First of all, this can only occur if cnt > 1 since tp->packets_out is never zero here. If it did hit zero we'd have much bigger problems. So cnt is equal to fackets_out - reordering. Normally fackets_out is less than packets_out. The only reason I've found that might cause fackets_out to exceed packets_out is if tcp_fragment is called from tcp_retransmit_skb with a TSO skb and the current MSS is greater than the MSS stored in the TSO skb. This might occur as the result of an expiring dst entry. In that case, packets_out may decrease (line 1380-1381 in tcp_output.c). However, fackets_out is unchanged which means that it may in fact exceed packets_out. Previously tcp_retrans_try_collapse was the only place where packets_out can go down and it takes care of this by decrementing fackets_out. So we should make sure that fackets_out is reduced by an appropriate amount here as well. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7d076f0db100..3ed6fc15815b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1370,15 +1370,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (skb->len > cur_mss) { int old_factor = tcp_skb_pcount(skb); - int new_factor; + int diff; if (tcp_fragment(sk, skb, cur_mss, cur_mss)) return -ENOMEM; /* We'll try again later. */ /* New SKB created, account for it. */ - new_factor = tcp_skb_pcount(skb); - tp->packets_out -= old_factor - new_factor; - tp->packets_out += tcp_skb_pcount(skb->next); + diff = old_factor - tcp_skb_pcount(skb) - + tcp_skb_pcount(skb->next); + tp->packets_out -= diff; + + if (diff > 0) { + tp->fackets_out -= diff; + if ((int)tp->fackets_out < 0) + tp->fackets_out = 0; + } } /* Collapse two adjacent packets if worthwhile and we can. */ -- cgit v1.2.3 From c8ac37746489f05a32a958b048f29ae45487e81e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 16 Aug 2005 20:43:40 -0700 Subject: [TCP]: Fix bug #5070: kernel BUG at net/ipv4/tcp_output.c:864 1) We send out a normal sized packet with TSO on to start off. 2) ICMP is received indicating a smaller MTU. 3) We send the current sk_send_head which needs to be fragmented since it was created before the ICMP event. The first fragment is then sent out. At this point the remaining fragment is allocated by tcp_fragment. However, its size is padded to fit the L1 cache-line size therefore creating tail-room up to 124 bytes long. This fragment will also be sitting at sk_send_head. 4) tcp_sendmsg is called again and it stores data in the tail-room of of the fragment. 5) tcp_push_one is called by tcp_sendmsg which then calls tso_fragment since the packet as a whole exceeds the MTU. At this point we have a packet that has data in the head area being fed to tso_fragment which bombs out. My take on this is that we shouldn't ever call tcp_fragment on a TSO socket for a packet that is yet to be transmitted since this creates a packet on sk_send_head that cannot be extended. So here is a patch to change it so that tso_fragment is always used in this case. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3ed6fc15815b..566045e58437 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -861,7 +861,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, u16 flags; /* All of a TSO frame must be composed of paged data. */ - BUG_ON(skb->len != skb->data_len); + if (skb->len != skb->data_len) + return tcp_fragment(sk, skb, len, mss_now); buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC); if (unlikely(buff == NULL)) @@ -974,6 +975,8 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) sent_pkts = 0; while ((skb = sk->sk_send_head)) { + unsigned int limit; + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); @@ -994,9 +997,10 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) break; } + limit = mss_now; if (tso_segs > 1) { - u32 limit = tcp_window_allows(tp, skb, - mss_now, cwnd_quota); + limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); if (skb->len < limit) { unsigned int trim = skb->len % mss_now; @@ -1004,15 +1008,12 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) if (trim) limit = skb->len - trim; } - if (skb->len > limit) { - if (tso_fragment(sk, skb, limit, mss_now)) - break; - } - } else if (unlikely(skb->len > mss_now)) { - if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now))) - break; } + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now))) + break; + TCP_SKB_CB(skb)->when = tcp_time_stamp; if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) @@ -1064,11 +1065,14 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH); if (likely(cwnd_quota)) { + unsigned int limit; + BUG_ON(!tso_segs); + limit = mss_now; if (tso_segs > 1) { - u32 limit = tcp_window_allows(tp, skb, - mss_now, cwnd_quota); + limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); if (skb->len < limit) { unsigned int trim = skb->len % mss_now; @@ -1076,15 +1080,12 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) if (trim) limit = skb->len - trim; } - if (skb->len > limit) { - if (unlikely(tso_fragment(sk, skb, limit, mss_now))) - return; - } - } else if (unlikely(skb->len > mss_now)) { - if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now))) - return; } + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now))) + return; + /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; -- cgit v1.2.3