From f5184d267c1aedb9b7a8cc44e08ff6b8d382c3b5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 12 May 2008 20:48:31 -0700 Subject: net: Allow netdevices to specify needed head/tailroom This patch adds needed_headroom/needed_tailroom members to struct net_device and updates many places that allocate sbks to use them. Not all of them can be converted though, and I'm sure I missed some (I mostly grepped for LL_RESERVED_SPACE) Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/ipv4/arp.c | 2 +- net/ipv4/igmp.c | 4 ++-- net/ipv4/ipconfig.c | 6 +++--- net/ipv4/raw.c | 10 ++++------ 4 files changed, 10 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 68b72a7a1806..418862f1bf22 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -570,7 +570,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, * Allocate a buffer */ - skb = alloc_skb(arp_hdr_len(dev) + LL_RESERVED_SPACE(dev), GFP_ATOMIC); + skb = alloc_skb(arp_hdr_len(dev) + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); if (skb == NULL) return NULL; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6250f4239b61..2769dc4a4c84 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -292,7 +292,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) struct iphdr *pip; struct igmpv3_report *pig; - skb = alloc_skb(size + LL_RESERVED_SPACE(dev), GFP_ATOMIC); + skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); if (skb == NULL) return NULL; @@ -653,7 +653,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, return -1; } - skb=alloc_skb(IGMP_SIZE+LL_RESERVED_SPACE(dev), GFP_ATOMIC); + skb=alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); if (skb == NULL) { ip_rt_put(rt); return -1; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 89dee4346f60..ed45037ce9be 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -710,14 +710,14 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d struct net_device *dev = d->dev; struct sk_buff *skb; struct bootp_pkt *b; - int hh_len = LL_RESERVED_SPACE(dev); struct iphdr *h; /* Allocate packet */ - skb = alloc_skb(sizeof(struct bootp_pkt) + hh_len + 15, GFP_KERNEL); + skb = alloc_skb(sizeof(struct bootp_pkt) + LL_ALLOCATED_SPACE(dev) + 15, + GFP_KERNEL); if (!skb) return; - skb_reserve(skb, hh_len); + skb_reserve(skb, LL_RESERVED_SPACE(dev)); b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); memset(b, 0, sizeof(struct bootp_pkt)); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 11d7f753a820..fead049daf43 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -322,7 +322,6 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); - int hh_len; struct iphdr *iph; struct sk_buff *skb; unsigned int iphlen; @@ -336,13 +335,12 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, if (flags&MSG_PROBE) goto out; - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); - - skb = sock_alloc_send_skb(sk, length+hh_len+15, - flags&MSG_DONTWAIT, &err); + skb = sock_alloc_send_skb(sk, + length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, + flags & MSG_DONTWAIT, &err); if (skb == NULL) goto error; - skb_reserve(skb, hh_len); + skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; -- cgit v1.2.3 From a1c1f281b84a751fdb5ff919da3b09df7297619f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 13 May 2008 02:53:26 -0700 Subject: tcp FRTO: Fix fallback to conventional recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems that commit 009a2e3e4ec ("[TCP] FRTO: Improve interoperability with other undo_marker users") run into another land-mine which caused fallback to conventional recovery to break: 1. Cumulative ACK arrives after FRTO retransmission 2. tcp_try_to_open sees zero retrans_out, clears retrans_stamp which should be kept like in CA_Loss state it would be 3. undo_marker change allowed tcp_packet_delayed to return true because of the cleared retrans_stamp once FRTO is terminated causing LossUndo to occur, which means all loss markings FRTO made are reverted. This means that the conventional recovery basically recovered one loss per RTT, which is not that efficient. It was quite unobvious that the undo_marker change broken something like this, I had a quite long session to track it down because of the non-intuitiviness of the bug (luckily I had a trivial reproducer at hand and I was also able to learn to use kprobes in the process as well :-)). This together with the NewReno+FRTO fix and FRTO in-order workaround this fixes Damon's problems, this and the first mentioned are enough to fix Bugzilla #10063. Signed-off-by: Ilpo Järvinen Reported-by: Damon L. Chesser Tested-by: Damon L. Chesser Tested-by: Sebastian Hyrwall Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 26c936930e92..d6edb98fd526 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2482,7 +2482,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) tcp_verify_left_out(tp); - if (tp->retrans_out == 0) + if (!tp->frto_counter && tp->retrans_out == 0) tp->retrans_stamp = 0; if (flag & FLAG_ECE) -- cgit v1.2.3 From 79d44516b4b178ffb6e2159c75584cfcfc097914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 13 May 2008 02:54:19 -0700 Subject: tcp FRTO: work-around inorder receivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If receiver consumes segments successfully only in-order, FRTO fallback to conventional recovery produces RTO loop because FRTO's forward transmissions will always get dropped and need to be resent, yet by default they're not marked as lost (which are the only segments we will retransmit in CA_Loss). Price to pay about this is occassionally unnecessarily retransmitting the forward transmission(s). SACK blocks help a bit to avoid this, so it's mainly a concern for NewReno case though SACK is not fully immune either. This change has a side-effect of fixing SACKFRTO problem where it didn't have snd_nxt of the RTO time available anymore when fallback become necessary (this problem would have only occured when RTO would occur for two or more segments and ECE arrives in step 3; no need to figure out how to fix that unless the TODO item of selective behavior is considered in future). Signed-off-by: Ilpo Järvinen Reported-by: Damon L. Chesser Tested-by: Damon L. Chesser Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6edb98fd526..b54d9d37b636 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1842,9 +1842,16 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; } - /* Don't lost mark skbs that were fwd transmitted after RTO */ - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) && - !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) { + /* Marking forward transmissions that were made after RTO lost + * can cause unnecessary retransmissions in some scenarios, + * SACK blocks will mitigate that in some but not in all cases. + * We used to not mark them but it was causing break-ups with + * receivers that do only in-order receival. + * + * TODO: we could detect presence of such receiver and select + * different behavior per flow. + */ + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); } @@ -1860,7 +1867,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); - tp->high_seq = tp->frto_highmark; + tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); tcp_clear_retrans_hints_partial(tp); -- cgit v1.2.3 From 5e0f8923f350ff522f8f6aecf198df045af3615f Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 13 May 2008 23:23:55 -0700 Subject: cipso: Relax too much careful cipso hash function. The cipso_v4_cache is allocated to contain CIPSO_V4_CACHE_BUCKETS buckets. The CIPSO_V4_CACHE_BUCKETS = 1 << CIPSO_V4_CACHE_BUCKETBITS, where CIPSO_V4_CACHE_BUCKETBITS = 7. The bucket-selection function for this hash is calculated like this: bkt = hash & (CIPSO_V4_CACHE_BUCKETBITS - 1); ^^^ i.e. picking only 4 buckets of possible 128 :) Signed-off-by: Pavel Emelyanov Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 05afb576d935..2c0e4572cc90 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -338,7 +338,7 @@ static int cipso_v4_cache_check(const unsigned char *key, return -ENOENT; hash = cipso_v4_map_cache_hash(key, key_len); - bkt = hash & (CIPSO_V4_CACHE_BUCKETBITS - 1); + bkt = hash & (CIPSO_V4_CACHE_BUCKETS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) { if (entry->hash == hash && @@ -417,7 +417,7 @@ int cipso_v4_cache_add(const struct sk_buff *skb, atomic_inc(&secattr->cache->refcount); entry->lsm_data = secattr->cache; - bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETBITS - 1); + bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) { list_add(&entry->list, &cipso_v4_cache[bkt].list); -- cgit v1.2.3 From 1ac06e0306d0192a7a4d9ea1c9e06d355ce7e7d3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 20 May 2008 14:32:14 -0700 Subject: ipsec: Use the correct ip_local_out function Because the IPsec output function xfrm_output_resume does its own dst_output call it should always call __ip_local_output instead of ip_local_output as the latter may invoke dst_output directly. Otherwise the return values from nf_hook and dst_output may clash as they both use the value 1 but for different purposes. When that clash occurs this can cause a packet to be used after it has been freed which usually leads to a crash. Because the offending value is only returned from dst_output with qdiscs such as HTB, this bug is normally not visible. Thanks to Marco Berizzi for his perseverance in tracking this down. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 92f90ae46f4a..df41026b60db 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -160,7 +160,7 @@ static struct dst_ops ipv4_dst_ops = { .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, - .local_out = ip_local_out, + .local_out = __ip_local_out, .entry_size = sizeof(struct rtable), .entries = ATOMIC_INIT(0), }; -- cgit v1.2.3 From 7d227cd235c809c36c847d6a597956ad9e9d2bae Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Wed, 21 May 2008 16:42:20 -0700 Subject: tcp: TCP connection times out if ICMP frag needed is delayed We are seeing an issue with TCP in handling an ICMP frag needed message that is received after net.ipv4.tcp_retries1 retransmits. The default value of retries1 is 3. So if the path mtu changes and ICMP frag needed is lost for the first 3 retransmits or if it gets delayed until 3 retransmits are done, TCP doesn't update MSS correctly and continues to retransmit the orginal message until it timesout after tcp_retries2 retransmits. I am seeing this issue even with the latest 2.6.25.4 kernel. In tcp_retransmit_timer(), when retransmits counter exceeds tcp_retries1 value, the dst cache entry of the socket is reset. At this time, if we receive an ICMP frag needed message, the dst entry gets updated with the new MTU, but the TCP sockets dst_cache entry remains NULL. So the next time when we try to retransmit after the ICMP frag needed is received, tcp_retransmit_skb() gets called. Here the cur_mss value is calculated at the start of the routine with a NULL sk_dst_cache. Instead we should call tcp_current_mss after the rebuild_header that caches the dst entry with the updated mtu. Also the rebuild_header should be called before tcp_fragment so that skb is fragmented if the mss goes down. Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index debf23581606..e399bde7813a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1836,7 +1836,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - unsigned int cur_mss = tcp_current_mss(sk, 0); + unsigned int cur_mss; int err; /* Inconslusive MTU probe */ @@ -1858,6 +1858,11 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return -ENOMEM; } + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + + cur_mss = tcp_current_mss(sk, 0); + /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the * case, when window is shrunk to zero. In this case @@ -1884,9 +1889,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) (sysctl_tcp_retrans_collapse != 0)) tcp_retrans_try_collapse(sk, skb, cur_mss); - if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) - return -EHOSTUNREACH; /* Routing failure or similar. */ - /* Some Solaris stacks overoptimize and ignore the FIN on a * retransmit when old data is attached. So strip it off * since it is cheap to do so and saves bytes on the network. -- cgit v1.2.3 From 51f82a2b128131c411880aed2cb802b166fe3445 Mon Sep 17 00:00:00 2001 From: Denis Cheng Date: Wed, 21 May 2008 17:34:32 -0700 Subject: net/ipv4/arp.c: Use common hex_asc helpers Here the local hexbuf is a duplicate of global const char hex_asc from lib/hexdump.c, except the hex letters' cases: const char hexbuf[] = "0123456789ABCDEF"; const char hex_asc[] = "0123456789abcdef"; and here to print HW addresses, the hex cases are not significant. Thanks to Harvey Harrison to introduce the hex_asc_hi/hex_asc_lo helpers. Signed-off-by: Denis Cheng Signed-off-by: Harvey Harrison Signed-off-by: David S. Miller --- net/ipv4/arp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 418862f1bf22..9b539fa9fe18 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1288,7 +1288,6 @@ static void arp_format_neigh_entry(struct seq_file *seq, struct neighbour *n) { char hbuffer[HBUFFERLEN]; - const char hexbuf[] = "0123456789ABCDEF"; int k, j; char tbuf[16]; struct net_device *dev = n->dev; @@ -1302,8 +1301,8 @@ static void arp_format_neigh_entry(struct seq_file *seq, else { #endif for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { - hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15]; - hbuffer[k++] = hexbuf[n->ha[j] & 15]; + hbuffer[k++] = hex_asc_hi(n->ha[j]); + hbuffer[k++] = hex_asc_lo(n->ha[j]); hbuffer[k++] = ':'; } hbuffer[--k] = 0; -- cgit v1.2.3 From 071f92d05967a0c8422f1c8587ce0b4d90a8b447 Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Wed, 21 May 2008 17:47:54 -0700 Subject: net: The world is not perfect patch. Unless there will be any objection here, I suggest consider the following patch which simply removes the code for the -DI_WISH_WORLD_WERE_PERFECT in the three methods which use it. The compilation errors we get when using -DI_WISH_WORLD_WERE_PERFECT show that this code was not built and not used for really a long time. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 146 +----------------------------------------------------- net/ipv4/ipip.c | 130 +----------------------------------------------- 2 files changed, 2 insertions(+), 274 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2ada033406de..4342cba4ff82 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -313,9 +313,8 @@ static void ipgre_tunnel_uninit(struct net_device *dev) static void ipgre_err(struct sk_buff *skb, u32 info) { -#ifndef I_WISH_WORLD_WERE_PERFECT -/* It is not :-( All the routers (except for Linux) return only +/* All the routers (except for Linux) return only 8 bytes of packet payload. It means, that precise relaying of ICMP in the real Internet is absolutely infeasible. @@ -398,149 +397,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info) out: read_unlock(&ipgre_lock); return; -#else - struct iphdr *iph = (struct iphdr*)dp; - struct iphdr *eiph; - __be16 *p = (__be16*)(dp+(iph->ihl<<2)); - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; - int rel_type = 0; - int rel_code = 0; - __be32 rel_info = 0; - __u32 n = 0; - __be16 flags; - int grehlen = (iph->ihl<<2) + 4; - struct sk_buff *skb2; - struct flowi fl; - struct rtable *rt; - - if (p[1] != htons(ETH_P_IP)) - return; - - flags = p[0]; - if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { - if (flags&(GRE_VERSION|GRE_ROUTING)) - return; - if (flags&GRE_CSUM) - grehlen += 4; - if (flags&GRE_KEY) - grehlen += 4; - if (flags&GRE_SEQ) - grehlen += 4; - } - if (len < grehlen + sizeof(struct iphdr)) - return; - eiph = (struct iphdr*)(dp + grehlen); - - switch (type) { - default: - return; - case ICMP_PARAMETERPROB: - n = ntohl(icmp_hdr(skb)->un.gateway) >> 24; - if (n < (iph->ihl<<2)) - return; - - /* So... This guy found something strange INSIDE encapsulated - packet. Well, he is fool, but what can we do ? - */ - rel_type = ICMP_PARAMETERPROB; - n -= grehlen; - rel_info = htonl(n << 24); - break; - - case ICMP_DEST_UNREACH: - switch (code) { - case ICMP_SR_FAILED: - case ICMP_PORT_UNREACH: - /* Impossible event. */ - return; - case ICMP_FRAG_NEEDED: - /* And it is the only really necessary thing :-) */ - n = ntohs(icmp_hdr(skb)->un.frag.mtu); - if (n < grehlen+68) - return; - n -= grehlen; - /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */ - if (n > ntohs(eiph->tot_len)) - return; - rel_info = htonl(n); - break; - default: - /* All others are translated to HOST_UNREACH. - rfc2003 contains "deep thoughts" about NET_UNREACH, - I believe, it is just ether pollution. --ANK - */ - rel_type = ICMP_DEST_UNREACH; - rel_code = ICMP_HOST_UNREACH; - break; - } - break; - case ICMP_TIME_EXCEEDED: - if (code != ICMP_EXC_TTL) - return; - break; - } - - /* Prepare fake skb to feed it to icmp_send */ - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 == NULL) - return; - dst_release(skb2->dst); - skb2->dst = NULL; - skb_pull(skb2, skb->data - (u8*)eiph); - skb_reset_network_header(skb2); - - /* Try to guess incoming interface */ - memset(&fl, 0, sizeof(fl)); - fl.fl4_dst = eiph->saddr; - fl.fl4_tos = RT_TOS(eiph->tos); - fl.proto = IPPROTO_GRE; - if (ip_route_output_key(dev_net(skb->dev), &rt, &fl)) { - kfree_skb(skb2); - return; - } - skb2->dev = rt->u.dst.dev; - - /* route "incoming" packet */ - if (rt->rt_flags&RTCF_LOCAL) { - ip_rt_put(rt); - rt = NULL; - fl.fl4_dst = eiph->daddr; - fl.fl4_src = eiph->saddr; - fl.fl4_tos = eiph->tos; - if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) || - rt->u.dst.dev->type != ARPHRD_IPGRE) { - ip_rt_put(rt); - kfree_skb(skb2); - return; - } - } else { - ip_rt_put(rt); - if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || - skb2->dst->dev->type != ARPHRD_IPGRE) { - kfree_skb(skb2); - return; - } - } - - /* change mtu on this route */ - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - if (n > dst_mtu(skb2->dst)) { - kfree_skb(skb2); - return; - } - skb2->dst->ops->update_pmtu(skb2->dst, n); - } else if (type == ICMP_TIME_EXCEEDED) { - struct ip_tunnel *t = netdev_priv(skb2->dev); - if (t->parms.iph.ttl) { - rel_type = ICMP_DEST_UNREACH; - rel_code = ICMP_HOST_UNREACH; - } - } - - icmp_send(skb2, rel_type, rel_code, rel_info); - kfree_skb(skb2); -#endif } static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 149111f08e8d..af5cb53da5cc 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -278,9 +278,8 @@ static void ipip_tunnel_uninit(struct net_device *dev) static int ipip_err(struct sk_buff *skb, u32 info) { -#ifndef I_WISH_WORLD_WERE_PERFECT -/* It is not :-( All the routers (except for Linux) return only +/* All the routers (except for Linux) return only 8 bytes of packet payload. It means, that precise relaying of ICMP in the real Internet is absolutely infeasible. */ @@ -337,133 +336,6 @@ static int ipip_err(struct sk_buff *skb, u32 info) out: read_unlock(&ipip_lock); return err; -#else - struct iphdr *iph = (struct iphdr*)dp; - int hlen = iph->ihl<<2; - struct iphdr *eiph; - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; - int rel_type = 0; - int rel_code = 0; - __be32 rel_info = 0; - __u32 n = 0; - struct sk_buff *skb2; - struct flowi fl; - struct rtable *rt; - - if (len < hlen + sizeof(struct iphdr)) - return 0; - eiph = (struct iphdr*)(dp + hlen); - - switch (type) { - default: - return 0; - case ICMP_PARAMETERPROB: - n = ntohl(icmp_hdr(skb)->un.gateway) >> 24; - if (n < hlen) - return 0; - - /* So... This guy found something strange INSIDE encapsulated - packet. Well, he is fool, but what can we do ? - */ - rel_type = ICMP_PARAMETERPROB; - rel_info = htonl((n - hlen) << 24); - break; - - case ICMP_DEST_UNREACH: - switch (code) { - case ICMP_SR_FAILED: - case ICMP_PORT_UNREACH: - /* Impossible event. */ - return 0; - case ICMP_FRAG_NEEDED: - /* And it is the only really necessary thing :-) */ - n = ntohs(icmp_hdr(skb)->un.frag.mtu); - if (n < hlen+68) - return 0; - n -= hlen; - /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */ - if (n > ntohs(eiph->tot_len)) - return 0; - rel_info = htonl(n); - break; - default: - /* All others are translated to HOST_UNREACH. - rfc2003 contains "deep thoughts" about NET_UNREACH, - I believe, it is just ether pollution. --ANK - */ - rel_type = ICMP_DEST_UNREACH; - rel_code = ICMP_HOST_UNREACH; - break; - } - break; - case ICMP_TIME_EXCEEDED: - if (code != ICMP_EXC_TTL) - return 0; - break; - } - - /* Prepare fake skb to feed it to icmp_send */ - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 == NULL) - return 0; - dst_release(skb2->dst); - skb2->dst = NULL; - skb_pull(skb2, skb->data - (u8*)eiph); - skb_reset_network_header(skb2); - - /* Try to guess incoming interface */ - memset(&fl, 0, sizeof(fl)); - fl.fl4_daddr = eiph->saddr; - fl.fl4_tos = RT_TOS(eiph->tos); - fl.proto = IPPROTO_IPIP; - if (ip_route_output_key(dev_net(skb->dev), &rt, &key)) { - kfree_skb(skb2); - return 0; - } - skb2->dev = rt->u.dst.dev; - - /* route "incoming" packet */ - if (rt->rt_flags&RTCF_LOCAL) { - ip_rt_put(rt); - rt = NULL; - fl.fl4_daddr = eiph->daddr; - fl.fl4_src = eiph->saddr; - fl.fl4_tos = eiph->tos; - if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) || - rt->u.dst.dev->type != ARPHRD_TUNNEL) { - ip_rt_put(rt); - kfree_skb(skb2); - return 0; - } - } else { - ip_rt_put(rt); - if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || - skb2->dst->dev->type != ARPHRD_TUNNEL) { - kfree_skb(skb2); - return 0; - } - } - - /* change mtu on this route */ - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - if (n > dst_mtu(skb2->dst)) { - kfree_skb(skb2); - return 0; - } - skb2->dst->ops->update_pmtu(skb2->dst, n); - } else if (type == ICMP_TIME_EXCEEDED) { - struct ip_tunnel *t = netdev_priv(skb2->dev); - if (t->parms.iph.ttl) { - rel_type = ICMP_DEST_UNREACH; - rel_code = ICMP_HOST_UNREACH; - } - } - - icmp_send(skb2, rel_type, rel_code, rel_info); - kfree_skb(skb2); - return 0; -#endif } static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, -- cgit v1.2.3 From 51b77cae0d5aa8e1546fca855dcfe48ddfadfa9c Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:36:01 -0700 Subject: route: Mark unused route cache flags as such. Also removes an obsolete check for the unused flag RTCF_MASQ. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index df41026b60db..96be336064fb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1792,7 +1792,7 @@ static int __mkroute_input(struct sk_buff *skb, if (err) flags |= RTCF_DIRECTSRC; - if (out_dev == in_dev && err && !(flags & RTCF_MASQ) && + if (out_dev == in_dev && err && (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) flags |= RTCF_DOREDIRECT; -- cgit v1.2.3 From 1f9d11c7c99da706e33646c3a9080dd5a8ef9a0b Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:36:27 -0700 Subject: route: Mark unused routing attributes as such Also removes an unused policy entry for an attribute which is only used in kernel->user direction. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 0f1557a4ac7a..0b2ac6a3d903 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -506,7 +506,6 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, - [RTA_PROTOINFO] = { .type = NLA_U32 }, [RTA_FLOW] = { .type = NLA_U32 }, }; -- cgit v1.2.3 From ab32cd793dca21eec846a8204390d9594ed994d5 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:37:33 -0700 Subject: route: Remove unused ifa_anycast field The field was supposed to allow the creation of an anycast route by assigning an anycast address to an address prefix. It was never implemented so this field is unused and serves no purpose. Remove it. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 6848e4760f34..79a7ef6209ff 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -90,7 +90,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, - [IFA_ANYCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, }; @@ -536,9 +535,6 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) if (tb[IFA_BROADCAST]) ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]); - if (tb[IFA_ANYCAST]) - ifa->ifa_anycast = nla_get_be32(tb[IFA_ANYCAST]); - if (tb[IFA_LABEL]) nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else @@ -745,7 +741,6 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) break; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; - ifa->ifa_anycast = 0; ifa->ifa_scope = 0; } @@ -1113,7 +1108,6 @@ static inline size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ - + nla_total_size(4) /* IFA_ANYCAST */ + nla_total_size(IFNAMSIZ); /* IFA_LABEL */ } @@ -1143,9 +1137,6 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, if (ifa->ifa_broadcast) NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast); - if (ifa->ifa_anycast) - NLA_PUT_BE32(skb, IFA_ANYCAST, ifa->ifa_anycast); - if (ifa->ifa_label[0]) NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label); -- cgit v1.2.3 From 8aca6cb1179ed9bef9351028c8d8af852903eae2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 4 Jun 2008 11:34:22 -0700 Subject: tcp: Fix inconsistency source (CA_Open only when !tcp_left_out(tp)) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is possible that this skip path causes TCP to end up into an invalid state where ca_state was left to CA_Open while some segments already came into sacked_out. If next valid ACK doesn't contain new SACK information TCP fails to enter into tcp_fastretrans_alert(). Thus at least high_seq is set incorrectly to a too high seqno because some new data segments could be sent in between (and also, limited transmit is not being correctly invoked there). Reordering in both directions can easily cause this situation to occur. I guess we would want to use tcp_moderate_cwnd(tp) there as well as it may be possible to use this to trigger oversized burst to network by sending an old ACK with huge amount of SACK info, but I'm a bit unsure about its effects (mainly to FlightSize), so to be on the safe side I just currently fixed it minimally to keep TCP's state consistent (obviously, such nasty ACKs have been possible this far). Though it seems that FlightSize is already underestimated by some amount, so probably on the long term we might want to trigger recovery there too, if appropriate, to make FlightSize calculation to resemble reality at the time when the losses where discovered (but such change scares me too much now and requires some more thinking anyway how to do that as it likely involves some code shuffling). This bug was found by Brian Vowell while running my TCP debug patch to find cause of another TCP issue (fackets_out miscount). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b54d9d37b636..54a0b7412782 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2483,6 +2483,20 @@ static inline void tcp_complete_cwr(struct sock *sk) tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } +static void tcp_try_keep_open(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int state = TCP_CA_Open; + + if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) + state = TCP_CA_Disorder; + + if (inet_csk(sk)->icsk_ca_state != state) { + tcp_set_ca_state(sk, state); + tp->high_seq = tp->snd_nxt; + } +} + static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2496,15 +2510,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) tcp_enter_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { - int state = TCP_CA_Open; - - if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) - state = TCP_CA_Disorder; - - if (inet_csk(sk)->icsk_ca_state != state) { - tcp_set_ca_state(sk, state); - tp->high_seq = tp->snd_nxt; - } + tcp_try_keep_open(sk); tcp_moderate_cwnd(tp); } else { tcp_cwnd_down(sk, flag); @@ -3310,8 +3316,11 @@ no_queue: return 1; old_ack: - if (TCP_SKB_CB(skb)->sacked) + if (TCP_SKB_CB(skb)->sacked) { tcp_sacktag_write_queue(sk, skb, prior_snd_una); + if (icsk->icsk_ca_state == TCP_CA_Open) + tcp_try_keep_open(sk); + } uninteresting_ack: SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt); -- cgit v1.2.3 From baa2bfb8aef24bb7fe1875b256918724b3884662 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Fri, 30 May 2008 11:35:03 +0900 Subject: [IPV4] TUNNEL4: Fix incoming packet length check for inter-protocol tunnel. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv4/tunnel4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index d3b709a6f264..cb1f0e83830b 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -97,7 +97,7 @@ static int tunnel64_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; - if (!pskb_may_pull(skb, sizeof(struct iphdr))) + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for (handler = tunnel64_handlers; handler; handler = handler->next) -- cgit v1.2.3 From 36d926b94a9908937593e5669162305a071b9cc3 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 4 Jun 2008 15:49:07 +0400 Subject: [IPV6]: inet_sk(sk)->cork.opt leak IPv6 UDP sockets wth IPv4 mapped address use udp_sendmsg to send the data actually. In this case ip_flush_pending_frames should be called instead of ip6_flush_pending_frames. Signed-off-by: Denis V. Lunev Signed-off-by: YOSHIFUJI Hideaki --- net/ipv4/udp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index db1cb7c96d63..56fcda3694ba 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -420,7 +420,7 @@ void udp_err(struct sk_buff *skb, u32 info) /* * Throw away all pending data and cancel the corking. Socket is locked. */ -static void udp_flush_pending_frames(struct sock *sk) +void udp_flush_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); @@ -430,6 +430,7 @@ static void udp_flush_pending_frames(struct sock *sk) ip_flush_pending_frames(sk); } } +EXPORT_SYMBOL(udp_flush_pending_frames); /** * udp4_hwcsum_outgoing - handle outgoing HW checksumming -- cgit v1.2.3 From a6604471db5e7a33474a7f16c64d6b118fae3e74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 4 Jun 2008 12:07:44 -0700 Subject: tcp: fix skb vs fack_count out-of-sync condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This bug is able to corrupt fackets_out in very rare cases. In order for this to cause corruption: 1) DSACK in the middle of previous SACK block must be generated. 2) In order to take that particular branch, part or all of the DSACKed segment must already be SACKed so that we have that in cache in the first place. 3) The new info must be top enough so that fackets_out will be updated on this iteration. ...then fack_count is updated while skb wasn't, then we walk again that particular segment thus updating fack_count twice for a single skb and finally that value is assigned to fackets_out by tcp_sacktag_one. It is safe to call tcp_sacktag_one just once for a segment (at DSACK), no need to call again for plain SACK. Potential problem of the miscount are limited to premature entry to recovery and to inflated reordering metric (which could even cancel each other out in the most the luckiest scenarios :-)). Both are quite insignificant in worst case too and there exists also code to reset them (fackets_out once sacked_out becomes zero and reordering metric on RTO). This has been reported by a number of people, because it occurred quite rarely, it has been very evasive. Andy Furniss was able to get it to occur couple of times so that a bit more info was collected about the problem using a debug patch, though it still required lot of checking around. Thanks also to others who have tried to help here. This is listed as Bugzilla #10346. The bug was introduced by me in commit 68f8353b48 ([TCP]: Rewrite SACK block processing & sack_recv_cache use), I probably thought back then that there's need to scan that entry twice or didn't dare to make it go through it just once there. Going through twice would have required restoring fack_count after the walk but as noted above, I chose to drop the additional walk step altogether here. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 54a0b7412782..eba873e9b560 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1392,9 +1392,9 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, if (before(next_dup->start_seq, skip_to_seq)) { skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count); - tcp_sacktag_walk(skb, sk, NULL, - next_dup->start_seq, next_dup->end_seq, - 1, fack_count, reord, flag); + skb = tcp_sacktag_walk(skb, sk, NULL, + next_dup->start_seq, next_dup->end_seq, + 1, fack_count, reord, flag); } return skb; -- cgit v1.2.3 From 22dd485022f3d0b162ceb5e67d85de7c3806aa20 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 4 Jun 2008 15:16:12 -0700 Subject: raw: Raw socket leak. The program below just leaks the raw kernel socket int main() { int fd = socket(PF_INET, SOCK_RAW, IPPROTO_UDP); struct sockaddr_in addr; memset(&addr, 0, sizeof(addr)); inet_aton("127.0.0.1", &addr.sin_addr); addr.sin_family = AF_INET; addr.sin_port = htons(2048); sendto(fd, "a", 1, MSG_MORE, &addr, sizeof(addr)); return 0; } Corked packet is allocated via sock_wmalloc which holds the owner socket, so one should uncork it and flush all pending data on close. Do this in the same way as in UDP. Signed-off-by: Denis V. Lunev Acked-by: Alexey Kuznetsov Signed-off-by: David S. Miller --- net/ipv4/raw.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index fead049daf43..e7e091d365ff 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -608,6 +608,14 @@ static void raw_close(struct sock *sk, long timeout) sk_common_release(sk); } +static int raw_destroy(struct sock *sk) +{ + lock_sock(sk); + ip_flush_pending_frames(sk); + release_sock(sk); + return 0; +} + /* This gets rid of all the nasties in af_inet. -DaveM */ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -820,6 +828,7 @@ struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, .close = raw_close, + .destroy = raw_destroy, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = raw_ioctl, -- cgit v1.2.3 From 26af65cbeb2467a486ae4fc7242c94e470c67c50 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Wed, 4 Jun 2008 15:19:35 -0700 Subject: tcp: Increment OUTRSTS in tcp_send_active_reset() TCP "resets sent" counter is not incremented when a TCP Reset is sent via tcp_send_active_reset(). Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e399bde7813a..ad993ecb4810 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2131,6 +2131,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + + TCP_INC_STATS(TCP_MIB_OUTRSTS); } /* WARNING: This routine must only be called when we have already sent -- cgit v1.2.3 From 293ad60401da621b8b329abbe8c388edb25f658a Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 4 Jun 2008 15:45:58 -0700 Subject: tcp: Fix for race due to temporary drop of the socket lock in skb_splice_bits. skb_splice_bits temporary drops the socket lock while iterating over the socket queue in order to break a reverse locking condition which happens with sendfile. This, however, opens a window of opportunity for tcp_collapse() to aggregate skbs and thus potentially free the current skb used in skb_splice_bits and tcp_read_sock. This patch fixes the problem by (re-)getting the same "logical skb" after the lock has been temporary dropped. Based on idea and initial patch from Evgeniy Polyakov. Signed-off-by: Octavian Purdila Acked-by: Evgeniy Polyakov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f88653138621..ab66683b8043 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1227,7 +1227,14 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, copied += used; offset += used; } - if (offset != skb->len) + /* + * If recv_actor drops the lock (e.g. TCP splice + * receive) the skb pointer might be invalid when + * getting here: tcp_collapse might have deleted it + * while aggregating skbs from the socket queue. + */ + skb = tcp_recv_skb(sk, seq-1, &offset); + if (!skb || (offset+1 != skb->len)) break; } if (tcp_hdr(skb)->fin) { -- cgit v1.2.3 From ddb2c43594f22843e9f3153da151deaba1a834c5 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Wed, 4 Jun 2008 09:16:33 -0700 Subject: asn1: additional sanity checking during BER decoding - Don't trust a length which is greater than the working buffer. An invalid length could cause overflow when calculating buffer size for decoding oid. - An oid length of zero is invalid and allows for an off-by-one error when decoding oid because the first subid actually encodes first 2 subids. - A primitive encoding may not have an indefinite length. Thanks to Wei Wang from McAfee for report. Cc: Steven French Cc: stable@kernel.org Acked-by: Patrick McHardy Signed-off-by: Chris Wright Signed-off-by: Linus Torvalds --- net/ipv4/netfilter/nf_nat_snmp_basic.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 5daefad3d193..7750c97fde7b 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -232,6 +232,11 @@ static unsigned char asn1_length_decode(struct asn1_ctx *ctx, } } } + + /* don't trust len bigger than ctx buffer */ + if (*len > ctx->end - ctx->pointer) + return 0; + return 1; } @@ -250,6 +255,10 @@ static unsigned char asn1_header_decode(struct asn1_ctx *ctx, if (!asn1_length_decode(ctx, &def, &len)) return 0; + /* primitive shall be definite, indefinite shall be constructed */ + if (*con == ASN1_PRI && !def) + return 0; + if (def) *eoc = ctx->pointer + len; else @@ -434,6 +443,11 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, unsigned long *optr; size = eoc - ctx->pointer + 1; + + /* first subid actually encodes first two subids */ + if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) + return 0; + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); if (*oid == NULL) { if (net_ratelimit()) -- cgit v1.2.3 From ce4a7d0d48bbaed78ccbb0bafb9229651a40303a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Jun 2008 12:39:35 -0700 Subject: inet{6}_request_sock: Init ->opt and ->pktopts in the constructor Wei Yongjun noticed that we may call reqsk_free on request sock objects where the opt fields may not be initialized, fix it by introducing inet_reqsk_alloc where we initialize ->opt to NULL and set ->pktopts to NULL in inet6_reqsk_alloc. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 3 +-- net/ipv4/tcp_ipv4.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 73ba98921d64..d182a2a26291 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -285,7 +285,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, cookie_check_timestamp(&tcp_opt); ret = NULL; - req = reqsk_alloc(&tcp_request_sock_ops); /* for safety */ + req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ if (!req) goto out; @@ -301,7 +301,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->rmt_port = th->source; ireq->loc_addr = ip_hdr(skb)->daddr; ireq->rmt_addr = ip_hdr(skb)->saddr; - ireq->opt = NULL; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->rcv_wscale = tcp_opt.rcv_wscale; ireq->sack_ok = tcp_opt.sack_ok; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cd601a866c2f..4f8485c67d1a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1285,7 +1285,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = reqsk_alloc(&tcp_request_sock_ops); + req = inet_reqsk_alloc(&tcp_request_sock_ops); if (!req) goto drop; -- cgit v1.2.3 From 709772e6e06564ed94ba740de70185ac3d792773 Mon Sep 17 00:00:00 2001 From: Krzysztof Piotr Oledzki Date: Tue, 10 Jun 2008 15:44:49 -0700 Subject: net: Fix routing tables with id > 255 for legacy software Most legacy software do not like tables > 255 as rtm_table is u8 so tb_id is sent &0xff and it is possible to mismatch for example table 510 with table 254 (main). This patch introduces RT_TABLE_COMPAT=252 so the code uses it if tb_id > 255. It makes such old applications happy, new ones are still able to use RTA_TABLE to get a proper table id. Signed-off-by: Krzysztof Piotr Oledzki Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3b83c34019fc..0d4d72827e4b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -960,7 +960,10 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, rtm->rtm_dst_len = dst_len; rtm->rtm_src_len = 0; rtm->rtm_tos = tos; - rtm->rtm_table = tb_id; + if (tb_id < 256) + rtm->rtm_table = tb_id; + else + rtm->rtm_table = RT_TABLE_COMPAT; NLA_PUT_U32(skb, RTA_TABLE, tb_id); rtm->rtm_type = type; rtm->rtm_flags = fi->fib_flags; -- cgit v1.2.3 From ec0a196626bd12e0ba108d7daa6d95a4fb25c2c5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 12 Jun 2008 16:31:35 -0700 Subject: tcp: Revert 'process defer accept as established' changes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts two changesets, ec3c0982a2dd1e671bad8e9d26c28dcba0039d87 ("[TCP]: TCP_DEFER_ACCEPT updates - process as established") and the follow-on bug fix 9ae27e0adbf471c7a6b80102e38e1d5a346b3b38 ("tcp: Fix slab corruption with ipv6 and tcp6fuzz"). This change causes several problems, first reported by Ingo Molnar as a distcc-over-loopback regression where connections were getting stuck. Ilpo Järvinen first spotted the locking problems. The new function added by this code, tcp_defer_accept_check(), only has the child socket locked, yet it is modifying state of the parent listening socket. Fixing that is non-trivial at best, because we can't simply just grab the parent listening socket lock at this point, because it would create an ABBA deadlock. The normal ordering is parent listening socket --> child socket, but this code path would require the reverse lock ordering. Next is a problem noticed by Vitaliy Gusev, he noted: ---------------------------------------- >--- a/net/ipv4/tcp_timer.c >+++ b/net/ipv4/tcp_timer.c >@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data) > goto death; > } > >+ if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) { >+ tcp_send_active_reset(sk, GFP_ATOMIC); >+ goto death; Here socket sk is not attached to listening socket's request queue. tcp_done() will not call inet_csk_destroy_sock() (and tcp_v4_destroy_sock() which should release this sk) as socket is not DEAD. Therefore socket sk will be lost for freeing. ---------------------------------------- Finally, Alexey Kuznetsov argues that there might not even be any real value or advantage to these new semantics even if we fix all of the bugs: ---------------------------------------- Hiding from accept() sockets with only out-of-order data only is the only thing which is impossible with old approach. Is this really so valuable? My opinion: no, this is nothing but a new loophole to consume memory without control. ---------------------------------------- So revert this thing for now. Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 11 +++++++--- net/ipv4/tcp.c | 18 ++++++++++------- net/ipv4/tcp_input.c | 45 ----------------------------------------- net/ipv4/tcp_ipv4.c | 8 -------- net/ipv4/tcp_minisocks.c | 32 +++++++++++------------------ net/ipv4/tcp_timer.c | 5 ----- 6 files changed, 31 insertions(+), 88 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 828ea211ff21..045e799d3e1d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -419,7 +419,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, struct inet_connection_sock *icsk = inet_csk(parent); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct listen_sock *lopt = queue->listen_opt; - int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + int thresh = max_retries; unsigned long now = jiffies; struct request_sock **reqp, *req; int i, budget; @@ -455,6 +456,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, } } + if (queue->rskq_defer_accept) + max_retries = queue->rskq_defer_accept; + budget = 2 * (lopt->nr_table_entries / (timeout / interval)); i = lopt->clock_hand; @@ -462,8 +466,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, reqp=&lopt->syn_table[i]; while ((req = *reqp) != NULL) { if (time_after_eq(now, req->expires)) { - if (req->retrans < thresh && - !req->rsk_ops->rtx_syn_ack(parent, req)) { + if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) && + (inet_rsk(req)->acked || + !req->rsk_ops->rtx_syn_ack(parent, req))) { unsigned long timeo; if (req->retrans++ == 0) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ab66683b8043..fc54a48fde1e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2112,12 +2112,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_DEFER_ACCEPT: - if (val < 0) { - err = -EINVAL; - } else { - if (val > MAX_TCP_ACCEPT_DEFERRED) - val = MAX_TCP_ACCEPT_DEFERRED; - icsk->icsk_accept_queue.rskq_defer_accept = val; + icsk->icsk_accept_queue.rskq_defer_accept = 0; + if (val > 0) { + /* Translate value in seconds to number of + * retransmits */ + while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && + val > ((TCP_TIMEOUT_INIT / HZ) << + icsk->icsk_accept_queue.rskq_defer_accept)) + icsk->icsk_accept_queue.rskq_defer_accept++; + icsk->icsk_accept_queue.rskq_defer_accept++; } break; @@ -2299,7 +2302,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = (val ? : sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: - val = icsk->icsk_accept_queue.rskq_defer_accept; + val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : + ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eba873e9b560..cad73b7dfef0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4541,49 +4541,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) } } -static int tcp_defer_accept_check(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (tp->defer_tcp_accept.request) { - int queued_data = tp->rcv_nxt - tp->copied_seq; - int hasfin = !skb_queue_empty(&sk->sk_receive_queue) ? - tcp_hdr((struct sk_buff *) - sk->sk_receive_queue.prev)->fin : 0; - - if (queued_data && hasfin) - queued_data--; - - if (queued_data && - tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) { - if (sock_flag(sk, SOCK_KEEPOPEN)) { - inet_csk_reset_keepalive_timer(sk, - keepalive_time_when(tp)); - } else { - inet_csk_delete_keepalive_timer(sk); - } - - inet_csk_reqsk_queue_add( - tp->defer_tcp_accept.listen_sk, - tp->defer_tcp_accept.request, - sk); - - tp->defer_tcp_accept.listen_sk->sk_data_ready( - tp->defer_tcp_accept.listen_sk, 0); - - sock_put(tp->defer_tcp_accept.listen_sk); - sock_put(sk); - tp->defer_tcp_accept.listen_sk = NULL; - tp->defer_tcp_accept.request = NULL; - } else if (hasfin || - tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) { - tcp_reset(sk); - return -1; - } - } - return 0; -} - static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) { struct tcp_sock *tp = tcp_sk(sk); @@ -4944,8 +4901,6 @@ step5: tcp_data_snd_check(sk); tcp_ack_snd_check(sk); - - tcp_defer_accept_check(sk); return 0; csum_error: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4f8485c67d1a..97a230026e13 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1918,14 +1918,6 @@ int tcp_v4_destroy_sock(struct sock *sk) sk->sk_sndmsg_page = NULL; } - if (tp->defer_tcp_accept.request) { - reqsk_free(tp->defer_tcp_accept.request); - sock_put(tp->defer_tcp_accept.listen_sk); - sock_put(sk); - tp->defer_tcp_accept.listen_sk = NULL; - tp->defer_tcp_accept.request = NULL; - } - atomic_dec(&tcp_sockets_allocated); return 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 019c8c16e5cc..8245247a6ceb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -571,8 +571,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, does sequence test, SYN is truncated, and thus we consider it a bare ACK. - Both ends (listening sockets) accept the new incoming - connection and try to talk to each other. 8-) + If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this + bare ACK. Otherwise, we create an established connection. Both + ends (listening sockets) accept the new incoming connection and try + to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. @@ -640,6 +642,13 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; + /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ + if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + inet_rsk(req)->acked = 1; + return NULL; + } + /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO @@ -678,24 +687,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); - if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && - TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { - - /* the accept queue handling is done is est recv slow - * path so lets make sure to start there - */ - tcp_sk(child)->pred_flags = 0; - sock_hold(sk); - sock_hold(child); - tcp_sk(child)->defer_tcp_accept.listen_sk = sk; - tcp_sk(child)->defer_tcp_accept.request = req; - - inet_csk_reset_keepalive_timer(child, - inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ); - } else { - inet_csk_reqsk_queue_add(sk, req, child); - } - + inet_csk_reqsk_queue_add(sk, req, child); return child; listen_overflow: diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4de68cf5f2aa..63ed9d6830e7 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -489,11 +489,6 @@ static void tcp_keepalive_timer (unsigned long data) goto death; } - if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) { - tcp_send_active_reset(sk, GFP_ATOMIC); - goto death; - } - if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) goto out; -- cgit v1.2.3