From 98322f22eca889478045cf896b572250d03dc45f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 26 Jan 2009 21:35:35 -0800
Subject: udp: optimize bind(0) if many ports are in use

commit 9088c5609584684149f3fb5b065aa7f18dcb03ff
(udp: Improve port randomization) introduced a regression for UDP bind() syscall
to null port (getting a random port) in case lot of ports are already in use.

This is because we do about 28000 scans of very long chains (220 sockets per chain),
with many spin_lock_bh()/spin_unlock_bh() calls.

Fix this using a bitmap (64 bytes for current value of UDP_HTABLE_SIZE)
so that we scan chains at most once.

Instead of 250 ms per bind() call, we get after patch a time of 2.9 ms

Based on a report from Vitaly Mayatskikh

Reported-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Tested-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 55 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cf5ab0581eba..b7faffe5c029 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -120,8 +120,11 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min);
 atomic_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
+#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE)
+
 static int udp_lib_lport_inuse(struct net *net, __u16 num,
 			       const struct udp_hslot *hslot,
+			       unsigned long *bitmap,
 			       struct sock *sk,
 			       int (*saddr_comp)(const struct sock *sk1,
 						 const struct sock *sk2))
@@ -132,12 +135,17 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 	sk_nulls_for_each(sk2, node, &hslot->head)
 		if (net_eq(sock_net(sk2), net)			&&
 		    sk2 != sk					&&
-		    sk2->sk_hash == num				&&
+		    (bitmap || sk2->sk_hash == num)		&&
 		    (!sk2->sk_reuse || !sk->sk_reuse)		&&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
 			|| sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
-		    (*saddr_comp)(sk, sk2))
-			return 1;
+		    (*saddr_comp)(sk, sk2)) {
+			if (bitmap)
+				__set_bit(sk2->sk_hash / UDP_HTABLE_SIZE,
+					  bitmap);
+			else
+				return 1;
+		}
 	return 0;
 }
 
@@ -160,32 +168,47 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 	if (!snum) {
 		int low, high, remaining;
 		unsigned rand;
-		unsigned short first;
+		unsigned short first, last;
+		DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
 
 		inet_get_local_port_range(&low, &high);
 		remaining = (high - low) + 1;
 
 		rand = net_random();
-		snum = first = rand % remaining + low;
-		rand |= 1;
-		for (;;) {
-			hslot = &udptable->hash[udp_hashfn(net, snum)];
+		first = (((u64)rand * remaining) >> 32) + low;
+		/*
+		 * force rand to be an odd multiple of UDP_HTABLE_SIZE
+		 */
+		rand = (rand | 1) * UDP_HTABLE_SIZE;
+		for (last = first + UDP_HTABLE_SIZE; first != last; first++) {
+			hslot = &udptable->hash[udp_hashfn(net, first)];
+			bitmap_zero(bitmap, PORTS_PER_CHAIN);
 			spin_lock_bh(&hslot->lock);
-			if (!udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
-				break;
-			spin_unlock_bh(&hslot->lock);
+			udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
+					    saddr_comp);
+
+			snum = first;
+			/*
+			 * Iterate on all possible values of snum for this hash.
+			 * Using steps of an odd multiple of UDP_HTABLE_SIZE
+			 * give us randomization and full range coverage.
+			 */
 			do {
-				snum = snum + rand;
-			} while (snum < low || snum > high);
-			if (snum == first)
-				goto fail;
+				if (low <= snum && snum <= high &&
+				    !test_bit(snum / UDP_HTABLE_SIZE, bitmap))
+					goto found;
+				snum += rand;
+			} while (snum != first);
+			spin_unlock_bh(&hslot->lock);
 		}
+		goto fail;
 	} else {
 		hslot = &udptable->hash[udp_hashfn(net, snum)];
 		spin_lock_bh(&hslot->lock);
-		if (udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
+		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp))
 			goto fail_unlock;
 	}
+found:
 	inet_sk(sk)->num = snum;
 	sk->sk_hash = snum;
 	if (sk_unhashed(sk)) {
-- 
cgit v1.2.3


From 9fa5fdf291c9b58b1cb8b4bb2a0ee57efa21d635 Mon Sep 17 00:00:00 2001
From: Dimitris Michailidis <dm@chelsio.com>
Date: Mon, 26 Jan 2009 22:15:31 -0800
Subject: tcp: Fix length tcp_splice_data_recv passes to skb_splice_bits.

tcp_splice_data_recv has two lengths to consider: the len parameter it
gets from tcp_read_sock, which specifies the amount of data in the skb,
and rd_desc->count, which is the amount of data the splice caller still
wants.  Currently it passes just the latter to skb_splice_bits, which then
splices min(rd_desc->count, skb->len - offset) bytes.

Most of the time this is fine, except when the skb contains urgent data.
In that case len goes only up to the urgent byte and is less than
skb->len - offset.  By ignoring len tcp_splice_data_recv may a) splice
data tcp_read_sock told it not to, b) return to tcp_read_sock a value > len.

Now, tcp_read_sock doesn't handle used > len and leaves the socket in a
bad state (both sk_receive_queue and copied_seq are bad at that point)
resulting in duplicated data and corruption.

Fix by passing min(rd_desc->count, len) to skb_splice_bits.

Signed-off-by: Dimitris Michailidis <dm@chelsio.com>
Acked-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0cd71b84e483..76b148bcb0dc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -524,7 +524,8 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
 	struct tcp_splice_state *tss = rd_desc->arg.data;
 	int ret;
 
-	ret = skb_splice_bits(skb, offset, tss->pipe, rd_desc->count, tss->flags);
+	ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
+			      tss->flags);
 	if (ret > 0)
 		rd_desc->count -= ret;
 	return ret;
-- 
cgit v1.2.3


From 9d8dba6c979fa99c96938c869611b9a23b73efa9 Mon Sep 17 00:00:00 2001
From: Benjamin Zores <benjamin.zores@alcatel-lucent.fr>
Date: Thu, 29 Jan 2009 16:19:13 -0800
Subject: ipv4: fix infinite retry loop in IP-Config

Signed-off-by: Benjamin Zores <benjamin.zores@alcatel-lucent.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 42a0f3dd3fd6..d722013c1cae 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1268,6 +1268,9 @@ __be32 __init root_nfs_parse_addr(char *name)
 static int __init ip_auto_config(void)
 {
 	__be32 addr;
+#ifdef IPCONFIG_DYNAMIC
+	int retries = CONF_OPEN_RETRIES;
+#endif
 
 #ifdef CONFIG_PROC_FS
 	proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
@@ -1304,9 +1307,6 @@ static int __init ip_auto_config(void)
 #endif
 	    ic_first_dev->next) {
 #ifdef IPCONFIG_DYNAMIC
-
-		int retries = CONF_OPEN_RETRIES;
-
 		if (ic_dynamic() < 0) {
 			ic_close_devs();
 
-- 
cgit v1.2.3


From e408b8dcb5ce42243a902205005208e590f28454 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 2 Feb 2009 13:41:57 -0800
Subject: udp: increments sk_drops in __udp_queue_rcv_skb()

Commit 93821778def10ec1e69aa3ac10adee975dad4ff3 (udp: Fix rcv socket
locking) accidentally removed sk_drops increments for UDP IPV4
sockets.

This field can be used to detect incorrect sizing of socket receive
buffers.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b7faffe5c029..1ab180bad72a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1015,9 +1015,11 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 	if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
 		/* Note that an ENOMEM error is charged twice */
-		if (rc == -ENOMEM)
+		if (rc == -ENOMEM) {
 			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
 					 is_udplite);
+			atomic_inc(&sk->sk_drops);
+		}
 		goto drop;
 	}
 
-- 
cgit v1.2.3


From 7b5e56f9d635643ad54f2f42e69ad16b80a2cff1 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <hawk@comx.dk>
Date: Thu, 5 Feb 2009 15:05:45 -0800
Subject: udp: Fix UDP short packet false positive

The UDP header pointer assignment must happen after calling
pskb_may_pull().  As pskb_may_pull() can potentially alter the SKB
buffer.

This was exposted by running multicast traffic through the NIU driver,
as it won't prepull the protocol headers into the linear area on
receive.

Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1ab180bad72a..cc3a0a06c004 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1231,7 +1231,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		   int proto)
 {
 	struct sock *sk;
-	struct udphdr *uh = udp_hdr(skb);
+	struct udphdr *uh;
 	unsigned short ulen;
 	struct rtable *rt = (struct rtable*)skb->dst;
 	__be32 saddr = ip_hdr(skb)->saddr;
@@ -1244,6 +1244,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
 		goto drop;		/* No space for header. */
 
+	uh   = udp_hdr(skb);
 	ulen = ntohs(uh->len);
 	if (ulen > skb->len)
 		goto short_packet;
-- 
cgit v1.2.3


From a23f4bbd8d27ac8ddc5d71ace1f91bb503f0469a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 5 Feb 2009 15:38:31 -0800
Subject: Revert "tcp: Always set urgent pointer if it's beyond snd_nxt"

This reverts commit 64ff3b938ec6782e6585a83d5459b98b0c3f6eb8.

Jeff Chua reports that it breaks rlogin for him.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 557fe16cbfb0..dda42f0bd7a3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -663,14 +663,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	th->urg_ptr		= 0;
 
 	/* The urg_mode check is necessary during a below snd_una win probe */
-	if (unlikely(tcp_urg_mode(tp))) {
-		if (between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF)) {
-			th->urg_ptr = htons(tp->snd_up - tcb->seq);
-			th->urg = 1;
-		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
-			th->urg_ptr = 0xFFFF;
-			th->urg = 1;
-		}
+	if (unlikely(tcp_urg_mode(tp) &&
+		     between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
+		th->urg_ptr		= htons(tp->snd_up - tcb->seq);
+		th->urg			= 1;
 	}
 
 	tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
-- 
cgit v1.2.3


From 2783ef23128ad0a4b34e4121c1f7ff664785712f Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <hawk@comx.dk>
Date: Fri, 6 Feb 2009 01:59:12 -0800
Subject: udp: Fix potential wrong ip_hdr(skb) pointers

Like the UDP header fix, pskb_may_pull() can potentially
alter the SKB buffer.  Thus the saddr and daddr, pointers
may point to the old skb->data buffer.

I haven't seen corruptions, as its only seen if the old
skb->data buffer were reallocated by another user and
written into very quickly (or poison'd by SLAB debugging).

Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cc3a0a06c004..c47c989cb1fb 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1234,8 +1234,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	struct udphdr *uh;
 	unsigned short ulen;
 	struct rtable *rt = (struct rtable*)skb->dst;
-	__be32 saddr = ip_hdr(skb)->saddr;
-	__be32 daddr = ip_hdr(skb)->daddr;
+	__be32 saddr, daddr;
 	struct net *net = dev_net(skb->dev);
 
 	/*
@@ -1259,6 +1258,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	if (udp4_csum_init(skb, uh, proto))
 		goto csum_error;
 
+	saddr = ip_hdr(skb)->saddr;
+	daddr = ip_hdr(skb)->daddr;
+
 	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
 		return __udp4_lib_mcast_deliver(net, skb, uh,
 				saddr, daddr, udptable);
-- 
cgit v1.2.3


From 5209921cf15452cbe43097afce11d2846630cb51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Wed, 18 Feb 2009 17:45:44 -0800
Subject: tcp: remove obsoleted comment about different passes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is obsolete since the passes got combined.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index dda42f0bd7a3..da2c3b8794f2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2023,7 +2023,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		last_lost = tp->snd_una;
 	}
 
-	/* First pass: retransmit lost packets. */
 	tcp_for_write_queue_from(skb, sk) {
 		__u8 sacked = TCP_SKB_CB(skb)->sacked;
 
-- 
cgit v1.2.3


From 586c25003707067f074043d80fb2071671c58db0 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Fri, 20 Feb 2009 16:32:55 -0500
Subject: cipso: Fix documentation comment

The CIPSO protocol engine incorrectly stated that the FIPS-188 specification
could be found in the kernel's Documentation directory.  This patch corrects
that by removing the comment and directing users to the FIPS-188 documented
hosted online.  For the sake of completeness I've also included a link to the
CIPSO draft specification on the NetLabel website.

Thanks to Randy Dunlap for spotting the error and letting me know.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 net/ipv4/cipso_ipv4.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 6bb2635b5ded..7bc992976d29 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -3,11 +3,16 @@
  *
  * This is an implementation of the CIPSO 2.2 protocol as specified in
  * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
- * FIPS-188, copies of both documents can be found in the Documentation
- * directory.  While CIPSO never became a full IETF RFC standard many vendors
+ * FIPS-188.  While CIPSO never became a full IETF RFC standard many vendors
  * have chosen to adopt the protocol and over the years it has become a
  * de-facto standard for labeled networking.
  *
+ * The CIPSO draft specification can be found in the kernel's Documentation
+ * directory as well as the following URL:
+ *   http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt
+ * The FIPS-188 specification can be found at the following URL:
+ *   http://www.itl.nist.gov/fipspubs/fip188.htm
+ *
  * Author: Paul Moore <paul.moore@hp.com>
  *
  */
-- 
cgit v1.2.3


From a52b8bd338630f78a6bfe39fe17cb8469d2679ae Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 24 Feb 2009 16:40:16 -0800
Subject: tcp_scalable: Update malformed & dead url

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_scalable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 2747ec7bfb63..4660b088a8ce 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -1,6 +1,6 @@
 /* Tom Kelly's Scalable TCP
  *
- * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+ * See http://www.deneholme.net/tom/scalable/
  *
  * John Heffner <jheffner@sc.edu>
  */
-- 
cgit v1.2.3


From 9ec06ff57a9badef3b6b019f35efc6b21fc27d03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sun, 1 Mar 2009 00:21:36 -0800
Subject: tcp: fix retrans_out leaks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's conflicting assumptions in shifting, the caller assumes
that dupsack results in S'ed skbs (or a part of it) for sure but
never gave a hint to tcp_sacktag_one when dsack is actually in
use. Thus DSACK retrans_out -= pcount was not taken and the
counter became out of sync. Remove obstacle from that information
flow to get DSACKs accounted in tcp_sacktag_one as expected.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Tested-by: Denys Fedoryshchenko <denys@visp.net.lb>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a6961d75c7ea..c28976a7e596 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1374,7 +1374,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 
 static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 			   struct tcp_sacktag_state *state,
-			   unsigned int pcount, int shifted, int mss)
+			   unsigned int pcount, int shifted, int mss,
+			   int dup_sack)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
@@ -1410,7 +1411,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	}
 
 	/* We discard results */
-	tcp_sacktag_one(skb, sk, state, 0, pcount);
+	tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
 
 	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
 	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
@@ -1561,7 +1562,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 
 	if (!skb_shift(prev, skb, len))
 		goto fallback;
-	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss))
+	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
 		goto out;
 
 	/* Hole filled allows collapsing with the next as well, this is very
@@ -1580,7 +1581,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 	len = skb->len;
 	if (skb_shift(prev, skb, len)) {
 		pcount += tcp_skb_pcount(skb);
-		tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss);
+		tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
 	}
 
 out:
-- 
cgit v1.2.3


From 6eb0777228f31932fc941eafe8b08848466630a1 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@aristanetworks.com>
Date: Sun, 22 Feb 2009 00:09:14 -0800
Subject: netns: Fix icmp shutdown.

Recently I had a kernel panic in icmp_send during a network namespace
cleanup.  There were packets in the arp queue that failed to be sent
and we attempted to generate an ICMP host unreachable message, but
failed because icmp_sk_exit had already been called.

The network devices are removed from a network namespace and their
arp queues are flushed before we do attempt to shutdown subsystems
so this error should have been impossible.

It turns out icmp_init is using register_pernet_device instead
of register_pernet_subsys.  Which resulted in icmp being shut down
while we still had the possibility of packets in flight, making
a nasty NULL pointer deference in interrupt context possible.

Changing this to register_pernet_subsys fixes the problem in
my testing.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 705b33b184a3..fc562d29cc46 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1205,7 +1205,7 @@ static struct pernet_operations __net_initdata icmp_sk_ops = {
 
 int __init icmp_init(void)
 {
-	return register_pernet_device(&icmp_sk_ops);
+	return register_pernet_subsys(&icmp_sk_ops);
 }
 
 EXPORT_SYMBOL(icmp_err_convert);
-- 
cgit v1.2.3


From 2f20d2e667ab1ca44cde5fb361386dff5bb6081d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@aristanetworks.com>
Date: Sun, 22 Feb 2009 00:10:18 -0800
Subject: tcp: Like icmp use register_pernet_subsys

To remove the possibility of packets flying around when network
devices are being cleaned up use reisger_pernet_subsys instead of
register_pernet_device.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 19d7b429a262..cf74c416831a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2443,7 +2443,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
 void __init tcp_v4_init(void)
 {
 	inet_hashinfo_init(&tcp_hashinfo);
-	if (register_pernet_device(&tcp_sk_ops))
+	if (register_pernet_subsys(&tcp_sk_ops))
 		panic("Failed to create the TCP control socket.\n");
 }
 
-- 
cgit v1.2.3


From 2bad35b7c9588eb5e65c03bcae54e7eb6b1a6504 Mon Sep 17 00:00:00 2001
From: "Jorge Boncompte [DTI2]" <jorge@dti2.net>
Date: Wed, 18 Mar 2009 23:26:11 -0700
Subject: netns: oops in ip[6]_frag_reasm incrementing stats

dev can be NULL in ip[6]_frag_reasm for skb's coming from RAW sockets.

Quagga's OSPFD sends fragmented packets on a RAW socket, when netfilter
conntrack reassembles them on the OUTPUT path you hit this code path.

You can test it with something like "hping2 -0 -d 2000 -f AA.BB.CC.DD"

With help from Jarek Poplawski.

Signed-off-by: Jorge Boncompte [DTI2] <jorge@dti2.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 6659ac000eeb..7985346653bd 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -463,6 +463,7 @@ err:
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 			 struct net_device *dev)
 {
+	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
 	struct iphdr *iph;
 	struct sk_buff *fp, *head = qp->q.fragments;
 	int len;
@@ -548,7 +549,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	iph = ip_hdr(head);
 	iph->frag_off = 0;
 	iph->tot_len = htons(len);
-	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMOKS);
+	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
 	return 0;
 
-- 
cgit v1.2.3