From 6eac56040787c3ff604fe7d48bbbb7897cd1387c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 28 Aug 2008 01:08:02 -0700 Subject: tcp: Skip empty hash buckets faster in /proc/net/tcp On most systems most of the TCP established/time-wait hash buckets are empty. When walking the hash table for /proc/net/tcp their read locks would always be aquired just to find out they're empty. This patch changes the code to check first if the buckets have any entries before taking the lock, which is much cheaper than taking a lock. Since the hash tables are large this makes a measurable difference on processing /proc/net/tcp, especially on architectures with slow read_lock (e.g. PPC) On a 2GB Core2 system time cat /proc/net/tcp > /dev/null (with a mostly empty hash table) goes from 0.046s to 0.005s. On systems with slower atomics (like P4 or POWER4) or larger hash tables (more RAM) the difference is much higher. This can be noticeable because there are some daemons around who regularly scan /proc/net/tcp. Original idea for this patch from Marcus Meissner, but redone by me. Signed-off-by: Andi Kleen Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 44c1e934824b..37ca3843c40b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1946,6 +1946,12 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) return rc; } +static inline int empty_bucket(struct tcp_iter_state *st) +{ + return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) && + hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain); +} + static void *established_get_first(struct seq_file *seq) { struct tcp_iter_state* st = seq->private; @@ -1958,6 +1964,10 @@ static void *established_get_first(struct seq_file *seq) struct inet_timewait_sock *tw; rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); + /* Lockless fast path for the common case of empty buckets */ + if (empty_bucket(st)) + continue; + read_lock_bh(lock); sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { if (sk->sk_family != st->family || @@ -2008,13 +2018,15 @@ get_tw: read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); st->state = TCP_SEQ_STATE_ESTABLISHED; - if (++st->bucket < tcp_hashinfo.ehash_size) { - read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); - sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); - } else { - cur = NULL; - goto out; - } + /* Look for next non empty bucket */ + while (++st->bucket < tcp_hashinfo.ehash_size && + empty_bucket(st)) + ; + if (st->bucket >= tcp_hashinfo.ehash_size) + return NULL; + + read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); } else sk = sk_next(sk); -- cgit v1.2.3 From d315492b1a6ba29da0fa2860759505ae1b2db857 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 8 Sep 2008 13:17:27 -0700 Subject: netns : fix kernel panic in timewait socket destruction How to reproduce ? - create a network namespace - use tcp protocol and get timewait socket - exit the network namespace - after a moment (when the timewait socket is destroyed), the kernel panics. # BUG: unable to handle kernel NULL pointer dereference at 0000000000000007 IP: [] inet_twdr_do_twkill_work+0x6e/0xb8 PGD 119985067 PUD 11c5c0067 PMD 0 Oops: 0000 [1] SMP CPU 1 Modules linked in: ipv6 button battery ac loop dm_mod tg3 libphy ext3 jbd edd fan thermal processor thermal_sys sg sata_svw libata dock serverworks sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table] Pid: 0, comm: swapper Not tainted 2.6.27-rc2 #3 RIP: 0010:[] [] inet_twdr_do_twkill_work+0x6e/0xb8 RSP: 0018:ffff88011ff7fed0 EFLAGS: 00010246 RAX: ffffffffffffffff RBX: ffffffff82339420 RCX: ffff88011ff7ff30 RDX: 0000000000000001 RSI: ffff88011a4d03c0 RDI: ffff88011ac2fc00 RBP: ffffffff823392e0 R08: 0000000000000000 R09: ffff88002802a200 R10: ffff8800a5c4b000 R11: ffffffff823e4080 R12: ffff88011ac2fc00 R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000 FS: 0000000041cbd940(0000) GS:ffff8800bff839c0(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 0000000000000007 CR3: 00000000bd87c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 0, threadinfo ffff8800bff9e000, task ffff88011ff76690) Stack: ffffffff823392e0 0000000000000100 ffffffff821e3a3a 0000000000000008 0000000000000000 ffffffff821e3a61 ffff8800bff7c000 ffffffff8203c7e7 ffff88011ff7ff10 ffff88011ff7ff10 0000000000000021 ffffffff82351108 Call Trace: [] ? inet_twdr_hangman+0x0/0x9e [] ? inet_twdr_hangman+0x27/0x9e [] ? run_timer_softirq+0x12c/0x193 [] ? __do_softirq+0x5e/0xcd [] ? call_softirq+0x1c/0x28 [] ? do_softirq+0x2c/0x68 [] ? smp_apic_timer_interrupt+0x8e/0xa9 [] ? apic_timer_interrupt+0x66/0x70 [] ? default_idle+0x27/0x3b [] ? cpu_idle+0x5f/0x7d Code: e8 01 00 00 4c 89 e7 41 ff c5 e8 8d fd ff ff 49 8b 44 24 38 4c 89 e7 65 8b 14 25 24 00 00 00 89 d2 48 8b 80 e8 00 00 00 48 f7 d0 <48> 8b 04 d0 48 ff 40 58 e8 fc fc ff ff 48 89 df e8 c0 5f 04 00 RIP [] inet_twdr_do_twkill_work+0x6e/0xb8 RSP CR2: 0000000000000007 This patch provides a function to purge all timewait sockets related to a network namespace. The timewait sockets life cycle is not tied with the network namespace, that means the timewait sockets stay alive while the network namespace dies. The timewait sockets are for avoiding to receive a duplicate packet from the network, if the network namespace is freed, the network stack is removed, so no chance to receive any packets from the outside world. Furthermore, having a pending destruction timer on these sockets with a network namespace freed is not safe and will lead to an oops if the timer callback which try to access data belonging to the namespace like for example in: inet_twdr_do_twkill_work -> NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); Purging the timewait sockets at the network namespace destruction will: 1) speed up memory freeing for the namespace 2) fix kernel panic on asynchronous timewait destruction Signed-off-by: Daniel Lezcano Acked-by: Denis V. Lunev Acked-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 44c1e934824b..1b4fee20fc93 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2376,6 +2376,7 @@ static int __net_init tcp_sk_init(struct net *net) static void __net_exit tcp_sk_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv4.tcp_sock); + inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET); } static struct pernet_operations __net_initdata tcp_sk_ops = { -- cgit v1.2.3 From f5fff5dc8a7a3f395b0525c02ba92c95d42b7390 Mon Sep 17 00:00:00 2001 From: Tom Quetchenbach Date: Sun, 21 Sep 2008 00:21:51 -0700 Subject: tcp: advertise MSS requested by user I'm trying to use the TCP_MAXSEG option to setsockopt() to set the MSS for both sides of a bidirectional connection. man tcp says: "If this option is set before connection establishment, it also changes the MSS value announced to the other end in the initial packet." However, the kernel only uses the MTU/route cache to set the advertised MSS. That means if I set the MSS to, say, 500 before calling connect(), I will send at most 500-byte packets, but I will still receive 1500-byte packets in reply. This is a bug, either in the kernel or the documentation. This patch (applies to latest net-2.6) reduces the advertised value to that requested by the user as long as setsockopt() is called before connect() or accept(). This seems like the behavior that one would expect as well as that which is documented. I've tried to make sure that things that depend on the advertised MSS are set correctly. Signed-off-by: Tom Quetchenbach Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3dfbc21e555a..44aef1c1f373 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1364,6 +1364,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG -- cgit v1.2.3 From 4dd7972d1204c3851a4092cecd2207e05eb29b09 Mon Sep 17 00:00:00 2001 From: Vitaliy Gusev Date: Wed, 1 Oct 2008 01:51:39 -0700 Subject: tcp: Fix NULL dereference in tcp_4_send_ack() Fix NULL dereference in tcp_4_send_ack(). As skb->dev is reset to NULL in tcp_v4_rcv() thus OOPS occurs: BUG: unable to handle kernel NULL pointer dereference at 00000000000004d0 IP: [] tcp_v4_send_ack+0x203/0x250 Stack: ffff810005dbb000 ffff810015c8acc0 e77b2c6e5f861600 a01610802e90cb6d 0a08010100000000 88afffff88afffff 0000000080762be8 0000000115c872e8 0004122000000000 0000000000000001 ffffffff80762b88 0000000000000020 Call Trace: [] tcp_v4_reqsk_send_ack+0x20/0x22 [] tcp_check_req+0x108/0x14c [] ? rt_intern_hash+0x322/0x33c [] tcp_v4_do_rcv+0x399/0x4ec [] ? skb_checksum+0x4f/0x272 [] ? __inet_lookup_listener+0x14a/0x15c [] tcp_v4_rcv+0x6a1/0x701 [] ip_local_deliver_finish+0x157/0x24a [] ip_local_deliver+0x72/0x7c [] ip_rcv_finish+0x38d/0x3b2 [] ? scsi_io_completion+0x19d/0x39e [] ip_rcv+0x2a2/0x2e5 [] netif_receive_skb+0x293/0x303 [] process_backlog+0x80/0xd0 [] ? __rcu_process_callbacks+0x125/0x1b4 [] net_rx_action+0xb9/0x17f [] __do_softirq+0xa3/0x164 [] call_softirq+0x1c/0x28 [] do_softirq+0x34/0x72 [] local_bh_enable_ip+0x3f/0x50 [] _spin_unlock_bh+0x12/0x14 [] release_sock+0xb8/0xc1 [] inet_stream_connect+0x146/0x25c [] ? autoremove_wake_function+0x0/0x38 [] sys_connect+0x68/0x8e [] ? fd_install+0x5f/0x68 [] ? sock_map_fd+0x55/0x62 [] system_call_after_swapgs+0x7b/0x80 Code: 41 10 11 d0 83 d0 00 4d 85 ed 89 45 c0 c7 45 c4 08 00 00 00 74 07 41 8b 45 04 89 45 c8 48 8b 43 20 8b 4d b8 48 8d 55 b0 48 89 de <48> 8b 80 d0 04 00 00 48 8b b8 60 01 00 00 e8 20 ae fe ff 65 48 RIP [] tcp_v4_send_ack+0x203/0x250 RSP CR2: 00000000000004d0 Signed-off-by: Vitaliy Gusev Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1b4fee20fc93..011478e46c40 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -618,7 +618,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ]; } rep; struct ip_reply_arg arg; - struct net *net = dev_net(skb->dev); + struct net *net = dev_net(skb->dst->dev); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); -- cgit v1.2.3 From 88ef4a5a78e63420dd1dd770f1bd1dc198926b04 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 1 Oct 2008 07:41:00 -0700 Subject: tcp: Handle TCP SYN+ACK/ACK/RST transparency The TCP stack sends out SYN+ACK/ACK/RST reply packets in response to incoming packets. The non-local source address check on output bites us again, as replies for transparently redirected traffic won't have a chance to leave the node. This patch selectively sets the FLOWI_FLAG_ANYSRC flag when doing the route lookup for those replies. Transparent replies are enabled if the listening socket has the transparent socket flag set. Signed-off-by: KOVACS Krisztian Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d13688e3558d..8b24bd833cb4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -591,6 +591,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) ip_hdr(skb)->saddr, /* XXX */ sizeof(struct tcphdr), IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; + arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; net = dev_net(skb->dst->dev); ip_send_reply(net->ipv4.tcp_sock, skb, @@ -606,7 +607,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts, int oif, - struct tcp_md5sig_key *key) + struct tcp_md5sig_key *key, + int reply_flags) { struct tcphdr *th = tcp_hdr(skb); struct { @@ -659,6 +661,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ip_hdr(skb)->daddr, &rep.th); } #endif + arg.flags = reply_flags; arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); @@ -681,7 +684,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent, tw->tw_bound_dev_if, - tcp_twsk_md5_key(tcptw) + tcp_twsk_md5_key(tcptw), + tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0 ); inet_twsk_put(tw); @@ -694,7 +698,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent, 0, - tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr)); + tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr), + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0); } /* @@ -1244,6 +1249,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->loc_addr = daddr; ireq->rmt_addr = saddr; + ireq->no_srccheck = inet_sk(sk)->transparent; ireq->opt = tcp_v4_save_options(sk, skb); if (!want_cookie) TCP_ECN_create_request(req, tcp_hdr(skb)); -- cgit v1.2.3 From 9a1f27c48065ce713eb47f2fd475b717e63ef239 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 7 Oct 2008 11:41:57 -0700 Subject: inet_hashtables: Add inet_lookup_skb helpers To be able to use the cached socket reference in the skb during input processing we add a new set of lookup functions that receive the skb on their argument list. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: KOVACS Krisztian Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8b24bd833cb4..24ffc5e1d3da 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1577,8 +1577,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->flags = iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr, - th->source, iph->daddr, th->dest, inet_iif(skb)); + sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); if (!sk) goto no_tcp_socket; -- cgit v1.2.3 From 52cd5750e81ec8d213949fa7c0d2e08907bf498b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 8 Oct 2008 11:34:06 -0700 Subject: tcp: fix length used for checksum in a reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While looking for some common code I came across difference in checksum calculation between tcp_v6_send_(reset|ack) I couldn't explain. I checked both v4 and v6 and found out that both seem to have the same "feature". I couldn't find anything in rfc nor anywhere else which would state that md5 option should be ignored like it was in case of reset so I came to a conclusion that this is probably a genuine bug. I suspect that addition of md5 just was fooled by the excessive copy-paste code in those functions and the reset part was never tested well enough to find out the problem. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 24ffc5e1d3da..ba46769c6e97 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -589,7 +589,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) #endif arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ - sizeof(struct tcphdr), IPPROTO_TCP, 0); + arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; -- cgit v1.2.3 From 78e645cb890b0f32ea81a974e29427d9cd2f64f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 9 Oct 2008 14:37:47 -0700 Subject: tcpv[46]: fix md5 pseudoheader address field ordering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maybe it's just me but I guess those md5 people made a mess out of it by having *_md5_hash_* to use daddr, saddr order instead of the one that is natural (and equal to what csum functions use). For the segment were sending, the original addresses are reversed so buff's saddr == skb's daddr and vice-versa. Maybe I can finally proceed with unification of some code after fixing it first... :-) Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ba46769c6e97..5c8fa7f1e327 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -583,8 +583,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) rep.th.doff = arg.iov[0].iov_len / 4; tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], - key, ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, &rep.th); + key, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &rep.th); } #endif arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, -- cgit v1.2.3