From cca77b7c81876d819a5806f408b3c29b5b61a815 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 23 Aug 2010 14:41:22 -0700 Subject: netfilter: fix CONFIG_COMPAT support commit f3c5c1bfd430858d3a05436f82c51e53104feb6b (netfilter: xtables: make ip_tables reentrant) forgot to also compute the jumpstack size in the compat handlers. Result is that "iptables -I INPUT -j userchain" turns into -j DROP. Reported by Sebastian Roesner on #netfilter, closes http://bugzilla.netfilter.org/show_bug.cgi?id=669. Note: arptables change is compile-tested only. Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Tested-by: Mikael Pettersson Signed-off-by: David S. Miller --- net/ipv4/netfilter/arp_tables.c | 3 +++ net/ipv4/netfilter/ip_tables.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 51d6c3167975..e8f4f9a57f12 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1420,6 +1420,9 @@ static int translate_compat_table(const char *name, if (ret != 0) break; ++i; + if (strcmp(arpt_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 97b64b22c412..d163f2e3b2e9 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1751,6 +1751,9 @@ translate_compat_table(struct net *net, if (ret != 0) break; ++i; + if (strcmp(ipt_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* -- cgit v1.2.3 From ad1af0fedba14f82b240a03fe20eb9b2fdbd0357 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 25 Aug 2010 02:27:49 -0700 Subject: tcp: Combat per-cpu skew in orphan tests. As reported by Anton Blanchard when we use percpu_counter_read_positive() to make our orphan socket limit checks, the check can be off by up to num_cpus_online() * batch (which is 32 by default) which on a 128 cpu machine can be as large as the default orphan limit itself. Fix this by doing the full expensive sum check if the optimized check triggers. Reported-by: Anton Blanchard Signed-off-by: David S. Miller Acked-by: Eric Dumazet --- net/ipv4/tcp.c | 5 +---- net/ipv4/tcp_timer.c | 8 ++++---- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 176e11aaea77..197b9b77fa3e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2011,11 +2011,8 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { - int orphan_count = percpu_counter_read_positive( - sk->sk_prot->orphan_count); - sk_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, orphan_count)) { + if (tcp_too_many_orphans(sk, 0)) { if (net_ratelimit()) printk(KERN_INFO "TCP: too many of orphaned " "sockets\n"); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 808bb920c9f5..c35b469e851c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -66,18 +66,18 @@ static void tcp_write_err(struct sock *sk) static int tcp_out_of_resources(struct sock *sk, int do_reset) { struct tcp_sock *tp = tcp_sk(sk); - int orphans = percpu_counter_read_positive(&tcp_orphan_count); + int shift = 0; /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) - orphans <<= 1; + shift++; /* If some dubious ICMP arrived, penalize even more. */ if (sk->sk_err_soft) - orphans <<= 1; + shift++; - if (tcp_too_many_orphans(sk, orphans)) { + if (tcp_too_many_orphans(sk, shift)) { if (net_ratelimit()) printk(KERN_INFO "Out of socket memory\n"); -- cgit v1.2.3 From c5ed63d66f24fd4f7089b5a6e087b0ce7202aa8e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Aug 2010 23:02:17 -0700 Subject: tcp: fix three tcp sysctls tuning As discovered by Anton Blanchard, current code to autotune tcp_death_row.sysctl_max_tw_buckets, sysctl_tcp_max_orphans and sysctl_max_syn_backlog makes little sense. The bigger a page is, the less tcp_max_orphans is : 4096 on a 512GB machine in Anton's case. (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)) is much bigger if spinlock debugging is on. Its wrong to select bigger limits in this case (where kernel structures are also bigger) bhash_size max is 65536, and we get this value even for small machines. A better ground is to use size of ehash table, this also makes code shorter and more obvious. Based on a patch from Anton, and another from David. Reported-and-tested-by: Anton Blanchard Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 197b9b77fa3e..e2add5ff9cb1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3209,7 +3209,7 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long nr_pages, limit; - int order, i, max_share; + int i, max_share, cnt; unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3258,22 +3258,12 @@ void __init tcp_init(void) INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); } - /* Try to be a bit smarter and adjust defaults depending - * on available memory. - */ - for (order = 0; ((1 << order) << PAGE_SHIFT) < - (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); - order++) - ; - if (order >= 4) { - tcp_death_row.sysctl_max_tw_buckets = 180000; - sysctl_tcp_max_orphans = 4096 << (order - 4); - sysctl_max_syn_backlog = 1024; - } else if (order < 3) { - tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); - sysctl_tcp_max_orphans >>= (3 - order); - sysctl_max_syn_backlog = 128; - } + + cnt = tcp_hashinfo.ehash_mask + 1; + + tcp_death_row.sysctl_max_tw_buckets = cnt / 2; + sysctl_tcp_max_orphans = cnt / 2; + sysctl_max_syn_backlog = max(128, cnt / 256); /* Set the pressure threshold to be a fraction of global memory that * is up to 1/2 at 256 MB, decreasing toward zero with the amount of -- cgit v1.2.3 From d84ba638e4ba3c40023ff997aa5e8d3ed002af36 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 Aug 2010 16:05:48 +0000 Subject: tcp: select(writefds) don't hang up when a peer close connection This issue come from ruby language community. Below test program hang up when only run on Linux. % uname -mrsv Linux 2.6.26-2-486 #1 Sat Dec 26 08:37:39 UTC 2009 i686 % ruby -rsocket -ve ' BasicSocket.do_not_reverse_lookup = true serv = TCPServer.open("127.0.0.1", 0) s1 = TCPSocket.open("127.0.0.1", serv.addr[1]) s2 = serv.accept s2.close s1.write("a") rescue p $! s1.write("a") rescue p $! Thread.new { s1.write("a") }.join' ruby 1.9.3dev (2010-07-06 trunk 28554) [i686-linux] # [Hang Here] FreeBSD, Solaris, Mac doesn't. because Ruby's write() method call select() internally. and tcp_poll has a bug. SUS defined 'ready for writing' of select() as following. | A descriptor shall be considered ready for writing when a call to an output | function with O_NONBLOCK clear would not block, whether or not the function | would transfer data successfully. That said, EPIPE situation is clearly one of 'ready for writing'. We don't have read-side issue because tcp_poll() already has read side shutdown care. | if (sk->sk_shutdown & RCV_SHUTDOWN) | mask |= POLLIN | POLLRDNORM | POLLRDHUP; So, Let's insert same logic in write side. - reference url http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-core/31065 http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-core/31068 Signed-off-by: KOSAKI Motohiro Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e2add5ff9cb1..3fb1428e526e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -451,7 +451,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) mask |= POLLOUT | POLLWRNORM; } - } + } else + mask |= POLLOUT | POLLWRNORM; if (tp->urg_data & TCP_URG_VALID) mask |= POLLPRI; -- cgit v1.2.3 From c34186ed008229e7f7e3f1de8e6acf6374995358 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 27 Aug 2010 19:31:56 -0700 Subject: net/ipv4: Eliminate kstrdup memory leak The string clone is only used as a temporary copy of the argument val within the while loop, and so it should be freed before leaving the function. The call to strsep, however, modifies clone, so a pointer to the front of the string is kept in saved_clone, to make it possible to free it. The sematic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @r exists@ local idexpression x; expression E; identifier l; statement S; @@ *x= \(kasprintf\|kstrdup\)(...); ... if (x == NULL) S ... when != kfree(x) when != E = x if (...) { <... when != kfree(x) * goto l; ...> * return ...; } // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv4/tcp_cong.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 0ec9bd0ae94f..850c737e08e2 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -196,10 +196,10 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) int tcp_set_allowed_congestion_control(char *val) { struct tcp_congestion_ops *ca; - char *clone, *name; + char *saved_clone, *clone, *name; int ret = 0; - clone = kstrdup(val, GFP_USER); + saved_clone = clone = kstrdup(val, GFP_USER); if (!clone) return -ENOMEM; @@ -226,6 +226,7 @@ int tcp_set_allowed_congestion_control(char *val) } out: spin_unlock(&tcp_cong_list_lock); + kfree(saved_clone); return ret; } -- cgit v1.2.3 From 750e9fad8c7f44e0005ffb7195f72dd76978c2cf Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 31 Aug 2010 05:50:43 +0000 Subject: ipv4: minor fix about RPF in help of Kconfig Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7c3a7d191249..571f8950ed06 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -46,7 +46,7 @@ config IP_ADVANCED_ROUTER rp_filter on use: echo 1 > /proc/sys/net/ipv4/conf//rp_filter - and + or echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter Note that some distributions enable it in startup scripts. -- cgit v1.2.3