From 07ee0722bf941960fb3888f9c9b5839473372fd1 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Fri, 15 May 2015 11:30:47 +0800
Subject: rhashtable: Add cap on number of elements in hash table

We currently have no limit on the number of elements in a hash table.
This is a problem because some users (tipc) set a ceiling on the maximum
table size, and when that is reached the hash table may degenerate.
Others may encounter OOM when growing, and if we allow insertions after
that happens the hash table's performance may also suffer.

This patch adds a new parameter, insecure_max_entries, which becomes the
cap on the table.  If unset it defaults to max_size * 2.  If it is also
zero it means that there is no cap on the number of elements in the
table.  However, the table will grow whenever the utilisation hits 100%,
and if that growth fails, you will get ENOMEM on insertion.

As allowing oversubscription is potentially dangerous, the name contains
the word insecure.

Note that the cap is not a hard limit.  This is done for performance
reasons, as enforcing a hard limit would result in the use of atomic ops
that are heavier than the ones we currently use.

The reasoning is that we are only guarding against a gross
over-subscription of the table, rather than a small breach of the limit.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 include/linux/rhashtable.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index dbcbcc59aa92..843ceca9a21e 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -17,6 +17,7 @@
 #ifndef _LINUX_RHASHTABLE_H
 #define _LINUX_RHASHTABLE_H
 
+#include <linux/atomic.h>
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/jhash.h>
@@ -100,6 +101,7 @@ struct rhashtable;
  * @key_len: Length of key
  * @key_offset: Offset of key in struct to be hashed
  * @head_offset: Offset of rhash_head in struct to be hashed
+ * @insecure_max_entries: Maximum number of entries (may be exceeded)
  * @max_size: Maximum size while expanding
  * @min_size: Minimum size while shrinking
  * @nulls_base: Base value to generate nulls marker
@@ -115,6 +117,7 @@ struct rhashtable_params {
 	size_t			key_len;
 	size_t			key_offset;
 	size_t			head_offset;
+	unsigned int		insecure_max_entries;
 	unsigned int		max_size;
 	unsigned int		min_size;
 	u32			nulls_base;
@@ -286,6 +289,18 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht,
 	       (!ht->p.max_size || tbl->size < ht->p.max_size);
 }
 
+/**
+ * rht_grow_above_max - returns true if table is above maximum
+ * @ht:		hash table
+ * @tbl:	current table
+ */
+static inline bool rht_grow_above_max(const struct rhashtable *ht,
+				      const struct bucket_table *tbl)
+{
+	return ht->p.insecure_max_entries &&
+	       atomic_read(&ht->nelems) >= ht->p.insecure_max_entries;
+}
+
 /* The bucket lock is selected based on the hash and protects mutations
  * on a group of hash buckets.
  *
@@ -589,6 +604,10 @@ restart:
 		goto out;
 	}
 
+	err = -E2BIG;
+	if (unlikely(rht_grow_above_max(ht, tbl)))
+		goto out;
+
 	if (unlikely(rht_grow_above_100(ht, tbl))) {
 slow_path:
 		spin_unlock_bh(lock);
--
cgit v1.2.3
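
To illustrate the knob this patch adds, here is a minimal usage sketch.  The
object type, the sizes and the helper below are made-up examples rather than
code from the patch or from any in-tree user: a caller sets
insecure_max_entries next to max_size in its rhashtable_params, and once the
element count reaches the cap, insertions fail with -E2BIG instead of
oversubscribing the table further.

/* Illustrative only: hypothetical object and parameters, not from the patch. */
#include <linux/rhashtable.h>

struct test_obj {
	int			key;
	struct rhash_head	node;
};

static const struct rhashtable_params test_params = {
	.head_offset		= offsetof(struct test_obj, node),
	.key_offset		= offsetof(struct test_obj, key),
	.key_len		= sizeof(int),
	.max_size		= 1024,	/* the table itself stops growing here */
	.insecure_max_entries	= 2048,	/* soft cap; if left 0 it defaults to max_size * 2 */
};

static int test_insert(struct rhashtable *ht, struct test_obj *obj)
{
	/* Returns -E2BIG once nelems is at or above insecure_max_entries.
	 * The cap is soft, so a small transient breach is still possible.
	 */
	return rhashtable_insert_fast(ht, &obj->node, test_params);
}
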
From faecbb45ebefb20260ad4a631e011e93c896cb73 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Wed, 20 May 2015 13:42:25 +0200
Subject: Revert "netfilter: bridge: query conntrack about skb dnat"

This reverts commit c055d5b03bb4cb69d349d787c9787c0383abd8b2.

There are two issues:

'dnat_took_place' made me think that this is related to
-j DNAT/MASQUERADE, but that's only one part of the story.  It is also
relevant for SNAT, when we undo the SNAT translation in the
reverse/reply direction.

Furthermore, I originally wanted to do this mainly to avoid storing
ipv6 addresses once we make DNAT/REDIRECT work for ipv6 on bridges.
However, I forgot about SNPT/DNPT, which are stateless.

So we cannot escape storing addresses for ipv6 anyway.  Might as well
do it for ipv4 too.

Reported-and-tested-by: Bernhard Thaler
Signed-off-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 66e374d62f64..f15154a879c7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -176,6 +176,7 @@ struct nf_bridge_info {
 	struct net_device	*physindev;
 	struct net_device	*physoutdev;
 	char			neigh_header[8];
+	__be32			ipv4_daddr;
 };
 #endif
--
cgit v1.2.3

From d654976cbf852ee20612ee10dbe57cdacda9f452 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 21 May 2015 21:51:19 -0700
Subject: tcp: fix a potential deadlock in tcp_get_info()

Taking the socket spinlock in tcp_get_info() can deadlock, as
inet_diag_dump_icsk() holds &hashinfo->ehash_locks[i], while packet
processing can use the reverse locking order.

We could avoid this locking for TCP_LISTEN states, but lockdep would
certainly get confused, as all TCP sockets share the same lockdep
classes.

[ 523.722504] ======================================================
[ 523.728706] [ INFO: possible circular locking dependency detected ]
[ 523.734990] 4.1.0-dbg-DEV #1676 Not tainted
[ 523.739202] -------------------------------------------------------
[ 523.745474] ss/18032 is trying to acquire lock:
[ 523.750002]  (slock-AF_INET){+.-...}, at: [] tcp_get_info+0x2c4/0x360
[ 523.758129]
[ 523.758129] but task is already holding lock:
[ 523.763968]  (&(&hashinfo->ehash_locks[i])->rlock){+.-...}, at: [] inet_diag_dump_icsk+0x1d5/0x6c0
[ 523.774661]
[ 523.774661] which lock already depends on the new lock.
[ 523.774661]
[ 523.782850]
[ 523.782850] the existing dependency chain (in reverse order) is:
[ 523.790326] -> #1 (&(&hashinfo->ehash_locks[i])->rlock){+.-...}:
[ 523.796599]        [] lock_acquire+0xbb/0x270
[ 523.802565]        [] _raw_spin_lock+0x38/0x50
[ 523.808628]        [] __inet_hash_nolisten+0x78/0x110
[ 523.815273]        [] tcp_v4_syn_recv_sock+0x24b/0x350
[ 523.822067]        [] tcp_check_req+0x3c1/0x500
[ 523.828199]        [] tcp_v4_do_rcv+0x239/0x3d0
[ 523.834331]        [] tcp_v4_rcv+0xa8e/0xc10
[ 523.840202]        [] ip_local_deliver_finish+0x133/0x3e0
[ 523.847214]        [] ip_local_deliver+0xaa/0xc0
[ 523.853440]        [] ip_rcv_finish+0x168/0x5c0
[ 523.859624]        [] ip_rcv+0x307/0x420

Let's use the u64_stats_sync infrastructure instead.  As a bonus, 64-bit
arches get optimized, as these operations are a nop for them.

Fixes: 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info")
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/linux/tcp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3b2911502a8c..e8bbf403618f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -158,6 +158,8 @@ struct tcp_sock {
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
 				 */
+	struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */
+
 	u32	snd_una;	/* First byte we want an ack for */
 	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
--
cgit v1.2.3
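
For reference, the pattern this fix switches to looks roughly like the sketch
below.  The tcp.c side of the change is outside the include/linux diff shown
above, so the structure and function names here are a generic illustration of
the u64_stats_sync API, not code quoted from the patch: writers wrap updates
of the 64-bit counters in u64_stats_update_begin()/u64_stats_update_end(),
readers retry with the fetch helpers instead of taking the socket spinlock,
and on 64-bit architectures these helpers compile down to plain loads and
stores.

/* Generic u64_stats_sync sketch; names are illustrative, not from the patch. */
#include <linux/u64_stats_sync.h>

struct example_counters {
	u64			bytes_acked;
	u64			bytes_received;
	struct u64_stats_sync	syncp;
};

/* Writer side: called from packet processing, no extra lock required. */
static void example_account(struct example_counters *c, u64 acked, u64 rcvd)
{
	u64_stats_update_begin(&c->syncp);
	c->bytes_acked += acked;
	c->bytes_received += rcvd;
	u64_stats_update_end(&c->syncp);
}

/* Reader side: loops if a writer was active, rather than taking a spinlock
 * that could invert the lock order seen in the trace above.
 */
static void example_snapshot(struct example_counters *c, u64 *acked, u64 *rcvd)
{
	unsigned int start;

	do {
		start = u64_stats_fetch_begin_irq(&c->syncp);
		*acked = c->bytes_acked;
		*rcvd = c->bytes_received;
	} while (u64_stats_fetch_retry_irq(&c->syncp, start));
}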