From b98e1747eecc19b872572c5fffedc1868531dac6 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@computergmbh.de>
Date: Mon, 5 Nov 2007 20:42:16 -0800
Subject: [NETFILTER]: Sort matches/targets in Kbuild file

Sort matches and targets in the Kbuild file.

Signed-off-by: Jan Engelhardt <jengelh@computergmbh.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/Kbuild      | 18 +++++++++---------
 include/linux/netfilter_ipv4/Kbuild | 28 ++++++++++++++--------------
 include/linux/netfilter_ipv6/Kbuild |  2 +-
 3 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild
index f2eaea2234ec..b87e83a5e070 100644
--- a/include/linux/netfilter/Kbuild
+++ b/include/linux/netfilter/Kbuild
@@ -4,25 +4,28 @@ header-y += nfnetlink_conntrack.h
 header-y += nfnetlink_log.h
 header-y += nfnetlink_queue.h
 header-y += xt_CLASSIFY.h
+header-y += xt_CONNMARK.h
+header-y += xt_CONNSECMARK.h
+header-y += xt_DSCP.h
+header-y += xt_MARK.h
+header-y += xt_NFLOG.h
+header-y += xt_NFQUEUE.h
+header-y += xt_SECMARK.h
+header-y += xt_TCPMSS.h
 header-y += xt_comment.h
 header-y += xt_connbytes.h
 header-y += xt_connmark.h
-header-y += xt_CONNMARK.h
 header-y += xt_conntrack.h
 header-y += xt_dccp.h
 header-y += xt_dscp.h
-header-y += xt_DSCP.h
 header-y += xt_esp.h
-header-y += xt_helper.h
 header-y += xt_hashlimit.h
+header-y += xt_helper.h
 header-y += xt_length.h
 header-y += xt_limit.h
 header-y += xt_mac.h
 header-y += xt_mark.h
-header-y += xt_MARK.h
 header-y += xt_multiport.h
-header-y += xt_NFQUEUE.h
-header-y += xt_NFLOG.h
 header-y += xt_pkttype.h
 header-y += xt_policy.h
 header-y += xt_realm.h
@@ -32,9 +35,6 @@ header-y += xt_statistic.h
 header-y += xt_string.h
 header-y += xt_tcpmss.h
 header-y += xt_tcpudp.h
-header-y += xt_SECMARK.h
-header-y += xt_CONNSECMARK.h
-header-y += xt_TCPMSS.h
 
 unifdef-y += nf_conntrack_common.h
 unifdef-y += nf_conntrack_ftp.h
diff --git a/include/linux/netfilter_ipv4/Kbuild b/include/linux/netfilter_ipv4/Kbuild
index 7185792b900f..3a7105bb8f33 100644
--- a/include/linux/netfilter_ipv4/Kbuild
+++ b/include/linux/netfilter_ipv4/Kbuild
@@ -1,47 +1,47 @@
-header-y += ipt_addrtype.h
-header-y += ipt_ah.h
 header-y += ipt_CLASSIFY.h
 header-y += ipt_CLUSTERIP.h
+header-y += ipt_CONNMARK.h
+header-y += ipt_DSCP.h
+header-y += ipt_ECN.h
+header-y += ipt_LOG.h
+header-y += ipt_MARK.h
+header-y += ipt_NFQUEUE.h
+header-y += ipt_REJECT.h
+header-y += ipt_SAME.h
+header-y += ipt_TCPMSS.h
+header-y += ipt_TOS.h
+header-y += ipt_TTL.h
+header-y += ipt_ULOG.h
+header-y += ipt_addrtype.h
+header-y += ipt_ah.h
 header-y += ipt_comment.h
 header-y += ipt_connbytes.h
 header-y += ipt_connmark.h
-header-y += ipt_CONNMARK.h
 header-y += ipt_conntrack.h
 header-y += ipt_dccp.h
 header-y += ipt_dscp.h
-header-y += ipt_DSCP.h
 header-y += ipt_ecn.h
-header-y += ipt_ECN.h
 header-y += ipt_esp.h
 header-y += ipt_hashlimit.h
 header-y += ipt_helper.h
 header-y += ipt_iprange.h
 header-y += ipt_length.h
 header-y += ipt_limit.h
-header-y += ipt_LOG.h
 header-y += ipt_mac.h
 header-y += ipt_mark.h
-header-y += ipt_MARK.h
 header-y += ipt_multiport.h
-header-y += ipt_NFQUEUE.h
 header-y += ipt_owner.h
 header-y += ipt_physdev.h
 header-y += ipt_pkttype.h
 header-y += ipt_policy.h
 header-y += ipt_realm.h
 header-y += ipt_recent.h
-header-y += ipt_REJECT.h
-header-y += ipt_SAME.h
 header-y += ipt_sctp.h
 header-y += ipt_state.h
 header-y += ipt_string.h
 header-y += ipt_tcpmss.h
-header-y += ipt_TCPMSS.h
 header-y += ipt_tos.h
-header-y += ipt_TOS.h
 header-y += ipt_ttl.h
-header-y += ipt_TTL.h
-header-y += ipt_ULOG.h
 
 unifdef-y += ip_queue.h
 unifdef-y += ip_tables.h
diff --git a/include/linux/netfilter_ipv6/Kbuild b/include/linux/netfilter_ipv6/Kbuild
index 9dd978d149ff..8887a5fcd1d0 100644
--- a/include/linux/netfilter_ipv6/Kbuild
+++ b/include/linux/netfilter_ipv6/Kbuild
@@ -14,8 +14,8 @@ header-y += ip6t_mark.h
 header-y += ip6t_multiport.h
 header-y += ip6t_opts.h
 header-y += ip6t_owner.h
-header-y += ip6t_policy.h
 header-y += ip6t_physdev.h
+header-y += ip6t_policy.h
 header-y += ip6t_rt.h
 
 unifdef-y += ip6_tables.h
-- 
cgit v1.2.3


From 6a9fb9479f2672fa392711735de9e642395c9a14 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 5 Nov 2007 21:32:31 -0800
Subject: [IPV4]: Clean the ip_sockglue.c from some ugly ifdefs

The #idfed CONFIG_IP_MROUTE is sometimes places inside the if-s,
which looks completely bad. Similar ifdefs inside the functions
looks a bit better, but they are also not recommended to be used.

Provide an ifdef-ed ip_mroute_opt() helper to cleanup the code.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 7da2cee8e132..35a8277ec1bd 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -128,6 +128,18 @@ struct igmpmsg
 #ifdef __KERNEL__
 #include <net/sock.h>
 
+#ifdef CONFIG_IP_MROUTE
+static inline int ip_mroute_opt(int opt)
+{
+	return (opt >= MRT_BASE) && (opt <= MRT_BASE + 10);
+}
+#else
+static inline int ip_mroute_opt(int opt)
+{
+	return 0;
+}
+#endif
+
 extern int ip_mroute_setsockopt(struct sock *, int, char __user *, int);
 extern int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *);
 extern int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg);
-- 
cgit v1.2.3


From 286ab3d46058840d68e5d7d52e316c1f7e98c59f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 5 Nov 2007 23:38:39 -0800
Subject: [NET]: Define infrastructure to keep 'inuse' changes in an efficent
 SMP/NUMA way.

"struct proto" currently uses an array stats[NR_CPUS] to track change on
'inuse' sockets per protocol.

If NR_CPUS is big, this means we use a big memory area for this.
Moreover, all this memory area is located on a single node on NUMA
machines, increasing memory pressure on the boot node.

In this patch, I tried to :

- Keep a fast !CONFIG_SMP implementation
- Keep a fast CONFIG_SMP implementation for often used protocols
(tcp,udp,raw,...)
- Introduce a NUMA efficient implementation

Some helper macros are defined in include/net/sock.h
These macros take into account CONFIG_SMP

If a "struct proto" is declared without using DEFINE_PROTO_INUSE /
REF_PROTO_INUSE
macros, it will automatically use a default implementation, using a
dynamically allocated percpu zone.
This default implementation will be NUMA efficient, but might use 32/64
bytes per possible cpu
because of current alloc_percpu() implementation.
However it still should be better than previous implementation based on
stats[NR_CPUS] field.

When a "struct proto" is changed to use the new macros, we use a single
static "int" percpu variable,
lowering the memory and cpu costs, still preserving NUMA efficiency.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 57 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 20de3fa7ae40..5504fb9fa88a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -560,6 +560,14 @@ struct proto {
 	void			(*unhash)(struct sock *sk);
 	int			(*get_port)(struct sock *sk, unsigned short snum);
 
+#ifdef CONFIG_SMP
+	/* Keeping track of sockets in use */
+	void			(*inuse_add)(struct proto *prot, int inc);
+	int			(*inuse_getval)(const struct proto *prot);
+	int			*inuse_ptr;
+#else
+	int			inuse;
+#endif
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(void);
 	atomic_t		*memory_allocated;	/* Current allocated memory. */
@@ -592,12 +600,38 @@ struct proto {
 #ifdef SOCK_REFCNT_DEBUG
 	atomic_t		socks;
 #endif
-	struct {
-		int inuse;
-		u8  __pad[SMP_CACHE_BYTES - sizeof(int)];
-	} stats[NR_CPUS];
 };
 
+/*
+ * Special macros to let protos use a fast version of inuse{get|add}
+ * using a static percpu variable per proto instead of an allocated one,
+ * saving one dereference.
+ * This might be changed if/when dynamic percpu vars become fast.
+ */
+#ifdef CONFIG_SMP
+# define DEFINE_PROTO_INUSE(NAME)			\
+static DEFINE_PER_CPU(int, NAME##_inuse);		\
+static void NAME##_inuse_add(struct proto *prot, int inc)	\
+{							\
+	__get_cpu_var(NAME##_inuse) += inc;		\
+}							\
+							\
+static int NAME##_inuse_getval(const struct proto *prot)\
+{							\
+	int res = 0, cpu;				\
+							\
+	for_each_possible_cpu(cpu)			\
+		res += per_cpu(NAME##_inuse, cpu);	\
+	return res;					\
+}
+# define REF_PROTO_INUSE(NAME)				\
+	.inuse_add = NAME##_inuse_add,			\
+	.inuse_getval = NAME##_inuse_getval,
+#else
+# define DEFINE_PROTO_INUSE(NAME)
+# define REF_PROTO_INUSE(NAME)
+#endif
+
 extern int proto_register(struct proto *prot, int alloc_slab);
 extern void proto_unregister(struct proto *prot);
 
@@ -629,12 +663,29 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 /* Called with local bh disabled */
 static __inline__ void sock_prot_inc_use(struct proto *prot)
 {
-	prot->stats[smp_processor_id()].inuse++;
+#ifdef CONFIG_SMP
+	prot->inuse_add(prot, 1);
+#else
+	prot->inuse++;
+#endif
 }
 
 static __inline__ void sock_prot_dec_use(struct proto *prot)
 {
-	prot->stats[smp_processor_id()].inuse--;
+#ifdef CONFIG_SMP
+	prot->inuse_add(prot, -1);
+#else
+	prot->inuse--;
+#endif
+}
+
+static __inline__ int sock_prot_inuse(struct proto *proto)
+{
+#ifdef CONFIG_SMP
+	return proto->inuse_getval(proto);
+#else
+	return proto->inuse;
+#endif
 }
 
 /* With per-bucket locks this operation is not-atomic, so that
-- 
cgit v1.2.3


From 44656ba1286d82b5a5f8817eb2e4ea744143c3ca Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Wed, 7 Nov 2007 04:10:52 -0800
Subject: [NET]: Kill proc_net_create()

There are no more users.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/proc_fs.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 1ff461672060..1273c6ec535c 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -196,8 +196,6 @@ static inline struct proc_dir_entry *create_proc_info_entry(const char *name,
 	return res;
 }
 
-extern struct proc_dir_entry *proc_net_create(struct net *net,
-	const char *name, mode_t mode, get_info_t *get_info);
 extern struct proc_dir_entry *proc_net_fops_create(struct net *net,
 	const char *name, mode_t mode, const struct file_operations *fops);
 extern void proc_net_remove(struct net *net, const char *name);
@@ -208,7 +206,6 @@ extern void proc_net_remove(struct net *net, const char *name);
 #define proc_bus NULL
 
 #define proc_net_fops_create(net, name, mode, fops)  ({ (void)(mode), NULL; })
-#define proc_net_create(net, name, mode, info)	({ (void)(mode), NULL; })
 static inline void proc_net_remove(struct net *net, const char *name) {}
 
 static inline void proc_flush_task(struct task_struct *task)
-- 
cgit v1.2.3


From c3e9a353d8fc64a82ab11a07e21902e25e1e96d1 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Tue, 6 Nov 2007 23:34:04 -0800
Subject: [IPV4]: Compact some ifdefs in the fib code.

There are places that check for CONFIG_IP_MULTIPLE_TABLES
twice in the same file, but the internals of these #ifdefs
can be merged.

As a side effect - remove one ifdef from inside a function.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 8cadc77c7df4..ed514bfb61ba 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -185,6 +185,12 @@ static inline void fib_select_default(const struct flowi *flp, struct fib_result
 }
 
 #else /* CONFIG_IP_MULTIPLE_TABLES */
+extern void __init fib4_rules_init(void);
+
+#ifdef CONFIG_NET_CLS_ROUTE
+extern u32 fib_rules_tclass(struct fib_result *res);
+#endif
+
 #define ip_fib_local_table fib_get_table(RT_TABLE_LOCAL)
 #define ip_fib_main_table fib_get_table(RT_TABLE_MAIN)
 
@@ -214,15 +220,6 @@ extern __be32  __fib_res_prefsrc(struct fib_result *res);
 /* Exported by fib_hash.c */
 extern struct fib_table *fib_hash_init(u32 id);
 
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-extern void __init fib4_rules_init(void);
-
-#ifdef CONFIG_NET_CLS_ROUTE
-extern u32 fib_rules_tclass(struct fib_result *res);
-#endif
-
-#endif
-
 static inline void fib_combine_itag(u32 *itag, struct fib_result *res)
 {
 #ifdef CONFIG_NET_CLS_ROUTE
-- 
cgit v1.2.3


From 0fc00e2440b717e19bab1ae0015f03936bdf7967 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Wed, 7 Nov 2007 01:24:56 -0800
Subject: [TTY]: Fix network driver interactions with TCGET/SET calls.

Dave Miller noted various cases where line disciplines for things like
ppp go poking around in termios themselves in ways that broke with the
new termios code. Rather than have them all learning about termios
internals provide proper methods for this

- tty_mode_ioctl()

	This handles all the terminal mode handling for speed/carrier
etc and none of the methods are ldisc dependant so they can be called
by any user

- tty_perform_flush()

	This extracts the flush functionality and enables pppd the ppp
layer to share it cleanly.

The existing n_tty_ioctl code is refactored in this patch to provide
the new functions and to call them itself appropriately. This patch
has no (intended) behaviour changes and simply prepares for the other
fixes.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tty.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 56164d7ba0ad..c555f5442bd7 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -332,7 +332,9 @@ extern void tty_ldisc_flush(struct tty_struct *tty);
 
 extern int tty_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 		     unsigned long arg);
-
+extern int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
+			unsigned int cmd, unsigned long arg);
+extern int tty_perform_flush(struct tty_struct *tty, unsigned long arg);
 extern dev_t tty_devnum(struct tty_struct *tty);
 extern void proc_clear_tty(struct task_struct *p);
 extern struct tty_struct *get_current_tty(void);
-- 
cgit v1.2.3


From 1e356f9cdfa885c78791d5d6e5d2baef22f01853 Mon Sep 17 00:00:00 2001
From: "Rumen G. Bogdanovski" <rumen@voicecho.com>
Date: Wed, 7 Nov 2007 02:35:54 -0800
Subject: [IPVS]: Bind connections on stanby if the destination exists

This patch fixes the problem with node overload on director fail-over.
Given the scenario: 2 nodes each accepting 3 connections at a time and 2
directors, director failover occurs when the nodes are fully loaded (6
connections to the cluster) in this case the new director will assign
another 6 connections to the cluster, If the same real servers exist
there.

The problem turned to be in not binding the inherited connections to
the real servers (destinations) on the backup director. Therefore:
"ipvsadm -l" reports 0 connections:
root@test2:~# ipvsadm -l
IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
  -> RemoteAddress:Port           Forward Weight ActiveConn InActConn
TCP  test2.local:5999 wlc
  -> node473.local:5999           Route   1000   0          0
  -> node484.local:5999           Route   1000   0          0

while "ipvs -lnc" is right
root@test2:~# ipvsadm -lnc
IPVS connection entries
pro expire state       source             virtual            destination
TCP 14:56  ESTABLISHED 192.168.0.10:39164 192.168.0.222:5999
192.168.0.51:5999
TCP 14:59  ESTABLISHED 192.168.0.10:39165 192.168.0.222:5999
192.168.0.52:5999

So the patch I am sending fixes the problem by binding the received
connections to the appropriate service on the backup director, if it
exists, else the connection will be handled the old way. So if the
master and the backup directors are synchronized in terms of real
services there will be no problem with server over-committing since
new connections will not be created on the nonexistent real services
on the backup. However if the service is created later on the backup,
the binding will be performed when the next connection update is
received. With this patch the inherited connections will show as
inactive on the backup:

root@test2:~# ipvsadm -l
IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
  -> RemoteAddress:Port           Forward Weight ActiveConn InActConn
TCP  test2.local:5999 wlc
  -> node473.local:5999           Route   1000   0          1
  -> node484.local:5999           Route   1000   0          1

rumen@test2:~$ cat /proc/net/ip_vs
IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
  -> RemoteAddress:Port Forward Weight ActiveConn InActConn
TCP  C0A800DE:176F wlc
  -> C0A80033:176F      Route   1000   0          1
  -> C0A80032:176F      Route   1000   0          1

Regards,
Rumen Bogdanovski

Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Rumen G. Bogdanovski <rumen@voicecho.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 41870564df8e..1fd1ee896f39 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -901,6 +901,10 @@ extern int ip_vs_use_count_inc(void);
 extern void ip_vs_use_count_dec(void);
 extern int ip_vs_control_init(void);
 extern void ip_vs_control_cleanup(void);
+extern struct ip_vs_dest *
+ip_vs_find_dest(__be32 daddr, __be16 dport,
+		 __be32 vaddr, __be16 vport, __u16 protocol);
+extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
 
 
 /*
-- 
cgit v1.2.3


From efac52762b1e3fe3035d29e82d8ee1aebc45e4a7 Mon Sep 17 00:00:00 2001
From: "Rumen G. Bogdanovski" <rumen@voicecho.com>
Date: Wed, 7 Nov 2007 02:36:55 -0800
Subject: [IPVS]: Synchronize closing of Connections

This patch makes the master daemon to sync the connection when it is about
to close.  This makes the connections on the backup to close or timeout
according their state.  Before the sync was performed only if the
connection is in ESTABLISHED state which always made the connections to
timeout in the hard coded 3 minutes. However the Andy Gospodarek's patch
([IPVS]: use proper timeout instead of fixed value) effectively did nothing
more than increasing this to 15 minutes (Established state timeout).  So
this patch makes use of proper timeout since it syncs the connections on
status changes to FIN_WAIT (2min timeout) and CLOSE (10sec timeout).
However if the backup misses CLOSE hopefully it did not miss FIN_WAIT.
Otherwise we will just have to wait for the ESTABLISHED state timeout. As
it is without this patch.  This way the number of the hanging connections
on the backup is kept to minimum. And very few of them will be left to
timeout with a long timeout.

This is important if we want to make use of the fix for the real server
overcommit on master/backup fail-over.

Signed-off-by: Rumen G. Bogdanovski <rumen@voicecho.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_vs.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 1fd1ee896f39..67ea2c0c0ab7 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -520,6 +520,10 @@ struct ip_vs_conn {
 	spinlock_t              lock;           /* lock for state transition */
 	volatile __u16          flags;          /* status flags */
 	volatile __u16          state;          /* state info */
+	volatile __u16          old_state;      /* old state, to be used for
+						 * state transition triggerd
+						 * synchronization
+						 */
 
 	/* Control members */
 	struct ip_vs_conn       *control;       /* Master control connection */
-- 
cgit v1.2.3


From 230140cffa7feae90ad50bf259db1fa07674f3a7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 7 Nov 2007 02:40:20 -0800
Subject: [INET]: Remove per bucket rwlock in tcp/dccp ehash table.

As done two years ago on IP route cache table (commit
22c047ccbc68fa8f3fa57f0e8f906479a062c426) , we can avoid using one
lock per hash bucket for the huge TCP/DCCP hash tables.

On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for
litle performance differences. (we hit a different cache line for the
rwlock, but then the bucket cache line have a better sharing factor
among cpus, since we dirty it less often). For netstat or ss commands
that want a full scan of hash table, we perform fewer memory accesses.

Using a 'small' table of hashed rwlocks should be more than enough to
provide correct SMP concurrency between different buckets, without
using too much memory. Sizing of this table depends on
num_possible_cpus() and various CONFIG settings.

This patch provides some locking abstraction that may ease a future
work using a different model for TCP/DCCP table.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h | 71 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 65 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4427dcd1e53a..8461cda37490 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -37,7 +37,6 @@
  * I'll experiment with dynamic table growth later.
  */
 struct inet_ehash_bucket {
-	rwlock_t	  lock;
 	struct hlist_head chain;
 	struct hlist_head twchain;
 };
@@ -100,6 +99,9 @@ struct inet_hashinfo {
 	 * TIME_WAIT sockets use a separate chain (twchain).
 	 */
 	struct inet_ehash_bucket	*ehash;
+	rwlock_t			*ehash_locks;
+	unsigned int			ehash_size;
+	unsigned int			ehash_locks_mask;
 
 	/* Ok, let's try this, I give up, we do need a local binding
 	 * TCP hash as well as the others for fast bind/connect.
@@ -107,7 +109,7 @@ struct inet_hashinfo {
 	struct inet_bind_hashbucket	*bhash;
 
 	unsigned int			bhash_size;
-	unsigned int			ehash_size;
+	/* Note : 4 bytes padding on 64 bit arches */
 
 	/* All sockets in TCP_LISTEN state will be in here.  This is the only
 	 * table where wildcard'd TCP sockets can exist.  Hash function here
@@ -134,6 +136,62 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket(
 	return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
 }
 
+static inline rwlock_t *inet_ehash_lockp(
+	struct inet_hashinfo *hashinfo,
+	unsigned int hash)
+{
+	return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask];
+}
+
+static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
+{
+	unsigned int i, size = 256;
+#if defined(CONFIG_PROVE_LOCKING)
+	unsigned int nr_pcpus = 2;
+#else
+	unsigned int nr_pcpus = num_possible_cpus();
+#endif
+	if (nr_pcpus >= 4)
+		size = 512;
+	if (nr_pcpus >= 8)
+		size = 1024;
+	if (nr_pcpus >= 16)
+		size = 2048;
+	if (nr_pcpus >= 32)
+		size = 4096;
+	if (sizeof(rwlock_t) != 0) {
+#ifdef CONFIG_NUMA
+		if (size * sizeof(rwlock_t) > PAGE_SIZE)
+			hashinfo->ehash_locks = vmalloc(size * sizeof(rwlock_t));
+		else
+#endif
+		hashinfo->ehash_locks =	kmalloc(size * sizeof(rwlock_t),
+						GFP_KERNEL);
+		if (!hashinfo->ehash_locks)
+			return ENOMEM;
+		for (i = 0; i < size; i++)
+			rwlock_init(&hashinfo->ehash_locks[i]);
+	}
+	hashinfo->ehash_locks_mask = size - 1;
+	return 0;
+}
+
+static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
+{
+	if (hashinfo->ehash_locks) {
+#ifdef CONFIG_NUMA
+		unsigned int size = (hashinfo->ehash_locks_mask + 1) *
+							sizeof(rwlock_t);
+		if (size > PAGE_SIZE)
+			vfree(hashinfo->ehash_locks);
+		else
+#else
+		kfree(hashinfo->ehash_locks);
+#endif
+		hashinfo->ehash_locks = NULL;
+	}
+}
+
 extern struct inet_bind_bucket *
 		    inet_bind_bucket_create(struct kmem_cache *cachep,
 					    struct inet_bind_hashbucket *head,
@@ -222,7 +280,7 @@ static inline void __inet_hash(struct inet_hashinfo *hashinfo,
 		sk->sk_hash = inet_sk_ehashfn(sk);
 		head = inet_ehash_bucket(hashinfo, sk->sk_hash);
 		list = &head->chain;
-		lock = &head->lock;
+		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 		write_lock(lock);
 	}
 	__sk_add_node(sk, list);
@@ -253,7 +311,7 @@ static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk)
 		inet_listen_wlock(hashinfo);
 		lock = &hashinfo->lhash_lock;
 	} else {
-		lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
+		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 		write_lock_bh(lock);
 	}
 
@@ -354,9 +412,10 @@ static inline struct sock *
 	 */
 	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
 
 	prefetch(head->chain.first);
-	read_lock(&head->lock);
+	read_lock(lock);
 	sk_for_each(sk, node, &head->chain) {
 		if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
 			goto hit; /* You sunk my battleship! */
@@ -369,7 +428,7 @@ static inline struct sock *
 	}
 	sk = NULL;
 out:
-	read_unlock(&head->lock);
+	read_unlock(lock);
 	return sk;
 hit:
 	sock_hold(sk);
-- 
cgit v1.2.3


From c3d8d1e30cace31fed6186a4b8c6b1401836d89c Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 7 Nov 2007 02:42:09 -0800
Subject: [NETLINK]: Fix unicast timeouts

Commit ed6dcf4a in the history.git tree broke netlink_unicast timeouts
by moving the schedule_timeout() call to a new function that doesn't
propagate the remaining timeout back to the caller. This means on each
retry we start with the full timeout again.

ipc/mqueue.c seems to actually want to wait indefinitely so this
behaviour is retained.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 7c1f3b1d2ee5..d5bfaba595c7 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -192,7 +192,7 @@ extern int netlink_unregister_notifier(struct notifier_block *nb);
 /* finegrained unicast helpers: */
 struct sock *netlink_getsockbyfilp(struct file *filp);
 int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
-		long timeo, struct sock *ssk);
+		      long *timeo, struct sock *ssk);
 void netlink_detachskb(struct sock *sk, struct sk_buff *skb);
 int netlink_sendskb(struct sock *sk, struct sk_buff *skb);
 
-- 
cgit v1.2.3