From b98e1747eecc19b872572c5fffedc1868531dac6 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 5 Nov 2007 20:42:16 -0800 Subject: [NETFILTER]: Sort matches/targets in Kbuild file Sort matches and targets in the Kbuild file. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/Kbuild | 18 +++++++++--------- include/linux/netfilter_ipv4/Kbuild | 28 ++++++++++++++-------------- include/linux/netfilter_ipv6/Kbuild | 2 +- 3 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index f2eaea2234ec..b87e83a5e070 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -4,25 +4,28 @@ header-y += nfnetlink_conntrack.h header-y += nfnetlink_log.h header-y += nfnetlink_queue.h header-y += xt_CLASSIFY.h +header-y += xt_CONNMARK.h +header-y += xt_CONNSECMARK.h +header-y += xt_DSCP.h +header-y += xt_MARK.h +header-y += xt_NFLOG.h +header-y += xt_NFQUEUE.h +header-y += xt_SECMARK.h +header-y += xt_TCPMSS.h header-y += xt_comment.h header-y += xt_connbytes.h header-y += xt_connmark.h -header-y += xt_CONNMARK.h header-y += xt_conntrack.h header-y += xt_dccp.h header-y += xt_dscp.h -header-y += xt_DSCP.h header-y += xt_esp.h -header-y += xt_helper.h header-y += xt_hashlimit.h +header-y += xt_helper.h header-y += xt_length.h header-y += xt_limit.h header-y += xt_mac.h header-y += xt_mark.h -header-y += xt_MARK.h header-y += xt_multiport.h -header-y += xt_NFQUEUE.h -header-y += xt_NFLOG.h header-y += xt_pkttype.h header-y += xt_policy.h header-y += xt_realm.h @@ -32,9 +35,6 @@ header-y += xt_statistic.h header-y += xt_string.h header-y += xt_tcpmss.h header-y += xt_tcpudp.h -header-y += xt_SECMARK.h -header-y += xt_CONNSECMARK.h -header-y += xt_TCPMSS.h unifdef-y += nf_conntrack_common.h unifdef-y += nf_conntrack_ftp.h diff --git a/include/linux/netfilter_ipv4/Kbuild b/include/linux/netfilter_ipv4/Kbuild index 7185792b900f..3a7105bb8f33 100644 --- a/include/linux/netfilter_ipv4/Kbuild +++ b/include/linux/netfilter_ipv4/Kbuild @@ -1,47 +1,47 @@ -header-y += ipt_addrtype.h -header-y += ipt_ah.h header-y += ipt_CLASSIFY.h header-y += ipt_CLUSTERIP.h +header-y += ipt_CONNMARK.h +header-y += ipt_DSCP.h +header-y += ipt_ECN.h +header-y += ipt_LOG.h +header-y += ipt_MARK.h +header-y += ipt_NFQUEUE.h +header-y += ipt_REJECT.h +header-y += ipt_SAME.h +header-y += ipt_TCPMSS.h +header-y += ipt_TOS.h +header-y += ipt_TTL.h +header-y += ipt_ULOG.h +header-y += ipt_addrtype.h +header-y += ipt_ah.h header-y += ipt_comment.h header-y += ipt_connbytes.h header-y += ipt_connmark.h -header-y += ipt_CONNMARK.h header-y += ipt_conntrack.h header-y += ipt_dccp.h header-y += ipt_dscp.h -header-y += ipt_DSCP.h header-y += ipt_ecn.h -header-y += ipt_ECN.h header-y += ipt_esp.h header-y += ipt_hashlimit.h header-y += ipt_helper.h header-y += ipt_iprange.h header-y += ipt_length.h header-y += ipt_limit.h -header-y += ipt_LOG.h header-y += ipt_mac.h header-y += ipt_mark.h -header-y += ipt_MARK.h header-y += ipt_multiport.h -header-y += ipt_NFQUEUE.h header-y += ipt_owner.h header-y += ipt_physdev.h header-y += ipt_pkttype.h header-y += ipt_policy.h header-y += ipt_realm.h header-y += ipt_recent.h -header-y += ipt_REJECT.h -header-y += ipt_SAME.h header-y += ipt_sctp.h header-y += ipt_state.h header-y += ipt_string.h header-y += ipt_tcpmss.h -header-y += ipt_TCPMSS.h header-y += ipt_tos.h -header-y += ipt_TOS.h header-y += ipt_ttl.h -header-y += ipt_TTL.h -header-y += ipt_ULOG.h unifdef-y += ip_queue.h unifdef-y += ip_tables.h diff --git a/include/linux/netfilter_ipv6/Kbuild b/include/linux/netfilter_ipv6/Kbuild index 9dd978d149ff..8887a5fcd1d0 100644 --- a/include/linux/netfilter_ipv6/Kbuild +++ b/include/linux/netfilter_ipv6/Kbuild @@ -14,8 +14,8 @@ header-y += ip6t_mark.h header-y += ip6t_multiport.h header-y += ip6t_opts.h header-y += ip6t_owner.h -header-y += ip6t_policy.h header-y += ip6t_physdev.h +header-y += ip6t_policy.h header-y += ip6t_rt.h unifdef-y += ip6_tables.h -- cgit v1.2.3 From 6a9fb9479f2672fa392711735de9e642395c9a14 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 5 Nov 2007 21:32:31 -0800 Subject: [IPV4]: Clean the ip_sockglue.c from some ugly ifdefs The #idfed CONFIG_IP_MROUTE is sometimes places inside the if-s, which looks completely bad. Similar ifdefs inside the functions looks a bit better, but they are also not recommended to be used. Provide an ifdef-ed ip_mroute_opt() helper to cleanup the code. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- include/linux/mroute.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 7da2cee8e132..35a8277ec1bd 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -128,6 +128,18 @@ struct igmpmsg #ifdef __KERNEL__ #include +#ifdef CONFIG_IP_MROUTE +static inline int ip_mroute_opt(int opt) +{ + return (opt >= MRT_BASE) && (opt <= MRT_BASE + 10); +} +#else +static inline int ip_mroute_opt(int opt) +{ + return 0; +} +#endif + extern int ip_mroute_setsockopt(struct sock *, int, char __user *, int); extern int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); extern int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); -- cgit v1.2.3 From 286ab3d46058840d68e5d7d52e316c1f7e98c59f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Nov 2007 23:38:39 -0800 Subject: [NET]: Define infrastructure to keep 'inuse' changes in an efficent SMP/NUMA way. "struct proto" currently uses an array stats[NR_CPUS] to track change on 'inuse' sockets per protocol. If NR_CPUS is big, this means we use a big memory area for this. Moreover, all this memory area is located on a single node on NUMA machines, increasing memory pressure on the boot node. In this patch, I tried to : - Keep a fast !CONFIG_SMP implementation - Keep a fast CONFIG_SMP implementation for often used protocols (tcp,udp,raw,...) - Introduce a NUMA efficient implementation Some helper macros are defined in include/net/sock.h These macros take into account CONFIG_SMP If a "struct proto" is declared without using DEFINE_PROTO_INUSE / REF_PROTO_INUSE macros, it will automatically use a default implementation, using a dynamically allocated percpu zone. This default implementation will be NUMA efficient, but might use 32/64 bytes per possible cpu because of current alloc_percpu() implementation. However it still should be better than previous implementation based on stats[NR_CPUS] field. When a "struct proto" is changed to use the new macros, we use a single static "int" percpu variable, lowering the memory and cpu costs, still preserving NUMA efficiency. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 20de3fa7ae40..5504fb9fa88a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -560,6 +560,14 @@ struct proto { void (*unhash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); +#ifdef CONFIG_SMP + /* Keeping track of sockets in use */ + void (*inuse_add)(struct proto *prot, int inc); + int (*inuse_getval)(const struct proto *prot); + int *inuse_ptr; +#else + int inuse; +#endif /* Memory pressure */ void (*enter_memory_pressure)(void); atomic_t *memory_allocated; /* Current allocated memory. */ @@ -592,12 +600,38 @@ struct proto { #ifdef SOCK_REFCNT_DEBUG atomic_t socks; #endif - struct { - int inuse; - u8 __pad[SMP_CACHE_BYTES - sizeof(int)]; - } stats[NR_CPUS]; }; +/* + * Special macros to let protos use a fast version of inuse{get|add} + * using a static percpu variable per proto instead of an allocated one, + * saving one dereference. + * This might be changed if/when dynamic percpu vars become fast. + */ +#ifdef CONFIG_SMP +# define DEFINE_PROTO_INUSE(NAME) \ +static DEFINE_PER_CPU(int, NAME##_inuse); \ +static void NAME##_inuse_add(struct proto *prot, int inc) \ +{ \ + __get_cpu_var(NAME##_inuse) += inc; \ +} \ + \ +static int NAME##_inuse_getval(const struct proto *prot)\ +{ \ + int res = 0, cpu; \ + \ + for_each_possible_cpu(cpu) \ + res += per_cpu(NAME##_inuse, cpu); \ + return res; \ +} +# define REF_PROTO_INUSE(NAME) \ + .inuse_add = NAME##_inuse_add, \ + .inuse_getval = NAME##_inuse_getval, +#else +# define DEFINE_PROTO_INUSE(NAME) +# define REF_PROTO_INUSE(NAME) +#endif + extern int proto_register(struct proto *prot, int alloc_slab); extern void proto_unregister(struct proto *prot); @@ -629,12 +663,29 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) /* Called with local bh disabled */ static __inline__ void sock_prot_inc_use(struct proto *prot) { - prot->stats[smp_processor_id()].inuse++; +#ifdef CONFIG_SMP + prot->inuse_add(prot, 1); +#else + prot->inuse++; +#endif } static __inline__ void sock_prot_dec_use(struct proto *prot) { - prot->stats[smp_processor_id()].inuse--; +#ifdef CONFIG_SMP + prot->inuse_add(prot, -1); +#else + prot->inuse--; +#endif +} + +static __inline__ int sock_prot_inuse(struct proto *proto) +{ +#ifdef CONFIG_SMP + return proto->inuse_getval(proto); +#else + return proto->inuse; +#endif } /* With per-bucket locks this operation is not-atomic, so that -- cgit v1.2.3 From 44656ba1286d82b5a5f8817eb2e4ea744143c3ca Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 7 Nov 2007 04:10:52 -0800 Subject: [NET]: Kill proc_net_create() There are no more users. Signed-off-by: David S. Miller --- include/linux/proc_fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 1ff461672060..1273c6ec535c 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -196,8 +196,6 @@ static inline struct proc_dir_entry *create_proc_info_entry(const char *name, return res; } -extern struct proc_dir_entry *proc_net_create(struct net *net, - const char *name, mode_t mode, get_info_t *get_info); extern struct proc_dir_entry *proc_net_fops_create(struct net *net, const char *name, mode_t mode, const struct file_operations *fops); extern void proc_net_remove(struct net *net, const char *name); @@ -208,7 +206,6 @@ extern void proc_net_remove(struct net *net, const char *name); #define proc_bus NULL #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) -#define proc_net_create(net, name, mode, info) ({ (void)(mode), NULL; }) static inline void proc_net_remove(struct net *net, const char *name) {} static inline void proc_flush_task(struct task_struct *task) -- cgit v1.2.3 From c3e9a353d8fc64a82ab11a07e21902e25e1e96d1 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 6 Nov 2007 23:34:04 -0800 Subject: [IPV4]: Compact some ifdefs in the fib code. There are places that check for CONFIG_IP_MULTIPLE_TABLES twice in the same file, but the internals of these #ifdefs can be merged. As a side effect - remove one ifdef from inside a function. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- include/net/ip_fib.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 8cadc77c7df4..ed514bfb61ba 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -185,6 +185,12 @@ static inline void fib_select_default(const struct flowi *flp, struct fib_result } #else /* CONFIG_IP_MULTIPLE_TABLES */ +extern void __init fib4_rules_init(void); + +#ifdef CONFIG_NET_CLS_ROUTE +extern u32 fib_rules_tclass(struct fib_result *res); +#endif + #define ip_fib_local_table fib_get_table(RT_TABLE_LOCAL) #define ip_fib_main_table fib_get_table(RT_TABLE_MAIN) @@ -214,15 +220,6 @@ extern __be32 __fib_res_prefsrc(struct fib_result *res); /* Exported by fib_hash.c */ extern struct fib_table *fib_hash_init(u32 id); -#ifdef CONFIG_IP_MULTIPLE_TABLES -extern void __init fib4_rules_init(void); - -#ifdef CONFIG_NET_CLS_ROUTE -extern u32 fib_rules_tclass(struct fib_result *res); -#endif - -#endif - static inline void fib_combine_itag(u32 *itag, struct fib_result *res) { #ifdef CONFIG_NET_CLS_ROUTE -- cgit v1.2.3 From 0fc00e2440b717e19bab1ae0015f03936bdf7967 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 7 Nov 2007 01:24:56 -0800 Subject: [TTY]: Fix network driver interactions with TCGET/SET calls. Dave Miller noted various cases where line disciplines for things like ppp go poking around in termios themselves in ways that broke with the new termios code. Rather than have them all learning about termios internals provide proper methods for this - tty_mode_ioctl() This handles all the terminal mode handling for speed/carrier etc and none of the methods are ldisc dependant so they can be called by any user - tty_perform_flush() This extracts the flush functionality and enables pppd the ppp layer to share it cleanly. The existing n_tty_ioctl code is refactored in this patch to provide the new functions and to call them itself appropriately. This patch has no (intended) behaviour changes and simply prepares for the other fixes. Signed-off-by: Alan Cox Signed-off-by: David S. Miller --- include/linux/tty.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tty.h b/include/linux/tty.h index 56164d7ba0ad..c555f5442bd7 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -332,7 +332,9 @@ extern void tty_ldisc_flush(struct tty_struct *tty); extern int tty_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); - +extern int tty_mode_ioctl(struct tty_struct *tty, struct file *file, + unsigned int cmd, unsigned long arg); +extern int tty_perform_flush(struct tty_struct *tty, unsigned long arg); extern dev_t tty_devnum(struct tty_struct *tty); extern void proc_clear_tty(struct task_struct *p); extern struct tty_struct *get_current_tty(void); -- cgit v1.2.3 From 1e356f9cdfa885c78791d5d6e5d2baef22f01853 Mon Sep 17 00:00:00 2001 From: "Rumen G. Bogdanovski" Date: Wed, 7 Nov 2007 02:35:54 -0800 Subject: [IPVS]: Bind connections on stanby if the destination exists This patch fixes the problem with node overload on director fail-over. Given the scenario: 2 nodes each accepting 3 connections at a time and 2 directors, director failover occurs when the nodes are fully loaded (6 connections to the cluster) in this case the new director will assign another 6 connections to the cluster, If the same real servers exist there. The problem turned to be in not binding the inherited connections to the real servers (destinations) on the backup director. Therefore: "ipvsadm -l" reports 0 connections: root@test2:~# ipvsadm -l IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP test2.local:5999 wlc -> node473.local:5999 Route 1000 0 0 -> node484.local:5999 Route 1000 0 0 while "ipvs -lnc" is right root@test2:~# ipvsadm -lnc IPVS connection entries pro expire state source virtual destination TCP 14:56 ESTABLISHED 192.168.0.10:39164 192.168.0.222:5999 192.168.0.51:5999 TCP 14:59 ESTABLISHED 192.168.0.10:39165 192.168.0.222:5999 192.168.0.52:5999 So the patch I am sending fixes the problem by binding the received connections to the appropriate service on the backup director, if it exists, else the connection will be handled the old way. So if the master and the backup directors are synchronized in terms of real services there will be no problem with server over-committing since new connections will not be created on the nonexistent real services on the backup. However if the service is created later on the backup, the binding will be performed when the next connection update is received. With this patch the inherited connections will show as inactive on the backup: root@test2:~# ipvsadm -l IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP test2.local:5999 wlc -> node473.local:5999 Route 1000 0 1 -> node484.local:5999 Route 1000 0 1 rumen@test2:~$ cat /proc/net/ip_vs IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP C0A800DE:176F wlc -> C0A80033:176F Route 1000 0 1 -> C0A80032:176F Route 1000 0 1 Regards, Rumen Bogdanovski Acked-by: Julian Anastasov Signed-off-by: Rumen G. Bogdanovski Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 41870564df8e..1fd1ee896f39 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -901,6 +901,10 @@ extern int ip_vs_use_count_inc(void); extern void ip_vs_use_count_dec(void); extern int ip_vs_control_init(void); extern void ip_vs_control_cleanup(void); +extern struct ip_vs_dest * +ip_vs_find_dest(__be32 daddr, __be16 dport, + __be32 vaddr, __be16 vport, __u16 protocol); +extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp); /* -- cgit v1.2.3 From efac52762b1e3fe3035d29e82d8ee1aebc45e4a7 Mon Sep 17 00:00:00 2001 From: "Rumen G. Bogdanovski" Date: Wed, 7 Nov 2007 02:36:55 -0800 Subject: [IPVS]: Synchronize closing of Connections This patch makes the master daemon to sync the connection when it is about to close. This makes the connections on the backup to close or timeout according their state. Before the sync was performed only if the connection is in ESTABLISHED state which always made the connections to timeout in the hard coded 3 minutes. However the Andy Gospodarek's patch ([IPVS]: use proper timeout instead of fixed value) effectively did nothing more than increasing this to 15 minutes (Established state timeout). So this patch makes use of proper timeout since it syncs the connections on status changes to FIN_WAIT (2min timeout) and CLOSE (10sec timeout). However if the backup misses CLOSE hopefully it did not miss FIN_WAIT. Otherwise we will just have to wait for the ESTABLISHED state timeout. As it is without this patch. This way the number of the hanging connections on the backup is kept to minimum. And very few of them will be left to timeout with a long timeout. This is important if we want to make use of the fix for the real server overcommit on master/backup fail-over. Signed-off-by: Rumen G. Bogdanovski Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- include/net/ip_vs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 1fd1ee896f39..67ea2c0c0ab7 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -520,6 +520,10 @@ struct ip_vs_conn { spinlock_t lock; /* lock for state transition */ volatile __u16 flags; /* status flags */ volatile __u16 state; /* state info */ + volatile __u16 old_state; /* old state, to be used for + * state transition triggerd + * synchronization + */ /* Control members */ struct ip_vs_conn *control; /* Master control connection */ -- cgit v1.2.3 From 230140cffa7feae90ad50bf259db1fa07674f3a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Nov 2007 02:40:20 -0800 Subject: [INET]: Remove per bucket rwlock in tcp/dccp ehash table. As done two years ago on IP route cache table (commit 22c047ccbc68fa8f3fa57f0e8f906479a062c426) , we can avoid using one lock per hash bucket for the huge TCP/DCCP hash tables. On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for litle performance differences. (we hit a different cache line for the rwlock, but then the bucket cache line have a better sharing factor among cpus, since we dirty it less often). For netstat or ss commands that want a full scan of hash table, we perform fewer memory accesses. Using a 'small' table of hashed rwlocks should be more than enough to provide correct SMP concurrency between different buckets, without using too much memory. Sizing of this table depends on num_possible_cpus() and various CONFIG settings. This patch provides some locking abstraction that may ease a future work using a different model for TCP/DCCP table. Signed-off-by: Eric Dumazet Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 71 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 4427dcd1e53a..8461cda37490 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -37,7 +37,6 @@ * I'll experiment with dynamic table growth later. */ struct inet_ehash_bucket { - rwlock_t lock; struct hlist_head chain; struct hlist_head twchain; }; @@ -100,6 +99,9 @@ struct inet_hashinfo { * TIME_WAIT sockets use a separate chain (twchain). */ struct inet_ehash_bucket *ehash; + rwlock_t *ehash_locks; + unsigned int ehash_size; + unsigned int ehash_locks_mask; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. @@ -107,7 +109,7 @@ struct inet_hashinfo { struct inet_bind_hashbucket *bhash; unsigned int bhash_size; - unsigned int ehash_size; + /* Note : 4 bytes padding on 64 bit arches */ /* All sockets in TCP_LISTEN state will be in here. This is the only * table where wildcard'd TCP sockets can exist. Hash function here @@ -134,6 +136,62 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket( return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)]; } +static inline rwlock_t *inet_ehash_lockp( + struct inet_hashinfo *hashinfo, + unsigned int hash) +{ + return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; +} + +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) +{ + unsigned int i, size = 256; +#if defined(CONFIG_PROVE_LOCKING) + unsigned int nr_pcpus = 2; +#else + unsigned int nr_pcpus = num_possible_cpus(); +#endif + if (nr_pcpus >= 4) + size = 512; + if (nr_pcpus >= 8) + size = 1024; + if (nr_pcpus >= 16) + size = 2048; + if (nr_pcpus >= 32) + size = 4096; + if (sizeof(rwlock_t) != 0) { +#ifdef CONFIG_NUMA + if (size * sizeof(rwlock_t) > PAGE_SIZE) + hashinfo->ehash_locks = vmalloc(size * sizeof(rwlock_t)); + else +#endif + hashinfo->ehash_locks = kmalloc(size * sizeof(rwlock_t), + GFP_KERNEL); + if (!hashinfo->ehash_locks) + return ENOMEM; + for (i = 0; i < size; i++) + rwlock_init(&hashinfo->ehash_locks[i]); + } + hashinfo->ehash_locks_mask = size - 1; + return 0; +} + +static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) +{ + if (hashinfo->ehash_locks) { +#ifdef CONFIG_NUMA + unsigned int size = (hashinfo->ehash_locks_mask + 1) * + sizeof(rwlock_t); + if (size > PAGE_SIZE) + vfree(hashinfo->ehash_locks); + else +#else + kfree(hashinfo->ehash_locks); +#endif + hashinfo->ehash_locks = NULL; + } +} + extern struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct inet_bind_hashbucket *head, @@ -222,7 +280,7 @@ static inline void __inet_hash(struct inet_hashinfo *hashinfo, sk->sk_hash = inet_sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; - lock = &head->lock; + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); write_lock(lock); } __sk_add_node(sk, list); @@ -253,7 +311,7 @@ static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk) inet_listen_wlock(hashinfo); lock = &hashinfo->lhash_lock; } else { - lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock; + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); write_lock_bh(lock); } @@ -354,9 +412,10 @@ static inline struct sock * */ unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); prefetch(head->chain.first); - read_lock(&head->lock); + read_lock(lock); sk_for_each(sk, node, &head->chain) { if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) goto hit; /* You sunk my battleship! */ @@ -369,7 +428,7 @@ static inline struct sock * } sk = NULL; out: - read_unlock(&head->lock); + read_unlock(lock); return sk; hit: sock_hold(sk); -- cgit v1.2.3 From c3d8d1e30cace31fed6186a4b8c6b1401836d89c Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 7 Nov 2007 02:42:09 -0800 Subject: [NETLINK]: Fix unicast timeouts Commit ed6dcf4a in the history.git tree broke netlink_unicast timeouts by moving the schedule_timeout() call to a new function that doesn't propagate the remaining timeout back to the caller. This means on each retry we start with the full timeout again. ipc/mqueue.c seems to actually want to wait indefinitely so this behaviour is retained. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 7c1f3b1d2ee5..d5bfaba595c7 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -192,7 +192,7 @@ extern int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ struct sock *netlink_getsockbyfilp(struct file *filp); int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, - long timeo, struct sock *ssk); + long *timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); int netlink_sendskb(struct sock *sk, struct sk_buff *skb); -- cgit v1.2.3