diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-19 10:05:34 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-19 10:05:34 -0700 |
commit | 1200b6809dfd9d73bc4c7db76d288c35fa4b2ebe (patch) | |
tree | 552e03de245cdbd0780ca1215914edc4a26540f7 /net/ipv4/inet_hashtables.c | |
parent | 6b5f04b6cf8ebab9a65d9c0026c650bb2538fd0f (diff) | |
parent | fe30937b65354c7fec244caebbdaae68e28ca797 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
"Highlights:
1) Support more Realtek wireless chips, from Jes Sorenson.
2) New BPF types for per-cpu hash and arrap maps, from Alexei
Starovoitov.
3) Make several TCP sysctls per-namespace, from Nikolay Borisov.
4) Allow the use of SO_REUSEPORT in order to do per-thread processing
of incoming TCP/UDP connections. The muxing can be done using a
BPF program which hashes the incoming packet. From Craig Gallek.
5) Add a multiplexer for TCP streams, to provide a messaged based
interface. BPF programs can be used to determine the message
boundaries. From Tom Herbert.
6) Add 802.1AE MACSEC support, from Sabrina Dubroca.
7) Avoid factorial complexity when taking down an inetdev interface
with lots of configured addresses. We were doing things like
traversing the entire address less for each address removed, and
flushing the entire netfilter conntrack table for every address as
well.
8) Add and use SKB bulk free infrastructure, from Jesper Brouer.
9) Allow offloading u32 classifiers to hardware, and implement for
ixgbe, from John Fastabend.
10) Allow configuring IRQ coalescing parameters on a per-queue basis,
from Kan Liang.
11) Extend ethtool so that larger link mode masks can be supported.
From David Decotigny.
12) Introduce devlink, which can be used to configure port link types
(ethernet vs Infiniband, etc.), port splitting, and switch device
level attributes as a whole. From Jiri Pirko.
13) Hardware offload support for flower classifiers, from Amir Vadai.
14) Add "Local Checksum Offload". Basically, for a tunneled packet
the checksum of the outer header is 'constant' (because with the
checksum field filled into the inner protocol header, the payload
of the outer frame checksums to 'zero'), and we can take advantage
of that in various ways. From Edward Cree"
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1548 commits)
bonding: fix bond_get_stats()
net: bcmgenet: fix dma api length mismatch
net/mlx4_core: Fix backward compatibility on VFs
phy: mdio-thunder: Fix some Kconfig typos
lan78xx: add ndo_get_stats64
lan78xx: handle statistics counter rollover
RDS: TCP: Remove unused constant
RDS: TCP: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
net: smc911x: convert pxa dma to dmaengine
team: remove duplicate set of flag IFF_MULTICAST
bonding: remove duplicate set of flag IFF_MULTICAST
net: fix a comment typo
ethernet: micrel: fix some error codes
ip_tunnels, bpf: define IP_TUNNEL_OPTS_MAX and use it
bpf, dst: add and use dst_tclassid helper
bpf: make skb->tc_classid also readable
net: mvneta: bm: clarify dependencies
cls_bpf: reset class and reuse major in da
ldmvsw: Checkpatch sunvnet.c and sunvnet_common.c
ldmvsw: Add ldmvsw.c driver code
...
Diffstat (limited to 'net/ipv4/inet_hashtables.c')
-rw-r--r-- | net/ipv4/inet_hashtables.c | 237 |
1 files changed, 148 insertions, 89 deletions
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ccc5980797fc..bc68eced0105 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,10 +20,12 @@ #include <linux/wait.h> #include <linux/vmalloc.h> +#include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> +#include <net/sock_reuseport.h> static u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, @@ -205,6 +207,7 @@ static inline int compute_score(struct sock *sk, struct net *net, struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif) @@ -214,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore, matches = 0, reuseport = 0; + bool select_ok = true; u32 phash = 0; rcu_read_lock(); @@ -229,6 +233,15 @@ begin: if (reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + if (select_ok) { + struct sock *sk2; + sk2 = reuseport_select_sock(sk, phash, + skb, doff); + if (sk2) { + result = sk2; + goto found; + } + } matches = 1; } } else if (score == hiscore && reuseport) { @@ -246,11 +259,13 @@ begin: if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, hnum, daddr, dif) < hiscore)) { sock_put(result); + select_ok = false; goto begin; } } @@ -449,32 +464,74 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) } EXPORT_SYMBOL_GPL(inet_ehash_nolisten); -void __inet_hash(struct sock *sk, struct sock *osk) +static int inet_reuseport_add_sock(struct sock *sk, + struct inet_listen_hashbucket *ilb, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) +{ + struct sock *sk2; + struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); + + sk_nulls_for_each_rcu(sk2, node, &ilb->head) { + if (sk2 != sk && + sk2->sk_family == sk->sk_family && + ipv6_only_sock(sk2) == ipv6_only_sock(sk) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if && + sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + saddr_same(sk, sk2, false)) + return reuseport_add_sock(sk, sk2); + } + + /* Initial allocation may have already happened via setsockopt */ + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return reuseport_alloc(sk); + return 0; +} + +int __inet_hash(struct sock *sk, struct sock *osk, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; + int err = 0; if (sk->sk_state != TCP_LISTEN) { inet_ehash_nolisten(sk, osk); - return; + return 0; } WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; spin_lock(&ilb->lock); + if (sk->sk_reuseport) { + err = inet_reuseport_add_sock(sk, ilb, saddr_same); + if (err) + goto unlock; + } __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); +unlock: spin_unlock(&ilb->lock); + + return err; } EXPORT_SYMBOL(__inet_hash); -void inet_hash(struct sock *sk) +int inet_hash(struct sock *sk) { + int err = 0; + if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - __inet_hash(sk, NULL); + err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal); local_bh_enable(); } + + return err; } EXPORT_SYMBOL_GPL(inet_hash); @@ -493,6 +550,8 @@ void inet_unhash(struct sock *sk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); @@ -506,106 +565,106 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; - const unsigned short snum = inet_sk(sk)->inet_num; + struct inet_timewait_sock *tw = NULL; struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; + int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); + struct inet_bind_bucket *tb; + u32 remaining, offset; + int ret, i, low, high; + static u32 hint; + + if (port) { + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + inet_ehash_nolisten(sk, NULL); + spin_unlock_bh(&head->lock); + return 0; + } + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = check_established(death_row, sk, port, NULL); + local_bh_enable(); + return ret; + } - if (!snum) { - int i, remaining, low, high, port; - static u32 hint; - u32 offset = hint + port_offset; - struct inet_timewait_sock *tw = NULL; + inet_get_local_port_range(net, &low, &high); + high++; /* [32768, 60999] -> [32768, 61000[ */ + remaining = high - low; + if (likely(remaining > 1)) + remaining &= ~1U; - inet_get_local_port_range(net, &low, &high); - remaining = (high - low) + 1; + offset = (hint + port_offset) % remaining; + /* In first pass we try ports of @low parity. + * inet_csk_get_port() does the opposite choice. + */ + offset &= ~1U; +other_parity_scan: + port = low + offset; + for (i = 0; i < remaining; i += 2, port += 2) { + if (unlikely(port >= high)) + port -= remaining; + if (inet_is_local_reserved_port(net, port)) + continue; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); - /* By starting with offset being an even number, - * we tend to leave about 50% of ports for other uses, - * like bind(0). + /* Does not bother with rcv_saddr checks, because + * the established check is already unique enough. */ - offset &= ~1; - - local_bh_disable(); - for (i = 0; i < remaining; i++) { - port = low + (i + offset) % remaining; - if (inet_is_local_reserved_port(net, port)) - continue; - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, &head->chain) { - if (net_eq(ib_net(tb), net) && - tb->port == port) { - if (tb->fastreuse >= 0 || - tb->fastreuseport >= 0) - goto next_port; - WARN_ON(hlist_empty(&tb->owners)); - if (!check_established(death_row, sk, - port, &tw)) - goto ok; + inet_bind_bucket_for_each(tb, &head->chain) { + if (net_eq(ib_net(tb), net) && tb->port == port) { + if (tb->fastreuse >= 0 || + tb->fastreuseport >= 0) goto next_port; - } + WARN_ON(hlist_empty(&tb->owners)); + if (!check_established(death_row, sk, + port, &tw)) + goto ok; + goto next_port; } - - tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - tb->fastreuseport = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); } - local_bh_enable(); - - return -EADDRNOTAVAIL; -ok: - hint += (i + 2) & ~1; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->inet_sport = htons(port); - inet_ehash_nolisten(sk, (struct sock *)tw); + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port); + if (!tb) { + spin_unlock_bh(&head->lock); + return -ENOMEM; } - if (tw) - inet_twsk_bind_unhash(tw, hinfo); - spin_unlock(&head->lock); + tb->fastreuse = -1; + tb->fastreuseport = -1; + goto ok; +next_port: + spin_unlock_bh(&head->lock); + cond_resched(); + } - if (tw) - inet_twsk_deschedule_put(tw); + offset++; + if ((offset & 1) && remaining > 1) + goto other_parity_scan; - ret = 0; - goto out; - } + return -EADDRNOTAVAIL; - head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = check_established(death_row, sk, snum, NULL); -out: - local_bh_enable(); - return ret; +ok: + hint += i + 2; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->inet_sport = htons(port); + inet_ehash_nolisten(sk, (struct sock *)tw); } + if (tw) + inet_twsk_bind_unhash(tw, hinfo); + spin_unlock(&head->lock); + if (tw) + inet_twsk_deschedule_put(tw); + local_bh_enable(); + return 0; } /* |