summaryrefslogtreecommitdiff
path: root/net/ipv4/inet_hashtables.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-03-19 10:05:34 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2016-03-19 10:05:34 -0700
commit1200b6809dfd9d73bc4c7db76d288c35fa4b2ebe (patch)
tree552e03de245cdbd0780ca1215914edc4a26540f7 /net/ipv4/inet_hashtables.c
parent6b5f04b6cf8ebab9a65d9c0026c650bb2538fd0f (diff)
parentfe30937b65354c7fec244caebbdaae68e28ca797 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Highlights: 1) Support more Realtek wireless chips, from Jes Sorenson. 2) New BPF types for per-cpu hash and arrap maps, from Alexei Starovoitov. 3) Make several TCP sysctls per-namespace, from Nikolay Borisov. 4) Allow the use of SO_REUSEPORT in order to do per-thread processing of incoming TCP/UDP connections. The muxing can be done using a BPF program which hashes the incoming packet. From Craig Gallek. 5) Add a multiplexer for TCP streams, to provide a messaged based interface. BPF programs can be used to determine the message boundaries. From Tom Herbert. 6) Add 802.1AE MACSEC support, from Sabrina Dubroca. 7) Avoid factorial complexity when taking down an inetdev interface with lots of configured addresses. We were doing things like traversing the entire address less for each address removed, and flushing the entire netfilter conntrack table for every address as well. 8) Add and use SKB bulk free infrastructure, from Jesper Brouer. 9) Allow offloading u32 classifiers to hardware, and implement for ixgbe, from John Fastabend. 10) Allow configuring IRQ coalescing parameters on a per-queue basis, from Kan Liang. 11) Extend ethtool so that larger link mode masks can be supported. From David Decotigny. 12) Introduce devlink, which can be used to configure port link types (ethernet vs Infiniband, etc.), port splitting, and switch device level attributes as a whole. From Jiri Pirko. 13) Hardware offload support for flower classifiers, from Amir Vadai. 14) Add "Local Checksum Offload". Basically, for a tunneled packet the checksum of the outer header is 'constant' (because with the checksum field filled into the inner protocol header, the payload of the outer frame checksums to 'zero'), and we can take advantage of that in various ways. From Edward Cree" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1548 commits) bonding: fix bond_get_stats() net: bcmgenet: fix dma api length mismatch net/mlx4_core: Fix backward compatibility on VFs phy: mdio-thunder: Fix some Kconfig typos lan78xx: add ndo_get_stats64 lan78xx: handle statistics counter rollover RDS: TCP: Remove unused constant RDS: TCP: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket net: smc911x: convert pxa dma to dmaengine team: remove duplicate set of flag IFF_MULTICAST bonding: remove duplicate set of flag IFF_MULTICAST net: fix a comment typo ethernet: micrel: fix some error codes ip_tunnels, bpf: define IP_TUNNEL_OPTS_MAX and use it bpf, dst: add and use dst_tclassid helper bpf: make skb->tc_classid also readable net: mvneta: bm: clarify dependencies cls_bpf: reset class and reuse major in da ldmvsw: Checkpatch sunvnet.c and sunvnet_common.c ldmvsw: Add ldmvsw.c driver code ...
Diffstat (limited to 'net/ipv4/inet_hashtables.c')
-rw-r--r--net/ipv4/inet_hashtables.c237
1 files changed, 148 insertions, 89 deletions
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ccc5980797fc..bc68eced0105 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -20,10 +20,12 @@
#include <linux/wait.h>
#include <linux/vmalloc.h>
+#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
+#include <net/sock_reuseport.h>
static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
const __u16 lport, const __be32 faddr,
@@ -205,6 +207,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif)
@@ -214,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net,
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
int score, hiscore, matches = 0, reuseport = 0;
+ bool select_ok = true;
u32 phash = 0;
rcu_read_lock();
@@ -229,6 +233,15 @@ begin:
if (reuseport) {
phash = inet_ehashfn(net, daddr, hnum,
saddr, sport);
+ if (select_ok) {
+ struct sock *sk2;
+ sk2 = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (sk2) {
+ result = sk2;
+ goto found;
+ }
+ }
matches = 1;
}
} else if (score == hiscore && reuseport) {
@@ -246,11 +259,13 @@ begin:
if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
goto begin;
if (result) {
+found:
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
result = NULL;
else if (unlikely(compute_score(result, net, hnum, daddr,
dif) < hiscore)) {
sock_put(result);
+ select_ok = false;
goto begin;
}
}
@@ -449,32 +464,74 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
-void __inet_hash(struct sock *sk, struct sock *osk)
+static int inet_reuseport_add_sock(struct sock *sk,
+ struct inet_listen_hashbucket *ilb,
+ int (*saddr_same)(const struct sock *sk1,
+ const struct sock *sk2,
+ bool match_wildcard))
+{
+ struct sock *sk2;
+ struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
+
+ sk_nulls_for_each_rcu(sk2, node, &ilb->head) {
+ if (sk2 != sk &&
+ sk2->sk_family == sk->sk_family &&
+ ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
+ sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+ saddr_same(sk, sk2, false))
+ return reuseport_add_sock(sk, sk2);
+ }
+
+ /* Initial allocation may have already happened via setsockopt */
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return reuseport_alloc(sk);
+ return 0;
+}
+
+int __inet_hash(struct sock *sk, struct sock *osk,
+ int (*saddr_same)(const struct sock *sk1,
+ const struct sock *sk2,
+ bool match_wildcard))
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
+ int err = 0;
if (sk->sk_state != TCP_LISTEN) {
inet_ehash_nolisten(sk, osk);
- return;
+ return 0;
}
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
spin_lock(&ilb->lock);
+ if (sk->sk_reuseport) {
+ err = inet_reuseport_add_sock(sk, ilb, saddr_same);
+ if (err)
+ goto unlock;
+ }
__sk_nulls_add_node_rcu(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+unlock:
spin_unlock(&ilb->lock);
+
+ return err;
}
EXPORT_SYMBOL(__inet_hash);
-void inet_hash(struct sock *sk)
+int inet_hash(struct sock *sk)
{
+ int err = 0;
+
if (sk->sk_state != TCP_CLOSE) {
local_bh_disable();
- __inet_hash(sk, NULL);
+ err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
local_bh_enable();
}
+
+ return err;
}
EXPORT_SYMBOL_GPL(inet_hash);
@@ -493,6 +550,8 @@ void inet_unhash(struct sock *sk)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock_bh(lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
done = __sk_nulls_del_node_init_rcu(sk);
if (done)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
@@ -506,106 +565,106 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *, __u16, struct inet_timewait_sock **))
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
- const unsigned short snum = inet_sk(sk)->inet_num;
+ struct inet_timewait_sock *tw = NULL;
struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret;
+ int port = inet_sk(sk)->inet_num;
struct net *net = sock_net(sk);
+ struct inet_bind_bucket *tb;
+ u32 remaining, offset;
+ int ret, i, low, high;
+ static u32 hint;
+
+ if (port) {
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+ if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+ inet_ehash_nolisten(sk, NULL);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ }
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+ ret = check_established(death_row, sk, port, NULL);
+ local_bh_enable();
+ return ret;
+ }
- if (!snum) {
- int i, remaining, low, high, port;
- static u32 hint;
- u32 offset = hint + port_offset;
- struct inet_timewait_sock *tw = NULL;
+ inet_get_local_port_range(net, &low, &high);
+ high++; /* [32768, 60999] -> [32768, 61000[ */
+ remaining = high - low;
+ if (likely(remaining > 1))
+ remaining &= ~1U;
- inet_get_local_port_range(net, &low, &high);
- remaining = (high - low) + 1;
+ offset = (hint + port_offset) % remaining;
+ /* In first pass we try ports of @low parity.
+ * inet_csk_get_port() does the opposite choice.
+ */
+ offset &= ~1U;
+other_parity_scan:
+ port = low + offset;
+ for (i = 0; i < remaining; i += 2, port += 2) {
+ if (unlikely(port >= high))
+ port -= remaining;
+ if (inet_is_local_reserved_port(net, port))
+ continue;
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
- /* By starting with offset being an even number,
- * we tend to leave about 50% of ports for other uses,
- * like bind(0).
+ /* Does not bother with rcv_saddr checks, because
+ * the established check is already unique enough.
*/
- offset &= ~1;
-
- local_bh_disable();
- for (i = 0; i < remaining; i++) {
- port = low + (i + offset) % remaining;
- if (inet_is_local_reserved_port(net, port))
- continue;
- head = &hinfo->bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
- spin_lock(&head->lock);
-
- /* Does not bother with rcv_saddr checks,
- * because the established check is already
- * unique enough.
- */
- inet_bind_bucket_for_each(tb, &head->chain) {
- if (net_eq(ib_net(tb), net) &&
- tb->port == port) {
- if (tb->fastreuse >= 0 ||
- tb->fastreuseport >= 0)
- goto next_port;
- WARN_ON(hlist_empty(&tb->owners));
- if (!check_established(death_row, sk,
- port, &tw))
- goto ok;
+ inet_bind_bucket_for_each(tb, &head->chain) {
+ if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (tb->fastreuse >= 0 ||
+ tb->fastreuseport >= 0)
goto next_port;
- }
+ WARN_ON(hlist_empty(&tb->owners));
+ if (!check_established(death_row, sk,
+ port, &tw))
+ goto ok;
+ goto next_port;
}
-
- tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
- if (!tb) {
- spin_unlock(&head->lock);
- break;
- }
- tb->fastreuse = -1;
- tb->fastreuseport = -1;
- goto ok;
-
- next_port:
- spin_unlock(&head->lock);
}
- local_bh_enable();
-
- return -EADDRNOTAVAIL;
-ok:
- hint += (i + 2) & ~1;
-
- /* Head lock still held and bh's disabled */
- inet_bind_hash(sk, tb, port);
- if (sk_unhashed(sk)) {
- inet_sk(sk)->inet_sport = htons(port);
- inet_ehash_nolisten(sk, (struct sock *)tw);
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+ net, head, port);
+ if (!tb) {
+ spin_unlock_bh(&head->lock);
+ return -ENOMEM;
}
- if (tw)
- inet_twsk_bind_unhash(tw, hinfo);
- spin_unlock(&head->lock);
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
+ goto ok;
+next_port:
+ spin_unlock_bh(&head->lock);
+ cond_resched();
+ }
- if (tw)
- inet_twsk_deschedule_put(tw);
+ offset++;
+ if ((offset & 1) && remaining > 1)
+ goto other_parity_scan;
- ret = 0;
- goto out;
- }
+ return -EADDRNOTAVAIL;
- head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
- if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- inet_ehash_nolisten(sk, NULL);
- spin_unlock_bh(&head->lock);
- return 0;
- } else {
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
- ret = check_established(death_row, sk, snum, NULL);
-out:
- local_bh_enable();
- return ret;
+ok:
+ hint += i + 2;
+
+ /* Head lock still held and bh's disabled */
+ inet_bind_hash(sk, tb, port);
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->inet_sport = htons(port);
+ inet_ehash_nolisten(sk, (struct sock *)tw);
}
+ if (tw)
+ inet_twsk_bind_unhash(tw, hinfo);
+ spin_unlock(&head->lock);
+ if (tw)
+ inet_twsk_deschedule_put(tw);
+ local_bh_enable();
+ return 0;
}
/*