From 421b3885bf6d56391297844f43fb7154a6396e12 Mon Sep 17 00:00:00 2001
From: Shawn Bohrer
Date: Mon, 7 Oct 2013 11:01:39 -0500
Subject: udp: ipv4: Add udp early demux

The removal of the routing cache introduced a performance regression for
some UDP workloads, since a dst lookup must now be done for each packet.
This change caches the dst per socket in a similar manner to what we do
for TCP by implementing early_demux.

For UDP multicast we can only cache the dst if there is only one
receiving socket on the host. Since caching only works when there is one
receiving socket, we do the multicast socket lookup using RCU.

For UDP unicast we only demux sockets with an exact match in order to
not break forwarding setups. Additionally, since the hash chains may be
long, we only check the first socket to see if it is a match rather than
wasting extra time searching the whole chain when we might not find an
exact match.

Benchmark results from a netperf UDP_RR test:
Before: 87961.22 transactions/s
After:  89789.68 transactions/s

Benchmark results from a fio 1-byte UDP multicast pingpong test
(multicast one way, unicast response):
Before: 12.97us RTT
After:  12.63us RTT

Signed-off-by: Shawn Bohrer
Signed-off-by: David S. Miller
---
 net/ipv4/af_inet.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4/af_inet.c')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index cfeb85cff4f0..35913fb77dc8 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1546,6 +1546,7 @@ static const struct net_protocol tcp_protocol = {
 };

 static const struct net_protocol udp_protocol = {
+        .early_demux = udp_v4_early_demux,
         .handler = udp_rcv,
         .err_handler = udp_err,
         .no_policy = 1,
--
cgit v1.2.3
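
As an aside, the per-socket dst caching idea described above can be sketched in ordinary userspace C. This is an illustration only, not kernel code: the struct and function names below are invented, and the real implementation also revalidates the cached dst before reusing it.

#include <stdint.h>
#include <stddef.h>

struct toy_route { uint32_t gw; int ifindex; };

struct toy_sock {
        uint32_t laddr, raddr;          /* the kernel demuxes on an exact 4-tuple match */
        uint16_t lport, rport;
        struct toy_route *cached_rt;    /* filled by the first packet, reused afterwards */
};

/* Resolve the route for an incoming packet, preferring the socket's cache. */
static struct toy_route *route_for_packet(struct toy_sock *sk, uint32_t daddr,
                                          struct toy_route *(*lookup)(uint32_t daddr))
{
        if (sk && sk->cached_rt)
                return sk->cached_rt;           /* early-demux hit: skip the route lookup */

        struct toy_route *rt = lookup(daddr);   /* slow path: full route lookup */
        if (sk)
                sk->cached_rt = rt;             /* cache it for subsequent packets */
        return rt;
}
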
From 47d27aad44169372f358cda88a223883f6760fa5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 18 Oct 2013 13:13:27 -0700
Subject: ipv4: gso: send_check() & segment() cleanups

inet_gso_segment() and inet_gso_send_check() are called by
skb_mac_gso_segment() under rcu lock, so there is no need to use
rcu_read_lock() / rcu_read_unlock().

Avoid calling ip_hdr() twice per function. We can use the
ip_send_check() helper.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/af_inet.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'net/ipv4/af_inet.c')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 35913fb77dc8..4f8cd4fc451d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1254,20 +1254,19 @@ static int inet_gso_send_check(struct sk_buff *skb)
         if (ihl < sizeof(*iph))
                 goto out;

+        proto = iph->protocol;
+
+        /* Warning: after this point, iph might be no longer valid */
         if (unlikely(!pskb_may_pull(skb, ihl)))
                 goto out;
-
         __skb_pull(skb, ihl);
+
         skb_reset_transport_header(skb);
-        iph = ip_hdr(skb);
-        proto = iph->protocol;
         err = -EPROTONOSUPPORT;

-        rcu_read_lock();
         ops = rcu_dereference(inet_offloads[proto]);
         if (likely(ops && ops->callbacks.gso_send_check))
                 err = ops->callbacks.gso_send_check(skb);
-        rcu_read_unlock();

 out:
         return err;
@@ -1305,23 +1304,23 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
         if (ihl < sizeof(*iph))
                 goto out;

+        id = ntohs(iph->id);
+        proto = iph->protocol;
+
+        /* Warning: after this point, iph might be no longer valid */
         if (unlikely(!pskb_may_pull(skb, ihl)))
                 goto out;
+        __skb_pull(skb, ihl);

         tunnel = !!skb->encapsulation;

-        __skb_pull(skb, ihl);
         skb_reset_transport_header(skb);
-        iph = ip_hdr(skb);
-        id = ntohs(iph->id);
-        proto = iph->protocol;
+
         segs = ERR_PTR(-EPROTONOSUPPORT);

-        rcu_read_lock();
         ops = rcu_dereference(inet_offloads[proto]);
         if (likely(ops && ops->callbacks.gso_segment))
                 segs = ops->callbacks.gso_segment(skb, features);
-        rcu_read_unlock();

         if (IS_ERR_OR_NULL(segs))
                 goto out;
@@ -1339,8 +1338,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                         iph->id = htons(id++);
                 }
                 iph->tot_len = htons(skb->len - skb->mac_len);
-                iph->check = 0;
-                iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+                ip_send_check(iph);
         } while ((skb = skb->next));

 out:
--
cgit v1.2.3
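
For reference, ip_send_check() zeroes iph->check and recomputes the standard Internet checksum over the IPv4 header, which is exactly what the removed two-line sequence did by hand. A standalone userspace sketch of that checksum (not the kernel's optimized ip_fast_csum()) looks like this:

#include <stdint.h>

/* One's-complement sum over the IPv4 header, folded to 16 bits.
 * The caller zeroes the checksum field first, as ip_send_check() does
 * internally before storing the result back into iph->check. */
static uint16_t ipv4_header_checksum(const void *hdr, unsigned int ihl)
{
        const uint16_t *p = hdr;
        uint32_t sum = 0;
        unsigned int i;

        for (i = 0; i < ihl * 2; i++)           /* ihl counts 32-bit words */
                sum += p[i];
        while (sum >> 16)                       /* fold carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

Computed over the header bytes as they sit on the wire, the folded one's-complement result is what ends up in the checksum field.
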
From 3347c960295583eee3fd58e5c539fb1972fbc005 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 19 Oct 2013 11:42:56 -0700
Subject: ipv4: gso: make inet_gso_segment() stackable

In order to support GSO on IPIP, we need to make inet_gso_segment()
stackable. It should not assume the network header starts right after
the mac header.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/af_inet.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'net/ipv4/af_inet.c')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4f8cd4fc451d..5783ab5b5ef8 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1273,16 +1273,17 @@ out:
 }

 static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
-        netdev_features_t features)
+                                        netdev_features_t features)
 {
         struct sk_buff *segs = ERR_PTR(-EINVAL);
         const struct net_offload *ops;
+        unsigned int offset = 0;
         struct iphdr *iph;
+        bool tunnel;
         int proto;
+        int nhoff;
         int ihl;
         int id;
-        unsigned int offset = 0;
-        bool tunnel;

         if (unlikely(skb_shinfo(skb)->gso_type &
                      ~(SKB_GSO_TCPV4 |
@@ -1296,6 +1297,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                        0)))
                 goto out;

+        skb_reset_network_header(skb);
+        nhoff = skb_network_header(skb) - skb_mac_header(skb);
         if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
                 goto out;

@@ -1312,7 +1315,10 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                 goto out;
         __skb_pull(skb, ihl);

-        tunnel = !!skb->encapsulation;
+        tunnel = SKB_GSO_CB(skb)->encap_level > 0;
+        if (tunnel)
+                features = skb->dev->hw_enc_features & netif_skb_features(skb);
+        SKB_GSO_CB(skb)->encap_level += ihl;

         skb_reset_transport_header(skb);

@@ -1327,18 +1333,23 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,

         skb = segs;
         do {
-                iph = ip_hdr(skb);
+                iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
                 if (!tunnel && proto == IPPROTO_UDP) {
                         iph->id = htons(id);
                         iph->frag_off = htons(offset >> 3);
                         if (skb->next != NULL)
                                 iph->frag_off |= htons(IP_MF);
-                        offset += (skb->len - skb->mac_len - iph->ihl * 4);
+                        offset += skb->len - nhoff - ihl;
                 } else {
                         iph->id = htons(id++);
                 }
-                iph->tot_len = htons(skb->len - skb->mac_len);
+                iph->tot_len = htons(skb->len - nhoff);
                 ip_send_check(iph);
+                if (tunnel) {
+                        skb_reset_inner_headers(skb);
+                        skb->encapsulation = 1;
+                }
+                skb->network_header = (u8 *)iph - skb->head;
         } while ((skb = skb->next));

 out:
--
cgit v1.2.3
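
The arithmetic in the per-segment loop above can be illustrated in isolation. In the (!tunnel && proto == IPPROTO_UDP) case every fragment keeps the same IP ID and carries its payload offset in 8-byte units, with IP_MF set on all but the last; otherwise each TCP segment simply gets an incrementing ID. The sketch below uses invented struct and function names and host byte order for brevity:

#include <stdint.h>

#define IP_MF 0x2000    /* "more fragments" flag inside frag_off */

struct seg_hdr { uint16_t id, frag_off, tot_len; };

/* payload_len[i] is what segment i carries past the IP header;
 * ihl is the IP header length in bytes. */
static void fixup_segments(struct seg_hdr *h, int n, const int *payload_len,
                           int ihl, uint16_t first_id, int udp_frag)
{
        int offset = 0, i;

        for (i = 0; i < n; i++) {
                h[i].tot_len = ihl + payload_len[i];    /* i.e. skb->len - nhoff */
                if (udp_frag) {
                        h[i].id = first_id;             /* all fragments share one ID */
                        h[i].frag_off = offset >> 3;    /* offset counted in 8-byte units */
                        if (i != n - 1)
                                h[i].frag_off |= IP_MF; /* more fragments follow */
                        offset += payload_len[i];
                } else {
                        h[i].id = first_id + i;         /* one ID per TCP segment */
                }
        }
}
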
From cb32f511a70be8967ac9025cf49c44324ced9a39 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 19 Oct 2013 11:42:57 -0700
Subject: ipip: add GSO/TSO support

Now that inet_gso_segment() is stackable, it's relatively easy to
implement GSO/TSO support for IPIP.

Performance results, when segmentation is done after the tunnel device
(as no NIC is yet enabled for TSO IPIP support):

Before patch:

lpq83:~# ./netperf -H 7.7.9.84 -Cc
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.9.84 () port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    10.00      3357.88   5.09     3.70     2.983   2.167

After patch:

lpq83:~# ./netperf -H 7.7.9.84 -Cc
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.9.84 () port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    10.00      7710.19   4.52     6.62     1.152   1.687

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/af_inet.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net/ipv4/af_inet.c')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5783ab5b5ef8..4049906010f7 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1291,6 +1291,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                        SKB_GSO_DODGY |
                        SKB_GSO_TCP_ECN |
                        SKB_GSO_GRE |
+                       SKB_GSO_IPIP |
                        SKB_GSO_TCPV6 |
                        SKB_GSO_UDP_TUNNEL |
                        SKB_GSO_MPLS |
@@ -1656,6 +1657,13 @@ static struct packet_offload ip_packet_offload __read_mostly = {
         },
 };

+static const struct net_offload ipip_offload = {
+        .callbacks = {
+                .gso_send_check = inet_gso_send_check,
+                .gso_segment = inet_gso_segment,
+        },
+};
+
 static int __init ipv4_offload_init(void)
 {
         /*
@@ -1667,6 +1675,7 @@ static int __init ipv4_offload_init(void)
                 pr_crit("%s: Cannot add TCP protocol offload\n", __func__);

         dev_add_offload(&ip_packet_offload);
+        inet_add_offload(&ipip_offload, IPPROTO_IPIP);
         return 0;
 }

--
cgit v1.2.3

From 1bbdceef1e535add893bf71d7b7ab102e4eb69eb Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Sat, 19 Oct 2013 21:48:57 +0200
Subject: inet: convert inet_ehash_secret and ipv6_hash_secret to net_get_random_once

Initialize the ehash and ipv6_hash secrets with net_get_random_once.
Each compilation unit gets its own secret now:

  ipv4/inet_hashtables.o
  ipv4/udp.o
  ipv6/inet6_hashtables.o
  ipv6/udp.o
  rds/connection.o

The functions still get inlined into the hashing functions. In the fast
path we have at most two (needed in ipv6) if (unlikely(...)) checks.

Cc: Eric Dumazet
Cc: "David S. Miller"
Signed-off-by: Hannes Frederic Sowa
Signed-off-by: David S. Miller
---
 net/ipv4/af_inet.c | 27 ---------------------------
 1 file changed, 27 deletions(-)

(limited to 'net/ipv4/af_inet.c')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4049906010f7..9433a6186f54 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -245,29 +245,6 @@ out:
 }
 EXPORT_SYMBOL(inet_listen);

-u32 inet_ehash_secret __read_mostly;
-EXPORT_SYMBOL(inet_ehash_secret);
-
-u32 ipv6_hash_secret __read_mostly;
-EXPORT_SYMBOL(ipv6_hash_secret);
-
-/*
- * inet_ehash_secret must be set exactly once, and to a non nul value
- * ipv6_hash_secret must be set exactly once.
- */
-void build_ehash_secret(void)
-{
-        u32 rnd;
-
-        do {
-                get_random_bytes(&rnd, sizeof(rnd));
-        } while (rnd == 0);
-
-        if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0)
-                get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
-}
-EXPORT_SYMBOL(build_ehash_secret);
-
 /*
  * Create an inet socket.
  */
@@ -284,10 +261,6 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
         int try_loading_module = 0;
         int err;

-        if (unlikely(!inet_ehash_secret))
-                if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
-                        build_ehash_secret();
-
         sock->state = SS_UNCONNECTED;

         /* Look for the requested type/protocol pair. */
--
cgit v1.2.3
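
The pattern net_get_random_once() provides (lazily fill a secret on first use, exactly once, instead of the cmpxchg dance the removed build_ehash_secret() used) has a rough userspace analogue. The sketch below is only an illustration, not the kernel API; names are invented, and it uses pthread_once() and getrandom():

#include <pthread.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/random.h>

static uint32_t hash_secret;
static pthread_once_t hash_secret_once = PTHREAD_ONCE_INIT;

static void hash_secret_init(void)
{
        /* Retry until the kernel hands us all four bytes. */
        while (getrandom(&hash_secret, sizeof(hash_secret), 0) !=
               (ssize_t)sizeof(hash_secret))
                ;
}

static uint32_t toy_hash(uint32_t x)
{
        pthread_once(&hash_secret_once, hash_secret_init);  /* runs init exactly once */
        return x ^ hash_secret;         /* stand-in for the real hash function */
}

Build with -pthread; after initialization the fast path is a single check of the once control, which mirrors the if (unlikely(...)) mentioned in the commit message.
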
From a4fe34bf902b8f709c635ab37f1f39de0b86cff2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Sat, 19 Oct 2013 16:25:36 -0700
Subject: tcp_memcontrol: Remove the per netns control.

The code that is implemented is per memory cgroup, not per netns, and
having per netns bits is just confusing. Remove the per netns bits to
make it easier to see what is really going on.

Signed-off-by: "Eric W. Biederman"
Signed-off-by: David S. Miller
---
 net/ipv4/af_inet.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/ipv4/af_inet.c')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9433a6186f54..24a53fc275b0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1697,8 +1697,6 @@ static int __init inet_init(void)
         ip_static_sysctl_init();
 #endif

-        tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
-
         /*
          * Add all the base protocols.
          */
--
cgit v1.2.3

From 61c1db7fae21ed33c614356a43bf6580c5e53118 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 20 Oct 2013 20:47:30 -0700
Subject: ipv6: sit: add GSO/TSO support

Now that ipv6_gso_segment() is stackable, it's relatively easy to
implement GSO/TSO support for SIT tunnels.

Performance results, when segmentation is done after the tunnel device
(as no NIC is yet enabled for TSO SIT support):

Before patch:

lpq84:~# ./netperf -H 2002:af6:1153:: -Cc
MIGRATED TCP STREAM TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:1153:: () port 0 AF_INET6
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    10.00      3168.31   4.81     4.64     2.988   2.877

After patch:

lpq84:~# ./netperf -H 2002:af6:1153:: -Cc
MIGRATED TCP STREAM TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:1153:: () port 0 AF_INET6
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    10.00      5525.00   7.76     5.17     2.763   1.840

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/af_inet.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4/af_inet.c')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 24a53fc275b0..f4a159e705c0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1265,6 +1265,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                        SKB_GSO_TCP_ECN |
                        SKB_GSO_GRE |
                        SKB_GSO_IPIP |
+                       SKB_GSO_SIT |
                        SKB_GSO_TCPV6 |
                        SKB_GSO_UDP_TUNNEL |
                        SKB_GSO_MPLS |
--
cgit v1.2.3
From 8c3a897bfab10f68f90252440bb29e6749a7312a Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 27 Oct 2013 18:18:16 -0700
Subject: inet: restore gso for vxlan

Alexei reported a performance regression on vxlan, caused by commit
3347c9602955 ("ipv4: gso: make inet_gso_segment() stackable"):
GSO vxlan packets were not properly segmented, adding IP fragments
while they were not expected.

Rename 'bool tunnel' to 'bool encap', and add a new boolean to express
the fact that UDP should be fragmented. This fragmentation is triggered
by skb->encapsulation being set.

Remove a "skb->encapsulation = 1" added in the above commit, as it's not
needed, as frags inherit skb->encapsulation from the original GSO skb.

Reported-by: Alexei Starovoitov
Signed-off-by: Eric Dumazet
Tested-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 net/ipv4/af_inet.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'net/ipv4/af_inet.c')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f4a159e705c0..09d78d4a3cff 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1251,8 +1251,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
         struct sk_buff *segs = ERR_PTR(-EINVAL);
         const struct net_offload *ops;
         unsigned int offset = 0;
+        bool udpfrag, encap;
         struct iphdr *iph;
-        bool tunnel;
         int proto;
         int nhoff;
         int ihl;
@@ -1290,8 +1290,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                 goto out;
         __skb_pull(skb, ihl);

-        tunnel = SKB_GSO_CB(skb)->encap_level > 0;
-        if (tunnel)
+        encap = SKB_GSO_CB(skb)->encap_level > 0;
+        if (encap)
                 features = skb->dev->hw_enc_features & netif_skb_features(skb);
         SKB_GSO_CB(skb)->encap_level += ihl;

@@ -1306,24 +1306,23 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
         if (IS_ERR_OR_NULL(segs))
                 goto out;

+        udpfrag = !!skb->encapsulation && proto == IPPROTO_UDP;
         skb = segs;
         do {
                 iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
-                if (!tunnel && proto == IPPROTO_UDP) {
+                if (udpfrag) {
                         iph->id = htons(id);
                         iph->frag_off = htons(offset >> 3);
                         if (skb->next != NULL)
                                 iph->frag_off |= htons(IP_MF);
                         offset += skb->len - nhoff - ihl;
-                } else  {
+                } else {
                         iph->id = htons(id++);
                 }
                 iph->tot_len = htons(skb->len - nhoff);
                 ip_send_check(iph);
-                if (tunnel) {
+                if (encap)
                         skb_reset_inner_headers(skb);
-                        skb->encapsulation = 1;
-                }
                 skb->network_header = (u8 *)iph - skb->head;
         } while ((skb = skb->next));

--
cgit v1.2.3

From dcd607718385d02ce3741de225927a57f528f93b Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 7 Nov 2013 18:32:06 -0800
Subject: inet: fix a UFO regression

While testing virtio_net and skb_segment() changes, Hannes reported
that UFO was sending wrong frames.

It appears this was introduced by a recent commit:
8c3a897bfab1 ("inet: restore gso for vxlan")

The old condition to perform IP frag was:

        tunnel = !!skb->encapsulation;
        ...
        if (!tunnel && proto == IPPROTO_UDP) {

So the new one should be:

        udpfrag = !skb->encapsulation && proto == IPPROTO_UDP;
        ...
        if (udpfrag) {

Initialization of udpfrag must be done before the call to
ops->callbacks.gso_segment(skb, features), as skb_udp_tunnel_segment()
clears skb->encapsulation.

(We want udpfrag to be true for UFO, false for VXLAN.)

With help from Alexei Starovoitov

Reported-by: Hannes Frederic Sowa
Signed-off-by: Eric Dumazet
Cc: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 net/ipv4/af_inet.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4/af_inet.c')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 09d78d4a3cff..68af9aac91d0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1299,6 +1299,9 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,

         segs = ERR_PTR(-EPROTONOSUPPORT);

+        /* Note : following gso_segment() might change skb->encapsulation */
+        udpfrag = !skb->encapsulation && proto == IPPROTO_UDP;
+
         ops = rcu_dereference(inet_offloads[proto]);
         if (likely(ops && ops->callbacks.gso_segment))
                 segs = ops->callbacks.gso_segment(skb, features);
@@ -1306,7 +1309,6 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
         if (IS_ERR_OR_NULL(segs))
                 goto out;

-        udpfrag = !!skb->encapsulation && proto == IPPROTO_UDP;
         skb = segs;
         do {
                 iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
--
cgit v1.2.3