From 79ffef1fe213851f44bfccf037170a140e929f85 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 27 Feb 2013 07:05:03 +0000
Subject: tcp: avoid wakeups for pure ACK

The purpose of the TCP prequeue mechanism is to let incoming packets be
processed by the thread currently blocked in tcp_recvmsg(), instead of
on behalf of the softirq handler, so that flow control better adapts to
the receiver host's capacity to schedule the consumer.

But in typical request/answer workloads, we send a request, then block
to receive the answer. Before the actual answer arrives, the TCP stack
receives the ACK packets acknowledging the request.

Processing a pure ACK on behalf of the thread blocked in tcp_recvmsg()
is a waste of resources, as the thread has to immediately sleep again
because it got no payload.

This patch avoids the extra context switches and scheduler overhead.

Before patch :

a:~# echo 0 >/proc/sys/net/ipv4/tcp_low_latency
a:~# perf stat ./super_netperf 300 -t TCP_RR -l 10 -H 7.7.7.84 -- -r 8k,8k
231676

 Performance counter stats for './super_netperf 300 -t TCP_RR -l 10 -H 7.7.7.84 -- -r 8k,8k':

     116251.501765 task-clock                #   11.369 CPUs utilized
         5,025,463 context-switches          #    0.043 M/sec
         1,074,511 CPU-migrations            #    0.009 M/sec
           216,923 page-faults               #    0.002 M/sec
   311,636,972,396 cycles                    #    2.681 GHz
   260,507,138,069 stalled-cycles-frontend   #   83.59% frontend cycles idle
   155,590,092,840 stalled-cycles-backend    #   49.93% backend  cycles idle
   100,101,255,411 instructions              #    0.32  insns per cycle
                                             #    2.60  stalled cycles per insn
    16,535,930,999 branches                  #  142.243 M/sec
       646,483,591 branch-misses             #    3.91% of all branches

      10.225482774 seconds time elapsed

After patch :

a:~# echo 0 >/proc/sys/net/ipv4/tcp_low_latency
a:~# perf stat ./super_netperf 300 -t TCP_RR -l 10 -H 7.7.7.84 -- -r 8k,8k
233297

 Performance counter stats for './super_netperf 300 -t TCP_RR -l 10 -H 7.7.7.84 -- -r 8k,8k':

      91084.870855 task-clock                #    8.887 CPUs utilized
         2,485,916 context-switches          #    0.027 M/sec
           815,520 CPU-migrations            #    0.009 M/sec
           216,932 page-faults               #    0.002 M/sec
   245,195,022,629 cycles                    #    2.692 GHz
   202,635,777,041 stalled-cycles-frontend   #   82.64% frontend cycles idle
   124,280,372,407 stalled-cycles-backend    #   50.69% backend  cycles idle
    83,457,289,618 instructions              #    0.34  insns per cycle
                                             #    2.43  stalled cycles per insn
    13,431,472,361 branches                  #  147.461 M/sec
       504,470,665 branch-misses             #    3.76% of all branches

      10.249594448 seconds time elapsed

Signed-off-by: Eric Dumazet
Cc: Neal Cardwell
Cc: Tom Herbert
Cc: Yuchung Cheng
Cc: Andi Kleen
Signed-off-by: David S. Miller
---
 include/net/tcp.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 23f2e98d4b65..cf0694d4ad60 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1045,6 +1045,10 @@ static inline bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
 		return false;
 
+	if (skb->len <= tcp_hdrlen(skb) &&
+	    skb_queue_len(&tp->ucopy.prequeue) == 0)
+		return false;
+
 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
 	tp->ucopy.memory += skb->truesize;
 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
--
cgit v1.2.3
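For context, the test added above treats a segment as a pure ACK when it
carries no payload, i.e. its length does not exceed the TCP header length
(tcp_hdrlen() includes options); the prequeue-empty condition ensures a pure
ACK is only bypassed when nothing is already queued for the blocked thread.
A minimal sketch of the predicate, assuming kernel context (the helper name
is illustrative and not part of the patch):

#include <net/tcp.h>

/* Sketch only: restates the pure-ACK test from the hunk above.
 * tcp_hdrlen() returns the header length including TCP options, so a
 * segment whose total length does not exceed it carries no payload.
 * Assumes the transport header has already been set on the skb.
 */
static inline bool tcp_skb_is_pure_ack_sketch(const struct sk_buff *skb)
{
	return skb->len <= tcp_hdrlen(skb);
}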
From 5b9e12dbf92b441b37136ea71dac59f05f2673a9 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev"
Date: Wed, 13 Mar 2013 00:24:15 +0000
Subject: ipv4: fix definition of FIB_TABLE_HASHSZ

A long time ago, the commit

  commit 93456b6d7753def8760b423ac6b986eb9d5a4a95
  Author: Denis V. Lunev
  Date:   Thu Jan 10 03:23:38 2008 -0800

      [IPV4]: Unify access to the routing tables.

gave the definition of the FIB table hash size (FIB_TABLE_HASHSZ) a wrong
dependency: it should depend on CONFIG_IP_MULTIPLE_TABLES (as in the
original code), but it was made to depend on CONFIG_IP_ROUTE_MULTIPATH.

This patch returns the situation to the original state.

The problem was spotted by Tingwei Liu.

Signed-off-by: Denis V. Lunev
CC: Tingwei Liu
CC: Alexey Kuznetsov
Signed-off-by: David S. Miller
---
 include/net/ip_fib.h | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 9497be1ad4c0..e49db91593a9 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -152,18 +152,16 @@ struct fib_result_nl {
 };
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-
 #define FIB_RES_NH(res)		((res).fi->fib_nh[(res).nh_sel])
-
-#define FIB_TABLE_HASHSZ 2
-
 #else /* CONFIG_IP_ROUTE_MULTIPATH */
-
 #define FIB_RES_NH(res)		((res).fi->fib_nh[0])
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
+#ifdef CONFIG_IP_MULTIPLE_TABLES
 #define FIB_TABLE_HASHSZ 256
-
-#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+#else
+#define FIB_TABLE_HASHSZ 2
+#endif
 
 extern __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
--
cgit v1.2.3

From aaa0c23cb90141309f5076ba5e3bfbd39544b985 Mon Sep 17 00:00:00 2001
From: Zhouyi Zhou
Date: Thu, 14 Mar 2013 17:21:50 +0000
Subject: Fix dst_neigh_lookup/dst_neigh_lookup_skb return value handling bug

When the neighbour table is full, dst_neigh_lookup()/dst_neigh_lookup_skb()
return ERR_PTR(-ENOBUFS), which is not NULL, while all the kernel code that
uses these functions assumes failure only on a NULL return; this can cause
a panic (for example: https://bugzilla.kernel.org/show_bug.cgi?id=54731).

This patch corrects the above error with the smallest change to the kernel
source code, and also fixes two missing return value checks in
drivers/infiniband/hw/cxgb4/cm.c.

Tested on my x86_64 SMP machine.

Reported-by: Zhouyi Zhou
Tested-by: Zhouyi Zhou
Signed-off-by: Zhouyi Zhou
Signed-off-by: David S. Miller
---
 include/net/dst.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 853cda11e518..1f8fd109e225 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -413,13 +413,15 @@ static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
 
 static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
 {
-	return dst->ops->neigh_lookup(dst, NULL, daddr);
+	struct neighbour *n = dst->ops->neigh_lookup(dst, NULL, daddr);
+	return IS_ERR(n) ? NULL : n;
 }
 
 static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst,
 						     struct sk_buff *skb)
 {
-	return dst->ops->neigh_lookup(dst, skb, NULL);
+	struct neighbour *n = dst->ops->neigh_lookup(dst, skb, NULL);
+	return IS_ERR(n) ? NULL : n;
 }
 
 static inline void dst_link_failure(struct sk_buff *skb)
--
cgit v1.2.3
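With the change above, callers keep the NULL-on-failure convention they
already rely on. A minimal sketch of the intended calling pattern, assuming
kernel context (the surrounding function is hypothetical; only
dst_neigh_lookup_skb() and neigh_release() are existing APIs):

#include <net/dst.h>
#include <net/neighbour.h>

/* Sketch only: after the fix, an ERR_PTR(-ENOBUFS) coming out of the
 * ->neigh_lookup() op is mapped to NULL, so a single NULL check also
 * covers the neighbour-table-full case.
 */
static int xmit_one_sketch(struct dst_entry *dst, struct sk_buff *skb)
{
	struct neighbour *n = dst_neigh_lookup_skb(dst, skb);

	if (!n)
		return -ENOBUFS;	/* lookup failed or table is full */

	/* ... hand the skb to n as the real output path would ... */

	neigh_release(n);
	return 0;
}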
From 0c12582fbcdea0cbb0dfd224e1c5f9a8428ffa18 Mon Sep 17 00:00:00 2001
From: Julian Anastasov
Date: Sat, 9 Mar 2013 23:25:04 +0200
Subject: ipvs: add backup_only flag to avoid loops

Dmitry Akindinov is reporting a problem where SYNs are looping between
the master and backup server when the backup server is used as a real
server in DR mode and has IPVS rules to function as director.

Even when the backup function is enabled, we continue to forward traffic
and schedule new connections while the current master is using the backup
server as a real server. While this is not a problem for NAT, with the DR
and TUN methods the backup server cannot determine whether a request comes
from a client or from the director.

To avoid such loops, add a new sysctl flag, backup_only. It can be needed
for DR/TUN setups that do not need the backup and director functions at
the same time. When the backup function is enabled, we stop any forwarding
and pass the traffic to the local stack (real server mode). In short, the
flag disables the director function while the backup function is enabled.

For setups that enable the backup function for some virtual services and
the director function for other virtual services, a more complex solution
would be needed to support DR/TUN mode, perhaps assigning a per-virtual-
service syncid value so that we can differentiate the requests.

Reported-by: Dmitry Akindinov
Tested-by: German Myzovsky
Signed-off-by: Julian Anastasov
Signed-off-by: Simon Horman
---
 include/net/ip_vs.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 68c69d54d392..fce8e6b66d55 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -976,6 +976,7 @@ struct netns_ipvs {
 	int			sysctl_sync_retries;
 	int			sysctl_nat_icmp_send;
 	int			sysctl_pmtu_disc;
+	int			sysctl_backup_only;
 
 	/* ip_vs_lblc */
 	int			sysctl_lblc_expiration;
@@ -1067,6 +1068,12 @@ static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs)
 	return ipvs->sysctl_pmtu_disc;
 }
 
+static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
+{
+	return ipvs->sync_state & IP_VS_STATE_BACKUP &&
+	       ipvs->sysctl_backup_only;
+}
+
 #else
 
 static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
@@ -1114,6 +1121,11 @@ static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs)
 	return 1;
 }
 
+static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
+{
+	return 0;
+}
+
 #endif
 
 /*
--
cgit v1.2.3
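The new helper is written so that callers need only a single test: it is
non-zero only while the netns is in IP_VS_STATE_BACKUP and the backup_only
sysctl is set. A sketch of how the packet path could consult it, assuming
the netns_ipvs pointer is already at hand (the wrapper and its name are
illustrative, not part of the patch):

#include <net/ip_vs.h>

/* Sketch only: when this returns true, the director logic is skipped
 * and the packet is left to the local stack (real-server mode), so a
 * DR/TUN real server acting as sync backup no longer re-schedules
 * traffic it received from the master.
 */
static bool ip_vs_director_disabled_sketch(struct netns_ipvs *ipvs)
{
	return sysctl_backup_only(ipvs) != 0;
}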
From 5a3da1fe9561828d0ca7eca664b16ec2b9bf0055 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Fri, 15 Mar 2013 11:32:30 +0000
Subject: inet: limit length of fragment queue hash table bucket lists

This patch introduces a constant limit on the length of the fragment
queue hash table bucket lists. The current limit of 128 is chosen
somewhat arbitrarily and just ensures that we can fill up the fragment
cache with empty packets up to the default ip_frag_high_thresh limits.
It should just protect from list iteration eating considerable amounts
of cpu.

If we reach the maximum length in one hash bucket, a warning is printed.
This is implemented on the caller side of inet_frag_find() to
distinguish between the different users of inet_fragment.c.

I dropped the out-of-memory warning in the ipv4 fragment lookup path,
because we already get a warning from the slab allocator.

Cc: Eric Dumazet
Cc: Jesper Dangaard Brouer
Signed-off-by: Hannes Frederic Sowa
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/inet_frag.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/net')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 76c3fe5ecc2e..0a1dcc2fa2f5 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -43,6 +43,13 @@ struct inet_frag_queue {
 
 #define INETFRAGS_HASHSZ		64
 
+/* averaged:
+ * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
+ *	       rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
+ *	       struct frag_queue))
+ */
+#define INETFRAGS_MAXDEPTH		128
+
 struct inet_frags {
 	struct hlist_head	hash[INETFRAGS_HASHSZ];
 	/* This rwlock is a global lock (seperate per IPv4, IPv6 and
@@ -76,6 +83,8 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 		struct inet_frags *f, void *key, unsigned int hash)
 	__releases(&f->lock);
+void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+				   const char *prefix);
 
 static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
 {
--
cgit v1.2.3

From 8ed781668dd49b608f1e67a22e3b445fd0c2cd6f Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Tue, 19 Mar 2013 06:39:29 +0000
Subject: flow_keys: include thoff into flow_keys for later usage

In skb_flow_dissect(), we perform a dissection of an skbuff. Since we
are doing the work there anyway, also store thoff (the transport header
offset) for later use, e.g. in the BPF filter.

Suggested-by: Eric Dumazet
Signed-off-by: Daniel Borkmann
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/flow_keys.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
index 80461c1ae9ef..bb8271d487b7 100644
--- a/include/net/flow_keys.h
+++ b/include/net/flow_keys.h
@@ -9,6 +9,7 @@ struct flow_keys {
 		__be32 ports;
 		__be16 port16[2];
 	};
+	u16 thoff;
 	u8 ip_proto;
 };
--
cgit v1.2.3

From 330305cc4a6b0cb75c22fc01b8826f0ad755550f Mon Sep 17 00:00:00 2001
From: Pravin B Shelar
Date: Sun, 24 Mar 2013 17:36:29 +0000
Subject: ipv4: Fix ip-header identification for gso packets.

The ip-header id needs to be incremented even if the IP_DF flag is set.
This behaviour was changed in commit 490ab08127cebc25e3a26 (IP_GRE: Fix
IP-Identification). The following patch fixes it so that the
identification is always incremented.

Reported-by: Cong Wang
Signed-off-by: Pravin B Shelar
Signed-off-by: David S. Miller
---
 include/net/ipip.h | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ipip.h b/include/net/ipip.h
index fd19625ff99d..982141c15200 100644
--- a/include/net/ipip.h
+++ b/include/net/ipip.h
@@ -77,15 +77,11 @@ static inline void tunnel_ip_select_ident(struct sk_buff *skb,
 {
 	struct iphdr *iph = ip_hdr(skb);
 
-	if (iph->frag_off & htons(IP_DF))
-		iph->id = 0;
-	else {
-		/* Use inner packet iph-id if possible. */
-		if (skb->protocol == htons(ETH_P_IP) && old_iph->id)
-			iph->id = old_iph->id;
-		else
-			__ip_select_ident(iph, dst,
-					  (skb_shinfo(skb)->gso_segs ?: 1) - 1);
-	}
+	/* Use inner packet iph-id if possible. */
+	if (skb->protocol == htons(ETH_P_IP) && old_iph->id)
+		iph->id = old_iph->id;
+	else
+		__ip_select_ident(iph, dst,
+				  (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 }
 #endif
--
cgit v1.2.3
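On the arithmetic in the call above: the third argument of
__ip_select_ident() is the number of additional identification values the
packet will consume once GSO splits it into segments. A short sketch of
that calculation, assuming the signature used in the hunk above (the
wrapper name is illustrative):

#include <linux/skbuff.h>
#include <net/dst.h>
#include <net/ip.h>

/* Sketch only: a GSO skb that will be segmented into N packets needs N
 * consecutive IP IDs, so the generator is advanced by N - 1 beyond the
 * id written into this header.  gso_segs is 0 for a non-GSO skb, hence
 * the "?: 1" fallback.
 */
static void tunnel_pick_ident_sketch(struct sk_buff *skb, struct iphdr *iph,
				     struct dst_entry *dst)
{
	int extra_ids = (skb_shinfo(skb)->gso_segs ?: 1) - 1;

	__ip_select_ident(iph, dst, extra_ids);
}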