From 9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a Mon Sep 17 00:00:00 2001 From: Yaogong Wang Date: Wed, 7 Sep 2016 14:49:28 -0700 Subject: tcp: use an RB tree for ooo receive queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Over the years, TCP BDP has increased by several orders of magnitude, and some people are considering to reach the 2 Gbytes limit. Even with current window scale limit of 14, ~1 Gbytes maps to ~740,000 MSS. In presence of packet losses (or reorders), TCP stores incoming packets into an out of order queue, and number of skbs sitting there waiting for the missing packets to be received can be in the 10^5 range. Most packets are appended to the tail of this queue, and when packets can finally be transferred to receive queue, we scan the queue from its head. However, in presence of heavy losses, we might have to find an arbitrary point in this queue, involving a linear scan for every incoming packet, throwing away cpu caches. This patch converts it to a RB tree, to get bounded latencies. Yaogong wrote a preliminary patch about 2 years ago. Eric did the rebase, added ofo_last_skb cache, polishing and tests. Tested with network dropping between 1 and 10 % packets, with good success (about 30 % increase of throughput in stress tests) Next step would be to also use an RB tree for the write queue at sender side ;) Signed-off-by: Yaogong Wang Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Ilpo Järvinen Acked-By: Ilpo Järvinen Signed-off-by: David S. Miller --- net/core/skbuff.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3864b4b68fa1..1e329d411242 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2444,6 +2444,25 @@ void skb_queue_purge(struct sk_buff_head *list) } EXPORT_SYMBOL(skb_queue_purge); +/** + * skb_rbtree_purge - empty a skb rbtree + * @root: root of the rbtree to empty + * + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from + * the list and one reference dropped. This function does not take + * any lock. Synchronization should be handled by the caller (e.g., TCP + * out-of-order queue is protected by the socket lock). + */ +void skb_rbtree_purge(struct rb_root *root) +{ + struct sk_buff *skb, *next; + + rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) + kfree_skb(skb); + + *root = RB_ROOT; +} + /** * skb_queue_head - queue a buffer at the list head * @list: list to use -- cgit v1.2.3 From 07b26c9454a2a19fff86d6fcf2aba6bc801eb8d8 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 19 Sep 2016 12:58:47 +0200 Subject: gso: Support partial splitting at the frag_list pointer Since commit 8a29111c7 ("net: gro: allow to build full sized skb") gro may build buffers with a frag_list. This can hurt forwarding because most NICs can't offload such packets, they need to be segmented in software. This patch splits buffers with a frag_list at the frag_list pointer into buffers that can be TSO offloaded. Signed-off-by: Steffen Klassert Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/skbuff.c | 51 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 11 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1e329d411242..7bf82a28e10a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3097,11 +3097,31 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); - /* GSO partial only requires that we trim off any excess that - * doesn't fit into an MSS sized block, so take care of that - * now. - */ - if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) { + if (sg && csum && (mss != GSO_BY_FRAGS)) { + if (!(features & NETIF_F_GSO_PARTIAL)) { + struct sk_buff *iter; + + if (!list_skb || + !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) + goto normal; + + /* Split the buffer at the frag_list pointer. + * This is based on the assumption that all + * buffers in the chain excluding the last + * containing the same amount of data. + */ + skb_walk_frags(head_skb, iter) { + if (skb_headlen(iter)) + goto normal; + + len -= iter->len; + } + } + + /* GSO partial only requires that we trim off any excess that + * doesn't fit into an MSS sized block, so take care of that + * now. + */ partial_segs = len / mss; if (partial_segs > 1) mss *= partial_segs; @@ -3109,6 +3129,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, partial_segs = 0; } +normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -3300,21 +3321,29 @@ perform_csum_check: */ segs->prev = tail; - /* Update GSO info on first skb in partial sequence. */ if (partial_segs) { + struct sk_buff *iter; int type = skb_shinfo(head_skb)->gso_type; + unsigned short gso_size = skb_shinfo(head_skb)->gso_size; /* Update type to add partial and then remove dodgy if set */ - type |= SKB_GSO_PARTIAL; + type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; type &= ~SKB_GSO_DODGY; /* Update GSO info and prepare to start updating headers on * our way back down the stack of protocols. */ - skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size; - skb_shinfo(segs)->gso_segs = partial_segs; - skb_shinfo(segs)->gso_type = type; - SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset; + for (iter = segs; iter; iter = iter->next) { + skb_shinfo(iter)->gso_size = gso_size; + skb_shinfo(iter)->gso_segs = partial_segs; + skb_shinfo(iter)->gso_type = type; + SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; + } + + if (tail->len - doffset <= gso_size) + skb_shinfo(tail)->gso_size = 0; + else if (tail != segs) + skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); } /* Following permits correct backpressure, for protocols -- cgit v1.2.3 From bfca4c520f7ea78138ddccea2de18dc062b0fefd Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:09 +0300 Subject: net: skbuff: Export __skb_vlan_pop This exports the functionality of extracting the tag from the payload, without moving next vlan tag into hw accel tag. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- net/core/skbuff.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7bf82a28e10a..6c22351bd519 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4522,8 +4522,10 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len) } EXPORT_SYMBOL(skb_ensure_writable); -/* remove VLAN header from packet and update csum accordingly. */ -static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +/* remove VLAN header from packet and update csum accordingly. + * expects a non skb_vlan_tag_present skb with a vlan tag payload + */ +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr; unsigned int offset = skb->data - skb_mac_header(skb); @@ -4554,6 +4556,7 @@ pull: return err; } +EXPORT_SYMBOL(__skb_vlan_pop); int skb_vlan_pop(struct sk_buff *skb) { -- cgit v1.2.3 From 636c2628086e40c86dac7ddc84a1c4b4fcccc6e3 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Tue, 20 Sep 2016 12:48:36 +0300 Subject: net: skbuff: Remove errornous length validation in skb_vlan_pop() In 93515d53b1 "net: move vlan pop/push functions into common code" skb_vlan_pop was moved from its private location in openvswitch to skbuff common code. In case skb has non hw-accel vlan tag, the original 'pop_vlan()' assured that skb->len is sufficient (if skb->len < VLAN_ETH_HLEN then pop was considered a no-op). This validation was moved as is into the new common 'skb_vlan_pop'. Alas, in its original location (openvswitch), there was a guarantee that 'data' points to the mac_header, therefore the 'skb->len < VLAN_ETH_HLEN' condition made sense. However there's no such guarantee in the generic 'skb_vlan_pop'. For short packets received in rx path going through 'skb_vlan_pop', this causes 'skb_vlan_pop' to fail pop-ing a valid vlan hdr (in the non hw-accel case) or to fail moving next tag into hw-accel tag. Remove the 'skb->len < VLAN_ETH_HLEN' condition entirely: It is superfluous since inner '__skb_vlan_pop' already verifies there are VLAN_ETH_HLEN writable bytes at the mac_header. Note this presents a slight change to skb_vlan_pop() users: In case total length is smaller than VLAN_ETH_HLEN, skb_vlan_pop() now returns an error, as opposed to previous "no-op" behavior. Existing callers (e.g. tc act vlan, ovs) usually drop the packet if 'skb_vlan_pop' fails. Fixes: 93515d53b1 ("net: move vlan pop/push functions into common code") Signed-off-by: Shmulik Ladkani Cc: Pravin Shelar Reviewed-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/skbuff.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6c22351bd519..b2a51bf1b0f9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4567,9 +4567,8 @@ int skb_vlan_pop(struct sk_buff *skb) if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { - if (unlikely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (unlikely(skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); @@ -4577,9 +4576,8 @@ int skb_vlan_pop(struct sk_buff *skb) return err; } /* move next vlan tag to hw accel tag */ - if (likely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (likely(skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD))) return 0; vlan_proto = skb->protocol; -- cgit v1.2.3 From ecf4ee41d25832a6ec52f8b54dfaa46c08b949d5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Tue, 20 Sep 2016 12:48:37 +0300 Subject: net: skbuff: Coding: Use eth_type_vlan() instead of open coding it Fix 'skb_vlan_pop' to use eth_type_vlan instead of directly comparing skb->protocol to ETH_P_8021Q or ETH_P_8021AD. Signed-off-by: Shmulik Ladkani Reviewed-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/skbuff.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b2a51bf1b0f9..d36c7548952f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4567,8 +4567,7 @@ int skb_vlan_pop(struct sk_buff *skb) if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { - if (unlikely(skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD))) + if (unlikely(!eth_type_vlan(skb->protocol))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); @@ -4576,8 +4575,7 @@ int skb_vlan_pop(struct sk_buff *skb) return err; } /* move next vlan tag to hw accel tag */ - if (likely(skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD))) + if (likely(!eth_type_vlan(skb->protocol))) return 0; vlan_proto = skb->protocol; -- cgit v1.2.3