From 121fa4b77775549c3c5eb41eb335d7dcbb801f90 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss
Date: Thu, 6 Mar 2014 21:48:24 +0000
Subject: xen-netback: Minor refactoring of netback code

This patch contains a few bits of refactoring before introducing the
grant mapping changes:

- introduce xenvif_tx_pending_slots_available(), as this check is used
  several times and will be used more often
- rename the thread to vifX.Y-guest-rx, to signify that it does RX work
  from the guest's point of view

Signed-off-by: Zoltan Kiss
Signed-off-by: David S. Miller
---
 drivers/net/xen-netback/common.h | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index ae413a2cbee7..9d3584545e5d 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -108,6 +108,15 @@ struct xenvif_rx_meta {
  */
 #define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE)
 
+#define NETBACK_INVALID_HANDLE -1
+
+/* To avoid confusion, we define XEN_NETBK_LEGACY_SLOTS_MAX indicating
+ * the maximum slots a valid packet can use. Now this value is defined
+ * to be XEN_NETIF_NR_SLOTS_MIN, which is supposed to be supported by
+ * all backend.
+ */
+#define XEN_NETBK_LEGACY_SLOTS_MAX XEN_NETIF_NR_SLOTS_MIN
+
 struct xenvif {
 	/* Unique identifier for this interface. */
 	domid_t domid;
@@ -216,7 +225,7 @@ void xenvif_carrier_off(struct xenvif *vif);
 
 int xenvif_tx_action(struct xenvif *vif, int budget);
 
-int xenvif_kthread(void *data);
+int xenvif_kthread_guest_rx(void *data);
 void xenvif_kick_thread(struct xenvif *vif);
 
 /* Determine whether the needed number of slots (req) are available,
@@ -226,6 +235,18 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);
 
 void xenvif_stop_queue(struct xenvif *vif);
 
+static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
+{
+	return MAX_PENDING_REQS -
+		vif->pending_prod + vif->pending_cons;
+}
+
+static inline bool xenvif_tx_pending_slots_available(struct xenvif *vif)
+{
+	return nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX
+		< MAX_PENDING_REQS;
+}
+
 extern bool separate_tx_rx_irq;
 
 #endif /* __XEN_NETBACK__COMMON_H__ */
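The helper above centralizes a check that the TX path performs
repeatedly. As a rough illustration of how a caller might use it (a
sketch only: the loop below is hypothetical, not the actual netback
code; RING_HAS_UNCONSUMED_REQUESTS is a real macro from the Xen
shared-ring headers):

static void xenvif_tx_build_gops_sketch(struct xenvif *vif)
{
	/* Stop consuming requests once a maximum-sized packet might no
	 * longer fit into the pending ring.
	 */
	while (xenvif_tx_pending_slots_available(vif) &&
	       RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)) {
		/* ... consume one tx request into the pending ring ... */
	}
}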
From 3e2234b3149f66bc4be2343a3a0f637d922e4a36 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss
Date: Thu, 6 Mar 2014 21:48:25 +0000
Subject: xen-netback: Handle foreign mapped pages on the guest RX path

The RX path needs to know whether the SKB fragments are stored on pages
from another domain. Logically this patch should come after the one
introducing the grant mapping itself, as it only makes sense after
that; but to keep the series bisectable, it is moved here. It doesn't
change any functionality. xenvif_zerocopy_callback and ubuf_to_vif are
just stubs here; they will be introduced properly later on.

Signed-off-by: Zoltan Kiss
Signed-off-by: David S. Miller
---
 drivers/net/xen-netback/common.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 9d3584545e5d..8f264df8818a 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -247,6 +247,9 @@ static inline bool xenvif_tx_pending_slots_available(struct xenvif *vif)
 		< MAX_PENDING_REQS;
 }
 
+/* Callback from stack when TX packet can be released */
+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
+
 extern bool separate_tx_rx_irq;
 
 #endif /* __XEN_NETBACK__COMMON_H__ */
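For context, the prototype above plugs into the generic skb zerocopy
machinery: the stack keeps a struct ubuf_info pointer in
skb_shinfo(skb)->destructor_arg and invokes its callback member when
the skb's pages are released. A minimal sketch of that mechanism
(illustrative function, not netback code):

static void release_zerocopy_sketch(struct sk_buff *skb, bool success)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
		struct ubuf_info *uarg = shinfo->destructor_arg;

		/* for netback TX skbs this is xenvif_zerocopy_callback */
		uarg->callback(uarg, success);
	}
}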
From f53c3fe8dad725b014e9c7682720d8e3e2a8a5b3 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss
Date: Thu, 6 Mar 2014 21:48:26 +0000
Subject: xen-netback: Introduce TX grant mapping

This patch introduces grant mapping on the netback TX path. It replaces
grant copy operations, ditching grant copy coalescing along the way.
Another solution for copy coalescing is introduced in "xen-netback:
Handle guests with too many frags", as older guests and Windows could
break before that patch is applied.

There is a callback (xenvif_zerocopy_callback) from the core stack to
release the slots back to the guest when kfree_skb or skb_orphan_frags
is called. It feeds a separate dealloc thread: scheduling the NAPI
instance from the callback would be inefficient, therefore we can't do
the dealloc from the NAPI instance itself.

Signed-off-by: Zoltan Kiss
Signed-off-by: David S. Miller
---
 drivers/net/xen-netback/common.h | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 8f264df8818a..5a991266a394 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -79,6 +79,17 @@ struct pending_tx_info {
 				  * if it is head of one or more tx
 				  * reqs
 				  */
+	/* Callback data for released SKBs. The callback is always
+	 * xenvif_zerocopy_callback, desc contains the pending_idx, which is
+	 * also an index in pending_tx_info array. It is initialized in
+	 * xenvif_alloc and it never changes.
+	 * skb_shinfo(skb)->destructor_arg points to the first mapped slot's
+	 * callback_struct in this array of struct pending_tx_info's, then ctx
+	 * to the next, or NULL if there is no more slot for this skb.
+	 * ubuf_to_vif is a helper which finds the struct xenvif from a pointer
+	 * to this field.
+	 */
+	struct ubuf_info callback_struct;
 };
 
 #define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
@@ -135,13 +146,31 @@ struct xenvif {
 	pending_ring_idx_t pending_cons;
 	u16 pending_ring[MAX_PENDING_REQS];
 	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+	grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
 
 	/* Coalescing tx requests before copying makes number of grant
 	 * copy ops greater or equal to number of slots required. In
 	 * worst case a tx request consumes 2 gnttab_copy.
 	 */
 	struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
-
+	struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+	struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
+	/* passed to gnttab_[un]map_refs with pages under (un)mapping */
+	struct page *pages_to_map[MAX_PENDING_REQS];
+	struct page *pages_to_unmap[MAX_PENDING_REQS];
+
+	/* This prevents zerocopy callbacks to race over dealloc_ring */
+	spinlock_t callback_lock;
+	/* This prevents dealloc thread and NAPI instance to race over response
+	 * creation and pending_ring in xenvif_idx_release. In xenvif_tx_err
+	 * it only protect response creation
+	 */
+	spinlock_t response_lock;
+	pending_ring_idx_t dealloc_prod;
+	pending_ring_idx_t dealloc_cons;
+	u16 dealloc_ring[MAX_PENDING_REQS];
+	struct task_struct *dealloc_task;
+	wait_queue_head_t dealloc_wq;
 
 	/* Use kthread for guest RX */
 	struct task_struct *task;
@@ -228,6 +257,8 @@ int xenvif_tx_action(struct xenvif *vif, int budget);
 int xenvif_kthread_guest_rx(void *data);
 void xenvif_kick_thread(struct xenvif *vif);
 
+int xenvif_dealloc_kthread(void *data);
+
 /* Determine whether the needed number of slots (req) are available,
  * and set req_event if not.
  */
@@ -235,6 +266,12 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);
 
 void xenvif_stop_queue(struct xenvif *vif);
 
+/* Callback from stack when TX packet can be released */
+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
+
+/* Unmap a pending page and release it back to the guest */
+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
+
 static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
 {
 	return MAX_PENDING_REQS -
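The comment on callback_struct above pins down how ubuf_to_vif can
work: desc holds the pending_idx, and callback_struct lives inside
vif->pending_tx_info[pending_idx]. A sketch consistent with that
description (the in-tree helper may differ in detail):

static inline struct xenvif *ubuf_to_vif_sketch(struct ubuf_info *ubuf)
{
	u16 pending_idx = ubuf->desc;
	struct pending_tx_info *info =
		container_of(ubuf, struct pending_tx_info, callback_struct);

	/* step back to pending_tx_info[0], then up to the enclosing vif */
	return container_of(info - pending_idx, struct xenvif,
			    pending_tx_info[0]);
}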
From 62bad3199a4c20505fc36c169deef20b25e17c5f Mon Sep 17 00:00:00 2001
From: Zoltan Kiss
Date: Thu, 6 Mar 2014 21:48:27 +0000
Subject: xen-netback: Remove old TX grant copy definitons and fix indentations

These became obsolete with grant mapping. I've intentionally left the
indentation this way, to improve the readability of the previous
patches.

NOTE: if bisect brought you here, you should apply the series up until
"xen-netback: Timeout packets in RX path", otherwise Windows guests
can't work properly and malicious guests can block other guests by not
releasing their sent packets.

Signed-off-by: Zoltan Kiss
Signed-off-by: David S. Miller
---
 drivers/net/xen-netback/common.h | 36 +-----------------------------------
 1 file changed, 1 insertion(+), 35 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 5a991266a394..49109afa2253 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -48,37 +48,8 @@ typedef unsigned int pending_ring_idx_t;
 
 #define INVALID_PENDING_RING_IDX (~0U)
 
-/* For the head field in pending_tx_info: it is used to indicate
- * whether this tx info is the head of one or more coalesced requests.
- *
- * When head != INVALID_PENDING_RING_IDX, it means the start of a new
- * tx requests queue and the end of previous queue.
- *
- * An example sequence of head fields (I = INVALID_PENDING_RING_IDX):
- *
- * ...|0 I I I|5 I|9 I I I|...
- * -->|<-INUSE----------------
- *
- * After consuming the first slot(s) we have:
- *
- * ...|V V V V|5 I|9 I I I|...
- * -----FREE->|<-INUSE--------
- *
- * where V stands for "valid pending ring index". Any number other
- * than INVALID_PENDING_RING_IDX is OK. These entries are considered
- * free and can contain any number other than
- * INVALID_PENDING_RING_IDX. In practice we use 0.
- *
- * The in use non-INVALID_PENDING_RING_IDX (say 0, 5 and 9 in the
- * above example) number is the index into pending_tx_info and
- * mmap_pages arrays.
- */
 struct pending_tx_info {
-	struct xen_netif_tx_request req; /* coalesced tx request */
-	pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX
-				  * if it is head of one or more tx
-				  * reqs
-				  */
+	struct xen_netif_tx_request req; /* tx request */
 	/* Callback data for released SKBs. The callback is always
 	 * xenvif_zerocopy_callback, desc contains the pending_idx, which is
 	 * also an index in pending_tx_info array. It is initialized in
@@ -148,11 +119,6 @@ struct xenvif {
 	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
 	grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
 
-	/* Coalescing tx requests before copying makes number of grant
-	 * copy ops greater or equal to number of slots required. In
-	 * worst case a tx request consumes 2 gnttab_copy.
-	 */
-	struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
 	struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
 	struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
 	/* passed to gnttab_[un]map_refs with pages under (un)mapping */

From 1bb332af4cd889e4b64dacbf4a793ceb3a70445d Mon Sep 17 00:00:00 2001
From: Zoltan Kiss
Date: Thu, 6 Mar 2014 21:48:28 +0000
Subject: xen-netback: Add stat counters for zerocopy

These counters help determine how often the buffers had to be copied.
They also help find out whether packets are leaked: if
sent != success + fail, some packets were probably never freed up
properly.

NOTE: if bisect brought you here, you should apply the series up until
"xen-netback: Timeout packets in RX path", otherwise Windows guests
can't work properly and malicious guests can block other guests by not
releasing their sent packets.

Signed-off-by: Zoltan Kiss
Signed-off-by: David S. Miller
---
 drivers/net/xen-netback/common.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 49109afa2253..683d30160a7c 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -179,6 +179,9 @@ struct xenvif {
 
 	/* Statistics */
 	unsigned long rx_gso_checksum_fixup;
+	unsigned long tx_zerocopy_sent;
+	unsigned long tx_zerocopy_success;
+	unsigned long tx_zerocopy_fail;
 
 	/* Miscellaneous private stuff. */
 	struct net_device *dev;
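Counters like these are normally surfaced through the driver's ethtool
statistics. A sketch of the corresponding entries, modelled on the
existing xenvif_stats[] table in interface.c (the table shown here is
an assumption, not the exact upstream hunk):

static const struct {
	char name[ETH_GSTRING_LEN];
	u16 offset;
} xenvif_stats_sketch[] = {
	{ "tx_zerocopy_sent",
	  offsetof(struct xenvif, tx_zerocopy_sent) },
	{ "tx_zerocopy_success",
	  offsetof(struct xenvif, tx_zerocopy_success) },
	{ "tx_zerocopy_fail",
	  offsetof(struct xenvif, tx_zerocopy_fail) },
};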
From e3377f36ca20a034dce56335dc9b89f41094d845 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss
Date: Thu, 6 Mar 2014 21:48:29 +0000
Subject: xen-netback: Handle guests with too many frags

The Xen network protocol had an implicit dependency on MAX_SKB_FRAGS.
Netback has to handle guests sending up to XEN_NETBK_LEGACY_SLOTS_MAX
slots. To achieve that:

- create a new skb
- map the leftover slots to its frags (no linear buffer here!)
- chain it to the previous one through skb_shinfo(skb)->frag_list
- map them
- copy and coalesce the frags into a brand new skb and send it to the
  stack
- unmap the two old skbs' pages

It also introduces new stat counters, which help determine how often
the guest sends a packet with more than MAX_SKB_FRAGS frags.

NOTE: if bisect brought you here, you should apply the series up until
"xen-netback: Timeout packets in RX path", otherwise malicious guests
can block other guests by not releasing their sent packets.

Signed-off-by: Zoltan Kiss
Signed-off-by: David S. Miller
---
 drivers/net/xen-netback/common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 683d30160a7c..f2f8a02afc36 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -182,6 +182,7 @@ struct xenvif {
 	unsigned long tx_zerocopy_sent;
 	unsigned long tx_zerocopy_success;
 	unsigned long tx_zerocopy_fail;
+	unsigned long tx_frag_overflow;
 
 	/* Miscellaneous private stuff. */
 	struct net_device *dev;

From 093507885ae5dc0288af07fbb922d2f85b3a88a6 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss
Date: Thu, 6 Mar 2014 21:48:30 +0000
Subject: xen-netback: Timeout packets in RX path

A malicious or buggy guest can leave its queue filled indefinitely, in
which case the qdisc starts to queue packets for that VIF. If those
packets came from another guest, it can block that guest's slots and
prevent shutdown. To avoid that, we make sure the queue is drained
every 10 seconds. In the worst case the qdisc queue usually takes 3
rounds to flush.

Signed-off-by: Zoltan Kiss
Signed-off-by: David S. Miller
---
 drivers/net/xen-netback/common.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index f2f8a02afc36..0355f8767e3b 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -148,6 +148,9 @@ struct xenvif {
 	struct xen_netif_rx_back_ring rx;
 	struct sk_buff_head rx_queue;
 	RING_IDX rx_last_skb_slots;
+	bool rx_queue_purge;
+
+	struct timer_list wake_queue;
 
 	/* This array is allocated seperately as it is large */
 	struct gnttab_copy *grant_copy_op;
@@ -259,4 +262,7 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
 
 extern bool separate_tx_rx_irq;
 
+extern unsigned int rx_drain_timeout_msecs;
+extern unsigned int rx_drain_timeout_jiffies;
+
 #endif /* __XEN_NETBACK__COMMON_H__ */
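The intended interplay of the new fields: when the shared RX ring stays
full, arm wake_queue for rx_drain_timeout_jiffies; if it fires, flag
the queue for purging and kick the RX kthread so it drops the stale
skbs instead of waiting forever. A sketch under those assumptions
(pre-4.15 timer API; both function names are illustrative):

static void xenvif_wake_queue_sketch(unsigned long data)
{
	struct xenvif *vif = (struct xenvif *)data;

	vif->rx_queue_purge = true;	/* RX kthread drops the queue */
	xenvif_kick_thread(vif);	/* declared in this header */
}

/* armed when the guest RX ring has no room for the next skb */
static void xenvif_arm_drain_timer_sketch(struct xenvif *vif)
{
	mod_timer(&vif->wake_queue,
		  jiffies + rx_drain_timeout_jiffies);
}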
From e9275f5e2df1b2098a8cc405d87b88b9affd73e6 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss
Date: Thu, 6 Mar 2014 21:48:31 +0000
Subject: xen-netback: Aggregate TX unmap operations

Unmapping causes TLB flushing, therefore we should do it in the largest
possible batches. However, we shouldn't starve the guest for too long.
So if the guest has space for at least two big packets and we don't
have at least a quarter of the ring to unmap, delay it for at most 1
millisecond.

Signed-off-by: Zoltan Kiss
Signed-off-by: David S. Miller
---
 drivers/net/xen-netback/common.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 0355f8767e3b..bef37be402b8 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -137,6 +137,8 @@ struct xenvif {
 	u16 dealloc_ring[MAX_PENDING_REQS];
 	struct task_struct *dealloc_task;
 	wait_queue_head_t dealloc_wq;
+	struct timer_list dealloc_delay;
+	bool dealloc_delay_timed_out;
 
 	/* Use kthread for guest RX */
 	struct task_struct *task;

From 397dfd9f93ccfe71660eafbaac651a96195c24ed Mon Sep 17 00:00:00 2001
From: Zoltan Kiss
Date: Fri, 21 Mar 2014 17:23:04 +0000
Subject: Revert "xen-netback: Aggregate TX unmap operations"

This reverts commit e9275f5e2df1b2098a8cc405d87b88b9affd73e6. That
commit is the last in the netback grant mapping series, and it tries
to do more aggressive aggregation of unmap operations. However,
practical use showed almost no positive effect, whilst with certain
frontends it caused a significant performance regression.

Signed-off-by: Zoltan Kiss
Acked-by: Ian Campbell
Signed-off-by: David S. Miller
---
 drivers/net/xen-netback/common.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index bef37be402b8..0355f8767e3b 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -137,8 +137,6 @@ struct xenvif {
 	u16 dealloc_ring[MAX_PENDING_REQS];
 	struct task_struct *dealloc_task;
 	wait_queue_head_t dealloc_wq;
-	struct timer_list dealloc_delay;
-	bool dealloc_delay_timed_out;
 
 	/* Use kthread for guest RX */
 	struct task_struct *task;

From 869b9b19b3affd81cee853d33c0b124797f3c387 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss
Date: Mon, 24 Mar 2014 23:59:49 +0000
Subject: xen-netback: Stop using xenvif_tx_pending_slots_available

Since the early days TX stops if there aren't enough free pending slots
to consume a maximum-sized (slot-wise) packet. Probably the reason for
that was to avoid the case when we don't have enough free pending slots
in the ring to finish the packet. But if we make sure that the pending
ring has the same size as the shared ring, that shouldn't really
happen. The frontend can only post packets which fit into the free
space of the shared ring; if a packet doesn't fit, the frontend has to
stop, as it can only increase req_prod when the whole packet fits onto
the ring.

This patch removes this check, makes sure the two rings have the same
size, and removes a check from the callback. As we no longer stop the
NAPI instance on this condition, we don't have to wake it up when we
free pending slots.

Signed-off-by: Zoltan Kiss
Signed-off-by: David S. Miller
---
 drivers/net/xen-netback/common.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 0355f8767e3b..89b2d429c440 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -81,7 +81,7 @@ struct xenvif_rx_meta {
 
 #define MAX_BUFFER_OFFSET PAGE_SIZE
 
-#define MAX_PENDING_REQS 256
+#define MAX_PENDING_REQS XEN_NETIF_TX_RING_SIZE
 
 /* It's possible for an skb to have a maximal number of frags
  * but still be less than MAX_BUFFER_OFFSET in size. Thus the
@@ -251,12 +251,6 @@ static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
 	vif->pending_prod + vif->pending_cons;
 }
 
-static inline bool xenvif_tx_pending_slots_available(struct xenvif *vif)
-{
-	return nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX
-		< MAX_PENDING_REQS;
-}
-
 /* Callback from stack when TX packet can be released */
 void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
 
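The reasoning above hinges on one invariant: the pending ring must be
able to absorb every slot the shared TX ring can carry. A compile-time
assertion, placed for example in the module init path, captures it
(placement and form here are a sketch):

static int __init netback_init_sketch(void)
{
	/* With MAX_PENDING_REQS defined as XEN_NETIF_TX_RING_SIZE the
	 * frontend can never post more slots than the pending ring can
	 * hold, so the old "pending slots available" throttle is
	 * redundant.
	 */
	BUILD_BUG_ON(MAX_PENDING_REQS > XEN_NETIF_TX_RING_SIZE);
	return 0;
}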
From e9d8b2c2968499c1f96563e6522c56958d5a1d0d Mon Sep 17 00:00:00 2001
From: Wei Liu
Date: Tue, 1 Apr 2014 12:46:12 +0100
Subject: xen-netback: disable rogue vif in kthread context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When netback discovers that the frontend is sending malformed packets,
it disables the interface which serves that frontend. However,
disabling a network interface involves taking a mutex, which cannot be
done in softirq context, so we need to defer this process to kthread
context.

This patch does the following:
1. introduce a flag to indicate the interface is disabled.
2. check that flag in TX path, don't do any work if it's true.
3. check that flag in RX path, turn off that interface if it's true.

The reason to disable it in the RX path is that RX uses a kthread.
After this change the behavior of netback is still consistent -- it
won't do any TX work for a rogue frontend, and the interface will
eventually be turned off.

Also change a "continue" to "break" after xenvif_fatal_tx_err, as it
doesn't make sense to continue processing packets if the frontend is
rogue.

This is a fix for XSA-90.

Reported-by: Török Edwin
Signed-off-by: Wei Liu
Cc: Ian Campbell
Reviewed-by: David Vrabel
Acked-by: Ian Campbell
Signed-off-by: David S. Miller
---
 drivers/net/xen-netback/common.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 89b2d429c440..89d1d0556b6e 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -104,6 +104,11 @@ struct xenvif {
 	domid_t domid;
 	unsigned int handle;
 
+	/* Is this interface disabled? True when backend discovers
+	 * frontend is rogue.
+	 */
+	bool disabled;
+
 	/* Use NAPI for guest TX */
 	struct napi_struct napi;
 	/* When feature-split-event-channels = 0, tx_irq = rx_irq. */
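To illustrate points 2 and 3 of the list above, the flag might be
consumed roughly as follows (sketch only; the real checks live in
netback.c and interface.c):

/* TX path (NAPI, softirq context): refuse work for a disabled vif */
static bool xenvif_tx_allowed_sketch(struct xenvif *vif)
{
	return likely(!vif->disabled);
}

/* RX kthread context: may sleep, so turning the interface off is safe */
static void xenvif_check_rogue_sketch(struct xenvif *vif)
{
	if (unlikely(vif->disabled) && netif_carrier_ok(vif->dev))
		xenvif_carrier_off(vif);	/* declared in this header */
}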