From a0e17064d43e445181bc004d949a4855ea8ccf9c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:17 +0200 Subject: mptcp: move common nospace-pattern to a helper Paolo noticed that ssk_check_wmem() has same pattern, so add/use common helper for both places. Suggested-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1f52a0fa31ed..0413454fcdaf 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -653,6 +653,15 @@ out: return ret; } +static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock) +{ + clear_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ + + /* enables sk->write_space() callbacks */ + set_bit(SOCK_NOSPACE, &sock->flags); +} + static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -666,13 +675,8 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) if (!sk_stream_memory_free(ssk)) { struct socket *sock = ssk->sk_socket; - if (sock) { - clear_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); - - /* enables sk->write_space() callbacks */ - set_bit(SOCK_NOSPACE, &sock->flags); - } + if (sock) + mptcp_nospace(msk, sock); return NULL; } @@ -698,13 +702,8 @@ static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) return; sock = READ_ONCE(ssk->sk_socket); - - if (sock) { - clear_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); - /* set NOSPACE only after clearing SEND_SPACE flag */ - set_bit(SOCK_NOSPACE, &sock->flags); - } + if (sock) + mptcp_nospace(msk, sock); } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) -- cgit v1.2.3 From fb529e62d3f3e85001108213dc323c35f2765575 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:18 +0200 Subject: mptcp: break and restart in case mptcp sndbuf is full Its not enough to check for available tcp send space. We also hold on to transmitted data for mptcp-level retransmits. Right now we will send more and more data if the peer can ack data at the tcp level fast enough, since that frees up tcp send buffer space. But we also need to check that data was acked and reclaimed at the mptcp level. Therefore add needed check in mptcp_sendmsg, flush tcp data and wait until more mptcp snd space becomes available if we are over the limit. Before we wait for more data, also make sure we start the retransmit timer if we ran out of sndbuf space. Otherwise there is a very small chance that we wait forever: * receiver is waiting for data * sender is blocked because mptcp socket buffer is full * at tcp level, all data was acked * mptcp-level snd_una was not updated, because last ack that acknowledged the last data packet carried an older MPTCP-ack. Restarting the retransmit timer avoids this problem: if TCP subflow is idle, data is retransmitted from the RTX queue. New data will make the peer send a new, updated MPTCP-Ack. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 0413454fcdaf..75be8d662ac5 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -739,9 +739,23 @@ fallback: mptcp_clean_una(sk); +wait_for_sndbuf: __mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); while (!sk_stream_memory_free(sk) || !ssk) { + if (ssk) { + /* make sure retransmit timer is + * running before we wait for memory. + * + * The retransmit timer might be needed + * to make the peer send an up-to-date + * MPTCP Ack. + */ + mptcp_set_timeout(sk, ssk); + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + } + ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto out; @@ -776,6 +790,28 @@ fallback: } copied += ret; + + /* memory is charged to mptcp level socket as well, i.e. + * if msg is very large, mptcp socket may run out of buffer + * space. mptcp_clean_una() will release data that has + * been acked at mptcp level in the mean time, so there is + * a good chance we can continue sending data right away. + */ + if (unlikely(!sk_stream_memory_free(sk))) { + tcp_push(ssk, msg->msg_flags, mss_now, + tcp_sk(ssk)->nonagle, size_goal); + mptcp_clean_una(sk); + if (!sk_stream_memory_free(sk)) { + /* can't send more for now, need to wait for + * MPTCP-level ACKs from peer. + * + * Wakeup will happen via mptcp_clean_una(). + */ + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + goto wait_for_sndbuf; + } + } } mptcp_set_timeout(sk, ssk); -- cgit v1.2.3 From 72511aab95c94d7c0f03d0b7db5df47fdca059f6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:19 +0200 Subject: mptcp: avoid blocking in tcp_sendpages The transmit loop continues to xmit new data until an error is returned or all data was transmitted. For the blocking i/o case, this means that tcp_sendpages() may block on the subflow until more space becomes available, i.e. we end up sleeping with the mptcp socket lock held. Instead we should check if a different subflow is ready to be used. This restarts the subflow sk lookup when the tx operation succeeded and the tcp subflow can't accept more data or if tcp_sendpages indicates -EAGAIN on a blocking mptcp socket. In that case we also need to set the NOSPACE bit to make sure we get notified once memory becomes available. In case all subflows are busy, the existing logic will wait until a subflow is ready, releasing the mptcp socket lock while doing so. The mptcp worker already sets DONTWAIT, so no need to make changes there. v2: * set NOSPACE bit * add a comment to clarify that mptcp-sk sndbuf limits need to be checked as well. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 75be8d662ac5..e97357066b21 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -590,7 +590,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * access the skb after the sendpages call */ ret = do_tcp_sendpages(ssk, page, offset, psize, - msg->msg_flags | MSG_SENDPAGE_NOTLAST); + msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT); if (ret <= 0) return ret; @@ -713,6 +713,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct socket *ssock; size_t copied = 0; struct sock *ssk; + bool tx_ok; long timeo; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) @@ -737,6 +738,7 @@ fallback: return ret >= 0 ? ret + copied : (copied ? copied : ret); } +restart: mptcp_clean_una(sk); wait_for_sndbuf: @@ -772,11 +774,18 @@ wait_for_sndbuf: pr_debug("conn_list->subflow=%p", ssk); lock_sock(ssk); - while (msg_data_left(msg)) { + tx_ok = msg_data_left(msg); + while (tx_ok) { ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now, &size_goal); - if (ret < 0) + if (ret < 0) { + if (ret == -EAGAIN && timeo > 0) { + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + goto restart; + } break; + } if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { /* Can happen for passive sockets: * 3WHS negotiated MPTCP, but first packet after is @@ -791,11 +800,31 @@ wait_for_sndbuf: copied += ret; + tx_ok = msg_data_left(msg); + if (!tx_ok) + break; + + if (!sk_stream_memory_free(ssk)) { + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + tcp_push(ssk, msg->msg_flags, mss_now, + tcp_sk(ssk)->nonagle, size_goal); + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + goto restart; + } + /* memory is charged to mptcp level socket as well, i.e. * if msg is very large, mptcp socket may run out of buffer * space. mptcp_clean_una() will release data that has * been acked at mptcp level in the mean time, so there is * a good chance we can continue sending data right away. + * + * Normally, when the tcp subflow can accept more data, then + * so can the MPTCP socket. However, we need to cope with + * peers that might lag behind in their MPTCP-level + * acknowledgements, i.e. data might have been acked at + * tcp level only. So, we must also check the MPTCP socket + * limits before we send more data. */ if (unlikely(!sk_stream_memory_free(sk))) { tcp_push(ssk, msg->msg_flags, mss_now, -- cgit v1.2.3 From 149f7c71e2c710a8ced836421a631953c9f84aa3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:20 +0200 Subject: mptcp: fill skb extension cache outside of mptcp_sendmsg_frag The mptcp_sendmsg_frag helper contains a loop that will wait on the subflow sk. It seems preferrable to only wait in mptcp_sendmsg() when blocking io is requested. mptcp_sendmsg already has such a wait loop that is used when no subflow socket is available for transmission. This is a preparation patch that makes sure we call mptcp_sendmsg_frag only if a skb extension has been allocated. Moreover, such allocation currently uses GFP_ATOMIC while it could use sleeping allocation instead. Followup patches will remove the wait loop from mptcp_sendmsg_frag() and will allow to do a sleeping allocation for the extension. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e97357066b21..1bdfbca1c23a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -669,6 +669,9 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) sock_owned_by_me((const struct sock *)msk); + if (!mptcp_ext_cache_refill(msk)) + return NULL; + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -804,7 +807,8 @@ wait_for_sndbuf: if (!tx_ok) break; - if (!sk_stream_memory_free(ssk)) { + if (!sk_stream_memory_free(ssk) || + !mptcp_ext_cache_refill(msk)) { set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal); @@ -1158,7 +1162,7 @@ static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *ssk, *sk = &msk->sk.icsk_inet.sk; - int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0; + int orig_len, orig_offset, mss_now = 0, size_goal = 0; struct mptcp_data_frag *dfrag; u64 orig_write_seq; size_t copied = 0; @@ -1180,6 +1184,9 @@ static void mptcp_worker(struct work_struct *work) if (!dfrag) goto unlock; + if (!mptcp_ext_cache_refill(msk)) + goto reset_unlock; + ssk = mptcp_subflow_get_retrans(msk); if (!ssk) goto reset_unlock; @@ -1191,8 +1198,8 @@ static void mptcp_worker(struct work_struct *work) orig_offset = dfrag->offset; orig_write_seq = dfrag->data_seq; while (dfrag->data_len > 0) { - ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now, - &size_goal); + int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, + &mss_now, &size_goal); if (ret < 0) break; @@ -1200,6 +1207,9 @@ static void mptcp_worker(struct work_struct *work) copied += ret; dfrag->data_len -= ret; dfrag->offset += ret; + + if (!mptcp_ext_cache_refill(msk)) + break; } if (copied) tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle, -- cgit v1.2.3 From 17091708d1e503383f20934631305ccb375b0eb1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:21 +0200 Subject: mptcp: fill skb page frag cache outside of mptcp_sendmsg_frag The mptcp_sendmsg_frag helper contains a loop that will wait on the subflow sk. It seems preferrable to only wait in mptcp_sendmsg() when blocking io is requested. mptcp_sendmsg already has such a wait loop that is used when no subflow socket is available for transmission. This is another preparation patch that makes sure we call mptcp_sendmsg_frag only if the page frag cache has been refilled. Followup patch will remove the wait loop from mptcp_sendmsg_frag(). The retransmit worker doesn't need to do this refill as it won't transmit new mptcp-level data. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1bdfbca1c23a..a11e51222e59 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -713,6 +713,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); + struct page_frag *pfrag; struct socket *ssock; size_t copied = 0; struct sock *ssk; @@ -741,13 +742,16 @@ fallback: return ret >= 0 ? ret + copied : (copied ? copied : ret); } + pfrag = sk_page_frag(sk); restart: mptcp_clean_una(sk); wait_for_sndbuf: __mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); - while (!sk_stream_memory_free(sk) || !ssk) { + while (!sk_stream_memory_free(sk) || + !ssk || + !mptcp_page_frag_refill(ssk, pfrag)) { if (ssk) { /* make sure retransmit timer is * running before we wait for memory. @@ -808,6 +812,7 @@ wait_for_sndbuf: break; if (!sk_stream_memory_free(ssk) || + !mptcp_page_frag_refill(ssk, pfrag) || !mptcp_ext_cache_refill(msk)) { set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_push(ssk, msg->msg_flags, mss_now, -- cgit v1.2.3 From 5c8264435d4f6a056ac926989a827aba1961e3c8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:22 +0200 Subject: mptcp: remove inner wait loop from mptcp_sendmsg_frag previous patches made sure we only call into this function when these prerequisites are met, so no need to wait on the subflow socket anymore. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/7 Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a11e51222e59..bc950cf818f7 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -510,20 +510,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * fooled into a warning if we don't init here */ pfrag = sk_page_frag(sk); - while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) || - !mptcp_ext_cache_refill(msk)) { - ret = sk_stream_wait_memory(ssk, timeo); - if (ret) - return ret; - - /* if sk_stream_wait_memory() sleeps snd_una can change - * significantly, refresh the rtx queue - */ - mptcp_clean_una(sk); - - if (unlikely(__mptcp_needs_tcp_fallback(msk))) - return 0; - } if (!retransmission) { write_seq = &msk->write_seq; page = pfrag->page; -- cgit v1.2.3 From 4930f4831b1547b52c5968e9307fe3d840d7fba0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:23 +0200 Subject: net: allow __skb_ext_alloc to sleep mptcp calls this from the transmit side, from process context. Allow a sleeping allocation instead of unconditional GFP_ATOMIC. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 8 +++++--- net/mptcp/protocol.c | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3000c526f552..531843952809 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4165,7 +4165,7 @@ struct skb_ext { char data[] __aligned(8); }; -struct skb_ext *__skb_ext_alloc(void); +struct skb_ext *__skb_ext_alloc(gfp_t flags); void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1bf0c3d278e7..35a133c6d13b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6087,13 +6087,15 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) /** * __skb_ext_alloc - allocate a new skb extensions storage * + * @flags: See kmalloc(). + * * Returns the newly allocated pointer. The pointer can later attached to a * skb via __skb_ext_set(). * Note: caller must handle the skb_ext as an opaque data. */ -struct skb_ext *__skb_ext_alloc(void) +struct skb_ext *__skb_ext_alloc(gfp_t flags) { - struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); if (new) { memset(new->offset, 0, sizeof(new->offset)); @@ -6188,7 +6190,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); - new = __skb_ext_alloc(); + new = __skb_ext_alloc(GFP_ATOMIC); if (!new) return NULL; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bc950cf818f7..e3a628bea2b8 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -367,8 +367,10 @@ static void mptcp_stop_timer(struct sock *sk) static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) { + const struct sock *sk = (const struct sock *)msk; + if (!msk->cached_ext) - msk->cached_ext = __skb_ext_alloc(); + msk->cached_ext = __skb_ext_alloc(sk->sk_allocation); return !!msk->cached_ext; } -- cgit v1.2.3