From e370a7236321773245c5522d8bb299380830d3b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Aug 2013 14:37:32 -0700 Subject: af_unix: improve STREAM behavior with fragmented memory unix_stream_sendmsg() currently uses order-2 allocations, and we had numerous reports this can fail. The __GFP_REPEAT flag present in sock_alloc_send_pskb() is not helping. This patch extends the work done in commit eb6a24816b247c ("af_unix: reduce high order page allocations) for datagram sockets. This opens the possibility of zero copy IO (splice() and friends) The trick is to not use skb_pull() anymore in recvmsg() path, and instead add a @consumed field in UNIXCB() to track amount of already read payload in the skb. There is a performance regression for large sends because of extra page allocations that will be addressed in a follow-up patch, allowing sock_alloc_send_pskb() to attempt high order page allocations. Signed-off-by: Eric Dumazet Cc: David Rientjes Signed-off-by: David S. Miller --- net/unix/af_unix.c | 65 +++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 35 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c4ce243824bb..99dc760cdd95 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1596,6 +1596,10 @@ out: return err; } +/* We use paged skbs for stream sockets, and limit occupancy to 32768 + * bytes, and a minimun of a full page. + */ +#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, struct msghdr *msg, size_t len) @@ -1609,6 +1613,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, struct scm_cookie tmp_scm; bool fds_sent = false; int max_level; + int data_len; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1635,40 +1640,21 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, goto pipe_err; while (sent < len) { - /* - * Optimisation for the fact that under 0.01% of X - * messages typically need breaking up. - */ - - size = len-sent; + size = len - sent; /* Keep two messages in the pipe so it schedules better */ - if (size > ((sk->sk_sndbuf >> 1) - 64)) - size = (sk->sk_sndbuf >> 1) - 64; + size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); - if (size > SKB_MAX_ALLOC) - size = SKB_MAX_ALLOC; - - /* - * Grab a buffer - */ + /* allow fallback to order-0 allocations */ + size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); - skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, - &err); + data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); - if (skb == NULL) + skb = sock_alloc_send_pskb(sk, size - data_len, data_len, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) goto out_err; - /* - * If you pass two values to the sock_alloc_send_skb - * it tries to grab the large buffer with GFP_NOFS - * (which can fail easily), and if it fails grab the - * fallback size buffer which is under a page and will - * succeed. [Alan] - */ - size = min_t(int, size, skb_tailroom(skb)); - - /* Only send the fds in the first buffer */ err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); if (err < 0) { @@ -1678,7 +1664,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, max_level = err + 1; fds_sent = true; - err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + skb_put(skb, size - data_len); + skb->data_len = data_len; + skb->len = size; + err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, size); if (err) { kfree_skb(skb); goto out_err; @@ -1890,6 +1879,11 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, return timeo; } +static unsigned int unix_skb_len(const struct sk_buff *skb) +{ + return skb->len - UNIXCB(skb).consumed; +} + static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) @@ -1977,8 +1971,8 @@ again: } skip = sk_peek_offset(sk, flags); - while (skip >= skb->len) { - skip -= skb->len; + while (skip >= unix_skb_len(skb)) { + skip -= unix_skb_len(skb); last = skb; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) @@ -2005,8 +1999,9 @@ again: sunaddr = NULL; } - chunk = min_t(unsigned int, skb->len - skip, size); - if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) { + chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); + if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip, + msg->msg_iov, chunk)) { if (copied == 0) copied = -EFAULT; break; @@ -2016,14 +2011,14 @@ again: /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { - skb_pull(skb, chunk); + UNIXCB(skb).consumed += chunk; sk_peek_offset_bwd(sk, chunk); if (UNIXCB(skb).fp) unix_detach_fds(siocb->scm, skb); - if (skb->len) + if (unix_skb_len(skb)) break; skb_unlink(skb, &sk->sk_receive_queue); @@ -2107,7 +2102,7 @@ long unix_inq_len(struct sock *sk) if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { skb_queue_walk(&sk->sk_receive_queue, skb) - amount += skb->len; + amount += unix_skb_len(skb); } else { skb = skb_peek(&sk->sk_receive_queue); if (skb) -- cgit v1.2.3 From 28d6427109d13b0f447cba5761f88d3548e83605 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Aug 2013 14:38:47 -0700 Subject: net: attempt high order allocations in sock_alloc_send_pskb() Adding paged frags skbs to af_unix sockets introduced a performance regression on large sends because of additional page allocations, even if each skb could carry at least 100% more payload than before. We can instruct sock_alloc_send_pskb() to attempt high order allocations. Most of the time, it does a single page allocation instead of 8. I added an additional parameter to sock_alloc_send_pskb() to let other users to opt-in for this new feature on followup patches. Tested: Before patch : $ netperf -t STREAM_STREAM STREAM STREAM TEST Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 2304 212992 212992 10.00 46861.15 After patch : $ netperf -t STREAM_STREAM STREAM STREAM TEST Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 2304 212992 212992 10.00 57981.11 Signed-off-by: Eric Dumazet Cc: David Rientjes Signed-off-by: David S. Miller --- net/unix/af_unix.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 99dc760cdd95..fee9e3397cd1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1479,7 +1479,8 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, MAX_SKB_FRAGS * PAGE_SIZE); skb = sock_alloc_send_pskb(sk, len - data_len, data_len, - msg->msg_flags & MSG_DONTWAIT, &err); + msg->msg_flags & MSG_DONTWAIT, &err, + PAGE_ALLOC_COSTLY_ORDER); if (skb == NULL) goto out; @@ -1651,7 +1652,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); skb = sock_alloc_send_pskb(sk, size - data_len, data_len, - msg->msg_flags & MSG_DONTWAIT, &err); + msg->msg_flags & MSG_DONTWAIT, &err, + get_order(UNIX_SKB_FRAGS_SZ)); if (!skb) goto out_err; -- cgit v1.2.3 From f3dfd20860db3d0c400dd83a378176a28d3662db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 11 Aug 2013 21:54:48 -0700 Subject: af_unix: fix bug on large send() commit e370a723632 ("af_unix: improve STREAM behavior with fragmented memory") added a bug on large send() because the skb_copy_datagram_from_iovec() call always start from the beginning of iovec. We must instead use the @sent variable to properly skip the already processed part. Reported-by: Hannes Frederic Sowa Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fee9e3397cd1..86de99ad2976 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1669,7 +1669,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, skb_put(skb, size - data_len); skb->data_len = data_len; skb->len = size; - err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, size); + err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, + sent, size); if (err) { kfree_skb(skb); goto out_err; -- cgit v1.2.3 From 90c6bd34f884cd9cee21f1d152baf6c18bcac949 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 17 Oct 2013 22:51:31 +0200 Subject: net: unix: inherit SOCK_PASS{CRED, SEC} flags from socket to fix race In the case of credentials passing in unix stream sockets (dgram sockets seem not affected), we get a rather sparse race after commit 16e5726 ("af_unix: dont send SCM_CREDENTIALS by default"). We have a stream server on receiver side that requests credential passing from senders (e.g. nc -U). Since we need to set SO_PASSCRED on each spawned/accepted socket on server side to 1 first (as it's not inherited), it can happen that in the time between accept() and setsockopt() we get interrupted, the sender is being scheduled and continues with passing data to our receiver. At that time SO_PASSCRED is neither set on sender nor receiver side, hence in cmsg's SCM_CREDENTIALS we get eventually pid:0, uid:65534, gid:65534 (== overflow{u,g}id) instead of what we actually would like to see. On the sender side, here nc -U, the tests in maybe_add_creds() invoked through unix_stream_sendmsg() would fail, as at that exact time, as mentioned, the sender has neither SO_PASSCRED on his side nor sees it on the server side, and we have a valid 'other' socket in place. Thus, sender believes it would just look like a normal connection, not needing/requesting SO_PASSCRED at that time. As reverting 16e5726 would not be an option due to the significant performance regression reported when having creds always passed, one way/trade-off to prevent that would be to set SO_PASSCRED on the listener socket and allow inheriting these flags to the spawned socket on server side in accept(). It seems also logical to do so if we'd tell the listener socket to pass those flags onwards, and would fix the race. Before, strace: recvmsg(4, {msg_name(0)=NULL, msg_iov(1)=[{"blub\n", 4096}], msg_controllen=32, {cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS{pid=0, uid=65534, gid=65534}}, msg_flags=0}, 0) = 5 After, strace: recvmsg(4, {msg_name(0)=NULL, msg_iov(1)=[{"blub\n", 4096}], msg_controllen=32, {cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS{pid=11580, uid=1000, gid=1000}}, msg_flags=0}, 0) = 5 Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Eric W. Biederman Signed-off-by: David S. Miller --- net/unix/af_unix.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 86de99ad2976..c1f403bed683 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1246,6 +1246,15 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) return 0; } +static void unix_sock_inherit_flags(const struct socket *old, + struct socket *new) +{ + if (test_bit(SOCK_PASSCRED, &old->flags)) + set_bit(SOCK_PASSCRED, &new->flags); + if (test_bit(SOCK_PASSSEC, &old->flags)) + set_bit(SOCK_PASSSEC, &new->flags); +} + static int unix_accept(struct socket *sock, struct socket *newsock, int flags) { struct sock *sk = sock->sk; @@ -1280,6 +1289,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags) /* attach accepted sock to socket */ unix_state_lock(tsk); newsock->state = SS_CONNECTED; + unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; -- cgit v1.2.3