From 97af84a6bba2ab2b9c704c08e67de3b5ea551bb2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Jan 2024 09:08:53 -0800 Subject: af_unix: Do not use atomic ops for unix_sk(sk)->inflight. When touching unix_sk(sk)->inflight, we are always under spin_lock(&unix_gc_lock). Let's convert unix_sk(sk)->inflight to the normal unsigned long. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240123170856.41348-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ac1f2bc18fc9..1e9378036dcc 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -993,11 +993,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; - u = unix_sk(sk); + u = unix_sk(sk); + u->inflight = 0; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ -- cgit v1.2.3 From d9f21b3613337b55cc9d4a6ead484dca68475143 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Jan 2024 09:08:56 -0800 Subject: af_unix: Try to run GC async. If more than 16000 inflight AF_UNIX sockets exist and the garbage collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc(). Also, they wait for unix_gc() to complete. In unix_gc(), all inflight AF_UNIX sockets are traversed at least once, and more if they are the GC candidate. Thus, sendmsg() significantly slows down with too many inflight AF_UNIX sockets. However, if a process sends data with no AF_UNIX FD, the sendmsg() call does not need to wait for GC. After this change, only the process that meets the condition below will be blocked under such a situation. 1) cmsg contains AF_UNIX socket 2) more than 32 AF_UNIX sent by the same user are still inflight Note that even a sendmsg() call that does not meet the condition but has AF_UNIX FD will be blocked later in unix_scm_to_skb() by the spinlock, but we allow that as a bonus for sane users. The results below are the time spent in unix_dgram_sendmsg() sending 1 byte of data with no FD 4096 times on a host where 32K inflight AF_UNIX sockets exist. Without series: the sane sendmsg() needs to wait gc unreasonably. $ sudo /usr/share/bcc/tools/funclatency -p 11165 unix_dgram_sendmsg Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end. ^C nsecs : count distribution [...] 524288 -> 1048575 : 0 | | 1048576 -> 2097151 : 3881 |****************************************| 2097152 -> 4194303 : 214 |** | 4194304 -> 8388607 : 1 | | avg = 1825567 nsecs, total: 7477526027 nsecs, count: 4096 With series: the sane sendmsg() can finish much faster. $ sudo /usr/share/bcc/tools/funclatency -p 8702 unix_dgram_sendmsg Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end. ^C nsecs : count distribution [...] 128 -> 255 : 0 | | 256 -> 511 : 4092 |****************************************| 512 -> 1023 : 2 | | 1024 -> 2047 : 0 | | 2048 -> 4095 : 0 | | 4096 -> 8191 : 1 | | 8192 -> 16383 : 1 | | avg = 410 nsecs, total: 1680510 nsecs, count: 4096 Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240123170856.41348-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1e9378036dcc..1720419d93d6 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1923,11 +1923,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, long timeo; int err; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; @@ -2199,11 +2200,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, bool fds_sent = false; int data_len; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) -- cgit v1.2.3 From 99a7a5b9943ea2d05fb0dee38e4ae2290477ed83 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 29 Jan 2024 11:04:35 -0800 Subject: af_unix: Remove CONFIG_UNIX_SCM. Originally, the code related to garbage collection was all in garbage.c. Commit f4e65870e5ce ("net: split out functions related to registering inflight socket files") moved some functions to scm.c for io_uring and added CONFIG_UNIX_SCM just in case AF_UNIX was built as module. However, since commit 97154bcf4d1b ("af_unix: Kconfig: make CONFIG_UNIX bool"), AF_UNIX is no longer built separately. Also, io_uring does not support SCM_RIGHTS now. Let's move the functions back to garbage.c Signed-off-by: Kuniyuki Iwashima Acked-by: Jens Axboe Link: https://lore.kernel.org/r/20240129190435.57228-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1720419d93d6..1cfbc586adb4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -118,8 +118,6 @@ #include #include -#include "scm.h" - static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; @@ -1790,6 +1788,52 @@ out: return err; } +/* The "user->unix_inflight" variable is protected by the garbage + * collection lock, and we just read it locklessly here. If you go + * over the limit, there might be a tiny race in actually noticing + * it across threads. Tough. + */ +static inline bool too_many_unix_fds(struct task_struct *p) +{ + struct user_struct *user = current_user(); + + if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); + return false; +} + +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + + /* Need to duplicate file references for the sake of garbage + * collection. Otherwise a socket in the fps might become a + * candidate for GC while the skb is not yet queued. + */ + UNIXCB(skb).fp = scm_fp_dup(scm->fp); + if (!UNIXCB(skb).fp) + return -ENOMEM; + + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->user, scm->fp->fp[i]); + + return 0; +} + +static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + + scm->fp = UNIXCB(skb).fp; + UNIXCB(skb).fp = NULL; + + for (i = scm->fp->count - 1; i >= 0; i--) + unix_notinflight(scm->fp->user, scm->fp->fp[i]); +} + static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); @@ -1837,6 +1881,21 @@ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) spin_unlock(&unix_gc_lock); } +static void unix_destruct_scm(struct sk_buff *skb) +{ + struct scm_cookie scm; + + memset(&scm, 0, sizeof(scm)); + scm.pid = UNIXCB(skb).pid; + if (UNIXCB(skb).fp) + unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + /* So fscking what? fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); +} + static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; -- cgit v1.2.3