From 4fe38acdac8a71f7cccf347a2e9902bc818ecef7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Jun 2023 19:19:07 +0100 Subject: net: Block MSG_SENDPAGE_* from being passed to sendmsg() by userspace It is necessary to allow MSG_SENDPAGE_* to be passed into ->sendmsg() to allow sendmsg(MSG_SPLICE_PAGES) to replace ->sendpage(). Unblocking them in the network protocol, however, allows these flags to be passed in by userspace too[1]. Fix this by marking MSG_SENDPAGE_NOPOLICY, MSG_SENDPAGE_NOTLAST and MSG_SENDPAGE_DECRYPTED as internal flags, which causes sendmsg() to object if they are passed to sendmsg() by userspace. Network protocol ->sendmsg() implementations can then allow them through. Note that it should be possible to remove MSG_SENDPAGE_NOTLAST once sendpage is removed as a whole slew of pages will be passed in in one go by splice through sendmsg, with MSG_MORE being set if it has more data waiting in the pipe. Signed-off-by: David Howells cc: Chuck Lever cc: Boris Pismenny cc: John Fastabend cc: Jens Axboe cc: Matthew Wilcox Link: https://lore.kernel.org/r/20230526181338.03a99016@kernel.org/ [1] Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index bd1cc3238851..3fd3436bc09f 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -339,7 +339,9 @@ struct ucred { #endif /* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ -#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES) +#define MSG_INTERNAL_SENDMSG_FLAGS \ + (MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_NOTLAST | \ + MSG_SENDPAGE_DECRYPTED) /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 -- cgit v1.2.3 From 2dc334f1a63a8839b88483a3e73c0f27c9c1791c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Jun 2023 19:19:09 +0100 Subject: splice, net: Use sendmsg(MSG_SPLICE_PAGES) rather than ->sendpage() Replace generic_splice_sendpage() + splice_from_pipe + pipe_to_sendpage() with a net-specific handler, splice_to_socket(), that calls sendmsg() with MSG_SPLICE_PAGES set instead of calling ->sendpage(). MSG_MORE is used to indicate if the sendmsg() is expected to be followed with more data. This allows multiple pipe-buffer pages to be passed in a single call in a BVEC iterator, allowing the processing to be pushed down to a loop in the protocol driver. This helps pave the way for passing multipage folios down too. Protocols that haven't been converted to handle MSG_SPLICE_PAGES yet should just ignore it and do a normal sendmsg() for now - although that may be a bit slower as it may copy everything. Signed-off-by: David Howells Reviewed-by: Jakub Kicinski cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski --- include/linux/fs.h | 2 -- include/linux/splice.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 133f0640fb24..df92f4b3d122 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2759,8 +2759,6 @@ extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); -extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, - struct file *out, loff_t *, size_t len, unsigned int flags); extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len, unsigned int flags); diff --git a/include/linux/splice.h b/include/linux/splice.h index a55179fd60fc..991ae318b6eb 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -84,6 +84,8 @@ extern long do_splice(struct file *in, loff_t *off_in, extern long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags); +extern ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags); /* * for dynamic pipe sizing -- cgit v1.2.3 From 2bfc66850952b6921b2033b09729ec59eabbc81d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Jun 2023 19:19:10 +0100 Subject: splice, net: Add a splice_eof op to file-ops and socket-ops Add an optional method, ->splice_eof(), to allow splice to indicate the premature termination of a splice to struct file_operations and struct proto_ops. This is called if sendfile() or splice() encounters all of the following conditions inside splice_direct_to_actor(): (1) the user did not set SPLICE_F_MORE (splice only), and (2) an EOF condition occurred (->splice_read() returned 0), and (3) we haven't read enough to fulfill the request (ie. len > 0 still), and (4) we have already spliced at least one byte. A further patch will modify the behaviour of SPLICE_F_MORE to always be passed to the actor if either the user set it or we haven't yet read sufficient data to fulfill the request. Suggested-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/ Signed-off-by: David Howells Reviewed-by: Jakub Kicinski cc: Jens Axboe cc: Christoph Hellwig cc: Al Viro cc: Matthew Wilcox cc: Jan Kara cc: Jeff Layton cc: David Hildenbrand cc: Christian Brauner cc: Chuck Lever cc: Boris Pismenny cc: John Fastabend cc: linux-mm@kvack.org Signed-off-by: Jakub Kicinski --- include/linux/fs.h | 1 + include/linux/net.h | 1 + include/linux/splice.h | 1 + include/net/sock.h | 1 + 4 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index df92f4b3d122..de2cb1132f07 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1796,6 +1796,7 @@ struct file_operations { int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); + void (*splice_eof)(struct file *file); int (*setlease)(struct file *, long, struct file_lock **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); diff --git a/include/linux/net.h b/include/linux/net.h index b73ad8e3c212..8defc8f1d82e 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -210,6 +210,7 @@ struct proto_ops { int offset, size_t size, int flags); ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); + void (*splice_eof)(struct socket *sock); int (*set_peek_off)(struct sock *sk, int val); int (*peek_len)(struct socket *sock); diff --git a/include/linux/splice.h b/include/linux/splice.h index 991ae318b6eb..4fab18a6e371 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -38,6 +38,7 @@ struct splice_desc { struct file *file; /* file to read/write */ void *data; /* cookie */ } u; + void (*splice_eof)(struct splice_desc *sd); /* Unexpected EOF handler */ loff_t pos; /* file position */ loff_t *opos; /* sendfile: output position */ size_t num_spliced; /* number of bytes already spliced */ diff --git a/include/net/sock.h b/include/net/sock.h index 6f428a7f3567..2790133b4b76 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1279,6 +1279,7 @@ struct proto { size_t len, int flags, int *addr_len); int (*sendpage)(struct sock *sk, struct page *page, int offset, size_t size, int flags); + void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr *addr, int addr_len); int (*bind_add)(struct sock *sk, -- cgit v1.2.3 From 1d7e4538a5463faa0b0e26a7a7b6bd68c7dfdd78 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Jun 2023 19:19:13 +0100 Subject: ipv4, ipv6: Use splice_eof() to flush Allow splice to undo the effects of MSG_MORE after prematurely ending a splice/sendfile due to getting an EOF condition (->splice_read() returned 0) after splice had called sendmsg() with MSG_MORE set when the user didn't set MSG_MORE. For UDP, a pending packet will not be emitted if the socket is closed before it is flushed; with this change, it be flushed by ->splice_eof(). For TCP, it's not clear that MSG_MORE is actually effective. Suggested-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/ Signed-off-by: David Howells cc: Kuniyuki Iwashima cc: Willem de Bruijn cc: David Ahern cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski --- include/net/inet_common.h | 1 + include/net/tcp.h | 1 + include/net/udp.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 77f4b0ef5b92..a75333342c4e 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -35,6 +35,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk); int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); +void inet_splice_eof(struct socket *sock); ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/include/net/tcp.h b/include/net/tcp.h index 68990a8f556a..49611af31bb7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -327,6 +327,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg); +void tcp_splice_eof(struct socket *sock); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, diff --git a/include/net/udp.h b/include/net/udp.h index 5cad44318d71..4ed0b47c5582 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -278,6 +278,7 @@ int udp_get_port(struct sock *sk, unsigned short snum, int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); +void udp_splice_eof(struct socket *sock); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); -- cgit v1.2.3