From 8f52de0077ba3bf41e5d53d67a185700f41efce7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Jun 2024 11:33:01 +0100 Subject: netfs: Reduce number of conditional branches in netfs_perform_write() Reduce the number of conditional branches in netfs_perform_write() by merging in netfs_how_to_modify() and then creating a separate if-statement for each way we might modify a folio. Note that this means replicating the data copy in each path. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-6-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/trace/events/netfs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 606b4a0f92da..a4fd5dea52f4 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -129,7 +129,6 @@ E_(netfs_sreq_trace_put_terminated, "PUT TERM ") #define netfs_folio_traces \ - /* The first few correspond to enum netfs_how_to_modify */ \ EM(netfs_folio_is_uptodate, "mod-uptodate") \ EM(netfs_just_prefetch, "mod-prefetch") \ EM(netfs_whole_folio_modify, "mod-whole-f") \ @@ -139,7 +138,6 @@ EM(netfs_flush_content, "flush") \ EM(netfs_streaming_filled_page, "mod-streamw-f") \ EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \ - /* The rest are for writeback */ \ EM(netfs_folio_trace_cancel_copy, "cancel-copy") \ EM(netfs_folio_trace_clear, "clear") \ EM(netfs_folio_trace_clear_cc, "clear-cc") \ -- cgit v1.2.3 From 73425800ac948cb78063b897a0c345d788f3594d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Jun 2024 11:26:24 +0100 Subject: netfs, cifs: Move CIFS_INO_MODIFIED_ATTR to netfs_inode Move CIFS_INO_MODIFIED_ATTR to netfs_inode as NETFS_ICTX_MODIFIED_ATTR and then make netfs_perform_write() set it. This means that cifs doesn't need to implement the ->post_modify() hook. Signed-off-by: David Howells cc: Jeff Layton cc: Steve French cc: Paulo Alcantara cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-7-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c47443e7a97e..71c748c53a1f 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -73,6 +73,7 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ +#define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */ }; /* -- cgit v1.2.3 From 52d55922e0f1db1f580c9f91c174d2392bfad481 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Jun 2024 09:02:58 +0100 Subject: netfs: Move max_len/max_nr_segs from netfs_io_subrequest to netfs_io_stream Move max_len/max_nr_segs from struct netfs_io_subrequest to struct netfs_io_stream as we only issue one subreq at a time and then don't need these values again for that subreq unless and until we have to retry it - in which case we want to renegotiate them. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-8-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 71c748c53a1f..eea5ea2842f6 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -134,6 +134,8 @@ static inline struct netfs_group *netfs_folio_group(struct folio *folio) struct netfs_io_stream { /* Submission tracking */ struct netfs_io_subrequest *construct; /* Op being constructed */ + size_t sreq_max_len; /* Maximum size of a subrequest */ + unsigned int sreq_max_segs; /* 0 or max number of segments in an iterator */ unsigned int submit_off; /* Folio offset we're submitting from */ unsigned int submit_len; /* Amount of data left to submit */ unsigned int submit_max_len; /* Amount I/O can be rounded up to */ @@ -177,14 +179,12 @@ struct netfs_io_subrequest { struct list_head rreq_link; /* Link in rreq->subrequests */ struct iov_iter io_iter; /* Iterator for this subrequest */ unsigned long long start; /* Where to start the I/O */ - size_t max_len; /* Maximum size of the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ - unsigned int max_nr_segs; /* 0 or max number of segments in an iterator */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ unsigned long flags; -- cgit v1.2.3 From 51e9a86a4f75c6dc8531407b2ab2ccc4ad137e5a Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 9 Jul 2024 09:45:46 +0100 Subject: netfs: Reserve netfs_sreq_source 0 as unset/unknown Reserve the 0-valued netfs_sreq_source to mean unset or unknown so that it can be seen in the trace as such rather than appearing as download-from-server when it's going to get switched to something else. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-9-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 + include/trace/events/netfs.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index eea5ea2842f6..37f8ce9c0284 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -43,6 +43,7 @@ static inline void folio_start_private_2(struct folio *folio) #define NETFS_BUF_PAGECACHE_MARK XA_MARK_1 /* - Page needs wb/dirty flag wrangling */ enum netfs_io_source { + NETFS_SOURCE_UNKNOWN, NETFS_FILL_WITH_ZEROES, NETFS_DOWNLOAD_FROM_SERVER, NETFS_READ_FROM_CACHE, diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index a4fd5dea52f4..f4105b8e5894 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -60,6 +60,7 @@ E_(netfs_rreq_trace_write_done, "WR-DONE") #define netfs_sreq_sources \ + EM(NETFS_SOURCE_UNKNOWN, "----") \ EM(NETFS_FILL_WITH_ZEROES, "ZERO") \ EM(NETFS_DOWNLOAD_FROM_SERVER, "DOWN") \ EM(NETFS_READ_FROM_CACHE, "READ") \ -- cgit v1.2.3 From c57de2a9259d7dc18a7a425fca91c77502263d8a Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 9 Jul 2024 09:43:38 +0100 Subject: netfs: Remove NETFS_COPY_TO_CACHE Remove NETFS_COPY_TO_CACHE as it isn't used anymore. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-10-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 3 +-- include/trace/events/netfs.h | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 37f8ce9c0284..63ab2c775a21 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -207,11 +207,10 @@ enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ - NETFS_COPY_TO_CACHE, /* This write is to copy a read to the cache */ + NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_WRITEBACK, /* This write was triggered by writepages */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ - NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_DIO_WRITE, /* This is a direct I/O write */ nr__netfs_io_origin } __mode(byte); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index f4105b8e5894..47cd11aaccac 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -34,11 +34,10 @@ EM(NETFS_READAHEAD, "RA") \ EM(NETFS_READPAGE, "RP") \ EM(NETFS_READ_FOR_WRITE, "RW") \ - EM(NETFS_COPY_TO_CACHE, "CC") \ + EM(NETFS_DIO_READ, "DR") \ EM(NETFS_WRITEBACK, "WB") \ EM(NETFS_WRITETHROUGH, "WT") \ EM(NETFS_UNBUFFERED_WRITE, "UW") \ - EM(NETFS_DIO_READ, "DR") \ E_(NETFS_DIO_WRITE, "DW") #define netfs_rreq_traces \ -- cgit v1.2.3 From db0aa2e9566fda2d23dc8f6c102856ead95578a4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 19 Jun 2024 00:20:42 +0100 Subject: mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios Define a data structure, struct folio_queue, to represent a sequence of folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a list of folio_queue structures to be used to provide a buffer to iov_iter-taking functions, such as sendmsg and recvmsg. The folio_queue structure looks like: struct folio_queue { struct folio_batch vec; u8 orders[PAGEVEC_SIZE]; struct folio_queue *next; struct folio_queue *prev; unsigned long marks; unsigned long marks2; }; It does not use a list_head so that next and/or prev can be set to NULL at the ends of the list, allowing iov_iter-handling routines to determine that they *are* the ends without needing to store a head pointer in the iov_iter struct. A folio_batch struct is used to hold the folio pointers which allows the batch to be passed to batch handling functions. Two mark bits are available per slot. The intention is to use at least one of them to mark folios that need putting, but that might not be ultimately necessary. Accessor functions are used to access the slots to do the masking and an additional accessor function is used to indicate the size of the array. The order of each folio is also stored in the structure to avoid the need for iov_iter_advance() and iov_iter_revert() to have to query each folio to find its size. With careful barriering, this can be used as an extending buffer with new folios inserted and new folio_queue structs added without the need for a lock. Further, provided we always keep at least one struct in the buffer, we can also remove consumed folios and consumed structs from the head end as we without the need for locks. [Questions/thoughts] (1) To manage this, I need a head pointer, a tail pointer, a tail slot number (assuming insertion happens at the tail end and the next pointers point from head to tail). Should I put these into a struct of their own, say "folio_queue_head" or "rolling_buffer"? I will end up with two of these in netfs_io_request eventually, one keeping track of the pagecache I'm dealing with for buffered I/O and the other to hold a bounce buffer when we need one. (2) Should I make the slots {folio,off,len} or bio_vec? (3) This is intended to replace ITER_XARRAY eventually. Using an xarray in I/O iteration requires the taking of the RCU read lock, doing copying under the RCU read lock, walking the xarray (which may change under us), handling retries and dealing with special values. The advantage of ITER_XARRAY is that when we're dealing with the pagecache directly, we don't need any allocation - but if we're doing encrypted comms, there's a good chance we'd be using a bounce buffer anyway. This will require afs, erofs, cifs, orangefs and fscache to be converted to not use this. afs still uses it for dirs and symlinks; some of erofs usages should be easy to change, but there's one which won't be so easy; ceph's use via fscache can be fixed by porting ceph to netfslib; cifs is using xarray as a bounce buffer - that can be moved to use sheaves instead; and orangefs has a similar problem to erofs - maybe orangefs could use netfslib? Signed-off-by: David Howells cc: Matthew Wilcox cc: Jeff Layton cc: Steve French cc: Ilya Dryomov cc: Gao Xiang cc: Mike Marshall cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: ceph-devel@vger.kernel.org cc: linux-erofs@lists.ozlabs.org cc: devel@lists.orangefs.org Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/folio_queue.h | 138 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/iov_iter.h | 57 ++++++++++++++++++ include/linux/uio.h | 12 ++++ 3 files changed, 207 insertions(+) create mode 100644 include/linux/folio_queue.h (limited to 'include') diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h new file mode 100644 index 000000000000..52773613bf23 --- /dev/null +++ b/include/linux/folio_queue.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Queue of folios definitions + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#ifndef _LINUX_FOLIO_QUEUE_H +#define _LINUX_FOLIO_QUEUE_H + +#include + +/* + * Segment in a queue of running buffers. Each segment can hold a number of + * folios and a portion of the queue can be referenced with the ITER_FOLIOQ + * iterator. The possibility exists of inserting non-folio elements into the + * queue (such as gaps). + * + * Explicit prev and next pointers are used instead of a list_head to make it + * easier to add segments to tail and remove them from the head without the + * need for a lock. + */ +struct folio_queue { + struct folio_batch vec; /* Folios in the queue segment */ + u8 orders[PAGEVEC_SIZE]; /* Order of each folio */ + struct folio_queue *next; /* Next queue segment or NULL */ + struct folio_queue *prev; /* Previous queue segment of NULL */ + unsigned long marks; /* 1-bit mark per folio */ + unsigned long marks2; /* Second 1-bit mark per folio */ +#if PAGEVEC_SIZE > BITS_PER_LONG +#error marks is not big enough +#endif +}; + +static inline void folioq_init(struct folio_queue *folioq) +{ + folio_batch_init(&folioq->vec); + folioq->next = NULL; + folioq->prev = NULL; + folioq->marks = 0; + folioq->marks2 = 0; +} + +static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) +{ + return PAGEVEC_SIZE; +} + +static inline unsigned int folioq_count(struct folio_queue *folioq) +{ + return folio_batch_count(&folioq->vec); +} + +static inline bool folioq_full(struct folio_queue *folioq) +{ + //return !folio_batch_space(&folioq->vec); + return folioq_count(folioq) >= folioq_nr_slots(folioq); +} + +static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot) +{ + return test_bit(slot, &folioq->marks); +} + +static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot) +{ + set_bit(slot, &folioq->marks); +} + +static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot) +{ + clear_bit(slot, &folioq->marks); +} + +static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot) +{ + return test_bit(slot, &folioq->marks2); +} + +static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot) +{ + set_bit(slot, &folioq->marks2); +} + +static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot) +{ + clear_bit(slot, &folioq->marks2); +} + +static inline unsigned int __folio_order(struct folio *folio) +{ + if (!folio_test_large(folio)) + return 0; + return folio->_flags_1 & 0xff; +} + +static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio) +{ + unsigned int slot = folioq->vec.nr++; + + folioq->vec.folios[slot] = folio; + folioq->orders[slot] = __folio_order(folio); + return slot; +} + +static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio) +{ + unsigned int slot = folioq->vec.nr++; + + folioq->vec.folios[slot] = folio; + folioq->orders[slot] = __folio_order(folio); + folioq_mark(folioq, slot); + return slot; +} + +static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot) +{ + return folioq->vec.folios[slot]; +} + +static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot) +{ + return folioq->orders[slot]; +} + +static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot) +{ + return PAGE_SIZE << folioq_folio_order(folioq, slot); +} + +static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot) +{ + folioq->vec.folios[slot] = NULL; + folioq_unmark(folioq, slot); + folioq_unmark2(folioq, slot); +} + +#endif /* _LINUX_FOLIO_QUEUE_H */ diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h index 270454a6703d..a223370a59a7 100644 --- a/include/linux/iov_iter.h +++ b/include/linux/iov_iter.h @@ -10,6 +10,7 @@ #include #include +#include typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len, void *priv, void *priv2); @@ -140,6 +141,60 @@ size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, return progress; } +/* + * Handle ITER_FOLIOQ. + */ +static __always_inline +size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2, + iov_step_f step) +{ + const struct folio_queue *folioq = iter->folioq; + unsigned int slot = iter->folioq_slot; + size_t progress = 0, skip = iter->iov_offset; + + if (slot == folioq_nr_slots(folioq)) { + /* The iterator may have been extended. */ + folioq = folioq->next; + slot = 0; + } + + do { + struct folio *folio = folioq_folio(folioq, slot); + size_t part, remain, consumed; + size_t fsize; + void *base; + + if (!folio) + break; + + fsize = folioq_folio_size(folioq, slot); + base = kmap_local_folio(folio, skip); + part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); + remain = step(base, progress, part, priv, priv2); + kunmap_local(base); + consumed = part - remain; + len -= consumed; + progress += consumed; + skip += consumed; + if (skip >= fsize) { + skip = 0; + slot++; + if (slot == folioq_nr_slots(folioq) && folioq->next) { + folioq = folioq->next; + slot = 0; + } + } + if (remain) + break; + } while (len); + + iter->folioq_slot = slot; + iter->folioq = folioq; + iter->iov_offset = skip; + iter->count -= progress; + return progress; +} + /* * Handle ITER_XARRAY. */ @@ -249,6 +304,8 @@ size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv, return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); + if (iov_iter_is_folioq(iter)) + return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); diff --git a/include/linux/uio.h b/include/linux/uio.h index 7020adedfa08..845d110acadc 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -11,6 +11,7 @@ #include struct page; +struct folio_queue; typedef unsigned int __bitwise iov_iter_extraction_t; @@ -25,6 +26,7 @@ enum iter_type { ITER_IOVEC, ITER_BVEC, ITER_KVEC, + ITER_FOLIOQ, ITER_XARRAY, ITER_DISCARD, }; @@ -66,6 +68,7 @@ struct iov_iter { const struct iovec *__iov; const struct kvec *kvec; const struct bio_vec *bvec; + const struct folio_queue *folioq; struct xarray *xarray; void __user *ubuf; }; @@ -74,6 +77,7 @@ struct iov_iter { }; union { unsigned long nr_segs; + u8 folioq_slot; loff_t xarray_start; }; }; @@ -126,6 +130,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter *i) return iov_iter_type(i) == ITER_DISCARD; } +static inline bool iov_iter_is_folioq(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_FOLIOQ; +} + static inline bool iov_iter_is_xarray(const struct iov_iter *i) { return iov_iter_type(i) == ITER_XARRAY; @@ -273,6 +282,9 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); +void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, + const struct folio_queue *folioq, + unsigned int first_slot, unsigned int offset, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, -- cgit v1.2.3 From 197a3de607d92b3d72e69edf5470e0a8fae548cc Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 14 Aug 2024 16:14:21 +0100 Subject: iov_iter: Provide copy_folio_from_iter() Provide a copy_folio_from_iter() wrapper. Signed-off-by: David Howells cc: Alexander Viro cc: Christian Brauner cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/20240814203850.2240469-14-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/uio.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/uio.h b/include/linux/uio.h index 845d110acadc..853f9de5aa05 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -189,6 +189,12 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, return copy_page_to_iter(&folio->page, offset, bytes, i); } +static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset, + size_t bytes, struct iov_iter *i) +{ + return copy_page_from_iter(&folio->page, offset, bytes, i); +} + static inline size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { -- cgit v1.2.3 From cd0277ed0c188dd40e7744e89299af7b78831ca4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 May 2024 21:47:07 +0100 Subject: netfs: Use new folio_queue data type and iterator instead of xarray iter Make the netfs write-side routines use the new folio_queue struct to hold a rolling buffer of folios, with the issuer adding folios at the tail and the collector removing them from the head as they're processed instead of using an xarray. This will allow a subsequent patch to simplify the write collector. The primary mark (as tested by folioq_is_marked()) is used to note if the corresponding folio needs putting. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-16-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 8 ++++---- include/trace/events/netfs.h | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 63ab2c775a21..f7b444f4f251 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -38,10 +38,6 @@ static inline void folio_start_private_2(struct folio *folio) folio_set_private_2(folio); } -/* Marks used on xarray-based buffers */ -#define NETFS_BUF_PUT_MARK XA_MARK_0 /* - Page needs putting */ -#define NETFS_BUF_PAGECACHE_MARK XA_MARK_1 /* - Page needs wb/dirty flag wrangling */ - enum netfs_io_source { NETFS_SOURCE_UNKNOWN, NETFS_FILL_WITH_ZEROES, @@ -233,6 +229,8 @@ struct netfs_io_request { struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams struct netfs_group *group; /* Writeback group being written back */ + struct folio_queue *buffer; /* Head of I/O buffer */ + struct folio_queue *buffer_tail; /* Tail of I/O buffer */ struct iov_iter iter; /* Unencrypted-side iterator */ struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ void *netfs_priv; /* Private data for the netfs */ @@ -254,6 +252,8 @@ struct netfs_io_request { short error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ + u8 buffer_head_slot; /* First slot in ->buffer */ + u8 buffer_tail_slot; /* Next slot in ->buffer_tail */ unsigned long long i_size; /* Size of the file */ unsigned long long start; /* Start position */ atomic64_t issued_to; /* Write issuer folio cursor */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 47cd11aaccac..4e13774a06e6 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -153,6 +153,7 @@ EM(netfs_folio_trace_mkwrite, "mkwrite") \ EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \ EM(netfs_folio_trace_not_under_wback, "!wback") \ + EM(netfs_folio_trace_put, "put") \ EM(netfs_folio_trace_read_gaps, "read-gaps") \ EM(netfs_folio_trace_redirtied, "redirtied") \ EM(netfs_folio_trace_store, "store") \ -- cgit v1.2.3 From 983cdcf8fe141b0ce16bc71959a5dc55bcb0764d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 6 Jun 2024 07:48:55 +0100 Subject: netfs: Simplify the writeback code Use the new folio_queue structures to simplify the writeback code. The problem with referring to the i_pages xarray directly is that we may have gaps in the sequence of folios we're writing from that we need to skip when we're removing the writeback mark from the folios we're writing back from. At the moment the code tries to deal with this by carefully tracking the gaps in each writeback stream (eg. write to server and write to cache) and divining when there's a gap that spans folios (something that's not helped by folios not being a consistent size). Instead, the folio_queue buffer contains pointers only the folios we're dealing with, has them in ascending order and indicates a gap by placing non-consequitive folios next to each other. This makes it possible to track where we need to clean up to by just keeping track of where we've processed to on each stream and taking the minimum. Note that the I/O iterator is always rounded up to the end of the folio, even if that is beyond the EOF position, so that the cache can do DIO from the page. The excess space is cleared, though mmapped writes clobber it. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-18-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 - include/trace/events/netfs.h | 33 ++------------------------------- 2 files changed, 2 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index f7b444f4f251..bd0e3d147822 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -257,7 +257,6 @@ struct netfs_io_request { unsigned long long i_size; /* Size of the file */ unsigned long long start; /* Start position */ atomic64_t issued_to; /* Write issuer folio cursor */ - unsigned long long contiguity; /* Tracking for gaps in the writeback sequence */ unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 4e13774a06e6..58bf23002fc1 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -512,33 +512,6 @@ TRACE_EVENT(netfs_collect, __entry->start + __entry->len) ); -TRACE_EVENT(netfs_collect_contig, - TP_PROTO(const struct netfs_io_request *wreq, unsigned long long to, - enum netfs_collect_contig_trace type), - - TP_ARGS(wreq, to, type), - - TP_STRUCT__entry( - __field(unsigned int, wreq) - __field(enum netfs_collect_contig_trace, type) - __field(unsigned long long, contiguity) - __field(unsigned long long, to) - ), - - TP_fast_assign( - __entry->wreq = wreq->debug_id; - __entry->type = type; - __entry->contiguity = wreq->contiguity; - __entry->to = to; - ), - - TP_printk("R=%08x %llx -> %llx %s", - __entry->wreq, - __entry->contiguity, - __entry->to, - __print_symbolic(__entry->type, netfs_collect_contig_traces)) - ); - TRACE_EVENT(netfs_collect_sreq, TP_PROTO(const struct netfs_io_request *wreq, const struct netfs_io_subrequest *subreq), @@ -610,7 +583,6 @@ TRACE_EVENT(netfs_collect_state, __field(unsigned int, notes ) __field(unsigned long long, collected_to ) __field(unsigned long long, cleaned_to ) - __field(unsigned long long, contiguity ) ), TP_fast_assign( @@ -618,12 +590,11 @@ TRACE_EVENT(netfs_collect_state, __entry->notes = notes; __entry->collected_to = collected_to; __entry->cleaned_to = wreq->cleaned_to; - __entry->contiguity = wreq->contiguity; ), - TP_printk("R=%08x cto=%llx fto=%llx ctg=%llx n=%x", + TP_printk("R=%08x col=%llx cln=%llx n=%x", __entry->wreq, __entry->collected_to, - __entry->cleaned_to, __entry->contiguity, + __entry->cleaned_to, __entry->notes) ); -- cgit v1.2.3 From ee4cdf7ba857a894ad1650d6ab77669cbbfa329e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Jul 2024 00:40:22 +0100 Subject: netfs: Speed up buffered reading Improve the efficiency of buffered reads in a number of ways: (1) Overhaul the algorithm in general so that it's a lot more compact and split the read submission code between buffered and unbuffered versions. The unbuffered version can be vastly simplified. (2) Read-result collection is handed off to a work queue rather than being done in the I/O thread. Multiple subrequests can be processes simultaneously. (3) When a subrequest is collected, any folios it fully spans are collected and "spare" data on either side is donated to either the previous or the next subrequest in the sequence. Notes: (*) Readahead expansion is massively slows down fio, presumably because it causes a load of extra allocations, both folio and xarray, up front before RPC requests can be transmitted. (*) RDMA with cifs does appear to work, both with SIW and RXE. (*) PG_private_2-based reading and copy-to-cache is split out into its own file and altered to use folio_queue. Note that the copy to the cache now creates a new write transaction against the cache and adds the folios to be copied into it. This allows it to use part of the writeback I/O code. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-20-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/folio_queue.h | 18 ++++++++ include/linux/netfs.h | 26 +++++++---- include/trace/events/netfs.h | 103 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 134 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 52773613bf23..955680c3bb5f 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -27,6 +27,7 @@ struct folio_queue { struct folio_queue *prev; /* Previous queue segment of NULL */ unsigned long marks; /* 1-bit mark per folio */ unsigned long marks2; /* Second 1-bit mark per folio */ + unsigned long marks3; /* Third 1-bit mark per folio */ #if PAGEVEC_SIZE > BITS_PER_LONG #error marks is not big enough #endif @@ -39,6 +40,7 @@ static inline void folioq_init(struct folio_queue *folioq) folioq->prev = NULL; folioq->marks = 0; folioq->marks2 = 0; + folioq->marks3 = 0; } static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) @@ -87,6 +89,21 @@ static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot) clear_bit(slot, &folioq->marks2); } +static inline bool folioq_is_marked3(const struct folio_queue *folioq, unsigned int slot) +{ + return test_bit(slot, &folioq->marks3); +} + +static inline void folioq_mark3(struct folio_queue *folioq, unsigned int slot) +{ + set_bit(slot, &folioq->marks3); +} + +static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot) +{ + clear_bit(slot, &folioq->marks3); +} + static inline unsigned int __folio_order(struct folio *folio) { if (!folio_test_large(folio)) @@ -133,6 +150,7 @@ static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot) folioq->vec.folios[slot] = NULL; folioq_unmark(folioq, slot); folioq_unmark2(folioq, slot); + folioq_unmark3(folioq, slot); } #endif /* _LINUX_FOLIO_QUEUE_H */ diff --git a/include/linux/netfs.h b/include/linux/netfs.h index bd0e3d147822..c0f0c9c87d86 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -178,36 +178,43 @@ struct netfs_io_subrequest { unsigned long long start; /* Where to start the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ + size_t consumed; /* Amount of read data consumed */ + size_t prev_donated; /* Amount of data donated from previous subreq */ + size_t next_donated; /* Amount of data donated from next subreq */ refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ + unsigned char curr_folioq_slot; /* Folio currently being read */ + unsigned char curr_folio_order; /* Order of folio */ + struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */ unsigned long flags; #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ -#define NETFS_SREQ_SHORT_IO 2 /* Set if the I/O was short */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ #define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ +#define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ #define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ #define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ -#define NETFS_SREQ_HIT_EOF 12 /* Set if we hit the EOF */ }; enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ + NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_WRITEBACK, /* This write was triggered by writepages */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_WRITE, /* This is a direct I/O write */ + NETFS_PGPRIV2_COPY_TO_CACHE, /* [DEPRECATED] This is writing read data to the cache */ nr__netfs_io_origin } __mode(byte); @@ -224,6 +231,7 @@ struct netfs_io_request { struct address_space *mapping; /* The mapping being accessed */ struct kiocb *iocb; /* AIO completion vector */ struct netfs_cache_resources cache_resources; + struct readahead_control *ractl; /* Readahead descriptor */ struct list_head proc_link; /* Link in netfs_iorequests */ struct list_head subrequests; /* Contributory I/O operations */ struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ @@ -244,12 +252,10 @@ struct netfs_io_request { unsigned int nr_group_rel; /* Number of refs to release on ->group */ spinlock_t lock; /* Lock for queuing subreqs */ atomic_t nr_outstanding; /* Number of ops in progress */ - atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ - size_t upper_len; /* Length can be extended to here */ unsigned long long submitted; /* Amount submitted for I/O so far */ unsigned long long len; /* Length of the request */ size_t transferred; /* Amount to be indicated as transferred */ - short error; /* 0 or error that occurred */ + long error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ u8 buffer_head_slot; /* First slot in ->buffer */ @@ -260,9 +266,9 @@ struct netfs_io_request { unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ + size_t prev_donated; /* Fallback for subreq->prev_donated */ refcount_t ref; unsigned long flags; -#define NETFS_RREQ_INCOMPLETE_IO 0 /* Some ioreqs terminated short or with error */ #define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ @@ -274,6 +280,7 @@ struct netfs_io_request { #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ +#define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */ #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark * write to cache on read */ const struct netfs_request_ops *netfs_ops; @@ -292,7 +299,7 @@ struct netfs_request_ops { /* Read request handling */ void (*expand_readahead)(struct netfs_io_request *rreq); - bool (*clamp_length)(struct netfs_io_subrequest *subreq); + int (*prepare_read)(struct netfs_io_subrequest *subreq); void (*issue_read)(struct netfs_io_subrequest *subreq); bool (*is_still_valid)(struct netfs_io_request *rreq); int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, @@ -422,7 +429,10 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp); vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); /* (Sub)request management API. */ -void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool); +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, + bool was_async); +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, + int error, bool was_async); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 58bf23002fc1..7b26463cb98f 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -20,6 +20,7 @@ EM(netfs_read_trace_expanded, "EXPANDED ") \ EM(netfs_read_trace_readahead, "READAHEAD") \ EM(netfs_read_trace_readpage, "READPAGE ") \ + EM(netfs_read_trace_read_gaps, "READ-GAPS") \ EM(netfs_read_trace_prefetch_for_write, "PREFETCHW") \ E_(netfs_read_trace_write_begin, "WRITEBEGN") @@ -33,12 +34,14 @@ #define netfs_rreq_origins \ EM(NETFS_READAHEAD, "RA") \ EM(NETFS_READPAGE, "RP") \ + EM(NETFS_READ_GAPS, "RG") \ EM(NETFS_READ_FOR_WRITE, "RW") \ EM(NETFS_DIO_READ, "DR") \ EM(NETFS_WRITEBACK, "WB") \ EM(NETFS_WRITETHROUGH, "WT") \ EM(NETFS_UNBUFFERED_WRITE, "UW") \ - E_(NETFS_DIO_WRITE, "DW") + EM(NETFS_DIO_WRITE, "DW") \ + E_(NETFS_PGPRIV2_COPY_TO_CACHE, "2C") #define netfs_rreq_traces \ EM(netfs_rreq_trace_assess, "ASSESS ") \ @@ -69,15 +72,25 @@ E_(NETFS_INVALID_WRITE, "INVL") #define netfs_sreq_traces \ + EM(netfs_sreq_trace_add_donations, "+DON ") \ + EM(netfs_sreq_trace_added, "ADD ") \ + EM(netfs_sreq_trace_clear, "CLEAR") \ EM(netfs_sreq_trace_discard, "DSCRD") \ + EM(netfs_sreq_trace_donate_to_prev, "DON-P") \ + EM(netfs_sreq_trace_donate_to_next, "DON-N") \ EM(netfs_sreq_trace_download_instead, "RDOWN") \ EM(netfs_sreq_trace_fail, "FAIL ") \ EM(netfs_sreq_trace_free, "FREE ") \ + EM(netfs_sreq_trace_hit_eof, "EOF ") \ + EM(netfs_sreq_trace_io_progress, "IO ") \ EM(netfs_sreq_trace_limited, "LIMIT") \ EM(netfs_sreq_trace_prepare, "PREP ") \ EM(netfs_sreq_trace_prep_failed, "PRPFL") \ - EM(netfs_sreq_trace_resubmit_short, "SHORT") \ + EM(netfs_sreq_trace_progress, "PRGRS") \ + EM(netfs_sreq_trace_reprep_failed, "REPFL") \ EM(netfs_sreq_trace_retry, "RETRY") \ + EM(netfs_sreq_trace_short, "SHORT") \ + EM(netfs_sreq_trace_split, "SPLIT") \ EM(netfs_sreq_trace_submit, "SUBMT") \ EM(netfs_sreq_trace_terminated, "TERM ") \ EM(netfs_sreq_trace_write, "WRITE") \ @@ -118,7 +131,7 @@ EM(netfs_sreq_trace_new, "NEW ") \ EM(netfs_sreq_trace_put_cancel, "PUT CANCEL ") \ EM(netfs_sreq_trace_put_clear, "PUT CLEAR ") \ - EM(netfs_sreq_trace_put_discard, "PUT DISCARD") \ + EM(netfs_sreq_trace_put_consumed, "PUT CONSUME") \ EM(netfs_sreq_trace_put_done, "PUT DONE ") \ EM(netfs_sreq_trace_put_failed, "PUT FAILED ") \ EM(netfs_sreq_trace_put_merged, "PUT MERGED ") \ @@ -138,6 +151,7 @@ EM(netfs_flush_content, "flush") \ EM(netfs_streaming_filled_page, "mod-streamw-f") \ EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \ + EM(netfs_folio_trace_abandon, "abandon") \ EM(netfs_folio_trace_cancel_copy, "cancel-copy") \ EM(netfs_folio_trace_clear, "clear") \ EM(netfs_folio_trace_clear_cc, "clear-cc") \ @@ -154,7 +168,11 @@ EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \ EM(netfs_folio_trace_not_under_wback, "!wback") \ EM(netfs_folio_trace_put, "put") \ + EM(netfs_folio_trace_read, "read") \ + EM(netfs_folio_trace_read_done, "read-done") \ EM(netfs_folio_trace_read_gaps, "read-gaps") \ + EM(netfs_folio_trace_read_put, "read-put") \ + EM(netfs_folio_trace_read_unlock, "read-unlock") \ EM(netfs_folio_trace_redirtied, "redirtied") \ EM(netfs_folio_trace_store, "store") \ EM(netfs_folio_trace_store_copy, "store-copy") \ @@ -167,6 +185,12 @@ EM(netfs_contig_trace_jump, "-->JUMP-->") \ E_(netfs_contig_trace_unlock, "Unlock") +#define netfs_donate_traces \ + EM(netfs_trace_donate_tail_to_prev, "tail-to-prev") \ + EM(netfs_trace_donate_to_prev, "to-prev") \ + EM(netfs_trace_donate_to_next, "to-next") \ + E_(netfs_trace_donate_to_deferred_next, "defer-next") + #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY @@ -184,6 +208,7 @@ enum netfs_rreq_ref_trace { netfs_rreq_ref_traces } __mode(byte); enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte); enum netfs_folio_trace { netfs_folio_traces } __mode(byte); enum netfs_collect_contig_trace { netfs_collect_contig_traces } __mode(byte); +enum netfs_donate_trace { netfs_donate_traces } __mode(byte); #endif @@ -206,6 +231,7 @@ netfs_rreq_ref_traces; netfs_sreq_ref_traces; netfs_folio_traces; netfs_collect_contig_traces; +netfs_donate_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -226,6 +252,7 @@ TRACE_EVENT(netfs_read, TP_STRUCT__entry( __field(unsigned int, rreq ) __field(unsigned int, cookie ) + __field(loff_t, i_size ) __field(loff_t, start ) __field(size_t, len ) __field(enum netfs_read_trace, what ) @@ -235,18 +262,19 @@ TRACE_EVENT(netfs_read, TP_fast_assign( __entry->rreq = rreq->debug_id; __entry->cookie = rreq->cache_resources.debug_id; + __entry->i_size = rreq->i_size; __entry->start = start; __entry->len = len; __entry->what = what; __entry->netfs_inode = rreq->inode->i_ino; ), - TP_printk("R=%08x %s c=%08x ni=%x s=%llx %zx", + TP_printk("R=%08x %s c=%08x ni=%x s=%llx l=%zx sz=%llx", __entry->rreq, __print_symbolic(__entry->what, netfs_read_traces), __entry->cookie, __entry->netfs_inode, - __entry->start, __entry->len) + __entry->start, __entry->len, __entry->i_size) ); TRACE_EVENT(netfs_rreq, @@ -651,6 +679,71 @@ TRACE_EVENT(netfs_collect_stream, __entry->collected_to, __entry->front) ); +TRACE_EVENT(netfs_progress, + TP_PROTO(const struct netfs_io_subrequest *subreq, + unsigned long long start, size_t avail, size_t part), + + TP_ARGS(subreq, start, avail, part), + + TP_STRUCT__entry( + __field(unsigned int, rreq) + __field(unsigned int, subreq) + __field(unsigned int, consumed) + __field(unsigned int, transferred) + __field(unsigned long long, f_start) + __field(unsigned int, f_avail) + __field(unsigned int, f_part) + __field(unsigned char, slot) + ), + + TP_fast_assign( + __entry->rreq = subreq->rreq->debug_id; + __entry->subreq = subreq->debug_index; + __entry->consumed = subreq->consumed; + __entry->transferred = subreq->transferred; + __entry->f_start = start; + __entry->f_avail = avail; + __entry->f_part = part; + __entry->slot = subreq->curr_folioq_slot; + ), + + TP_printk("R=%08x[%02x] s=%llx ct=%x/%x pa=%x/%x sl=%x", + __entry->rreq, __entry->subreq, __entry->f_start, + __entry->consumed, __entry->transferred, + __entry->f_part, __entry->f_avail, __entry->slot) + ); + +TRACE_EVENT(netfs_donate, + TP_PROTO(const struct netfs_io_request *rreq, + const struct netfs_io_subrequest *from, + const struct netfs_io_subrequest *to, + size_t amount, + enum netfs_donate_trace trace), + + TP_ARGS(rreq, from, to, amount, trace), + + TP_STRUCT__entry( + __field(unsigned int, rreq) + __field(unsigned int, from) + __field(unsigned int, to) + __field(unsigned int, amount) + __field(enum netfs_donate_trace, trace) + ), + + TP_fast_assign( + __entry->rreq = rreq->debug_id; + __entry->from = from->debug_index; + __entry->to = to ? to->debug_index : -1; + __entry->amount = amount; + __entry->trace = trace; + ), + + TP_printk("R=%08x[%02x] -> [%02x] %s am=%x", + __entry->rreq, __entry->from, __entry->to, + __print_symbolic(__entry->trace, netfs_donate_traces), + __entry->amount) + ); + #undef EM #undef E_ #endif /* _TRACE_NETFS_H */ -- cgit v1.2.3 From c4f1450ecccc5311db87f806998eda1c824c4e35 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 12 Jul 2024 12:44:30 +0100 Subject: cachefiles, netfs: Fix write to partial block at EOF Because it uses DIO writes, cachefiles is unable to make a write to the backing file if that write is not aligned to and sized according to the backing file's DIO block alignment. This makes it tricky to handle a write to the cache where the EOF on the network file is not correctly aligned. To get around this, netfslib attempts to tell the driver it is calling how much more data there is available beyond the EOF that it can use to pad the write (netfslib preclears the part of the folio above the EOF). However, it tries to tell the cache what the maximum length is, but doesn't calculate this correctly; and, in any case, cachefiles actually ignores the value and just skips the block. Fix this by: (1) Change the value passed to indicate the amount of extra data that can be added to the operation (now ->submit_extendable_to). This is much simpler to calculate as it's just the end of the folio minus the top of the data within the folio - rather than having to account for data spread over multiple folios. (2) Make cachefiles add some of this data if the subrequest it is given ends at the network file's i_size if the extra data is sufficient to pad out to a whole block. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-22-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c0f0c9c87d86..5eaceef41e6c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -135,7 +135,7 @@ struct netfs_io_stream { unsigned int sreq_max_segs; /* 0 or max number of segments in an iterator */ unsigned int submit_off; /* Folio offset we're submitting from */ unsigned int submit_len; /* Amount of data left to submit */ - unsigned int submit_max_len; /* Amount I/O can be rounded up to */ + unsigned int submit_extendable_to; /* Amount I/O can be rounded up to */ void (*prepare_write)(struct netfs_io_subrequest *subreq); void (*issue_write)(struct netfs_io_subrequest *subreq); /* Collection tracking */ -- cgit v1.2.3 From 8f246b7c0a1be0882374f2ff831a61f0dbe77678 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 29 Jul 2024 12:23:11 +0100 Subject: netfs: Cancel dirty folios that have no storage destination Kafs wants to be able to cache the contents of directories (and symlinks), but whilst these are downloaded from the server with the FS.FetchData RPC op and similar, the same as for regular files, they can't be updated by FS.StoreData, but rather have special operations (FS.MakeDir, etc.). Now, rather than redownloading a directory's content after each change made to that directory, kafs modifies the local blob. This blob can be saved out to the cache, and since it's using netfslib, kafs just marks the folios dirty and lets ->writepages() on the directory take care of it, as for an regular file. This is fine as long as there's a cache as although the upload stream is disabled, there's a cache stream to drive the procedure. But if the cache goes away in the meantime, suddenly there's no way do any writes and the code gets confused, complains "R=%x: No submit" to dmesg and leaves the dirty folio hanging. Fix this by just cancelling the store of the folio if neither stream is active. (If there's no cache at the time of dirtying, we should just not mark the folio dirty). Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-23-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/trace/events/netfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 7b26463cb98f..76bd42a96815 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -153,6 +153,7 @@ EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \ EM(netfs_folio_trace_abandon, "abandon") \ EM(netfs_folio_trace_cancel_copy, "cancel-copy") \ + EM(netfs_folio_trace_cancel_store, "cancel-store") \ EM(netfs_folio_trace_clear, "clear") \ EM(netfs_folio_trace_clear_cc, "clear-cc") \ EM(netfs_folio_trace_clear_g, "clear-g") \ -- cgit v1.2.3 From 2982c8c19bab020e38da9d503aa21a3b389c53ac Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 26 Jul 2024 20:03:07 +0100 Subject: cifs: Use iterate_and_advance*() routines directly for hashing Replace the bespoke cifs iterators of ITER_BVEC and ITER_KVEC to do hashing with iterate_and_advance_kernel() - a variant on iterate_and_advance() that only supports kernel-internal ITER_* types and not UBUF/IOVEC types. The bespoke ITER_XARRAY is left because we don't really want to be calling crypto_shash_update() under the RCU read lock for large amounts of data; besides, ITER_XARRAY is going to be phased out. Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Tom Talpey cc: Enzo Matsumiya cc: linux-cifs@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-24-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/iov_iter.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'include') diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h index a223370a59a7..c4aa58032faf 100644 --- a/include/linux/iov_iter.h +++ b/include/linux/iov_iter.h @@ -328,4 +328,51 @@ size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv, return iterate_and_advance2(iter, len, priv, NULL, ustep, step); } +/** + * iterate_and_advance_kernel - Iterate over a kernel-internal iterator + * @iter: The iterator to iterate over. + * @len: The amount to iterate over. + * @priv: Data for the step functions. + * @priv2: More data for the step functions. + * @step: Function for other iterators; given kernel addresses. + * + * Iterate over the next part of an iterator, up to the specified length. The + * buffer is presented in segments, which for kernel iteration are broken up by + * physical pages and mapped, with the mapped address being presented. + * + * [!] Note This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type + * iterators; it will not handle UBUF or IOVEC-type iterators. + * + * A step functions, @step, must be provided, one for handling mapped kernel + * addresses and the other is given user addresses which have the potential to + * fault since no pinning is performed. + * + * The step functions are passed the address and length of the segment, @priv, + * @priv2 and the amount of data so far iterated over (which can, for example, + * be added to @priv to point to the right part of a second buffer). The step + * functions should return the amount of the segment they didn't process (ie. 0 + * indicates complete processsing). + * + * This function returns the amount of data processed (ie. 0 means nothing was + * processed and the value of @len means processes to completion). + */ +static __always_inline +size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv, + void *priv2, iov_step_f step) +{ + if (unlikely(iter->count < len)) + len = iter->count; + if (unlikely(!len)) + return 0; + if (iov_iter_is_bvec(iter)) + return iterate_bvec(iter, len, priv, priv2, step); + if (iov_iter_is_kvec(iter)) + return iterate_kvec(iter, len, priv, priv2, step); + if (iov_iter_is_folioq(iter)) + return iterate_folioq(iter, len, priv, priv2, step); + if (iov_iter_is_xarray(iter)) + return iterate_xarray(iter, len, priv, priv2, step); + return iterate_discard(iter, len, priv, priv2, step); +} + #endif /* _LINUX_IOV_ITER_H */ -- cgit v1.2.3