diff options
| author | Jeff Layton <jlayton@kernel.org> | 2024-10-02 17:27:18 -0400 |
|---|---|---|
| committer | Christian Brauner <brauner@kernel.org> | 2024-10-07 12:48:56 +0200 |
| commit | 4e40eff0b5737c0de39e1ae5812509efbc0b986e (patch) | |
| tree | 954d9a8b22da5082909cf0fc64fd142ddb32ef5c /include/linux/fs.h | |
| parent | 98f7e32f20d28ec452afb208f9cffc08448a2652 (diff) | |
fs: add infrastructure for multigrain timestamps
The VFS has always used coarse-grained timestamps when updating the
ctime and mtime after a change. This has the benefit of allowing
filesystems to optimize away a lot metadata updates, down to around 1
per jiffy, even when a file is under heavy writes.
Unfortunately, this has always been an issue when we're exporting via
NFSv3, which relies on timestamps to validate caches. A lot of changes
can happen in a jiffy, so timestamps aren't sufficient to help the
client decide when to invalidate the cache. Even with NFSv4, a lot of
exported filesystems don't properly support a change attribute and are
subject to the same problems with timestamp granularity. Other
applications have similar issues with timestamps (e.g backup
applications).
If fine-grained timestamps were always used, that would improve the
situation, but that becomes rather expensive, as the underlying
filesystem would have to log a lot more metadata updates.
What is needed is a way to only use fine-grained timestamps when they
are being actively queried. Use the (unused) top bit in
inode->i_ctime_nsec as a flag that indicates whether the current
timestamps have been queried via stat() or the like. When it's set,
allow the update to use a fine-grained timestamp iff it's necessary to
make the ctime show a different value.
If it has been queried, then first see whether the current coarse time
is later than the existing ctime. If it is, accept that value. If it
isn't, then get a fine-grained timestamp and attempt to stamp the inode
ctime with that value. If that races with another concurrent stamp, then
abandon the update and take the new value without retrying.
Filesystems can opt into this by setting the FS_MGTIME fstype flag.
Others should be unaffected (other than being subject to the same floor
value as multigrain filesystems).
Tested-by: Randy Dunlap <rdunlap@infradead.org> # documentation bits
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20241002-mgtime-v10-3-d1c4717f5284@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'include/linux/fs.h')
| -rw-r--r-- | include/linux/fs.h | 34 |
1 files changed, 26 insertions, 8 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h index 6ca11e241a24..eff688e75f2f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1613,6 +1613,17 @@ static inline struct timespec64 inode_set_mtime(struct inode *inode, return inode_set_mtime_to_ts(inode, ts); } +/* + * Multigrain timestamps + * + * Conditionally use fine-grained ctime and mtime timestamps when there + * are users actively observing them via getattr. The primary use-case + * for this is NFS clients that use the ctime to distinguish between + * different states of the file, and that are often fooled by multiple + * operations that occur in the same coarse-grained timer tick. + */ +#define I_CTIME_QUERIED ((u32)BIT(31)) + static inline time64_t inode_get_ctime_sec(const struct inode *inode) { return inode->i_ctime_sec; @@ -1620,7 +1631,7 @@ static inline time64_t inode_get_ctime_sec(const struct inode *inode) static inline long inode_get_ctime_nsec(const struct inode *inode) { - return inode->i_ctime_nsec; + return inode->i_ctime_nsec & ~I_CTIME_QUERIED; } static inline struct timespec64 inode_get_ctime(const struct inode *inode) @@ -1631,13 +1642,7 @@ static inline struct timespec64 inode_get_ctime(const struct inode *inode) return ts; } -static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode, - struct timespec64 ts) -{ - inode->i_ctime_sec = ts.tv_sec; - inode->i_ctime_nsec = ts.tv_nsec; - return ts; -} +struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts); /** * inode_set_ctime - set the ctime in the inode @@ -2500,6 +2505,7 @@ struct file_system_type { #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ +#define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; @@ -2523,6 +2529,17 @@ struct file_system_type { #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) +/** + * is_mgtime: is this inode using multigrain timestamps + * @inode: inode to test for multigrain timestamps + * + * Return true if the inode uses multigrain timestamps, false otherwise. + */ +static inline bool is_mgtime(const struct inode *inode) +{ + return inode->i_sb->s_type->fs_flags & FS_MGTIME; +} + extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); @@ -3262,6 +3279,7 @@ extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); +void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); void generic_fill_statx_atomic_writes(struct kstat *stat, |
