From 880b9577855edddda1e732748e849c63199d489b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 17 Jul 2023 09:00:09 -0700 Subject: fs: distinguish between user initiated freeze and kernel initiated freeze Userspace can freeze a filesystem using the FIFREEZE ioctl or by suspending the block device; this state persists until userspace thaws the filesystem with the FITHAW ioctl or resuming the block device. Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for the fsfreeze ioctl") we only allow the first freeze command to succeed. The kernel may decide that it is necessary to freeze a filesystem for its own internal purposes, such as suspends in progress, filesystem fsck activities, or quiescing a device prior to removal. Userspace thaw commands must never break a kernel freeze, and kernel thaw commands shouldn't undo userspace's freeze command. Introduce a couple of freeze holder flags and wire it into the sb_writers state. One kernel and one userspace freeze are allowed to coexist at the same time; the filesystem will not thaw until both are lifted. I wonder if the f2fs/gfs2 code should be using a kernel freeze here, but for now we'll use FREEZE_HOLDER_USERSPACE to preserve existing behaviors. Cc: mcgrof@kernel.org Cc: jack@suse.cz Cc: hch@infradead.org Cc: ruansy.fnst@fujitsu.com Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Jan Kara --- include/linux/fs.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6867512907d6..5e8d81212abc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1147,7 +1147,8 @@ enum { #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { - int frozen; /* Is sb frozen? */ + unsigned short frozen; /* Is sb frozen? */ + unsigned short freeze_holders; /* Who froze fs? */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; @@ -1902,6 +1903,10 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); +enum freeze_holder { + FREEZE_HOLDER_KERNEL = (1U << 0), + FREEZE_HOLDER_USERSPACE = (1U << 1), +}; struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); @@ -1914,9 +1919,9 @@ struct super_operations { void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - int (*freeze_super) (struct super_block *); + int (*freeze_super) (struct super_block *, enum freeze_holder who); int (*freeze_fs) (struct super_block *); - int (*thaw_super) (struct super_block *); + int (*thaw_super) (struct super_block *, enum freeze_holder who); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); @@ -2290,8 +2295,8 @@ extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); -extern int freeze_super(struct super_block *super); -extern int thaw_super(struct super_block *super); +int freeze_super(struct super_block *super, enum freeze_holder who); +int thaw_super(struct super_block *super, enum freeze_holder who); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); -- cgit v1.2.3 From 6a3207395563f724d91231ec0aa7c4d95bf9591d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Aug 2023 12:26:25 +0100 Subject: fs, block: remove bdev->bd_super bdev->bd_super is unused now, remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Message-Id: <20230807112625.652089-5-hch@lst.de> Signed-off-by: Christian Brauner --- include/linux/blk_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0bad62cca3d0..d5c5e59ddbd2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -52,7 +52,6 @@ struct block_device { atomic_t bd_openers; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct inode * bd_inode; /* will die */ - struct super_block * bd_super; void * bd_claiming; void * bd_holder; const struct blk_holder_ops *bd_holder_ops; -- cgit v1.2.3 From cf6da236c27a73ab91b657232cd3841aab27c37a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Aug 2023 17:41:20 +0200 Subject: fs: export setup_bdev_super We'll want to use setup_bdev_super instead of duplicating it in nilfs2. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Message-Id: <20230802154131.2221419-2-hch@lst.de> Signed-off-by: Christian Brauner --- include/linux/fs_context.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index ff6341e09925..58ef8433a94b 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -158,6 +158,8 @@ extern int get_tree_keyed(struct fs_context *fc, struct fs_context *fc), void *key); +int setup_bdev_super(struct super_block *sb, int sb_flags, + struct fs_context *fc); extern int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)); -- cgit v1.2.3 From 7ecd0b6f510005e120f5bc198bacb5951814cf36 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Aug 2023 17:41:27 +0200 Subject: fs: export fs_holder_ops Export fs_holder_ops so that file systems that open additional block devices can use it as well. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Message-Id: <20230802154131.2221419-9-hch@lst.de> Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ed44a997f629..83262702eea7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1464,6 +1464,8 @@ struct blk_holder_ops { void (*mark_dead)(struct block_device *bdev); }; +extern const struct blk_holder_ops fs_holder_ops; + /* * Return the correct open flags for blkdev_get_by_* for super block flags * as stored in sb->s_flags. -- cgit v1.2.3 From ab6860f62bfe329e11e5b5b7295b673c9c3a62d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:19 +0200 Subject: block: simplify the disk_force_media_change interface Hard code the events to DISK_EVENT_MEDIA_CHANGE as that is the only useful use case, and drop the superfluous return value. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-9-hch@lst.de> Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83262702eea7..c8eab6effc22 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -750,7 +750,7 @@ static inline int bdev_read_only(struct block_device *bdev) } bool set_capacity_and_notify(struct gendisk *disk, sector_t size); -bool disk_force_media_change(struct gendisk *disk, unsigned int events); +void disk_force_media_change(struct gendisk *disk); void add_disk_randomness(struct gendisk *disk) __latent_entropy; void rand_initialize_disk(struct gendisk *disk); -- cgit v1.2.3 From 560e20e4bf6484a0c12f9f3c7a1aa55056948e1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:24 +0200 Subject: block: consolidate __invalidate_device and fsync_bdev We currently have two interfaces that take a block_devices and the find a mounted file systems to flush or invaldidate data on it. Both are a bit problematic because they only work for the "main" block devices that is used as s_dev for the super_block, and because they don't call into the file system at all. Merge the two into a new bdev_mark_dead helper that does both the syncing and invalidation and which is properly documented. This is in preparation of merging the functionality into the ->mark_dead holder operation so that it will work on additional block devices used by a file systems and give us a single entry point for invalidation of dead devices or media. Note that a single standalone fsync_bdev call for an obscure ioctl remains for now, but that one will also be deal with in a bit. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-14-hch@lst.de> Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c8eab6effc22..6721595b9f97 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -751,6 +751,7 @@ static inline int bdev_read_only(struct block_device *bdev) bool set_capacity_and_notify(struct gendisk *disk, sector_t size); void disk_force_media_change(struct gendisk *disk); +void bdev_mark_dead(struct block_device *bdev, bool surprise); void add_disk_randomness(struct gendisk *disk) __latent_entropy; void rand_initialize_disk(struct gendisk *disk); @@ -809,7 +810,6 @@ int __register_blkdev(unsigned int major, const char *name, void unregister_blkdev(unsigned int major, const char *name); bool disk_check_media_change(struct gendisk *disk); -int __invalidate_device(struct block_device *bdev, bool kill_dirty); void set_capacity(struct gendisk *disk, sector_t size); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED -- cgit v1.2.3 From d8530de5a6e82be0ce17a5fdf727a394bcf6444c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:25 +0200 Subject: block: call into the file system for bdev_mark_dead Combine the newly merged bdev_mark_dead helper with the existing mark_dead holder operation so that all operations that invalidate a device that is dead or being removed now go through the holder ops. This allows file systems to explicitly shutdown either ASAP (for a surprise removal) or after writing back data (for an orderly removal), and do so not only for the main device. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-15-hch@lst.de> Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6721595b9f97..cdd03c612d39 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1461,7 +1461,7 @@ void blkdev_show(struct seq_file *seqf, off_t offset); #endif struct blk_holder_ops { - void (*mark_dead)(struct block_device *bdev); + void (*mark_dead)(struct block_device *bdev, bool surprise); }; extern const struct blk_holder_ops fs_holder_ops; -- cgit v1.2.3 From 2142b88c37a3e49fbca4a36b8674626917d9bf40 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:26 +0200 Subject: block: call into the file system for ioctl BLKFLSBUF BLKFLSBUF is a historic ioctl that is called on a file handle to a block device and syncs either the file system mounted on that block device if there is one, or otherwise the just the data on the block device. Replace the get_super based syncing with a holder operation to remove the last usage of get_super, and to also support syncing the file system if the block device is not the main block device stored in s_dev. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-16-hch@lst.de> Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cdd03c612d39..fa462d48ee96 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1462,6 +1462,11 @@ void blkdev_show(struct seq_file *seqf, off_t offset); struct blk_holder_ops { void (*mark_dead)(struct block_device *bdev, bool surprise); + + /* + * Sync the file system mounted on the block device. + */ + void (*sync)(struct block_device *bdev); }; extern const struct blk_holder_ops fs_holder_ops; @@ -1524,8 +1529,6 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev) } #endif /* CONFIG_BLOCK */ -int fsync_bdev(struct block_device *bdev); - int freeze_bdev(struct block_device *bdev); int thaw_bdev(struct block_device *bdev); -- cgit v1.2.3 From 38bcdd38935350abceace901313e007e84d84456 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:27 +0200 Subject: fs: remove get_super get_super is unused now, remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-17-hch@lst.de> Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6867512907d6..14b5777a24a0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2913,7 +2913,6 @@ extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); -extern struct super_block *get_super(struct block_device *); extern struct super_block *get_active_super(struct block_device *bdev); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); -- cgit v1.2.3 From 5e87491415217d6bec0bcae08a3156622be2b177 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 18 Aug 2023 16:00:50 +0200 Subject: super: wait for nascent superblocks Recent patches experiment with making it possible to allocate a new superblock before opening the relevant block device. Naturally this has intricate side-effects that we get to learn about while developing this. Superblock allocators such as sget{_fc}() return with s_umount of the new superblock held and lock ordering currently requires that block level locks such as bdev_lock and open_mutex rank above s_umount. Before aca740cecbe5 ("fs: open block device after superblock creation") ordering was guaranteed to be correct as block devices were opened prior to superblock allocation and thus s_umount wasn't held. But now s_umount must be dropped before opening block devices to avoid locking violations. This has consequences. The main one being that iterators over @super_blocks and @fs_supers that grab a temporary reference to the superblock can now also grab s_umount before the caller has managed to open block devices and called fill_super(). So whereas before such iterators or concurrent mounts would have simply slept on s_umount until SB_BORN was set or the superblock was discard due to initalization failure they can now needlessly spin through sget{_fc}(). If the caller is sleeping on bdev_lock or open_mutex one caller waiting on SB_BORN will always spin somewhere and potentially this can go on for quite a while. It should be possible to drop s_umount while allowing iterators to wait on a nascent superblock to either be born or discarded. This patch implements a wait_var_event() mechanism allowing iterators to sleep until they are woken when the superblock is born or discarded. This also allows us to avoid relooping through @fs_supers and @super_blocks if a superblock isn't yet born or dying. Link: aca740cecbe5 ("fs: open block device after superblock creation") Reviewed-by: Jan Kara Message-Id: <20230818-vfs-super-fixes-v3-v3-3-9f0b1876e46b@kernel.org> Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 14b5777a24a0..173672645156 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1095,6 +1095,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ +#define SB_DYING BIT(24) #define SB_SUBMOUNT BIT(26) #define SB_FORCE BIT(27) #define SB_NOSEC BIT(28) -- cgit v1.2.3 From 2c18a63b760a0f68f14cb8bb4c3840bb0b63b73e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 18 Aug 2023 16:00:51 +0200 Subject: super: wait until we passed kill super Recent rework moved block device closing out of sb->put_super() and into sb->kill_sb() to avoid deadlocks as s_umount is held in put_super() and blkdev_put() can end up taking s_umount again. That means we need to move the removal of the superblock from @fs_supers out of generic_shutdown_super() and into deactivate_locked_super() to ensure that concurrent mounters don't fail to open block devices that are still in use because blkdev_put() in sb->kill_sb() hasn't been called yet. We can now do this as we can make iterators through @fs_super and @super_blocks wait without holding s_umount. Concurrent mounts will wait until a dying superblock is fully dead so until sb->kill_sb() has been called and SB_DEAD been set. Concurrent iterators can already discard any SB_DYING superblock. Reviewed-by: Jan Kara Message-Id: <20230818-vfs-super-fixes-v3-v3-4-9f0b1876e46b@kernel.org> Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 173672645156..a63da68305e9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1095,6 +1095,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ +#define SB_DEAD BIT(21) #define SB_DYING BIT(24) #define SB_SUBMOUNT BIT(26) #define SB_FORCE BIT(27) -- cgit v1.2.3