diff options
| -rw-r--r-- | fs/Makefile | 2 | ||||
| -rw-r--r-- | fs/fserror.c | 194 | ||||
| -rw-r--r-- | fs/super.c | 3 | ||||
| -rw-r--r-- | include/linux/fs/super_types.h | 7 | ||||
| -rw-r--r-- | include/linux/fserror.h | 75 |
5 files changed, 280 insertions, 1 deletions
diff --git a/fs/Makefile b/fs/Makefile
index a04274a3c854..f238cc5ea2e9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
 		fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \
 		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
-		file_attr.o
+		file_attr.o fserror.o
 
 obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
 obj-$(CONFIG_PROC_FS) += proc_namespace.o
diff --git a/fs/fserror.c b/fs/fserror.c
new file mode 100644
index 000000000000..06ca86adab9b
--- /dev/null
+++ b/fs/fserror.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2025 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/mempool.h>
+#include <linux/fserror.h>
+
+#define FSERROR_DEFAULT_EVENT_POOL_SIZE (32)
+
+static struct mempool fserror_events_pool;
+
+void fserror_mount(struct super_block *sb)
+{
+	/*
+	 * The pending error counter is biased by 1 so that wake_up_var is not
+	 * called until we're actually trying to unmount.
+	 */
+	refcount_set(&sb->s_pending_errors, 1);
+}
+
+void fserror_unmount(struct super_block *sb)
+{
+	/*
+	 * If we don't drop the pending error count to zero, then wait for it
+	 * to drop below 1, which means that the pending errors cleared and
+	 * hopefully we didn't saturate with 1 billion+ concurrent events.
+	 */
+	if (!refcount_dec_and_test(&sb->s_pending_errors))
+		wait_var_event(&sb->s_pending_errors,
+			       refcount_read(&sb->s_pending_errors) < 1);
+}
+
+static inline void fserror_pending_dec(struct super_block *sb)
+{
+	if (refcount_dec_and_test(&sb->s_pending_errors))
+		wake_up_var(&sb->s_pending_errors);
+}
+
+static inline void fserror_free_event(struct fserror_event *event)
+{
+	fserror_pending_dec(event->sb);
+	mempool_free(event, &fserror_events_pool);
+}
+
+static void fserror_worker(struct work_struct *work)
+{
+	struct fserror_event *event =
+		container_of(work, struct fserror_event, work);
+	struct super_block *sb = event->sb;
+
+	if (sb->s_flags & SB_ACTIVE) {
+		struct fs_error_report report = {
+			/* send positive error number to userspace */
+			.error = -event->error,
+			.inode = event->inode,
+			.sb = event->sb,
+		};
+
+		if (sb->s_op->report_error)
+			sb->s_op->report_error(event);
+
+		fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, NULL, NULL,
+			 NULL, 0);
+	}
+
+	iput(event->inode);
+	fserror_free_event(event);
+}
+
+static inline struct fserror_event *fserror_alloc_event(struct super_block *sb,
+							gfp_t gfp_flags)
+{
+	struct fserror_event *event = NULL;
+
+	/*
+	 * If s_pending_errors already reached zero or the superblock is no
+	 * longer active, the superblock is being deactivated so there's no
+	 * point in continuing.
+	 *
+	 * The order of the checks of s_pending_errors and SB_ACTIVE is
+	 * mandated by the order of accesses in generic_shutdown_super and
+	 * fserror_unmount. Barriers are implicitly provided by the refcount
+	 * manipulations in this function and fserror_unmount.
+	 */
+	if (!refcount_inc_not_zero(&sb->s_pending_errors))
+		return NULL;
+	if (!(sb->s_flags & SB_ACTIVE))
+		goto out_pending;
+
+	event = mempool_alloc(&fserror_events_pool, gfp_flags);
+	if (!event)
+		goto out_pending;
+
+	/* mempool_alloc doesn't support GFP_ZERO */
+	memset(event, 0, sizeof(*event));
+	event->sb = sb;
+	INIT_WORK(&event->work, fserror_worker);
+
+	return event;
+
+out_pending:
+	fserror_pending_dec(sb);
+	return NULL;
+}
+
+/**
+ * fserror_report - report a filesystem error of some kind
+ *
+ * @sb: superblock of the filesystem
+ * @inode: inode within that filesystem, if applicable
+ * @type: type of error encountered
+ * @pos: start of inode range affected, if applicable
+ * @len: length of inode range affected, if applicable
+ * @error: error number encountered, must be negative
+ * @gfp: memory allocation flags for conveying the event to a worker,
+ *       since this function can be called from atomic contexts
+ *
+ * Report details of a filesystem error to the super_operations::report_error
+ * callback if present; and to fsnotify for distribution to userspace. @sb,
+ * @gfp, @type, and @error must all be specified. For file I/O errors, the
+ * @inode, @pos, and @len fields must also be specified. For file metadata
+ * errors, @inode must be specified. If @inode is not NULL, then @inode->i_sb
+ * must point to @sb.
+ *
+ * Reporting work is deferred to a workqueue to ensure that ->report_error is
+ * called from process context without any locks held. An active reference to
+ * the inode is maintained until event handling is complete, and unmount will
+ * wait for queued events to drain.
+ */
+void fserror_report(struct super_block *sb, struct inode *inode,
+		    enum fserror_type type, loff_t pos, u64 len, int error,
+		    gfp_t gfp)
+{
+	struct fserror_event *event;
+
+	/* sb and inode must be from the same filesystem */
+	WARN_ON_ONCE(inode && inode->i_sb != sb);
+
+	/* error number must be negative */
+	WARN_ON_ONCE(error >= 0);
+
+	event = fserror_alloc_event(sb, gfp);
+	if (!event)
+		goto lost;
+
+	event->type = type;
+	event->pos = pos;
+	event->len = len;
+	event->error = error;
+
+	/*
+	 * Can't iput from non-sleeping context, so grabbing another reference
+	 * to the inode must be the last thing before submitting the event.
+	 */
+	if (inode) {
+		event->inode = igrab(inode);
+		if (!event->inode)
+			goto lost_event;
+	}
+
+	/*
+	 * Use schedule_work here even if we're already in process context so
+	 * that fsnotify and super_operations::report_error implementations are
+	 * guaranteed to run in process context without any locks held. Since
+	 * errors are supposed to be rare, the overhead shouldn't kill us any
+	 * more than the failing device will.
+	 */
+	schedule_work(&event->work);
+	return;
+
+lost_event:
+	fserror_free_event(event);
+lost:
+	if (inode)
+		pr_err_ratelimited(
+ "%s: lost file I/O error report for ino %lu type %u pos 0x%llx len 0x%llx error %d",
+		       sb->s_id, inode->i_ino, type, pos, len, error);
+	else
+		pr_err_ratelimited(
+ "%s: lost filesystem error report for type %u error %d",
+		       sb->s_id, type, error);
+}
+EXPORT_SYMBOL_GPL(fserror_report);
+
+static int __init fserror_init(void)
+{
+	return mempool_init_kmalloc_pool(&fserror_events_pool,
+					 FSERROR_DEFAULT_EVENT_POOL_SIZE,
+					 sizeof(struct fserror_event));
+}
+fs_initcall(fserror_init);
diff --git a/fs/super.c b/fs/super.c
index 3d85265d1400..b13c1fd6a6f4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,6 +36,7 @@
 #include <linux/lockdep.h>
 #include <linux/user_namespace.h>
 #include <linux/fs_context.h>
+#include <linux/fserror.h>
 #include <uapi/linux/mount.h>
 #include "internal.h"
 
@@ -363,6 +364,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	spin_lock_init(&s->s_inode_list_lock);
 	INIT_LIST_HEAD(&s->s_inodes_wb);
 	spin_lock_init(&s->s_inode_wblist_lock);
+	fserror_mount(s);
 
 	s->s_count = 1;
 	atomic_set(&s->s_active, 1);
@@ -622,6 +624,7 @@ void generic_shutdown_super(struct super_block *sb)
 		sync_filesystem(sb);
 		sb->s_flags &= ~SB_ACTIVE;
 
+		fserror_unmount(sb);
 		cgroup_writeback_umount(sb);
 
 		/* Evict all inodes with zero refcount. */
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 6bd3009e09b3..97a8552d8f2b 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -35,6 +35,7 @@ struct user_namespace;
 struct workqueue_struct;
 struct writeback_control;
 struct xattr_handler;
+struct fserror_event;
 
 extern struct super_block *blockdev_superblock;
 
@@ -124,6 +125,9 @@ struct super_operations {
 	 */
 	int (*remove_bdev)(struct super_block *sb, struct block_device *bdev);
 	void (*shutdown)(struct super_block *sb);
+
+	/* Report a filesystem error */
+	void (*report_error)(const struct fserror_event *event);
 };
 
 struct super_block {
@@ -268,6 +272,9 @@ struct super_block {
 	spinlock_t s_inode_wblist_lock;
 	struct list_head s_inodes_wb; /* writeback inodes */
 	long s_min_writeback_pages;
+
+	/* number of fserrors that are being sent to fsnotify/filesystems */
+	refcount_t s_pending_errors;
 } __randomize_layout;
 
 /*
diff --git a/include/linux/fserror.h b/include/linux/fserror.h
new file mode 100644
index 000000000000..5e1ad78c346e
--- /dev/null
+++ b/include/linux/fserror.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2025 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef _LINUX_FSERROR_H__
+#define _LINUX_FSERROR_H__
+
+void fserror_mount(struct super_block *sb);
+void fserror_unmount(struct super_block *sb);
+
+enum fserror_type {
+	/* pagecache I/O failed */
+	FSERR_BUFFERED_READ,
+	FSERR_BUFFERED_WRITE,
+
+	/* direct I/O failed */
+	FSERR_DIRECTIO_READ,
+	FSERR_DIRECTIO_WRITE,
+
+	/* out of band media error reported */
+	FSERR_DATA_LOST,
+
+	/* filesystem metadata */
+	FSERR_METADATA,
+};
+
+struct fserror_event {
+	struct work_struct work;	/* runs fserror_worker */
+	struct super_block *sb;		/* filesystem that hit the error */
+	struct inode *inode;		/* affected inode, or NULL */
+	loff_t pos;			/* start of affected range */
+	u64 len;			/* length of affected range */
+	enum fserror_type type;		/* kind of failure */
+
+	/* negative error number */
+	int error;
+};
+
+void fserror_report(struct super_block *sb, struct inode *inode,
+		    enum fserror_type type, loff_t pos, u64 len, int error,
+		    gfp_t gfp);
+
+static inline void fserror_report_io(struct inode *inode,
+				     enum fserror_type type, loff_t pos,
+				     u64 len, int error, gfp_t gfp)
+{
+	fserror_report(inode->i_sb, inode, type, pos, len, error, gfp);
+}
+
+static inline void fserror_report_data_lost(struct inode *inode, loff_t pos,
+					    u64 len, gfp_t gfp)
+{
+	fserror_report(inode->i_sb, inode, FSERR_DATA_LOST, pos, len, -EIO,
+		       gfp);
+}
+
+static inline void fserror_report_file_metadata(struct inode *inode, int error,
+						gfp_t gfp)
+{
+	fserror_report(inode->i_sb, inode, FSERR_METADATA, 0, 0, error, gfp);
+}
+
+static inline void fserror_report_metadata(struct super_block *sb, int error,
+					   gfp_t gfp)
+{
+	fserror_report(sb, NULL, FSERR_METADATA, 0, 0, error, gfp);
+}
+
+static inline void fserror_report_shutdown(struct super_block *sb, gfp_t gfp)
+{
+	fserror_report(sb, NULL, FSERR_METADATA, 0, 0, -ESHUTDOWN, gfp);
+}
+
+#endif /* _LINUX_FSERROR_H__ */
|
