summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- fs/Makefile 2
-rw-r--r-- fs/fserror.c 194
-rw-r--r-- fs/super.c 3
-rw-r--r-- include/linux/fs/super_types.h 7
-rw-r--r-- include/linux/fserror.h 75
5 files changed, 280 insertions, 1 deletions
diff --git a/fs/Makefile b/fs/Makefile
index a04274a3c854..f238cc5ea2e9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \
kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
- file_attr.o
+ file_attr.o fserror.o
obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
diff --git a/fs/fserror.c b/fs/fserror.c
new file mode 100644
index 000000000000..06ca86adab9b
--- /dev/null
+++ b/fs/fserror.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2025 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/mempool.h>
+#include <linux/fserror.h>
+
+#define FSERROR_DEFAULT_EVENT_POOL_SIZE (32)
+
+static struct mempool fserror_events_pool;
+
+void fserror_mount(struct super_block *sb)
+{
+ /*
+ * The pending error counter is biased by 1 so that we don't wake_var
+ * until we're actually trying to unmount.
+ */
+ refcount_set(&sb->s_pending_errors, 1);
+}
+
+void fserror_unmount(struct super_block *sb)
+{
+ /*
+ * Drop the bias of 1 set by fserror_mount. If that doesn't bring the
+ * count to zero, events are still pending, so wait until it gets there
+ * (and hope we didn't saturate with 1 billion+ concurrent events).
+ */
+ if (!refcount_dec_and_test(&sb->s_pending_errors))
+ wait_var_event(&sb->s_pending_errors,
+ refcount_read(&sb->s_pending_errors) < 1);
+}
+
+static inline void fserror_pending_dec(struct super_block *sb)
+{
+ if (refcount_dec_and_test(&sb->s_pending_errors))
+ wake_up_var(&sb->s_pending_errors);
+}
+
+static inline void fserror_free_event(struct fserror_event *event)
+{
+ fserror_pending_dec(event->sb);
+ mempool_free(event, &fserror_events_pool);
+}
+
+static void fserror_worker(struct work_struct *work)
+{
+ struct fserror_event *event =
+ container_of(work, struct fserror_event, work);
+ struct super_block *sb = event->sb;
+
+ if (sb->s_flags & SB_ACTIVE) {
+ struct fs_error_report report = {
+ /* send positive error number to userspace */
+ .error = -event->error,
+ .inode = event->inode,
+ .sb = event->sb,
+ };
+
+ if (sb->s_op->report_error)
+ sb->s_op->report_error(event);
+
+ fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, NULL, NULL,
+ NULL, 0);
+ }
+
+ iput(event->inode);
+ fserror_free_event(event);
+}
+
+static inline struct fserror_event *fserror_alloc_event(struct super_block *sb,
+ gfp_t gfp_flags)
+{
+ struct fserror_event *event = NULL;
+
+ /*
+ * If pending_errors already reached zero or the superblock is no
+ * longer active, the superblock is being deactivated so there's no
+ * point in continuing.
+ *
+ * The order of the checks of s_pending_errors and SB_ACTIVE is
+ * mandated by the order of accesses in generic_shutdown_super and
+ * fserror_unmount. Barriers are implicitly provided by the refcount
+ * manipulations in this function and fserror_unmount.
+ */
+ if (!refcount_inc_not_zero(&sb->s_pending_errors))
+ return NULL;
+ if (!(sb->s_flags & SB_ACTIVE))
+ goto out_pending;
+
+ event = mempool_alloc(&fserror_events_pool, gfp_flags);
+ if (!event)
+ goto out_pending;
+
+ /* mempool_alloc doesn't support __GFP_ZERO, so clear the event by hand */
+ memset(event, 0, sizeof(*event));
+ event->sb = sb;
+ INIT_WORK(&event->work, fserror_worker);
+
+ return event;
+
+out_pending:
+ fserror_pending_dec(sb);
+ return NULL;
+}
+
+/**
+ * fserror_report - report a filesystem error of some kind
+ *
+ * @sb: superblock of the filesystem
+ * @inode: inode within that filesystem, if applicable
+ * @type: type of error encountered
+ * @pos: start of inode range affected, if applicable
+ * @len: length of inode range affected, if applicable
+ * @error: error number encountered, must be negative
+ * @gfp: memory allocation flags for conveying the event to a worker,
+ * since this function can be called from atomic contexts
+ *
+ * Report details of a filesystem error to the super_operations::report_error
+ * callback if present; and to fsnotify for distribution to userspace. @sb,
+ * @gfp, @type, and @error must all be specified. For file I/O errors, the
+ * @inode, @pos, and @len fields must also be specified. For file metadata
+ * errors, @inode must be specified. If @inode is not NULL, then @inode->i_sb
+ * must point to @sb.
+ *
+ * Reporting work is deferred to a workqueue so that ->report_error is called
+ * from process context without any locks held. An active inode reference is
+ * held until event handling completes, and unmount waits for queued events to
+ * drain. If the event cannot be queued, a ratelimited message is logged.
+ */
+void fserror_report(struct super_block *sb, struct inode *inode,
+ enum fserror_type type, loff_t pos, u64 len, int error,
+ gfp_t gfp)
+{
+ struct fserror_event *event;
+
+ /* sb and inode must be from the same filesystem */
+ WARN_ON_ONCE(inode && inode->i_sb != sb);
+
+ /* error number must be negative */
+ WARN_ON_ONCE(error >= 0);
+
+ event = fserror_alloc_event(sb, gfp);
+ if (!event)
+ goto lost;
+
+ event->type = type;
+ event->pos = pos;
+ event->len = len;
+ event->error = error;
+
+ /*
+ * Can't iput from non-sleeping context, so grabbing another reference
+ * to the inode must be the last thing before submitting the event.
+ */
+ if (inode) {
+ event->inode = igrab(inode);
+ if (!event->inode)
+ goto lost_event;
+ }
+
+ /*
+ * Use schedule_work here even if we're already in process context so
+ * that fsnotify and super_operations::report_error implementations are
+ * guaranteed to run in process context without any locks held. Since
+ * errors are supposed to be rare, the overhead shouldn't kill us any
+ * more than the failing device will.
+ */
+ schedule_work(&event->work);
+ return;
+
+lost_event:
+ fserror_free_event(event);
+lost:
+ if (inode)
+ pr_err_ratelimited(
+ "%s: lost file I/O error report for ino %lu type %u pos 0x%llx len 0x%llx error %d\n",
+ sb->s_id, inode->i_ino, type, pos, len, error);
+ else
+ pr_err_ratelimited(
+ "%s: lost filesystem error report for type %u error %d\n",
+ sb->s_id, type, error);
+}
+EXPORT_SYMBOL_GPL(fserror_report);
+
+static int __init fserror_init(void)
+{
+ return mempool_init_kmalloc_pool(&fserror_events_pool,
+ FSERROR_DEFAULT_EVENT_POOL_SIZE,
+ sizeof(struct fserror_event));
+}
+fs_initcall(fserror_init);
diff --git a/fs/super.c b/fs/super.c
index 3d85265d1400..b13c1fd6a6f4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,6 +36,7 @@
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
#include <linux/fs_context.h>
+#include <linux/fserror.h>
#include <uapi/linux/mount.h>
#include "internal.h"
@@ -363,6 +364,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
spin_lock_init(&s->s_inode_list_lock);
INIT_LIST_HEAD(&s->s_inodes_wb);
spin_lock_init(&s->s_inode_wblist_lock);
+ fserror_mount(s);
s->s_count = 1;
atomic_set(&s->s_active, 1);
@@ -622,6 +624,7 @@ void generic_shutdown_super(struct super_block *sb)
sync_filesystem(sb);
sb->s_flags &= ~SB_ACTIVE;
+ fserror_unmount(sb);
cgroup_writeback_umount(sb);
/* Evict all inodes with zero refcount. */
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 6bd3009e09b3..97a8552d8f2b 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -35,6 +35,7 @@ struct user_namespace;
struct workqueue_struct;
struct writeback_control;
struct xattr_handler;
+struct fserror_event;
extern struct super_block *blockdev_superblock;
@@ -124,6 +125,9 @@ struct super_operations {
*/
int (*remove_bdev)(struct super_block *sb, struct block_device *bdev);
void (*shutdown)(struct super_block *sb);
+
+ /* Report a filesystem error */
+ void (*report_error)(const struct fserror_event *event);
};
struct super_block {
@@ -268,6 +272,9 @@ struct super_block {
spinlock_t s_inode_wblist_lock;
struct list_head s_inodes_wb; /* writeback inodes */
long s_min_writeback_pages;
+
+ /* number of fserrors that are being sent to fsnotify/filesystems */
+ refcount_t s_pending_errors;
} __randomize_layout;
/*
diff --git a/include/linux/fserror.h b/include/linux/fserror.h
new file mode 100644
index 000000000000..5e1ad78c346e
--- /dev/null
+++ b/include/linux/fserror.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2025 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef _LINUX_FSERROR_H__
+#define _LINUX_FSERROR_H__
+
+void fserror_mount(struct super_block *sb);
+void fserror_unmount(struct super_block *sb);
+
+enum fserror_type {
+ /* pagecache I/O failed */
+ FSERR_BUFFERED_READ,
+ FSERR_BUFFERED_WRITE,
+
+ /* direct I/O failed */
+ FSERR_DIRECTIO_READ,
+ FSERR_DIRECTIO_WRITE,
+
+ /* out of band media error reported */
+ FSERR_DATA_LOST,
+
+ /* filesystem metadata */
+ FSERR_METADATA,
+};
+
+struct fserror_event {
+ struct work_struct work;
+ struct super_block *sb;
+ struct inode *inode;
+ loff_t pos;
+ u64 len;
+ enum fserror_type type;
+
+ /* negative error number */
+ int error;
+};
+
+void fserror_report(struct super_block *sb, struct inode *inode,
+ enum fserror_type type, loff_t pos, u64 len, int error,
+ gfp_t gfp);
+
+static inline void fserror_report_io(struct inode *inode,
+ enum fserror_type type, loff_t pos,
+ u64 len, int error, gfp_t gfp)
+{
+ fserror_report(inode->i_sb, inode, type, pos, len, error, gfp);
+}
+
+static inline void fserror_report_data_lost(struct inode *inode, loff_t pos,
+ u64 len, gfp_t gfp)
+{
+ fserror_report(inode->i_sb, inode, FSERR_DATA_LOST, pos, len, -EIO,
+ gfp);
+}
+
+static inline void fserror_report_file_metadata(struct inode *inode, int error,
+ gfp_t gfp)
+{
+ fserror_report(inode->i_sb, inode, FSERR_METADATA, 0, 0, error, gfp);
+}
+
+static inline void fserror_report_metadata(struct super_block *sb, int error,
+ gfp_t gfp)
+{
+ fserror_report(sb, NULL, FSERR_METADATA, 0, 0, error, gfp);
+}
+
+static inline void fserror_report_shutdown(struct super_block *sb, gfp_t gfp)
+{
+ fserror_report(sb, NULL, FSERR_METADATA, 0, 0, -ESHUTDOWN, gfp);
+}
+
+#endif /* _LINUX_FSERROR_H__ */