diff options
| author | Carlos Maiolino <cem@kernel.org> | 2026-01-28 09:47:42 +0100 |
|---|---|---|
| committer | Carlos Maiolino <cem@kernel.org> | 2026-01-28 10:02:20 +0100 |
| commit | 04a65666a69508fa0022c7343026c5a3d41d166d (patch) | |
| tree | 090cd7d57c17058f6095b02c7b6fd4e4206cd427 | |
| parent | a1ca658d649a4d8972e2e21ac2625b633217e327 (diff) | |
| parent | b8accfd65d31f25b9df15ec2419179b6fa0b21d5 (diff) | |
Merge tag 'health-monitoring-7.0_2026-01-20' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-7.0-merge
xfs: autonomous self healing of filesystems [v7]
This patchset builds new functionality to deliver live information about
filesystem health events to userspace. This is done by creating an
anonymous file that can be read() for events by userspace programs.
Events are captured by hooking various parts of XFS and iomap so that
metadata health failures, file I/O errors, and major changes in
filesystem state (unmounts, shutdowns, etc.) can be observed by
programs.
When an event occurs, the hook functions queue an event object to each
event anonfd for later processing. Programs must have CAP_SYS_ADMIN
to open the anonfd and there's a maximum event lag to prevent resource
overconsumption. The events themselves can be read() from the anonfd
as C structs for the xfs_healer daemon.
In userspace, we create a new daemon program that will read the event
objects and initiate repairs automatically. This daemon is managed
entirely by systemd and will not block unmounting of the filesystem
unless repairs are ongoing. It is auto-started by a starter
service that uses fanotify.
This patchset depends on the new fserror code that Christian Brauner
has tentatively accepted for Linux 7.0:
https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git/log/?h=vfs-7.0.fserror
v7: more cleanups of the media verification ioctl, improve comments, and
reuse the bio
v6: fix pi-breaking bugs, make verify failures trigger health reports
and filter bio status flags better
v5: add verify-media ioctl, collapse small helper funcs with only
one caller
v4: drop multiple client support so we can make direct calls into
healthmon instead of chasing pointers and doing indirect calls
v3: drag out of rfc status
With a bit of luck, this should all go splendidly.
Conflicts:
This merge required updates to the following files:
- fs/xfs/xfs_healthmon.c
- fs/xfs/xfs_verify_media.c
This change was required because a parallel development renamed
the XFS header file xfs.h to xfs_platform.h, so the merge
required updating those includes in both files above.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
44 files changed, 3146 insertions, 29 deletions
diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h index 3d265f6babaf..6791f6508632 100644 --- a/arch/alpha/include/uapi/asm/errno.h +++ b/arch/alpha/include/uapi/asm/errno.h @@ -55,6 +55,7 @@ #define ENOSR 82 /* Out of streams resources */ #define ETIME 83 /* Timer expired */ #define EBADMSG 84 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EPROTO 85 /* Protocol error */ #define ENODATA 86 /* No data available */ #define ENOSTR 87 /* Device not a stream */ @@ -96,6 +97,7 @@ #define EREMCHG 115 /* Remote address changed */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h index 2fb714e2d6d8..c01ed91b1ef4 100644 --- a/arch/mips/include/uapi/asm/errno.h +++ b/arch/mips/include/uapi/asm/errno.h @@ -50,6 +50,7 @@ #define EDOTDOT 73 /* RFS specific error */ #define EMULTIHOP 74 /* Multihop attempted */ #define EBADMSG 77 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define ENAMETOOLONG 78 /* File name too long */ #define EOVERFLOW 79 /* Value too large for defined data type */ #define ENOTUNIQ 80 /* Name not unique on network */ @@ -88,6 +89,7 @@ #define EISCONN 133 /* Transport endpoint is already connected */ #define ENOTCONN 134 /* Transport endpoint is not connected */ #define EUCLEAN 135 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 137 /* Not a XENIX named type file */ #define ENAVAIL 138 /* No XENIX semaphores available */ #define EISNAM 139 /* Is a named type file */ diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h index 8d94739d75c6..8cbc07c1903e 100644 --- 
a/arch/parisc/include/uapi/asm/errno.h +++ b/arch/parisc/include/uapi/asm/errno.h @@ -36,6 +36,7 @@ #define EDOTDOT 66 /* RFS specific error */ #define EBADMSG 67 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EUSERS 68 /* Too many users */ #define EDQUOT 69 /* Quota exceeded */ #define ESTALE 70 /* Stale file handle */ @@ -62,6 +63,7 @@ #define ERESTART 175 /* Interrupted system call should be restarted */ #define ESTRPIPE 176 /* Streams pipe error */ #define EUCLEAN 177 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 178 /* Not a XENIX named type file */ #define ENAVAIL 179 /* No XENIX semaphores available */ #define EISNAM 180 /* Is a named type file */ diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h index 81a732b902ee..4a41e7835fd5 100644 --- a/arch/sparc/include/uapi/asm/errno.h +++ b/arch/sparc/include/uapi/asm/errno.h @@ -48,6 +48,7 @@ #define ENOSR 74 /* Out of streams resources */ #define ENOMSG 75 /* No message of desired type */ #define EBADMSG 76 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EIDRM 77 /* Identifier removed */ #define EDEADLK 78 /* Resource deadlock would occur */ #define ENOLCK 79 /* No record locks available */ @@ -91,6 +92,7 @@ #define ENOTUNIQ 115 /* Name not unique on network */ #define ERESTART 116 /* Interrupted syscall should be restarted */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git a/fs/Makefile b/fs/Makefile index a04274a3c854..f238cc5ea2e9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_dirent.o fs_context.o 
fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ - file_attr.o + file_attr.o fserror.o obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index f7f622836198..d06e99baf5d5 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -541,6 +541,4 @@ long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long erofs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ - #endif /* __EROFS_INTERNAL_H */ diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index cf97b76e9fd3..5e0c6c5fcb6c 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -357,7 +357,6 @@ struct ext2_inode { */ #define EXT2_VALID_FS 0x0001 /* Unmounted cleanly */ #define EXT2_ERROR_FS 0x0002 /* Errors detected */ -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ /* * Mount flags diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 56112f201cac..62c091b52bac 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3938,7 +3938,4 @@ extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, get_block_t *get_block); #endif /* __KERNEL__ */ -#define EFSBADCRC EBADMSG /* Bad CRC detected */ -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ - #endif /* _EXT4_H */ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7ce0fc40aec2..ea26cd03d3ce 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -26,6 +26,7 @@ #include <linux/fsmap.h> #include "fsmap.h" #include <trace/events/ext4.h> +#include <linux/fserror.h> typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi, struct ext4_super_block *es, @@ -844,6 +845,7 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) return -EINVAL; } clear_opt(sb, DISCARD); + fserror_report_shutdown(sb, GFP_KERNEL); return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 87205660c5d0..a6241ffb8639 100644 --- 
a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -48,6 +48,7 @@ #include <linux/fsnotify.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> +#include <linux/fserror.h> #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ @@ -824,7 +825,8 @@ void __ext4_error(struct super_block *sb, const char *function, sb->s_id, function, line, current->comm, &vaf); va_end(args); } - fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED); + fserror_report_metadata(sb, error ? -abs(error) : -EFSCORRUPTED, + GFP_ATOMIC); ext4_handle_error(sb, force_ro, error, 0, block, function, line); } @@ -856,7 +858,9 @@ void __ext4_error_inode(struct inode *inode, const char *function, current->comm, &vaf); va_end(args); } - fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); + fserror_report_file_metadata(inode, + error ? -abs(error) : -EFSCORRUPTED, + GFP_ATOMIC); ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, function, line); @@ -896,7 +900,7 @@ void __ext4_error_file(struct file *file, const char *function, current->comm, path, &vaf); va_end(args); } - fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); + fserror_report_file_metadata(inode, -EFSCORRUPTED, GFP_ATOMIC); ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, function, line); @@ -965,7 +969,8 @@ void __ext4_std_error(struct super_block *sb, const char *function, printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); } - fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED); + fserror_report_metadata(sb, errno ? 
-abs(errno) : -EFSCORRUPTED, + GFP_ATOMIC); ext4_handle_error(sb, false, -errno, 0, 0, function, line); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 20edbb99b814..9f3aa3c7f126 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -5004,7 +5004,4 @@ static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi, f2fs_invalidate_compress_pages_range(sbi, blkaddr, len); } -#define EFSBADCRC EBADMSG /* Bad CRC detected */ -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ - #endif /* _LINUX_F2FS_H */ diff --git a/fs/fserror.c b/fs/fserror.c new file mode 100644 index 000000000000..06ca86adab9b --- /dev/null +++ b/fs/fserror.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2025 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include <linux/fs.h> +#include <linux/fsnotify.h> +#include <linux/mempool.h> +#include <linux/fserror.h> + +#define FSERROR_DEFAULT_EVENT_POOL_SIZE (32) + +static struct mempool fserror_events_pool; + +void fserror_mount(struct super_block *sb) +{ + /* + * The pending error counter is biased by 1 so that we don't wake_var + * until we're actually trying to unmount. + */ + refcount_set(&sb->s_pending_errors, 1); +} + +void fserror_unmount(struct super_block *sb) +{ + /* + * If we don't drop the pending error count to zero, then wait for it + * to drop below 1, which means that the pending errors cleared and + * hopefully we didn't saturate with 1 billion+ concurrent events. 
+ */ + if (!refcount_dec_and_test(&sb->s_pending_errors)) + wait_var_event(&sb->s_pending_errors, + refcount_read(&sb->s_pending_errors) < 1); +} + +static inline void fserror_pending_dec(struct super_block *sb) +{ + if (refcount_dec_and_test(&sb->s_pending_errors)) + wake_up_var(&sb->s_pending_errors); +} + +static inline void fserror_free_event(struct fserror_event *event) +{ + fserror_pending_dec(event->sb); + mempool_free(event, &fserror_events_pool); +} + +static void fserror_worker(struct work_struct *work) +{ + struct fserror_event *event = + container_of(work, struct fserror_event, work); + struct super_block *sb = event->sb; + + if (sb->s_flags & SB_ACTIVE) { + struct fs_error_report report = { + /* send positive error number to userspace */ + .error = -event->error, + .inode = event->inode, + .sb = event->sb, + }; + + if (sb->s_op->report_error) + sb->s_op->report_error(event); + + fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, NULL, NULL, + NULL, 0); + } + + iput(event->inode); + fserror_free_event(event); +} + +static inline struct fserror_event *fserror_alloc_event(struct super_block *sb, + gfp_t gfp_flags) +{ + struct fserror_event *event = NULL; + + /* + * If pending_errors already reached zero or is no longer active, + * the superblock is being deactivated so there's no point in + * continuing. + * + * The order of the check of s_pending_errors and SB_ACTIVE are + * mandated by order of accesses in generic_shutdown_super and + * fserror_unmount. Barriers are implicitly provided by the refcount + * manipulations in this function and fserror_unmount. 
+ */ + if (!refcount_inc_not_zero(&sb->s_pending_errors)) + return NULL; + if (!(sb->s_flags & SB_ACTIVE)) + goto out_pending; + + event = mempool_alloc(&fserror_events_pool, gfp_flags); + if (!event) + goto out_pending; + + /* mempool_alloc doesn't support GFP_ZERO */ + memset(event, 0, sizeof(*event)); + event->sb = sb; + INIT_WORK(&event->work, fserror_worker); + + return event; + +out_pending: + fserror_pending_dec(sb); + return NULL; +} + +/** + * fserror_report - report a filesystem error of some kind + * + * @sb: superblock of the filesystem + * @inode: inode within that filesystem, if applicable + * @type: type of error encountered + * @pos: start of inode range affected, if applicable + * @len: length of inode range affected, if applicable + * @error: error number encountered, must be negative + * @gfp: memory allocation flags for conveying the event to a worker, + * since this function can be called from atomic contexts + * + * Report details of a filesystem error to the super_operations::report_error + * callback if present; and to fsnotify for distribution to userspace. @sb, + * @gfp, @type, and @error must all be specified. For file I/O errors, the + * @inode, @pos, and @len fields must also be specified. For file metadata + * errors, @inode must be specified. If @inode is not NULL, then @inode->i_sb + * must point to @sb. + * + * Reporting work is deferred to a workqueue to ensure that ->report_error is + * called from process context without any locks held. An active reference to + * the inode is maintained until event handling is complete, and unmount will + * wait for queued events to drain. 
+ */ +void fserror_report(struct super_block *sb, struct inode *inode, + enum fserror_type type, loff_t pos, u64 len, int error, + gfp_t gfp) +{ + struct fserror_event *event; + + /* sb and inode must be from the same filesystem */ + WARN_ON_ONCE(inode && inode->i_sb != sb); + + /* error number must be negative */ + WARN_ON_ONCE(error >= 0); + + event = fserror_alloc_event(sb, gfp); + if (!event) + goto lost; + + event->type = type; + event->pos = pos; + event->len = len; + event->error = error; + + /* + * Can't iput from non-sleeping context, so grabbing another reference + * to the inode must be the last thing before submitting the event. + */ + if (inode) { + event->inode = igrab(inode); + if (!event->inode) + goto lost_event; + } + + /* + * Use schedule_work here even if we're already in process context so + * that fsnotify and super_operations::report_error implementations are + * guaranteed to run in process context without any locks held. Since + * errors are supposed to be rare, the overhead shouldn't kill us any + * more than the failing device will. 
+ */ + schedule_work(&event->work); + return; + +lost_event: + fserror_free_event(event); +lost: + if (inode) + pr_err_ratelimited( + "%s: lost file I/O error report for ino %lu type %u pos 0x%llx len 0x%llx error %d", + sb->s_id, inode->i_ino, type, pos, len, error); + else + pr_err_ratelimited( + "%s: lost filesystem error report for type %u error %d", + sb->s_id, type, error); +} +EXPORT_SYMBOL_GPL(fserror_report); + +static int __init fserror_init(void) +{ + return mempool_init_kmalloc_pool(&fserror_events_pool, + FSERROR_DEFAULT_EVENT_POOL_SIZE, + sizeof(struct fserror_event)); +} +fs_initcall(fserror_init); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index fd9a2cf95620..4a5f96a7c390 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -8,6 +8,7 @@ #include <linux/writeback.h> #include <linux/swap.h> #include <linux/migrate.h> +#include <linux/fserror.h> #include "internal.h" #include "trace.h" @@ -371,8 +372,11 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, if (folio_test_uptodate(folio)) return 0; - if (WARN_ON_ONCE(size > iomap->length)) + if (WARN_ON_ONCE(size > iomap->length)) { + fserror_report_io(iter->inode, FSERR_BUFFERED_READ, + iomap->offset, size, -EIO, GFP_NOFS); return -EIO; + } if (offset > 0) ifs_alloc(iter->inode, folio, iter->flags); @@ -399,6 +403,11 @@ void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, spin_unlock_irqrestore(&ifs->state_lock, flags); } + if (error) + fserror_report_io(folio->mapping->host, FSERR_BUFFERED_READ, + folio_pos(folio) + off, len, error, + GFP_ATOMIC); + if (finished) folio_end_read(folio, uptodate); } @@ -540,6 +549,10 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, if (!*bytes_submitted) iomap_read_init(folio); ret = ctx->ops->read_folio_range(iter, ctx, plen); + if (ret < 0) + fserror_report_io(iter->inode, + FSERR_BUFFERED_READ, pos, + plen, ret, GFP_NOFS); if (ret) return ret; *bytes_submitted += plen; @@ -815,6 +828,10 
@@ static int __iomap_write_begin(const struct iomap_iter *iter, else status = iomap_bio_read_folio_range_sync(iter, folio, block_start, plen); + if (status < 0) + fserror_report_io(iter->inode, + FSERR_BUFFERED_READ, pos, + len, status, GFP_NOFS); if (status) return status; } @@ -1825,6 +1842,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); u64 end_aligned = 0; + loff_t orig_pos = pos; size_t bytes_submitted = 0; int error = 0; u32 rlen; @@ -1868,6 +1886,9 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) if (bytes_submitted) wpc->nr_folios++; + if (error && pos > orig_pos) + fserror_report_io(inode, FSERR_BUFFERED_WRITE, orig_pos, 0, + error, GFP_NOFS); /* * We can have dirty bits set past end of file in page_mkwrite path diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 8e273408453a..a06c73eaa890 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -7,6 +7,7 @@ #include <linux/pagemap.h> #include <linux/iomap.h> #include <linux/task_io_accounting_ops.h> +#include <linux/fserror.h> #include "internal.h" #include "trace.h" @@ -78,6 +79,13 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, } } +static inline enum fserror_type iomap_dio_err_type(const struct iomap_dio *dio) +{ + if (dio->flags & IOMAP_DIO_WRITE) + return FSERR_DIRECTIO_WRITE; + return FSERR_DIRECTIO_READ; +} + ssize_t iomap_dio_complete(struct iomap_dio *dio) { const struct iomap_dio_ops *dops = dio->dops; @@ -87,6 +95,10 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) if (dops && dops->end_io) ret = dops->end_io(iocb, dio->size, ret, dio->flags); + if (dio->error) + fserror_report_io(file_inode(iocb->ki_filp), + iomap_dio_err_type(dio), offset, dio->size, + dio->error, GFP_NOFS); if (likely(!ret)) { ret = dio->size; diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 86f44922ed3b..5b27ee988967 100644 --- 
a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -6,6 +6,7 @@ #include <linux/list_sort.h> #include <linux/pagemap.h> #include <linux/writeback.h> +#include <linux/fserror.h> #include "internal.h" #include "trace.h" @@ -55,6 +56,11 @@ static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) /* walk all folios in bio, ending page IO on them */ bio_for_each_folio_all(fi, bio) { + if (ioend->io_error) + fserror_report_io(inode, FSERR_BUFFERED_WRITE, + folio_pos(fi.folio) + fi.offset, + fi.length, ioend->io_error, + GFP_ATOMIC); iomap_finish_folio_write(inode, fi.folio, fi.length); folio_count++; } diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 2bfaf377f208..7e1f652f16d3 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -175,6 +175,4 @@ static inline int minix_test_bit(int nr, const void *vaddr) __minix_error_inode((inode), __func__, __LINE__, \ (fmt), ##__VA_ARGS__) -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ - #endif /* FS_MINIX_H */ diff --git a/fs/super.c b/fs/super.c index 3d85265d1400..b13c1fd6a6f4 100644 --- a/fs/super.c +++ b/fs/super.c @@ -36,6 +36,7 @@ #include <linux/lockdep.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> +#include <linux/fserror.h> #include <uapi/linux/mount.h> #include "internal.h" @@ -363,6 +364,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); + fserror_mount(s); s->s_count = 1; atomic_set(&s->s_active, 1); @@ -622,6 +624,7 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; + fserror_unmount(sb); cgroup_writeback_umount(sb); /* Evict all inodes with zero refcount. 
*/ diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 08ec8756b948..8399accc788d 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -55,8 +55,6 @@ #define MF_DUPLICATE_MD 0x01 #define MF_MIRROR_FE_LOADED 0x02 -#define EFSCORRUPTED EUCLEAN - struct udf_meta_data { __u32 s_meta_file_loc; __u32 s_mirror_file_loc; diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 5bf501cf8271..9f7133e02576 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -88,6 +88,7 @@ xfs-y += xfs_aops.o \ xfs_globals.o \ xfs_handle.o \ xfs_health.o \ + xfs_healthmon.o \ xfs_icache.o \ xfs_ioctl.o \ xfs_iomap.o \ @@ -105,6 +106,7 @@ xfs-y += xfs_aops.o \ xfs_symlink.o \ xfs_sysfs.o \ xfs_trans.o \ + xfs_verify_media.o \ xfs_xattr.o # low-level transaction/log code diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 12463ba766da..d165de607d17 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -1003,6 +1003,191 @@ struct xfs_rtgroup_geometry { #define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ #define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */ +/* Health monitor event domains */ + +/* affects the whole fs */ +#define XFS_HEALTH_MONITOR_DOMAIN_MOUNT (0) + +/* metadata health events */ +#define XFS_HEALTH_MONITOR_DOMAIN_FS (1) +#define XFS_HEALTH_MONITOR_DOMAIN_AG (2) +#define XFS_HEALTH_MONITOR_DOMAIN_INODE (3) +#define XFS_HEALTH_MONITOR_DOMAIN_RTGROUP (4) + +/* disk events */ +#define XFS_HEALTH_MONITOR_DOMAIN_DATADEV (5) +#define XFS_HEALTH_MONITOR_DOMAIN_RTDEV (6) +#define XFS_HEALTH_MONITOR_DOMAIN_LOGDEV (7) + +/* file range events */ +#define XFS_HEALTH_MONITOR_DOMAIN_FILERANGE (8) + +/* Health monitor event types */ + +/* status of the monitor itself */ +#define XFS_HEALTH_MONITOR_TYPE_RUNNING (0) +#define XFS_HEALTH_MONITOR_TYPE_LOST (1) + +/* filesystem was unmounted */ +#define XFS_HEALTH_MONITOR_TYPE_UNMOUNT (2) + +/* metadata health events */ +#define XFS_HEALTH_MONITOR_TYPE_SICK (3) +#define 
XFS_HEALTH_MONITOR_TYPE_CORRUPT (4) +#define XFS_HEALTH_MONITOR_TYPE_HEALTHY (5) + +/* filesystem shutdown */ +#define XFS_HEALTH_MONITOR_TYPE_SHUTDOWN (6) + +/* media errors */ +#define XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR (7) + +/* pagecache I/O to a file range failed */ +#define XFS_HEALTH_MONITOR_TYPE_BUFREAD (8) +#define XFS_HEALTH_MONITOR_TYPE_BUFWRITE (9) + +/* direct I/O to a file range failed */ +#define XFS_HEALTH_MONITOR_TYPE_DIOREAD (10) +#define XFS_HEALTH_MONITOR_TYPE_DIOWRITE (11) + +/* out of band media error reported for a file range */ +#define XFS_HEALTH_MONITOR_TYPE_DATALOST (12) + +/* lost events */ +struct xfs_health_monitor_lost { + __u64 count; +}; + +/* fs/rt metadata */ +struct xfs_health_monitor_fs { + /* XFS_FSOP_GEOM_SICK_* flags */ + __u32 mask; +}; + +/* ag/rtgroup metadata */ +struct xfs_health_monitor_group { + /* XFS_{AG,RTGROUP}_SICK_* flags */ + __u32 mask; + __u32 gno; +}; + +/* inode metadata */ +struct xfs_health_monitor_inode { + /* XFS_BS_SICK_* flags */ + __u32 mask; + __u32 gen; + __u64 ino; +}; + +/* shutdown reasons */ +#define XFS_HEALTH_SHUTDOWN_META_IO_ERROR (1u << 0) +#define XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR (1u << 1) +#define XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT (1u << 2) +#define XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE (1u << 3) +#define XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK (1u << 4) +#define XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED (1u << 5) + +/* shutdown */ +struct xfs_health_monitor_shutdown { + /* XFS_HEALTH_SHUTDOWN_* flags */ + __u32 reasons; +}; + +/* file range events */ +struct xfs_health_monitor_filerange { + __u64 pos; + __u64 len; + __u64 ino; + __u32 gen; + __u32 error; +}; + +/* disk media errors */ +struct xfs_health_monitor_media { + __u64 daddr; + __u64 bbcount; +}; + +struct xfs_health_monitor_event { + /* XFS_HEALTH_MONITOR_DOMAIN_* */ + __u32 domain; + + /* XFS_HEALTH_MONITOR_TYPE_* */ + __u32 type; + + /* Timestamp of the event, in nanoseconds since the Unix epoch */ + __u64 time_ns; + + /* + * Details of the 
event. The primary clients are written in python + * and rust, so break this up because bindgen hates anonymous structs + * and unions. + */ + union { + struct xfs_health_monitor_lost lost; + struct xfs_health_monitor_fs fs; + struct xfs_health_monitor_group group; + struct xfs_health_monitor_inode inode; + struct xfs_health_monitor_shutdown shutdown; + struct xfs_health_monitor_media media; + struct xfs_health_monitor_filerange filerange; + } e; + + /* zeroes */ + __u64 pad[2]; +}; + +struct xfs_health_monitor { + __u64 flags; /* flags */ + __u8 format; /* output format */ + __u8 pad[23]; /* zeroes */ +}; + +/* Return all health status events, not just deltas */ +#define XFS_HEALTH_MONITOR_VERBOSE (1ULL << 0) + +#define XFS_HEALTH_MONITOR_ALL (XFS_HEALTH_MONITOR_VERBOSE) + +/* Initial return format version */ +#define XFS_HEALTH_MONITOR_FMT_V0 (0) + +/* + * Check that a given fd points to the same filesystem that the health monitor + * is monitoring. + */ +struct xfs_health_file_on_monitored_fs { + __s32 fd; + __u32 flags; /* zero for now */ +}; + +/* Verify the media of the underlying devices */ +struct xfs_verify_media { + __u32 me_dev; /* I: XFS_DEV_{DATA,LOG,RT} */ + __u32 me_flags; /* I: XFS_VERIFY_MEDIA_* */ + + /* + * IO: inclusive start of disk range to verify, in 512b blocks. + * Will be adjusted upwards as media verification succeeds. + */ + __u64 me_start_daddr; + + /* + * IO: exclusive end of the disk range to verify, in 512b blocks. + * Can be adjusted downwards to match device size. 
+ */ + __u64 me_end_daddr; + + __u32 me_ioerror; /* O: I/O error (positive) */ + __u32 me_max_io_size; /* I: maximum IO size in bytes */ + + __u32 me_rest_us; /* I: rest time between IOs, usecs */ + __u32 me_pad; /* zero */ +}; + +#define XFS_VERIFY_MEDIA_REPORT (1 << 0) /* report to fsnotify */ + +#define XFS_VERIFY_MEDIA_FLAGS (XFS_VERIFY_MEDIA_REPORT) + /* * ioctl commands that are used by Linux filesystems */ @@ -1042,6 +1227,10 @@ struct xfs_rtgroup_geometry { #define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle) #define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head) #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry) +#define XFS_IOC_HEALTH_MONITOR _IOW ('X', 68, struct xfs_health_monitor) +#define XFS_IOC_HEALTH_FD_ON_MONITORED_FS \ + _IOW ('X', 69, struct xfs_health_file_on_monitored_fs) +#define XFS_IOC_VERIFY_MEDIA _IOWR('X', 70, struct xfs_verify_media) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index b31000f7190c..1d45cf5789e8 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -289,4 +289,9 @@ void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs); #define xfs_metadata_is_sick(error) \ (unlikely((error) == -EFSCORRUPTED || (error) == -EFSBADCRC)) +unsigned int xfs_healthmon_inode_mask(unsigned int sick_mask); +unsigned int xfs_healthmon_rtgroup_mask(unsigned int sick_mask); +unsigned int xfs_healthmon_perag_mask(unsigned int sick_mask); +unsigned int xfs_healthmon_fs_mask(unsigned int sick_mask); + #endif /* __XFS_HEALTH_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index b687c5fa34ac..17255c41786b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -25,6 +25,9 @@ #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" #include "xfs_metafile.h" +#include "xfs_healthmon.h" + +#include <linux/fserror.h> /* * Write new AG headers to disk. 
Non-transactional, but need to be @@ -540,6 +543,9 @@ xfs_do_force_shutdown( "Please unmount the filesystem and rectify the problem(s)"); if (xfs_error_level >= XFS_ERRLEVEL_HIGH) xfs_stack_trace(); + + fserror_report_shutdown(mp->m_super, GFP_KERNEL); + xfs_healthmon_report_shutdown(mp, flags); } /* diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 12fa8f24da85..169123772cb3 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -19,6 +19,9 @@ #include "xfs_da_btree.h" #include "xfs_quota_defs.h" #include "xfs_rtgroup.h" +#include "xfs_healthmon.h" + +#include <linux/fserror.h> static void xfs_health_unmount_group( @@ -105,12 +108,19 @@ xfs_fs_mark_sick( struct xfs_mount *mp, unsigned int mask) { + unsigned int old_mask; + ASSERT(!(mask & ~XFS_SICK_FS_ALL)); trace_xfs_fs_mark_sick(mp, mask); spin_lock(&mp->m_sb_lock); + old_mask = mp->m_fs_sick; mp->m_fs_sick |= mask; spin_unlock(&mp->m_sb_lock); + + fserror_report_metadata(mp->m_super, -EFSCORRUPTED, GFP_NOFS); + if (mask) + xfs_healthmon_report_fs(mp, XFS_HEALTHMON_SICK, old_mask, mask); } /* Mark per-fs metadata as having been checked and found unhealthy by fsck. */ @@ -119,13 +129,21 @@ xfs_fs_mark_corrupt( struct xfs_mount *mp, unsigned int mask) { + unsigned int old_mask; + ASSERT(!(mask & ~XFS_SICK_FS_ALL)); trace_xfs_fs_mark_corrupt(mp, mask); spin_lock(&mp->m_sb_lock); + old_mask = mp->m_fs_sick; mp->m_fs_sick |= mask; mp->m_fs_checked |= mask; spin_unlock(&mp->m_sb_lock); + + fserror_report_metadata(mp->m_super, -EFSCORRUPTED, GFP_NOFS); + if (mask) + xfs_healthmon_report_fs(mp, XFS_HEALTHMON_CORRUPT, old_mask, + mask); } /* Mark a per-fs metadata healed. 
*/ @@ -134,15 +152,22 @@ xfs_fs_mark_healthy( struct xfs_mount *mp, unsigned int mask) { + unsigned int old_mask; + ASSERT(!(mask & ~XFS_SICK_FS_ALL)); trace_xfs_fs_mark_healthy(mp, mask); spin_lock(&mp->m_sb_lock); + old_mask = mp->m_fs_sick; mp->m_fs_sick &= ~mask; if (!(mp->m_fs_sick & XFS_SICK_FS_PRIMARY)) mp->m_fs_sick &= ~XFS_SICK_FS_SECONDARY; mp->m_fs_checked |= mask; spin_unlock(&mp->m_sb_lock); + + if (mask) + xfs_healthmon_report_fs(mp, XFS_HEALTHMON_HEALTHY, old_mask, + mask); } /* Sample which per-fs metadata are unhealthy. */ @@ -192,12 +217,20 @@ xfs_group_mark_sick( struct xfs_group *xg, unsigned int mask) { + unsigned int old_mask; + xfs_group_check_mask(xg, mask); trace_xfs_group_mark_sick(xg, mask); spin_lock(&xg->xg_state_lock); + old_mask = xg->xg_sick; xg->xg_sick |= mask; spin_unlock(&xg->xg_state_lock); + + fserror_report_metadata(xg->xg_mount->m_super, -EFSCORRUPTED, GFP_NOFS); + if (mask) + xfs_healthmon_report_group(xg, XFS_HEALTHMON_SICK, old_mask, + mask); } /* @@ -208,13 +241,21 @@ xfs_group_mark_corrupt( struct xfs_group *xg, unsigned int mask) { + unsigned int old_mask; + xfs_group_check_mask(xg, mask); trace_xfs_group_mark_corrupt(xg, mask); spin_lock(&xg->xg_state_lock); + old_mask = xg->xg_sick; xg->xg_sick |= mask; xg->xg_checked |= mask; spin_unlock(&xg->xg_state_lock); + + fserror_report_metadata(xg->xg_mount->m_super, -EFSCORRUPTED, GFP_NOFS); + if (mask) + xfs_healthmon_report_group(xg, XFS_HEALTHMON_CORRUPT, old_mask, + mask); } /* @@ -225,15 +266,22 @@ xfs_group_mark_healthy( struct xfs_group *xg, unsigned int mask) { + unsigned int old_mask; + xfs_group_check_mask(xg, mask); trace_xfs_group_mark_healthy(xg, mask); spin_lock(&xg->xg_state_lock); + old_mask = xg->xg_sick; xg->xg_sick &= ~mask; if (!(xg->xg_sick & XFS_SICK_AG_PRIMARY)) xg->xg_sick &= ~XFS_SICK_AG_SECONDARY; xg->xg_checked |= mask; spin_unlock(&xg->xg_state_lock); + + if (mask) + xfs_healthmon_report_group(xg, XFS_HEALTHMON_HEALTHY, old_mask, + mask); } /* 
Sample which per-ag metadata are unhealthy. */ @@ -272,10 +320,13 @@ xfs_inode_mark_sick( struct xfs_inode *ip, unsigned int mask) { + unsigned int old_mask; + ASSERT(!(mask & ~XFS_SICK_INO_ALL)); trace_xfs_inode_mark_sick(ip, mask); spin_lock(&ip->i_flags_lock); + old_mask = ip->i_sick; ip->i_sick |= mask; spin_unlock(&ip->i_flags_lock); @@ -287,6 +338,11 @@ xfs_inode_mark_sick( spin_lock(&VFS_I(ip)->i_lock); inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); + + fserror_report_file_metadata(VFS_I(ip), -EFSCORRUPTED, GFP_NOFS); + if (mask) + xfs_healthmon_report_inode(ip, XFS_HEALTHMON_SICK, old_mask, + mask); } /* Mark inode metadata as having been checked and found unhealthy by fsck. */ @@ -295,10 +351,13 @@ xfs_inode_mark_corrupt( struct xfs_inode *ip, unsigned int mask) { + unsigned int old_mask; + ASSERT(!(mask & ~XFS_SICK_INO_ALL)); trace_xfs_inode_mark_corrupt(ip, mask); spin_lock(&ip->i_flags_lock); + old_mask = ip->i_sick; ip->i_sick |= mask; ip->i_checked |= mask; spin_unlock(&ip->i_flags_lock); @@ -311,6 +370,11 @@ xfs_inode_mark_corrupt( spin_lock(&VFS_I(ip)->i_lock); inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); + + fserror_report_file_metadata(VFS_I(ip), -EFSCORRUPTED, GFP_NOFS); + if (mask) + xfs_healthmon_report_inode(ip, XFS_HEALTHMON_CORRUPT, old_mask, + mask); } /* Mark parts of an inode healed. */ @@ -319,15 +383,22 @@ xfs_inode_mark_healthy( struct xfs_inode *ip, unsigned int mask) { + unsigned int old_mask; + ASSERT(!(mask & ~XFS_SICK_INO_ALL)); trace_xfs_inode_mark_healthy(ip, mask); spin_lock(&ip->i_flags_lock); + old_mask = ip->i_sick; ip->i_sick &= ~mask; if (!(ip->i_sick & XFS_SICK_INO_PRIMARY)) ip->i_sick &= ~XFS_SICK_INO_SECONDARY; ip->i_checked |= mask; spin_unlock(&ip->i_flags_lock); + + if (mask) + xfs_healthmon_report_inode(ip, XFS_HEALTHMON_HEALTHY, old_mask, + mask); } /* Sample which parts of an inode are unhealthy. 
*/ @@ -407,6 +478,25 @@ xfs_fsop_geom_health( } } +/* + * Translate XFS_SICK_FS_* into XFS_FSOP_GEOM_SICK_* except for the rt free + * space codes, which are sent via the rtgroup events. + */ +unsigned int +xfs_healthmon_fs_mask( + unsigned int sick_mask) +{ + const struct ioctl_sick_map *m; + unsigned int ioctl_mask = 0; + + for_each_sick_map(fs_map, m) { + if (sick_mask & m->sick_mask) + ioctl_mask |= m->ioctl_mask; + } + + return ioctl_mask; +} + static const struct ioctl_sick_map ag_map[] = { { XFS_SICK_AG_SB, XFS_AG_GEOM_SICK_SB }, { XFS_SICK_AG_AGF, XFS_AG_GEOM_SICK_AGF }, @@ -443,6 +533,22 @@ xfs_ag_geom_health( } } +/* Translate XFS_SICK_AG_* into XFS_AG_GEOM_SICK_*. */ +unsigned int +xfs_healthmon_perag_mask( + unsigned int sick_mask) +{ + const struct ioctl_sick_map *m; + unsigned int ioctl_mask = 0; + + for_each_sick_map(ag_map, m) { + if (sick_mask & m->sick_mask) + ioctl_mask |= m->ioctl_mask; + } + + return ioctl_mask; +} + static const struct ioctl_sick_map rtgroup_map[] = { { XFS_SICK_RG_SUPER, XFS_RTGROUP_GEOM_SICK_SUPER }, { XFS_SICK_RG_BITMAP, XFS_RTGROUP_GEOM_SICK_BITMAP }, @@ -473,6 +579,22 @@ xfs_rtgroup_geom_health( } } +/* Translate XFS_SICK_RG_* into XFS_RTGROUP_GEOM_SICK_*. */ +unsigned int +xfs_healthmon_rtgroup_mask( + unsigned int sick_mask) +{ + const struct ioctl_sick_map *m; + unsigned int ioctl_mask = 0; + + for_each_sick_map(rtgroup_map, m) { + if (sick_mask & m->sick_mask) + ioctl_mask |= m->ioctl_mask; + } + + return ioctl_mask; +} + static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_CORE, XFS_BS_SICK_INODE }, { XFS_SICK_INO_BMBTD, XFS_BS_SICK_BMBTD }, @@ -511,6 +633,22 @@ xfs_bulkstat_health( } } +/* Translate XFS_SICK_INO_* into XFS_BS_SICK_*. 
*/ +unsigned int +xfs_healthmon_inode_mask( + unsigned int sick_mask) +{ + const struct ioctl_sick_map *m; + unsigned int ioctl_mask = 0; + + for_each_sick_map(ino_map, m) { + if (sick_mask & m->sick_mask) + ioctl_mask |= m->ioctl_mask; + } + + return ioctl_mask; +} + /* Mark a block mapping sick. */ void xfs_bmap_mark_sick( diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c new file mode 100644 index 000000000000..ca7352dcd182 --- /dev/null +++ b/fs/xfs/xfs_healthmon.c @@ -0,0 +1,1255 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024-2026 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs_platform.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trace.h" +#include "xfs_ag.h" +#include "xfs_btree.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_quota_defs.h" +#include "xfs_rtgroup.h" +#include "xfs_health.h" +#include "xfs_healthmon.h" +#include "xfs_fsops.h" +#include "xfs_notify_failure.h" +#include "xfs_file.h" +#include "xfs_ioctl.h" + +#include <linux/anon_inodes.h> +#include <linux/eventpoll.h> +#include <linux/poll.h> +#include <linux/fserror.h> + +/* + * Live Health Monitoring + * ====================== + * + * Autonomous self-healing of XFS filesystems requires a means for the kernel + * to send filesystem health events to a monitoring daemon in userspace. To + * accomplish this, we establish a thread_with_file kthread object to handle + * translating internal events about filesystem health into a format that can + * be parsed easily by userspace. When those internal events occur, the core + * filesystem code calls this health monitor to convey the events to userspace. + * Userspace reads events from the file descriptor returned by the ioctl. 
+ * + * The healthmon abstraction has a weak reference to the host filesystem mount + * so that the queueing and processing of the events do not pin the mount and + * cannot slow down the main filesystem. The healthmon object can exist past + * the end of the filesystem mount. + */ + +/* sign of a detached health monitor */ +#define DETACHED_MOUNT_COOKIE ((uintptr_t)0) + +/* Constrain the number of event objects that can build up in memory. */ +#define XFS_HEALTHMON_MAX_EVENTS (SZ_32K / \ + sizeof(struct xfs_healthmon_event)) + +/* Constrain the size of the output buffer for read_iter. */ +#define XFS_HEALTHMON_MAX_OUTBUF SZ_64K + +/* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */ +static DEFINE_SPINLOCK(xfs_healthmon_lock); + +/* Grab a reference to the healthmon object for a given mount, if any. */ +static struct xfs_healthmon * +xfs_healthmon_get( + struct xfs_mount *mp) +{ + struct xfs_healthmon *hm; + + rcu_read_lock(); + hm = mp->m_healthmon; + if (hm && !refcount_inc_not_zero(&hm->ref)) + hm = NULL; + rcu_read_unlock(); + + return hm; +} + +/* + * Release the reference to a healthmon object. If there are no more holders, + * free the health monitor after an RCU grace period to eliminate possibility + * of races with xfs_healthmon_get. + */ +static void +xfs_healthmon_put( + struct xfs_healthmon *hm) +{ + if (refcount_dec_and_test(&hm->ref)) { + struct xfs_healthmon_event *event; + struct xfs_healthmon_event *next = hm->first_event; + + while ((event = next) != NULL) { + trace_xfs_healthmon_drop(hm, event); + next = event->next; + kfree(event); + } + + kfree(hm->unmount_event); + kfree(hm->buffer); + mutex_destroy(&hm->lock); + kfree_rcu_mightsleep(hm); + } +} + +/* Attach a health monitor to an xfs_mount. Only one allowed at a time. 
*/ +STATIC int +xfs_healthmon_attach( + struct xfs_mount *mp, + struct xfs_healthmon *hm) +{ + spin_lock(&xfs_healthmon_lock); + if (mp->m_healthmon != NULL) { + spin_unlock(&xfs_healthmon_lock); + return -EEXIST; + } + + refcount_inc(&hm->ref); + mp->m_healthmon = hm; + hm->mount_cookie = (uintptr_t)mp->m_super; + spin_unlock(&xfs_healthmon_lock); + + return 0; +} + +/* Detach a xfs mount from a specific healthmon instance. */ +STATIC void +xfs_healthmon_detach( + struct xfs_healthmon *hm) +{ + spin_lock(&xfs_healthmon_lock); + if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) { + spin_unlock(&xfs_healthmon_lock); + return; + } + + XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL; + hm->mount_cookie = DETACHED_MOUNT_COOKIE; + spin_unlock(&xfs_healthmon_lock); + + trace_xfs_healthmon_detach(hm); + xfs_healthmon_put(hm); +} + +static inline void xfs_healthmon_bump_events(struct xfs_healthmon *hm) +{ + hm->events++; + hm->total_events++; +} + +static inline void xfs_healthmon_bump_lost(struct xfs_healthmon *hm) +{ + hm->lost_prev_event++; + hm->total_lost++; +} + +/* + * If possible, merge a new event into an existing event. Returns whether or + * not it merged anything. 
+ */ +static bool +xfs_healthmon_merge_events( + struct xfs_healthmon_event *existing, + const struct xfs_healthmon_event *new) +{ + if (!existing) + return false; + + /* type and domain must match to merge events */ + if (existing->type != new->type || + existing->domain != new->domain) + return false; + + switch (existing->type) { + case XFS_HEALTHMON_RUNNING: + case XFS_HEALTHMON_UNMOUNT: + /* should only ever be one of these events anyway */ + return false; + + case XFS_HEALTHMON_LOST: + existing->lostcount += new->lostcount; + return true; + + case XFS_HEALTHMON_SICK: + case XFS_HEALTHMON_CORRUPT: + case XFS_HEALTHMON_HEALTHY: + switch (existing->domain) { + case XFS_HEALTHMON_FS: + existing->fsmask |= new->fsmask; + return true; + case XFS_HEALTHMON_AG: + case XFS_HEALTHMON_RTGROUP: + if (existing->group == new->group){ + existing->grpmask |= new->grpmask; + return true; + } + return false; + case XFS_HEALTHMON_INODE: + if (existing->ino == new->ino && + existing->gen == new->gen) { + existing->imask |= new->imask; + return true; + } + return false; + default: + ASSERT(0); + return false; + } + return false; + + case XFS_HEALTHMON_SHUTDOWN: + /* yes, we can race to shutdown */ + existing->flags |= new->flags; + return true; + + case XFS_HEALTHMON_MEDIA_ERROR: + /* physically adjacent errors can merge */ + if (existing->daddr + existing->bbcount == new->daddr) { + existing->bbcount += new->bbcount; + return true; + } + if (new->daddr + new->bbcount == existing->daddr) { + existing->daddr = new->daddr; + existing->bbcount += new->bbcount; + return true; + } + return false; + + case XFS_HEALTHMON_BUFREAD: + case XFS_HEALTHMON_BUFWRITE: + case XFS_HEALTHMON_DIOREAD: + case XFS_HEALTHMON_DIOWRITE: + case XFS_HEALTHMON_DATALOST: + /* logically adjacent file ranges can merge */ + if (existing->fino != new->fino || existing->fgen != new->fgen) + return false; + + if (existing->fpos + existing->flen == new->fpos) { + existing->flen += new->flen; + return true; + } + + 
if (new->fpos + new->flen == existing->fpos) { + existing->fpos = new->fpos; + existing->flen += new->flen; + return true; + } + return false; + } + + return false; +} + +/* Insert an event onto the start of the queue. */ +static inline void +__xfs_healthmon_insert( + struct xfs_healthmon *hm, + struct xfs_healthmon_event *event) +{ + struct timespec64 now; + + ktime_get_coarse_real_ts64(&now); + event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec; + + event->next = hm->first_event; + if (!hm->first_event) + hm->first_event = event; + if (!hm->last_event) + hm->last_event = event; + xfs_healthmon_bump_events(hm); + wake_up(&hm->wait); + + trace_xfs_healthmon_insert(hm, event); +} + +/* Push an event onto the end of the queue. */ +static inline void +__xfs_healthmon_push( + struct xfs_healthmon *hm, + struct xfs_healthmon_event *event) +{ + struct timespec64 now; + + ktime_get_coarse_real_ts64(&now); + event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec; + + if (!hm->first_event) + hm->first_event = event; + if (hm->last_event) + hm->last_event->next = event; + hm->last_event = event; + event->next = NULL; + xfs_healthmon_bump_events(hm); + wake_up(&hm->wait); + + trace_xfs_healthmon_push(hm, event); +} + +/* Deal with any previously lost events */ +static int +xfs_healthmon_clear_lost_prev( + struct xfs_healthmon *hm) +{ + struct xfs_healthmon_event lost_event = { + .type = XFS_HEALTHMON_LOST, + .domain = XFS_HEALTHMON_MOUNT, + .lostcount = hm->lost_prev_event, + }; + struct xfs_healthmon_event *event = NULL; + + if (xfs_healthmon_merge_events(hm->last_event, &lost_event)) { + trace_xfs_healthmon_merge(hm, hm->last_event); + wake_up(&hm->wait); + goto cleared; + } + + if (hm->events < XFS_HEALTHMON_MAX_EVENTS) + event = kmemdup(&lost_event, sizeof(struct xfs_healthmon_event), + GFP_NOFS); + if (!event) + return -ENOMEM; + + __xfs_healthmon_push(hm, event); +cleared: + hm->lost_prev_event = 0; + return 0; +} + +/* + * Push an event onto the end of the 
list after dealing with lost events and + * possibly full queues. + */ +STATIC int +xfs_healthmon_push( + struct xfs_healthmon *hm, + const struct xfs_healthmon_event *template) +{ + struct xfs_healthmon_event *event = NULL; + int error = 0; + + /* + * Locklessly check if the health monitor has already detached from the + * mount. If so, ignore the event. If we race with deactivation, + * we'll queue the event but never send it. + */ + if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) + return -ESHUTDOWN; + + mutex_lock(&hm->lock); + + /* Report previously lost events before we do anything else */ + if (hm->lost_prev_event) { + error = xfs_healthmon_clear_lost_prev(hm); + if (error) + goto out_unlock; + } + + /* Try to merge with the newest event */ + if (xfs_healthmon_merge_events(hm->last_event, template)) { + trace_xfs_healthmon_merge(hm, hm->last_event); + wake_up(&hm->wait); + goto out_unlock; + } + + /* Only create a heap event object if we're not already at capacity. */ + if (hm->events < XFS_HEALTHMON_MAX_EVENTS) + event = kmemdup(template, sizeof(struct xfs_healthmon_event), + GFP_NOFS); + if (!event) { + /* No memory means we lose the event */ + trace_xfs_healthmon_lost_event(hm); + xfs_healthmon_bump_lost(hm); + error = -ENOMEM; + goto out_unlock; + } + + __xfs_healthmon_push(hm, event); + +out_unlock: + mutex_unlock(&hm->lock); + return error; +} + +/* + * Report that the filesystem is being unmounted, then detach the xfs mount + * from this healthmon instance. + */ +void +xfs_healthmon_unmount( + struct xfs_mount *mp) +{ + struct xfs_healthmon *hm = xfs_healthmon_get(mp); + + if (!hm) + return; + + trace_xfs_healthmon_report_unmount(hm); + + /* + * Insert the unmount notification at the start of the event queue so + * that userspace knows the filesystem went away as soon as possible. + * There's nothing actionable for userspace after an unmount. Once + * we've inserted the unmount event, hm no longer owns that event. 
+ */ + __xfs_healthmon_insert(hm, hm->unmount_event); + hm->unmount_event = NULL; + + xfs_healthmon_detach(hm); + xfs_healthmon_put(hm); +} + +/* Compute the reporting mask for non-unmount metadata health events. */ +static inline unsigned int +metadata_event_mask( + struct xfs_healthmon *hm, + enum xfs_healthmon_type type, + unsigned int old_mask, + unsigned int new_mask) +{ + /* If we want all events, return all events. */ + if (hm->verbose) + return new_mask; + + switch (type) { + case XFS_HEALTHMON_SICK: + /* Always report runtime corruptions */ + return new_mask; + case XFS_HEALTHMON_CORRUPT: + /* Only report new fsck errors */ + return new_mask & ~old_mask; + case XFS_HEALTHMON_HEALTHY: + /* Only report healthy metadata that got fixed */ + return new_mask & old_mask; + default: + ASSERT(0); + break; + } + + return 0; +} + +/* Report XFS_FS_SICK_* events to healthmon */ +void +xfs_healthmon_report_fs( + struct xfs_mount *mp, + enum xfs_healthmon_type type, + unsigned int old_mask, + unsigned int new_mask) +{ + struct xfs_healthmon_event event = { + .type = type, + .domain = XFS_HEALTHMON_FS, + }; + struct xfs_healthmon *hm = xfs_healthmon_get(mp); + + if (!hm) + return; + + event.fsmask = metadata_event_mask(hm, type, old_mask, new_mask) & + ~XFS_SICK_FS_SECONDARY; + trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &event); + + if (event.fsmask) + xfs_healthmon_push(hm, &event); + + xfs_healthmon_put(hm); +} + +/* Report XFS_SICK_(AG|RG)* flags to healthmon */ +void +xfs_healthmon_report_group( + struct xfs_group *xg, + enum xfs_healthmon_type type, + unsigned int old_mask, + unsigned int new_mask) +{ + struct xfs_healthmon_event event = { + .type = type, + .group = xg->xg_gno, + }; + struct xfs_healthmon *hm = xfs_healthmon_get(xg->xg_mount); + + if (!hm) + return; + + switch (xg->xg_type) { + case XG_TYPE_RTG: + event.domain = XFS_HEALTHMON_RTGROUP; + event.grpmask = metadata_event_mask(hm, type, old_mask, + new_mask) & + ~XFS_SICK_RG_SECONDARY; + 
break; + case XG_TYPE_AG: + event.domain = XFS_HEALTHMON_AG; + event.grpmask = metadata_event_mask(hm, type, old_mask, + new_mask) & + ~XFS_SICK_AG_SECONDARY; + break; + default: + ASSERT(0); + break; + } + + trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &event); + + if (event.grpmask) + xfs_healthmon_push(hm, &event); + + xfs_healthmon_put(hm); +} + +/* Report XFS_SICK_INO_* flags to healthmon */ +void +xfs_healthmon_report_inode( + struct xfs_inode *ip, + enum xfs_healthmon_type type, + unsigned int old_mask, + unsigned int new_mask) +{ + struct xfs_healthmon_event event = { + .type = type, + .domain = XFS_HEALTHMON_INODE, + .ino = ip->i_ino, + .gen = VFS_I(ip)->i_generation, + }; + struct xfs_healthmon *hm = xfs_healthmon_get(ip->i_mount); + + if (!hm) + return; + + event.imask = metadata_event_mask(hm, type, old_mask, new_mask) & + ~XFS_SICK_INO_SECONDARY; + trace_xfs_healthmon_report_inode(hm, old_mask, event.imask, &event); + + if (event.imask) + xfs_healthmon_push(hm, &event); + + xfs_healthmon_put(hm); +} + +/* Add a shutdown event to the reporting queue. */ +void +xfs_healthmon_report_shutdown( + struct xfs_mount *mp, + uint32_t flags) +{ + struct xfs_healthmon_event event = { + .type = XFS_HEALTHMON_SHUTDOWN, + .domain = XFS_HEALTHMON_MOUNT, + .flags = flags, + }; + struct xfs_healthmon *hm = xfs_healthmon_get(mp); + + if (!hm) + return; + + trace_xfs_healthmon_report_shutdown(hm, flags); + + xfs_healthmon_push(hm, &event); + xfs_healthmon_put(hm); +} + +static inline enum xfs_healthmon_domain +media_error_domain( + enum xfs_device fdev) +{ + switch (fdev) { + case XFS_DEV_DATA: + return XFS_HEALTHMON_DATADEV; + case XFS_DEV_LOG: + return XFS_HEALTHMON_LOGDEV; + case XFS_DEV_RT: + return XFS_HEALTHMON_RTDEV; + } + + ASSERT(0); + return 0; +} + +/* Add a media error event to the reporting queue. 
*/ +void +xfs_healthmon_report_media( + struct xfs_mount *mp, + enum xfs_device fdev, + xfs_daddr_t daddr, + uint64_t bbcount) +{ + struct xfs_healthmon_event event = { + .type = XFS_HEALTHMON_MEDIA_ERROR, + .domain = media_error_domain(fdev), + .daddr = daddr, + .bbcount = bbcount, + }; + struct xfs_healthmon *hm = xfs_healthmon_get(mp); + + if (!hm) + return; + + trace_xfs_healthmon_report_media(hm, fdev, &event); + + xfs_healthmon_push(hm, &event); + xfs_healthmon_put(hm); +} + +static inline enum xfs_healthmon_type file_ioerr_type(enum fserror_type action) +{ + switch (action) { + case FSERR_BUFFERED_READ: + return XFS_HEALTHMON_BUFREAD; + case FSERR_BUFFERED_WRITE: + return XFS_HEALTHMON_BUFWRITE; + case FSERR_DIRECTIO_READ: + return XFS_HEALTHMON_DIOREAD; + case FSERR_DIRECTIO_WRITE: + return XFS_HEALTHMON_DIOWRITE; + case FSERR_DATA_LOST: + return XFS_HEALTHMON_DATALOST; + case FSERR_METADATA: + /* filtered out by xfs_fs_report_error */ + break; + } + + ASSERT(0); + return -1; +} + +/* Add a file io error event to the reporting queue. 
*/ +void +xfs_healthmon_report_file_ioerror( + struct xfs_inode *ip, + const struct fserror_event *p) +{ + struct xfs_healthmon_event event = { + .type = file_ioerr_type(p->type), + .domain = XFS_HEALTHMON_FILERANGE, + .fino = ip->i_ino, + .fgen = VFS_I(ip)->i_generation, + .fpos = p->pos, + .flen = p->len, + /* send positive error number to userspace */ + .error = -p->error, + }; + struct xfs_healthmon *hm = xfs_healthmon_get(ip->i_mount); + + if (!hm) + return; + + trace_xfs_healthmon_report_file_ioerror(hm, p); + + xfs_healthmon_push(hm, &event); + xfs_healthmon_put(hm); +} + +static inline void +xfs_healthmon_reset_outbuf( + struct xfs_healthmon *hm) +{ + hm->buftail = 0; + hm->bufhead = 0; +} + +struct flags_map { + unsigned int in_mask; + unsigned int out_mask; +}; + +static const struct flags_map shutdown_map[] = { + { SHUTDOWN_META_IO_ERROR, XFS_HEALTH_SHUTDOWN_META_IO_ERROR }, + { SHUTDOWN_LOG_IO_ERROR, XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR }, + { SHUTDOWN_FORCE_UMOUNT, XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT }, + { SHUTDOWN_CORRUPT_INCORE, XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE }, + { SHUTDOWN_CORRUPT_ONDISK, XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK }, + { SHUTDOWN_DEVICE_REMOVED, XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED }, +}; + +static inline unsigned int +__map_flags( + const struct flags_map *map, + size_t array_len, + unsigned int flags) +{ + const struct flags_map *m; + unsigned int ret = 0; + + for (m = map; m < map + array_len; m++) { + if (flags & m->in_mask) + ret |= m->out_mask; + } + + return ret; +} + +#define map_flags(map, flags) __map_flags((map), ARRAY_SIZE(map), (flags)) + +static inline unsigned int shutdown_mask(unsigned int in) +{ + return map_flags(shutdown_map, in); +} + +static const unsigned int domain_map[] = { + [XFS_HEALTHMON_MOUNT] = XFS_HEALTH_MONITOR_DOMAIN_MOUNT, + [XFS_HEALTHMON_FS] = XFS_HEALTH_MONITOR_DOMAIN_FS, + [XFS_HEALTHMON_AG] = XFS_HEALTH_MONITOR_DOMAIN_AG, + [XFS_HEALTHMON_INODE] = XFS_HEALTH_MONITOR_DOMAIN_INODE, + [XFS_HEALTHMON_RTGROUP] 
= XFS_HEALTH_MONITOR_DOMAIN_RTGROUP, + [XFS_HEALTHMON_DATADEV] = XFS_HEALTH_MONITOR_DOMAIN_DATADEV, + [XFS_HEALTHMON_RTDEV] = XFS_HEALTH_MONITOR_DOMAIN_RTDEV, + [XFS_HEALTHMON_LOGDEV] = XFS_HEALTH_MONITOR_DOMAIN_LOGDEV, + [XFS_HEALTHMON_FILERANGE] = XFS_HEALTH_MONITOR_DOMAIN_FILERANGE, +}; + +static const unsigned int type_map[] = { + [XFS_HEALTHMON_RUNNING] = XFS_HEALTH_MONITOR_TYPE_RUNNING, + [XFS_HEALTHMON_LOST] = XFS_HEALTH_MONITOR_TYPE_LOST, + [XFS_HEALTHMON_SICK] = XFS_HEALTH_MONITOR_TYPE_SICK, + [XFS_HEALTHMON_CORRUPT] = XFS_HEALTH_MONITOR_TYPE_CORRUPT, + [XFS_HEALTHMON_HEALTHY] = XFS_HEALTH_MONITOR_TYPE_HEALTHY, + [XFS_HEALTHMON_UNMOUNT] = XFS_HEALTH_MONITOR_TYPE_UNMOUNT, + [XFS_HEALTHMON_SHUTDOWN] = XFS_HEALTH_MONITOR_TYPE_SHUTDOWN, + [XFS_HEALTHMON_MEDIA_ERROR] = XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR, + [XFS_HEALTHMON_BUFREAD] = XFS_HEALTH_MONITOR_TYPE_BUFREAD, + [XFS_HEALTHMON_BUFWRITE] = XFS_HEALTH_MONITOR_TYPE_BUFWRITE, + [XFS_HEALTHMON_DIOREAD] = XFS_HEALTH_MONITOR_TYPE_DIOREAD, + [XFS_HEALTHMON_DIOWRITE] = XFS_HEALTH_MONITOR_TYPE_DIOWRITE, + [XFS_HEALTHMON_DATALOST] = XFS_HEALTH_MONITOR_TYPE_DATALOST, +}; + +/* Render event as a V0 structure */ +STATIC int +xfs_healthmon_format_v0( + struct xfs_healthmon *hm, + const struct xfs_healthmon_event *event) +{ + struct xfs_health_monitor_event hme = { + .time_ns = event->time_ns, + }; + + trace_xfs_healthmon_format(hm, event); + + if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map) || + event->type < 0 || event->type >= ARRAY_SIZE(type_map)) + return -EFSCORRUPTED; + + hme.domain = domain_map[event->domain]; + hme.type = type_map[event->type]; + + /* fill in the event-specific details */ + switch (event->domain) { + case XFS_HEALTHMON_MOUNT: + switch (event->type) { + case XFS_HEALTHMON_LOST: + hme.e.lost.count = event->lostcount; + break; + case XFS_HEALTHMON_SHUTDOWN: + hme.e.shutdown.reasons = shutdown_mask(event->flags); + break; + default: + break; + } + break; + case XFS_HEALTHMON_FS: + 
hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask); + break; + case XFS_HEALTHMON_RTGROUP: + hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask); + hme.e.group.gno = event->group; + break; + case XFS_HEALTHMON_AG: + hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask); + hme.e.group.gno = event->group; + break; + case XFS_HEALTHMON_INODE: + hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask); + hme.e.inode.ino = event->ino; + hme.e.inode.gen = event->gen; + break; + case XFS_HEALTHMON_DATADEV: + case XFS_HEALTHMON_LOGDEV: + case XFS_HEALTHMON_RTDEV: + hme.e.media.daddr = event->daddr; + hme.e.media.bbcount = event->bbcount; + break; + case XFS_HEALTHMON_FILERANGE: + hme.e.filerange.ino = event->fino; + hme.e.filerange.gen = event->fgen; + hme.e.filerange.pos = event->fpos; + hme.e.filerange.len = event->flen; + hme.e.filerange.error = abs(event->error); + break; + default: + break; + } + + ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize); + + /* copy formatted object to the outbuf */ + if (hm->bufhead + sizeof(hme) <= hm->bufsize) { + memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme)); + hm->bufhead += sizeof(hme); + } + + return 0; +} + +/* How many bytes are waiting in the outbuf to be copied? */ +static inline size_t +xfs_healthmon_outbuf_bytes( + struct xfs_healthmon *hm) +{ + if (hm->bufhead > hm->buftail) + return hm->bufhead - hm->buftail; + return 0; +} + +/* + * Do we have something for userspace to read? This can mean unmount events, + * events pending in the queue, or pending bytes in the outbuf. + */ +static inline bool +xfs_healthmon_has_eventdata( + struct xfs_healthmon *hm) +{ + /* + * If the health monitor is already detached from the xfs_mount, we + * want reads to return 0 bytes even if there are no events, because + * userspace interprets that as EOF. If we race with deactivation, + * read_iter will take the necessary locks to discover that there are + * no events to send. 
+ */ + if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) + return true; + + /* + * Either there are events waiting to be formatted into the buffer, or + * there's unread bytes in the buffer. + */ + return hm->events > 0 || xfs_healthmon_outbuf_bytes(hm) > 0; +} + +/* Try to copy the rest of the outbuf to the iov iter. */ +STATIC ssize_t +xfs_healthmon_copybuf( + struct xfs_healthmon *hm, + struct iov_iter *to) +{ + size_t to_copy; + size_t w = 0; + + trace_xfs_healthmon_copybuf(hm, to); + + to_copy = xfs_healthmon_outbuf_bytes(hm); + if (to_copy) { + w = copy_to_iter(hm->buffer + hm->buftail, to_copy, to); + if (!w) + return -EFAULT; + + hm->buftail += w; + } + + /* + * Nothing left to copy? Reset the output buffer cursors to the start + * since there's no live data in the buffer. + */ + if (xfs_healthmon_outbuf_bytes(hm) == 0) + xfs_healthmon_reset_outbuf(hm); + return w; +} + +/* + * Return a health monitoring event for formatting into the output buffer if + * there's enough space in the outbuf and an event waiting for us. Caller + * must hold i_rwsem on the healthmon file. 
+ */ +static inline struct xfs_healthmon_event * +xfs_healthmon_format_pop( + struct xfs_healthmon *hm) +{ + struct xfs_healthmon_event *event; + + if (hm->bufhead + sizeof(*event) > hm->bufsize) + return NULL; + + mutex_lock(&hm->lock); + event = hm->first_event; + if (event) { + if (hm->last_event == event) + hm->last_event = NULL; + hm->first_event = event->next; + hm->events--; + + trace_xfs_healthmon_pop(hm, event); + } + mutex_unlock(&hm->lock); + return event; +} + +/* Allocate formatting buffer */ +STATIC int +xfs_healthmon_alloc_outbuf( + struct xfs_healthmon *hm, + size_t user_bufsize) +{ + void *outbuf; + size_t bufsize = + min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize)); + + outbuf = kzalloc(bufsize, GFP_KERNEL); + if (!outbuf) { + if (bufsize == PAGE_SIZE) + return -ENOMEM; + + bufsize = PAGE_SIZE; + outbuf = kzalloc(bufsize, GFP_KERNEL); + if (!outbuf) + return -ENOMEM; + } + + hm->buffer = outbuf; + hm->bufsize = bufsize; + hm->bufhead = 0; + hm->buftail = 0; + + return 0; +} + +/* + * Convey queued event data to userspace. First copy any remaining bytes in + * the outbuf, then format the oldest event into the outbuf and copy that too. 
+ */ +STATIC ssize_t +xfs_healthmon_read_iter( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct xfs_healthmon *hm = file->private_data; + struct xfs_healthmon_event *event; + size_t copied = 0; + ssize_t ret = 0; + + if (file->f_flags & O_NONBLOCK) { + if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode)) + return -EAGAIN; + } else { + ret = wait_event_interruptible(hm->wait, + xfs_healthmon_has_eventdata(hm)); + if (ret) + return ret; + + inode_lock(inode); + } + + if (hm->bufsize == 0) { + ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to)); + if (ret) + goto out_unlock; + } + + trace_xfs_healthmon_read_start(hm); + + /* + * If there's anything left in the output buffer, copy that before + * formatting more events. + */ + ret = xfs_healthmon_copybuf(hm, to); + if (ret < 0) + goto out_unlock; + copied += ret; + + while (iov_iter_count(to) > 0) { + /* Format the next events into the outbuf until it's full. */ + while ((event = xfs_healthmon_format_pop(hm)) != NULL) { + ret = xfs_healthmon_format_v0(hm, event); + kfree(event); + if (ret) + goto out_unlock; + } + + /* Copy anything formatted into outbuf to userspace */ + ret = xfs_healthmon_copybuf(hm, to); + if (ret <= 0) + break; + + copied += ret; + } + +out_unlock: + trace_xfs_healthmon_read_finish(hm); + inode_unlock(inode); + return copied ?: ret; +} + +/* Poll for available events. */ +STATIC __poll_t +xfs_healthmon_poll( + struct file *file, + struct poll_table_struct *wait) +{ + struct xfs_healthmon *hm = file->private_data; + __poll_t mask = 0; + + poll_wait(file, &hm->wait, wait); + + if (xfs_healthmon_has_eventdata(hm)) + mask |= EPOLLIN; + return mask; +} + +/* Free the health monitoring information. 
*/ +STATIC int +xfs_healthmon_release( + struct inode *inode, + struct file *file) +{ + struct xfs_healthmon *hm = file->private_data; + + trace_xfs_healthmon_release(hm); + + /* + * We might be closing the healthmon file before the filesystem + * unmounts, because userspace processes can terminate at any time and + * for any reason. Null out xfs_mount::m_healthmon so that another + * process can create another health monitor file. + */ + xfs_healthmon_detach(hm); + + /* + * Wake up any readers that might be left. There shouldn't be any + * because the only users of the waiter are read and poll. + */ + wake_up_all(&hm->wait); + + xfs_healthmon_put(hm); + return 0; +} + +/* Validate ioctl parameters. */ +static inline bool +xfs_healthmon_validate( + const struct xfs_health_monitor *hmo) +{ + if (hmo->flags & ~XFS_HEALTH_MONITOR_ALL) + return false; + if (hmo->format != XFS_HEALTH_MONITOR_FMT_V0) + return false; + if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad))) + return false; + return true; +} + +/* Emit some data about the health monitoring fd. */ +static void +xfs_healthmon_show_fdinfo( + struct seq_file *m, + struct file *file) +{ + struct xfs_healthmon *hm = file->private_data; + + mutex_lock(&hm->lock); + seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n", + hm->mount_cookie == DETACHED_MOUNT_COOKIE ? + "dead" : "alive", + MAJOR(hm->dev), MINOR(hm->dev), + hm->total_events, + hm->total_lost); + mutex_unlock(&hm->lock); +} + +/* Reconfigure the health monitor. 
*/ +STATIC long +xfs_healthmon_reconfigure( + struct file *file, + unsigned int cmd, + void __user *arg) +{ + struct xfs_health_monitor hmo; + struct xfs_healthmon *hm = file->private_data; + + if (copy_from_user(&hmo, arg, sizeof(hmo))) + return -EFAULT; + + if (!xfs_healthmon_validate(&hmo)) + return -EINVAL; + + mutex_lock(&hm->lock); + hm->verbose = !!(hmo.flags & XFS_HEALTH_MONITOR_VERBOSE); + mutex_unlock(&hm->lock); + + return 0; +} + +/* Does the fd point to the same filesystem as the one we're monitoring? */ +STATIC long +xfs_healthmon_file_on_monitored_fs( + struct file *file, + unsigned int cmd, + void __user *arg) +{ + struct xfs_health_file_on_monitored_fs hms; + struct xfs_healthmon *hm = file->private_data; + struct inode *hms_inode; + + if (copy_from_user(&hms, arg, sizeof(hms))) + return -EFAULT; + + if (hms.flags) + return -EINVAL; + + CLASS(fd, hms_fd)(hms.fd); + if (fd_empty(hms_fd)) + return -EBADF; + + hms_inode = file_inode(fd_file(hms_fd)); + mutex_lock(&hm->lock); + if (hm->mount_cookie != (uintptr_t)hms_inode->i_sb) { + mutex_unlock(&hm->lock); + return -ESTALE; + } + + mutex_unlock(&hm->lock); + return 0; +} + +/* Handle ioctls for the health monitoring thread. */ +STATIC long +xfs_healthmon_ioctl( + struct file *file, + unsigned int cmd, + unsigned long p) +{ + void __user *arg = (void __user *)p; + + switch (cmd) { + case XFS_IOC_HEALTH_MONITOR: + return xfs_healthmon_reconfigure(file, cmd, arg); + case XFS_IOC_HEALTH_FD_ON_MONITORED_FS: + return xfs_healthmon_file_on_monitored_fs(file, cmd, arg); + default: + break; + } + + return -ENOTTY; +} + +static const struct file_operations xfs_healthmon_fops = { + .owner = THIS_MODULE, + .show_fdinfo = xfs_healthmon_show_fdinfo, + .read_iter = xfs_healthmon_read_iter, + .poll = xfs_healthmon_poll, + .release = xfs_healthmon_release, + .unlocked_ioctl = xfs_healthmon_ioctl, +}; + +/* + * Create a health monitoring file. Returns an index to the fd table or a + * negative errno. 
+ */ +long +xfs_ioc_health_monitor( + struct file *file, + struct xfs_health_monitor __user *arg) +{ + struct xfs_health_monitor hmo; + struct xfs_healthmon_event *running_event; + struct xfs_healthmon *hm; + struct xfs_inode *ip = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip->i_mount; + int ret; + + /* + * The only intended user of the health monitoring system should be the + * xfs_healer daemon running on behalf of the whole filesystem in the + * initial user namespace. IOWs, we don't allow unprivileged userspace + * (they can use fsnotify) nor do we allow containers. + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (ip->i_ino != mp->m_sb.sb_rootino) + return -EPERM; + if (current_user_ns() != &init_user_ns) + return -EPERM; + + if (copy_from_user(&hmo, arg, sizeof(hmo))) + return -EFAULT; + + if (!xfs_healthmon_validate(&hmo)) + return -EINVAL; + + hm = kzalloc(sizeof(*hm), GFP_KERNEL); + if (!hm) + return -ENOMEM; + hm->dev = mp->m_super->s_dev; + refcount_set(&hm->ref, 1); + + mutex_init(&hm->lock); + init_waitqueue_head(&hm->wait); + + if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE) + hm->verbose = true; + + /* Queue up the first event that lets the client know we're running. */ + running_event = kzalloc(sizeof(struct xfs_healthmon_event), GFP_NOFS); + if (!running_event) { + ret = -ENOMEM; + goto out_hm; + } + running_event->type = XFS_HEALTHMON_RUNNING; + running_event->domain = XFS_HEALTHMON_MOUNT; + __xfs_healthmon_insert(hm, running_event); + + /* + * Preallocate the unmount event so that we can't fail to notify the + * filesystem later. This is key for triggering fast exit of the + * xfs_healer daemon. + */ + hm->unmount_event = kzalloc(sizeof(struct xfs_healthmon_event), + GFP_NOFS); + if (!hm->unmount_event) { + ret = -ENOMEM; + goto out_hm; + } + hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT; + hm->unmount_event->domain = XFS_HEALTHMON_MOUNT; + + /* + * Try to attach this health monitor to the xfs_mount. 
The monitor is + * considered live and will receive events if this succeeds. + */ + ret = xfs_healthmon_attach(mp, hm); + if (ret) + goto out_hm; + + /* + * Create the anonymous file and install a fd for it. If it succeeds, + * the file owns hm and can go away at any time, so we must not access + * it again. This must go last because we can't undo a fd table + * installation. + */ + ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm, + O_CLOEXEC | O_RDONLY); + if (ret < 0) + goto out_mp; + + trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format); + + return ret; + +out_mp: + xfs_healthmon_detach(hm); +out_hm: + ASSERT(refcount_read(&hm->ref) == 1); + xfs_healthmon_put(hm); + return ret; +} diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h new file mode 100644 index 000000000000..0e936507037f --- /dev/null +++ b/fs/xfs/xfs_healthmon.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2024-2026 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_HEALTHMON_H__ +#define __XFS_HEALTHMON_H__ + +struct xfs_healthmon { + /* + * Weak reference to the xfs filesystem that is being monitored. It + * will be set to zero when the filesystem detaches from the monitor. + * Do not dereference this pointer. + */ + uintptr_t mount_cookie; + + /* + * Device number of the filesystem being monitored. This is for + * consistent tracing even after unmount. + */ + dev_t dev; + + /* + * Reference count of this structure. The open healthmon fd holds one + * ref, the xfs_mount holds another ref if it points to this object, + * and running event handlers hold their own refs. 
+ */ + refcount_t ref; + + /* lock for event list and event counters */ + struct mutex lock; + + /* list of event objects */ + struct xfs_healthmon_event *first_event; + struct xfs_healthmon_event *last_event; + + /* preallocated event for unmount */ + struct xfs_healthmon_event *unmount_event; + + /* number of events in the list */ + unsigned int events; + + /* do we want all events? */ + bool verbose:1; + + /* waiter so read/poll can sleep until the arrival of events */ + struct wait_queue_head wait; + + /* + * Buffer for formatting events for a read_iter call. Events are + * formatted into the buffer at bufhead, and buftail determines where + * to start a copy_iter to get those events to userspace. All buffer + * fields are protected by inode_lock. + */ + char *buffer; + size_t bufsize; + size_t bufhead; + size_t buftail; + + /* did we lose previous events? */ + unsigned long long lost_prev_event; + + /* total counts of events observed and lost events */ + unsigned long long total_events; + unsigned long long total_lost; +}; + +void xfs_healthmon_unmount(struct xfs_mount *mp); + +enum xfs_healthmon_type { + XFS_HEALTHMON_RUNNING, /* monitor running */ + XFS_HEALTHMON_LOST, /* message lost */ + XFS_HEALTHMON_UNMOUNT, /* filesystem is unmounting */ + + /* filesystem shutdown */ + XFS_HEALTHMON_SHUTDOWN, + + /* metadata health events */ + XFS_HEALTHMON_SICK, /* runtime corruption observed */ + XFS_HEALTHMON_CORRUPT, /* fsck reported corruption */ + XFS_HEALTHMON_HEALTHY, /* fsck reported healthy structure */ + + /* media errors */ + XFS_HEALTHMON_MEDIA_ERROR, + + /* file range events */ + XFS_HEALTHMON_BUFREAD, + XFS_HEALTHMON_BUFWRITE, + XFS_HEALTHMON_DIOREAD, + XFS_HEALTHMON_DIOWRITE, + XFS_HEALTHMON_DATALOST, +}; + +enum xfs_healthmon_domain { + XFS_HEALTHMON_MOUNT, /* affects the whole fs */ + + /* metadata health events */ + XFS_HEALTHMON_FS, /* main filesystem metadata */ + XFS_HEALTHMON_AG, /* allocation group metadata */ + XFS_HEALTHMON_INODE, /* inode 
metadata */ + XFS_HEALTHMON_RTGROUP, /* realtime group metadata */ + + /* media errors */ + XFS_HEALTHMON_DATADEV, + XFS_HEALTHMON_RTDEV, + XFS_HEALTHMON_LOGDEV, + + /* file range events */ + XFS_HEALTHMON_FILERANGE, +}; + +struct xfs_healthmon_event { + struct xfs_healthmon_event *next; + + enum xfs_healthmon_type type; + enum xfs_healthmon_domain domain; + + uint64_t time_ns; + + union { + /* lost events */ + struct { + uint64_t lostcount; + }; + /* fs/rt metadata */ + struct { + /* XFS_SICK_* flags */ + unsigned int fsmask; + }; + /* ag/rtgroup metadata */ + struct { + /* XFS_SICK_(AG|RG)* flags */ + unsigned int grpmask; + unsigned int group; + }; + /* inode metadata */ + struct { + /* XFS_SICK_INO_* flags */ + unsigned int imask; + uint32_t gen; + xfs_ino_t ino; + }; + /* shutdown */ + struct { + unsigned int flags; + }; + /* media errors */ + struct { + xfs_daddr_t daddr; + uint64_t bbcount; + }; + /* file range events */ + struct { + xfs_ino_t fino; + loff_t fpos; + uint64_t flen; + uint32_t fgen; + int error; + }; + }; +}; + +void xfs_healthmon_report_fs(struct xfs_mount *mp, + enum xfs_healthmon_type type, unsigned int old_mask, + unsigned int new_mask); +void xfs_healthmon_report_group(struct xfs_group *xg, + enum xfs_healthmon_type type, unsigned int old_mask, + unsigned int new_mask); +void xfs_healthmon_report_inode(struct xfs_inode *ip, + enum xfs_healthmon_type type, unsigned int old_mask, + unsigned int new_mask); + +void xfs_healthmon_report_shutdown(struct xfs_mount *mp, uint32_t flags); + +void xfs_healthmon_report_media(struct xfs_mount *mp, enum xfs_device fdev, + xfs_daddr_t daddr, uint64_t bbcount); + +void xfs_healthmon_report_file_ioerror(struct xfs_inode *ip, + const struct fserror_event *p); + +long xfs_ioc_health_monitor(struct file *file, + struct xfs_health_monitor __user *arg); + +#endif /* __XFS_HEALTHMON_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 54cef912e05f..4eeda4d4e3ab 100644 --- a/fs/xfs/xfs_ioctl.c +++ 
b/fs/xfs/xfs_ioctl.c @@ -41,6 +41,8 @@ #include "xfs_exchrange.h" #include "xfs_handle.h" #include "xfs_rtgroup.h" +#include "xfs_healthmon.h" +#include "xfs_verify_media.h" #include <linux/mount.h> #include <linux/fileattr.h> @@ -1419,6 +1421,11 @@ xfs_file_ioctl( case XFS_IOC_COMMIT_RANGE: return xfs_ioc_commit_range(filp, arg); + case XFS_IOC_HEALTH_MONITOR: + return xfs_ioc_health_monitor(filp, arg); + case XFS_IOC_VERIFY_MEDIA: + return xfs_ioc_verify_media(filp, arg); + default: return -ENOTTY; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 4b0046483ca6..9c295abd0a0a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -41,6 +41,7 @@ #include "xfs_rtrefcount_btree.h" #include "scrub/stats.h" #include "xfs_zone_alloc.h" +#include "xfs_healthmon.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -625,6 +626,7 @@ xfs_unmount_flush_inodes( cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp); xfs_health_unmount(mp); + xfs_healthmon_unmount(mp); } static void diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b871dfde372b..61c71128d171 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -13,6 +13,7 @@ struct xfs_ail; struct xfs_quotainfo; struct xfs_da_geometry; struct xfs_perag; +struct xfs_healthmon; /* dynamic preallocation free space thresholds, 5% down to 1% */ enum { @@ -342,6 +343,9 @@ typedef struct xfs_mount { /* Hook to feed dirent updates to an active online repair. */ struct xfs_hooks m_dir_update_hooks; + + /* Private data referring to a health monitor object. 
*/ + struct xfs_healthmon *m_healthmon; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index a6a34dc2c028..6be19fa1ebe2 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -22,10 +22,12 @@ #include "xfs_notify_failure.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_healthmon.h" #include <linux/mm.h> #include <linux/dax.h> #include <linux/fs.h> +#include <linux/fserror.h> struct xfs_failure_info { xfs_agblock_t startblock; @@ -116,6 +118,9 @@ xfs_dax_failure_fn( invalidate_inode_pages2_range(mapping, pgoff, pgoff + pgcnt - 1); + fserror_report_data_lost(VFS_I(ip), (u64)pgoff << PAGE_SHIFT, + (u64)pgcnt << PAGE_SHIFT, GFP_NOFS); + xfs_irele(ip); return error; } @@ -215,6 +220,8 @@ xfs_dax_notify_logdev_failure( if (error) return error; + xfs_healthmon_report_media(mp, XFS_DEV_LOG, daddr, bblen); + /* * In the pre-remove case the failure notification is attempting to * trigger a force unmount. The expectation is that the device is @@ -248,16 +255,20 @@ xfs_dax_notify_dev_failure( uint64_t bblen; struct xfs_group *xg = NULL; - if (!xfs_has_rmapbt(mp)) { - xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); - return -EOPNOTSUPP; - } - error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type), offset, len, &daddr, &bblen); if (error) return error; + xfs_healthmon_report_media(mp, + type == XG_TYPE_RTG ? 
XFS_DEV_RT : XFS_DEV_DATA, + daddr, bblen); + + if (!xfs_has_rmapbt(mp)) { + xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); + return -EOPNOTSUPP; + } + if (type == XG_TYPE_RTG) { start_bno = xfs_daddr_to_rtb(mp, daddr); end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1); diff --git a/fs/xfs/xfs_platform.h b/fs/xfs/xfs_platform.h index c7b013593646..1e59bf94d1f2 100644 --- a/fs/xfs/xfs_platform.h +++ b/fs/xfs/xfs_platform.h @@ -133,8 +133,6 @@ typedef __u32 xfs_nlink_t; #define ENOATTR ENODATA /* Attribute not found */ #define EWRONGFS EINVAL /* Mount with wrong filesystem type */ -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ -#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define __return_address __builtin_return_address(0) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e05bf62a5413..b6a92f027d64 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -47,12 +47,14 @@ #include "xfs_parent.h" #include "xfs_rtalloc.h" #include "xfs_zone_alloc.h" +#include "xfs_healthmon.h" #include "scrub/stats.h" #include "scrub/rcbag_btree.h" #include <linux/magic.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> +#include <linux/fserror.h> static const struct super_operations xfs_super_operations; @@ -1301,6 +1303,15 @@ xfs_fs_show_stats( return 0; } +static void +xfs_fs_report_error( + const struct fserror_event *event) +{ + /* healthmon already knows about non-inode and metadata errors */ + if (event->inode && event->type != FSERR_METADATA) + xfs_healthmon_report_file_ioerror(XFS_I(event->inode), event); +} + static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, @@ -1317,6 +1328,7 @@ static const struct super_operations xfs_super_operations = { .free_cached_objects = xfs_fs_free_cached_objects, .shutdown = xfs_fs_shutdown, .show_stats = xfs_fs_show_stats, + .report_error = xfs_fs_report_error, }; static int diff --git a/fs/xfs/xfs_trace.c 
b/fs/xfs/xfs_trace.c index 478aebb60411..912713a8a019 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -51,6 +51,11 @@ #include "xfs_rtgroup.h" #include "xfs_zone_alloc.h" #include "xfs_zone_priv.h" +#include "xfs_health.h" +#include "xfs_healthmon.h" +#include "xfs_notify_failure.h" +#include "xfs_file.h" +#include <linux/fserror.h> /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f70afbf3cb19..3483461cf462 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -103,6 +103,9 @@ struct xfs_refcount_intent; struct xfs_metadir_update; struct xfs_rtgroup; struct xfs_open_zone; +struct xfs_healthmon_event; +struct xfs_healthmon; +struct fserror_event; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -5906,6 +5909,515 @@ DEFINE_EVENT(xfs_freeblocks_resv_class, name, \ DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved); DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc); +TRACE_EVENT(xfs_healthmon_lost_event, + TP_PROTO(const struct xfs_healthmon *hm), + TP_ARGS(hm), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long long, lost_prev) + ), + TP_fast_assign( + __entry->dev = hm->dev; + __entry->lost_prev = hm->lost_prev_event; + ), + TP_printk("dev %d:%d lost_prev %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lost_prev) +); + +#define XFS_HEALTHMON_FLAGS_STRINGS \ + { XFS_HEALTH_MONITOR_VERBOSE, "verbose" } +#define XFS_HEALTHMON_FMT_STRINGS \ + { XFS_HEALTH_MONITOR_FMT_V0, "v0" } + +TRACE_EVENT(xfs_healthmon_create, + TP_PROTO(dev_t dev, u64 flags, u8 format), + TP_ARGS(dev, flags, format), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, flags) + __field(u8, format) + ), + TP_fast_assign( + __entry->dev = dev; + __entry->flags = flags; + __entry->format = format; + ), + TP_printk("dev %d:%d flags %s format %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags(__entry->flags, "|", 
XFS_HEALTHMON_FLAGS_STRINGS), + __print_symbolic(__entry->format, XFS_HEALTHMON_FMT_STRINGS)) +); + +TRACE_EVENT(xfs_healthmon_copybuf, + TP_PROTO(const struct xfs_healthmon *hm, const struct iov_iter *iov), + TP_ARGS(hm, iov), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(size_t, bufsize) + __field(size_t, inpos) + __field(size_t, outpos) + __field(size_t, to_copy) + __field(size_t, iter_count) + ), + TP_fast_assign( + __entry->dev = hm->dev; + __entry->bufsize = hm->bufsize; + __entry->inpos = hm->bufhead; + __entry->outpos = hm->buftail; + if (hm->bufhead > hm->buftail) + __entry->to_copy = hm->bufhead - hm->buftail; + else + __entry->to_copy = 0; + __entry->iter_count = iov_iter_count(iov); + ), + TP_printk("dev %d:%d bufsize %zu in_pos %zu out_pos %zu to_copy %zu iter_count %zu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->bufsize, + __entry->inpos, + __entry->outpos, + __entry->to_copy, + __entry->iter_count) +); + +DECLARE_EVENT_CLASS(xfs_healthmon_class, + TP_PROTO(const struct xfs_healthmon *hm), + TP_ARGS(hm), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, events) + __field(unsigned long long, lost_prev) + ), + TP_fast_assign( + __entry->dev = hm->dev; + __entry->events = hm->events; + __entry->lost_prev = hm->lost_prev_event; + ), + TP_printk("dev %d:%d events %u lost_prev? 
%llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->events, + __entry->lost_prev) +); +#define DEFINE_HEALTHMON_EVENT(name) \ +DEFINE_EVENT(xfs_healthmon_class, name, \ + TP_PROTO(const struct xfs_healthmon *hm), \ + TP_ARGS(hm)) +DEFINE_HEALTHMON_EVENT(xfs_healthmon_read_start); +DEFINE_HEALTHMON_EVENT(xfs_healthmon_read_finish); +DEFINE_HEALTHMON_EVENT(xfs_healthmon_release); +DEFINE_HEALTHMON_EVENT(xfs_healthmon_detach); +DEFINE_HEALTHMON_EVENT(xfs_healthmon_report_unmount); + +#define XFS_HEALTHMON_TYPE_STRINGS \ + { XFS_HEALTHMON_LOST, "lost" }, \ + { XFS_HEALTHMON_UNMOUNT, "unmount" }, \ + { XFS_HEALTHMON_SICK, "sick" }, \ + { XFS_HEALTHMON_CORRUPT, "corrupt" }, \ + { XFS_HEALTHMON_HEALTHY, "healthy" }, \ + { XFS_HEALTHMON_SHUTDOWN, "shutdown" } + +#define XFS_HEALTHMON_DOMAIN_STRINGS \ + { XFS_HEALTHMON_MOUNT, "mount" }, \ + { XFS_HEALTHMON_FS, "fs" }, \ + { XFS_HEALTHMON_AG, "ag" }, \ + { XFS_HEALTHMON_INODE, "inode" }, \ + { XFS_HEALTHMON_RTGROUP, "rtgroup" } + +TRACE_DEFINE_ENUM(XFS_HEALTHMON_LOST); +TRACE_DEFINE_ENUM(XFS_HEALTHMON_SHUTDOWN); +TRACE_DEFINE_ENUM(XFS_HEALTHMON_UNMOUNT); +TRACE_DEFINE_ENUM(XFS_HEALTHMON_SICK); +TRACE_DEFINE_ENUM(XFS_HEALTHMON_CORRUPT); +TRACE_DEFINE_ENUM(XFS_HEALTHMON_HEALTHY); + +TRACE_DEFINE_ENUM(XFS_HEALTHMON_MOUNT); +TRACE_DEFINE_ENUM(XFS_HEALTHMON_FS); +TRACE_DEFINE_ENUM(XFS_HEALTHMON_AG); +TRACE_DEFINE_ENUM(XFS_HEALTHMON_INODE); +TRACE_DEFINE_ENUM(XFS_HEALTHMON_RTGROUP); + +DECLARE_EVENT_CLASS(xfs_healthmon_event_class, + TP_PROTO(const struct xfs_healthmon *hm, + const struct xfs_healthmon_event *event), + TP_ARGS(hm, event), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(unsigned int, domain) + __field(unsigned int, mask) + __field(unsigned long long, ino) + __field(unsigned int, gen) + __field(unsigned int, group) + __field(unsigned long long, offset) + __field(unsigned long long, length) + __field(unsigned long long, lostcount) + ), + TP_fast_assign( + __entry->dev = 
hm->dev; + __entry->type = event->type; + __entry->domain = event->domain; + __entry->mask = 0; + __entry->group = 0; + __entry->ino = 0; + __entry->gen = 0; + __entry->offset = 0; + __entry->length = 0; + __entry->lostcount = 0; + switch (__entry->domain) { + case XFS_HEALTHMON_MOUNT: + switch (__entry->type) { + case XFS_HEALTHMON_SHUTDOWN: + __entry->mask = event->flags; + break; + case XFS_HEALTHMON_LOST: + __entry->lostcount = event->lostcount; + break; + } + break; + case XFS_HEALTHMON_FS: + __entry->mask = event->fsmask; + break; + case XFS_HEALTHMON_AG: + case XFS_HEALTHMON_RTGROUP: + __entry->mask = event->grpmask; + __entry->group = event->group; + break; + case XFS_HEALTHMON_INODE: + __entry->mask = event->imask; + __entry->ino = event->ino; + __entry->gen = event->gen; + break; + case XFS_HEALTHMON_DATADEV: + case XFS_HEALTHMON_LOGDEV: + case XFS_HEALTHMON_RTDEV: + __entry->offset = event->daddr; + __entry->length = event->bbcount; + break; + case XFS_HEALTHMON_FILERANGE: + __entry->ino = event->fino; + __entry->gen = event->fgen; + __entry->offset = event->fpos; + __entry->length = event->flen; + break; + } + ), + TP_printk("dev %d:%d type %s domain %s mask 0x%x ino 0x%llx gen 0x%x offset 0x%llx len 0x%llx group 0x%x lost %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS), + __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS), + __entry->mask, + __entry->ino, + __entry->gen, + __entry->offset, + __entry->length, + __entry->group, + __entry->lostcount) +); +#define DEFINE_HEALTHMONEVENT_EVENT(name) \ +DEFINE_EVENT(xfs_healthmon_event_class, name, \ + TP_PROTO(const struct xfs_healthmon *hm, \ + const struct xfs_healthmon_event *event), \ + TP_ARGS(hm, event)) +DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_insert); +DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_push); +DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_pop); +DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_format); 
+DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_format_overflow); +DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_drop); +DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_merge); + +TRACE_EVENT(xfs_healthmon_report_fs, + TP_PROTO(const struct xfs_healthmon *hm, + unsigned int old_mask, unsigned int new_mask, + const struct xfs_healthmon_event *event), + TP_ARGS(hm, old_mask, new_mask, event), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(unsigned int, domain) + __field(unsigned int, old_mask) + __field(unsigned int, new_mask) + __field(unsigned int, fsmask) + ), + TP_fast_assign( + __entry->dev = hm->dev; + __entry->type = event->type; + __entry->domain = event->domain; + __entry->old_mask = old_mask; + __entry->new_mask = new_mask; + __entry->fsmask = event->fsmask; + ), + TP_printk("dev %d:%d type %s domain %s oldmask 0x%x newmask 0x%x fsmask 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS), + __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS), + __entry->old_mask, + __entry->new_mask, + __entry->fsmask) +); + +TRACE_EVENT(xfs_healthmon_report_group, + TP_PROTO(const struct xfs_healthmon *hm, + unsigned int old_mask, unsigned int new_mask, + const struct xfs_healthmon_event *event), + TP_ARGS(hm, old_mask, new_mask, event), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(unsigned int, domain) + __field(unsigned int, old_mask) + __field(unsigned int, new_mask) + __field(unsigned int, grpmask) + __field(unsigned int, group) + ), + TP_fast_assign( + __entry->dev = hm->dev; + __entry->type = event->type; + __entry->domain = event->domain; + __entry->old_mask = old_mask; + __entry->new_mask = new_mask; + __entry->grpmask = event->grpmask; + __entry->group = event->group; + ), + TP_printk("dev %d:%d type %s domain %s oldmask 0x%x newmask 0x%x grpmask 0x%x group 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + 
__print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS), + __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS), + __entry->old_mask, + __entry->new_mask, + __entry->grpmask, + __entry->group) +); + +TRACE_EVENT(xfs_healthmon_report_inode, + TP_PROTO(const struct xfs_healthmon *hm, + unsigned int old_mask, unsigned int new_mask, + const struct xfs_healthmon_event *event), + TP_ARGS(hm, old_mask, new_mask, event), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(unsigned int, domain) + __field(unsigned int, old_mask) + __field(unsigned int, new_mask) + __field(unsigned int, imask) + __field(unsigned long long, ino) + __field(unsigned int, gen) + ), + TP_fast_assign( + __entry->dev = hm->dev; + __entry->type = event->type; + __entry->domain = event->domain; + __entry->old_mask = old_mask; + __entry->new_mask = new_mask; + __entry->imask = event->imask; + __entry->ino = event->ino; + __entry->gen = event->gen; + ), + TP_printk("dev %d:%d type %s domain %s oldmask 0x%x newmask 0x%x imask 0x%x ino 0x%llx gen 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS), + __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS), + __entry->old_mask, + __entry->new_mask, + __entry->imask, + __entry->ino, + __entry->gen) +); + +TRACE_EVENT(xfs_healthmon_report_shutdown, + TP_PROTO(const struct xfs_healthmon *hm, uint32_t shutdown_flags), + TP_ARGS(hm, shutdown_flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint32_t, shutdown_flags) + ), + TP_fast_assign( + __entry->dev = hm->dev; + __entry->shutdown_flags = shutdown_flags; + ), + TP_printk("dev %d:%d shutdown_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags(__entry->shutdown_flags, "|", XFS_SHUTDOWN_STRINGS)) +); + +#define XFS_DEVICE_STRINGS \ + { XFS_DEV_DATA, "datadev" }, \ + { XFS_DEV_RT, "rtdev" }, \ + { XFS_DEV_LOG, "logdev" } + +TRACE_DEFINE_ENUM(XFS_DEV_DATA); 
+TRACE_DEFINE_ENUM(XFS_DEV_RT); +TRACE_DEFINE_ENUM(XFS_DEV_LOG); + +TRACE_EVENT(xfs_healthmon_report_media, + TP_PROTO(const struct xfs_healthmon *hm, enum xfs_device fdev, + const struct xfs_healthmon_event *event), + TP_ARGS(hm, fdev, event), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, error_dev) + __field(uint64_t, daddr) + __field(uint64_t, bbcount) + ), + TP_fast_assign( + __entry->dev = hm->dev; + __entry->error_dev = fdev; + __entry->daddr = event->daddr; + __entry->bbcount = event->bbcount; + ), + TP_printk("dev %d:%d %s daddr 0x%llx bbcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->error_dev, XFS_DEVICE_STRINGS), + __entry->daddr, + __entry->bbcount) +); + +#define FS_ERROR_STRINGS \ + { FSERR_BUFFERED_READ, "buffered_read" }, \ + { FSERR_BUFFERED_WRITE, "buffered_write" }, \ + { FSERR_DIRECTIO_READ, "directio_read" }, \ + { FSERR_DIRECTIO_WRITE, "directio_write" }, \ + { FSERR_DATA_LOST, "data_lost" }, \ + { FSERR_METADATA, "metadata" } + +TRACE_DEFINE_ENUM(FSERR_BUFFERED_READ); +TRACE_DEFINE_ENUM(FSERR_BUFFERED_WRITE); +TRACE_DEFINE_ENUM(FSERR_DIRECTIO_READ); +TRACE_DEFINE_ENUM(FSERR_DIRECTIO_WRITE); +TRACE_DEFINE_ENUM(FSERR_DATA_LOST); +TRACE_DEFINE_ENUM(FSERR_METADATA); + +TRACE_EVENT(xfs_healthmon_report_file_ioerror, + TP_PROTO(const struct xfs_healthmon *hm, + const struct fserror_event *p), + TP_ARGS(hm, p), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(unsigned long long, ino) + __field(unsigned int, gen) + __field(long long, pos) + __field(unsigned long long, len) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = hm->dev; + __entry->type = p->type; + __entry->ino = XFS_I(p->inode)->i_ino; + __entry->gen = p->inode->i_generation; + __entry->pos = p->pos; + __entry->len = p->len; + __entry->error = p->error; + ), + TP_printk("dev %d:%d ino 0x%llx gen 0x%x op %s pos 0x%llx bytecount 0x%llx error %d", + MAJOR(__entry->dev), 
MINOR(__entry->dev), + __entry->ino, + __entry->gen, + __print_symbolic(__entry->type, FS_ERROR_STRINGS), + __entry->pos, + __entry->len, + __entry->error) +); + +TRACE_EVENT(xfs_verify_media, + TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me, + dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount, + const struct folio *folio), + TP_ARGS(mp, me, fdev, daddr, bbcount, folio), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, fdev) + __field(xfs_daddr_t, start_daddr) + __field(xfs_daddr_t, end_daddr) + __field(unsigned int, flags) + __field(xfs_daddr_t, daddr) + __field(uint64_t, bbcount) + __field(unsigned int, bufsize) + ), + TP_fast_assign( + __entry->dev = mp->m_ddev_targp->bt_dev; + __entry->fdev = fdev; + __entry->start_daddr = me->me_start_daddr; + __entry->end_daddr = me->me_end_daddr; + __entry->flags = me->me_flags; + __entry->daddr = daddr; + __entry->bbcount = bbcount; + __entry->bufsize = folio_size(folio); + ), + TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx bufsize 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->fdev), MINOR(__entry->fdev), + __entry->start_daddr, + __entry->end_daddr, + __entry->flags, + __entry->daddr, + __entry->bbcount, + __entry->bufsize) +); + +TRACE_EVENT(xfs_verify_media_end, + TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me, + dev_t fdev), + TP_ARGS(mp, me, fdev), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, fdev) + __field(xfs_daddr_t, start_daddr) + __field(xfs_daddr_t, end_daddr) + __field(int, ioerror) + ), + TP_fast_assign( + __entry->dev = mp->m_ddev_targp->bt_dev; + __entry->fdev = fdev; + __entry->start_daddr = me->me_start_daddr; + __entry->end_daddr = me->me_end_daddr; + __entry->ioerror = me->me_ioerror; + ), + TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx ioerror %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->fdev), 
MINOR(__entry->fdev), + __entry->start_daddr, + __entry->end_daddr, + __entry->ioerror) +); + +TRACE_EVENT(xfs_verify_media_error, + TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me, + dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount, + blk_status_t status), + TP_ARGS(mp, me, fdev, daddr, bbcount, status), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, fdev) + __field(xfs_daddr_t, start_daddr) + __field(xfs_daddr_t, end_daddr) + __field(unsigned int, flags) + __field(xfs_daddr_t, daddr) + __field(uint64_t, bbcount) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = mp->m_ddev_targp->bt_dev; + __entry->fdev = fdev; + __entry->start_daddr = me->me_start_daddr; + __entry->end_daddr = me->me_end_daddr; + __entry->flags = me->me_flags; + __entry->daddr = daddr; + __entry->bbcount = bbcount; + __entry->error = blk_status_to_errno(status); + ), + TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->fdev), MINOR(__entry->fdev), + __entry->start_daddr, + __entry->end_daddr, + __entry->flags, + __entry->daddr, + __entry->bbcount, + __entry->error) +); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_verify_media.c b/fs/xfs/xfs_verify_media.c new file mode 100644 index 000000000000..069cd371619d --- /dev/null +++ b/fs/xfs/xfs_verify_media.c @@ -0,0 +1,445 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2026 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org>
 */
#include "xfs_platform.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_bit.h"
#include "xfs_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_ag.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_health.h"
#include "xfs_healthmon.h"
#include "xfs_trace.h"
#include "xfs_verify_media.h"

#include <linux/fserror.h>

/* Group-relative range of blocks found to be lost to a media error. */
struct xfs_group_data_lost {
	xfs_agblock_t		startblock;	/* group-relative start */
	xfs_extlen_t		blockcount;	/* length of the lost range */
};

/*
 * Report lost file data from rmap records.
 *
 * rmap walk callback: @rec is a reverse mapping overlapping the lost range
 * described by @data (a struct xfs_group_data_lost).  Inode-owned data-fork
 * mappings are trimmed to the lost range and reported to the VFS via
 * fserror_report_data_lost(); bmbt and xattr mappings are only marked sick
 * since they have no file offset userspace could act on.  Always returns 0
 * (including when xfs_iget fails) so one bad owner does not stop the walk.
 */
static int
xfs_verify_report_data_lost(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_group_data_lost	*lost = data;
	xfs_fileoff_t			fileoff = rec->rm_offset;
	xfs_extlen_t			blocks = rec->rm_blockcount;
	const bool			is_attr =
		(rec->rm_flags & XFS_RMAP_ATTR_FORK);
	const xfs_agblock_t		lost_end =
		lost->startblock + lost->blockcount;
	const xfs_agblock_t		rmap_end =
		rec->rm_startblock + rec->rm_blockcount;
	int				error = 0;

	/* Non-inode owners (static fs metadata) have no file data to report. */
	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
		return 0;

	/* Best effort: skip owners whose inode cannot be grabbed. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
	if (error)
		return 0;

	/* Lost a block mapping btree block: mark the whole fork sick. */
	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
		goto out_rele;
	}

	/* Xattr blocks have no file offset; just mark the xattrs sick. */
	if (is_attr) {
		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
		goto out_rele;
	}

	/* Trim the mapping to its intersection with the lost range. */
	if (lost->startblock > rec->rm_startblock) {
		fileoff += lost->startblock - rec->rm_startblock;
		blocks -= lost->startblock - rec->rm_startblock;
	}
	if (rmap_end > lost_end)
		blocks -= rmap_end - lost_end;

	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);

out_rele:
	xfs_irele(ip);
	return 0;
}

/*
 * Walk reverse mappings to look for all file data loss.
 *
 * @daddr/@bblen describe the bad range in 512-byte basic blocks on either
 * the data device (@type == XG_TYPE_AG) or the realtime device
 * (XG_TYPE_RTG).  Reporting is best effort, so errors encountered during
 * the walk abort it silently and this function always returns 0.
 */
static int
xfs_verify_report_losses(
	struct xfs_mount	*mp,
	enum xfs_group_type	type,
	xfs_daddr_t		daddr,
	u64			bblen)
{
	struct xfs_group	*xg = NULL;
	struct xfs_trans	*tp;
	xfs_fsblock_t		start_bno, end_bno;
	uint32_t		start_gno, end_gno;
	int			error;

	/* Convert the daddr range to fs/rt blocks so we can find groups. */
	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	/* Empty transaction to satisfy the rmap cursor interfaces. */
	tp = xfs_trans_alloc_empty(mp);
	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_btree_cur	*cur;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_group_data_lost lost;

		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				/* drop the ref taken by the group iterator */
				xfs_perag_put(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] where we are looking for the files or
		 * metadata.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
				xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
				xfs_fsb_to_gbno(mp, end_bno, type);

		/* Group-relative lost extent, clamped to the group size. */
		lost.startblock = ri_low.rm_startblock;
		lost.blockcount = min(xg->xg_block_count,
				ri_high.rm_startblock + 1) -
				ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_verify_report_data_lost, &lost);
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			/* drop the ref taken by the group iterator */
			xfs_group_put(xg);
			break;
		}
	}

	xfs_trans_cancel(tp);
	return 0;
}

/*
 * Compute the desired verify IO size.
 *
 * To minimize command overhead, we'd like to create bios that are 1MB, though
 * we allow the user to ask for a smaller size.
 */
static unsigned int
xfs_verify_iosize(
	const struct xfs_verify_media	*me,
	struct xfs_buftarg		*btp,
	uint64_t			bbcount)
{
	unsigned int			iosize =
		min_not_zero(SZ_1M, me->me_max_io_size);

	BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
	ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev));

	/* Never less than one logical block or more than the whole range. */
	return clamp(iosize, bdev_logical_block_size(btp->bt_bdev),
			BBTOB(bbcount));
}

/* Allocate as much memory as we can get for verification buffer.
*/ +static struct folio * +xfs_verify_alloc_folio( + const unsigned int iosize) +{ + unsigned int order = get_order(iosize); + + while (order > 0) { + struct folio *folio = + folio_alloc(GFP_KERNEL | __GFP_NORETRY, order); + + if (folio) + return folio; + order--; + } + + return folio_alloc(GFP_KERNEL, 0); +} + +/* Report any kind of problem verifying media */ +static void +xfs_verify_media_error( + struct xfs_mount *mp, + struct xfs_verify_media *me, + struct xfs_buftarg *btp, + xfs_daddr_t daddr, + unsigned int bio_bbcount, + blk_status_t bio_status) +{ + trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr, + bio_bbcount, bio_status); + + /* + * Pass any error, I/O or otherwise, up to the caller if we didn't + * successfully verify any bytes at all. + */ + if (me->me_start_daddr == daddr) + me->me_ioerror = -blk_status_to_errno(bio_status); + + /* + * PI validation failures, medium errors, or general IO errors are + * treated as indicators of data loss. Everything else are (hopefully) + * transient errors and are not reported to healthmon or fsnotify. + */ + switch (bio_status) { + case BLK_STS_PROTECTION: + case BLK_STS_IOERR: + case BLK_STS_MEDIUM: + break; + default: + return; + } + + if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT)) + return; + + xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount); + + if (!xfs_has_rmapbt(mp)) + return; + + switch (me->me_dev) { + case XFS_DEV_DATA: + xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount); + break; + case XFS_DEV_RT: + xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount); + break; + } +} + +/* Verify the media of an xfs device by submitting read requests to the disk. 
*/ +static int +xfs_verify_media( + struct xfs_mount *mp, + struct xfs_verify_media *me) +{ + struct xfs_buftarg *btp = NULL; + struct bio *bio; + struct folio *folio; + xfs_daddr_t daddr; + uint64_t bbcount; + int error = 0; + + me->me_ioerror = 0; + + switch (me->me_dev) { + case XFS_DEV_DATA: + btp = mp->m_ddev_targp; + break; + case XFS_DEV_LOG: + if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev) + btp = mp->m_logdev_targp; + break; + case XFS_DEV_RT: + btp = mp->m_rtdev_targp; + break; + } + if (!btp) + return -ENODEV; + + /* + * If the caller told us to verify beyond the end of the disk, tell the + * user exactly where that was. + */ + if (me->me_end_daddr > btp->bt_nr_sectors) + me->me_end_daddr = btp->bt_nr_sectors; + + /* start and end have to be aligned to the lba size */ + if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr), + bdev_logical_block_size(btp->bt_bdev))) + return -EINVAL; + + /* + * end_daddr is the exclusive end of the range, so if start_daddr + * reaches there (or beyond), there's no work to be done. + */ + if (me->me_start_daddr >= me->me_end_daddr) + return 0; + + /* + * There are three ranges involved here: + * + * - [me->me_start_daddr, me->me_end_daddr) is the range that the + * user wants to verify. end_daddr can be beyond the end of the + * disk; we'll constrain it to the end if necessary. + * + * - [daddr, me->me_end_daddr) is the range that we have not yet + * verified. We update daddr after each successful read. + * me->me_start_daddr is set to daddr before returning. + * + * - [daddr, daddr + bio_bbcount) is the range that we're currently + * verifying. 
+ */ + daddr = me->me_start_daddr; + bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) - + me->me_start_daddr; + + folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount)); + if (!folio) + return -ENOMEM; + + trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount, + folio); + + bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL); + if (!bio) { + error = -ENOMEM; + goto out_folio; + } + + while (bbcount > 0) { + unsigned int bio_bbcount; + blk_status_t bio_status; + + bio_reset(bio, btp->bt_bdev, REQ_OP_READ); + bio->bi_iter.bi_sector = daddr; + bio_add_folio_nofail(bio, folio, + min(bbcount << SECTOR_SHIFT, folio_size(folio)), + 0); + + /* + * Save the length of the bio before we submit it, because we + * need the original daddr and length for reporting IO errors + * if the bio fails. + */ + bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT; + submit_bio_wait(bio); + bio_status = bio->bi_status; + if (bio_status != BLK_STS_OK) { + xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount, + bio_status); + error = 0; + break; + } + + daddr += bio_bbcount; + bbcount -= bio_bbcount; + + if (bbcount == 0) + break; + + if (me->me_rest_us) { + ktime_t expires; + + expires = ktime_add_ns(ktime_get(), + me->me_rest_us * 1000); + set_current_state(TASK_KILLABLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } + + if (fatal_signal_pending(current)) { + error = -EINTR; + break; + } + + cond_resched(); + } + + bio_put(bio); +out_folio: + folio_put(folio); + + if (error) + return error; + + /* + * Advance start_daddr to the end of what we verified if there wasn't + * an operational error. 
+ */ + me->me_start_daddr = daddr; + trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev); + return 0; +} + +int +xfs_ioc_verify_media( + struct file *file, + struct xfs_verify_media __user *arg) +{ + struct xfs_verify_media me; + struct xfs_inode *ip = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip->i_mount; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&me, arg, sizeof(me))) + return -EFAULT; + + if (me.me_pad) + return -EINVAL; + if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS) + return -EINVAL; + + switch (me.me_dev) { + case XFS_DEV_DATA: + case XFS_DEV_LOG: + case XFS_DEV_RT: + break; + default: + return -EINVAL; + } + + error = xfs_verify_media(mp, &me); + if (error) + return error; + + if (copy_to_user(arg, &me, sizeof(me))) + return -EFAULT; + + return 0; +} diff --git a/fs/xfs/xfs_verify_media.h b/fs/xfs/xfs_verify_media.h new file mode 100644 index 000000000000..dc6eee9c8863 --- /dev/null +++ b/fs/xfs/xfs_verify_media.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2026 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_VERIFY_MEDIA_H__ +#define __XFS_VERIFY_MEDIA_H__ + +struct xfs_verify_media; +int xfs_ioc_verify_media(struct file *file, + struct xfs_verify_media __user *arg); + +#endif /* __XFS_VERIFY_MEDIA_H__ */ diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 6bd3009e09b3..97a8552d8f2b 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -35,6 +35,7 @@ struct user_namespace; struct workqueue_struct; struct writeback_control; struct xattr_handler; +struct fserror_event; extern struct super_block *blockdev_superblock; @@ -124,6 +125,9 @@ struct super_operations { */ int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); void (*shutdown)(struct super_block *sb); + + /* Report a filesystem error */ + void (*report_error)(const struct fserror_event *event); }; struct super_block { @@ -268,6 +272,9 @@ struct super_block { spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ long s_min_writeback_pages; + + /* number of fserrors that are being sent to fsnotify/filesystems */ + refcount_t s_pending_errors; } __randomize_layout; /* diff --git a/include/linux/fserror.h b/include/linux/fserror.h new file mode 100644 index 000000000000..5e1ad78c346e --- /dev/null +++ b/include/linux/fserror.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2025 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef _LINUX_FSERROR_H__ +#define _LINUX_FSERROR_H__ + +void fserror_mount(struct super_block *sb); +void fserror_unmount(struct super_block *sb); + +enum fserror_type { + /* pagecache I/O failed */ + FSERR_BUFFERED_READ, + FSERR_BUFFERED_WRITE, + + /* direct I/O failed */ + FSERR_DIRECTIO_READ, + FSERR_DIRECTIO_WRITE, + + /* out of band media error reported */ + FSERR_DATA_LOST, + + /* filesystem metadata */ + FSERR_METADATA, +}; + +struct fserror_event { + struct work_struct work; + struct super_block *sb; + struct inode *inode; + loff_t pos; + u64 len; + enum fserror_type type; + + /* negative error number */ + int error; +}; + +void fserror_report(struct super_block *sb, struct inode *inode, + enum fserror_type type, loff_t pos, u64 len, int error, + gfp_t gfp); + +static inline void fserror_report_io(struct inode *inode, + enum fserror_type type, loff_t pos, + u64 len, int error, gfp_t gfp) +{ + fserror_report(inode->i_sb, inode, type, pos, len, error, gfp); +} + +static inline void fserror_report_data_lost(struct inode *inode, loff_t pos, + u64 len, gfp_t gfp) +{ + fserror_report(inode->i_sb, inode, FSERR_DATA_LOST, pos, len, -EIO, + gfp); +} + +static inline void fserror_report_file_metadata(struct inode *inode, int error, + gfp_t gfp) +{ + fserror_report(inode->i_sb, inode, FSERR_METADATA, 0, 0, error, gfp); +} + +static inline void fserror_report_metadata(struct super_block *sb, int error, + gfp_t gfp) +{ + fserror_report(sb, NULL, FSERR_METADATA, 0, 0, error, gfp); +} + +static inline void fserror_report_shutdown(struct super_block *sb, gfp_t gfp) +{ + fserror_report(sb, NULL, FSERR_METADATA, 0, 0, -ESHUTDOWN, gfp); +} + +#endif /* _LINUX_FSERROR_H__ */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f5eaf76198f3..a53a00d36228 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1815,7 +1815,4 @@ static inline int jbd2_handle_buffer_credits(handle_t *handle) #endif /* __KERNEL__ */ -#define 
EFSBADCRC EBADMSG /* Bad CRC detected */ -#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ - #endif /* _LINUX_JBD2_H */ diff --git a/include/uapi/asm-generic/errno.h b/include/uapi/asm-generic/errno.h index cf9c51ac49f9..92e7ae493ee3 100644 --- a/include/uapi/asm-generic/errno.h +++ b/include/uapi/asm-generic/errno.h @@ -55,6 +55,7 @@ #define EMULTIHOP 72 /* Multihop attempted */ #define EDOTDOT 73 /* RFS specific error */ #define EBADMSG 74 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EOVERFLOW 75 /* Value too large for defined data type */ #define ENOTUNIQ 76 /* Name not unique on network */ #define EBADFD 77 /* File descriptor in bad state */ @@ -98,6 +99,7 @@ #define EINPROGRESS 115 /* Operation now in progress */ #define ESTALE 116 /* Stale file handle */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git a/tools/arch/alpha/include/uapi/asm/errno.h b/tools/arch/alpha/include/uapi/asm/errno.h index 3d265f6babaf..6791f6508632 100644 --- a/tools/arch/alpha/include/uapi/asm/errno.h +++ b/tools/arch/alpha/include/uapi/asm/errno.h @@ -55,6 +55,7 @@ #define ENOSR 82 /* Out of streams resources */ #define ETIME 83 /* Timer expired */ #define EBADMSG 84 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EPROTO 85 /* Protocol error */ #define ENODATA 86 /* No data available */ #define ENOSTR 87 /* Device not a stream */ @@ -96,6 +97,7 @@ #define EREMCHG 115 /* Remote address changed */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git 
a/tools/arch/mips/include/uapi/asm/errno.h b/tools/arch/mips/include/uapi/asm/errno.h index 2fb714e2d6d8..c01ed91b1ef4 100644 --- a/tools/arch/mips/include/uapi/asm/errno.h +++ b/tools/arch/mips/include/uapi/asm/errno.h @@ -50,6 +50,7 @@ #define EDOTDOT 73 /* RFS specific error */ #define EMULTIHOP 74 /* Multihop attempted */ #define EBADMSG 77 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define ENAMETOOLONG 78 /* File name too long */ #define EOVERFLOW 79 /* Value too large for defined data type */ #define ENOTUNIQ 80 /* Name not unique on network */ @@ -88,6 +89,7 @@ #define EISCONN 133 /* Transport endpoint is already connected */ #define ENOTCONN 134 /* Transport endpoint is not connected */ #define EUCLEAN 135 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 137 /* Not a XENIX named type file */ #define ENAVAIL 138 /* No XENIX semaphores available */ #define EISNAM 139 /* Is a named type file */ diff --git a/tools/arch/parisc/include/uapi/asm/errno.h b/tools/arch/parisc/include/uapi/asm/errno.h index 8d94739d75c6..8cbc07c1903e 100644 --- a/tools/arch/parisc/include/uapi/asm/errno.h +++ b/tools/arch/parisc/include/uapi/asm/errno.h @@ -36,6 +36,7 @@ #define EDOTDOT 66 /* RFS specific error */ #define EBADMSG 67 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EUSERS 68 /* Too many users */ #define EDQUOT 69 /* Quota exceeded */ #define ESTALE 70 /* Stale file handle */ @@ -62,6 +63,7 @@ #define ERESTART 175 /* Interrupted system call should be restarted */ #define ESTRPIPE 176 /* Streams pipe error */ #define EUCLEAN 177 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 178 /* Not a XENIX named type file */ #define ENAVAIL 179 /* No XENIX semaphores available */ #define EISNAM 180 /* Is a named type file */ diff --git a/tools/arch/sparc/include/uapi/asm/errno.h 
b/tools/arch/sparc/include/uapi/asm/errno.h index 81a732b902ee..4a41e7835fd5 100644 --- a/tools/arch/sparc/include/uapi/asm/errno.h +++ b/tools/arch/sparc/include/uapi/asm/errno.h @@ -48,6 +48,7 @@ #define ENOSR 74 /* Out of streams resources */ #define ENOMSG 75 /* No message of desired type */ #define EBADMSG 76 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EIDRM 77 /* Identifier removed */ #define EDEADLK 78 /* Resource deadlock would occur */ #define ENOLCK 79 /* No record locks available */ @@ -91,6 +92,7 @@ #define ENOTUNIQ 115 /* Name not unique on network */ #define ERESTART 116 /* Interrupted syscall should be restarted */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git a/tools/include/uapi/asm-generic/errno.h b/tools/include/uapi/asm-generic/errno.h index cf9c51ac49f9..92e7ae493ee3 100644 --- a/tools/include/uapi/asm-generic/errno.h +++ b/tools/include/uapi/asm-generic/errno.h @@ -55,6 +55,7 @@ #define EMULTIHOP 72 /* Multihop attempted */ #define EDOTDOT 73 /* RFS specific error */ #define EBADMSG 74 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EOVERFLOW 75 /* Value too large for defined data type */ #define ENOTUNIQ 76 /* Name not unique on network */ #define EBADFD 77 /* File descriptor in bad state */ @@ -98,6 +99,7 @@ #define EINPROGRESS 115 /* Operation now in progress */ #define ESTALE 116 /* Stale file handle */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ |
