diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-18 09:42:02 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-18 09:42:02 -0700 |
| commit | 83f1454877cc292b88baf13c829c16ce6937d120 (patch) | |
| tree | e361542b4b4a66d04f7b9c2fe2a6a378b5a49a30 | |
| parent | dac3b26eae7bee261fa05f20c3fcc24988a7c233 (diff) | |
| parent | c143957520c6c9b5cd72e0de8b52b814f0c576fe (diff) | |
Merge tag 'ext4_for_linus-7.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4 (HEAD, master)
Pull ext4 updates from Ted Ts'o:
- A major rework of the fast commit mechanism to avoid lock contention
  and deadlocks. We also export snapshot statistics in
  /proc/fs/ext4/*/fc_info
- Performance optimization for directory hash computation by processing
  input in 4-byte chunks and removing function pointers, along with new
  KUnit tests for directory hash
- Cleanups in JBD2 to remove special slabs and use kmalloc() instead
- Various bug fixes, including:
  - Early validation of donor superblock in EXT4_IOC_MOVE_EXT to
    avoid cross-fs deadlock
  - Fix for a kernel BUG in ext4_write_inline_data_end under
    data=journal
  - Fix for a NULL dereference in jbd2_journal_dirty_metadata when
    handle is aborted
  - Fix for an underflow in JBD2 fast commit block initialization
    check
  - Fix for LOGFLUSH shutdown ordering to ensure ordered data
    writeback
  - Miscellaneous fixes for error path return values and KUnit
    assertions
* tag 'ext4_for_linus-7.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
ext4: validate donor file superblock early in EXT4_IOC_MOVE_EXT
ext4: fix kernel BUG in ext4_write_inline_data_end
ext4: fix ERR_PTR(0) in ext4_mkdir()
jbd2: remove special jbd2 slabs
ext4: remove mention of PageWriteback
ext4: improve str2hashbuf by processing 4-byte chunks and removing function pointers
ext4: add Kunit coverage for directory hash computation
ext4: fast commit: export snapshot stats in fc_info
ext4: fast commit: add lock_updates tracepoint
ext4: fast commit: avoid i_data_sem by dropping ext4_map_blocks() in snapshots
ext4: fast commit: avoid self-deadlock in inode snapshotting
ext4: fast commit: avoid waiting for FC_COMMITTING
ext4: lockdep: handle i_data_sem subclassing for special inodes
ext4: fast commit: snapshot inode state before writing log
jbd2: fix integer underflow in jbd2_journal_initialize_fast_commit()
ext4: fix fast commit wait/wake bit mapping on 64-bit
jbd2: check for aborted handle in jbd2_journal_dirty_metadata()
ext4: Use %pe to print PTR_ERR()
ext4: fix LOGFLUSH shutdown ordering to allow ordered-mode data writeback
ext4: replace KUnit tests for memcmp() with KUNIT_ASSERT_MEMEQ()
| -rw-r--r-- | fs/ext4/Makefile | 2 | ||||
| -rw-r--r-- | fs/ext4/ext4.h | 93 | ||||
| -rw-r--r-- | fs/ext4/extents.c | 4 | ||||
| -rw-r--r-- | fs/ext4/fast_commit.c | 784 | ||||
| -rw-r--r-- | fs/ext4/hash-test.c | 567 | ||||
| -rw-r--r-- | fs/ext4/hash.c | 68 | ||||
| -rw-r--r-- | fs/ext4/inode.c | 54 | ||||
| -rw-r--r-- | fs/ext4/ioctl.c | 15 | ||||
| -rw-r--r-- | fs/ext4/mballoc-test.c | 9 | ||||
| -rw-r--r-- | fs/ext4/namei.c | 6 | ||||
| -rw-r--r-- | fs/ext4/page-io.c | 2 | ||||
| -rw-r--r-- | fs/ext4/super.c | 13 | ||||
| -rw-r--r-- | fs/jbd2/commit.c | 8 | ||||
| -rw-r--r-- | fs/jbd2/journal.c | 124 | ||||
| -rw-r--r-- | fs/jbd2/transaction.c | 17 | ||||
| -rw-r--r-- | include/linux/jbd2.h | 3 | ||||
| -rw-r--r-- | include/trace/events/ext4.h | 61 |
17 files changed, 1495 insertions, 335 deletions
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 3baee4e7c1cf..3f9fc0eb8eca 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -15,7 +15,7 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o ext4-test-objs += inode-test.o mballoc-test.o \ - extents-test.o + extents-test.o hash-test.o obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-test.o ext4-$(CONFIG_FS_VERITY) += verity.o ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6af11f0ff1c5..b37c136ea3ab 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1015,14 +1015,32 @@ do { \ * than the first * I_DATA_SEM_QUOTA - Used for quota inodes only * I_DATA_SEM_EA - Used for ea_inodes only + * I_DATA_SEM_JOURNAL - Used for journal inode only */ enum { I_DATA_SEM_NORMAL = 0, I_DATA_SEM_OTHER, I_DATA_SEM_QUOTA, - I_DATA_SEM_EA + I_DATA_SEM_EA, + I_DATA_SEM_JOURNAL }; +struct ext4_fc_inode_snap; + +/* + * Snapshot failure reasons for ext4_fc_lock_updates tracepoint. + * Keep these stable for tooling. + */ +enum ext4_fc_snap_err { + EXT4_FC_SNAP_ERR_NONE = 0, + EXT4_FC_SNAP_ERR_ES_MISS, + EXT4_FC_SNAP_ERR_ES_DELAYED, + EXT4_FC_SNAP_ERR_ES_OTHER, + EXT4_FC_SNAP_ERR_INODES_CAP, + EXT4_FC_SNAP_ERR_RANGES_CAP, + EXT4_FC_SNAP_ERR_NOMEM, + EXT4_FC_SNAP_ERR_INODE_LOC, +}; /* * fourth extended file system inode data in memory @@ -1079,6 +1097,22 @@ struct ext4_inode_info { /* End of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_len; + /* + * Commit-time fast commit snapshots. + * + * i_fc_snap is installed and freed under sbi->s_fc_lock. The fast + * commit log writing path reads the snapshot under sbi->s_fc_lock while + * serializing fast commit TLVs. + * + * The snapshot lifetime is bounded by EXT4_STATE_FC_COMMITTING and the + * corresponding cleanup / eviction paths. 
+ * + * i_fc_snap points to per-inode snapshot data for fast commit: + * - a raw inode snapshot for EXT4_FC_TAG_INODE + * - data range records for EXT4_FC_TAG_{ADD,DEL}_RANGE + */ + struct ext4_fc_inode_snap *i_fc_snap; + spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* @@ -1517,6 +1551,36 @@ struct ext4_orphan_info { }; /* + * Ext4 fast commit snapshot statistics. + * + * These are best-effort counters intended for debugging / performance + * introspection; they are not exact under concurrent updates. + */ +struct ext4_fc_snap_stats { + atomic64_t lock_updates_ns_total; + atomic64_t lock_updates_ns_max; + atomic64_t lock_updates_samples; + + atomic64_t snap_inodes; + atomic64_t snap_ranges; + + atomic64_t snap_fail_es_miss; + atomic64_t snap_fail_es_delayed; + atomic64_t snap_fail_es_other; + + atomic64_t snap_fail_inodes_cap; + atomic64_t snap_fail_ranges_cap; + atomic64_t snap_fail_nomem; + atomic64_t snap_fail_inode_loc; + + /* + * Missing inode snapshots during log writing should never happen. + * Keep this counter to help catch unexpected regressions. 
+ */ + atomic64_t snap_fail_no_snap; +}; + +/* * fourth extended-fs super-block data in memory */ struct ext4_sb_info { @@ -1790,6 +1854,7 @@ struct ext4_sb_info { struct mutex s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; + struct ext4_fc_snap_stats s_fc_snap_stats; tid_t s_fc_ineligible_tid; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; @@ -1972,6 +2037,7 @@ enum { EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ + EXT4_STATE_FC_REQUEUE, /* Inode modified during fast commit */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -2000,6 +2066,8 @@ EXT4_INODE_BIT_FNS(flag, flags, 0) static inline int ext4_test_inode_state(struct inode *inode, int bit); static inline void ext4_set_inode_state(struct inode *inode, int bit); static inline void ext4_clear_inode_state(struct inode *inode, int bit); +static inline unsigned long *ext4_inode_state_wait_word(struct inode *inode); +static inline int ext4_inode_state_wait_bit(int bit); #if (BITS_PER_LONG < 64) EXT4_INODE_BIT_FNS(state, state_flags, 0) @@ -2015,6 +2083,24 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) /* We depend on the fact that callers will set i_flags */ } #endif + +static inline unsigned long *ext4_inode_state_wait_word(struct inode *inode) +{ +#if (BITS_PER_LONG < 64) + return &EXT4_I(inode)->i_state_flags; +#else + return &EXT4_I(inode)->i_flags; +#endif +} + +static inline int ext4_inode_state_wait_bit(int bit) +{ +#if (BITS_PER_LONG < 64) + return bit; +#else + return bit + 32; +#endif +} #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. 
This will allow us to call the feature-test @@ -3080,8 +3166,9 @@ extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); -extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); -extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, +int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc); +int ext4_get_inode_loc_noio(struct inode *inode, struct ext4_iloc *iloc); +int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 125f628e738a..91c97af64b31 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3268,8 +3268,8 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, */ path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL); if (IS_ERR(path)) { - EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld", - split, PTR_ERR(path)); + EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %pe", + split, path); goto out_path; } diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 5773b85e43cb..8e2259799614 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -56,21 +56,22 @@ * deleted while it is being flushed. * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA" * state. - * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that - * all the exsiting handles finish and no new handles can start. - * [4] Mark all the fast commit eligible inodes as undergoing fast commit - * by setting "EXT4_STATE_FC_COMMITTING" state. - * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows - * starting of new handles. 
If new handles try to start an update on - * any of the inodes that are being committed, ext4_fc_track_inode() - * will block until those inodes have finished the fast commit. + * [3] Lock the journal by calling jbd2_journal_lock_updates(). This ensures + * that all the existing handles finish and no new handles can start. + * [4] Mark all the fast commit eligible inodes as undergoing fast commit by + * setting "EXT4_STATE_FC_COMMITTING" state, and snapshot the inode state + * needed for log writing. + * [5] Unlock the journal by calling jbd2_journal_unlock_updates(). This allows + * starting of new handles. Updates to inodes being fast committed are + * tracked for requeue rather than blocking. * [6] Commit all the directory entry updates in the fast commit space. - * [7] Commit all the changed inodes in the fast commit space and clear - * "EXT4_STATE_FC_COMMITTING" for these inodes. + * [7] Commit all the changed inodes in the fast commit space. * [8] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). + * [9] Clear "EXT4_STATE_FC_COMMITTING" and wake up waiters in + * ext4_fc_cleanup(). * - * All the inode updates must be enclosed within jbd2_jounrnal_start() + * All the inode updates must be enclosed within jbd2_journal_start() * and jbd2_journal_stop() similar to JBD2 journaling. * * Fast Commit Ineligibility @@ -183,6 +184,21 @@ #include <trace/events/ext4.h> static struct kmem_cache *ext4_fc_dentry_cachep; +static struct kmem_cache *ext4_fc_range_cachep; + +/* + * Avoid spending unbounded time/memory snapshotting highly fragmented files + * under jbd2_journal_lock_updates(). If we exceed this limit, fall back to + * full commit. 
+ */ +#define EXT4_FC_SNAPSHOT_MAX_INODES 1024 +#define EXT4_FC_SNAPSHOT_MAX_RANGES 2048 + +static inline void ext4_fc_set_snap_err(int *snap_err, int err) +{ + if (snap_err && *snap_err == EXT4_FC_SNAP_ERR_NONE) + *snap_err = err; +} static void ext4_end_buffer_io_sync(struct bio *bio) { @@ -203,6 +219,8 @@ static void ext4_end_buffer_io_sync(struct bio *bio) unlock_buffer(bh); } +static void ext4_fc_free_inode_snap(struct inode *inode); + static inline void ext4_fc_reset_inode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); @@ -217,8 +235,10 @@ void ext4_fc_init_inode(struct inode *inode) ext4_fc_reset_inode(inode); ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); + ext4_clear_inode_state(inode, EXT4_STATE_FC_REQUEUE); INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); + ei->i_fc_snap = NULL; } static bool ext4_fc_disabled(struct super_block *sb) @@ -234,6 +254,50 @@ static bool ext4_fc_eligible(struct super_block *sb) } /* + * Wait for an inode fast-commit state bit to clear while dropping the + * fast-commit lock around schedule(). 
+ */ +static void ext4_fc_wait_inode_state(struct inode *inode, int bit, + int *alloc_ctx) +{ + wait_queue_head_t *wq; + unsigned long *wait_word = ext4_inode_state_wait_word(inode); + int wait_bit = ext4_inode_state_wait_bit(bit); + + while (ext4_test_inode_state(inode, bit)) { + DEFINE_WAIT_BIT(wait, wait_word, wait_bit); + + wq = bit_waitqueue(wait_word, wait_bit); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, bit)) { + ext4_fc_unlock(inode->i_sb, *alloc_ctx); + schedule(); + *alloc_ctx = ext4_fc_lock(inode->i_sb); + } + finish_wait(wq, &wait.wq_entry); + } +} + +static inline void ext4_fc_wake_inode_state(struct inode *inode, int bit) +{ + wake_up_bit(ext4_inode_state_wait_word(inode), + ext4_inode_state_wait_bit(bit)); +} + +static void ext4_fc_snap_stats_update_max(atomic64_t *stat, u64 value) +{ + u64 old = atomic64_read(stat); + + while (value > old) { + u64 prev = atomic64_cmpxchg(stat, old, value); + + if (prev == old) + break; + old = prev; + } +} + +/* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. */ @@ -241,7 +305,6 @@ void ext4_fc_del(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_fc_dentry_update *fc_dentry; - wait_queue_head_t *wq; int alloc_ctx; if (ext4_fc_disabled(inode->i_sb)) @@ -249,59 +312,43 @@ void ext4_fc_del(struct inode *inode) alloc_ctx = ext4_fc_lock(inode->i_sb); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { + ext4_fc_free_inode_snap(inode); ext4_fc_unlock(inode->i_sb, alloc_ctx); return; } /* - * Since ext4_fc_del is called from ext4_evict_inode while having a - * handle open, there is no need for us to wait here even if a fast - * commit is going on. That is because, if this inode is being - * committed, ext4_mark_inode_dirty would have waited for inode commit - * operation to finish before we come here. 
So, by the time we come - * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, - * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode - * here. - * - * We may come here without any handles open in the "no_delete" case of - * ext4_evict_inode as well. However, if that happens, we first mark the - * file system as fast commit ineligible anyway. So, even in that case, - * it is okay to remove the inode from the fc list. + * Wait for ongoing fast commit to finish. We cannot remove the inode + * from fast commit lists while it is being committed. If we wake from + * FC_FLUSHING_DATA, re-check FC_COMMITTING before deleting because the + * commit thread sets FC_COMMITTING only after clearing FLUSHING_DATA. */ - WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) - && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); - while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_FLUSHING_DATA); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_FLUSHING_DATA); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_FLUSHING_DATA); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_FLUSHING_DATA); -#endif - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { - ext4_fc_unlock(inode->i_sb, alloc_ctx); - schedule(); - alloc_ctx = ext4_fc_lock(inode->i_sb); - } - finish_wait(wq, &wait.wq_entry); + for (;;) { + ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_COMMITTING, + &alloc_ctx); + + if (!ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) + break; + + ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA, + &alloc_ctx); } + + ext4_fc_free_inode_snap(inode); list_del_init(&ei->i_fc_list); /* - * Since this inode is getting removed, let's also remove all FC - * dentry create references, since it is not needed to log it anyways. 
+ * Since this inode is getting removed, let's also remove all FC dentry + * create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { ext4_fc_unlock(inode->i_sb, alloc_ctx); return; } - fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist); + fc_dentry = list_first_entry(&ei->i_fc_dilist, + struct ext4_fc_dentry_update, + fcd_dilist); WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); @@ -373,6 +420,8 @@ static int ext4_fc_track_template( tid = handle->h_transaction->t_tid; spin_lock(&ei->i_fc_lock); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) + ext4_set_inode_state(inode, EXT4_STATE_FC_REQUEUE); if (tid == ei->i_sync_tid) { update = true; } else { @@ -543,8 +592,6 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg, void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { - struct ext4_inode_info *ei = EXT4_I(inode); - wait_queue_head_t *wq; int ret; if (S_ISDIR(inode->i_mode)) @@ -560,29 +607,11 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) return; /* - * If we come here, we may sleep while waiting for the inode to - * commit. We shouldn't be holding i_data_sem when we go to sleep since - * the commit path needs to grab the lock while committing the inode. + * Fast commit snapshots inode state at commit time, so there's no need + * to wait for EXT4_STATE_FC_COMMITTING here. If the inode is already + * on the commit queue, ext4_fc_cleanup() will requeue it for the new + * transaction once the current commit finishes. 
*/ - lockdep_assert_not_held(&ei->i_data_sem); - - while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) - schedule(); - finish_wait(wq, &wait.wq_entry); - } /* * From this point on, this inode will not be committed either @@ -831,6 +860,21 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, return true; } +struct ext4_fc_range { + struct list_head list; + u16 tag; + ext4_lblk_t lblk; + ext4_lblk_t len; + ext4_fsblk_t pblk; + bool unwritten; +}; + +struct ext4_fc_inode_snap { + struct list_head data_list; + unsigned int inode_len; + u8 inode_buf[]; +}; + /* * Writes inode in the fast commit space under TLV with tag @tag. * Returns 0 on success, error on failure. 
@@ -838,21 +882,27 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, static int ext4_fc_write_inode(struct inode *inode, u32 *crc) { struct ext4_inode_info *ei = EXT4_I(inode); - int inode_len = EXT4_GOOD_OLD_INODE_SIZE; - int ret; - struct ext4_iloc iloc; + struct ext4_fc_inode_snap *snap = ei->i_fc_snap; + struct ext4_fc_snap_stats *stats = + &EXT4_SB(inode->i_sb)->s_fc_snap_stats; struct ext4_fc_inode fc_inode; struct ext4_fc_tl tl; u8 *dst; + u8 *src; + int inode_len; + int ret; - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; + if (!snap) { + atomic64_inc(&stats->snap_fail_no_snap); + return -ECANCELED; + } - if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) - inode_len = EXT4_INODE_SIZE(inode->i_sb); - else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) - inode_len += ei->i_extra_isize; + src = snap->inode_buf; + inode_len = snap->inode_len; + if (!src || inode_len == 0) { + atomic64_inc(&stats->snap_fail_no_snap); + return -ECANCELED; + } fc_inode.fc_ino = cpu_to_le32(inode->i_ino); tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); @@ -868,10 +918,9 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); - memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); + memcpy(dst, src, inode_len); ret = 0; err: - brelse(iloc.bh); return ret; } @@ -881,76 +930,244 @@ err: */ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) { - ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_map_blocks map; + struct ext4_fc_inode_snap *snap = ei->i_fc_snap; + struct ext4_fc_snap_stats *stats = + &EXT4_SB(inode->i_sb)->s_fc_snap_stats; struct ext4_fc_add_range fc_ext; struct ext4_fc_del_range lrange; struct ext4_extent *ex; - int ret; + struct ext4_fc_range *range; + + if (!snap) { + atomic64_inc(&stats->snap_fail_no_snap); + return -ECANCELED; + } 
+ + list_for_each_entry(range, &snap->data_list, list) { + if (range->tag == EXT4_FC_TAG_DEL_RANGE) { + lrange.fc_ino = cpu_to_le32(inode->i_ino); + lrange.fc_lblk = cpu_to_le32(range->lblk); + lrange.fc_len = cpu_to_le32(range->len); + if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, + sizeof(lrange), (u8 *)&lrange, crc)) + return -ENOSPC; + continue; + } + + fc_ext.fc_ino = cpu_to_le32(inode->i_ino); + ex = (struct ext4_extent *)&fc_ext.fc_ex; + ex->ee_block = cpu_to_le32(range->lblk); + ex->ee_len = cpu_to_le16(range->len); + ext4_ext_store_pblock(ex, range->pblk); + if (range->unwritten) + ext4_ext_mark_unwritten(ex); + else + ext4_ext_mark_initialized(ex); + + if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, + sizeof(fc_ext), (u8 *)&fc_ext, crc)) + return -ENOSPC; + } + + return 0; +} + +static void ext4_fc_free_ranges(struct list_head *head) +{ + struct ext4_fc_range *range, *range_n; + + list_for_each_entry_safe(range, range_n, head, list) { + list_del(&range->list); + kmem_cache_free(ext4_fc_range_cachep, range); + } +} + +static void ext4_fc_free_inode_snap(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_fc_inode_snap *snap = ei->i_fc_snap; + + if (!snap) + return; + + ext4_fc_free_ranges(&snap->data_list); + kfree(snap); + ei->i_fc_snap = NULL; +} + +static int ext4_fc_snapshot_inode_data(struct inode *inode, + struct list_head *ranges, + unsigned int nr_ranges_total, + unsigned int *nr_rangesp, + int *snap_err) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_fc_snap_stats *stats = + &EXT4_SB(inode->i_sb)->s_fc_snap_stats; + ext4_lblk_t start_lblk, end_lblk, cur_lblk; + unsigned int nr_ranges = 0; spin_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { spin_unlock(&ei->i_fc_lock); + if (nr_rangesp) + *nr_rangesp = 0; return 0; } - old_blk_size = ei->i_fc_lblk_start; - new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; + start_lblk = ei->i_fc_lblk_start; + end_lblk = 
ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; spin_unlock(&ei->i_fc_lock); - cur_lblk_off = old_blk_size; - ext4_debug("will try writing %d to %d for inode %llu\n", - cur_lblk_off, new_blk_size, inode->i_ino); + cur_lblk = start_lblk; + ext4_debug("snapshot data ranges %u-%u for inode %llu\n", + start_lblk, end_lblk, + (unsigned long long)inode->i_ino); + + while (cur_lblk <= end_lblk) { + struct extent_status es; + struct ext4_fc_range *range; + ext4_lblk_t len; + u64 remaining = (u64)end_lblk - cur_lblk + 1; + + if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL)) { + atomic64_inc(&stats->snap_fail_es_miss); + ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_MISS); + return -EAGAIN; + } - while (cur_lblk_off <= new_blk_size) { - map.m_lblk = cur_lblk_off; - map.m_len = new_blk_size - cur_lblk_off + 1; - ret = ext4_map_blocks(NULL, inode, &map, - EXT4_GET_BLOCKS_IO_SUBMIT | - EXT4_EX_NOCACHE); - if (ret < 0) - return -ECANCELED; + if (ext4_es_is_delayed(&es)) { + atomic64_inc(&stats->snap_fail_es_delayed); + ext4_fc_set_snap_err(snap_err, + EXT4_FC_SNAP_ERR_ES_DELAYED); + return -EAGAIN; + } - if (map.m_len == 0) { - cur_lblk_off++; + len = es.es_len - (cur_lblk - es.es_lblk); + if (len > remaining) + len = remaining; + if (len == 0) { + cur_lblk++; continue; } - if (ret == 0) { - lrange.fc_ino = cpu_to_le32(inode->i_ino); - lrange.fc_lblk = cpu_to_le32(map.m_lblk); - lrange.fc_len = cpu_to_le32(map.m_len); - if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, - sizeof(lrange), (u8 *)&lrange, crc)) - return -ENOSPC; + if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) { + atomic64_inc(&stats->snap_fail_ranges_cap); + ext4_fc_set_snap_err(snap_err, + EXT4_FC_SNAP_ERR_RANGES_CAP); + return -E2BIG; + } + + range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS); + if (!range) { + atomic64_inc(&stats->snap_fail_nomem); + ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM); + return -ENOMEM; + } + nr_ranges++; + + 
range->lblk = cur_lblk; + range->len = len; + range->pblk = 0; + range->unwritten = false; + + if (ext4_es_is_hole(&es)) { + range->tag = EXT4_FC_TAG_DEL_RANGE; + } else if (ext4_es_is_written(&es) || + ext4_es_is_unwritten(&es)) { + unsigned int max; + + range->tag = EXT4_FC_TAG_ADD_RANGE; + range->pblk = ext4_es_pblock(&es) + + (cur_lblk - es.es_lblk); + range->unwritten = ext4_es_is_unwritten(&es); + + max = range->unwritten ? EXT_UNWRITTEN_MAX_LEN : + EXT_INIT_MAX_LEN; + if (range->len > max) + range->len = max; } else { - unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? - EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; - - /* Limit the number of blocks in one extent */ - map.m_len = min(max, map.m_len); - - fc_ext.fc_ino = cpu_to_le32(inode->i_ino); - ex = (struct ext4_extent *)&fc_ext.fc_ex; - ex->ee_block = cpu_to_le32(map.m_lblk); - ex->ee_len = cpu_to_le16(map.m_len); - ext4_ext_store_pblock(ex, map.m_pblk); - if (map.m_flags & EXT4_MAP_UNWRITTEN) - ext4_ext_mark_unwritten(ex); - else - ext4_ext_mark_initialized(ex); - if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, - sizeof(fc_ext), (u8 *)&fc_ext, crc)) - return -ENOSPC; + kmem_cache_free(ext4_fc_range_cachep, range); + atomic64_inc(&stats->snap_fail_es_other); + ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_OTHER); + return -EAGAIN; } - cur_lblk_off += map.m_len; + INIT_LIST_HEAD(&range->list); + list_add_tail(&range->list, ranges); + + if ((u64)range->len > (u64)end_lblk - cur_lblk) + break; + + cur_lblk += range->len; } + if (nr_rangesp) + *nr_rangesp = nr_ranges; return 0; } +static int ext4_fc_snapshot_inode(struct inode *inode, + unsigned int nr_ranges_total, + unsigned int *nr_rangesp, int *snap_err) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_fc_snap_stats *stats = + &EXT4_SB(inode->i_sb)->s_fc_snap_stats; + struct ext4_fc_inode_snap *snap; + int inode_len = EXT4_GOOD_OLD_INODE_SIZE; + struct ext4_iloc iloc; + LIST_HEAD(ranges); + unsigned int nr_ranges = 0; + int 
ret; + int alloc_ctx; + + ret = ext4_get_inode_loc_noio(inode, &iloc); + if (ret) { + atomic64_inc(&stats->snap_fail_inode_loc); + ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_INODE_LOC); + return ret; + } + + if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) + inode_len = EXT4_INODE_SIZE(inode->i_sb); + else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) + inode_len += ei->i_extra_isize; + + snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS); + if (!snap) { + atomic64_inc(&stats->snap_fail_nomem); + ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM); + brelse(iloc.bh); + return -ENOMEM; + } + INIT_LIST_HEAD(&snap->data_list); + snap->inode_len = inode_len; + + memcpy(snap->inode_buf, (u8 *)ext4_raw_inode(&iloc), inode_len); + brelse(iloc.bh); + + ret = ext4_fc_snapshot_inode_data(inode, &ranges, nr_ranges_total, + &nr_ranges, snap_err); + if (ret) { + kfree(snap); + ext4_fc_free_ranges(&ranges); + return ret; + } + + alloc_ctx = ext4_fc_lock(inode->i_sb); + ext4_fc_free_inode_snap(inode); + ei->i_fc_snap = snap; + list_splice_tail_init(&ranges, &snap->data_list); + ext4_fc_unlock(inode->i_sb, alloc_ctx); + + atomic64_inc(&stats->snap_inodes); + atomic64_add(nr_ranges, &stats->snap_ranges); + if (nr_rangesp) + *nr_rangesp = nr_ranges; + return 0; +} /* Flushes data of all the inodes in the commit queue. */ static int ext4_fc_flush_data(journal_t *journal) @@ -1001,6 +1218,11 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) */ if (list_empty(&fc_dentry->fcd_dilist)) continue; + /* + * For EXT4_FC_TAG_CREAT, fcd_dilist is linked on the created + * inode's i_fc_dilist list (kept singular), so we can recover the + * inode through it. 
+ */ ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; @@ -1025,17 +1247,114 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) return 0; } -static int ext4_fc_perform_commit(journal_t *journal) +static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb, + struct inode ***inodesp, + unsigned int *nr_inodesp); + +static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes, + unsigned int inodes_size, + unsigned int *nr_inodesp, + unsigned int *nr_rangesp, + int *snap_err) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_inode_info *iter; + struct ext4_fc_dentry_update *fc_dentry; + unsigned int i = 0; + unsigned int idx; + unsigned int nr_ranges = 0; + int ret = 0; + int alloc_ctx; + + alloc_ctx = ext4_fc_lock(sb); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + if (i >= inodes_size) { + atomic64_inc(&sbi->s_fc_snap_stats.snap_fail_inodes_cap); + ext4_fc_set_snap_err(snap_err, + EXT4_FC_SNAP_ERR_INODES_CAP); + ret = -E2BIG; + goto unlock; + } + inodes[i++] = &iter->vfs_inode; + } + + list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { + struct ext4_inode_info *ei; + struct inode *inode; + + if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) + continue; + if (list_empty(&fc_dentry->fcd_dilist)) + continue; + + /* See the comment in ext4_fc_commit_dentry_updates(). */ + ei = list_first_entry(&fc_dentry->fcd_dilist, + struct ext4_inode_info, i_fc_dilist); + inode = &ei->vfs_inode; + if (!list_empty(&ei->i_fc_list)) + continue; + + if (i >= inodes_size) { + atomic64_inc(&sbi->s_fc_snap_stats.snap_fail_inodes_cap); + ext4_fc_set_snap_err(snap_err, + EXT4_FC_SNAP_ERR_INODES_CAP); + ret = -E2BIG; + goto unlock; + } + /* + * Create-only inodes may only be referenced via fcd_dilist and + * not appear on s_fc_q[MAIN]. 
They may hit the last iput while + * we are snapshotting, but inode eviction calls ext4_fc_del(), + * which waits for FC_COMMITTING to clear. Mark them FC_COMMITTING + * so the inode stays pinned and the snapshot stays valid until + * ext4_fc_cleanup(). + */ + ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING); + inodes[i++] = inode; + } +unlock: + ext4_fc_unlock(sb, alloc_ctx); + + if (ret) + return ret; + + for (idx = 0; idx < i; idx++) { + unsigned int inode_ranges = 0; + + ret = ext4_fc_snapshot_inode(inodes[idx], nr_ranges, + &inode_ranges, snap_err); + if (ret) + break; + nr_ranges += inode_ranges; + } + + if (nr_inodesp) + *nr_inodesp = idx; + if (nr_rangesp) + *nr_rangesp = nr_ranges; + return ret; +} + +static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats; struct ext4_inode_info *iter; struct ext4_fc_head head; struct inode *inode; + struct inode **inodes; + unsigned int inodes_size; + unsigned int snap_inodes = 0; + unsigned int snap_ranges = 0; + int snap_err = EXT4_FC_SNAP_ERR_NONE; struct blk_plug plug; int ret = 0; u32 crc = 0; int alloc_ctx; + ktime_t lock_start; + u64 locked_ns; /* * Step 1: Mark all inodes on s_fc_q[MAIN] with @@ -1061,11 +1380,8 @@ static int ext4_fc_perform_commit(journal_t *journal) list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); -#if (BITS_PER_LONG < 64) - wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); -#else - wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); -#endif + ext4_fc_wake_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); } /* @@ -1083,13 +1399,23 @@ static int ext4_fc_perform_commit(journal_t *journal) if (ret) return ret; + ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size); + if (ret) { + if (ret == -E2BIG) + 
atomic64_inc(&snap_stats->snap_fail_inodes_cap); + else if (ret == -ENOMEM) + atomic64_inc(&snap_stats->snap_fail_nomem); + return ret; + } /* Step 4: Mark all inodes as being committed. */ jbd2_journal_lock_updates(journal); + lock_start = ktime_get(); /* * The journal is now locked. No more handles can start and all the - * previous handles are now drained. We now mark the inodes on the - * commit queue as being committed. + * previous handles are now drained. Snapshotting happens in this + * window so log writing can consume only stable snapshots without + * doing logical-to-physical mapping. */ alloc_ctx = ext4_fc_lock(sb); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { @@ -1097,7 +1423,22 @@ static int ext4_fc_perform_commit(journal_t *journal) EXT4_STATE_FC_COMMITTING); } ext4_fc_unlock(sb, alloc_ctx); + + ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size, + &snap_inodes, &snap_ranges, &snap_err); jbd2_journal_unlock_updates(journal); + locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start)); + atomic64_add(locked_ns, &snap_stats->lock_updates_ns_total); + atomic64_inc(&snap_stats->lock_updates_samples); + ext4_fc_snap_stats_update_max(&snap_stats->lock_updates_ns_max, + locked_ns); + if (trace_ext4_fc_lock_updates_enabled()) + trace_call__ext4_fc_lock_updates(sb, commit_tid, locked_ns, + snap_inodes, snap_ranges, + ret, snap_err); + kvfree(inodes); + if (ret) + return ret; /* * Step 5: If file system device is different from journal device, @@ -1151,6 +1492,64 @@ out: return ret; } +static unsigned int ext4_fc_count_snapshot_inodes(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_inode_info *iter; + struct ext4_fc_dentry_update *fc_dentry; + unsigned int nr_inodes = 0; + int alloc_ctx; + + alloc_ctx = ext4_fc_lock(sb); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) + nr_inodes++; + + list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { + struct 
ext4_inode_info *ei; + + if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) + continue; + if (list_empty(&fc_dentry->fcd_dilist)) + continue; + + /* See the comment in ext4_fc_commit_dentry_updates(). */ + ei = list_first_entry(&fc_dentry->fcd_dilist, + struct ext4_inode_info, i_fc_dilist); + if (!list_empty(&ei->i_fc_list)) + continue; + + nr_inodes++; + } + ext4_fc_unlock(sb, alloc_ctx); + + return nr_inodes; +} + +static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb, + struct inode ***inodesp, + unsigned int *nr_inodesp) +{ + unsigned int nr_inodes = ext4_fc_count_snapshot_inodes(sb); + struct inode **inodes; + + *inodesp = NULL; + *nr_inodesp = 0; + + if (!nr_inodes) + return 0; + + if (nr_inodes > EXT4_FC_SNAPSHOT_MAX_INODES) + return -E2BIG; + + inodes = kvcalloc(nr_inodes, sizeof(*inodes), GFP_NOFS); + if (!inodes) + return -ENOMEM; + + *inodesp = inodes; + *nr_inodesp = nr_inodes; + return 0; +} + static void ext4_fc_update_stats(struct super_block *sb, int status, u64 commit_time, int nblks, tid_t commit_tid) { @@ -1241,9 +1640,12 @@ restart_fc: journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; set_task_ioprio(current, journal_ioprio); fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; - ret = ext4_fc_perform_commit(journal); + ret = ext4_fc_perform_commit(journal, commit_tid); if (ret < 0) { - status = EXT4_FC_STATUS_FAILED; + if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED) + status = EXT4_FC_STATUS_INELIGIBLE; + else + status = EXT4_FC_STATUS_FAILED; goto fallback; } nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; @@ -1290,45 +1692,66 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) alloc_ctx = ext4_fc_lock(sb); while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { + bool requeue; + ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], struct ext4_inode_info, i_fc_list); list_del_init(&ei->i_fc_list); + ext4_fc_free_inode_snap(&ei->vfs_inode); + spin_lock(&ei->i_fc_lock); + if (full) + requeue = !tid_geq(tid, 
ei->i_sync_tid); + else + requeue = ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_FC_REQUEUE); + if (!requeue) + ext4_fc_reset_inode(&ei->vfs_inode); + ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_REQUEUE); ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - if (tid_geq(tid, ei->i_sync_tid)) { - ext4_fc_reset_inode(&ei->vfs_inode); - } else if (full) { - /* - * We are called after a full commit, inode has been - * modified while the commit was running. Re-enqueue - * the inode into STAGING, which will then be splice - * back into MAIN. This cannot happen during - * fastcommit because the journal is locked all the - * time in that case (and tid doesn't increase so - * tid check above isn't reliable). - */ + spin_unlock(&ei->i_fc_lock); + if (requeue) list_add_tail(&ei->i_fc_list, &sbi->s_fc_q[FC_Q_STAGING]); - } /* * Make sure clearing of EXT4_STATE_FC_COMMITTING is * visible before we send the wakeup. Pairs with implicit - * barrier in prepare_to_wait() in ext4_fc_track_inode(). + * barrier in prepare_to_wait() in ext4_fc_del(). */ smp_mb(); -#if (BITS_PER_LONG < 64) - wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); -#else - wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); -#endif + ext4_fc_wake_inode_state(&ei->vfs_inode, + EXT4_STATE_FC_COMMITTING); } while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], - struct ext4_fc_dentry_update, - fcd_list); + struct ext4_fc_dentry_update, + fcd_list); list_del_init(&fc_dentry->fcd_list); + if (fc_dentry->fcd_op == EXT4_FC_TAG_CREAT && + !list_empty(&fc_dentry->fcd_dilist)) { + /* See the comment in ext4_fc_commit_dentry_updates(). 
*/ + ei = list_first_entry(&fc_dentry->fcd_dilist, + struct ext4_inode_info, + i_fc_dilist); + ext4_fc_free_inode_snap(&ei->vfs_inode); + spin_lock(&ei->i_fc_lock); + ext4_clear_inode_state(&ei->vfs_inode, + EXT4_STATE_FC_REQUEUE); + ext4_clear_inode_state(&ei->vfs_inode, + EXT4_STATE_FC_COMMITTING); + spin_unlock(&ei->i_fc_lock); + /* + * Make sure clearing of EXT4_STATE_FC_COMMITTING is + * visible before we send the wakeup. Pairs with + * implicit barrier in prepare_to_wait() in + * ext4_fc_del(). + */ + smp_mb(); + ext4_fc_wake_inode_state(&ei->vfs_inode, + EXT4_STATE_FC_COMMITTING); + } list_del_init(&fc_dentry->fcd_dilist); release_dentry_name_snapshot(&fc_dentry->fcd_name); @@ -2280,11 +2703,26 @@ int ext4_fc_info_show(struct seq_file *seq, void *v) { struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); struct ext4_fc_stats *stats = &sbi->s_fc_stats; + struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats; + u64 lock_avg_ns = 0; + u64 lock_updates_samples; + u64 lock_updates_ns_total; + u64 lock_updates_ns_max; int i; if (v != SEQ_START_TOKEN) return 0; + lock_updates_samples = + atomic64_read(&snap_stats->lock_updates_samples); + lock_updates_ns_total = + atomic64_read(&snap_stats->lock_updates_ns_total); + lock_updates_ns_max = + atomic64_read(&snap_stats->lock_updates_ns_max); + if (lock_updates_samples) + lock_avg_ns = div64_u64(lock_updates_ns_total, + lock_updates_samples); + seq_printf(seq, "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", stats->fc_num_commits, stats->fc_ineligible_commits, @@ -2295,6 +2733,23 @@ int ext4_fc_info_show(struct seq_file *seq, void *v) seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], stats->fc_ineligible_reason_count[i]); + seq_printf(seq, + "Snapshot stats:\n%llu inodes\n%llu ranges\n%lluus lock_updates_avg\n%lluus lock_updates_max\n", + atomic64_read(&snap_stats->snap_inodes), + atomic64_read(&snap_stats->snap_ranges), + div_u64(lock_avg_ns, 1000), + 
div_u64(lock_updates_ns_max, 1000)); + seq_printf(seq, + "Snapshot failures:\n%llu es_miss\n%llu es_delayed\n%llu es_other\n%llu inodes_cap\n%llu ranges_cap\n%llu nomem\n%llu inode_loc\n%llu no_snap\n", + atomic64_read(&snap_stats->snap_fail_es_miss), + atomic64_read(&snap_stats->snap_fail_es_delayed), + atomic64_read(&snap_stats->snap_fail_es_other), + atomic64_read(&snap_stats->snap_fail_inodes_cap), + atomic64_read(&snap_stats->snap_fail_ranges_cap), + atomic64_read(&snap_stats->snap_fail_nomem), + atomic64_read(&snap_stats->snap_fail_inode_loc), + atomic64_read(&snap_stats->snap_fail_no_snap)); + return 0; } @@ -2303,13 +2758,20 @@ int __init ext4_fc_init_dentry_cache(void) ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, SLAB_RECLAIM_ACCOUNT); - if (ext4_fc_dentry_cachep == NULL) + if (!ext4_fc_dentry_cachep) return -ENOMEM; + ext4_fc_range_cachep = KMEM_CACHE(ext4_fc_range, SLAB_RECLAIM_ACCOUNT); + if (!ext4_fc_range_cachep) { + kmem_cache_destroy(ext4_fc_dentry_cachep); + return -ENOMEM; + } + return 0; } void ext4_fc_destroy_dentry_cache(void) { + kmem_cache_destroy(ext4_fc_range_cachep); kmem_cache_destroy(ext4_fc_dentry_cachep); } diff --git a/fs/ext4/hash-test.c b/fs/ext4/hash-test.c new file mode 100644 index 000000000000..49b0d874c833 --- /dev/null +++ b/fs/ext4/hash-test.c @@ -0,0 +1,567 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit tests for ext4 directory hash computation. 
+ */ + +#include <kunit/test.h> +#include <kunit/resource.h> +#include <linux/fs.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/unicode.h> +#include "ext4.h" + +static void ext4_hash_init_fake_dir(struct inode *dir, struct super_block *sb) +{ + memset(sb, 0, sizeof(*sb)); + memset(dir, 0, sizeof(*dir)); + dir->i_sb = sb; + strscpy(sb->s_id, "kunit-ext4", sizeof(sb->s_id)); +} + +static void ext4_hash_init_fake_dir_with_sbi(struct inode *dir, + struct super_block *sb, + struct ext4_sb_info *sbi) +{ + ext4_hash_init_fake_dir(dir, sb); + memset(sbi, 0, sizeof(*sbi)); + sb->s_fs_info = sbi; + sbi->s_sb = sb; +} + +#ifdef CONFIG_FS_ENCRYPTION +static const struct fscrypt_operations ext4_hash_test_cryptops = { + .inode_info_offs = + (int)offsetof(struct ext4_inode_info, i_crypt_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), +}; +#endif + +static void ext4_hash_init_fake_ext4_dir(struct ext4_inode_info *ei, + struct super_block *sb, + struct ext4_sb_info *sbi) +{ + struct inode *dir = &ei->vfs_inode; + + memset(sb, 0, sizeof(*sb)); + memset(ei, 0, sizeof(*ei)); + memset(sbi, 0, sizeof(*sbi)); + + strscpy(sb->s_id, "kunit-ext4", sizeof(sb->s_id)); + sb->s_fs_info = sbi; + sbi->s_sb = sb; + + dir->i_sb = sb; + dir->i_mode = S_IFDIR; + +#ifdef CONFIG_FS_ENCRYPTION + fscrypt_set_ops(sb, &ext4_hash_test_cryptops); +#endif +} + +struct ext4_dirhash_test_case { + const char *name; + u32 hash_version; + const char *input; + int len; + u32 seed[4]; + bool use_seed; + u32 expected_hash; + u32 expected_minor_hash; +}; + +static const struct ext4_dirhash_test_case ext4_dirhash_test_cases[] = { + { + .name = "legacy_abc", + .hash_version = DX_HASH_LEGACY, + .input = "abc", + .len = 3, + .use_seed = false, + .expected_hash = 0x75afd992, + .expected_minor_hash = 0x00000000, + }, + { + .name = "legacy_unsigned_abc", + .hash_version = DX_HASH_LEGACY_UNSIGNED, + .input = "abc", + .len = 3, + .use_seed = false, + .expected_hash = 0x75afd992, + 
.expected_minor_hash = 0x00000000, + }, + { + .name = "half_md4_abc", + .hash_version = DX_HASH_HALF_MD4, + .input = "abc", + .len = 3, + .use_seed = false, + .expected_hash = 0xd196a868, + .expected_minor_hash = 0xc420eb28, + }, + { + .name = "half_md4_unsigned_abc", + .hash_version = DX_HASH_HALF_MD4_UNSIGNED, + .input = "abc", + .len = 3, + .use_seed = false, + .expected_hash = 0xd196a868, + .expected_minor_hash = 0xc420eb28, + }, + { + .name = "tea_abc", + .hash_version = DX_HASH_TEA, + .input = "abc", + .len = 3, + .use_seed = false, + .expected_hash = 0xb1435ec4, + .expected_minor_hash = 0x3f7eaa0e, + }, + { + .name = "tea_unsigned_abc", + .hash_version = DX_HASH_TEA_UNSIGNED, + .input = "abc", + .len = 3, + .use_seed = false, + .expected_hash = 0xb1435ec4, + .expected_minor_hash = 0x3f7eaa0e, + }, + { + .name = "empty_half_md4", + .hash_version = DX_HASH_HALF_MD4, + .input = "", + .len = 0, + .use_seed = false, + .expected_hash = 0xefcdab88, + .expected_minor_hash = 0x98badcfe, + }, + { + .name = "half_md4_31bytes", + .hash_version = DX_HASH_HALF_MD4, + .input = "1234567890123456789012345678901", + .len = 31, + .use_seed = false, + .expected_hash = 0xc4db1f78, + .expected_minor_hash = 0xea23921b, + }, + { + .name = "half_md4_32bytes", + .hash_version = DX_HASH_HALF_MD4, + .input = "12345678901234567890123456789012", + .len = 32, + .use_seed = false, + .expected_hash = 0xfa6cc63e, + .expected_minor_hash = 0x2f77bd1c, + }, + { + .name = "half_md4_33bytes", + .hash_version = DX_HASH_HALF_MD4, + .input = "123456789012345678901234567890123", + .len = 33, + .use_seed = false, + .expected_hash = 0xdc0c2dec, + .expected_minor_hash = 0x5ca23365, + }, + { + .name = "half_md4_unsigned_31bytes", + .hash_version = DX_HASH_HALF_MD4_UNSIGNED, + .input = "1234567890123456789012345678901", + .len = 31, + .use_seed = false, + .expected_hash = 0xc4db1f78, + .expected_minor_hash = 0xea23921b, + }, + { + .name = "half_md4_unsigned_32bytes", + .hash_version = 
DX_HASH_HALF_MD4_UNSIGNED, + .input = "12345678901234567890123456789012", + .len = 32, + .use_seed = false, + .expected_hash = 0xfa6cc63e, + .expected_minor_hash = 0x2f77bd1c, + }, + { + .name = "half_md4_unsigned_33bytes", + .hash_version = DX_HASH_HALF_MD4_UNSIGNED, + .input = "123456789012345678901234567890123", + .len = 33, + .use_seed = false, + .expected_hash = 0xdc0c2dec, + .expected_minor_hash = 0x5ca23365, + }, + { + .name = "tea_15bytes", + .hash_version = DX_HASH_TEA, + .input = "123456789abcdef", + .len = 15, + .use_seed = false, + .expected_hash = 0xa562903a, + .expected_minor_hash = 0x6174a00f, + }, + { + .name = "tea_16bytes", + .hash_version = DX_HASH_TEA, + .input = "1234567890abcdef", + .len = 16, + .use_seed = false, + .expected_hash = 0x8449f258, + .expected_minor_hash = 0x49a16d46, + }, + { + .name = "tea_17bytes", + .hash_version = DX_HASH_TEA, + .input = "123456789abcdefgh", + .len = 17, + .use_seed = false, + .expected_hash = 0xf32ec10c, + .expected_minor_hash = 0x58ceae61, + }, + { + .name = "half_md4_seeded", + .hash_version = DX_HASH_HALF_MD4, + .input = "same-name", + .len = 9, + .seed = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 }, + .use_seed = true, + .expected_hash = 0x8aebf604, + .expected_minor_hash = 0x66ce48fe, + }, + { + .name = "half_md4_non_ascii_signed", + .hash_version = DX_HASH_HALF_MD4, + .input = "\x80\x81\x82\x83\x84", + .len = 5, + .use_seed = false, + .expected_hash = 0x8bab0498, + .expected_minor_hash = 0xc326632d, + }, + { + .name = "half_md4_non_ascii_unsigned", + .hash_version = DX_HASH_HALF_MD4_UNSIGNED, + .input = "\x80\x81\x82\x83\x84", + .len = 5, + .use_seed = false, + .expected_hash = 0xbc48596e, + .expected_minor_hash = 0xde0fad41, + }, + { + .name = "tea_non_ascii_signed", + .hash_version = DX_HASH_TEA, + .input = "\x80\x81\x82\x83\x84", + .len = 5, + .use_seed = false, + .expected_hash = 0x21e3a154, + .expected_minor_hash = 0x90112c3d, + }, + { + .name = "tea_non_ascii_unsigned", + .hash_version = 
DX_HASH_TEA_UNSIGNED, + .input = "\x80\x81\x82\x83\x84", + .len = 5, + .use_seed = false, + .expected_hash = 0x9b648616, + .expected_minor_hash = 0x011dd507, + }, +}; + +static void test_ext4fs_dirhash_vectors(struct kunit *test) +{ + struct super_block *sb; + struct inode *dir; + int i; + + sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL); + dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, sb); + KUNIT_ASSERT_NOT_NULL(test, dir); + + ext4_hash_init_fake_dir(dir, sb); + + for (i = 0; i < ARRAY_SIZE(ext4_dirhash_test_cases); i++) { + const struct ext4_dirhash_test_case *tc = + &ext4_dirhash_test_cases[i]; + struct dx_hash_info hinfo; + int ret; + + memset(&hinfo, 0, sizeof(hinfo)); + hinfo.hash_version = tc->hash_version; + hinfo.seed = tc->use_seed ? (u32 *)tc->seed : NULL; + + ret = ext4fs_dirhash(dir, tc->input, tc->len, &hinfo); + + KUNIT_ASSERT_EQ_MSG(test, ret, 0, "case=%s", tc->name); + KUNIT_EXPECT_EQ_MSG(test, hinfo.hash, tc->expected_hash, + "case=%s", tc->name); + KUNIT_EXPECT_EQ_MSG(test, hinfo.minor_hash, + tc->expected_minor_hash, + "case=%s", tc->name); + } +} + +static void test_ext4fs_dirhash_seed_changes_result(struct kunit *test) +{ + struct super_block *sb; + struct inode *dir; + u32 seed[4] = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 }; + struct dx_hash_info plain = { + .hash_version = DX_HASH_HALF_MD4, + }; + struct dx_hash_info seeded = { + .hash_version = DX_HASH_HALF_MD4, + .seed = seed, + }; + int ret_plain, ret_seeded; + + sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL); + dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, sb); + KUNIT_ASSERT_NOT_NULL(test, dir); + + ext4_hash_init_fake_dir(dir, sb); + + ret_plain = ext4fs_dirhash(dir, "same-name", 9, &plain); + ret_seeded = ext4fs_dirhash(dir, "same-name", 9, &seeded); + + KUNIT_ASSERT_EQ(test, ret_plain, 0); + KUNIT_ASSERT_EQ(test, ret_seeded, 0); + + KUNIT_EXPECT_TRUE(test, + plain.hash != seeded.hash || + 
plain.minor_hash != seeded.minor_hash); +} + +static void test_ext4fs_dirhash_invalid_version_returns_einval(struct kunit *test) +{ + struct super_block *sb; + struct inode *dir; + struct ext4_sb_info *sbi; + struct dx_hash_info hinfo = { + .hash = 0xdeadbeef, + .minor_hash = 0xcafebabe, + .hash_version = DX_HASH_LAST + 1, + }; + int ret; + + sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL); + dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL); + sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, sb); + KUNIT_ASSERT_NOT_NULL(test, dir); + KUNIT_ASSERT_NOT_NULL(test, sbi); + + ext4_hash_init_fake_dir_with_sbi(dir, sb, sbi); + + ret = ext4fs_dirhash(dir, "abc", 3, &hinfo); + + KUNIT_EXPECT_EQ(test, ret, -EINVAL); + KUNIT_EXPECT_EQ(test, hinfo.hash, 0); + KUNIT_EXPECT_EQ(test, hinfo.minor_hash, 0); +} + +static void test_ext4fs_dirhash_siphash_without_key_returns_einval(struct kunit *test) +{ + struct super_block *sb; + struct ext4_inode_info *ei; + struct inode *dir; + struct ext4_sb_info *sbi; + struct dx_hash_info hinfo = { + .hash_version = DX_HASH_SIPHASH, + }; + int ret; + + sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL); + ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL); + sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, sb); + KUNIT_ASSERT_NOT_NULL(test, ei); + KUNIT_ASSERT_NOT_NULL(test, sbi); + + ext4_hash_init_fake_ext4_dir(ei, sb, sbi); + dir = &ei->vfs_inode; + + ret = ext4fs_dirhash(dir, "name", strlen("name"), &hinfo); + + KUNIT_EXPECT_EQ(test, ret, -EINVAL); +} + +static void test_ext4fs_dirhash_signed_unsigned_differ_on_nonascii(struct kunit *test) +{ + struct super_block *sb; + struct inode *dir; + static const char input[] = "\x80\xff\x81\xfe\101bc"; + struct dx_hash_info legacy_signed = { + .hash_version = DX_HASH_LEGACY, + }; + struct dx_hash_info legacy_unsigned = { + .hash_version = DX_HASH_LEGACY_UNSIGNED, + }; + struct dx_hash_info md4_signed = { + .hash_version = 
DX_HASH_HALF_MD4, + }; + struct dx_hash_info md4_unsigned = { + .hash_version = DX_HASH_HALF_MD4_UNSIGNED, + }; + struct dx_hash_info tea_signed = { + .hash_version = DX_HASH_TEA, + }; + struct dx_hash_info tea_unsigned = { + .hash_version = DX_HASH_TEA_UNSIGNED, + }; + int ret; + + sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL); + dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, sb); + KUNIT_ASSERT_NOT_NULL(test, dir); + + ext4_hash_init_fake_dir(dir, sb); + + ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &legacy_signed); + KUNIT_ASSERT_EQ(test, ret, 0); + ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &legacy_unsigned); + KUNIT_ASSERT_EQ(test, ret, 0); + KUNIT_EXPECT_NE(test, legacy_signed.hash, legacy_unsigned.hash); + + ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &md4_signed); + KUNIT_ASSERT_EQ(test, ret, 0); + ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &md4_unsigned); + KUNIT_ASSERT_EQ(test, ret, 0); + KUNIT_EXPECT_TRUE(test, + md4_signed.hash != md4_unsigned.hash || + md4_signed.minor_hash != md4_unsigned.minor_hash); + + ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &tea_signed); + KUNIT_ASSERT_EQ(test, ret, 0); + ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &tea_unsigned); + KUNIT_ASSERT_EQ(test, ret, 0); + KUNIT_EXPECT_TRUE(test, + tea_signed.hash != tea_unsigned.hash || + tea_signed.minor_hash != tea_unsigned.minor_hash); +} + +#if IS_ENABLED(CONFIG_UNICODE) +KUNIT_DEFINE_ACTION_WRAPPER(utf8_unload_action, utf8_unload, + struct unicode_map *); +static void test_ext4fs_dirhash_casefolded_names_hash_consistently(struct kunit *test) +{ + struct super_block *sb; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + struct unicode_map *um; + struct dx_hash_info h1 = { + .hash_version = DX_HASH_HALF_MD4, + }; + struct dx_hash_info h2 = { + .hash_version = DX_HASH_HALF_MD4, + }; + int ret, ret1, ret2; + + sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL); + ei = 
kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL); + sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, sb); + KUNIT_ASSERT_NOT_NULL(test, ei); + KUNIT_ASSERT_NOT_NULL(test, sbi); + + um = utf8_load(UTF8_LATEST); + if (IS_ERR(um)) { + kunit_skip(test, "utf8_load(UTF8_LATEST) failed: %pe", + um); + return; + } + + ret = kunit_add_action_or_reset(test, utf8_unload_action, um); + KUNIT_ASSERT_EQ(test, ret, 0); + + ext4_hash_init_fake_ext4_dir(ei, sb, sbi); + sb->s_encoding = um; + ei->vfs_inode.i_flags |= S_CASEFOLD; + + KUNIT_ASSERT_TRUE(test, IS_CASEFOLDED(&ei->vfs_inode)); + + ret1 = ext4fs_dirhash(&ei->vfs_inode, "Alpha", 5, &h1); + ret2 = ext4fs_dirhash(&ei->vfs_inode, "aLPHa", 5, &h2); + + KUNIT_ASSERT_EQ(test, ret1, 0); + KUNIT_ASSERT_EQ(test, ret2, 0); + KUNIT_EXPECT_EQ(test, h1.hash, h2.hash); + KUNIT_EXPECT_EQ(test, h1.minor_hash, h2.minor_hash); +} + +static void test_ext4fs_dirhash_casefold_fallback(struct kunit *test) +{ + struct super_block *sb_cf, *sb_plain; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + struct inode *plain_dir; + struct unicode_map *um; + static const char invalid_utf8[] = "\xc3\x28"; + struct dx_hash_info folded_dir = { + .hash_version = DX_HASH_HALF_MD4, + }; + struct dx_hash_info plain = { + .hash_version = DX_HASH_HALF_MD4, + }; + int ret, ret_cf, ret_plain; + + sb_cf = kunit_kzalloc(test, sizeof(*sb_cf), GFP_KERNEL); + sb_plain = kunit_kzalloc(test, sizeof(*sb_plain), GFP_KERNEL); + ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL); + sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL); + plain_dir = kunit_kzalloc(test, sizeof(*plain_dir), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, sb_cf); + KUNIT_ASSERT_NOT_NULL(test, sb_plain); + KUNIT_ASSERT_NOT_NULL(test, ei); + KUNIT_ASSERT_NOT_NULL(test, sbi); + KUNIT_ASSERT_NOT_NULL(test, plain_dir); + + um = utf8_load(UTF8_LATEST); + if (IS_ERR(um)) { + kunit_skip(test, "utf8_load(UTF8_LATEST) failed: %pe", + um); + return; + } + + ret = 
kunit_add_action_or_reset(test, utf8_unload_action, um); + KUNIT_ASSERT_EQ(test, ret, 0); + + ext4_hash_init_fake_ext4_dir(ei, sb_cf, sbi); + sb_cf->s_encoding = um; + ei->vfs_inode.i_flags |= S_CASEFOLD; + + KUNIT_ASSERT_TRUE(test, IS_CASEFOLDED(&ei->vfs_inode)); + + ext4_hash_init_fake_dir(plain_dir, sb_plain); + + ret_cf = ext4fs_dirhash(&ei->vfs_inode, invalid_utf8, + sizeof(invalid_utf8) - 1, &folded_dir); + ret_plain = ext4fs_dirhash(plain_dir, invalid_utf8, + sizeof(invalid_utf8) - 1, &plain); + + KUNIT_ASSERT_EQ(test, ret_cf, 0); + KUNIT_ASSERT_EQ(test, ret_plain, 0); + KUNIT_EXPECT_EQ(test, folded_dir.hash, plain.hash); + KUNIT_EXPECT_EQ(test, folded_dir.minor_hash, plain.minor_hash); +} +#endif + +static struct kunit_case ext4_hash_test_cases[] = { + KUNIT_CASE(test_ext4fs_dirhash_vectors), + KUNIT_CASE(test_ext4fs_dirhash_seed_changes_result), + KUNIT_CASE(test_ext4fs_dirhash_invalid_version_returns_einval), + KUNIT_CASE(test_ext4fs_dirhash_siphash_without_key_returns_einval), + KUNIT_CASE(test_ext4fs_dirhash_signed_unsigned_differ_on_nonascii), +#if IS_ENABLED(CONFIG_UNICODE) + KUNIT_CASE(test_ext4fs_dirhash_casefolded_names_hash_consistently), + KUNIT_CASE(test_ext4fs_dirhash_casefold_fallback), +#endif + {} +}; + +static struct kunit_suite ext4_hash_test_suite = { + .name = "ext4_hash", + .test_cases = ext4_hash_test_cases, +}; + +kunit_test_suites(&ext4_hash_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 48483cd015d3..978bd92da0ad 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -9,6 +9,7 @@ #include <linux/unicode.h> #include <linux/compiler.h> #include <linux/bitops.h> +#include <linux/unaligned.h> #include "ext4.h" #define DELTA 0x9E3779B9 @@ -141,21 +142,28 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; - val = pad; if (len > num*4) len = num * 4; - for (i = 0; i < len; i++) { - val = ((int) scp[i]) + (val << 8); 
- if ((i % 4) == 3) { - *buf++ = val; - val = pad; - num--; - } + + while (len >= 4) { + val = ((__u32)scp[0] << 24) + ((__u32)scp[1] << 16) + ((__u32)scp[2] << 8) + scp[3]; + *buf++ = val; + scp += 4; + len -= 4; + num--; } + + val = pad; + + for (i = 0; i < len; i++) + val = scp[i] + (val << 8); + if (--num >= 0) *buf++ = val; + while (--num >= 0) *buf++ = pad; + } static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) @@ -167,21 +175,28 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; - val = pad; if (len > num*4) len = num * 4; - for (i = 0; i < len; i++) { - val = ((int) ucp[i]) + (val << 8); - if ((i % 4) == 3) { - *buf++ = val; - val = pad; - num--; - } + + while (len >= 4) { + val = get_unaligned_be32(ucp); + *buf++ = val; + ucp += 4; + len -= 4; + num--; } + + val = pad; + + for (i = 0; i < len; i++) + val = ucp[i] + (val << 8); + if (--num >= 0) *buf++ = val; + while (--num >= 0) *buf++ = pad; + } /* @@ -205,8 +220,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, const char *p; int i; __u32 in[8], buf[4]; - void (*str2hashbuf)(const char *, int, __u32 *, int) = - str2hashbuf_signed; + bool use_unsigned = false; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; @@ -232,12 +246,15 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, hash = dx_hack_hash_signed(name, len); break; case DX_HASH_HALF_MD4_UNSIGNED: - str2hashbuf = str2hashbuf_unsigned; + use_unsigned = true; fallthrough; case DX_HASH_HALF_MD4: p = name; while (len > 0) { - (*str2hashbuf)(p, len, in, 8); + if (use_unsigned) + str2hashbuf_unsigned(p, len, in, 8); + else + str2hashbuf_signed(p, len, in, 8); half_md4_transform(buf, in); len -= 32; p += 32; @@ -246,12 +263,15 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, hash = buf[1]; break; case 
DX_HASH_TEA_UNSIGNED: - str2hashbuf = str2hashbuf_unsigned; + use_unsigned = true; fallthrough; case DX_HASH_TEA: p = name; while (len > 0) { - (*str2hashbuf)(p, len, in, 4); + if (use_unsigned) + str2hashbuf_unsigned(p, len, in, 4); + else + str2hashbuf_signed(p, len, in, 4); TEA_transform(buf, in); len -= 16; p += 16; @@ -321,3 +341,7 @@ opaque_seq: #endif return __ext4fs_dirhash(dir, name, len, hinfo); } + +#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4fs_dirhash); +#endif diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c2c2d6ac7f3d..ce99807c5f5b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1560,7 +1560,8 @@ static int ext4_journalled_write_end(const struct kiocb *iocb, BUG_ON(!ext4_handle_valid(handle)); - if (ext4_has_inline_data(inode)) + if (ext4_has_inline_data(inode) && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); @@ -5025,6 +5026,57 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) return ret; } +/* + * ext4_get_inode_loc_noio() is a best-effort variant of ext4_get_inode_loc(). + * It looks up the inode table block in the buffer cache and returns -EAGAIN if + * the block is not present or not uptodate, without starting any I/O. + */ +int ext4_get_inode_loc_noio(struct inode *inode, struct ext4_iloc *iloc) +{ + struct super_block *sb = inode->i_sb; + struct ext4_group_desc *gdp; + struct buffer_head *bh; + ext4_fsblk_t block; + int inodes_per_block, inode_offset; + unsigned long ino = inode->i_ino; + + iloc->bh = NULL; + if (ino < EXT4_ROOT_INO || + ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) + return -EFSCORRUPTED; + + iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); + if (!gdp) + return -EIO; + + /* Figure out the offset within the block group inode table. 
*/ + inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); + iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + + block = ext4_inode_table(sb, gdp); + if (block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) || + block >= ext4_blocks_count(EXT4_SB(sb)->s_es)) { + ext4_error(sb, + "Invalid inode table block %llu in block_group %u", + block, iloc->block_group); + return -EFSCORRUPTED; + } + block += inode_offset / inodes_per_block; + + bh = sb_find_get_block(sb, block); + if (!bh) + return -EAGAIN; + if (!ext4_buffer_uptodate(bh)) { + brelse(bh); + return -EAGAIN; + } + + iloc->bh = bh; + return 0; +} + int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 1d0c3d4bdf47..c8387e6a2c6e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -830,11 +830,17 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) bdev_thaw(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: + /* + * Call ext4_force_commit() before setting EXT4_FLAGS_SHUTDOWN. + * This is because in data=ordered mode, journal commit + * triggers data writeback which fails if shutdown is already + * set, causing the journal to be aborted prematurely before + * the commit succeeds. 
+ */ + (void) ext4_force_commit(sb); set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); - if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { - (void) ext4_force_commit(sb); + if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); - } break; case EXT4_GOING_FLAGS_NOLOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); @@ -1650,6 +1656,9 @@ group_extend_out: if (!(fd_file(donor)->f_mode & FMODE_WRITE)) return -EBADF; + if (file_inode(filp)->i_sb != file_inode(fd_file(donor))->i_sb) + return -EXDEV; + err = mnt_want_write_file(filp); if (err) return err; diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index d90da44aadbd..0424b8b0b4c3 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -727,8 +727,7 @@ do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap, ext4_mb_generate_buddy_test(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP, ext4_grp); - KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize), - 0); + KUNIT_ASSERT_MEMEQ(test, mbt_buddy, ext4_buddy, sb->s_blocksize); mbt_validate_group_info(test, mbt_grp, ext4_grp); } @@ -789,8 +788,7 @@ test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b, grp->bb_counters[i] = 0; ext4_mb_generate_buddy_test(sb, buddy, bitmap, 0, grp); - KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), - 0); + KUNIT_ASSERT_MEMEQ(test, buddy, e4b->bd_buddy, sb->s_blocksize); mbt_validate_group_info(test, grp, e4b->bd_info); } @@ -854,8 +852,7 @@ test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b, grp->bb_counters[i] = 0; ext4_mb_generate_buddy_test(sb, buddy, bitmap, 0, grp); - KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), - 0); + KUNIT_ASSERT_MEMEQ(test, buddy, e4b->bd_buddy, sb->s_blocksize); mbt_validate_group_info(test, grp, e4b->bd_info); } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 4a47fbd8dd30..cc49ae04a6f6 100644 --- 
a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -145,9 +145,9 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, func, line, "inode #%llu: lblock %lu: comm %s: " - "error %ld reading directory block", + "error %pe reading directory block", inode->i_ino, (unsigned long)block, - current->comm, PTR_ERR(bh)); + current->comm, bh); return bh; } @@ -3054,7 +3054,7 @@ out_stop: out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; - return ERR_PTR(err); + return err ? ERR_PTR(err) : NULL; } /* diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index dc82e7b57e75..bc674aa4a656 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -168,7 +168,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) * written. On IO failure, check if journal abort is needed. Note that * we are protected from truncate touching same part of extent tree by the * fact that truncate code waits for all DIO to finish (thus exclusion from - * direct IO is achieved) and also waits for PageWriteback bits. Thus we + * direct IO is achieved) and also waits for writeback to complete. Thus we * cannot get to ext4_ext_truncate() before all IOs overlapping that range are * completed (happens from ext4_free_ioend()). 
*/ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7283108d7609..245f67d10ded 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1431,6 +1431,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ext4_fc_init_inode(&ei->vfs_inode); spin_lock_init(&ei->i_fc_lock); mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data); +#ifdef CONFIG_LOCKDEP + lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_NORMAL); +#endif return &ei->vfs_inode; } @@ -4541,6 +4544,7 @@ static void ext4_fast_commit_init(struct super_block *sb) sbi->s_fc_ineligible_tid = 0; mutex_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); + memset(&sbi->s_fc_snap_stats, 0, sizeof(sbi->s_fc_snap_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; sbi->s_fc_replay_state.fc_regions_used = 0; @@ -5910,6 +5914,11 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, return ERR_PTR(-EFSCORRUPTED); } +#ifdef CONFIG_LOCKDEP + lockdep_set_subclass(&EXT4_I(journal_inode)->i_data_sem, + I_DATA_SEM_JOURNAL); +#endif + ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); return journal_inode; @@ -5977,8 +5986,8 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb, sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { ext4_msg(sb, KERN_ERR, - "failed to open journal device unknown-block(%u,%u) %ld", - MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file)); + "failed to open journal device unknown-block(%u,%u) %pe", + MAJOR(j_dev), MINOR(j_dev), bdev_file); return bdev_file; } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index d8577725a2fb..3029cb6f6d64 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -512,10 +512,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) * leave undo-committed data. 
*/ if (jh->b_committed_data) { - struct buffer_head *bh = jh2bh(jh); - spin_lock(&jh->b_state_lock); - jbd2_free(jh->b_committed_data, bh->b_size); + kfree(jh->b_committed_data); jh->b_committed_data = NULL; spin_unlock(&jh->b_state_lock); } @@ -976,7 +974,7 @@ restart_loop: * its triggers if they exist, so we can clear that too. */ if (jh->b_committed_data) { - jbd2_free(jh->b_committed_data, bh->b_size); + kfree(jh->b_committed_data); jh->b_committed_data = NULL; if (jh->b_frozen_data) { jh->b_committed_data = jh->b_frozen_data; @@ -984,7 +982,7 @@ restart_loop: jh->b_frozen_triggers = NULL; } } else if (jh->b_frozen_data) { - jbd2_free(jh->b_frozen_data, bh->b_size); + kfree(jh->b_frozen_data); jh->b_frozen_data = NULL; jh->b_frozen_triggers = NULL; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e82798680109..09efa337649e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -95,8 +95,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); EXPORT_SYMBOL(jbd2_inode_cache); -static int jbd2_journal_create_slab(size_t slab_size); - #ifdef CONFIG_JBD2_DEBUG void __jbd2_debug(int level, const char *file, const char *func, unsigned int line, const char *fmt, ...) 
@@ -385,10 +383,10 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, goto escape_done; spin_unlock(&jh_in->b_state_lock); - tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL); + tmp = kmalloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL); spin_lock(&jh_in->b_state_lock); if (jh_in->b_frozen_data) { - jbd2_free(tmp, bh_in->b_size); + kfree(tmp); goto copy_done; } @@ -2062,14 +2060,6 @@ EXPORT_SYMBOL(jbd2_journal_update_sb_errno); int jbd2_journal_load(journal_t *journal) { int err; - journal_superblock_t *sb = journal->j_superblock; - - /* - * Create a slab for this blocksize - */ - err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize)); - if (err) - return err; /* Let the recovery code check whether it needs to recover any * data from the journal. */ @@ -2261,6 +2251,8 @@ jbd2_journal_initialize_fast_commit(journal_t *journal) unsigned long long num_fc_blks; num_fc_blks = jbd2_journal_get_num_fc_blks(sb); + if (num_fc_blks > journal->j_last) + return -EFSCORRUPTED; if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) return -ENOSPC; @@ -2698,105 +2690,6 @@ size_t journal_tag_bytes(journal_t *journal) } /* - * JBD memory management - * - * These functions are used to allocate block-sized chunks of memory - * used for making copies of buffer_head data. Very often it will be - * page-sized chunks of data, but sometimes it will be in - * sub-page-size chunks. (For example, 16k pages on Power systems - * with a 4k block file system.) For blocks smaller than a page, we - * use a SLAB allocator. There are slab caches for each block size, - * which are allocated at mount time, if necessary, and we only free - * (all of) the slab caches when/if the jbd2 module is unloaded. For - * this reason we don't need to a mutex to protect access to - * jbd2_slab[] allocating or releasing memory; only in - * jbd2_journal_create_slab(). 
- */ -#define JBD2_MAX_SLABS 8 -static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; - -static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { - "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", - "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k" -}; - - -static void jbd2_journal_destroy_slabs(void) -{ - int i; - - for (i = 0; i < JBD2_MAX_SLABS; i++) { - kmem_cache_destroy(jbd2_slab[i]); - jbd2_slab[i] = NULL; - } -} - -static int jbd2_journal_create_slab(size_t size) -{ - static DEFINE_MUTEX(jbd2_slab_create_mutex); - int i = order_base_2(size) - 10; - size_t slab_size; - - if (size == PAGE_SIZE) - return 0; - - if (i >= JBD2_MAX_SLABS) - return -EINVAL; - - if (unlikely(i < 0)) - i = 0; - mutex_lock(&jbd2_slab_create_mutex); - if (jbd2_slab[i]) { - mutex_unlock(&jbd2_slab_create_mutex); - return 0; /* Already created */ - } - - slab_size = 1 << (i+10); - jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, - slab_size, 0, NULL); - mutex_unlock(&jbd2_slab_create_mutex); - if (!jbd2_slab[i]) { - printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); - return -ENOMEM; - } - return 0; -} - -static struct kmem_cache *get_slab(size_t size) -{ - int i = order_base_2(size) - 10; - - BUG_ON(i >= JBD2_MAX_SLABS); - if (unlikely(i < 0)) - i = 0; - BUG_ON(jbd2_slab[i] == NULL); - return jbd2_slab[i]; -} - -void *jbd2_alloc(size_t size, gfp_t flags) -{ - void *ptr; - - BUG_ON(size & (size-1)); /* Must be a power of 2 */ - - if (size < PAGE_SIZE) - ptr = kmem_cache_alloc(get_slab(size), flags); - else - ptr = kmalloc(size, flags); - - /* Check alignment; SLUB has gotten this wrong in the past, - * and this can lead to user data corruption! 
*/ - BUG_ON(((unsigned long) ptr) & (size-1)); - - return ptr; -} - -void jbd2_free(void *ptr, size_t size) -{ - kfree(ptr); -}; - -/* * Journal_head storage management */ static struct kmem_cache *jbd2_journal_head_cache; @@ -2969,15 +2862,15 @@ static void __journal_remove_journal_head(struct buffer_head *bh) clear_buffer_jbd(bh); } -static void journal_release_journal_head(struct journal_head *jh, size_t b_size) +static void journal_release_journal_head(struct journal_head *jh) { if (jh->b_frozen_data) { printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); - jbd2_free(jh->b_frozen_data, b_size); + kfree(jh->b_frozen_data); } if (jh->b_committed_data) { printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); - jbd2_free(jh->b_committed_data, b_size); + kfree(jh->b_committed_data); } journal_free_journal_head(jh); } @@ -2996,7 +2889,7 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) if (!jh->b_jcount) { __journal_remove_journal_head(bh); jbd_unlock_bh_journal_head(bh); - journal_release_journal_head(jh, bh->b_size); + journal_release_journal_head(jh); __brelse(bh); } else { jbd_unlock_bh_journal_head(bh); @@ -3138,7 +3031,6 @@ static void jbd2_journal_destroy_caches(void) jbd2_journal_destroy_handle_cache(); jbd2_journal_destroy_inode_cache(); jbd2_journal_destroy_transaction_cache(); - jbd2_journal_destroy_slabs(); } static int __init journal_init(void) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 4885903bbd10..5cc7d097b2ac 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1131,7 +1131,7 @@ repeat: if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); spin_unlock(&jh->b_state_lock); - frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, + frozen_buffer = kmalloc(jh2bh(jh)->b_size, GFP_NOFS | __GFP_NOFAIL); goto repeat; } @@ -1159,7 +1159,7 @@ done: out: if (unlikely(frozen_buffer)) /* It's usually NULL */ - jbd2_free(frozen_buffer, bh->b_size); + kfree(frozen_buffer); JBUFFER_TRACE(jh, 
"exit"); return error; @@ -1424,7 +1424,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) repeat: if (!jh->b_committed_data) - committed_data = jbd2_alloc(jh2bh(jh)->b_size, + committed_data = kmalloc(jh2bh(jh)->b_size, GFP_NOFS|__GFP_NOFAIL); spin_lock(&jh->b_state_lock); @@ -1445,7 +1445,7 @@ repeat: out: jbd2_journal_put_journal_head(jh); if (unlikely(committed_data)) - jbd2_free(committed_data, bh->b_size); + kfree(committed_data); return err; } @@ -1516,14 +1516,19 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, */ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + transaction_t *transaction; + journal_t *journal; struct journal_head *jh; int ret = 0; + if (is_handle_aborted(handle)) + return -EROFS; if (!buffer_jbd(bh)) return -EUCLEAN; + transaction = handle->h_transaction; + journal = transaction->t_journal; + /* * We don't grab jh reference here since the buffer must be part * of the running transaction. diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 7e785aa6d35d..b68561187e90 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -63,9 +63,6 @@ void __jbd2_debug(int level, const char *file, const char *func, #define jbd2_debug(n, fmt, a...) 
no_printk(fmt, ##a) #endif -extern void *jbd2_alloc(size_t size, gfp_t flags); -extern void jbd2_free(void *ptr, size_t size); - #define JBD2_MIN_JOURNAL_BLOCKS 1024 #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index f493642cf121..7028a28316fa 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -107,6 +107,26 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY); TRACE_DEFINE_ENUM(EXT4_FC_REASON_MOVE_EXT); TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); +#undef EM +#undef EMe +#define EM(a) TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a); +#define EMe(a) TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a); + +#define TRACE_SNAP_ERR \ + EM(NONE) \ + EM(ES_MISS) \ + EM(ES_DELAYED) \ + EM(ES_OTHER) \ + EM(INODES_CAP) \ + EM(RANGES_CAP) \ + EM(NOMEM) \ + EMe(INODE_LOC) + +TRACE_SNAP_ERR + +#undef EM +#undef EMe + #define show_fc_reason(reason) \ __print_symbolic(reason, \ { EXT4_FC_REASON_XATTR, "XATTR"}, \ @@ -2818,6 +2838,47 @@ TRACE_EVENT(ext4_fc_commit_stop, __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid) ); +#define EM(a) { EXT4_FC_SNAP_ERR_##a, #a }, +#define EMe(a) { EXT4_FC_SNAP_ERR_##a, #a } + +TRACE_EVENT(ext4_fc_lock_updates, + TP_PROTO(struct super_block *sb, tid_t commit_tid, u64 locked_ns, + unsigned int nr_inodes, unsigned int nr_ranges, int err, + int snap_err), + + TP_ARGS(sb, commit_tid, locked_ns, nr_inodes, nr_ranges, err, snap_err), + + TP_STRUCT__entry(/* entry */ + __field(dev_t, dev) + __field(tid_t, tid) + __field(u64, locked_ns) + __field(unsigned int, nr_inodes) + __field(unsigned int, nr_ranges) + __field(int, err) + __field(int, snap_err) + ), + + TP_fast_assign(/* assign */ + __entry->dev = sb->s_dev; + __entry->tid = commit_tid; + __entry->locked_ns = locked_ns; + __entry->nr_inodes = nr_inodes; + __entry->nr_ranges = nr_ranges; + __entry->err = err; + __entry->snap_err = snap_err; + ), + + TP_printk("dev %d,%d tid %u locked_ns %llu nr_inodes %u nr_ranges %u err %d 
snap_err %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, + __entry->locked_ns, __entry->nr_inodes, __entry->nr_ranges, + __entry->err, __print_symbolic(__entry->snap_err, + TRACE_SNAP_ERR)) +); + +#undef EM +#undef EMe +#undef TRACE_SNAP_ERR + #define FC_REASON_NAME_STAT(reason) \ show_fc_reason(reason), \ __entry->fc_ineligible_rc[reason] |
