17 files changed, 1495 insertions, 335 deletions
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 3baee4e7c1cf..3f9fc0eb8eca 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -15,7 +15,7 @@ ext4-y	:= balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
 ext4-test-objs				+= inode-test.o mballoc-test.o \
-					   extents-test.o
+					   extents-test.o hash-test.o
 obj-$(CONFIG_EXT4_KUNIT_TESTS)		+= ext4-test.o
 ext4-$(CONFIG_FS_VERITY)		+= verity.o
 ext4-$(CONFIG_FS_ENCRYPTION)		+= crypto.o
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6af11f0ff1c5..b37c136ea3ab 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1015,14 +1015,32 @@ do {										\
  *			  than the first
  *  I_DATA_SEM_QUOTA  - Used for quota inodes only
  *  I_DATA_SEM_EA     - Used for ea_inodes only
+ *  I_DATA_SEM_JOURNAL - Used for journal inode only
  */
 enum {
 	I_DATA_SEM_NORMAL = 0,
 	I_DATA_SEM_OTHER,
 	I_DATA_SEM_QUOTA,
-	I_DATA_SEM_EA
+	I_DATA_SEM_EA,
+	I_DATA_SEM_JOURNAL
 };
 
+struct ext4_fc_inode_snap;
+
+/*
+ * Reason codes passed to the ext4_fc_lock_updates tracepoint when a snapshot
+ * fails. Values are explicit so that they stay stable for tooling.
+ */
+enum ext4_fc_snap_err {
+	EXT4_FC_SNAP_ERR_NONE		= 0,
+	EXT4_FC_SNAP_ERR_ES_MISS	= 1,
+	EXT4_FC_SNAP_ERR_ES_DELAYED	= 2,
+	EXT4_FC_SNAP_ERR_ES_OTHER	= 3,
+	EXT4_FC_SNAP_ERR_INODES_CAP	= 4,
+	EXT4_FC_SNAP_ERR_RANGES_CAP	= 5,
+	EXT4_FC_SNAP_ERR_NOMEM		= 6,
+	EXT4_FC_SNAP_ERR_INODE_LOC	= 7,
+};
 
 /*
  * fourth extended file system inode data in memory
@@ -1079,6 +1097,22 @@ struct ext4_inode_info {
 	/* End of lblk range that needs to be committed in this fast commit */
 	ext4_lblk_t i_fc_lblk_len;
 
+	/*
+	 * Commit-time fast commit snapshots.
+	 *
+	 * i_fc_snap is installed and freed under sbi->s_fc_lock. The fast
+	 * commit log writing path reads the snapshot under sbi->s_fc_lock while
+	 * serializing fast commit TLVs.
+	 *
+	 * The snapshot lifetime is bounded by EXT4_STATE_FC_COMMITTING and the
+	 * corresponding cleanup / eviction paths.
+	 *
+	 * i_fc_snap points to per-inode snapshot data for fast commit:
+	 * - a raw inode snapshot for EXT4_FC_TAG_INODE
+	 * - data range records for EXT4_FC_TAG_{ADD,DEL}_RANGE
+	 */
+	struct ext4_fc_inode_snap *i_fc_snap;
+
 	spinlock_t i_raw_lock;	/* protects updates to the raw inode */
 
 	/*
@@ -1517,6 +1551,36 @@ struct ext4_orphan_info {
 };
 
 /*
+ * Ext4 fast commit snapshot statistics.
+ *
+ * These are best-effort counters intended for debugging / performance
+ * introspection; they are not exact under concurrent updates.
+ */
+struct ext4_fc_snap_stats {
+	atomic64_t lock_updates_ns_total;
+	atomic64_t lock_updates_ns_max;
+	atomic64_t lock_updates_samples;
+
+	atomic64_t snap_inodes;
+	atomic64_t snap_ranges;
+
+	atomic64_t snap_fail_es_miss;
+	atomic64_t snap_fail_es_delayed;
+	atomic64_t snap_fail_es_other;
+
+	atomic64_t snap_fail_inodes_cap;
+	atomic64_t snap_fail_ranges_cap;
+	atomic64_t snap_fail_nomem;
+	atomic64_t snap_fail_inode_loc;
+
+	/*
+	 * Missing inode snapshots during log writing should never happen.
+	 * Keep this counter to help catch unexpected regressions.
+	 */
+	atomic64_t snap_fail_no_snap;
+};
+
+/*
  * fourth extended-fs super-block data in memory
  */
 struct ext4_sb_info {
@@ -1790,6 +1854,7 @@ struct ext4_sb_info {
 	struct mutex s_fc_lock;
 	struct buffer_head *s_fc_bh;
 	struct ext4_fc_stats s_fc_stats;
+	struct ext4_fc_snap_stats s_fc_snap_stats;
 	tid_t s_fc_ineligible_tid;
 #ifdef CONFIG_EXT4_DEBUG
 	int s_fc_debug_max_replay;
@@ -1972,6 +2037,7 @@ enum {
 	EXT4_STATE_FC_COMMITTING,	/* Fast commit ongoing */
 	EXT4_STATE_FC_FLUSHING_DATA,	/* Fast commit flushing data */
 	EXT4_STATE_ORPHAN_FILE,		/* Inode orphaned in orphan file */
+	EXT4_STATE_FC_REQUEUE,		/* Inode modified during fast commit */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
@@ -2000,6 +2066,8 @@ EXT4_INODE_BIT_FNS(flag, flags, 0)
 static inline int ext4_test_inode_state(struct inode *inode, int bit);
 static inline void ext4_set_inode_state(struct inode *inode, int bit);
 static inline void ext4_clear_inode_state(struct inode *inode, int bit);
+static inline unsigned long *ext4_inode_state_wait_word(struct inode *inode);
+static inline int ext4_inode_state_wait_bit(int bit);
 #if (BITS_PER_LONG < 64)
 EXT4_INODE_BIT_FNS(state, state_flags, 0)
 
@@ -2015,6 +2083,24 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 	/* We depend on the fact that callers will set i_flags */
 }
 #endif
+
+static inline unsigned long *ext4_inode_state_wait_word(struct inode *inode)
+{
+#if (BITS_PER_LONG < 64)
+	return &EXT4_I(inode)->i_state_flags;
+#else
+	return &EXT4_I(inode)->i_flags;
+#endif
+}
+
+static inline int ext4_inode_state_wait_bit(int bit)
+{
+#if (BITS_PER_LONG < 64)
+	return bit;
+#else
+	return bit + 32;
+#endif
+}
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
@@ -3080,8 +3166,9 @@ extern int  ext4_file_getattr(struct mnt_idmap *, const struct path *,
 			      struct kstat *, u32, unsigned int);
 extern void ext4_dirty_inode(struct inode *, int);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
-extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
-extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc);
+int ext4_get_inode_loc_noio(struct inode *inode, struct ext4_iloc *iloc);
+int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
 			  struct ext4_iloc *iloc);
 extern int ext4_inode_attach_jinode(struct inode *inode);
 extern int ext4_can_truncate(struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 125f628e738a..91c97af64b31 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3268,8 +3268,8 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
 	 */
 	path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL);
 	if (IS_ERR(path)) {
-		EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld",
-				 split, PTR_ERR(path));
+		EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %pe",
+				 split, path);
 		goto out_path;
 	}
 
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 5773b85e43cb..8e2259799614 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -56,21 +56,22 @@
  *     deleted while it is being flushed.
  * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
  *     state.
- * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
- *     all the exsiting handles finish and no new handles can start.
- * [4] Mark all the fast commit eligible inodes as undergoing fast commit
- *     by setting "EXT4_STATE_FC_COMMITTING" state.
- * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
- *     starting of new handles. If new handles try to start an update on
- *     any of the inodes that are being committed, ext4_fc_track_inode()
- *     will block until those inodes have finished the fast commit.
+ * [3] Lock the journal by calling jbd2_journal_lock_updates(). This ensures
+ *     that all the existing handles finish and no new handles can start.
+ * [4] Mark all the fast commit eligible inodes as undergoing fast commit by
+ *     setting "EXT4_STATE_FC_COMMITTING" state, and snapshot the inode state
+ *     needed for log writing.
+ * [5] Unlock the journal by calling jbd2_journal_unlock_updates(). This allows
+ *     starting of new handles. Updates to inodes being fast committed are
+ *     tracked for requeue rather than blocking.
  * [6] Commit all the directory entry updates in the fast commit space.
- * [7] Commit all the changed inodes in the fast commit space and clear
- *     "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [7] Commit all the changed inodes in the fast commit space.
  * [8] Write tail tag (this tag ensures the atomicity, please read the following
  *     section for more details).
+ * [9] Clear "EXT4_STATE_FC_COMMITTING" and wake up waiters in
+ *     ext4_fc_cleanup().
  *
- * All the inode updates must be enclosed within jbd2_jounrnal_start()
+ * All the inode updates must be enclosed within jbd2_journal_start()
  * and jbd2_journal_stop() similar to JBD2 journaling.
  *
  * Fast Commit Ineligibility
@@ -183,6 +184,21 @@
 
 #include <trace/events/ext4.h>
 static struct kmem_cache *ext4_fc_dentry_cachep;
+static struct kmem_cache *ext4_fc_range_cachep;
+
+/*
+ * Avoid spending unbounded time/memory snapshotting highly fragmented files
+ * under jbd2_journal_lock_updates(). If a commit exceeds either limit, fall
+ * back to full commit.
+ */
+#define EXT4_FC_SNAPSHOT_MAX_INODES	1024
+#define EXT4_FC_SNAPSHOT_MAX_RANGES	2048
+
+static inline void ext4_fc_set_snap_err(int *snap_err, int err)
+{
+	if (snap_err && *snap_err == EXT4_FC_SNAP_ERR_NONE)
+		*snap_err = err;
+}
 
 static void ext4_end_buffer_io_sync(struct bio *bio)
 {
@@ -203,6 +219,8 @@ static void ext4_end_buffer_io_sync(struct bio *bio)
 	unlock_buffer(bh);
 }
 
+static void ext4_fc_free_inode_snap(struct inode *inode);
+
 static inline void ext4_fc_reset_inode(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -217,8 +235,10 @@ void ext4_fc_init_inode(struct inode *inode)
 
 	ext4_fc_reset_inode(inode);
 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+	ext4_clear_inode_state(inode, EXT4_STATE_FC_REQUEUE);
 	INIT_LIST_HEAD(&ei->i_fc_list);
 	INIT_LIST_HEAD(&ei->i_fc_dilist);
+	ei->i_fc_snap = NULL;
 }
 
 static bool ext4_fc_disabled(struct super_block *sb)
@@ -234,6 +254,50 @@ static bool ext4_fc_eligible(struct super_block *sb)
 }
 
 /*
+ * Wait for an inode fast-commit state bit to clear while dropping the
+ * fast-commit lock around schedule().
+ */
+static void ext4_fc_wait_inode_state(struct inode *inode, int bit,
+				     int *alloc_ctx)
+{
+	unsigned long *word = ext4_inode_state_wait_word(inode);
+	int wbit = ext4_inode_state_wait_bit(bit);
+	wait_queue_head_t *wq = bit_waitqueue(word, wbit);
+
+	while (ext4_test_inode_state(inode, bit)) {
+		DEFINE_WAIT_BIT(wait, word, wbit);
+
+		/* Queue first, then re-test, so a wake-up cannot be missed. */
+		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+		if (ext4_test_inode_state(inode, bit)) {
+			ext4_fc_unlock(inode->i_sb, *alloc_ctx);
+			schedule();
+			*alloc_ctx = ext4_fc_lock(inode->i_sb);
+		}
+		finish_wait(wq, &wait.wq_entry);
+	}
+}
+
+static inline void ext4_fc_wake_inode_state(struct inode *inode, int bit)
+{
+	wake_up_bit(ext4_inode_state_wait_word(inode),
+		    ext4_inode_state_wait_bit(bit));
+}
+
+static void ext4_fc_snap_stats_update_max(atomic64_t *stat, u64 value)
+{
+	s64 cur = atomic64_read(stat);
+
+	/*
+	 * A failed try_cmpxchg reloads cur, so the bound is re-checked.
+	 */
+	do {
+		if (value <= (u64)cur)
+			return;
+	} while (!atomic64_try_cmpxchg(stat, &cur, value));
+}
+
+/*
  * Remove inode from fast commit list. If the inode is being committed
  * we wait until inode commit is done.
  */
@@ -241,7 +305,6 @@ void ext4_fc_del(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_fc_dentry_update *fc_dentry;
-	wait_queue_head_t *wq;
 	int alloc_ctx;
 
 	if (ext4_fc_disabled(inode->i_sb))
@@ -249,59 +312,43 @@ void ext4_fc_del(struct inode *inode)
 
 	alloc_ctx = ext4_fc_lock(inode->i_sb);
 	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
+		ext4_fc_free_inode_snap(inode);
 		ext4_fc_unlock(inode->i_sb, alloc_ctx);
 		return;
 	}
 
 	/*
-	 * Since ext4_fc_del is called from ext4_evict_inode while having a
-	 * handle open, there is no need for us to wait here even if a fast
-	 * commit is going on. That is because, if this inode is being
-	 * committed, ext4_mark_inode_dirty would have waited for inode commit
-	 * operation to finish before we come here. So, by the time we come
-	 * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
-	 * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
-	 * here.
-	 *
-	 * We may come here without any handles open in the "no_delete" case of
-	 * ext4_evict_inode as well. However, if that happens, we first mark the
-	 * file system as fast commit ineligible anyway. So, even in that case,
-	 * it is okay to remove the inode from the fc list.
+	 * Wait for ongoing fast commit to finish. We cannot remove the inode
+	 * from fast commit lists while it is being committed. If we wake from
+	 * FC_FLUSHING_DATA, re-check FC_COMMITTING before deleting because the
+	 * commit thread sets FC_COMMITTING only after clearing FLUSHING_DATA.
 	 */
-	WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
-		&& !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
-	while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
-#if (BITS_PER_LONG < 64)
-		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
-				EXT4_STATE_FC_FLUSHING_DATA);
-		wq = bit_waitqueue(&ei->i_state_flags,
-				   EXT4_STATE_FC_FLUSHING_DATA);
-#else
-		DEFINE_WAIT_BIT(wait, &ei->i_flags,
-				EXT4_STATE_FC_FLUSHING_DATA);
-		wq = bit_waitqueue(&ei->i_flags,
-				   EXT4_STATE_FC_FLUSHING_DATA);
-#endif
-		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
-		if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
-			ext4_fc_unlock(inode->i_sb, alloc_ctx);
-			schedule();
-			alloc_ctx = ext4_fc_lock(inode->i_sb);
-		}
-		finish_wait(wq, &wait.wq_entry);
+	for (;;) {
+		ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_COMMITTING,
+					 &alloc_ctx);
+
+		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA))
+			break;
+
+		ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA,
+					 &alloc_ctx);
 	}
+
+	ext4_fc_free_inode_snap(inode);
 	list_del_init(&ei->i_fc_list);
 
 	/*
-	 * Since this inode is getting removed, let's also remove all FC
-	 * dentry create references, since it is not needed to log it anyways.
+	 * Since this inode is getting removed, let's also remove all FC dentry
+	 * create references, since it is not needed to log it anyways.
 	 */
 	if (list_empty(&ei->i_fc_dilist)) {
 		ext4_fc_unlock(inode->i_sb, alloc_ctx);
 		return;
 	}
 
-	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
+	fc_dentry = list_first_entry(&ei->i_fc_dilist,
+				     struct ext4_fc_dentry_update,
+				     fcd_dilist);
 	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
 	list_del_init(&fc_dentry->fcd_list);
 	list_del_init(&fc_dentry->fcd_dilist);
@@ -373,6 +420,8 @@ static int ext4_fc_track_template(
 
 	tid = handle->h_transaction->t_tid;
 	spin_lock(&ei->i_fc_lock);
+	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+		ext4_set_inode_state(inode, EXT4_STATE_FC_REQUEUE);
 	if (tid == ei->i_sync_tid) {
 		update = true;
 	} else {
@@ -543,8 +592,6 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
 
 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	wait_queue_head_t *wq;
 	int ret;
 
 	if (S_ISDIR(inode->i_mode))
@@ -560,29 +607,11 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 		return;
 
 	/*
-	 * If we come here, we may sleep while waiting for the inode to
-	 * commit. We shouldn't be holding i_data_sem when we go to sleep since
-	 * the commit path needs to grab the lock while committing the inode.
+	 * Fast commit snapshots inode state at commit time, so there's no need
+	 * to wait for EXT4_STATE_FC_COMMITTING here. If the inode is already
+	 * on the commit queue, ext4_fc_cleanup() will requeue it for the new
+	 * transaction once the current commit finishes.
 	 */
-	lockdep_assert_not_held(&ei->i_data_sem);
-
-	while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
-#if (BITS_PER_LONG < 64)
-		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
-				EXT4_STATE_FC_COMMITTING);
-		wq = bit_waitqueue(&ei->i_state_flags,
-				   EXT4_STATE_FC_COMMITTING);
-#else
-		DEFINE_WAIT_BIT(wait, &ei->i_flags,
-				EXT4_STATE_FC_COMMITTING);
-		wq = bit_waitqueue(&ei->i_flags,
-				   EXT4_STATE_FC_COMMITTING);
-#endif
-		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
-		if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
-			schedule();
-		finish_wait(wq, &wait.wq_entry);
-	}
 
 	/*
 	 * From this point on, this inode will not be committed either
@@ -831,6 +860,21 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
 	return true;
 }
 
+struct ext4_fc_range {
+	struct list_head list;	/* link in a snapshot's data_list */
+	u16 tag;		/* EXT4_FC_TAG_ADD_RANGE or EXT4_FC_TAG_DEL_RANGE */
+	ext4_lblk_t lblk;	/* first logical block of the range */
+	ext4_lblk_t len;	/* number of blocks in the range */
+	ext4_fsblk_t pblk;	/* first physical block; 0 for DEL_RANGE */
+	bool unwritten;		/* ADD_RANGE only: extent is unwritten */
+};
+
+struct ext4_fc_inode_snap {
+	struct list_head data_list;	/* struct ext4_fc_range entries */
+	unsigned int inode_len;		/* number of bytes held in inode_buf */
+	u8 inode_buf[];			/* copy of the raw inode */
+};
+
 /*
  * Writes inode in the fast commit space under TLV with tag @tag.
  * Returns 0 on success, error on failure.
@@ -838,21 +882,27 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
-	int ret;
-	struct ext4_iloc iloc;
+	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
+	struct ext4_fc_snap_stats *stats =
+		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
 	struct ext4_fc_inode fc_inode;
 	struct ext4_fc_tl tl;
 	u8 *dst;
+	u8 *src;
+	int inode_len;
+	int ret;
 
-	ret = ext4_get_inode_loc(inode, &iloc);
-	if (ret)
-		return ret;
+	if (!snap) {
+		atomic64_inc(&stats->snap_fail_no_snap);
+		return -ECANCELED;
+	}
 
-	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
-		inode_len = EXT4_INODE_SIZE(inode->i_sb);
-	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
-		inode_len += ei->i_extra_isize;
+	src = snap->inode_buf;
+	inode_len = snap->inode_len;
+	if (!src || inode_len == 0) {
+		atomic64_inc(&stats->snap_fail_no_snap);
+		return -ECANCELED;
+	}
 
 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
@@ -868,10 +918,9 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 	dst += EXT4_FC_TAG_BASE_LEN;
 	memcpy(dst, &fc_inode, sizeof(fc_inode));
 	dst += sizeof(fc_inode);
-	memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
+	memcpy(dst, src, inode_len);
 	ret = 0;
 err:
-	brelse(iloc.bh);
 	return ret;
 }
 
@@ -881,76 +930,244 @@ err:
  */
 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 {
-	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_map_blocks map;
+	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
+	struct ext4_fc_snap_stats *stats =
+		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
 	struct ext4_fc_add_range fc_ext;
 	struct ext4_fc_del_range lrange;
 	struct ext4_extent *ex;
-	int ret;
+	struct ext4_fc_range *range;
+
+	if (!snap) {
+		atomic64_inc(&stats->snap_fail_no_snap);
+		return -ECANCELED;
+	}
+
+	list_for_each_entry(range, &snap->data_list, list) {
+		if (range->tag == EXT4_FC_TAG_DEL_RANGE) {
+			lrange.fc_ino = cpu_to_le32(inode->i_ino);
+			lrange.fc_lblk = cpu_to_le32(range->lblk);
+			lrange.fc_len = cpu_to_le32(range->len);
+			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
+					     sizeof(lrange), (u8 *)&lrange, crc))
+				return -ENOSPC;
+			continue;
+		}
+
+		fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
+		ex = (struct ext4_extent *)&fc_ext.fc_ex;
+		ex->ee_block = cpu_to_le32(range->lblk);
+		ex->ee_len = cpu_to_le16(range->len);
+		ext4_ext_store_pblock(ex, range->pblk);
+		if (range->unwritten)
+			ext4_ext_mark_unwritten(ex);
+		else
+			ext4_ext_mark_initialized(ex);
+
+		if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
+				     sizeof(fc_ext), (u8 *)&fc_ext, crc))
+			return -ENOSPC;
+	}
+
+	return 0;
+}
+
+static void ext4_fc_free_ranges(struct list_head *head)
+{
+	struct ext4_fc_range *range;
+
+	while ((range = list_first_entry_or_null(head, typeof(*range), list))) {
+		list_del(&range->list);
+		kmem_cache_free(ext4_fc_range_cachep, range);
+	}
+}
+
+static void ext4_fc_free_inode_snap(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_inode_snap *old = ei->i_fc_snap;
+
+	if (old) {
+		/* Detach first so i_fc_snap never points at freed memory. */
+		ei->i_fc_snap = NULL;
+		ext4_fc_free_ranges(&old->data_list);
+		kfree(old);
+	}
+}
+
+static int ext4_fc_snapshot_inode_data(struct inode *inode,
+				       struct list_head *ranges,
+				       unsigned int nr_ranges_total,
+				       unsigned int *nr_rangesp,
+				       int *snap_err)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_snap_stats *stats =
+		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
+	ext4_lblk_t start_lblk, end_lblk, cur_lblk;
+	unsigned int nr_ranges = 0;
 
 	spin_lock(&ei->i_fc_lock);
 	if (ei->i_fc_lblk_len == 0) {
 		spin_unlock(&ei->i_fc_lock);
+		if (nr_rangesp)
+			*nr_rangesp = 0;
 		return 0;
 	}
-	old_blk_size = ei->i_fc_lblk_start;
-	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
+	start_lblk = ei->i_fc_lblk_start;
+	end_lblk = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 	ei->i_fc_lblk_len = 0;
 	spin_unlock(&ei->i_fc_lock);
 
-	cur_lblk_off = old_blk_size;
-	ext4_debug("will try writing %d to %d for inode %llu\n",
-		   cur_lblk_off, new_blk_size, inode->i_ino);
+	cur_lblk = start_lblk;
+	ext4_debug("snapshot data ranges %u-%u for inode %llu\n",
+		   start_lblk, end_lblk,
+		   (unsigned long long)inode->i_ino);
+
+	while (cur_lblk <= end_lblk) {
+		struct extent_status es;
+		struct ext4_fc_range *range;
+		ext4_lblk_t len;
+		u64 remaining = (u64)end_lblk - cur_lblk + 1;
+
+		if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL)) {
+			atomic64_inc(&stats->snap_fail_es_miss);
+			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_MISS);
+			return -EAGAIN;
+		}
 
-	while (cur_lblk_off <= new_blk_size) {
-		map.m_lblk = cur_lblk_off;
-		map.m_len = new_blk_size - cur_lblk_off + 1;
-		ret = ext4_map_blocks(NULL, inode, &map,
-				      EXT4_GET_BLOCKS_IO_SUBMIT |
-				      EXT4_EX_NOCACHE);
-		if (ret < 0)
-			return -ECANCELED;
+		if (ext4_es_is_delayed(&es)) {
+			atomic64_inc(&stats->snap_fail_es_delayed);
+			ext4_fc_set_snap_err(snap_err,
+					     EXT4_FC_SNAP_ERR_ES_DELAYED);
+			return -EAGAIN;
+		}
 
-		if (map.m_len == 0) {
-			cur_lblk_off++;
+		len = es.es_len - (cur_lblk - es.es_lblk);
+		if (len > remaining)
+			len = remaining;
+		if (len == 0) {
+			cur_lblk++;
 			continue;
 		}
 
-		if (ret == 0) {
-			lrange.fc_ino = cpu_to_le32(inode->i_ino);
-			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
-			lrange.fc_len = cpu_to_le32(map.m_len);
-			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
-					    sizeof(lrange), (u8 *)&lrange, crc))
-				return -ENOSPC;
+		if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) {
+			atomic64_inc(&stats->snap_fail_ranges_cap);
+			ext4_fc_set_snap_err(snap_err,
+					     EXT4_FC_SNAP_ERR_RANGES_CAP);
+			return -E2BIG;
+		}
+
+		range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS);
+		if (!range) {
+			atomic64_inc(&stats->snap_fail_nomem);
+			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
+			return -ENOMEM;
+		}
+		nr_ranges++;
+
+		range->lblk = cur_lblk;
+		range->len = len;
+		range->pblk = 0;
+		range->unwritten = false;
+
+		if (ext4_es_is_hole(&es)) {
+			range->tag = EXT4_FC_TAG_DEL_RANGE;
+		} else if (ext4_es_is_written(&es) ||
+			   ext4_es_is_unwritten(&es)) {
+			unsigned int max;
+
+			range->tag = EXT4_FC_TAG_ADD_RANGE;
+			range->pblk = ext4_es_pblock(&es) +
+				      (cur_lblk - es.es_lblk);
+			range->unwritten = ext4_es_is_unwritten(&es);
+
+			max = range->unwritten ? EXT_UNWRITTEN_MAX_LEN :
+						 EXT_INIT_MAX_LEN;
+			if (range->len > max)
+				range->len = max;
 		} else {
-			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
-				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
-
-			/* Limit the number of blocks in one extent */
-			map.m_len = min(max, map.m_len);
-
-			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
-			ex = (struct ext4_extent *)&fc_ext.fc_ex;
-			ex->ee_block = cpu_to_le32(map.m_lblk);
-			ex->ee_len = cpu_to_le16(map.m_len);
-			ext4_ext_store_pblock(ex, map.m_pblk);
-			if (map.m_flags & EXT4_MAP_UNWRITTEN)
-				ext4_ext_mark_unwritten(ex);
-			else
-				ext4_ext_mark_initialized(ex);
-			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
-					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
-				return -ENOSPC;
+			kmem_cache_free(ext4_fc_range_cachep, range);
+			atomic64_inc(&stats->snap_fail_es_other);
+			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_OTHER);
+			return -EAGAIN;
 		}
 
-		cur_lblk_off += map.m_len;
+		INIT_LIST_HEAD(&range->list);
+		list_add_tail(&range->list, ranges);
+
+		if ((u64)range->len > (u64)end_lblk - cur_lblk)
+			break;
+
+		cur_lblk += range->len;
 	}
 
+	if (nr_rangesp)
+		*nr_rangesp = nr_ranges;
 	return 0;
 }
 
+static int ext4_fc_snapshot_inode(struct inode *inode,
+				  unsigned int nr_ranges_total,
+				  unsigned int *nr_rangesp, int *snap_err)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_snap_stats *stats =
+		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
+	struct ext4_fc_inode_snap *snap;
+	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
+	struct ext4_iloc iloc;
+	LIST_HEAD(ranges);
+	unsigned int nr_ranges = 0;
+	int ret;
+	int alloc_ctx;
+
+	ret = ext4_get_inode_loc_noio(inode, &iloc);
+	if (ret) {
+		atomic64_inc(&stats->snap_fail_inode_loc);
+		ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_INODE_LOC);
+		return ret;
+	}
+
+	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+		inode_len = EXT4_INODE_SIZE(inode->i_sb);
+	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
+		inode_len += ei->i_extra_isize;
+
+	snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS);
+	if (!snap) {
+		atomic64_inc(&stats->snap_fail_nomem);
+		ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
+		brelse(iloc.bh);
+		return -ENOMEM;
+	}
+	INIT_LIST_HEAD(&snap->data_list);
+	snap->inode_len = inode_len;
+
+	memcpy(snap->inode_buf, (u8 *)ext4_raw_inode(&iloc), inode_len);
+	brelse(iloc.bh);
+
+	ret = ext4_fc_snapshot_inode_data(inode, &ranges, nr_ranges_total,
+					  &nr_ranges, snap_err);
+	if (ret) {
+		kfree(snap);
+		ext4_fc_free_ranges(&ranges);
+		return ret;
+	}
+
+	alloc_ctx = ext4_fc_lock(inode->i_sb);
+	ext4_fc_free_inode_snap(inode);
+	ei->i_fc_snap = snap;
+	list_splice_tail_init(&ranges, &snap->data_list);
+	ext4_fc_unlock(inode->i_sb, alloc_ctx);
+
+	atomic64_inc(&stats->snap_inodes);
+	atomic64_add(nr_ranges, &stats->snap_ranges);
+	if (nr_rangesp)
+		*nr_rangesp = nr_ranges;
+	return 0;
+}
 
 /* Flushes data of all the inodes in the commit queue. */
 static int ext4_fc_flush_data(journal_t *journal)
@@ -1001,6 +1218,11 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 		 */
 		if (list_empty(&fc_dentry->fcd_dilist))
 			continue;
+		/*
+		 * For EXT4_FC_TAG_CREAT, fcd_dilist is linked on the created
+		 * inode's i_fc_dilist list (kept singular), so we can recover the
+		 * inode through it.
+		 */
 		ei = list_first_entry(&fc_dentry->fcd_dilist,
 				struct ext4_inode_info, i_fc_dilist);
 		inode = &ei->vfs_inode;
@@ -1025,17 +1247,114 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 	return 0;
 }
 
-static int ext4_fc_perform_commit(journal_t *journal)
+static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
+					 struct inode ***inodesp,
+					 unsigned int *nr_inodesp);
+
+static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
+				   unsigned int inodes_size,
+				   unsigned int *nr_inodesp,
+				   unsigned int *nr_rangesp,
+				   int *snap_err)
+{
+	struct super_block *sb = journal->j_private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_inode_info *iter;
+	struct ext4_fc_dentry_update *fc_dentry;
+	unsigned int i = 0;
+	unsigned int idx;
+	unsigned int nr_ranges = 0;
+	int ret = 0;
+	int alloc_ctx;
+
+	alloc_ctx = ext4_fc_lock(sb);
+	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+		if (i >= inodes_size) {
+			atomic64_inc(&sbi->s_fc_snap_stats.snap_fail_inodes_cap);
+			ext4_fc_set_snap_err(snap_err,
+					     EXT4_FC_SNAP_ERR_INODES_CAP);
+			ret = -E2BIG;
+			goto unlock;
+		}
+		inodes[i++] = &iter->vfs_inode;
+	}
+
+	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
+		struct ext4_inode_info *ei;
+		struct inode *inode;
+
+		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
+			continue;
+		if (list_empty(&fc_dentry->fcd_dilist))
+			continue;
+
+		/* See the comment in ext4_fc_commit_dentry_updates(). */
+		ei = list_first_entry(&fc_dentry->fcd_dilist,
+				      struct ext4_inode_info, i_fc_dilist);
+		inode = &ei->vfs_inode;
+		if (!list_empty(&ei->i_fc_list))
+			continue;
+
+		if (i >= inodes_size) {
+			atomic64_inc(&sbi->s_fc_snap_stats.snap_fail_inodes_cap);
+			ext4_fc_set_snap_err(snap_err,
+					     EXT4_FC_SNAP_ERR_INODES_CAP);
+			ret = -E2BIG;
+			goto unlock;
+		}
+		/*
+		 * Create-only inodes may only be referenced via fcd_dilist and
+		 * not appear on s_fc_q[MAIN]. They may hit the last iput while
+		 * we are snapshotting, but inode eviction calls ext4_fc_del(),
+		 * which waits for FC_COMMITTING to clear. Mark them FC_COMMITTING
+		 * so the inode stays pinned and the snapshot stays valid until
+		 * ext4_fc_cleanup().
+		 */
+		ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+		inodes[i++] = inode;
+	}
+unlock:
+	ext4_fc_unlock(sb, alloc_ctx);
+
+	if (ret)
+		return ret;
+
+	for (idx = 0; idx < i; idx++) {
+		unsigned int inode_ranges = 0;
+
+		ret = ext4_fc_snapshot_inode(inodes[idx], nr_ranges,
+					     &inode_ranges, snap_err);
+		if (ret)
+			break;
+		nr_ranges += inode_ranges;
+	}
+
+	if (nr_inodesp)
+		*nr_inodesp = idx;
+	if (nr_rangesp)
+		*nr_rangesp = nr_ranges;
+	return ret;
+}
+
+static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
 {
 	struct super_block *sb = journal->j_private;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats;
 	struct ext4_inode_info *iter;
 	struct ext4_fc_head head;
 	struct inode *inode;
+	struct inode **inodes;
+	unsigned int inodes_size;
+	unsigned int snap_inodes = 0;
+	unsigned int snap_ranges = 0;
+	int snap_err = EXT4_FC_SNAP_ERR_NONE;
 	struct blk_plug plug;
 	int ret = 0;
 	u32 crc = 0;
 	int alloc_ctx;
+	ktime_t lock_start;
+	u64 locked_ns;
 
 	/*
 	 * Step 1: Mark all inodes on s_fc_q[MAIN] with
@@ -1061,11 +1380,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 		ext4_clear_inode_state(&iter->vfs_inode,
 				       EXT4_STATE_FC_FLUSHING_DATA);
-#if (BITS_PER_LONG < 64)
-		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
-#else
-		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
-#endif
+		ext4_fc_wake_inode_state(&iter->vfs_inode,
+					 EXT4_STATE_FC_FLUSHING_DATA);
 	}
 
 	/*
@@ -1083,13 +1399,23 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	if (ret)
 		return ret;
 
+	ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
+	if (ret) {
+		if (ret == -E2BIG)
+			atomic64_inc(&snap_stats->snap_fail_inodes_cap);
+		else if (ret == -ENOMEM)
+			atomic64_inc(&snap_stats->snap_fail_nomem);
+		return ret;
+	}
 
 	/* Step 4: Mark all inodes as being committed. */
 	jbd2_journal_lock_updates(journal);
+	lock_start = ktime_get();
 	/*
 	 * The journal is now locked. No more handles can start and all the
-	 * previous handles are now drained. We now mark the inodes on the
-	 * commit queue as being committed.
+	 * previous handles are now drained. Snapshotting happens in this
+	 * window so log writing can consume only stable snapshots without
+	 * doing logical-to-physical mapping.
 	 */
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
@@ -1097,7 +1423,22 @@ static int ext4_fc_perform_commit(journal_t *journal)
 				     EXT4_STATE_FC_COMMITTING);
 	}
 	ext4_fc_unlock(sb, alloc_ctx);
+
+	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
+				      &snap_inodes, &snap_ranges, &snap_err);
 	jbd2_journal_unlock_updates(journal);
+	locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
+	atomic64_add(locked_ns, &snap_stats->lock_updates_ns_total);
+	atomic64_inc(&snap_stats->lock_updates_samples);
+	ext4_fc_snap_stats_update_max(&snap_stats->lock_updates_ns_max,
+				      locked_ns);
+	if (trace_ext4_fc_lock_updates_enabled())
+		trace_call__ext4_fc_lock_updates(sb, commit_tid, locked_ns,
+						 snap_inodes, snap_ranges,
+						 ret, snap_err);
+	kvfree(inodes);
+	if (ret)
+		return ret;
 
 	/*
 	 * Step 5: If file system device is different from journal device,
@@ -1151,6 +1492,64 @@ out:
 	return ret;
 }
 
+/* Count the entries used to size the snapshot inode array. */
+static unsigned int ext4_fc_count_snapshot_inodes(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_inode_info *queued, *created;
+	struct ext4_fc_dentry_update *upd;
+	unsigned int count = 0;
+	int ctx;
+
+	ctx = ext4_fc_lock(sb);
+	/* One entry per inode on the MAIN inode queue. */
+	list_for_each_entry(queued, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list)
+		count++;
+
+	/*
+	 * One more per CREAT dentry update whose inode is on no i_fc_list
+	 * queue. See the comment in ext4_fc_commit_dentry_updates().
+	 */
+	list_for_each_entry(upd, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
+		if (upd->fcd_op != EXT4_FC_TAG_CREAT ||
+		    list_empty(&upd->fcd_dilist))
+			continue;
+
+		created = list_first_entry(&upd->fcd_dilist,
+					   struct ext4_inode_info, i_fc_dilist);
+		if (list_empty(&created->i_fc_list))
+			count++;
+	}
+	ext4_fc_unlock(sb, ctx);
+
+	return count;
+}
+
+/* Allocate a zeroed inode pointer array sized by the current queue count. */
+static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
+					 struct inode ***inodesp,
+					 unsigned int *nr_inodesp)
+{
+	struct inode **array = NULL;
+	unsigned int want;
+	int err = 0;
+
+	want = ext4_fc_count_snapshot_inodes(sb);
+	/* Bail out when the count exceeds EXT4_FC_SNAPSHOT_MAX_INODES. */
+	if (want > EXT4_FC_SNAPSHOT_MAX_INODES) {
+		err = -E2BIG;
+	} else if (want) {
+		array = kvcalloc(want, sizeof(*array), GFP_NOFS);
+		if (!array)
+			err = -ENOMEM;
+	}
+
+	/* On failure, and for an empty queue, report NULL and zero. */
+	*inodesp = array;
+	*nr_inodesp = err ? 0 : want;
+	return err;
+}
+
 static void ext4_fc_update_stats(struct super_block *sb, int status,
 				 u64 commit_time, int nblks, tid_t commit_tid)
 {
@@ -1241,9 +1640,12 @@ restart_fc:
 		journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
 	set_task_ioprio(current, journal_ioprio);
 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
-	ret = ext4_fc_perform_commit(journal);
+	ret = ext4_fc_perform_commit(journal, commit_tid);
 	if (ret < 0) {
-		status = EXT4_FC_STATUS_FAILED;
+		if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED)
+			status = EXT4_FC_STATUS_INELIGIBLE;
+		else
+			status = EXT4_FC_STATUS_FAILED;
 		goto fallback;
 	}
 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
@@ -1290,45 +1692,66 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 
 	alloc_ctx = ext4_fc_lock(sb);
 	while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
+		bool requeue;
+
 		ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
 					struct ext4_inode_info,
 					i_fc_list);
 		list_del_init(&ei->i_fc_list);
+		ext4_fc_free_inode_snap(&ei->vfs_inode);
+		spin_lock(&ei->i_fc_lock);
+		if (full)
+			requeue = !tid_geq(tid, ei->i_sync_tid);
+		else
+			requeue = ext4_test_inode_state(&ei->vfs_inode,
+							EXT4_STATE_FC_REQUEUE);
+		if (!requeue)
+			ext4_fc_reset_inode(&ei->vfs_inode);
+		ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_REQUEUE);
 		ext4_clear_inode_state(&ei->vfs_inode,
 				       EXT4_STATE_FC_COMMITTING);
-		if (tid_geq(tid, ei->i_sync_tid)) {
-			ext4_fc_reset_inode(&ei->vfs_inode);
-		} else if (full) {
-			/*
-			 * We are called after a full commit, inode has been
-			 * modified while the commit was running. Re-enqueue
-			 * the inode into STAGING, which will then be splice
-			 * back into MAIN. This cannot happen during
-			 * fastcommit because the journal is locked all the
-			 * time in that case (and tid doesn't increase so
-			 * tid check above isn't reliable).
-			 */
+		spin_unlock(&ei->i_fc_lock);
+		if (requeue)
 			list_add_tail(&ei->i_fc_list,
 				      &sbi->s_fc_q[FC_Q_STAGING]);
-		}
 		/*
 		 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
 		 * visible before we send the wakeup. Pairs with implicit
-		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
+		 * barrier in prepare_to_wait() in ext4_fc_del().
 		 */
 		smp_mb();
-#if (BITS_PER_LONG < 64)
-		wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
-#else
-		wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
-#endif
+		ext4_fc_wake_inode_state(&ei->vfs_inode,
+					 EXT4_STATE_FC_COMMITTING);
 	}
 
 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
-					     struct ext4_fc_dentry_update,
-					     fcd_list);
+						 struct ext4_fc_dentry_update,
+						 fcd_list);
 		list_del_init(&fc_dentry->fcd_list);
+		if (fc_dentry->fcd_op == EXT4_FC_TAG_CREAT &&
+			!list_empty(&fc_dentry->fcd_dilist)) {
+			/* See the comment in ext4_fc_commit_dentry_updates(). */
+			ei = list_first_entry(&fc_dentry->fcd_dilist,
+						  struct ext4_inode_info,
+						  i_fc_dilist);
+			ext4_fc_free_inode_snap(&ei->vfs_inode);
+			spin_lock(&ei->i_fc_lock);
+			ext4_clear_inode_state(&ei->vfs_inode,
+						   EXT4_STATE_FC_REQUEUE);
+			ext4_clear_inode_state(&ei->vfs_inode,
+						   EXT4_STATE_FC_COMMITTING);
+			spin_unlock(&ei->i_fc_lock);
+			/*
+			 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+			 * visible before we send the wakeup. Pairs with
+			 * implicit barrier in prepare_to_wait() in
+			 * ext4_fc_del().
+			 */
+			smp_mb();
+			ext4_fc_wake_inode_state(&ei->vfs_inode,
+						 EXT4_STATE_FC_COMMITTING);
+		}
 		list_del_init(&fc_dentry->fcd_dilist);
 
 		release_dentry_name_snapshot(&fc_dentry->fcd_name);
@@ -2280,11 +2703,26 @@ int ext4_fc_info_show(struct seq_file *seq, void *v)
 {
 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
+	struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats;
+	u64 lock_avg_ns = 0;
+	u64 lock_updates_samples;
+	u64 lock_updates_ns_total;
+	u64 lock_updates_ns_max;
 	int i;
 
 	if (v != SEQ_START_TOKEN)
 		return 0;
 
+	lock_updates_samples =
+		atomic64_read(&snap_stats->lock_updates_samples);
+	lock_updates_ns_total =
+		atomic64_read(&snap_stats->lock_updates_ns_total);
+	lock_updates_ns_max =
+		atomic64_read(&snap_stats->lock_updates_ns_max);
+	if (lock_updates_samples)
+		lock_avg_ns = div64_u64(lock_updates_ns_total,
+					lock_updates_samples);
+
 	seq_printf(seq,
 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
 		   stats->fc_num_commits, stats->fc_ineligible_commits,
@@ -2295,6 +2733,23 @@ int ext4_fc_info_show(struct seq_file *seq, void *v)
 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
 			stats->fc_ineligible_reason_count[i]);
 
+	seq_printf(seq,
+		   "Snapshot stats:\n%llu inodes\n%llu ranges\n%lluus lock_updates_avg\n%lluus lock_updates_max\n",
+		   atomic64_read(&snap_stats->snap_inodes),
+		   atomic64_read(&snap_stats->snap_ranges),
+		   div_u64(lock_avg_ns, 1000),
+		   div_u64(lock_updates_ns_max, 1000));
+	seq_printf(seq,
+		   "Snapshot failures:\n%llu es_miss\n%llu es_delayed\n%llu es_other\n%llu inodes_cap\n%llu ranges_cap\n%llu nomem\n%llu inode_loc\n%llu no_snap\n",
+		   atomic64_read(&snap_stats->snap_fail_es_miss),
+		   atomic64_read(&snap_stats->snap_fail_es_delayed),
+		   atomic64_read(&snap_stats->snap_fail_es_other),
+		   atomic64_read(&snap_stats->snap_fail_inodes_cap),
+		   atomic64_read(&snap_stats->snap_fail_ranges_cap),
+		   atomic64_read(&snap_stats->snap_fail_nomem),
+		   atomic64_read(&snap_stats->snap_fail_inode_loc),
+		   atomic64_read(&snap_stats->snap_fail_no_snap));
+
 	return 0;
 }
 
@@ -2303,13 +2758,20 @@ int __init ext4_fc_init_dentry_cache(void)
 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
 					   SLAB_RECLAIM_ACCOUNT);
 
-	if (ext4_fc_dentry_cachep == NULL)
+	if (!ext4_fc_dentry_cachep)
 		return -ENOMEM;
 
+	ext4_fc_range_cachep = KMEM_CACHE(ext4_fc_range, SLAB_RECLAIM_ACCOUNT);
+	if (!ext4_fc_range_cachep) {
+		kmem_cache_destroy(ext4_fc_dentry_cachep);
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
 void ext4_fc_destroy_dentry_cache(void)
 {
+	kmem_cache_destroy(ext4_fc_range_cachep);
 	kmem_cache_destroy(ext4_fc_dentry_cachep);
 }
diff --git a/fs/ext4/hash-test.c b/fs/ext4/hash-test.c
new file mode 100644
index 000000000000..49b0d874c833
--- /dev/null
+++ b/fs/ext4/hash-test.c
@@ -0,0 +1,567 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit tests for ext4 directory hash computation.
+ */
+
+#include <kunit/test.h>
+#include <kunit/resource.h>
+#include <linux/fs.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/unicode.h>
+#include "ext4.h"
+
+static void ext4_hash_init_fake_dir(struct inode *dir, struct super_block *sb)
+{
+	memset(dir, 0, sizeof(*dir));
+	memset(sb, 0, sizeof(*sb));
+	strscpy(sb->s_id, "kunit-ext4", sizeof(sb->s_id));
+	dir->i_sb = sb;	/* the only inode field this helper fills in */
+}
+
+static void ext4_hash_init_fake_dir_with_sbi(struct inode *dir,
+					     struct super_block *sb,
+					     struct ext4_sb_info *sbi)
+{
+	memset(sbi, 0, sizeof(*sbi));
+	sbi->s_sb = sb;
+	ext4_hash_init_fake_dir(dir, sb);
+	sb->s_fs_info = sbi;	/* must follow the helper, which zeroes *sb */
+}
+
+#ifdef CONFIG_FS_ENCRYPTION
+/* Offset of i_crypt_info relative to the embedded vfs_inode. */
+static const struct fscrypt_operations ext4_hash_test_cryptops = {
+	.inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) -
+			   (int)offsetof(struct ext4_inode_info, vfs_inode),
+};
+#endif
+
+static void ext4_hash_init_fake_ext4_dir(struct ext4_inode_info *ei,
+					 struct super_block *sb,
+					 struct ext4_sb_info *sbi)
+{
+	/* Zero all three objects before filling in selected fields. */
+	memset(ei, 0, sizeof(*ei));
+	memset(sbi, 0, sizeof(*sbi));
+	memset(sb, 0, sizeof(*sb));
+
+	/* Cross-link the superblock and its ext4 private info. */
+	sbi->s_sb = sb;
+	sb->s_fs_info = sbi;
+	strscpy(sb->s_id, "kunit-ext4", sizeof(sb->s_id));
+
+	/* The embedded VFS inode is a directory on that superblock. */
+	ei->vfs_inode.i_mode = S_IFDIR;
+	ei->vfs_inode.i_sb = sb;
+#ifdef CONFIG_FS_ENCRYPTION
+	fscrypt_set_ops(sb, &ext4_hash_test_cryptops);
+#endif
+}
+
+struct ext4_dirhash_test_case {
+	const char *name;		/* label shown in assertion messages */
+	u32 hash_version;		/* DX_HASH_* value under test */
+	const char *input;		/* bytes handed to ext4fs_dirhash() */
+	int len;			/* length passed along with input */
+	u32 seed[4];			/* seed words; used only if use_seed */
+	bool use_seed;			/* false: hinfo.seed stays NULL */
+	u32 expected_hash;		/* expected hinfo.hash */
+	u32 expected_minor_hash;	/* expected hinfo.minor_hash */
+};
+
+static const struct ext4_dirhash_test_case ext4_dirhash_test_cases[] = {
+	{
+		.name = "legacy_abc",
+		.hash_version = DX_HASH_LEGACY,
+		.input = "abc",
+		.len = 3,
+		.use_seed = false,
+		.expected_hash = 0x75afd992,
+		.expected_minor_hash = 0x00000000,
+	},
+	{
+		.name = "legacy_unsigned_abc",
+		.hash_version = DX_HASH_LEGACY_UNSIGNED,
+		.input = "abc",
+		.len = 3,
+		.use_seed = false,
+		.expected_hash = 0x75afd992,
+		.expected_minor_hash = 0x00000000,
+	},
+	{
+		.name = "half_md4_abc",
+		.hash_version = DX_HASH_HALF_MD4,
+		.input = "abc",
+		.len = 3,
+		.use_seed = false,
+		.expected_hash = 0xd196a868,
+		.expected_minor_hash = 0xc420eb28,
+	},
+	{
+		.name = "half_md4_unsigned_abc",
+		.hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+		.input = "abc",
+		.len = 3,
+		.use_seed = false,
+		.expected_hash = 0xd196a868,
+		.expected_minor_hash = 0xc420eb28,
+	},
+	{
+		.name = "tea_abc",
+		.hash_version = DX_HASH_TEA,
+		.input = "abc",
+		.len = 3,
+		.use_seed = false,
+		.expected_hash = 0xb1435ec4,
+		.expected_minor_hash = 0x3f7eaa0e,
+	},
+	{
+		.name = "tea_unsigned_abc",
+		.hash_version = DX_HASH_TEA_UNSIGNED,
+		.input = "abc",
+		.len = 3,
+		.use_seed = false,
+		.expected_hash = 0xb1435ec4,
+		.expected_minor_hash = 0x3f7eaa0e,
+	},
+	{
+		.name = "empty_half_md4",
+		.hash_version = DX_HASH_HALF_MD4,
+		.input = "",
+		.len = 0,
+		.use_seed = false,
+		.expected_hash = 0xefcdab88,
+		.expected_minor_hash = 0x98badcfe,
+	},
+	{
+		.name = "half_md4_31bytes",
+		.hash_version = DX_HASH_HALF_MD4,
+		.input = "1234567890123456789012345678901",
+		.len = 31,
+		.use_seed = false,
+		.expected_hash = 0xc4db1f78,
+		.expected_minor_hash = 0xea23921b,
+	},
+	{
+		.name = "half_md4_32bytes",
+		.hash_version = DX_HASH_HALF_MD4,
+		.input = "12345678901234567890123456789012",
+		.len = 32,
+		.use_seed = false,
+		.expected_hash = 0xfa6cc63e,
+		.expected_minor_hash = 0x2f77bd1c,
+	},
+	{
+		.name = "half_md4_33bytes",
+		.hash_version = DX_HASH_HALF_MD4,
+		.input = "123456789012345678901234567890123",
+		.len = 33,
+		.use_seed = false,
+		.expected_hash = 0xdc0c2dec,
+		.expected_minor_hash = 0x5ca23365,
+	},
+	{
+		.name = "half_md4_unsigned_31bytes",
+		.hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+		.input = "1234567890123456789012345678901",
+		.len = 31,
+		.use_seed = false,
+		.expected_hash = 0xc4db1f78,
+		.expected_minor_hash = 0xea23921b,
+	},
+	{
+		.name = "half_md4_unsigned_32bytes",
+		.hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+		.input = "12345678901234567890123456789012",
+		.len = 32,
+		.use_seed = false,
+		.expected_hash = 0xfa6cc63e,
+		.expected_minor_hash = 0x2f77bd1c,
+	},
+	{
+		.name = "half_md4_unsigned_33bytes",
+		.hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+		.input = "123456789012345678901234567890123",
+		.len = 33,
+		.use_seed = false,
+		.expected_hash = 0xdc0c2dec,
+		.expected_minor_hash = 0x5ca23365,
+	},
+	{
+		.name = "tea_15bytes",
+		.hash_version = DX_HASH_TEA,
+		.input = "123456789abcdef",
+		.len = 15,
+		.use_seed = false,
+		.expected_hash = 0xa562903a,
+		.expected_minor_hash = 0x6174a00f,
+	},
+	{
+		.name = "tea_16bytes",
+		.hash_version = DX_HASH_TEA,
+		.input = "1234567890abcdef",
+		.len = 16,
+		.use_seed = false,
+		.expected_hash = 0x8449f258,
+		.expected_minor_hash = 0x49a16d46,
+	},
+	{
+		.name = "tea_17bytes",
+		.hash_version = DX_HASH_TEA,
+		.input = "123456789abcdefgh",
+		.len = 17,
+		.use_seed = false,
+		.expected_hash = 0xf32ec10c,
+		.expected_minor_hash = 0x58ceae61,
+	},
+	{
+		.name = "half_md4_seeded",
+		.hash_version = DX_HASH_HALF_MD4,
+		.input = "same-name",
+		.len = 9,
+		.seed = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 },
+		.use_seed = true,
+		.expected_hash = 0x8aebf604,
+		.expected_minor_hash = 0x66ce48fe,
+	},
+	{
+		.name = "half_md4_non_ascii_signed",
+		.hash_version = DX_HASH_HALF_MD4,
+		.input = "\x80\x81\x82\x83\x84",
+		.len = 5,
+		.use_seed = false,
+		.expected_hash = 0x8bab0498,
+		.expected_minor_hash = 0xc326632d,
+	},
+	{
+		.name = "half_md4_non_ascii_unsigned",
+		.hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+		.input = "\x80\x81\x82\x83\x84",
+		.len = 5,
+		.use_seed = false,
+		.expected_hash = 0xbc48596e,
+		.expected_minor_hash = 0xde0fad41,
+	},
+	{
+		.name = "tea_non_ascii_signed",
+		.hash_version = DX_HASH_TEA,
+		.input = "\x80\x81\x82\x83\x84",
+		.len = 5,
+		.use_seed = false,
+		.expected_hash = 0x21e3a154,
+		.expected_minor_hash = 0x90112c3d,
+	},
+	{
+		.name = "tea_non_ascii_unsigned",
+		.hash_version = DX_HASH_TEA_UNSIGNED,
+		.input = "\x80\x81\x82\x83\x84",
+		.len = 5,
+		.use_seed = false,
+		.expected_hash = 0x9b648616,
+		.expected_minor_hash = 0x011dd507,
+	},
+};
+
+static void test_ext4fs_dirhash_vectors(struct kunit *test)
+{
+	const struct ext4_dirhash_test_case *tc = ext4_dirhash_test_cases;
+	const struct ext4_dirhash_test_case *const end =
+		tc + ARRAY_SIZE(ext4_dirhash_test_cases);
+	struct super_block *sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+	struct inode *dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+
+	KUNIT_ASSERT_NOT_NULL(test, sb);
+	KUNIT_ASSERT_NOT_NULL(test, dir);
+	ext4_hash_init_fake_dir(dir, sb);
+
+	/* Hash every vector and compare against its recorded result. */
+	for (; tc < end; tc++) {
+		struct dx_hash_info hinfo = {
+			.hash_version = tc->hash_version,
+		};
+		int ret;
+
+		/* Unseeded cases leave hinfo.seed NULL. */
+		if (tc->use_seed)
+			hinfo.seed = (u32 *)tc->seed;
+
+		ret = ext4fs_dirhash(dir, tc->input, tc->len, &hinfo);
+
+		KUNIT_ASSERT_EQ_MSG(test, ret, 0, "case=%s", tc->name);
+		KUNIT_EXPECT_EQ_MSG(test, hinfo.hash, tc->expected_hash,
+				    "case=%s", tc->name);
+		KUNIT_EXPECT_EQ_MSG(test, hinfo.minor_hash,
+				    tc->expected_minor_hash,
+				    "case=%s", tc->name);
+	}
+}
+
+static void test_ext4fs_dirhash_seed_changes_result(struct kunit *test)
+{
+	u32 seed[4] = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 };
+	struct dx_hash_info plain = { .hash_version = DX_HASH_HALF_MD4 };
+	struct dx_hash_info seeded = {
+		.hash_version = DX_HASH_HALF_MD4,
+		.seed = seed,
+	};
+	struct super_block *sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+	struct inode *dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+	int ret_plain, ret_seeded;
+
+	KUNIT_ASSERT_NOT_NULL(test, sb);
+	KUNIT_ASSERT_NOT_NULL(test, dir);
+	ext4_hash_init_fake_dir(dir, sb);
+
+	/*
+	 * Hash one name twice with the same algorithm; the only difference
+	 * between the two runs is whether hinfo.seed is set.
+	 */
+	ret_plain = ext4fs_dirhash(dir, "same-name", 9, &plain);
+	ret_seeded = ext4fs_dirhash(dir, "same-name", 9, &seeded);
+
+	KUNIT_ASSERT_EQ(test, ret_plain, 0);
+	KUNIT_ASSERT_EQ(test, ret_seeded, 0);
+
+	/* A difference in either hash word is sufficient. */
+	KUNIT_EXPECT_TRUE(test,
+			  plain.hash != seeded.hash ||
+			  plain.minor_hash != seeded.minor_hash);
+}
+
+static void test_ext4fs_dirhash_invalid_version_returns_einval(struct kunit *test)
+{
+	struct super_block *sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+	struct inode *dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+	struct ext4_sb_info *sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+	struct dx_hash_info hinfo;
+	int ret;
+
+	KUNIT_ASSERT_NOT_NULL(test, sb);
+	KUNIT_ASSERT_NOT_NULL(test, dir);
+	KUNIT_ASSERT_NOT_NULL(test, sbi);
+	ext4_hash_init_fake_dir_with_sbi(dir, sb, sbi);
+
+	/*
+	 * Pre-load the outputs with junk and pick a version just past
+	 * DX_HASH_LAST, so the checks below can tell they were overwritten.
+	 */
+	memset(&hinfo, 0, sizeof(hinfo));
+	hinfo.hash = 0xdeadbeef;
+	hinfo.minor_hash = 0xcafebabe;
+	hinfo.hash_version = DX_HASH_LAST + 1;
+
+	ret = ext4fs_dirhash(dir, "abc", 3, &hinfo);
+	KUNIT_EXPECT_EQ(test, ret, -EINVAL);
+	KUNIT_EXPECT_EQ(test, hinfo.hash, 0);
+	KUNIT_EXPECT_EQ(test, hinfo.minor_hash, 0);
+}
+
+static void test_ext4fs_dirhash_siphash_without_key_returns_einval(struct kunit *test)
+{
+	struct dx_hash_info hinfo = { .hash_version = DX_HASH_SIPHASH };
+	struct super_block *sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+	struct ext4_inode_info *ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL);
+	struct ext4_sb_info *sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+	struct inode *dir;
+	int ret;
+
+	KUNIT_ASSERT_NOT_NULL(test, sb);
+	KUNIT_ASSERT_NOT_NULL(test, ei);
+	KUNIT_ASSERT_NOT_NULL(test, sbi);
+
+	/*
+	 * The helper zeroes *ei, so no encryption key info is attached to
+	 * the fake directory.
+	 */
+	ext4_hash_init_fake_ext4_dir(ei, sb, sbi);
+	dir = &ei->vfs_inode;
+
+	ret = ext4fs_dirhash(dir, "name", strlen("name"), &hinfo);
+
+	/* DX_HASH_SIPHASH is expected to be rejected here. */
+	KUNIT_EXPECT_EQ(test, ret, -EINVAL);
+}
+
+static void test_ext4fs_dirhash_signed_unsigned_differ_on_nonascii(struct kunit *test)
+{
+	static const char input[] = "\x80\xff\x81\xfe\101bc";
+	const int len = sizeof(input) - 1;
+	struct dx_hash_info legacy_signed = { .hash_version = DX_HASH_LEGACY };
+	struct dx_hash_info legacy_unsigned = {
+		.hash_version = DX_HASH_LEGACY_UNSIGNED,
+	};
+	struct dx_hash_info md4_signed = { .hash_version = DX_HASH_HALF_MD4 };
+	struct dx_hash_info md4_unsigned = {
+		.hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+	};
+	struct dx_hash_info tea_signed = { .hash_version = DX_HASH_TEA };
+	struct dx_hash_info tea_unsigned = {
+		.hash_version = DX_HASH_TEA_UNSIGNED,
+	};
+	struct super_block *sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+	struct inode *dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+	int ret;
+
+	KUNIT_ASSERT_NOT_NULL(test, sb);
+	KUNIT_ASSERT_NOT_NULL(test, dir);
+	ext4_hash_init_fake_dir(dir, sb);
+
+	/*
+	 * The input starts with four bytes that have the top bit set, so
+	 * each signed/unsigned pair is expected to disagree. 'A' is spelled
+	 * \101 because a hex escape would swallow the hex-digit letters that
+	 * follow it. For the legacy hash only the major hash is compared.
+	 */
+	ret = ext4fs_dirhash(dir, input, len, &legacy_signed);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+	ret = ext4fs_dirhash(dir, input, len, &legacy_unsigned);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+	KUNIT_EXPECT_NE(test, legacy_signed.hash, legacy_unsigned.hash);
+
+	/* Half-MD4: a difference in either hash word counts. */
+	ret = ext4fs_dirhash(dir, input, len, &md4_signed);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+	ret = ext4fs_dirhash(dir, input, len, &md4_unsigned);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+	KUNIT_EXPECT_TRUE(test,
+			  md4_signed.hash != md4_unsigned.hash ||
+			  md4_signed.minor_hash != md4_unsigned.minor_hash);
+
+	/* TEA: same criterion as for half-MD4. */
+	ret = ext4fs_dirhash(dir, input, len, &tea_signed);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+	ret = ext4fs_dirhash(dir, input, len, &tea_unsigned);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+	KUNIT_EXPECT_TRUE(test,
+			  tea_signed.hash != tea_unsigned.hash ||
+			  tea_signed.minor_hash != tea_unsigned.minor_hash);
+}
+
+#if IS_ENABLED(CONFIG_UNICODE)
+KUNIT_DEFINE_ACTION_WRAPPER(utf8_unload_action, utf8_unload,
+			    struct unicode_map *);
+static void test_ext4fs_dirhash_casefolded_names_hash_consistently(struct kunit *test)
+{
+	struct dx_hash_info h1 = { .hash_version = DX_HASH_HALF_MD4 };
+	struct dx_hash_info h2 = { .hash_version = DX_HASH_HALF_MD4 };
+	struct super_block *sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+	struct ext4_inode_info *ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL);
+	struct ext4_sb_info *sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+	struct unicode_map *um;
+	int ret, ret1, ret2;
+
+	KUNIT_ASSERT_NOT_NULL(test, sb);
+	KUNIT_ASSERT_NOT_NULL(test, ei);
+	KUNIT_ASSERT_NOT_NULL(test, sbi);
+
+	/* Skip rather than fail when the UTF-8 table cannot be loaded. */
+	um = utf8_load(UTF8_LATEST);
+	if (IS_ERR(um)) {
+		kunit_skip(test, "utf8_load(UTF8_LATEST) failed: %pe",
+			   um);
+		return;
+	}
+
+	/* Register utf8_unload() on the table as a KUnit action. */
+	ret = kunit_add_action_or_reset(test, utf8_unload_action, um);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+
+	/* Turn the fake directory into a casefolded one. */
+	ext4_hash_init_fake_ext4_dir(ei, sb, sbi);
+	ei->vfs_inode.i_flags |= S_CASEFOLD;
+	sb->s_encoding = um;
+
+	KUNIT_ASSERT_TRUE(test, IS_CASEFOLDED(&ei->vfs_inode));
+
+	/*
+	 * "Alpha" and "aLPHa" differ only in letter case, so in a
+	 * casefolded directory both hash words are expected to match.
+	 */
+	ret1 = ext4fs_dirhash(&ei->vfs_inode, "Alpha", 5, &h1);
+	ret2 = ext4fs_dirhash(&ei->vfs_inode, "aLPHa", 5, &h2);
+	KUNIT_ASSERT_EQ(test, ret1, 0);
+	KUNIT_ASSERT_EQ(test, ret2, 0);
+
+	KUNIT_EXPECT_EQ(test, h1.hash, h2.hash);
+	KUNIT_EXPECT_EQ(test, h1.minor_hash, h2.minor_hash);
+}
+
+static void test_ext4fs_dirhash_casefold_fallback(struct kunit *test)
+{
+	static const char invalid_utf8[] = "\xc3\x28";
+	struct dx_hash_info folded_dir = { .hash_version = DX_HASH_HALF_MD4 };
+	struct dx_hash_info plain = { .hash_version = DX_HASH_HALF_MD4 };
+	struct super_block *sb_cf, *sb_plain;
+	struct ext4_inode_info *ei;
+	struct ext4_sb_info *sbi;
+	struct inode *plain_dir;
+	struct unicode_map *um;
+	int ret, ret_cf, ret_plain;
+
+	sb_cf = kunit_kzalloc(test, sizeof(*sb_cf), GFP_KERNEL);
+	sb_plain = kunit_kzalloc(test, sizeof(*sb_plain), GFP_KERNEL);
+	ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL);
+	sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+	plain_dir = kunit_kzalloc(test, sizeof(*plain_dir), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, sb_cf);
+	KUNIT_ASSERT_NOT_NULL(test, sb_plain);
+	KUNIT_ASSERT_NOT_NULL(test, ei);
+	KUNIT_ASSERT_NOT_NULL(test, sbi);
+	KUNIT_ASSERT_NOT_NULL(test, plain_dir);
+
+	um = utf8_load(UTF8_LATEST);
+	if (IS_ERR(um)) {
+		kunit_skip(test, "utf8_load(UTF8_LATEST) failed: %pe",
+			   um);
+		return;
+	}
+
+	ret = kunit_add_action_or_reset(test, utf8_unload_action, um);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+
+	/* First directory: casefolded, backed by sb_cf. */
+	ext4_hash_init_fake_ext4_dir(ei, sb_cf, sbi);
+	ei->vfs_inode.i_flags |= S_CASEFOLD;
+	sb_cf->s_encoding = um;
+	KUNIT_ASSERT_TRUE(test, IS_CASEFOLDED(&ei->vfs_inode));
+
+	/* Second directory: plain, on a superblock with no encoding set. */
+	ext4_hash_init_fake_dir(plain_dir, sb_plain);
+
+	/*
+	 * The two input bytes are not valid UTF-8. The casefolded directory
+	 * is expected to produce the same hash as the plain directory.
+	 */
+	ret_cf = ext4fs_dirhash(&ei->vfs_inode, invalid_utf8,
+				sizeof(invalid_utf8) - 1, &folded_dir);
+	ret_plain = ext4fs_dirhash(plain_dir, invalid_utf8,
+				   sizeof(invalid_utf8) - 1, &plain);
+	KUNIT_ASSERT_EQ(test, ret_cf, 0);
+	KUNIT_ASSERT_EQ(test, ret_plain, 0);
+	KUNIT_EXPECT_EQ(test, folded_dir.hash, plain.hash);
+	KUNIT_EXPECT_EQ(test, folded_dir.minor_hash, plain.minor_hash);
+}
+#endif
+
+static struct kunit_case ext4_hash_test_cases[] = {
+	KUNIT_CASE(test_ext4fs_dirhash_vectors),
+	KUNIT_CASE(test_ext4fs_dirhash_seed_changes_result),
+	KUNIT_CASE(test_ext4fs_dirhash_invalid_version_returns_einval),
+	KUNIT_CASE(test_ext4fs_dirhash_siphash_without_key_returns_einval),
+	KUNIT_CASE(test_ext4fs_dirhash_signed_unsigned_differ_on_nonascii),
+#if IS_ENABLED(CONFIG_UNICODE)	/* these two call utf8_load() */
+	KUNIT_CASE(test_ext4fs_dirhash_casefolded_names_hash_consistently),
+	KUNIT_CASE(test_ext4fs_dirhash_casefold_fallback),
+#endif /* CONFIG_UNICODE */
+	{}	/* empty terminating entry */
+};
+
+static struct kunit_suite ext4_hash_test_suite = {
+	.name = "ext4_hash",			/* suite name */
+	.test_cases = ext4_hash_test_cases,	/* list ends with an empty entry */
+};
+
+kunit_test_suites(&ext4_hash_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 48483cd015d3..978bd92da0ad 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -9,6 +9,7 @@
 #include <linux/unicode.h>
 #include <linux/compiler.h>
 #include <linux/bitops.h>
+#include <linux/unaligned.h>
 #include "ext4.h"
 
 #define DELTA 0x9E3779B9
@@ -141,21 +142,28 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
 	pad = (__u32)len | ((__u32)len << 8);
 	pad |= pad << 16;
 
-	val = pad;
 	if (len > num*4)
 		len = num * 4;
-	for (i = 0; i < len; i++) {
-		val = ((int) scp[i]) + (val << 8);
-		if ((i % 4) == 3) {
-			*buf++ = val;
-			val = pad;
-			num--;
-		}
+
+	while (len >= 4) {
+		val = ((__u32)scp[0] << 24) + ((__u32)scp[1] << 16) + ((__u32)scp[2] << 8) + scp[3];
+		*buf++ = val;
+		scp += 4;
+		len -= 4;
+		num--;
 	}
+
+	val = pad;
+
+	for (i = 0; i < len; i++)
+		val = scp[i] + (val << 8);
+
 	if (--num >= 0)
 		*buf++ = val;
+
 	while (--num >= 0)
 		*buf++ = pad;
+
 }
 
 static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
@@ -167,21 +175,28 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
 	pad = (__u32)len | ((__u32)len << 8);
 	pad |= pad << 16;
 
-	val = pad;
 	if (len > num*4)
 		len = num * 4;
-	for (i = 0; i < len; i++) {
-		val = ((int) ucp[i]) + (val << 8);
-		if ((i % 4) == 3) {
-			*buf++ = val;
-			val = pad;
-			num--;
-		}
+
+	while (len >= 4) {
+		val = get_unaligned_be32(ucp);
+		*buf++ = val;
+		ucp += 4;
+		len -= 4;
+		num--;
 	}
+
+	val = pad;
+
+	for (i = 0; i < len; i++)
+		val = ucp[i] + (val << 8);
+
 	if (--num >= 0)
 		*buf++ = val;
+
 	while (--num >= 0)
 		*buf++ = pad;
+
 }
 
 /*
@@ -205,8 +220,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
 	const char	*p;
 	int		i;
 	__u32		in[8], buf[4];
-	void		(*str2hashbuf)(const char *, int, __u32 *, int) =
-				str2hashbuf_signed;
+	bool use_unsigned = false;
 
 	/* Initialize the default seed for the hash checksum functions */
 	buf[0] = 0x67452301;
@@ -232,12 +246,15 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
 		hash = dx_hack_hash_signed(name, len);
 		break;
 	case DX_HASH_HALF_MD4_UNSIGNED:
-		str2hashbuf = str2hashbuf_unsigned;
+		use_unsigned = true;
 		fallthrough;
 	case DX_HASH_HALF_MD4:
 		p = name;
 		while (len > 0) {
-			(*str2hashbuf)(p, len, in, 8);
+			if (use_unsigned)
+				str2hashbuf_unsigned(p, len, in, 8);
+			else
+				str2hashbuf_signed(p, len, in, 8);
 			half_md4_transform(buf, in);
 			len -= 32;
 			p += 32;
@@ -246,12 +263,15 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
 		hash = buf[1];
 		break;
 	case DX_HASH_TEA_UNSIGNED:
-		str2hashbuf = str2hashbuf_unsigned;
+		use_unsigned = true;
 		fallthrough;
 	case DX_HASH_TEA:
 		p = name;
 		while (len > 0) {
-			(*str2hashbuf)(p, len, in, 4);
+			if (use_unsigned)
+				str2hashbuf_unsigned(p, len, in, 4);
+			else
+				str2hashbuf_signed(p, len, in, 4);
 			TEA_transform(buf, in);
 			len -= 16;
 			p += 16;
@@ -321,3 +341,7 @@ opaque_seq:
 #endif
 	return __ext4fs_dirhash(dir, name, len, hinfo);
 }
+
+#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS)
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4fs_dirhash);
+#endif
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d..ce99807c5f5b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1560,7 +1560,8 @@ static int ext4_journalled_write_end(const struct kiocb *iocb,
 
 	BUG_ON(!ext4_handle_valid(handle));
 
-	if (ext4_has_inline_data(inode))
+	if (ext4_has_inline_data(inode) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
 		return ext4_write_inline_data_end(inode, pos, len, copied,
 						  folio);
 
@@ -5025,6 +5026,57 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 	return ret;
 }
 
+/*
+ * Best-effort ext4_get_inode_loc() that never starts I/O: the inode table
+ * block must already be in the buffer cache and uptodate, otherwise -EAGAIN
+ * is returned. On success iloc->bh holds the buffer that was found.
+ */
+int ext4_get_inode_loc_noio(struct inode *inode, struct ext4_iloc *iloc)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned long ino = inode->i_ino;
+	int per_block = sbi->s_inodes_per_block;
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh;
+	ext4_fsblk_t table;
+	int index;
+
+	iloc->bh = NULL;
+	if (ino < EXT4_ROOT_INO || ino > le32_to_cpu(sbi->s_es->s_inodes_count))
+		return -EFSCORRUPTED;
+
+	iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	desc = ext4_get_group_desc(sb, iloc->block_group, NULL);
+	if (!desc)
+		return -EIO;
+
+	/* Inode's index within its group and byte offset within its block. */
+	index = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+	iloc->offset = (index % per_block) * EXT4_INODE_SIZE(sb);
+
+	table = ext4_inode_table(sb, desc);
+	if (table <= le32_to_cpu(sbi->s_es->s_first_data_block) ||
+	    table >= ext4_blocks_count(sbi->s_es)) {
+		ext4_error(sb,
+			   "Invalid inode table block %llu in block_group %u",
+			   table, iloc->block_group);
+		return -EFSCORRUPTED;
+	}
+
+	/* Cache lookup only: a miss or a non-uptodate buffer gives -EAGAIN. */
+	bh = sb_find_get_block(sb, table + index / per_block);
+	if (bh && !ext4_buffer_uptodate(bh)) {
+		brelse(bh);
+		bh = NULL;
+	}
+	if (!bh)
+		return -EAGAIN;
+
+	iloc->bh = bh;
+	return 0;
+}
+
 
 int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
 			  struct ext4_iloc *iloc)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1d0c3d4bdf47..c8387e6a2c6e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -830,11 +830,17 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags)
 		bdev_thaw(sb->s_bdev);
 		break;
 	case EXT4_GOING_FLAGS_LOGFLUSH:
+		/*
+		 * Call ext4_force_commit() before setting EXT4_FLAGS_SHUTDOWN.
+		 * This is because in data=ordered mode, journal commit
+		 * triggers data writeback which fails if shutdown is already
+		 * set, causing the journal to be aborted prematurely before
+		 * the commit succeeds.
+		 */
+		(void) ext4_force_commit(sb);
 		set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
-		if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) {
-			(void) ext4_force_commit(sb);
+		if (sbi->s_journal && !is_journal_aborted(sbi->s_journal))
 			jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN);
-		}
 		break;
 	case EXT4_GOING_FLAGS_NOLOGFLUSH:
 		set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
@@ -1650,6 +1656,9 @@ group_extend_out:
 		if (!(fd_file(donor)->f_mode & FMODE_WRITE))
 			return -EBADF;
 
+		if (file_inode(filp)->i_sb != file_inode(fd_file(donor))->i_sb)
+			return -EXDEV;
+
 		err = mnt_want_write_file(filp);
 		if (err)
 			return err;
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index d90da44aadbd..0424b8b0b4c3 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -727,8 +727,7 @@ do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap,
 	ext4_mb_generate_buddy_test(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP,
 			       ext4_grp);
 
-	KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize),
-			0);
+	KUNIT_ASSERT_MEMEQ(test, mbt_buddy, ext4_buddy, sb->s_blocksize);
 	mbt_validate_group_info(test, mbt_grp, ext4_grp);
 }
 
@@ -789,8 +788,7 @@ test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b,
 		grp->bb_counters[i] = 0;
 	ext4_mb_generate_buddy_test(sb, buddy, bitmap, 0, grp);
 
-	KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
-			0);
+	KUNIT_ASSERT_MEMEQ(test, buddy, e4b->bd_buddy, sb->s_blocksize);
 	mbt_validate_group_info(test, grp, e4b->bd_info);
 }
 
@@ -854,8 +852,7 @@ test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b,
 		grp->bb_counters[i] = 0;
 	ext4_mb_generate_buddy_test(sb, buddy, bitmap, 0, grp);
 
-	KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
-			0);
+	KUNIT_ASSERT_MEMEQ(test, buddy, e4b->bd_buddy, sb->s_blocksize);
 	mbt_validate_group_info(test, grp, e4b->bd_info);
 
 }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 4a47fbd8dd30..cc49ae04a6f6 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -145,9 +145,9 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	if (IS_ERR(bh)) {
 		__ext4_warning(inode->i_sb, func, line,
 			       "inode #%llu: lblock %lu: comm %s: "
-			       "error %ld reading directory block",
+			       "error %pe reading directory block",
 			       inode->i_ino, (unsigned long)block,
-			       current->comm, PTR_ERR(bh));
+			       current->comm, bh);
 
 		return bh;
 	}
@@ -3054,7 +3054,7 @@ out_stop:
 out_retry:
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
-	return ERR_PTR(err);
+	return err ? ERR_PTR(err) : NULL;
 }
 
 /*
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dc82e7b57e75..bc674aa4a656 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -168,7 +168,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
  * written. On IO failure, check if journal abort is needed. Note that
  * we are protected from truncate touching same part of extent tree by the
  * fact that truncate code waits for all DIO to finish (thus exclusion from
- * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * direct IO is achieved) and also waits for writeback to complete. Thus we
  * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
  * completed (happens from ext4_free_ioend()).
  */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7283108d7609..245f67d10ded 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1431,6 +1431,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ext4_fc_init_inode(&ei->vfs_inode);
 	spin_lock_init(&ei->i_fc_lock);
 	mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data);
+#ifdef CONFIG_LOCKDEP
+	lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_NORMAL);
+#endif
 	return &ei->vfs_inode;
 }
 
@@ -4541,6 +4544,7 @@ static void ext4_fast_commit_init(struct super_block *sb)
 	sbi->s_fc_ineligible_tid = 0;
 	mutex_init(&sbi->s_fc_lock);
 	memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+	memset(&sbi->s_fc_snap_stats, 0, sizeof(sbi->s_fc_snap_stats));
 	sbi->s_fc_replay_state.fc_regions = NULL;
 	sbi->s_fc_replay_state.fc_regions_size = 0;
 	sbi->s_fc_replay_state.fc_regions_used = 0;
@@ -5910,6 +5914,11 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
 		return ERR_PTR(-EFSCORRUPTED);
 	}
 
+#ifdef CONFIG_LOCKDEP
+	lockdep_set_subclass(&EXT4_I(journal_inode)->i_data_sem,
+			     I_DATA_SEM_JOURNAL);
+#endif
+
 	ext4_debug("Journal inode found at %p: %lld bytes\n",
 		  journal_inode, journal_inode->i_size);
 	return journal_inode;
@@ -5977,8 +5986,8 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
 		sb, &fs_holder_ops);
 	if (IS_ERR(bdev_file)) {
 		ext4_msg(sb, KERN_ERR,
-			 "failed to open journal device unknown-block(%u,%u) %ld",
-			 MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file));
+			 "failed to open journal device unknown-block(%u,%u) %pe",
+			 MAJOR(j_dev), MINOR(j_dev), bdev_file);
 		return bdev_file;
 	}
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d8577725a2fb..3029cb6f6d64 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -512,10 +512,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		 * leave undo-committed data.
 		 */
 		if (jh->b_committed_data) {
-			struct buffer_head *bh = jh2bh(jh);
-
 			spin_lock(&jh->b_state_lock);
-			jbd2_free(jh->b_committed_data, bh->b_size);
+			kfree(jh->b_committed_data);
 			jh->b_committed_data = NULL;
 			spin_unlock(&jh->b_state_lock);
 		}
@@ -976,7 +974,7 @@ restart_loop:
 		 * its triggers if they exist, so we can clear that too.
 		 */
 		if (jh->b_committed_data) {
-			jbd2_free(jh->b_committed_data, bh->b_size);
+			kfree(jh->b_committed_data);
 			jh->b_committed_data = NULL;
 			if (jh->b_frozen_data) {
 				jh->b_committed_data = jh->b_frozen_data;
@@ -984,7 +982,7 @@ restart_loop:
 				jh->b_frozen_triggers = NULL;
 			}
 		} else if (jh->b_frozen_data) {
-			jbd2_free(jh->b_frozen_data, bh->b_size);
+			kfree(jh->b_frozen_data);
 			jh->b_frozen_data = NULL;
 			jh->b_frozen_triggers = NULL;
 		}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e82798680109..09efa337649e 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -95,8 +95,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 EXPORT_SYMBOL(jbd2_inode_cache);
 
-static int jbd2_journal_create_slab(size_t slab_size);
-
 #ifdef CONFIG_JBD2_DEBUG
 void __jbd2_debug(int level, const char *file, const char *func,
 		  unsigned int line, const char *fmt, ...)
@@ -385,10 +383,10 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 			goto escape_done;
 
 		spin_unlock(&jh_in->b_state_lock);
-		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
+		tmp = kmalloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
 		spin_lock(&jh_in->b_state_lock);
 		if (jh_in->b_frozen_data) {
-			jbd2_free(tmp, bh_in->b_size);
+			kfree(tmp);
 			goto copy_done;
 		}
 
@@ -2062,14 +2060,6 @@ EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
 int jbd2_journal_load(journal_t *journal)
 {
 	int err;
-	journal_superblock_t *sb = journal->j_superblock;
-
-	/*
-	 * Create a slab for this blocksize
-	 */
-	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
-	if (err)
-		return err;
 
 	/* Let the recovery code check whether it needs to recover any
 	 * data from the journal. */
@@ -2261,6 +2251,8 @@ jbd2_journal_initialize_fast_commit(journal_t *journal)
 	unsigned long long num_fc_blks;
 
 	num_fc_blks = jbd2_journal_get_num_fc_blks(sb);
+	if (num_fc_blks > journal->j_last)
+		return -EFSCORRUPTED;
 	if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS)
 		return -ENOSPC;
 
@@ -2698,105 +2690,6 @@ size_t journal_tag_bytes(journal_t *journal)
 }
 
 /*
- * JBD memory management
- *
- * These functions are used to allocate block-sized chunks of memory
- * used for making copies of buffer_head data.  Very often it will be
- * page-sized chunks of data, but sometimes it will be in
- * sub-page-size chunks.  (For example, 16k pages on Power systems
- * with a 4k block file system.)  For blocks smaller than a page, we
- * use a SLAB allocator.  There are slab caches for each block size,
- * which are allocated at mount time, if necessary, and we only free
- * (all of) the slab caches when/if the jbd2 module is unloaded.  For
- * this reason we don't need to a mutex to protect access to
- * jbd2_slab[] allocating or releasing memory; only in
- * jbd2_journal_create_slab().
- */
-#define JBD2_MAX_SLABS 8
-static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
-
-static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
-	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
-	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
-};
-
-
-static void jbd2_journal_destroy_slabs(void)
-{
-	int i;
-
-	for (i = 0; i < JBD2_MAX_SLABS; i++) {
-		kmem_cache_destroy(jbd2_slab[i]);
-		jbd2_slab[i] = NULL;
-	}
-}
-
-static int jbd2_journal_create_slab(size_t size)
-{
-	static DEFINE_MUTEX(jbd2_slab_create_mutex);
-	int i = order_base_2(size) - 10;
-	size_t slab_size;
-
-	if (size == PAGE_SIZE)
-		return 0;
-
-	if (i >= JBD2_MAX_SLABS)
-		return -EINVAL;
-
-	if (unlikely(i < 0))
-		i = 0;
-	mutex_lock(&jbd2_slab_create_mutex);
-	if (jbd2_slab[i]) {
-		mutex_unlock(&jbd2_slab_create_mutex);
-		return 0;	/* Already created */
-	}
-
-	slab_size = 1 << (i+10);
-	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
-					 slab_size, 0, NULL);
-	mutex_unlock(&jbd2_slab_create_mutex);
-	if (!jbd2_slab[i]) {
-		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-static struct kmem_cache *get_slab(size_t size)
-{
-	int i = order_base_2(size) - 10;
-
-	BUG_ON(i >= JBD2_MAX_SLABS);
-	if (unlikely(i < 0))
-		i = 0;
-	BUG_ON(jbd2_slab[i] == NULL);
-	return jbd2_slab[i];
-}
-
-void *jbd2_alloc(size_t size, gfp_t flags)
-{
-	void *ptr;
-
-	BUG_ON(size & (size-1)); /* Must be a power of 2 */
-
-	if (size < PAGE_SIZE)
-		ptr = kmem_cache_alloc(get_slab(size), flags);
-	else
-		ptr = kmalloc(size, flags);
-
-	/* Check alignment; SLUB has gotten this wrong in the past,
-	 * and this can lead to user data corruption! */
-	BUG_ON(((unsigned long) ptr) & (size-1));
-
-	return ptr;
-}
-
-void jbd2_free(void *ptr, size_t size)
-{
-	kfree(ptr);
-};
-
-/*
  * Journal_head storage management
  */
 static struct kmem_cache *jbd2_journal_head_cache;
@@ -2969,15 +2862,15 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
 	clear_buffer_jbd(bh);
 }
 
-static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
+static void journal_release_journal_head(struct journal_head *jh)
 {
 	if (jh->b_frozen_data) {
 		printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
-		jbd2_free(jh->b_frozen_data, b_size);
+		kfree(jh->b_frozen_data);
 	}
 	if (jh->b_committed_data) {
 		printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
-		jbd2_free(jh->b_committed_data, b_size);
+		kfree(jh->b_committed_data);
 	}
 	journal_free_journal_head(jh);
 }
@@ -2996,7 +2889,7 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
 	if (!jh->b_jcount) {
 		__journal_remove_journal_head(bh);
 		jbd_unlock_bh_journal_head(bh);
-		journal_release_journal_head(jh, bh->b_size);
+		journal_release_journal_head(jh);
 		__brelse(bh);
 	} else {
 		jbd_unlock_bh_journal_head(bh);
@@ -3138,7 +3031,6 @@ static void jbd2_journal_destroy_caches(void)
 	jbd2_journal_destroy_handle_cache();
 	jbd2_journal_destroy_inode_cache();
 	jbd2_journal_destroy_transaction_cache();
-	jbd2_journal_destroy_slabs();
 }
 
 static int __init journal_init(void)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4885903bbd10..5cc7d097b2ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1131,7 +1131,7 @@ repeat:
 		if (!frozen_buffer) {
 			JBUFFER_TRACE(jh, "allocate memory for buffer");
 			spin_unlock(&jh->b_state_lock);
-			frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
+			frozen_buffer = kmalloc(jh2bh(jh)->b_size,
 						   GFP_NOFS | __GFP_NOFAIL);
 			goto repeat;
 		}
@@ -1159,7 +1159,7 @@ done:
 
 out:
 	if (unlikely(frozen_buffer))	/* It's usually NULL */
-		jbd2_free(frozen_buffer, bh->b_size);
+		kfree(frozen_buffer);
 
 	JBUFFER_TRACE(jh, "exit");
 	return error;
@@ -1424,7 +1424,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
 
 repeat:
 	if (!jh->b_committed_data)
-		committed_data = jbd2_alloc(jh2bh(jh)->b_size,
+		committed_data = kmalloc(jh2bh(jh)->b_size,
 					    GFP_NOFS|__GFP_NOFAIL);
 
 	spin_lock(&jh->b_state_lock);
@@ -1445,7 +1445,7 @@ repeat:
 out:
 	jbd2_journal_put_journal_head(jh);
 	if (unlikely(committed_data))
-		jbd2_free(committed_data, bh->b_size);
+		kfree(committed_data);
 	return err;
 }
 
@@ -1516,14 +1516,19 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
  */
 int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 {
-	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	transaction_t *transaction;
+	journal_t *journal;
 	struct journal_head *jh;
 	int ret = 0;
 
+	if (is_handle_aborted(handle))
+		return -EROFS;
 	if (!buffer_jbd(bh))
 		return -EUCLEAN;
 
+	transaction = handle->h_transaction;
+	journal = transaction->t_journal;
+
 	/*
 	 * We don't grab jh reference here since the buffer must be part
 	 * of the running transaction.
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 7e785aa6d35d..b68561187e90 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -63,9 +63,6 @@ void __jbd2_debug(int level, const char *file, const char *func,
 #define jbd2_debug(n, fmt, a...)  no_printk(fmt, ##a)
 #endif
 
-extern void *jbd2_alloc(size_t size, gfp_t flags);
-extern void jbd2_free(void *ptr, size_t size);
-
 #define JBD2_MIN_JOURNAL_BLOCKS 1024
 #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256
 
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index f493642cf121..7028a28316fa 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -107,6 +107,26 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY);
 TRACE_DEFINE_ENUM(EXT4_FC_REASON_MOVE_EXT);
 TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
 
+#undef EM
+#undef EMe
+#define EM(a)	TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
+#define EMe(a)	TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
+
+#define TRACE_SNAP_ERR						\
+	EM(NONE)						\
+	EM(ES_MISS)						\
+	EM(ES_DELAYED)						\
+	EM(ES_OTHER)						\
+	EM(INODES_CAP)						\
+	EM(RANGES_CAP)						\
+	EM(NOMEM)						\
+	EMe(INODE_LOC)
+
+TRACE_SNAP_ERR
+
+#undef EM
+#undef EMe
+
 #define show_fc_reason(reason)						\
 	__print_symbolic(reason,					\
 		{ EXT4_FC_REASON_XATTR,		"XATTR"},		\
@@ -2818,6 +2838,47 @@ TRACE_EVENT(ext4_fc_commit_stop,
 		  __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid)
 );
 
+#define EM(a)	{ EXT4_FC_SNAP_ERR_##a, #a },
+#define EMe(a)	{ EXT4_FC_SNAP_ERR_##a, #a }
+
+TRACE_EVENT(ext4_fc_lock_updates,
+	    TP_PROTO(struct super_block *sb, tid_t commit_tid, u64 locked_ns,
+		     unsigned int nr_inodes, unsigned int nr_ranges, int err,
+		     int snap_err),
+
+	TP_ARGS(sb, commit_tid, locked_ns, nr_inodes, nr_ranges, err, snap_err),
+
+	TP_STRUCT__entry(/* entry */
+		__field(dev_t, dev)
+		__field(tid_t, tid)
+		__field(u64, locked_ns)
+		__field(unsigned int, nr_inodes)
+		__field(unsigned int, nr_ranges)
+		__field(int, err)
+		__field(int, snap_err)
+	),
+
+	TP_fast_assign(/* assign */
+		__entry->dev = sb->s_dev;
+		__entry->tid = commit_tid;
+		__entry->locked_ns = locked_ns;
+		__entry->nr_inodes = nr_inodes;
+		__entry->nr_ranges = nr_ranges;
+		__entry->err = err;
+		__entry->snap_err = snap_err;
+	),
+
+	TP_printk("dev %d,%d tid %u locked_ns %llu nr_inodes %u nr_ranges %u err %d snap_err %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
+		  __entry->locked_ns, __entry->nr_inodes, __entry->nr_ranges,
+		  __entry->err, __print_symbolic(__entry->snap_err,
+						 TRACE_SNAP_ERR))
+);
+
+#undef EM
+#undef EMe
+#undef TRACE_SNAP_ERR
+
 #define FC_REASON_NAME_STAT(reason)					\
 	show_fc_reason(reason),						\
 	__entry->fc_ineligible_rc[reason]