summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- fs/ext4/Makefile 2
-rw-r--r-- fs/ext4/ext4.h 93
-rw-r--r-- fs/ext4/extents.c 4
-rw-r--r-- fs/ext4/fast_commit.c 784
-rw-r--r-- fs/ext4/hash-test.c 567
-rw-r--r-- fs/ext4/hash.c 68
-rw-r--r-- fs/ext4/inode.c 54
-rw-r--r-- fs/ext4/ioctl.c 15
-rw-r--r-- fs/ext4/mballoc-test.c 9
-rw-r--r-- fs/ext4/namei.c 6
-rw-r--r-- fs/ext4/page-io.c 2
-rw-r--r-- fs/ext4/super.c 13
-rw-r--r-- fs/jbd2/commit.c 8
-rw-r--r-- fs/jbd2/journal.c 124
-rw-r--r-- fs/jbd2/transaction.c 17
-rw-r--r-- include/linux/jbd2.h 3
-rw-r--r-- include/trace/events/ext4.h 61
17 files changed, 1495 insertions, 335 deletions
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 3baee4e7c1cf..3f9fc0eb8eca 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -15,7 +15,7 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
ext4-test-objs += inode-test.o mballoc-test.o \
- extents-test.o
+ extents-test.o hash-test.o
obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-test.o
ext4-$(CONFIG_FS_VERITY) += verity.o
ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6af11f0ff1c5..b37c136ea3ab 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1015,14 +1015,32 @@ do { \
* than the first
* I_DATA_SEM_QUOTA - Used for quota inodes only
* I_DATA_SEM_EA - Used for ea_inodes only
+ * I_DATA_SEM_JOURNAL - Used for journal inode only
*/
enum {
I_DATA_SEM_NORMAL = 0,
I_DATA_SEM_OTHER,
I_DATA_SEM_QUOTA,
- I_DATA_SEM_EA
+ I_DATA_SEM_EA,
+ I_DATA_SEM_JOURNAL
};
+struct ext4_fc_inode_snap;
+
+/*
+ * Snapshot failure reasons for ext4_fc_lock_updates tracepoint.
+ * Keep these stable for tooling.
+ */
+enum ext4_fc_snap_err {
+ EXT4_FC_SNAP_ERR_NONE = 0, /* no snapshot failure recorded */
+ EXT4_FC_SNAP_ERR_ES_MISS, /* extent status lookup found no entry */
+ EXT4_FC_SNAP_ERR_ES_DELAYED, /* extent status entry is delayed */
+ EXT4_FC_SNAP_ERR_ES_OTHER, /* entry is not hole/written/unwritten */
+ EXT4_FC_SNAP_ERR_INODES_CAP, /* more inodes than the snapshot array holds */
+ EXT4_FC_SNAP_ERR_RANGES_CAP, /* per-commit range limit reached */
+ EXT4_FC_SNAP_ERR_NOMEM, /* snapshot memory allocation failed */
+ EXT4_FC_SNAP_ERR_INODE_LOC, /* no-I/O inode location lookup failed */
+};
/*
* fourth extended file system inode data in memory
@@ -1079,6 +1097,22 @@ struct ext4_inode_info {
/* End of lblk range that needs to be committed in this fast commit */
ext4_lblk_t i_fc_lblk_len;
+ /*
+ * Commit-time fast commit snapshots.
+ *
+ * i_fc_snap is installed and freed under sbi->s_fc_lock. The fast
+ * commit log writing path reads the snapshot under sbi->s_fc_lock while
+ * serializing fast commit TLVs.
+ *
+ * The snapshot lifetime is bounded by EXT4_STATE_FC_COMMITTING and the
+ * corresponding cleanup / eviction paths.
+ *
+ * i_fc_snap points to per-inode snapshot data for fast commit:
+ * - a raw inode snapshot for EXT4_FC_TAG_INODE
+ * - data range records for EXT4_FC_TAG_{ADD,DEL}_RANGE
+ */
+ struct ext4_fc_inode_snap *i_fc_snap;
+
spinlock_t i_raw_lock; /* protects updates to the raw inode */
/*
@@ -1517,6 +1551,36 @@ struct ext4_orphan_info {
};
/*
+ * Ext4 fast commit snapshot statistics.
+ *
+ * These are best-effort counters intended for debugging / performance
+ * introspection; they are not exact under concurrent updates.
+ */
+struct ext4_fc_snap_stats {
+ /*
+ * Time spent between jbd2_journal_lock_updates() returning and
+ * jbd2_journal_unlock_updates() completing: running sum, largest
+ * single sample, and number of samples, all in nanoseconds/counts.
+ */
+ atomic64_t lock_updates_ns_total;
+ atomic64_t lock_updates_ns_max;
+ atomic64_t lock_updates_samples;
+
+ /* Successfully installed inode snapshots and their data ranges. */
+ atomic64_t snap_inodes;
+ atomic64_t snap_ranges;
+
+ /* Snapshot failures caused by the extent status lookup result. */
+ atomic64_t snap_fail_es_miss;
+ atomic64_t snap_fail_es_delayed;
+ atomic64_t snap_fail_es_other;
+
+ /* Snapshot failures caused by limits, allocation or inode lookup. */
+ atomic64_t snap_fail_inodes_cap;
+ atomic64_t snap_fail_ranges_cap;
+ atomic64_t snap_fail_nomem;
+ atomic64_t snap_fail_inode_loc;
+
+ /*
+ * Missing inode snapshots during log writing should never happen.
+ * Keep this counter to help catch unexpected regressions.
+ */
+ atomic64_t snap_fail_no_snap;
+};
+
+/*
* fourth extended-fs super-block data in memory
*/
struct ext4_sb_info {
@@ -1790,6 +1854,7 @@ struct ext4_sb_info {
struct mutex s_fc_lock;
struct buffer_head *s_fc_bh;
struct ext4_fc_stats s_fc_stats;
+ struct ext4_fc_snap_stats s_fc_snap_stats;
tid_t s_fc_ineligible_tid;
#ifdef CONFIG_EXT4_DEBUG
int s_fc_debug_max_replay;
@@ -1972,6 +2037,7 @@ enum {
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */
EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
+ EXT4_STATE_FC_REQUEUE, /* Inode modified during fast commit */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -2000,6 +2066,8 @@ EXT4_INODE_BIT_FNS(flag, flags, 0)
static inline int ext4_test_inode_state(struct inode *inode, int bit);
static inline void ext4_set_inode_state(struct inode *inode, int bit);
static inline void ext4_clear_inode_state(struct inode *inode, int bit);
+static inline unsigned long *ext4_inode_state_wait_word(struct inode *inode);
+static inline int ext4_inode_state_wait_bit(int bit);
#if (BITS_PER_LONG < 64)
EXT4_INODE_BIT_FNS(state, state_flags, 0)
@@ -2015,6 +2083,24 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
/* We depend on the fact that callers will set i_flags */
}
#endif
+
+/*
+ * Return the address of the word used for wait-bit operations on the inode's
+ * EXT4_STATE_* bits: i_state_flags when BITS_PER_LONG < 64, i_flags
+ * otherwise. Pair with ext4_inode_state_wait_bit() for the bit number.
+ */
+static inline unsigned long *ext4_inode_state_wait_word(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+#if (BITS_PER_LONG >= 64)
+	return &ei->i_flags;
+#else
+	return &ei->i_state_flags;
+#endif
+}
+
+/*
+ * Translate an EXT4_STATE_* bit number into the bit index to use with the
+ * word returned by ext4_inode_state_wait_word(): unchanged when
+ * BITS_PER_LONG < 64, shifted up by 32 otherwise.
+ */
+static inline int ext4_inode_state_wait_bit(int bit)
+{
+#if (BITS_PER_LONG >= 64)
+	bit += 32;
+#endif
+	return bit;
+}
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
* a kernel struct super_block. This will allow us to call the feature-test
@@ -3080,8 +3166,9 @@ extern int ext4_file_getattr(struct mnt_idmap *, const struct path *,
struct kstat *, u32, unsigned int);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
-extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
-extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc);
+int ext4_get_inode_loc_noio(struct inode *inode, struct ext4_iloc *iloc);
+int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
struct ext4_iloc *iloc);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 125f628e738a..91c97af64b31 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3268,8 +3268,8 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
*/
path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL);
if (IS_ERR(path)) {
- EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld",
- split, PTR_ERR(path));
+ EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %pe",
+ split, path);
goto out_path;
}
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 5773b85e43cb..8e2259799614 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -56,21 +56,22 @@
* deleted while it is being flushed.
* [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
* state.
- * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
- * all the exsiting handles finish and no new handles can start.
- * [4] Mark all the fast commit eligible inodes as undergoing fast commit
- * by setting "EXT4_STATE_FC_COMMITTING" state.
- * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
- * starting of new handles. If new handles try to start an update on
- * any of the inodes that are being committed, ext4_fc_track_inode()
- * will block until those inodes have finished the fast commit.
+ * [3] Lock the journal by calling jbd2_journal_lock_updates(). This ensures
+ * that all the existing handles finish and no new handles can start.
+ * [4] Mark all the fast commit eligible inodes as undergoing fast commit by
+ * setting "EXT4_STATE_FC_COMMITTING" state, and snapshot the inode state
+ * needed for log writing.
+ * [5] Unlock the journal by calling jbd2_journal_unlock_updates(). This allows
+ * starting of new handles. Updates to inodes being fast committed are
+ * tracked for requeue rather than blocking.
* [6] Commit all the directory entry updates in the fast commit space.
- * [7] Commit all the changed inodes in the fast commit space and clear
- * "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [7] Commit all the changed inodes in the fast commit space.
* [8] Write tail tag (this tag ensures the atomicity, please read the following
* section for more details).
+ * [9] Clear "EXT4_STATE_FC_COMMITTING" and wake up waiters in
+ * ext4_fc_cleanup().
*
- * All the inode updates must be enclosed within jbd2_jounrnal_start()
+ * All the inode updates must be enclosed within jbd2_journal_start()
* and jbd2_journal_stop() similar to JBD2 journaling.
*
* Fast Commit Ineligibility
@@ -183,6 +184,21 @@
#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;
+static struct kmem_cache *ext4_fc_range_cachep;
+
+/*
+ * Avoid spending unbounded time/memory snapshotting highly fragmented files
+ * under jbd2_journal_lock_updates(). If we exceed this limit, fall back to
+ * full commit.
+ */
+#define EXT4_FC_SNAPSHOT_MAX_INODES 1024
+#define EXT4_FC_SNAPSHOT_MAX_RANGES 2048
+
+/*
+ * Record @err in *@snap_err, keeping only the first failure: nothing is
+ * written when @snap_err is NULL or already holds a reason other than
+ * EXT4_FC_SNAP_ERR_NONE.
+ */
+static inline void ext4_fc_set_snap_err(int *snap_err, int err)
+{
+	if (!snap_err)
+		return;
+	if (*snap_err != EXT4_FC_SNAP_ERR_NONE)
+		return;
+
+	*snap_err = err;
+}
static void ext4_end_buffer_io_sync(struct bio *bio)
{
@@ -203,6 +219,8 @@ static void ext4_end_buffer_io_sync(struct bio *bio)
unlock_buffer(bh);
}
+static void ext4_fc_free_inode_snap(struct inode *inode);
+
static inline void ext4_fc_reset_inode(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -217,8 +235,10 @@ void ext4_fc_init_inode(struct inode *inode)
ext4_fc_reset_inode(inode);
ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+ ext4_clear_inode_state(inode, EXT4_STATE_FC_REQUEUE);
INIT_LIST_HEAD(&ei->i_fc_list);
INIT_LIST_HEAD(&ei->i_fc_dilist);
+ ei->i_fc_snap = NULL;
}
static bool ext4_fc_disabled(struct super_block *sb)
@@ -234,6 +254,50 @@ static bool ext4_fc_eligible(struct super_block *sb)
}
/*
+ * Wait for an inode fast-commit state bit to clear while dropping the
+ * fast-commit lock around schedule().
+ */
+static void ext4_fc_wait_inode_state(struct inode *inode, int bit,
+ int *alloc_ctx)
+{
+ wait_queue_head_t *wq;
+ unsigned long *wait_word = ext4_inode_state_wait_word(inode);
+ int wait_bit = ext4_inode_state_wait_bit(bit);
+
+ /* Loop because the bit may be set again by the time we re-take the lock. */
+ while (ext4_test_inode_state(inode, bit)) {
+ DEFINE_WAIT_BIT(wait, wait_word, wait_bit);
+
+ wq = bit_waitqueue(wait_word, wait_bit);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ /*
+ * Re-test after queueing ourselves so a wakeup issued between
+ * the loop condition and prepare_to_wait() is not missed.
+ */
+ if (ext4_test_inode_state(inode, bit)) {
+ /*
+ * Release the fast-commit lock while sleeping and take
+ * it again afterwards; *alloc_ctx is refreshed with the
+ * value returned by the new ext4_fc_lock() call.
+ */
+ ext4_fc_unlock(inode->i_sb, *alloc_ctx);
+ schedule();
+ *alloc_ctx = ext4_fc_lock(inode->i_sb);
+ }
+ finish_wait(wq, &wait.wq_entry);
+ }
+}
+
+/*
+ * Wake tasks sleeping on inode state @bit, using the same word/bit mapping
+ * that ext4_fc_wait_inode_state() waits on.
+ */
+static inline void ext4_fc_wake_inode_state(struct inode *inode, int bit)
+{
+	unsigned long *word = ext4_inode_state_wait_word(inode);
+	int word_bit = ext4_inode_state_wait_bit(bit);
+
+	wake_up_bit(word, word_bit);
+}
+
+/*
+ * Raise *@stat to @value if @value is larger than what is stored there.
+ * Uses a compare-and-exchange retry loop so concurrent updaters never lower
+ * the stored maximum.
+ */
+static void ext4_fc_snap_stats_update_max(atomic64_t *stat, u64 value)
+{
+	u64 seen = atomic64_read(stat);
+
+	for (;;) {
+		u64 witnessed;
+
+		/* Stored maximum is already at least @value: nothing to do. */
+		if (value <= seen)
+			return;
+
+		witnessed = atomic64_cmpxchg(stat, seen, value);
+		if (witnessed == seen)
+			return;
+
+		/* Lost the race; retry against the value that beat us. */
+		seen = witnessed;
+	}
+}
+
+/*
* Remove inode from fast commit list. If the inode is being committed
* we wait until inode commit is done.
*/
@@ -241,7 +305,6 @@ void ext4_fc_del(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_fc_dentry_update *fc_dentry;
- wait_queue_head_t *wq;
int alloc_ctx;
if (ext4_fc_disabled(inode->i_sb))
@@ -249,59 +312,43 @@ void ext4_fc_del(struct inode *inode)
alloc_ctx = ext4_fc_lock(inode->i_sb);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
+ ext4_fc_free_inode_snap(inode);
ext4_fc_unlock(inode->i_sb, alloc_ctx);
return;
}
/*
- * Since ext4_fc_del is called from ext4_evict_inode while having a
- * handle open, there is no need for us to wait here even if a fast
- * commit is going on. That is because, if this inode is being
- * committed, ext4_mark_inode_dirty would have waited for inode commit
- * operation to finish before we come here. So, by the time we come
- * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
- * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
- * here.
- *
- * We may come here without any handles open in the "no_delete" case of
- * ext4_evict_inode as well. However, if that happens, we first mark the
- * file system as fast commit ineligible anyway. So, even in that case,
- * it is okay to remove the inode from the fc list.
+ * Wait for ongoing fast commit to finish. We cannot remove the inode
+ * from fast commit lists while it is being committed. If we wake from
+ * FC_FLUSHING_DATA, re-check FC_COMMITTING before deleting because the
+ * commit thread sets FC_COMMITTING only after clearing FLUSHING_DATA.
*/
- WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
- && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
- while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
-#if (BITS_PER_LONG < 64)
- DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
- EXT4_STATE_FC_FLUSHING_DATA);
- wq = bit_waitqueue(&ei->i_state_flags,
- EXT4_STATE_FC_FLUSHING_DATA);
-#else
- DEFINE_WAIT_BIT(wait, &ei->i_flags,
- EXT4_STATE_FC_FLUSHING_DATA);
- wq = bit_waitqueue(&ei->i_flags,
- EXT4_STATE_FC_FLUSHING_DATA);
-#endif
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
- ext4_fc_unlock(inode->i_sb, alloc_ctx);
- schedule();
- alloc_ctx = ext4_fc_lock(inode->i_sb);
- }
- finish_wait(wq, &wait.wq_entry);
+ for (;;) {
+ ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_COMMITTING,
+ &alloc_ctx);
+
+ if (!ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA))
+ break;
+
+ ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA,
+ &alloc_ctx);
}
+
+ ext4_fc_free_inode_snap(inode);
list_del_init(&ei->i_fc_list);
/*
- * Since this inode is getting removed, let's also remove all FC
- * dentry create references, since it is not needed to log it anyways.
+ * Since this inode is getting removed, let's also remove all FC dentry
+ * create references, since it is not needed to log it anyways.
*/
if (list_empty(&ei->i_fc_dilist)) {
ext4_fc_unlock(inode->i_sb, alloc_ctx);
return;
}
- fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
+ fc_dentry = list_first_entry(&ei->i_fc_dilist,
+ struct ext4_fc_dentry_update,
+ fcd_dilist);
WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
list_del_init(&fc_dentry->fcd_list);
list_del_init(&fc_dentry->fcd_dilist);
@@ -373,6 +420,8 @@ static int ext4_fc_track_template(
tid = handle->h_transaction->t_tid;
spin_lock(&ei->i_fc_lock);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+ ext4_set_inode_state(inode, EXT4_STATE_FC_REQUEUE);
if (tid == ei->i_sync_tid) {
update = true;
} else {
@@ -543,8 +592,6 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
- struct ext4_inode_info *ei = EXT4_I(inode);
- wait_queue_head_t *wq;
int ret;
if (S_ISDIR(inode->i_mode))
@@ -560,29 +607,11 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
return;
/*
- * If we come here, we may sleep while waiting for the inode to
- * commit. We shouldn't be holding i_data_sem when we go to sleep since
- * the commit path needs to grab the lock while committing the inode.
+ * Fast commit snapshots inode state at commit time, so there's no need
+ * to wait for EXT4_STATE_FC_COMMITTING here. If the inode is already
+ * on the commit queue, ext4_fc_cleanup() will requeue it for the new
+ * transaction once the current commit finishes.
*/
- lockdep_assert_not_held(&ei->i_data_sem);
-
- while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
-#if (BITS_PER_LONG < 64)
- DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
-#else
- DEFINE_WAIT_BIT(wait, &ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
-#endif
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
- schedule();
- finish_wait(wq, &wait.wq_entry);
- }
/*
* From this point on, this inode will not be committed either
@@ -831,6 +860,21 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
return true;
}
+/* One snapshotted data range of an inode, later emitted as a range TLV. */
+struct ext4_fc_range {
+ struct list_head list; /* link in the snapshot's range list */
+ u16 tag; /* EXT4_FC_TAG_ADD_RANGE or EXT4_FC_TAG_DEL_RANGE */
+ ext4_lblk_t lblk; /* first logical block of the range */
+ ext4_lblk_t len; /* number of blocks in the range */
+ ext4_fsblk_t pblk; /* first physical block; left 0 for DEL_RANGE */
+ bool unwritten; /* ADD_RANGE only: extent is unwritten */
+};
+
+/* Per-inode fast commit snapshot referenced from ext4_inode_info.i_fc_snap. */
+struct ext4_fc_inode_snap {
+ struct list_head data_list; /* list of struct ext4_fc_range */
+ unsigned int inode_len; /* number of valid bytes in inode_buf */
+ u8 inode_buf[]; /* copy of the raw inode bytes */
+};
+
/*
* Writes inode in the fast commit space under TLV with tag @tag.
* Returns 0 on success, error on failure.
@@ -838,21 +882,27 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
- int ret;
- struct ext4_iloc iloc;
+ struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
+ struct ext4_fc_snap_stats *stats =
+ &EXT4_SB(inode->i_sb)->s_fc_snap_stats;
struct ext4_fc_inode fc_inode;
struct ext4_fc_tl tl;
u8 *dst;
+ u8 *src;
+ int inode_len;
+ int ret;
- ret = ext4_get_inode_loc(inode, &iloc);
- if (ret)
- return ret;
+ if (!snap) {
+ atomic64_inc(&stats->snap_fail_no_snap);
+ return -ECANCELED;
+ }
- if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
- inode_len = EXT4_INODE_SIZE(inode->i_sb);
- else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
- inode_len += ei->i_extra_isize;
+ src = snap->inode_buf;
+ inode_len = snap->inode_len;
+ if (!src || inode_len == 0) {
+ atomic64_inc(&stats->snap_fail_no_snap);
+ return -ECANCELED;
+ }
fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
@@ -868,10 +918,9 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
dst += EXT4_FC_TAG_BASE_LEN;
memcpy(dst, &fc_inode, sizeof(fc_inode));
dst += sizeof(fc_inode);
- memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
+ memcpy(dst, src, inode_len);
ret = 0;
err:
- brelse(iloc.bh);
return ret;
}
@@ -881,76 +930,244 @@ err:
*/
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
- ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_map_blocks map;
+ struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
+ struct ext4_fc_snap_stats *stats =
+ &EXT4_SB(inode->i_sb)->s_fc_snap_stats;
struct ext4_fc_add_range fc_ext;
struct ext4_fc_del_range lrange;
struct ext4_extent *ex;
- int ret;
+ struct ext4_fc_range *range;
+
+ /* Without a snapshot there is nothing stable to log; count and bail. */
+ if (!snap) {
+ atomic64_inc(&stats->snap_fail_no_snap);
+ return -ECANCELED;
+ }
+
+ /* Emit one TLV per snapshotted range, in list order. */
+ list_for_each_entry(range, &snap->data_list, list) {
+ if (range->tag == EXT4_FC_TAG_DEL_RANGE) {
+ lrange.fc_ino = cpu_to_le32(inode->i_ino);
+ lrange.fc_lblk = cpu_to_le32(range->lblk);
+ lrange.fc_len = cpu_to_le32(range->len);
+ if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
+ sizeof(lrange), (u8 *)&lrange, crc))
+ return -ENOSPC;
+ continue;
+ }
+
+ /* Any record that is not DEL_RANGE is written as an ADD_RANGE extent. */
+ fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
+ ex = (struct ext4_extent *)&fc_ext.fc_ex;
+ ex->ee_block = cpu_to_le32(range->lblk);
+ ex->ee_len = cpu_to_le16(range->len);
+ ext4_ext_store_pblock(ex, range->pblk);
+ if (range->unwritten)
+ ext4_ext_mark_unwritten(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+
+ if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
+ sizeof(fc_ext), (u8 *)&fc_ext, crc))
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+/*
+ * Unlink every struct ext4_fc_range queued on @head and return it to
+ * ext4_fc_range_cachep. @head is an empty list when this returns.
+ */
+static void ext4_fc_free_ranges(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct ext4_fc_range *victim;
+
+		victim = list_first_entry(head, struct ext4_fc_range, list);
+		list_del(&victim->list);
+		kmem_cache_free(ext4_fc_range_cachep, victim);
+	}
+}
+
+/*
+ * Drop the inode's fast commit snapshot, if it has one: free its queued
+ * ranges, free the snapshot itself and clear i_fc_snap. Does nothing when no
+ * snapshot is installed.
+ */
+static void ext4_fc_free_inode_snap(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	if (ei->i_fc_snap) {
+		ext4_fc_free_ranges(&ei->i_fc_snap->data_list);
+		kfree(ei->i_fc_snap);
+		ei->i_fc_snap = NULL;
+	}
+}
+
+/*
+ * Snapshot the inode's pending fast commit block range into @ranges.
+ *
+ * The pending range (i_fc_lblk_start / i_fc_lblk_len) is read and reset to
+ * empty under i_fc_lock, then walked using extent status lookups only. Each
+ * piece is appended to @ranges as a struct ext4_fc_range tagged DEL_RANGE
+ * (hole) or ADD_RANGE (written / unwritten).
+ *
+ * @nr_ranges_total is the number of ranges already collected for this commit
+ * and counts towards EXT4_FC_SNAPSHOT_MAX_RANGES. On success the number of
+ * ranges added here is stored in *@nr_rangesp (if non-NULL).
+ *
+ * Returns 0 on success, -EAGAIN when the extent status lookup cannot describe
+ * a block, -E2BIG when the range limit is hit, or -ENOMEM. On error, ranges
+ * already appended to @ranges are left there for the caller to free and the
+ * first failure reason is recorded through @snap_err.
+ */
+static int ext4_fc_snapshot_inode_data(struct inode *inode,
+ struct list_head *ranges,
+ unsigned int nr_ranges_total,
+ unsigned int *nr_rangesp,
+ int *snap_err)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_fc_snap_stats *stats =
+ &EXT4_SB(inode->i_sb)->s_fc_snap_stats;
+ ext4_lblk_t start_lblk, end_lblk, cur_lblk;
+ unsigned int nr_ranges = 0;
spin_lock(&ei->i_fc_lock);
if (ei->i_fc_lblk_len == 0) {
spin_unlock(&ei->i_fc_lock);
+ if (nr_rangesp)
+ *nr_rangesp = 0;
return 0;
}
- old_blk_size = ei->i_fc_lblk_start;
- new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
+ start_lblk = ei->i_fc_lblk_start;
+ end_lblk = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
ei->i_fc_lblk_len = 0;
spin_unlock(&ei->i_fc_lock);
- cur_lblk_off = old_blk_size;
- ext4_debug("will try writing %d to %d for inode %llu\n",
- cur_lblk_off, new_blk_size, inode->i_ino);
+ cur_lblk = start_lblk;
+ ext4_debug("snapshot data ranges %u-%u for inode %llu\n",
+ start_lblk, end_lblk,
+ (unsigned long long)inode->i_ino);
+
+ while (cur_lblk <= end_lblk) {
+ struct extent_status es;
+ struct ext4_fc_range *range;
+ ext4_lblk_t len;
+ u64 remaining = (u64)end_lblk - cur_lblk + 1;
+
+ if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL)) {
+ atomic64_inc(&stats->snap_fail_es_miss);
+ ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_MISS);
+ return -EAGAIN;
+ }
- while (cur_lblk_off <= new_blk_size) {
- map.m_lblk = cur_lblk_off;
- map.m_len = new_blk_size - cur_lblk_off + 1;
- ret = ext4_map_blocks(NULL, inode, &map,
- EXT4_GET_BLOCKS_IO_SUBMIT |
- EXT4_EX_NOCACHE);
- if (ret < 0)
- return -ECANCELED;
+ if (ext4_es_is_delayed(&es)) {
+ atomic64_inc(&stats->snap_fail_es_delayed);
+ ext4_fc_set_snap_err(snap_err,
+ EXT4_FC_SNAP_ERR_ES_DELAYED);
+ return -EAGAIN;
+ }
- if (map.m_len == 0) {
- cur_lblk_off++;
+ /* Blocks from cur_lblk to the end of this entry, clamped to the range. */
+ len = es.es_len - (cur_lblk - es.es_lblk);
+ if (len > remaining)
+ len = remaining;
+ if (len == 0) {
+ cur_lblk++;
continue;
}
- if (ret == 0) {
- lrange.fc_ino = cpu_to_le32(inode->i_ino);
- lrange.fc_lblk = cpu_to_le32(map.m_lblk);
- lrange.fc_len = cpu_to_le32(map.m_len);
- if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
- sizeof(lrange), (u8 *)&lrange, crc))
- return -ENOSPC;
+ /* The cap covers ranges from earlier inodes plus the ones added here. */
+ if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) {
+ atomic64_inc(&stats->snap_fail_ranges_cap);
+ ext4_fc_set_snap_err(snap_err,
+ EXT4_FC_SNAP_ERR_RANGES_CAP);
+ return -E2BIG;
+ }
+
+ range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS);
+ if (!range) {
+ atomic64_inc(&stats->snap_fail_nomem);
+ ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
+ return -ENOMEM;
+ }
+ nr_ranges++;
+
+ range->lblk = cur_lblk;
+ range->len = len;
+ range->pblk = 0;
+ range->unwritten = false;
+
+ if (ext4_es_is_hole(&es)) {
+ range->tag = EXT4_FC_TAG_DEL_RANGE;
+ } else if (ext4_es_is_written(&es) ||
+ ext4_es_is_unwritten(&es)) {
+ unsigned int max;
+
+ range->tag = EXT4_FC_TAG_ADD_RANGE;
+ range->pblk = ext4_es_pblock(&es) +
+ (cur_lblk - es.es_lblk);
+ range->unwritten = ext4_es_is_unwritten(&es);
+
+ /* Limit the number of blocks recorded in one extent. */
+ max = range->unwritten ? EXT_UNWRITTEN_MAX_LEN :
+ EXT_INIT_MAX_LEN;
+ if (range->len > max)
+ range->len = max;
} else {
- unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
- EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
-
- /* Limit the number of blocks in one extent */
- map.m_len = min(max, map.m_len);
-
- fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
- ex = (struct ext4_extent *)&fc_ext.fc_ex;
- ex->ee_block = cpu_to_le32(map.m_lblk);
- ex->ee_len = cpu_to_le16(map.m_len);
- ext4_ext_store_pblock(ex, map.m_pblk);
- if (map.m_flags & EXT4_MAP_UNWRITTEN)
- ext4_ext_mark_unwritten(ex);
- else
- ext4_ext_mark_initialized(ex);
- if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
- sizeof(fc_ext), (u8 *)&fc_ext, crc))
- return -ENOSPC;
+ /* Not yet on @ranges, so this record must be freed here. */
+ kmem_cache_free(ext4_fc_range_cachep, range);
+ atomic64_inc(&stats->snap_fail_es_other);
+ ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_OTHER);
+ return -EAGAIN;
}
- cur_lblk_off += map.m_len;
+ INIT_LIST_HEAD(&range->list);
+ list_add_tail(&range->list, ranges);
+
+ /* Stop once this range reaches end_lblk; avoids wrapping cur_lblk. */
+ if ((u64)range->len > (u64)end_lblk - cur_lblk)
+ break;
+
+ cur_lblk += range->len;
}
+ if (nr_rangesp)
+ *nr_rangesp = nr_ranges;
return 0;
}
+/*
+ * Build and install a fast commit snapshot for @inode.
+ *
+ * Copies the raw inode bytes obtained through ext4_get_inode_loc_noio() into
+ * a newly allocated struct ext4_fc_inode_snap, collects the inode's data
+ * ranges, and then, under the fast commit lock, replaces any previous
+ * snapshot with the new one.
+ *
+ * @nr_ranges_total is passed through to the range collector; on success the
+ * number of ranges collected for this inode is stored in *@nr_rangesp (if
+ * non-NULL). Returns 0 on success or a negative errno; on failure nothing is
+ * installed and the first failure reason is recorded through @snap_err.
+ */
+static int ext4_fc_snapshot_inode(struct inode *inode,
+ unsigned int nr_ranges_total,
+ unsigned int *nr_rangesp, int *snap_err)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_fc_snap_stats *stats =
+ &EXT4_SB(inode->i_sb)->s_fc_snap_stats;
+ struct ext4_fc_inode_snap *snap;
+ int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
+ struct ext4_iloc iloc;
+ LIST_HEAD(ranges);
+ unsigned int nr_ranges = 0;
+ int ret;
+ int alloc_ctx;
+
+ ret = ext4_get_inode_loc_noio(inode, &iloc);
+ if (ret) {
+ atomic64_inc(&stats->snap_fail_inode_loc);
+ ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_INODE_LOC);
+ return ret;
+ }
+
+ /*
+ * Inline-data inodes are copied in full; otherwise copy the base inode
+ * plus i_extra_isize when the on-disk inode is larger than the base.
+ */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ inode_len = EXT4_INODE_SIZE(inode->i_sb);
+ else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
+ inode_len += ei->i_extra_isize;
+
+ snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS);
+ if (!snap) {
+ atomic64_inc(&stats->snap_fail_nomem);
+ ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
+ brelse(iloc.bh);
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(&snap->data_list);
+ snap->inode_len = inode_len;
+
+ memcpy(snap->inode_buf, (u8 *)ext4_raw_inode(&iloc), inode_len);
+ brelse(iloc.bh);
+
+ /* Ranges are gathered on a local list and only attached on success. */
+ ret = ext4_fc_snapshot_inode_data(inode, &ranges, nr_ranges_total,
+ &nr_ranges, snap_err);
+ if (ret) {
+ kfree(snap);
+ ext4_fc_free_ranges(&ranges);
+ return ret;
+ }
+
+ /* Swap in the new snapshot, dropping any stale one, under the fc lock. */
+ alloc_ctx = ext4_fc_lock(inode->i_sb);
+ ext4_fc_free_inode_snap(inode);
+ ei->i_fc_snap = snap;
+ list_splice_tail_init(&ranges, &snap->data_list);
+ ext4_fc_unlock(inode->i_sb, alloc_ctx);
+
+ atomic64_inc(&stats->snap_inodes);
+ atomic64_add(nr_ranges, &stats->snap_ranges);
+ if (nr_rangesp)
+ *nr_rangesp = nr_ranges;
+ return 0;
+}
/* Flushes data of all the inodes in the commit queue. */
static int ext4_fc_flush_data(journal_t *journal)
@@ -1001,6 +1218,11 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
*/
if (list_empty(&fc_dentry->fcd_dilist))
continue;
+ /*
+ * For EXT4_FC_TAG_CREAT, fcd_dilist is linked on the created
+ * inode's i_fc_dilist list (kept singular), so we can recover the
+ * inode through it.
+ */
ei = list_first_entry(&fc_dentry->fcd_dilist,
struct ext4_inode_info, i_fc_dilist);
inode = &ei->vfs_inode;
@@ -1025,17 +1247,114 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
return 0;
}
-static int ext4_fc_perform_commit(journal_t *journal)
+static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
+ struct inode ***inodesp,
+ unsigned int *nr_inodesp);
+
+/*
+ * Snapshot every inode that takes part in the current fast commit.
+ *
+ * Phase 1 (under the fast commit lock): fill @inodes, which has room for
+ * @inodes_size entries, with the inodes on s_fc_q[FC_Q_MAIN] plus the inodes
+ * referenced by EXT4_FC_TAG_CREAT dentry updates that are not on that queue.
+ * Phase 2 (lock dropped): snapshot each collected inode in turn.
+ *
+ * Returns 0 on success, -E2BIG if more inodes are found than @inodes can
+ * hold, or the error from the first inode that fails to snapshot. When
+ * phase 2 runs, *@nr_inodesp receives the number of inodes snapshotted
+ * successfully and *@nr_rangesp the total ranges collected; neither is
+ * written if phase 1 fails. The first failure reason goes to @snap_err.
+ */
+static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
+ unsigned int inodes_size,
+ unsigned int *nr_inodesp,
+ unsigned int *nr_rangesp,
+ int *snap_err)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *iter;
+ struct ext4_fc_dentry_update *fc_dentry;
+ unsigned int i = 0;
+ unsigned int idx;
+ unsigned int nr_ranges = 0;
+ int ret = 0;
+ int alloc_ctx;
+
+ alloc_ctx = ext4_fc_lock(sb);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ if (i >= inodes_size) {
+ atomic64_inc(&sbi->s_fc_snap_stats.snap_fail_inodes_cap);
+ ext4_fc_set_snap_err(snap_err,
+ EXT4_FC_SNAP_ERR_INODES_CAP);
+ ret = -E2BIG;
+ goto unlock;
+ }
+ inodes[i++] = &iter->vfs_inode;
+ }
+
+ list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
+ struct ext4_inode_info *ei;
+ struct inode *inode;
+
+ if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
+ continue;
+ if (list_empty(&fc_dentry->fcd_dilist))
+ continue;
+
+ /* See the comment in ext4_fc_commit_dentry_updates(). */
+ ei = list_first_entry(&fc_dentry->fcd_dilist,
+ struct ext4_inode_info, i_fc_dilist);
+ inode = &ei->vfs_inode;
+ /* Already collected from s_fc_q[FC_Q_MAIN] by the loop above. */
+ if (!list_empty(&ei->i_fc_list))
+ continue;
+
+ if (i >= inodes_size) {
+ atomic64_inc(&sbi->s_fc_snap_stats.snap_fail_inodes_cap);
+ ext4_fc_set_snap_err(snap_err,
+ EXT4_FC_SNAP_ERR_INODES_CAP);
+ ret = -E2BIG;
+ goto unlock;
+ }
+ /*
+ * Create-only inodes may only be referenced via fcd_dilist and
+ * not appear on s_fc_q[MAIN]. They may hit the last iput while
+ * we are snapshotting, but inode eviction calls ext4_fc_del(),
+ * which waits for FC_COMMITTING to clear. Mark them FC_COMMITTING
+ * so the inode stays pinned and the snapshot stays valid until
+ * ext4_fc_cleanup().
+ */
+ ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+ inodes[i++] = inode;
+ }
+unlock:
+ ext4_fc_unlock(sb, alloc_ctx);
+
+ if (ret)
+ return ret;
+
+ /*
+ * Snapshot outside the fast commit lock: ext4_fc_snapshot_inode() takes
+ * that lock itself when installing each snapshot. nr_ranges is the
+ * running total that is checked against the per-commit range limit.
+ */
+ for (idx = 0; idx < i; idx++) {
+ unsigned int inode_ranges = 0;
+
+ ret = ext4_fc_snapshot_inode(inodes[idx], nr_ranges,
+ &inode_ranges, snap_err);
+ if (ret)
+ break;
+ nr_ranges += inode_ranges;
+ }
+
+ /* idx is the count of inodes that were snapshotted successfully. */
+ if (nr_inodesp)
+ *nr_inodesp = idx;
+ if (nr_rangesp)
+ *nr_rangesp = nr_ranges;
+ return ret;
+}
+
+static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats;
struct ext4_inode_info *iter;
struct ext4_fc_head head;
struct inode *inode;
+ struct inode **inodes;
+ unsigned int inodes_size;
+ unsigned int snap_inodes = 0;
+ unsigned int snap_ranges = 0;
+ int snap_err = EXT4_FC_SNAP_ERR_NONE;
struct blk_plug plug;
int ret = 0;
u32 crc = 0;
int alloc_ctx;
+ ktime_t lock_start;
+ u64 locked_ns;
/*
* Step 1: Mark all inodes on s_fc_q[MAIN] with
@@ -1061,11 +1380,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_FLUSHING_DATA);
-#if (BITS_PER_LONG < 64)
- wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
-#else
- wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
-#endif
+ ext4_fc_wake_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_FLUSHING_DATA);
}
/*
@@ -1083,13 +1399,23 @@ static int ext4_fc_perform_commit(journal_t *journal)
if (ret)
return ret;
+ ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
+ if (ret) {
+ if (ret == -E2BIG)
+ atomic64_inc(&snap_stats->snap_fail_inodes_cap);
+ else if (ret == -ENOMEM)
+ atomic64_inc(&snap_stats->snap_fail_nomem);
+ return ret;
+ }
/* Step 4: Mark all inodes as being committed. */
jbd2_journal_lock_updates(journal);
+ lock_start = ktime_get();
/*
* The journal is now locked. No more handles can start and all the
- * previous handles are now drained. We now mark the inodes on the
- * commit queue as being committed.
+ * previous handles are now drained. Snapshotting happens in this
+ * window so log writing can consume only stable snapshots without
+ * doing logical-to-physical mapping.
*/
alloc_ctx = ext4_fc_lock(sb);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
@@ -1097,7 +1423,22 @@ static int ext4_fc_perform_commit(journal_t *journal)
EXT4_STATE_FC_COMMITTING);
}
ext4_fc_unlock(sb, alloc_ctx);
+
+ ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
+ &snap_inodes, &snap_ranges, &snap_err);
jbd2_journal_unlock_updates(journal);
+ locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
+ atomic64_add(locked_ns, &snap_stats->lock_updates_ns_total);
+ atomic64_inc(&snap_stats->lock_updates_samples);
+ ext4_fc_snap_stats_update_max(&snap_stats->lock_updates_ns_max,
+ locked_ns);
+ if (trace_ext4_fc_lock_updates_enabled())
+ trace_call__ext4_fc_lock_updates(sb, commit_tid, locked_ns,
+ snap_inodes, snap_ranges,
+ ret, snap_err);
+ kvfree(inodes);
+ if (ret)
+ return ret;
/*
* Step 5: If file system device is different from journal device,
@@ -1151,6 +1492,64 @@ out:
return ret;
}
+/*
+ * Count the inodes a fast-commit snapshot has to cover: every inode on the
+ * main inode queue, plus the inode behind each queued EXT4_FC_TAG_CREAT
+ * dentry update when that inode is not linked on any i_fc_list. The walk is
+ * done under ext4_fc_lock().
+ */
+static unsigned int ext4_fc_count_snapshot_inodes(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_fc_dentry_update *update;
+	struct ext4_inode_info *ei;
+	unsigned int total = 0;
+	int alloc_ctx = ext4_fc_lock(sb);
+
+	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list)
+		total++;
+
+	list_for_each_entry(update, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
+		/* Only creations with a non-empty fcd_dilist qualify. */
+		if (update->fcd_op != EXT4_FC_TAG_CREAT ||
+		    list_empty(&update->fcd_dilist))
+			continue;
+
+		/* See the comment in ext4_fc_commit_dentry_updates(). */
+		ei = list_first_entry(&update->fcd_dilist,
+				      struct ext4_inode_info, i_fc_dilist);
+		if (list_empty(&ei->i_fc_list))
+			total++;
+	}
+	ext4_fc_unlock(sb, alloc_ctx);
+
+	return total;
+}
+
+/*
+ * Allocate one zeroed slot per inode counted for the snapshot. Returns 0
+ * (NULL/0 outputs if nothing is queued), -E2BIG when the count exceeds
+ * EXT4_FC_SNAPSHOT_MAX_INODES, or -ENOMEM. The outputs are NULL/0 on error.
+ */
+static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
+					 struct inode ***inodesp,
+					 unsigned int *nr_inodesp)
+{
+	unsigned int want = ext4_fc_count_snapshot_inodes(sb);
+	struct inode **array;
+
+	*inodesp = NULL;
+	*nr_inodesp = 0;
+
+	if (want > EXT4_FC_SNAPSHOT_MAX_INODES)
+		return -E2BIG;
+	if (!want)
+		return 0;
+
+	array = kvcalloc(want, sizeof(*array), GFP_NOFS);
+	if (!array)
+		return -ENOMEM;
+
+	*inodesp = array;
+	*nr_inodesp = want;
+	return 0;
+}
+
static void ext4_fc_update_stats(struct super_block *sb, int status,
u64 commit_time, int nblks, tid_t commit_tid)
{
@@ -1241,9 +1640,12 @@ restart_fc:
journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
set_task_ioprio(current, journal_ioprio);
fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
- ret = ext4_fc_perform_commit(journal);
+ ret = ext4_fc_perform_commit(journal, commit_tid);
if (ret < 0) {
- status = EXT4_FC_STATUS_FAILED;
+ if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED)
+ status = EXT4_FC_STATUS_INELIGIBLE;
+ else
+ status = EXT4_FC_STATUS_FAILED;
goto fallback;
}
nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
@@ -1290,45 +1692,66 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
alloc_ctx = ext4_fc_lock(sb);
while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
+ bool requeue;
+
ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
struct ext4_inode_info,
i_fc_list);
list_del_init(&ei->i_fc_list);
+ ext4_fc_free_inode_snap(&ei->vfs_inode);
+ spin_lock(&ei->i_fc_lock);
+ if (full)
+ requeue = !tid_geq(tid, ei->i_sync_tid);
+ else
+ requeue = ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_FC_REQUEUE);
+ if (!requeue)
+ ext4_fc_reset_inode(&ei->vfs_inode);
+ ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_REQUEUE);
ext4_clear_inode_state(&ei->vfs_inode,
EXT4_STATE_FC_COMMITTING);
- if (tid_geq(tid, ei->i_sync_tid)) {
- ext4_fc_reset_inode(&ei->vfs_inode);
- } else if (full) {
- /*
- * We are called after a full commit, inode has been
- * modified while the commit was running. Re-enqueue
- * the inode into STAGING, which will then be splice
- * back into MAIN. This cannot happen during
- * fastcommit because the journal is locked all the
- * time in that case (and tid doesn't increase so
- * tid check above isn't reliable).
- */
+ spin_unlock(&ei->i_fc_lock);
+ if (requeue)
list_add_tail(&ei->i_fc_list,
&sbi->s_fc_q[FC_Q_STAGING]);
- }
/*
* Make sure clearing of EXT4_STATE_FC_COMMITTING is
* visible before we send the wakeup. Pairs with implicit
- * barrier in prepare_to_wait() in ext4_fc_track_inode().
+ * barrier in prepare_to_wait() in ext4_fc_del().
*/
smp_mb();
-#if (BITS_PER_LONG < 64)
- wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
-#else
- wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
-#endif
+ ext4_fc_wake_inode_state(&ei->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
}
while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
- struct ext4_fc_dentry_update,
- fcd_list);
+ struct ext4_fc_dentry_update,
+ fcd_list);
list_del_init(&fc_dentry->fcd_list);
+ if (fc_dentry->fcd_op == EXT4_FC_TAG_CREAT &&
+ !list_empty(&fc_dentry->fcd_dilist)) {
+ /* See the comment in ext4_fc_commit_dentry_updates(). */
+ ei = list_first_entry(&fc_dentry->fcd_dilist,
+ struct ext4_inode_info,
+ i_fc_dilist);
+ ext4_fc_free_inode_snap(&ei->vfs_inode);
+ spin_lock(&ei->i_fc_lock);
+ ext4_clear_inode_state(&ei->vfs_inode,
+ EXT4_STATE_FC_REQUEUE);
+ ext4_clear_inode_state(&ei->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ spin_unlock(&ei->i_fc_lock);
+ /*
+ * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+ * visible before we send the wakeup. Pairs with
+ * implicit barrier in prepare_to_wait() in
+ * ext4_fc_del().
+ */
+ smp_mb();
+ ext4_fc_wake_inode_state(&ei->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ }
list_del_init(&fc_dentry->fcd_dilist);
release_dentry_name_snapshot(&fc_dentry->fcd_name);
@@ -2280,11 +2703,26 @@ int ext4_fc_info_show(struct seq_file *seq, void *v)
{
struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
struct ext4_fc_stats *stats = &sbi->s_fc_stats;
+ struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats;
+ u64 lock_avg_ns = 0;
+ u64 lock_updates_samples;
+ u64 lock_updates_ns_total;
+ u64 lock_updates_ns_max;
int i;
if (v != SEQ_START_TOKEN)
return 0;
+ lock_updates_samples =
+ atomic64_read(&snap_stats->lock_updates_samples);
+ lock_updates_ns_total =
+ atomic64_read(&snap_stats->lock_updates_ns_total);
+ lock_updates_ns_max =
+ atomic64_read(&snap_stats->lock_updates_ns_max);
+ if (lock_updates_samples)
+ lock_avg_ns = div64_u64(lock_updates_ns_total,
+ lock_updates_samples);
+
seq_printf(seq,
"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
stats->fc_num_commits, stats->fc_ineligible_commits,
@@ -2295,6 +2733,23 @@ int ext4_fc_info_show(struct seq_file *seq, void *v)
seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
stats->fc_ineligible_reason_count[i]);
+ seq_printf(seq,
+ "Snapshot stats:\n%llu inodes\n%llu ranges\n%lluus lock_updates_avg\n%lluus lock_updates_max\n",
+ atomic64_read(&snap_stats->snap_inodes),
+ atomic64_read(&snap_stats->snap_ranges),
+ div_u64(lock_avg_ns, 1000),
+ div_u64(lock_updates_ns_max, 1000));
+ seq_printf(seq,
+ "Snapshot failures:\n%llu es_miss\n%llu es_delayed\n%llu es_other\n%llu inodes_cap\n%llu ranges_cap\n%llu nomem\n%llu inode_loc\n%llu no_snap\n",
+ atomic64_read(&snap_stats->snap_fail_es_miss),
+ atomic64_read(&snap_stats->snap_fail_es_delayed),
+ atomic64_read(&snap_stats->snap_fail_es_other),
+ atomic64_read(&snap_stats->snap_fail_inodes_cap),
+ atomic64_read(&snap_stats->snap_fail_ranges_cap),
+ atomic64_read(&snap_stats->snap_fail_nomem),
+ atomic64_read(&snap_stats->snap_fail_inode_loc),
+ atomic64_read(&snap_stats->snap_fail_no_snap));
+
return 0;
}
@@ -2303,13 +2758,20 @@ int __init ext4_fc_init_dentry_cache(void)
ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
SLAB_RECLAIM_ACCOUNT);
- if (ext4_fc_dentry_cachep == NULL)
+ if (!ext4_fc_dentry_cachep)
return -ENOMEM;
+ ext4_fc_range_cachep = KMEM_CACHE(ext4_fc_range, SLAB_RECLAIM_ACCOUNT);
+ if (!ext4_fc_range_cachep) {
+ kmem_cache_destroy(ext4_fc_dentry_cachep);
+ return -ENOMEM;
+ }
+
return 0;
}
void ext4_fc_destroy_dentry_cache(void)
{
+ kmem_cache_destroy(ext4_fc_range_cachep); /* created in ext4_fc_init_dentry_cache() */
kmem_cache_destroy(ext4_fc_dentry_cachep);
}
diff --git a/fs/ext4/hash-test.c b/fs/ext4/hash-test.c
new file mode 100644
index 000000000000..49b0d874c833
--- /dev/null
+++ b/fs/ext4/hash-test.c
@@ -0,0 +1,567 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit tests for ext4 directory hash computation.
+ */
+
+#include <kunit/test.h>
+#include <kunit/resource.h>
+#include <linux/fs.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/unicode.h>
+#include "ext4.h"
+
+/*
+ * Reset @dir and @sb to an all-zero state, point the inode at the
+ * superblock and label the superblock "kunit-ext4".
+ */
+static void ext4_hash_init_fake_dir(struct inode *dir, struct super_block *sb)
+{
+	memset(dir, 0, sizeof(*dir));
+	memset(sb, 0, sizeof(*sb));
+
+	strscpy(sb->s_id, "kunit-ext4", sizeof(sb->s_id));
+	dir->i_sb = sb;
+}
+
+/*
+ * Same as ext4_hash_init_fake_dir(), but additionally zero @sbi and attach
+ * it to @sb as the filesystem-private info, with the back pointer set.
+ */
+static void ext4_hash_init_fake_dir_with_sbi(struct inode *dir,
+					     struct super_block *sb,
+					     struct ext4_sb_info *sbi)
+{
+	memset(sbi, 0, sizeof(*sbi));
+	sbi->s_sb = sb;
+
+	/* The helper zeroes @sb, so s_fs_info has to be set afterwards. */
+	ext4_hash_init_fake_dir(dir, sb);
+	sb->s_fs_info = sbi;
+}
+
+#ifdef CONFIG_FS_ENCRYPTION
+/*
+ * fscrypt operations for the fake superblock. Only inode_info_offs is set:
+ * the offset of i_crypt_info relative to the vfs_inode member embedded in
+ * struct ext4_inode_info.
+ */
+static const struct fscrypt_operations ext4_hash_test_cryptops = {
+ .inode_info_offs =
+ (int)offsetof(struct ext4_inode_info, i_crypt_info) -
+ (int)offsetof(struct ext4_inode_info, vfs_inode),
+};
+#endif
+
+/*
+ * Build a fake ext4 directory out of caller-provided storage: zero @ei, @sb
+ * and @sbi, cross-link the superblock and its sb_info, and make the embedded
+ * VFS inode a directory on @sb. With CONFIG_FS_ENCRYPTION, the test fscrypt
+ * operations are installed on @sb as well.
+ */
+static void ext4_hash_init_fake_ext4_dir(struct ext4_inode_info *ei,
+					 struct super_block *sb,
+					 struct ext4_sb_info *sbi)
+{
+	memset(ei, 0, sizeof(*ei));
+	memset(sbi, 0, sizeof(*sbi));
+	memset(sb, 0, sizeof(*sb));
+
+	sbi->s_sb = sb;
+	sb->s_fs_info = sbi;
+	strscpy(sb->s_id, "kunit-ext4", sizeof(sb->s_id));
+
+	ei->vfs_inode.i_mode = S_IFDIR;
+	ei->vfs_inode.i_sb = sb;
+
+#ifdef CONFIG_FS_ENCRYPTION
+	fscrypt_set_ops(sb, &ext4_hash_test_cryptops);
+#endif
+}
+
+/*
+ * One known-answer vector for ext4fs_dirhash(): the hash version and input to
+ * feed in, an optional seed, and the hash values expected back.
+ */
+struct ext4_dirhash_test_case {
+ const char *name; /* label printed in assertion messages */
+ u32 hash_version;
+ const char *input;
+ int len; /* number of bytes of input to hash */
+ u32 seed[4]; /* passed as hinfo.seed only when use_seed is set */
+ bool use_seed;
+ u32 expected_hash;
+ u32 expected_minor_hash;
+};
+
+/*
+ * Known-answer vectors consumed by test_ext4fs_dirhash_vectors(). Each entry
+ * is hashed on a fake directory and both the major and the minor hash are
+ * compared with the values recorded here.
+ */
+static const struct ext4_dirhash_test_case ext4_dirhash_test_cases[] = {
+ /* 7-bit ASCII: each signed/unsigned pair expects identical values. */
+ {
+ .name = "legacy_abc",
+ .hash_version = DX_HASH_LEGACY,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0x75afd992,
+ .expected_minor_hash = 0x00000000,
+ },
+ {
+ .name = "legacy_unsigned_abc",
+ .hash_version = DX_HASH_LEGACY_UNSIGNED,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0x75afd992,
+ .expected_minor_hash = 0x00000000,
+ },
+ {
+ .name = "half_md4_abc",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0xd196a868,
+ .expected_minor_hash = 0xc420eb28,
+ },
+ {
+ .name = "half_md4_unsigned_abc",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0xd196a868,
+ .expected_minor_hash = 0xc420eb28,
+ },
+ {
+ .name = "tea_abc",
+ .hash_version = DX_HASH_TEA,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0xb1435ec4,
+ .expected_minor_hash = 0x3f7eaa0e,
+ },
+ {
+ .name = "tea_unsigned_abc",
+ .hash_version = DX_HASH_TEA_UNSIGNED,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0xb1435ec4,
+ .expected_minor_hash = 0x3f7eaa0e,
+ },
+ /* Zero-length input. */
+ {
+ .name = "empty_half_md4",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "",
+ .len = 0,
+ .use_seed = false,
+ .expected_hash = 0xefcdab88,
+ .expected_minor_hash = 0x98badcfe,
+ },
+ /* Lengths around 32 bytes, the amount half-MD4 consumes per round. */
+ {
+ .name = "half_md4_31bytes",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "1234567890123456789012345678901",
+ .len = 31,
+ .use_seed = false,
+ .expected_hash = 0xc4db1f78,
+ .expected_minor_hash = 0xea23921b,
+ },
+ {
+ .name = "half_md4_32bytes",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "12345678901234567890123456789012",
+ .len = 32,
+ .use_seed = false,
+ .expected_hash = 0xfa6cc63e,
+ .expected_minor_hash = 0x2f77bd1c,
+ },
+ {
+ .name = "half_md4_33bytes",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "123456789012345678901234567890123",
+ .len = 33,
+ .use_seed = false,
+ .expected_hash = 0xdc0c2dec,
+ .expected_minor_hash = 0x5ca23365,
+ },
+ {
+ .name = "half_md4_unsigned_31bytes",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "1234567890123456789012345678901",
+ .len = 31,
+ .use_seed = false,
+ .expected_hash = 0xc4db1f78,
+ .expected_minor_hash = 0xea23921b,
+ },
+ {
+ .name = "half_md4_unsigned_32bytes",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "12345678901234567890123456789012",
+ .len = 32,
+ .use_seed = false,
+ .expected_hash = 0xfa6cc63e,
+ .expected_minor_hash = 0x2f77bd1c,
+ },
+ {
+ .name = "half_md4_unsigned_33bytes",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "123456789012345678901234567890123",
+ .len = 33,
+ .use_seed = false,
+ .expected_hash = 0xdc0c2dec,
+ .expected_minor_hash = 0x5ca23365,
+ },
+ /* Lengths around 16 bytes, the amount TEA consumes per round. */
+ {
+ .name = "tea_15bytes",
+ .hash_version = DX_HASH_TEA,
+ .input = "123456789abcdef",
+ .len = 15,
+ .use_seed = false,
+ .expected_hash = 0xa562903a,
+ .expected_minor_hash = 0x6174a00f,
+ },
+ {
+ .name = "tea_16bytes",
+ .hash_version = DX_HASH_TEA,
+ .input = "1234567890abcdef",
+ .len = 16,
+ .use_seed = false,
+ .expected_hash = 0x8449f258,
+ .expected_minor_hash = 0x49a16d46,
+ },
+ {
+ .name = "tea_17bytes",
+ .hash_version = DX_HASH_TEA,
+ .input = "123456789abcdefgh",
+ .len = 17,
+ .use_seed = false,
+ .expected_hash = 0xf32ec10c,
+ .expected_minor_hash = 0x58ceae61,
+ },
+ /* The only vector that supplies an explicit seed. */
+ {
+ .name = "half_md4_seeded",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "same-name",
+ .len = 9,
+ .seed = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 },
+ .use_seed = true,
+ .expected_hash = 0x8aebf604,
+ .expected_minor_hash = 0x66ce48fe,
+ },
+ /* Bytes with the top bit set: signed and unsigned variants differ. */
+ {
+ .name = "half_md4_non_ascii_signed",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "\x80\x81\x82\x83\x84",
+ .len = 5,
+ .use_seed = false,
+ .expected_hash = 0x8bab0498,
+ .expected_minor_hash = 0xc326632d,
+ },
+ {
+ .name = "half_md4_non_ascii_unsigned",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "\x80\x81\x82\x83\x84",
+ .len = 5,
+ .use_seed = false,
+ .expected_hash = 0xbc48596e,
+ .expected_minor_hash = 0xde0fad41,
+ },
+ {
+ .name = "tea_non_ascii_signed",
+ .hash_version = DX_HASH_TEA,
+ .input = "\x80\x81\x82\x83\x84",
+ .len = 5,
+ .use_seed = false,
+ .expected_hash = 0x21e3a154,
+ .expected_minor_hash = 0x90112c3d,
+ },
+ {
+ .name = "tea_non_ascii_unsigned",
+ .hash_version = DX_HASH_TEA_UNSIGNED,
+ .input = "\x80\x81\x82\x83\x84",
+ .len = 5,
+ .use_seed = false,
+ .expected_hash = 0x9b648616,
+ .expected_minor_hash = 0x011dd507,
+ },
+};
+
+/*
+ * Feed every entry of ext4_dirhash_test_cases[] to ext4fs_dirhash() on a fake
+ * directory. A non-zero return value is a hard assertion; hash and minor hash
+ * mismatches are reported as expectations, tagged with the case name.
+ */
+static void test_ext4fs_dirhash_vectors(struct kunit *test)
+{
+	struct super_block *sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+	struct inode *dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+	const struct ext4_dirhash_test_case *tc;
+
+	KUNIT_ASSERT_NOT_NULL(test, sb);
+	KUNIT_ASSERT_NOT_NULL(test, dir);
+	ext4_hash_init_fake_dir(dir, sb);
+
+	for (tc = ext4_dirhash_test_cases;
+	     tc < ext4_dirhash_test_cases + ARRAY_SIZE(ext4_dirhash_test_cases);
+	     tc++) {
+		struct dx_hash_info hinfo = {
+			.hash_version = tc->hash_version,
+		};
+		int err;
+
+		/* Table entries are const; hinfo.seed is set via a cast. */
+		if (tc->use_seed)
+			hinfo.seed = (u32 *)tc->seed;
+
+		err = ext4fs_dirhash(dir, tc->input, tc->len, &hinfo);
+
+		KUNIT_ASSERT_EQ_MSG(test, err, 0, "case=%s", tc->name);
+		KUNIT_EXPECT_EQ_MSG(test, hinfo.hash, tc->expected_hash,
+				    "case=%s", tc->name);
+		KUNIT_EXPECT_EQ_MSG(test, hinfo.minor_hash,
+				    tc->expected_minor_hash,
+				    "case=%s", tc->name);
+	}
+}
+
+/*
+ * The same name hashed with DX_HASH_HALF_MD4 must not produce the same
+ * (hash, minor_hash) pair with and without an explicit seed.
+ */
+static void test_ext4fs_dirhash_seed_changes_result(struct kunit *test)
+{
+	static const char name[] = "same-name";
+	u32 seed[4] = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 };
+	struct dx_hash_info unseeded = { .hash_version = DX_HASH_HALF_MD4 };
+	struct dx_hash_info seeded = {
+		.hash_version = DX_HASH_HALF_MD4,
+		.seed = seed,
+	};
+	struct super_block *sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+	struct inode *dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+	int err_unseeded, err_seeded;
+	bool differ;
+
+	KUNIT_ASSERT_NOT_NULL(test, sb);
+	KUNIT_ASSERT_NOT_NULL(test, dir);
+	ext4_hash_init_fake_dir(dir, sb);
+
+	err_unseeded = ext4fs_dirhash(dir, name, sizeof(name) - 1, &unseeded);
+	err_seeded = ext4fs_dirhash(dir, name, sizeof(name) - 1, &seeded);
+	KUNIT_ASSERT_EQ(test, err_unseeded, 0);
+	KUNIT_ASSERT_EQ(test, err_seeded, 0);
+
+	/* A difference in either half of the result is sufficient. */
+	differ = unseeded.hash != seeded.hash ||
+		 unseeded.minor_hash != seeded.minor_hash;
+	KUNIT_EXPECT_TRUE(test, differ);
+}
+
+/*
+ * Hashing with a hash_version past DX_HASH_LAST must return -EINVAL and leave
+ * both hash and minor_hash at zero, overwriting the sentinels set up here.
+ */
+static void test_ext4fs_dirhash_invalid_version_returns_einval(struct kunit *test)
+{
+	struct super_block *sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+	struct inode *dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+	struct ext4_sb_info *sbi;
+	struct dx_hash_info hinfo = {
+		.hash_version = DX_HASH_LAST + 1,
+		/* Non-zero sentinels, so a zeroed result is observable. */
+		.hash = 0xdeadbeef,
+		.minor_hash = 0xcafebabe,
+	};
+	int err;
+
+	sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, sb);
+	KUNIT_ASSERT_NOT_NULL(test, dir);
+	KUNIT_ASSERT_NOT_NULL(test, sbi);
+	ext4_hash_init_fake_dir_with_sbi(dir, sb, sbi);
+
+	err = ext4fs_dirhash(dir, "abc", 3, &hinfo);
+
+	KUNIT_EXPECT_EQ(test, err, -EINVAL);
+	KUNIT_EXPECT_EQ(test, hinfo.hash, 0);
+	KUNIT_EXPECT_EQ(test, hinfo.minor_hash, 0);
+}
+
+/*
+ * DX_HASH_SIPHASH on a zero-initialised fake ext4 directory, for which no
+ * encryption key has been set up, is expected to fail with -EINVAL.
+ */
+static void test_ext4fs_dirhash_siphash_without_key_returns_einval(struct kunit *test)
+{
+	static const char name[] = "name";
+	struct dx_hash_info hinfo = { .hash_version = DX_HASH_SIPHASH };
+	struct super_block *sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+	struct ext4_inode_info *ei;
+	struct ext4_sb_info *sbi;
+	int err;
+
+	ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL);
+	sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, sb);
+	KUNIT_ASSERT_NOT_NULL(test, ei);
+	KUNIT_ASSERT_NOT_NULL(test, sbi);
+	ext4_hash_init_fake_ext4_dir(ei, sb, sbi);
+
+	err = ext4fs_dirhash(&ei->vfs_inode, name, strlen(name), &hinfo);
+
+	KUNIT_EXPECT_EQ(test, err, -EINVAL);
+}
+
+/*
+ * For input containing bytes >= 0x80, the signed and unsigned flavour of
+ * each hash must disagree: LEGACY is compared on the major hash alone,
+ * HALF_MD4 and TEA on the (hash, minor_hash) pair.
+ */
+static void test_ext4fs_dirhash_signed_unsigned_differ_on_nonascii(struct kunit *test)
+{
+	/*
+	 * 'A' is written as \101 because a literal A after \xfe would be
+	 * parsed as one more hex digit of that escape.
+	 */
+	static const char input[] = "\x80\xff\x81\xfe\101bc";
+	const int len = sizeof(input) - 1;
+	struct dx_hash_info legacy_s = { .hash_version = DX_HASH_LEGACY };
+	struct dx_hash_info legacy_u = {
+		.hash_version = DX_HASH_LEGACY_UNSIGNED,
+	};
+	struct dx_hash_info md4_s = { .hash_version = DX_HASH_HALF_MD4 };
+	struct dx_hash_info md4_u = {
+		.hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+	};
+	struct dx_hash_info tea_s = { .hash_version = DX_HASH_TEA };
+	struct dx_hash_info tea_u = { .hash_version = DX_HASH_TEA_UNSIGNED };
+	struct super_block *sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+	struct inode *dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+	bool differ;
+	int err;
+
+	KUNIT_ASSERT_NOT_NULL(test, sb);
+	KUNIT_ASSERT_NOT_NULL(test, dir);
+	ext4_hash_init_fake_dir(dir, sb);
+
+	/* LEGACY: the major hash alone has to differ. */
+	err = ext4fs_dirhash(dir, input, len, &legacy_s);
+	KUNIT_ASSERT_EQ(test, err, 0);
+	err = ext4fs_dirhash(dir, input, len, &legacy_u);
+	KUNIT_ASSERT_EQ(test, err, 0);
+	KUNIT_EXPECT_NE(test, legacy_s.hash, legacy_u.hash);
+
+	/* HALF_MD4: either half of the result may carry the difference. */
+	err = ext4fs_dirhash(dir, input, len, &md4_s);
+	KUNIT_ASSERT_EQ(test, err, 0);
+	err = ext4fs_dirhash(dir, input, len, &md4_u);
+	KUNIT_ASSERT_EQ(test, err, 0);
+	differ = md4_s.hash != md4_u.hash ||
+		 md4_s.minor_hash != md4_u.minor_hash;
+	KUNIT_EXPECT_TRUE(test, differ);
+
+	/* TEA: same rule as HALF_MD4. */
+	err = ext4fs_dirhash(dir, input, len, &tea_s);
+	KUNIT_ASSERT_EQ(test, err, 0);
+	err = ext4fs_dirhash(dir, input, len, &tea_u);
+	KUNIT_ASSERT_EQ(test, err, 0);
+	differ = tea_s.hash != tea_u.hash ||
+		 tea_s.minor_hash != tea_u.minor_hash;
+	KUNIT_EXPECT_TRUE(test, differ);
+}
+
+#if IS_ENABLED(CONFIG_UNICODE)
+/* Wrap utf8_unload() so it can be queued via kunit_add_action_or_reset(). */
+KUNIT_DEFINE_ACTION_WRAPPER(utf8_unload_action, utf8_unload,
+ struct unicode_map *);
+/*
+ * In a casefolded directory, names differing only in letter case ("Alpha"
+ * and "aLPHa") must yield identical hash and minor_hash values. Skipped when
+ * the UTF8_LATEST unicode map cannot be loaded.
+ */
+static void test_ext4fs_dirhash_casefolded_names_hash_consistently(struct kunit *test)
+{
+	struct dx_hash_info first = { .hash_version = DX_HASH_HALF_MD4 };
+	struct dx_hash_info second = { .hash_version = DX_HASH_HALF_MD4 };
+	struct super_block *sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+	struct ext4_inode_info *ei;
+	struct ext4_sb_info *sbi;
+	struct unicode_map *um;
+	struct inode *dir;
+	int err, err_first, err_second;
+
+	ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL);
+	sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, sb);
+	KUNIT_ASSERT_NOT_NULL(test, ei);
+	KUNIT_ASSERT_NOT_NULL(test, sbi);
+
+	um = utf8_load(UTF8_LATEST);
+	if (IS_ERR(um)) {
+		kunit_skip(test, "utf8_load(UTF8_LATEST) failed: %pe", um);
+		return;
+	}
+
+	/* Have KUnit unload the map when the test finishes. */
+	err = kunit_add_action_or_reset(test, utf8_unload_action, um);
+	KUNIT_ASSERT_EQ(test, err, 0);
+
+	ext4_hash_init_fake_ext4_dir(ei, sb, sbi);
+	dir = &ei->vfs_inode;
+	dir->i_flags |= S_CASEFOLD;
+	sb->s_encoding = um;
+
+	KUNIT_ASSERT_TRUE(test, IS_CASEFOLDED(dir));
+
+	err_first = ext4fs_dirhash(dir, "Alpha", 5, &first);
+	err_second = ext4fs_dirhash(dir, "aLPHa", 5, &second);
+
+	KUNIT_ASSERT_EQ(test, err_first, 0);
+	KUNIT_ASSERT_EQ(test, err_second, 0);
+	KUNIT_EXPECT_EQ(test, first.hash, second.hash);
+	KUNIT_EXPECT_EQ(test, first.minor_hash, second.minor_hash);
+}
+
+/*
+ * A name that is not valid UTF-8 ("\xc3\x28") hashed in a casefolded
+ * directory must produce the same hash and minor_hash as in a plain
+ * directory. Skipped when the UTF8_LATEST unicode map cannot be loaded.
+ */
+static void test_ext4fs_dirhash_casefold_fallback(struct kunit *test)
+{
+	static const char invalid_utf8[] = "\xc3\x28";
+	const int len = sizeof(invalid_utf8) - 1;
+	struct dx_hash_info from_cf = { .hash_version = DX_HASH_HALF_MD4 };
+	struct dx_hash_info from_plain = { .hash_version = DX_HASH_HALF_MD4 };
+	struct super_block *sb_cf, *sb_plain;
+	struct inode *cf_dir, *plain_dir;
+	struct ext4_inode_info *ei;
+	struct ext4_sb_info *sbi;
+	struct unicode_map *um;
+	int err, err_cf, err_plain;
+
+	sb_cf = kunit_kzalloc(test, sizeof(*sb_cf), GFP_KERNEL);
+	sb_plain = kunit_kzalloc(test, sizeof(*sb_plain), GFP_KERNEL);
+	ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL);
+	sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+	plain_dir = kunit_kzalloc(test, sizeof(*plain_dir), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, sb_cf);
+	KUNIT_ASSERT_NOT_NULL(test, sb_plain);
+	KUNIT_ASSERT_NOT_NULL(test, ei);
+	KUNIT_ASSERT_NOT_NULL(test, sbi);
+	KUNIT_ASSERT_NOT_NULL(test, plain_dir);
+
+	um = utf8_load(UTF8_LATEST);
+	if (IS_ERR(um)) {
+		kunit_skip(test, "utf8_load(UTF8_LATEST) failed: %pe", um);
+		return;
+	}
+
+	/* Have KUnit unload the map when the test finishes. */
+	err = kunit_add_action_or_reset(test, utf8_unload_action, um);
+	KUNIT_ASSERT_EQ(test, err, 0);
+
+	/* Casefolded ext4 directory backed by the loaded unicode map. */
+	ext4_hash_init_fake_ext4_dir(ei, sb_cf, sbi);
+	cf_dir = &ei->vfs_inode;
+	cf_dir->i_flags |= S_CASEFOLD;
+	sb_cf->s_encoding = um;
+
+	KUNIT_ASSERT_TRUE(test, IS_CASEFOLDED(cf_dir));
+
+	/* Plain directory on its own superblock, no encoding attached. */
+	ext4_hash_init_fake_dir(plain_dir, sb_plain);
+
+	err_cf = ext4fs_dirhash(cf_dir, invalid_utf8, len, &from_cf);
+	err_plain = ext4fs_dirhash(plain_dir, invalid_utf8, len, &from_plain);
+
+	KUNIT_ASSERT_EQ(test, err_cf, 0);
+	KUNIT_ASSERT_EQ(test, err_plain, 0);
+	KUNIT_EXPECT_EQ(test, from_cf.hash, from_plain.hash);
+	KUNIT_EXPECT_EQ(test, from_cf.minor_hash, from_plain.minor_hash);
+}
+#endif
+
+/*
+ * Test cases of the ext4_hash suite. The two casefold tests are only built
+ * when CONFIG_UNICODE is enabled; the empty entry terminates the array.
+ */
+static struct kunit_case ext4_hash_test_cases[] = {
+ KUNIT_CASE(test_ext4fs_dirhash_vectors),
+ KUNIT_CASE(test_ext4fs_dirhash_seed_changes_result),
+ KUNIT_CASE(test_ext4fs_dirhash_invalid_version_returns_einval),
+ KUNIT_CASE(test_ext4fs_dirhash_siphash_without_key_returns_einval),
+ KUNIT_CASE(test_ext4fs_dirhash_signed_unsigned_differ_on_nonascii),
+#if IS_ENABLED(CONFIG_UNICODE)
+ KUNIT_CASE(test_ext4fs_dirhash_casefolded_names_hash_consistently),
+ KUNIT_CASE(test_ext4fs_dirhash_casefold_fallback),
+#endif
+ {}
+};
+
+/* The "ext4_hash" KUnit suite, made up of ext4_hash_test_cases[]. */
+static struct kunit_suite ext4_hash_test_suite = {
+ .name = "ext4_hash",
+ .test_cases = ext4_hash_test_cases,
+};
+
+/* Register the suite with KUnit. */
+kunit_test_suites(&ext4_hash_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 48483cd015d3..978bd92da0ad 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -9,6 +9,7 @@
#include <linux/unicode.h>
#include <linux/compiler.h>
#include <linux/bitops.h>
+#include <linux/unaligned.h>
#include "ext4.h"
#define DELTA 0x9E3779B9
@@ -141,21 +142,28 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
- val = pad;
if (len > num*4)
len = num * 4;
- for (i = 0; i < len; i++) {
- val = ((int) scp[i]) + (val << 8);
- if ((i % 4) == 3) {
- *buf++ = val;
- val = pad;
- num--;
- }
+
+ while (len >= 4) {
+ val = ((__u32)scp[0] << 24) + ((__u32)scp[1] << 16) + ((__u32)scp[2] << 8) + scp[3];
+ *buf++ = val;
+ scp += 4;
+ len -= 4;
+ num--;
}
+
+ val = pad;
+
+ for (i = 0; i < len; i++)
+ val = scp[i] + (val << 8);
+
if (--num >= 0)
*buf++ = val;
+
while (--num >= 0)
*buf++ = pad;
+
}
static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
@@ -167,21 +175,28 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
- val = pad;
if (len > num*4)
len = num * 4;
- for (i = 0; i < len; i++) {
- val = ((int) ucp[i]) + (val << 8);
- if ((i % 4) == 3) {
- *buf++ = val;
- val = pad;
- num--;
- }
+
+ while (len >= 4) {
+ val = get_unaligned_be32(ucp);
+ *buf++ = val;
+ ucp += 4;
+ len -= 4;
+ num--;
}
+
+ val = pad;
+
+ for (i = 0; i < len; i++)
+ val = ucp[i] + (val << 8);
+
if (--num >= 0)
*buf++ = val;
+
while (--num >= 0)
*buf++ = pad;
+
}
/*
@@ -205,8 +220,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
const char *p;
int i;
__u32 in[8], buf[4];
- void (*str2hashbuf)(const char *, int, __u32 *, int) =
- str2hashbuf_signed;
+ bool use_unsigned = false;
/* Initialize the default seed for the hash checksum functions */
buf[0] = 0x67452301;
@@ -232,12 +246,15 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
hash = dx_hack_hash_signed(name, len);
break;
case DX_HASH_HALF_MD4_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
+ use_unsigned = true;
fallthrough;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
- (*str2hashbuf)(p, len, in, 8);
+ if (use_unsigned)
+ str2hashbuf_unsigned(p, len, in, 8);
+ else
+ str2hashbuf_signed(p, len, in, 8);
half_md4_transform(buf, in);
len -= 32;
p += 32;
@@ -246,12 +263,15 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
hash = buf[1];
break;
case DX_HASH_TEA_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
+ use_unsigned = true;
fallthrough;
case DX_HASH_TEA:
p = name;
while (len > 0) {
- (*str2hashbuf)(p, len, in, 4);
+ if (use_unsigned)
+ str2hashbuf_unsigned(p, len, in, 4);
+ else
+ str2hashbuf_signed(p, len, in, 4);
TEA_transform(buf, in);
len -= 16;
p += 16;
@@ -321,3 +341,7 @@ opaque_seq:
#endif
return __ext4fs_dirhash(dir, name, len, hinfo);
}
+
+#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS)
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4fs_dirhash);
+#endif
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d..ce99807c5f5b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1560,7 +1560,8 @@ static int ext4_journalled_write_end(const struct kiocb *iocb,
BUG_ON(!ext4_handle_valid(handle));
- if (ext4_has_inline_data(inode))
+ if (ext4_has_inline_data(inode) &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
return ext4_write_inline_data_end(inode, pos, len, copied,
folio);
@@ -5025,6 +5026,57 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
return ret;
}
+/*
+ * ext4_get_inode_loc_noio() - locate an inode's table block without I/O
+ *
+ * Best-effort counterpart of ext4_get_inode_loc(): the inode table block is
+ * only looked up in the buffer cache. On success @iloc->bh holds a reference
+ * to the uptodate buffer and 0 is returned. -EAGAIN means the block is not
+ * cached or not uptodate; -EFSCORRUPTED reports an out-of-range inode number
+ * or inode table block, and -EIO a missing group descriptor. @iloc->bh is
+ * NULL on every error return.
+ */
+int ext4_get_inode_loc_noio(struct inode *inode, struct ext4_iloc *iloc)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned long ino = inode->i_ino;
+	struct ext4_group_desc *gdp;
+	struct buffer_head *bh;
+	ext4_fsblk_t table_blk;
+	int per_block, idx_in_group;
+
+	iloc->bh = NULL;
+	if (ino < EXT4_ROOT_INO ||
+	    ino > le32_to_cpu(sbi->s_es->s_inodes_count))
+		return -EFSCORRUPTED;
+
+	iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+	if (!gdp)
+		return -EIO;
+
+	/* Position of the inode inside its group, then inside its block. */
+	idx_in_group = ((ino - 1) % EXT4_INODES_PER_GROUP(sb));
+	per_block = sbi->s_inodes_per_block;
+	iloc->offset = (idx_in_group % per_block) * EXT4_INODE_SIZE(sb);
+
+	table_blk = ext4_inode_table(sb, gdp);
+	if (table_blk <= le32_to_cpu(sbi->s_es->s_first_data_block) ||
+	    table_blk >= ext4_blocks_count(sbi->s_es)) {
+		ext4_error(sb,
+			   "Invalid inode table block %llu in block_group %u",
+			   table_blk, iloc->block_group);
+		return -EFSCORRUPTED;
+	}
+	table_blk += idx_in_group / per_block;
+
+	/* Cache lookup only: a missing or non-uptodate buffer is not read. */
+	bh = sb_find_get_block(sb, table_blk);
+	if (bh && ext4_buffer_uptodate(bh)) {
+		iloc->bh = bh;
+		return 0;
+	}
+	brelse(bh);	/* brelse(NULL) is a no-op */
+	return -EAGAIN;
+}
+
int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
struct ext4_iloc *iloc)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1d0c3d4bdf47..c8387e6a2c6e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -830,11 +830,17 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags)
bdev_thaw(sb->s_bdev);
break;
case EXT4_GOING_FLAGS_LOGFLUSH:
+ /*
+ * Call ext4_force_commit() before setting EXT4_FLAGS_SHUTDOWN.
+ * This is because in data=ordered mode, journal commit
+ * triggers data writeback which fails if shutdown is already
+ * set, causing the journal to be aborted prematurely before
+ * the commit succeeds.
+ */
+ (void) ext4_force_commit(sb);
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
- if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) {
- (void) ext4_force_commit(sb);
+ if (sbi->s_journal && !is_journal_aborted(sbi->s_journal))
jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN);
- }
break;
case EXT4_GOING_FLAGS_NOLOGFLUSH:
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
@@ -1650,6 +1656,9 @@ group_extend_out:
if (!(fd_file(donor)->f_mode & FMODE_WRITE))
return -EBADF;
+ if (file_inode(filp)->i_sb != file_inode(fd_file(donor))->i_sb)
+ return -EXDEV;
+
err = mnt_want_write_file(filp);
if (err)
return err;
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index d90da44aadbd..0424b8b0b4c3 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -727,8 +727,7 @@ do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap,
ext4_mb_generate_buddy_test(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP,
ext4_grp);
- KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize),
- 0);
+ KUNIT_ASSERT_MEMEQ(test, mbt_buddy, ext4_buddy, sb->s_blocksize);
mbt_validate_group_info(test, mbt_grp, ext4_grp);
}
@@ -789,8 +788,7 @@ test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b,
grp->bb_counters[i] = 0;
ext4_mb_generate_buddy_test(sb, buddy, bitmap, 0, grp);
- KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
- 0);
+ KUNIT_ASSERT_MEMEQ(test, buddy, e4b->bd_buddy, sb->s_blocksize);
mbt_validate_group_info(test, grp, e4b->bd_info);
}
@@ -854,8 +852,7 @@ test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b,
grp->bb_counters[i] = 0;
ext4_mb_generate_buddy_test(sb, buddy, bitmap, 0, grp);
- KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
- 0);
+ KUNIT_ASSERT_MEMEQ(test, buddy, e4b->bd_buddy, sb->s_blocksize);
mbt_validate_group_info(test, grp, e4b->bd_info);
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 4a47fbd8dd30..cc49ae04a6f6 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -145,9 +145,9 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
if (IS_ERR(bh)) {
__ext4_warning(inode->i_sb, func, line,
"inode #%llu: lblock %lu: comm %s: "
- "error %ld reading directory block",
+ "error %pe reading directory block",
inode->i_ino, (unsigned long)block,
- current->comm, PTR_ERR(bh));
+ current->comm, bh);
return bh;
}
@@ -3054,7 +3054,7 @@ out_stop:
out_retry:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
- return ERR_PTR(err);
+ return err ? ERR_PTR(err) : NULL;
}
/*
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dc82e7b57e75..bc674aa4a656 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -168,7 +168,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
* written. On IO failure, check if journal abort is needed. Note that
* we are protected from truncate touching same part of extent tree by the
* fact that truncate code waits for all DIO to finish (thus exclusion from
- * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * direct IO is achieved) and also waits for writeback to complete. Thus we
* cannot get to ext4_ext_truncate() before all IOs overlapping that range are
* completed (happens from ext4_free_ioend()).
*/
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7283108d7609..245f67d10ded 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1431,6 +1431,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ext4_fc_init_inode(&ei->vfs_inode);
spin_lock_init(&ei->i_fc_lock);
mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data);
+#ifdef CONFIG_LOCKDEP
+ lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_NORMAL);
+#endif
return &ei->vfs_inode;
}
@@ -4541,6 +4544,7 @@ static void ext4_fast_commit_init(struct super_block *sb)
sbi->s_fc_ineligible_tid = 0;
mutex_init(&sbi->s_fc_lock);
memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+ memset(&sbi->s_fc_snap_stats, 0, sizeof(sbi->s_fc_snap_stats));
sbi->s_fc_replay_state.fc_regions = NULL;
sbi->s_fc_replay_state.fc_regions_size = 0;
sbi->s_fc_replay_state.fc_regions_used = 0;
@@ -5910,6 +5914,11 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
return ERR_PTR(-EFSCORRUPTED);
}
+#ifdef CONFIG_LOCKDEP
+ lockdep_set_subclass(&EXT4_I(journal_inode)->i_data_sem,
+ I_DATA_SEM_JOURNAL);
+#endif
+
ext4_debug("Journal inode found at %p: %lld bytes\n",
journal_inode, journal_inode->i_size);
return journal_inode;
@@ -5977,8 +5986,8 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
sb, &fs_holder_ops);
if (IS_ERR(bdev_file)) {
ext4_msg(sb, KERN_ERR,
- "failed to open journal device unknown-block(%u,%u) %ld",
- MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file));
+ "failed to open journal device unknown-block(%u,%u) %pe",
+ MAJOR(j_dev), MINOR(j_dev), bdev_file);
return bdev_file;
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d8577725a2fb..3029cb6f6d64 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -512,10 +512,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* leave undo-committed data.
*/
if (jh->b_committed_data) {
- struct buffer_head *bh = jh2bh(jh);
-
spin_lock(&jh->b_state_lock);
- jbd2_free(jh->b_committed_data, bh->b_size);
+ kfree(jh->b_committed_data);
jh->b_committed_data = NULL;
spin_unlock(&jh->b_state_lock);
}
@@ -976,7 +974,7 @@ restart_loop:
* its triggers if they exist, so we can clear that too.
*/
if (jh->b_committed_data) {
- jbd2_free(jh->b_committed_data, bh->b_size);
+ kfree(jh->b_committed_data);
jh->b_committed_data = NULL;
if (jh->b_frozen_data) {
jh->b_committed_data = jh->b_frozen_data;
@@ -984,7 +982,7 @@ restart_loop:
jh->b_frozen_triggers = NULL;
}
} else if (jh->b_frozen_data) {
- jbd2_free(jh->b_frozen_data, bh->b_size);
+ kfree(jh->b_frozen_data);
jh->b_frozen_data = NULL;
jh->b_frozen_triggers = NULL;
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e82798680109..09efa337649e 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -95,8 +95,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
EXPORT_SYMBOL(jbd2_inode_cache);
-static int jbd2_journal_create_slab(size_t slab_size);
-
#ifdef CONFIG_JBD2_DEBUG
void __jbd2_debug(int level, const char *file, const char *func,
unsigned int line, const char *fmt, ...)
@@ -385,10 +383,10 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
goto escape_done;
spin_unlock(&jh_in->b_state_lock);
- tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
+ tmp = kmalloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
spin_lock(&jh_in->b_state_lock);
if (jh_in->b_frozen_data) {
- jbd2_free(tmp, bh_in->b_size);
+ kfree(tmp);
goto copy_done;
}
@@ -2062,14 +2060,6 @@ EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
int jbd2_journal_load(journal_t *journal)
{
int err;
- journal_superblock_t *sb = journal->j_superblock;
-
- /*
- * Create a slab for this blocksize
- */
- err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
- if (err)
- return err;
/* Let the recovery code check whether it needs to recover any
* data from the journal. */
@@ -2261,6 +2251,8 @@ jbd2_journal_initialize_fast_commit(journal_t *journal)
unsigned long long num_fc_blks;
num_fc_blks = jbd2_journal_get_num_fc_blks(sb);
+ if (num_fc_blks > journal->j_last)
+ return -EFSCORRUPTED;
if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS)
return -ENOSPC;
@@ -2698,105 +2690,6 @@ size_t journal_tag_bytes(journal_t *journal)
}
/*
- * JBD memory management
- *
- * These functions are used to allocate block-sized chunks of memory
- * used for making copies of buffer_head data. Very often it will be
- * page-sized chunks of data, but sometimes it will be in
- * sub-page-size chunks. (For example, 16k pages on Power systems
- * with a 4k block file system.) For blocks smaller than a page, we
- * use a SLAB allocator. There are slab caches for each block size,
- * which are allocated at mount time, if necessary, and we only free
- * (all of) the slab caches when/if the jbd2 module is unloaded. For
- * this reason we don't need to a mutex to protect access to
- * jbd2_slab[] allocating or releasing memory; only in
- * jbd2_journal_create_slab().
- */
-#define JBD2_MAX_SLABS 8
-static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
-
-static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
- "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
- "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
-};
-
-
-static void jbd2_journal_destroy_slabs(void)
-{
- int i;
-
- for (i = 0; i < JBD2_MAX_SLABS; i++) {
- kmem_cache_destroy(jbd2_slab[i]);
- jbd2_slab[i] = NULL;
- }
-}
-
-static int jbd2_journal_create_slab(size_t size)
-{
- static DEFINE_MUTEX(jbd2_slab_create_mutex);
- int i = order_base_2(size) - 10;
- size_t slab_size;
-
- if (size == PAGE_SIZE)
- return 0;
-
- if (i >= JBD2_MAX_SLABS)
- return -EINVAL;
-
- if (unlikely(i < 0))
- i = 0;
- mutex_lock(&jbd2_slab_create_mutex);
- if (jbd2_slab[i]) {
- mutex_unlock(&jbd2_slab_create_mutex);
- return 0; /* Already created */
- }
-
- slab_size = 1 << (i+10);
- jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
- slab_size, 0, NULL);
- mutex_unlock(&jbd2_slab_create_mutex);
- if (!jbd2_slab[i]) {
- printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
- return -ENOMEM;
- }
- return 0;
-}
-
-static struct kmem_cache *get_slab(size_t size)
-{
- int i = order_base_2(size) - 10;
-
- BUG_ON(i >= JBD2_MAX_SLABS);
- if (unlikely(i < 0))
- i = 0;
- BUG_ON(jbd2_slab[i] == NULL);
- return jbd2_slab[i];
-}
-
-void *jbd2_alloc(size_t size, gfp_t flags)
-{
- void *ptr;
-
- BUG_ON(size & (size-1)); /* Must be a power of 2 */
-
- if (size < PAGE_SIZE)
- ptr = kmem_cache_alloc(get_slab(size), flags);
- else
- ptr = kmalloc(size, flags);
-
- /* Check alignment; SLUB has gotten this wrong in the past,
- * and this can lead to user data corruption! */
- BUG_ON(((unsigned long) ptr) & (size-1));
-
- return ptr;
-}
-
-void jbd2_free(void *ptr, size_t size)
-{
- kfree(ptr);
-};
-
-/*
* Journal_head storage management
*/
static struct kmem_cache *jbd2_journal_head_cache;
@@ -2969,15 +2862,15 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
clear_buffer_jbd(bh);
}
-static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
+static void journal_release_journal_head(struct journal_head *jh)
{
if (jh->b_frozen_data) {
printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
- jbd2_free(jh->b_frozen_data, b_size);
+ kfree(jh->b_frozen_data);
}
if (jh->b_committed_data) {
printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
- jbd2_free(jh->b_committed_data, b_size);
+ kfree(jh->b_committed_data);
}
journal_free_journal_head(jh);
}
@@ -2996,7 +2889,7 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
if (!jh->b_jcount) {
__journal_remove_journal_head(bh);
jbd_unlock_bh_journal_head(bh);
- journal_release_journal_head(jh, bh->b_size);
+ journal_release_journal_head(jh);
__brelse(bh);
} else {
jbd_unlock_bh_journal_head(bh);
@@ -3138,7 +3031,6 @@ static void jbd2_journal_destroy_caches(void)
jbd2_journal_destroy_handle_cache();
jbd2_journal_destroy_inode_cache();
jbd2_journal_destroy_transaction_cache();
- jbd2_journal_destroy_slabs();
}
static int __init journal_init(void)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4885903bbd10..5cc7d097b2ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1131,7 +1131,7 @@ repeat:
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
spin_unlock(&jh->b_state_lock);
- frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
+ frozen_buffer = kmalloc(jh2bh(jh)->b_size,
GFP_NOFS | __GFP_NOFAIL);
goto repeat;
}
@@ -1159,7 +1159,7 @@ done:
out:
if (unlikely(frozen_buffer)) /* It's usually NULL */
- jbd2_free(frozen_buffer, bh->b_size);
+ kfree(frozen_buffer);
JBUFFER_TRACE(jh, "exit");
return error;
@@ -1424,7 +1424,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
repeat:
if (!jh->b_committed_data)
- committed_data = jbd2_alloc(jh2bh(jh)->b_size,
+ committed_data = kmalloc(jh2bh(jh)->b_size,
GFP_NOFS|__GFP_NOFAIL);
spin_lock(&jh->b_state_lock);
@@ -1445,7 +1445,7 @@ repeat:
out:
jbd2_journal_put_journal_head(jh);
if (unlikely(committed_data))
- jbd2_free(committed_data, bh->b_size);
+ kfree(committed_data);
return err;
}
@@ -1516,14 +1516,19 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
*/
int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
{
- transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ transaction_t *transaction;
+ journal_t *journal;
struct journal_head *jh;
int ret = 0;
+ if (is_handle_aborted(handle))
+ return -EROFS;
if (!buffer_jbd(bh))
return -EUCLEAN;
+ transaction = handle->h_transaction;
+ journal = transaction->t_journal;
+
/*
* We don't grab jh reference here since the buffer must be part
* of the running transaction.
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 7e785aa6d35d..b68561187e90 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -63,9 +63,6 @@ void __jbd2_debug(int level, const char *file, const char *func,
#define jbd2_debug(n, fmt, a...) no_printk(fmt, ##a)
#endif
-extern void *jbd2_alloc(size_t size, gfp_t flags);
-extern void jbd2_free(void *ptr, size_t size);
-
#define JBD2_MIN_JOURNAL_BLOCKS 1024
#define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index f493642cf121..7028a28316fa 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -107,6 +107,26 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MOVE_EXT);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
+#undef EM
+#undef EMe
+#define EM(a) TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a); /* list entry */
+#define EMe(a) TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a); /* final list entry; same expansion as EM() here */
+/* Suffixes of the EXT4_FC_SNAP_ERR_* values; the expansion depends on the EM()/EMe() in effect. */
+#define TRACE_SNAP_ERR \
+ EM(NONE) \
+ EM(ES_MISS) \
+ EM(ES_DELAYED) \
+ EM(ES_OTHER) \
+ EM(INODES_CAP) \
+ EM(RANGES_CAP) \
+ EM(NOMEM) \
+ EMe(INODE_LOC)
+/* Expand the list once here: one TRACE_DEFINE_ENUM() per listed value. */
+TRACE_SNAP_ERR
+/* Only the helpers are dropped here; TRACE_SNAP_ERR itself stays defined. */
+#undef EM
+#undef EMe
+
#define show_fc_reason(reason) \
__print_symbolic(reason, \
{ EXT4_FC_REASON_XATTR, "XATTR"}, \
@@ -2818,6 +2838,47 @@ TRACE_EVENT(ext4_fc_commit_stop,
__entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid)
);
+#define EM(a) { EXT4_FC_SNAP_ERR_##a, #a }, /* { value, "NAME" } pair for the __print_symbolic() table below */
+#define EMe(a) { EXT4_FC_SNAP_ERR_##a, #a } /* final pair, no trailing comma */
+/* ext4_fc_lock_updates: records dev, tid, locked_ns, nr_inodes, nr_ranges, err and snap_err. */
+TRACE_EVENT(ext4_fc_lock_updates,
+ TP_PROTO(struct super_block *sb, tid_t commit_tid, u64 locked_ns,
+ unsigned int nr_inodes, unsigned int nr_ranges, int err,
+ int snap_err),
+
+ TP_ARGS(sb, commit_tid, locked_ns, nr_inodes, nr_ranges, err, snap_err),
+
+ TP_STRUCT__entry(/* fields stored in each trace record */
+ __field(dev_t, dev)
+ __field(tid_t, tid)
+ __field(u64, locked_ns)
+ __field(unsigned int, nr_inodes)
+ __field(unsigned int, nr_ranges)
+ __field(int, err)
+ __field(int, snap_err)
+ ),
+
+ TP_fast_assign(/* copy the arguments into the record; dev is taken from sb->s_dev */
+ __entry->dev = sb->s_dev;
+ __entry->tid = commit_tid;
+ __entry->locked_ns = locked_ns;
+ __entry->nr_inodes = nr_inodes;
+ __entry->nr_ranges = nr_ranges;
+ __entry->err = err;
+ __entry->snap_err = snap_err;
+ ),
+
+ TP_printk("dev %d,%d tid %u locked_ns %llu nr_inodes %u nr_ranges %u err %d snap_err %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
+ __entry->locked_ns, __entry->nr_inodes, __entry->nr_ranges,
+ __entry->err, __print_symbolic(__entry->snap_err,
+ TRACE_SNAP_ERR))
+);
+/* The helpers and the list are undefined once the event has been declared. */
+#undef EM
+#undef EMe
+#undef TRACE_SNAP_ERR
+
#define FC_REASON_NAME_STAT(reason) \
show_fc_reason(reason), \
__entry->fc_ineligible_rc[reason]