From 55138e0bc29c0751e2152df9ad35deea542f29b3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 29 Sep 2009 13:31:31 -0400 Subject: ext4: Adjust ext4_da_writepages() to write out larger contiguous chunks Work around problems in the writeback code to force out writebacks in larger chunks than just 4mb, which is just too small. This also works around limitations in the ext4 block allocator, which can't allocate more than 2048 blocks at a time. So we need to defeat the round-robin characteristics of the writeback code and try to write out as many blocks in one inode before allowing the writeback code to move on to another inode. We add a a new per-filesystem tunable, max_writeback_mb_bump, which caps this to a default of 128mb per inode. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index df539ba27779..16817737ba52 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2197,6 +2197,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), @@ -2210,6 +2211,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_order2_req), ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), NULL, }; @@ -2679,6 +2681,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_stripe = ext4_get_stripe_size(sbi); + sbi->s_max_writeback_mb_bump = 128; /* * set up enough so that it can read an inode -- cgit v1.2.3 From 4c0425ff68b1b87b802ffeda7b6a46ff7da7241c Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Mon, 28 Sep 2009 15:48:41 -0400 Subject: ext4: Use end_io callback to avoid direct I/O fallback to buffered I/O Currently the DIO VFS code passes create = 0 when writing to the middle of file. It does this to avoid block allocation for holes, so as not to expose stale data out when there is a parallel buffered read (which does not hold the i_mutex lock). Direct I/O writes into holes falls back to buffered IO for this reason. Since preallocated extents are treated as holes when doing a get_block() look up (buffer is not mapped), direct IO over fallocate also falls back to buffered IO. Thus ext4 actually silently falls back to buffered IO in above two cases, which is undesirable. To fix this, this patch creates unitialized extents when a direct I/O write into holes in sparse files, and registering an end_io callback which converts the uninitialized extent to an initialized extent after the I/O is completed. Singed-Off-By: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16817737ba52..1a03ea98fdd1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -580,6 +580,9 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + flush_workqueue(sbi->dio_unwritten_wq); + destroy_workqueue(sbi->dio_unwritten_wq); + lock_super(sb); lock_kernel(); if (sb->s_dirt) @@ -2801,6 +2804,12 @@ no_journal: clear_opt(sbi->s_mount_opt, NOBH); } } + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); + if (!EXT4_SB(sb)->dio_unwritten_wq) { + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); + goto failed_mount_wq; + } + /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. @@ -2913,6 +2922,8 @@ cantfind_ext4: failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); +failed_mount_wq: ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); -- cgit v1.2.3 From 8d5d02e6b176565c77ff03604908b1453a22044d Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Mon, 28 Sep 2009 15:48:29 -0400 Subject: ext4: async direct IO for holes and fallocate support For async direct IO that covers holes or fallocate, the end_io callback function now queued the convertion work on workqueue but don't flush the work rightaway as it might take too long to afford. But when fsync is called after all the data is completed, user expects the metadata also being updated before fsync returns. Thus we need to flush the conversion work when fsync() is called. This patch keep track of a listed of completed async direct io that has a work queued on workqueue. When fsync() is called, it will go through the list and do the conversion. Signed-off-by: Mingming Cao --- fs/ext4/super.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1a03ea98fdd1..f095c60b569e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -687,6 +687,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_allocated_meta_blocks = 0; ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); + INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); + ei->cur_aio_dio = NULL; return &ei->vfs_inode; } @@ -3375,11 +3377,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; + struct ext4_sb_info *sbi = EXT4_SB(sb); trace_ext4_sync_fs(sb, wait); - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { + flush_workqueue(sbi->dio_unwritten_wq); + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); + jbd2_log_wait_commit(sbi->s_journal, target); } return ret; } -- cgit v1.2.3 From d3d1faf6a74496ea4435fd057c6a2cad49f3e523 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Tue, 29 Sep 2009 11:01:03 -0400 Subject: ext4: Handle nested ext4_journal_start/stop calls without a journal This patch fixes a problem with handling nested calls to ext4_journal_start/ext4_journal_stop, when there is no journal present. Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f095c60b569e..3f7e7010c098 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -189,6 +189,36 @@ void ext4_itable_unused_set(struct super_block *sb, bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } + +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) +{ + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; +} + + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) +{ + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; +} + /* * Wrappers for jbd2_journal_start/end. * @@ -215,11 +245,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) } return jbd2_journal_start(journal, nblocks); } - /* - * We're not journaling, return the appropriate indication. - */ - current->journal_info = EXT4_NOJOURNAL_HANDLE; - return current->journal_info; + return ext4_get_nojournal(); } /* @@ -235,11 +261,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle) int rc; if (!ext4_handle_valid(handle)) { - /* - * Do this here since we don't call jbd2_journal_stop() in - * no-journal mode. - */ - current->journal_info = NULL; + ext4_put_nojournal(handle); return 0; } sb = handle->h_transaction->t_journal->j_private; -- cgit v1.2.3 From 90576c0b9a0b5323fc4bd7f23f49be0d234f36d1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 29 Sep 2009 15:51:30 -0400 Subject: ext4, jbd2: Drop unneeded printks at mount and unmount time There are a number of kernel printk's which are printed when an ext4 filesystem is mounted and unmounted. Disable them to economize space in the system logs. In addition, disabling the mballoc stats by default saves a number of unneeded atomic operations for every block allocation or deallocation. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3f7e7010c098..e5b206a043a5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1673,13 +1673,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt); - if (EXT4_SB(sb)->s_journal) { - ext4_msg(sb, KERN_INFO, "%s journal on %s", - EXT4_SB(sb)->s_journal->j_inode ? "internal" : - "external", EXT4_SB(sb)->s_journal->j_devname); - } else { - ext4_msg(sb, KERN_INFO, "no journal"); - } return res; } @@ -2885,12 +2878,12 @@ no_journal: "available"); } - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + if (test_opt(sb, DELALLOC) && + (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " "requested data journaling mode"); clear_opt(sbi->s_mount_opt, DELALLOC); - } else if (test_opt(sb, DELALLOC)) - ext4_msg(sb, KERN_INFO, "delayed allocation enabled"); + } err = ext4_setup_system_zone(sb); if (err) { @@ -3202,9 +3195,7 @@ static int ext4_load_journal(struct super_block *sb, return -EINVAL; } - if (journal->j_flags & JBD2_BARRIER) - ext4_msg(sb, KERN_INFO, "barriers enabled"); - else + if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { -- cgit v1.2.3 From 296c355cd6443d89fa251885a8d78778fe111dc4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 30 Sep 2009 00:32:42 -0400 Subject: ext4: Use tracepoints for mb_history trace file The /proc/fs/ext4//mb_history was maintained manually, and had a number of problems: it required a largish amount of memory to be allocated for each ext4 filesystem, and the s_mb_history_lock introduced a CPU contention problem. By ripping out the mb_history code and replacing it with ftrace tracepoints, and we get more functionality: timestamps, event filtering, the ability to correlate mballoc history with other ext4 tracepoints, etc. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e5b206a043a5..12e726a7073f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -50,13 +50,6 @@ #define CREATE_TRACE_POINTS #include -static int default_mb_history_length = 1000; - -module_param_named(default_mb_history_length, default_mb_history_length, - int, 0644); -MODULE_PARM_DESC(default_mb_history_length, - "Default number of entries saved for mb_history"); - struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; @@ -1079,7 +1072,7 @@ enum { Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, @@ -1126,7 +1119,6 @@ static const match_table_t tokens = { {Opt_data_writeback, "data=writeback"}, {Opt_data_err_abort, "data_err=abort"}, {Opt_data_err_ignore, "data_err=ignore"}, - {Opt_mb_history_length, "mb_history_length=%u"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1367,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb, case Opt_data_err_ignore: clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); break; - case Opt_mb_history_length: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_mb_history_max = option; - break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -2435,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - sbi->s_mb_history_max = default_mb_history_length; set_opt(sbi->s_mount_opt, BARRIER); -- cgit v1.2.3