diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-26 09:53:20 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-26 09:53:20 -0700 |
commit | 35806b4f7c5620b547f183e9d53f7cfaeabb582b (patch) | |
tree | dc966f5edd9e482fdc85b8fb886ae39ce5c5ec80 | |
parent | 32e51f141fd8d880f57b6a2eb53ce72856254d4a (diff) | |
parent | d183e11a4a66d80e10d60b0918a47cf073135379 (diff) |
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (61 commits)
jbd2: Add MAINTAINERS entry
jbd2: fix a potential leak of a journal_head on an error path
ext4: teach ext4_ext_split to calculate extents efficiently
ext4: Convert ext4 to new truncate calling convention
ext4: do not normalize block requests from fallocate()
ext4: enable "punch hole" functionality
ext4: add "punch hole" flag to ext4_map_blocks()
ext4: punch out extents
ext4: add new function ext4_block_zero_page_range()
ext4: add flag to ext4_has_free_blocks
ext4: reserve inodes and feature code for 'quota' feature
ext4: add support for multiple mount protection
ext4: ensure f_bfree returned by ext4_statfs() is non-negative
ext4: protect bb_first_free in ext4_trim_all_free() with group lock
ext4: only load buddy bitmap in ext4_trim_fs() when it is needed
jbd2: Fix comment to match the code in jbd2__journal_start()
ext4: fix waiting and sending of a barrier in ext4_sync_file()
jbd2: Add function jbd2_trans_will_send_data_barrier()
jbd2: fix sending of data flush on journal commit
ext4: fix ext4_ext_fiemap_cb() to handle blocks before request range correctly
...
-rw-r--r-- | Documentation/filesystems/ext4.txt | 4 | ||||
-rw-r--r-- | MAINTAINERS | 13 | ||||
-rw-r--r-- | fs/ext4/Makefile | 3 | ||||
-rw-r--r-- | fs/ext4/balloc.c | 146 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 127 | ||||
-rw-r--r-- | fs/ext4/ext4_jbd2.c | 14 | ||||
-rw-r--r-- | fs/ext4/ext4_jbd2.h | 5 | ||||
-rw-r--r-- | fs/ext4/extents.c | 1410 | ||||
-rw-r--r-- | fs/ext4/file.c | 1 | ||||
-rw-r--r-- | fs/ext4/fsync.c | 25 | ||||
-rw-r--r-- | fs/ext4/inode.c | 114 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 459 | ||||
-rw-r--r-- | fs/ext4/mballoc.h | 6 | ||||
-rw-r--r-- | fs/ext4/migrate.c | 2 | ||||
-rw-r--r-- | fs/ext4/mmp.c | 351 | ||||
-rw-r--r-- | fs/ext4/move_extent.c | 3 | ||||
-rw-r--r-- | fs/ext4/namei.c | 82 | ||||
-rw-r--r-- | fs/ext4/page-io.c | 39 | ||||
-rw-r--r-- | fs/ext4/super.c | 204 | ||||
-rw-r--r-- | fs/ext4/xattr.c | 4 | ||||
-rw-r--r-- | fs/jbd2/commit.c | 22 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 58 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 22 | ||||
-rw-r--r-- | include/linux/jbd2.h | 8 |
24 files changed, 2019 insertions, 1103 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index c79ec58fd7f6..3ae9bc94352a 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -226,10 +226,6 @@ acl Enables POSIX Access Control Lists support. noacl This option disables POSIX Access Control List support. -reservation - -noreservation - bsddf (*) Make 'df' act like BSD. minixdf Make 'df' act like Minix. diff --git a/MAINTAINERS b/MAINTAINERS index 1ab17de642e5..d54d551004f7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3572,9 +3572,16 @@ M: Andrew Morton <akpm@linux-foundation.org> M: Jan Kara <jack@suse.cz> L: linux-ext4@vger.kernel.org S: Maintained -F: fs/jbd*/ -F: include/linux/ext*jbd*.h -F: include/linux/jbd*.h +F: fs/jbd/ +F: include/linux/ext3_jbd.h +F: include/linux/jbd.h + +JOURNALLING LAYER FOR BLOCK DEVICES (JBD2) +M: "Theodore Ts'o" <tytso@mit.edu> +L: linux-ext4@vger.kernel.org +S: Maintained +F: fs/jbd2/ +F: include/linux/jbd2.h JSM Neo PCI based serial card M: Breno Leitao <leitao@linux.vnet.ibm.com> diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index c947e36eda6c..04109460ba9e 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ + mmp.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1c67139ad4b4..264f6949511e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -362,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) } /** - * ext4_add_groupblocks() -- Add given blocks to an existing group - * @handle: handle to this transaction - * @sb: super block - * @block: start physcial block to add to the block group - * @count: number of blocks to free - * - * This marks the blocks as free in the bitmap. We ask the - * mballoc to reload the buddy after this by setting group - * EXT4_GROUP_INFO_NEED_INIT_BIT flag - */ -void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gd_bh; - ext4_group_t block_group; - ext4_grpblk_t bit; - unsigned int i; - struct ext4_group_desc *desc; - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err = 0, ret, blk_free_count; - ext4_grpblk_t blocks_freed; - struct ext4_group_info *grp; - - ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); - - ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - grp = ext4_get_group_info(sb, block_group); - /* - * Check to see if we are freeing blocks across a group - * boundary. - */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - goto error_return; - } - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) - goto error_return; - desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) - goto error_return; - - if (in_range(ext4_block_bitmap(sb, desc), block, count) || - in_range(ext4_inode_bitmap(sb, desc), block, count) || - in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || - in_range(block + count - 1, ext4_inode_table(sb, desc), - sbi->s_itb_per_group)) { - ext4_error(sb, "Adding blocks in system zones - " - "Block = %llu, count = %lu", - block, count); - goto error_return; - } - - /* - * We are about to add blocks to the bitmap, - * so we need undo access. - */ - BUFFER_TRACE(bitmap_bh, "getting undo access"); - err = ext4_journal_get_undo_access(handle, bitmap_bh); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); - if (err) - goto error_return; - /* - * make sure we don't allow a parallel init on other groups in the - * same buddy cache - */ - down_write(&grp->alloc_sem); - for (i = 0, blocks_freed = 0; i < count; i++) { - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), - bit + i, bitmap_bh->b_data)) { - ext4_error(sb, "bit already cleared for block %llu", - (ext4_fsblk_t)(block + i)); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - blocks_freed++; - } - } - ext4_lock_group(sb, block_group); - blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); - ext4_free_blks_set(sb, desc, blk_free_count); - desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); - ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(blocks_freed, - &sbi->s_flex_groups[flex_group].free_blocks); - } - /* - * request to reload the buddy with the - * new bitmap information - */ - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); - grp->bb_free += blocks_freed; - up_write(&grp->alloc_sem); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - -error_return: - brelse(bitmap_bh); - ext4_std_error(sb, err); - return; -} - -/** * ext4_has_free_blocks() * @sbi: in-core super block structure. * @nblocks: number of needed blocks @@ -493,7 +369,8 @@ error_return: * Check if filesystem has nblocks free & available for allocation. * On success return 1, return 0 on failure. */ -static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) +static int ext4_has_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks, unsigned int flags) { s64 free_blocks, dirty_blocks, root_blocks; struct percpu_counter *fbc = &sbi->s_freeblocks_counter; @@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) EXT4_FREEBLOCKS_WATERMARK) { free_blocks = percpu_counter_sum_positive(fbc); dirty_blocks = percpu_counter_sum_positive(dbc); - if (dirty_blocks < 0) { - printk(KERN_CRIT "Dirty block accounting " - "went wrong %lld\n", - (long long)dirty_blocks); - } } /* Check whether we have space after * accounting for current dirty blocks & root reserved blocks. @@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) /* Hm, nope. Are (enough) root reserved blocks available? */ if (sbi->s_resuid == current_fsuid() || ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE)) { + capable(CAP_SYS_RESOURCE) || + (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + if (free_blocks >= (nblocks + dirty_blocks)) return 1; } @@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) } int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks) + s64 nblocks, unsigned int flags) { - if (ext4_has_free_blocks(sbi, nblocks)) { + if (ext4_has_free_blocks(sbi, nblocks, flags)) { percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); return 0; } else @@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi, */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || + if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) || (*retries)++ > 3 || !EXT4_SB(sb)->s_journal) return 0; @@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp) + ext4_fsblk_t goal, unsigned int flags, + unsigned long *count, int *errp) { struct ext4_allocation_request ar; ext4_fsblk_t ret; @@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ar.inode = inode; ar.goal = goal; ar.len = count ? *count : 1; + ar.flags = flags; ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4daaf2b753f4..a74b89c09f90 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t; #define EXT4_MB_DELALLOC_RESERVED 0x0400 /* We are doing stream allocation */ #define EXT4_MB_STREAM_ALLOC 0x0800 - +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -209,6 +210,8 @@ struct ext4_io_submit { */ #define EXT4_BAD_INO 1 /* Bad blocks inode */ #define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ #define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ #define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ #define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ @@ -512,6 +515,10 @@ struct ext4_new_group_data { /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Punch out blocks of an extent */ +#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* * Flags used by ext4_free_blocks @@ -1028,7 +1035,7 @@ struct ext4_super_block { __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ __le32 s_flags; /* Miscellaneous flags */ __le16 s_raid_stride; /* RAID stride */ - __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ @@ -1144,6 +1151,9 @@ struct ext4_sb_info { unsigned long s_ext_blocks; unsigned long s_ext_extents; #endif + /* ext4 extent cache stats */ + unsigned long extent_cache_hits; + unsigned long extent_cache_misses; /* for buddy allocator */ struct ext4_group_info ***s_group_info; @@ -1201,6 +1211,9 @@ struct ext4_sb_info { struct ext4_li_request *s_li_request; /* Wait multiplier for lazy initialization thread */ unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ - EXT4_FEATURE_INCOMPAT_FLEX_BG) + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_MMP) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, */ struct ext4_lazy_init { unsigned long li_state; - - wait_queue_head_t li_wait_daemon; - wait_queue_head_t li_wait_task; - struct timer_list li_timer; - struct task_struct *li_task; - struct list_head li_request_list; struct mutex li_list_mtx; }; @@ -1615,6 +1639,67 @@ struct ext4_features { }; /* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. + */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[227]; +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* * Function prototypes */ @@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp); -extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); -extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count); + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks, unsigned int flags); extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, @@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode, unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); /* inode.c */ @@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); @@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); +extern int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, @@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int, __LINE__, ## message) extern void ext4_msg(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ + __LINE__, msg) extern void __ext4_grp_locked_error(const char *, unsigned int, \ struct super_block *, ext4_group_t, \ unsigned long, ext4_fsblk_t, \ @@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern void ext4_ext_truncate(struct inode *); +extern int ext4_ext_punch_hole(struct file *file, loff_t offset, + loff_t length); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, @@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, int len, struct writeback_control *wbc); +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + /* BH_Uninit flag: blocks are allocated but uninitialized on disk */ enum ext4_state_bits { BH_Uninit /* blocks are allocated but uninitialized on disk */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 6e272ef6ba96..f5240aa15601 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -6,20 +6,6 @@ #include <trace/events/ext4.h> -int __ext4_journal_get_undo_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh) -{ - int err = 0; - - if (ext4_handle_valid(handle)) { - err = jbd2_journal_get_undo_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, bh, - handle, err); - } - return err; -} - int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index d0f53538a57f..bb85757689b6 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); -int __ext4_journal_get_undo_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh); - int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh); @@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, int __ext4_handle_dirty_super(const char *where, unsigned int line, handle_t *handle, struct super_block *sb); -#define ext4_journal_get_undo_access(handle, bh) \ - __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4890d6f3ad15..5199bac7fc62 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -46,6 +46,13 @@ #include <trace/events/ext4.h> +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, + int flags); + static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, static ext4_fsblk_t ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *ex, int *err) + struct ext4_extent *ex, int *err, unsigned int flags) { ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, err); return newblock; } @@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) } ext_debug("\n"); } + +static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, + ext4_fsblk_t newblock, int level) +{ + int depth = ext_depth(inode); + struct ext4_extent *ex; + + if (depth != level) { + struct ext4_extent_idx *idx; + idx = path[level].p_idx; + while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { + ext_debug("%d: move %d:%llu in new index %llu\n", level, + le32_to_cpu(idx->ei_block), + ext4_idx_pblock(idx), + newblock); + idx++; + } + + return; + } + + ex = path[depth].p_ext; + while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", + le32_to_cpu(ex->ee_block), + ext4_ext_pblock(ex), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + newblock); + ex++; + } +} + #else #define ext4_ext_show_path(inode, path) #define ext4_ext_show_leaf(inode, path) +#define ext4_ext_show_move(inode, path, newblock, level) #endif void ext4_ext_drop_refs(struct ext4_ext_path *path) @@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, * - initializes subtree */ static int ext4_ext_split(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *newext, int at) + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext, int at) { struct buffer_head *bh = NULL; int depth = ext_depth(inode); struct ext4_extent_header *neh; struct ext4_extent_idx *fidx; - struct ext4_extent *ex; int i = at, k, m, a; ext4_fsblk_t newblock, oldblock; __le32 border; @@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, - newext, &err); + newext, &err, flags); if (newblock == 0) goto cleanup; ablocks[a] = newblock; @@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; - ex = EXT_FIRST_EXTENT(neh); /* move remainder of path[depth] to the new leaf */ if (unlikely(path[depth].p_hdr->eh_entries != @@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, goto cleanup; } /* start copy from next extent */ - /* TODO: we could do it by single memmove */ - m = 0; - path[depth].p_ext++; - while (path[depth].p_ext <= - EXT_MAX_EXTENT(path[depth].p_hdr)) { - ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", - le32_to_cpu(path[depth].p_ext->ee_block), - ext4_ext_pblock(path[depth].p_ext), - ext4_ext_is_uninitialized(path[depth].p_ext), - ext4_ext_get_actual_len(path[depth].p_ext), - newblock); - /*memmove(ex++, path[depth].p_ext++, - sizeof(struct ext4_extent)); - neh->eh_entries++;*/ - path[depth].p_ext++; - m++; - } + m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; + ext4_ext_show_move(inode, path, newblock, depth); if (m) { - memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); + struct ext4_extent *ex; + ex = EXT_FIRST_EXTENT(neh); + memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); le16_add_cpu(&neh->eh_entries, m); } @@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); - /* copy indexes */ - m = 0; - path[i].p_idx++; - ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, - EXT_MAX_INDEX(path[i].p_hdr)); + /* move remainder of path[i] to the new index block */ if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != EXT_LAST_INDEX(path[i].p_hdr))) { EXT4_ERROR_INODE(inode, @@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = -EIO; goto cleanup; } - while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { - ext_debug("%d: move %d:%llu in new index %llu\n", i, - le32_to_cpu(path[i].p_idx->ei_block), - ext4_idx_pblock(path[i].p_idx), - newblock); - /*memmove(++fidx, path[i].p_idx++, - sizeof(struct ext4_extent_idx)); - neh->eh_entries++; - BUG_ON(neh->eh_entries > neh->eh_max);*/ - path[i].p_idx++; - m++; - } + /* start copy indexes */ + m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; + ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); + ext4_ext_show_move(inode, path, newblock, i); if (m) { - memmove(++fidx, path[i].p_idx - m, + memmove(++fidx, path[i].p_idx, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } @@ -1056,8 +1073,9 @@ cleanup: * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *newext) + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) { struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; @@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err, flags); if (newblock == 0) return err; @@ -1140,8 +1159,9 @@ out: * if no free index is found, then it requests in-depth growing. */ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *newext) + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) { struct ext4_ext_path *curp; int depth, i, err = 0; @@ -1161,7 +1181,7 @@ repeat: if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, path, newext, i); + err = ext4_ext_split(handle, inode, flags, path, newext, i); if (err) goto out; @@ -1174,7 +1194,8 @@ repeat: err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, path, newext); + err = ext4_ext_grow_indepth(handle, inode, flags, + path, newext); if (err) goto out; @@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. */ -static int ext4_ext_try_to_merge(struct inode *inode, +static int ext4_ext_try_to_merge_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { @@ -1603,6 +1624,31 @@ static int ext4_ext_try_to_merge(struct inode *inode, } /* + * This function tries to merge the @ex extent to neighbours in the tree. + * return 1 if merge left else 0. + */ +static int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) { + struct ext4_extent_header *eh; + unsigned int depth; + int merge_done = 0; + int ret = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + if (ex > EXT_FIRST_EXTENT(eh)) + merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); + + if (!merge_done) + ret = ext4_ext_try_to_merge_right(inode, path, ex); + + return ret; +} + +/* * check if a portion of the "newext" extent overlaps with an * existing extent. * @@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, int depth, len, err; ext4_lblk_t next; unsigned uninitialized = 0; + int flags = 0; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1742,7 +1789,9 @@ repeat: * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ - err = ext4_ext_create_new_leaf(handle, inode, path, newext); + if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) + flags = EXT4_MB_USE_ROOT_BLOCKS; + err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } /* + * ext4_ext_in_cache() + * Checks to see if the given block is in the cache. + * If it is, the cached extent is stored in the given + * cache extent pointer. If the cached extent is a hole, + * this routine should be used instead of + * ext4_ext_in_cache if the calling function needs to + * know the size of the hole. + * + * @inode: The files inode + * @block: The block to look for in the cache + * @ex: Pointer where the cached extent will be stored + * if it contains block + * * Return 0 if cache is invalid; 1 if the cache is valid */ -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_extent *ex) -{ +static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_cache *ex){ struct ext4_ext_cache *cex; + struct ext4_sb_info *sbi; int ret = 0; /* @@ -2017,26 +2078,60 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; + sbi = EXT4_SB(inode->i_sb); /* has cache valid data? */ if (cex->ec_len == 0) goto errout; if (in_range(block, cex->ec_block, cex->ec_len)) { - ex->ee_block = cpu_to_le32(cex->ec_block); - ext4_ext_store_pblock(ex, cex->ec_start); - ex->ee_len = cpu_to_le16(cex->ec_len); + memcpy(ex, cex, sizeof(struct ext4_ext_cache)); ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); ret = 1; } errout: + if (!ret) + sbi->extent_cache_misses++; + else + sbi->extent_cache_hits++; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return ret; } /* + * ext4_ext_in_cache() + * Checks to see if the given block is in the cache. + * If it is, the cached extent is stored in the given + * extent pointer. + * + * @inode: The files inode + * @block: The block to look for in the cache + * @ex: Pointer where the cached extent will be stored + * if it contains block + * + * Return 0 if cache is invalid; 1 if the cache is valid + */ +static int +ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_extent *ex) +{ + struct ext4_ext_cache cex; + int ret = 0; + + if (ext4_ext_check_cache(inode, block, &cex)) { + ex->ee_block = cpu_to_le32(cex.ec_block); + ext4_ext_store_pblock(ex, cex.ec_start); + ex->ee_len = cpu_to_le16(cex.ec_len); + ret = 1; + } + + return ret; +} + + +/* * ext4_ext_rm_idx: * removes index from the index block. * It's used in truncate case only, thus all requests are for @@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, NULL, start, num, flags); } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { - printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), ee_len); + /* head removal */ + ext4_lblk_t num; + ext4_fsblk_t start; + + num = to - from; + start = ext4_ext_pblock(ex); + + ext_debug("free first %u blocks starting %llu\n", num, start); + ext4_free_blocks(handle, inode, 0, start, num, flags); + } else { printk(KERN_INFO "strange request: removal(2) " "%u-%u from %u:%u\n", @@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, return 0; } + +/* + * ext4_ext_rm_leaf() Removes the extents associated with the + * blocks appearing between "start" and "end", and splits the extents + * if "start" and "end" appear in the same extent + * + * @handle: The journal handle + * @inode: The files inode + * @path: The path to the leaf + * @start: The first block to remove + * @end: The last block to remove + */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_lblk_t start) + struct ext4_ext_path *path, ext4_lblk_t start, + ext4_lblk_t end) { int err = 0, correct_index = 0; int depth = ext_depth(inode), credits; @@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned short ex_ee_len; unsigned uninitialized = 0; struct ext4_extent *ex; + struct ext4_map_blocks map; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug("truncate since %u in leaf\n", start); @@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; - b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? - ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; + b = ex_ee_block+ex_ee_len - 1 < end ? + ex_ee_block+ex_ee_len - 1 : end; ext_debug(" border %u:%u\n", a, b); - if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { - block = 0; - num = 0; - BUG(); + /* If this extent is beyond the end of the hole, skip it */ + if (end <= ex_ee_block) { + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + continue; + } else if (a != ex_ee_block && + b != ex_ee_block + ex_ee_len - 1) { + /* + * If this is a truncate, then this condition should + * never happen because at least one of the end points + * needs to be on the edge of the extent. + */ + if (end == EXT_MAX_BLOCK) { + ext_debug(" bad truncate %u:%u\n", + start, end); + block = 0; + num = 0; + err = -EIO; + goto out; + } + /* + * else this is a hole punch, so the extent needs to + * be split since neither edge of the hole is on the + * extent edge + */ + else{ + map.m_pblk = ext4_ext_pblock(ex); + map.m_lblk = ex_ee_block; + map.m_len = b - ex_ee_block; + + err = ext4_split_extent(handle, + inode, path, &map, 0, + EXT4_GET_BLOCKS_PUNCH_OUT_EXT | + EXT4_GET_BLOCKS_PRE_IO); + + if (err < 0) + goto out; + + ex_ee_len = ext4_ext_get_actual_len(ex); + + b = ex_ee_block+ex_ee_len - 1 < end ? + ex_ee_block+ex_ee_len - 1 : end; + + /* Then remove tail of this extent */ + block = ex_ee_block; + num = a - block; + } } else if (a != ex_ee_block) { /* remove tail of the extent */ block = ex_ee_block; num = a - block; } else if (b != ex_ee_block + ex_ee_len - 1) { /* remove head of the extent */ - block = a; - num = b - a; - /* there is no "make a hole" API yet */ - BUG(); + block = b; + num = ex_ee_block + ex_ee_len - b; + + /* + * If this is a truncate, this condition + * should never happen + */ + if (end == EXT_MAX_BLOCK) { + ext_debug(" bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; + } } else { /* remove whole extent: excellent! */ block = ex_ee_block; num = 0; - BUG_ON(a != ex_ee_block); - BUG_ON(b != ex_ee_block + ex_ee_len - 1); + if (a != ex_ee_block) { + ext_debug(" bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; + } + + if (b != ex_ee_block + ex_ee_len - 1) { + ext_debug(" bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; + } } /* @@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (num == 0) { /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); - le16_add_cpu(&eh->eh_entries, -1); + } else if (block != ex_ee_block) { + /* + * If this was a head removal, then we need to update + * the physical block since it is now at a different + * location + */ + ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a)); } ex->ee_block = cpu_to_le32(block); @@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; + /* + * If the extent was completely released, + * we need to remove it from the leaf + */ + if (num == 0) { + if (end != EXT_MAX_BLOCK) { + /* + * For hole punching, we need to scoot all the + * extents up when an extent is removed so that + * we dont have blank extents in the middle + */ + memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * + sizeof(struct ext4_extent)); + + /* Now get rid of the one at the end */ + memset(EXT_LAST_EXTENT(eh), 0, + sizeof(struct ext4_extent)); + } + le16_add_cpu(&eh->eh_entries, -1); + } + ext_debug("new extent: %u:%u:%llu\n", block, num, ext4_ext_pblock(ex)); ex--; @@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2365,7 +2574,8 @@ again: while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ - err = ext4_ext_rm_leaf(handle, inode, path, start); + err = ext4_ext_rm_leaf(handle, inode, path, + start, end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) return ret; } +/* + * used by extent splitting. + */ +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ + due to ENOSPC */ +#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ +#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ + +/* + * ext4_split_extent_at() splits an extent at given block. + * + * @handle: the journal handle + * @inode: the file inode + * @path: the path to the extent + * @split: the logical block where the extent is splitted. + * @split_flags: indicates if the extent could be zeroout if split fails, and + * the states(init or uninit) of new extents. + * @flags: flags used to insert new extent to extent tree. + * + * + * Splits extent [a, b] into two extents [a, @split) and [@split, b], states + * of which are deterimined by split_flag. + * + * There are two cases: + * a> the extent are splitted into two extent. + * b> split is not needed, and just mark the extent. + * + * return 0 on success. + */ +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, + int flags) +{ + ext4_fsblk_t newblock; + ext4_lblk_t ee_block; + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex2 = NULL; + unsigned int ee_len, depth; + int err = 0; + + ext_debug("ext4_split_extents_at: inode %lu, logical" + "block %llu\n", inode->i_ino, (unsigned long long)split); + + ext4_ext_show_leaf(inode, path); + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + newblock = split - ee_block + ext4_ext_pblock(ex); + + BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + if (split == ee_block) { + /* + * case b: block @split is the block that the extent begins with + * then we just change the state of the extent, and splitting + * is not needed. + */ + if (split_flag & EXT4_EXT_MARK_UNINIT2) + ext4_ext_mark_uninitialized(ex); + else + ext4_ext_mark_initialized(ex); + + if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(inode, path, ex); + + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } + + /* case a */ + memcpy(&orig_ex, ex, sizeof(orig_ex)); + ex->ee_len = cpu_to_le16(split - ee_block); + if (split_flag & EXT4_EXT_MARK_UNINIT1) + ext4_ext_mark_uninitialized(ex); + + /* + * path may lead to new leaf, not to original leaf any more + * after ext4_ext_insert_extent() returns, + */ + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto fix_extent_len; + + ex2 = &newex; + ex2->ee_block = cpu_to_le32(split); + ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); + ext4_ext_store_pblock(ex2, newblock); + if (split_flag & EXT4_EXT_MARK_UNINIT2) + ext4_ext_mark_uninitialized(ex2); + + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_len = cpu_to_le32(ee_len); + ext4_ext_try_to_merge(inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } else if (err) + goto fix_extent_len; + +out: + ext4_ext_show_leaf(inode, path); + return err; + +fix_extent_len: + ex->ee_len = orig_ex.ee_len; + ext4_ext_dirty(handle, inode, path + depth); + return err; +} + +/* + * ext4_split_extents() splits an extent and mark extent which is covered + * by @map as split_flags indicates + * + * It may result in splitting the extent into multiple extents (upto three) + * There are three possibilities: + * a> There is no split required + * b> Splits in two extents: Split is happening at either end of the extent + * c> Splits in three extents: Somone is splitting in middle of the extent + * + */ +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, + int flags) +{ + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len, depth; + int err = 0; + int uninitialized; + int split_flag1, flags1; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + uninitialized = ext4_ext_is_uninitialized(ex); + + if (map->m_lblk + map->m_len < ee_block + ee_len) { + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? + EXT4_EXT_MAY_ZEROOUT : 0; + flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; + if (uninitialized) + split_flag1 |= EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2; + err = ext4_split_extent_at(handle, inode, path, + map->m_lblk + map->m_len, split_flag1, flags1); + if (err) + goto out; + } + + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); + if (IS_ERR(path)) + return PTR_ERR(path); + + if (map->m_lblk >= ee_block) { + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? + EXT4_EXT_MAY_ZEROOUT : 0; + if (uninitialized) + split_flag1 |= EXT4_EXT_MARK_UNINIT1; + if (split_flag & EXT4_EXT_MARK_UNINIT2) + split_flag1 |= EXT4_EXT_MARK_UNINIT2; + err = ext4_split_extent_at(handle, inode, path, + map->m_lblk, split_flag1, flags); + if (err) + goto out; + } + + ext4_ext_show_leaf(inode, path); +out: + return err ? err : map->m_len; +} + #define EXT4_EXT_ZERO_LEN 7 /* * This function is called by ext4_ext_map_blocks() if someone tries to write @@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_map_blocks *map, struct ext4_ext_path *path) { - struct ext4_extent *ex, newex, orig_ex; - struct ext4_extent *ex1 = NULL; - struct ext4_extent *ex2 = NULL; - struct ext4_extent *ex3 = NULL; - struct ext4_extent_header *eh; + struct ext4_map_blocks split_map; + struct ext4_extent zero_ex; + struct ext4_extent *ex; ext4_lblk_t ee_block, eof_block; unsigned int allocated, ee_len, depth; - ext4_fsblk_t newblock; int err = 0; - int ret = 0; - int may_zeroout; + int split_flag = 0; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); - eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); - newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); - - ex2 = ex; - orig_ex.ee_block = ex->ee_block; - orig_ex.ee_len = cpu_to_le16(ee_len); - ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); + WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully insde i_size or new_size. */ - may_zeroout = ee_block + ee_len <= eof_block; + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ - if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { - err = ext4_ext_zeroout(inode, &orig_ex); + if (ee_len <= 2*EXT4_EXT_ZERO_LEN && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + err = ext4_ext_zeroout(inode, ex); if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* zeroed the full extent */ - return allocated; - } - - /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ - if (map->m_lblk > ee_block) { - ex1 = ex; - ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); - ext4_ext_mark_uninitialized(ex1); - ex2 = &newex; - } - /* - * for sanity, update the length of the ex2 extent before - * we insert ex3, if ex1 is NULL. This is to avoid temporary - * overlap of blocks. - */ - if (!ex1 && allocated > map->m_len) - ex2->ee_len = cpu_to_le16(map->m_len); - /* ex3: to ee_block + ee_len : uninitialised */ - if (allocated > map->m_len) { - unsigned int newdepth; - /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ - if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { - /* - * map->m_lblk == ee_block is handled by the zerouout - * at the beginning. - * Mark first half uninitialized. - * Mark second half initialized and zero out the - * initialized extent - */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = cpu_to_le16(ee_len - allocated); - ext4_ext_mark_uninitialized(ex); - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - - ex3 = &newex; - ex3->ee_block = cpu_to_le32(map->m_lblk); - ext4_ext_store_pblock(ex3, newblock); - ex3->ee_len = cpu_to_le16(allocated); - err = ext4_ext_insert_extent(handle, inode, path, - ex3, 0); - if (err == -ENOSPC) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, - ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* blocks available from map->m_lblk */ - return allocated; - - } else if (err) - goto fix_extent_len; - - /* - * We need to zero out the second half because - * an fallocate request can update file size and - * converting the second half to initialized extent - * implies that we can leak some junk data to user - * space. - */ - err = ext4_ext_zeroout(inode, ex3); - if (err) { - /* - * We should actually mark the - * second half as uninit and return error - * Insert would have changed the extent - */ - depth = ext_depth(inode); - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, - path); - if (IS_ERR(path)) { - err = PTR_ERR(path); - return err; - } - /* get the second half extent details */ - ex = path[depth].p_ext; - err = ext4_ext_get_access(handle, inode, - path + depth); - if (err) - return err; - ext4_ext_mark_uninitialized(ex); - ext4_ext_dirty(handle, inode, path + depth); - return err; - } - - /* zeroed the second half */ - return allocated; - } - ex3 = &newex; - ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); - ext4_ext_store_pblock(ex3, newblock + map->m_len); - ex3->ee_len = cpu_to_le16(allocated - map->m_len); - ext4_ext_mark_uninitialized(ex3); - err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); - if (err == -ENOSPC && may_zeroout) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* zeroed the full extent */ - /* blocks available from map->m_lblk */ - return allocated; - - } else if (err) - goto fix_extent_len; - /* - * The depth, and hence eh & ex might change - * as part of the insert above. - */ - newdepth = ext_depth(inode); - /* - * update the extent length after successful insert of the - * split extent - */ - ee_len -= ext4_ext_get_actual_len(ex3); - orig_ex.ee_len = cpu_to_le16(ee_len); - may_zeroout = ee_block + ee_len <= eof_block; - - depth = newdepth; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); - if (IS_ERR(path)) { - err = PTR_ERR(path); goto out; - } - eh = path[depth].p_hdr; - ex = path[depth].p_ext; - if (ex2 != &newex) - ex2 = ex; err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; - - allocated = map->m_len; - - /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying - * to insert a extent in the middle zerout directly - * otherwise give the extent a chance to merge to left - */ - if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && - map->m_lblk != ee_block && may_zeroout) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* zero out the first half */ - /* blocks available from map->m_lblk */ - return allocated; - } - } - /* - * If there was a change of depth as part of the - * insertion of ex3 above, we need to update the length - * of the ex1 extent again here - */ - if (ex1 && ex1 != ex) { - ex1 = ex; - ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); - ext4_ext_mark_uninitialized(ex1); - ex2 = &newex; - } - /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */ - ex2->ee_block = cpu_to_le32(map->m_lblk); - ext4_ext_store_pblock(ex2, newblock); - ex2->ee_len = cpu_to_le16(allocated); - if (ex2 != ex) - goto insert; - /* - * New (initialized) extent starts from the first block - * in the current extent. i.e., ex2 == ex - * We have to see if it can be merged with the extent - * on the left. - */ - if (ex2 > EXT_FIRST_EXTENT(eh)) { - /* - * To merge left, pass "ex2 - 1" to try_to_merge(), - * since it merges towards right _only_. - */ - ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); - if (ret) { - err = ext4_ext_correct_indexes(handle, inode, path); - if (err) - goto out; - depth = ext_depth(inode); - ex2--; - } + ext4_ext_mark_initialized(ex); + ext4_ext_try_to_merge(inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; } + /* - * Try to Merge towards right. This might be required - * only when the whole extent is being written to. - * i.e. ex2 == ex and ex3 == NULL. + * four cases: + * 1. split the extent into three extents. + * 2. split the extent into two extents, zeroout the first half. + * 3. split the extent into two extents, zeroout the second half. + * 4. split the extent into two extents with out zeroout. */ - if (!ex3) { - ret = ext4_ext_try_to_merge(inode, path, ex2); - if (ret) { - err = ext4_ext_correct_indexes(handle, inode, path); + split_map.m_lblk = map->m_lblk; + split_map.m_len = map->m_len; + + if (allocated > map->m_len) { + if (allocated <= EXT4_EXT_ZERO_LEN && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + /* case 3 */ + zero_ex.ee_block = + cpu_to_le32(map->m_lblk); + zero_ex.ee_len = cpu_to_le16(allocated); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex) + map->m_lblk - ee_block); + err = ext4_ext_zeroout(inode, &zero_ex); if (err) goto out; + split_map.m_lblk = map->m_lblk; + split_map.m_len = allocated; + } else if ((map->m_lblk - ee_block + map->m_len < + EXT4_EXT_ZERO_LEN) && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + /* case 2 */ + if (map->m_lblk != ee_block) { + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16(map->m_lblk - + ee_block); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + err = ext4_ext_zeroout(inode, &zero_ex); + if (err) + goto out; + } + + split_map.m_lblk = ee_block; + split_map.m_len = map->m_lblk - ee_block + map->m_len; + allocated = map->m_len; } } - /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + depth); - goto out; -insert: - err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); - if (err == -ENOSPC && may_zeroout) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* zero out the first half */ - return allocated; - } else if (err) - goto fix_extent_len; + + allocated = ext4_split_extent(handle, inode, path, + &split_map, split_flag, 0); + if (allocated < 0) + err = allocated; + out: - ext4_ext_show_leaf(inode, path); return err ? err : allocated; - -fix_extent_len: - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_mark_uninitialized(ex); - ext4_ext_dirty(handle, inode, path + depth); - return err; } /* @@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle, struct ext4_ext_path *path, int flags) { - struct ext4_extent *ex, newex, orig_ex; - struct ext4_extent *ex1 = NULL; - struct ext4_extent *ex2 = NULL; - struct ext4_extent *ex3 = NULL; - ext4_lblk_t ee_block, eof_block; - unsigned int allocated, ee_len, depth; - ext4_fsblk_t newblock; - int err = 0; - int may_zeroout; + ext4_lblk_t eof_block; + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len; + int split_flag = 0, depth; ext_debug("ext4_split_unwritten_extents: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle, inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (map->m_lblk - ee_block); - newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); - - ex2 = ex; - orig_ex.ee_block = ex->ee_block; - orig_ex.ee_len = cpu_to_le16(ee_len); - ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); - /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully insde i_size or new_size. */ - may_zeroout = ee_block + ee_len <= eof_block; - - /* - * If the uninitialized extent begins at the same logical - * block where the write begins, and the write completely - * covers the extent, then we don't need to split it. - */ - if ((map->m_lblk == ee_block) && (allocated <= map->m_len)) - return allocated; - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ - if (map->m_lblk > ee_block) { - ex1 = ex; - ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); - ext4_ext_mark_uninitialized(ex1); - ex2 = &newex; - } - /* - * for sanity, update the length of the ex2 extent before - * we insert ex3, if ex1 is NULL. This is to avoid temporary - * overlap of blocks. - */ - if (!ex1 && allocated > map->m_len) - ex2->ee_len = cpu_to_le16(map->m_len); - /* ex3: to ee_block + ee_len : uninitialised */ - if (allocated > map->m_len) { - unsigned int newdepth; - ex3 = &newex; - ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); - ext4_ext_store_pblock(ex3, newblock + map->m_len); - ex3->ee_len = cpu_to_le16(allocated - map->m_len); - ext4_ext_mark_uninitialized(ex3); - err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); - if (err == -ENOSPC && may_zeroout) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* zeroed the full extent */ - /* blocks available from map->m_lblk */ - return allocated; - - } else if (err) - goto fix_extent_len; - /* - * The depth, and hence eh & ex might change - * as part of the insert above. - */ - newdepth = ext_depth(inode); - /* - * update the extent length after successful insert of the - * split extent - */ - ee_len -= ext4_ext_get_actual_len(ex3); - orig_ex.ee_len = cpu_to_le16(ee_len); - may_zeroout = ee_block + ee_len <= eof_block; - - depth = newdepth; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } - ex = path[depth].p_ext; - if (ex2 != &newex) - ex2 = ex; + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= EXT4_EXT_MARK_UNINIT2; - allocated = map->m_len; - } - /* - * If there was a change of depth as part of the - * insertion of ex3 above, we need to update the length - * of the ex1 extent again here - */ - if (ex1 && ex1 != ex) { - ex1 = ex; - ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); - ext4_ext_mark_uninitialized(ex1); - ex2 = &newex; - } - /* - * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written - * using direct I/O, uninitialised still. - */ - ex2->ee_block = cpu_to_le32(map->m_lblk); - ext4_ext_store_pblock(ex2, newblock); - ex2->ee_len = cpu_to_le16(allocated); - ext4_ext_mark_uninitialized(ex2); - if (ex2 != ex) - goto insert; - /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + depth); - ext_debug("out here\n"); - goto out; -insert: - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); - if (err == -ENOSPC && may_zeroout) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* zero out the first half */ - return allocated; - } else if (err) - goto fix_extent_len; -out: - ext4_ext_show_leaf(inode, path); - return err ? err : allocated; - -fix_extent_len: - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_mark_uninitialized(ex); - ext4_ext_dirty(handle, inode, path + depth); - return err; + flags |= EXT4_GET_BLOCKS_PRE_IO; + return ext4_split_extent(handle, inode, path, map, split_flag, flags); } + static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) @@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct ext4_extent_header *eh; int depth; int err = 0; - int ret = 0; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; + ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); + err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; /* first mark the extent as initialized */ ext4_ext_mark_initialized(ex); - /* - * We have to see if it can be merged with the extent - * on the left. - */ - if (ex > EXT_FIRST_EXTENT(eh)) { - /* - * To merge left, pass "ex - 1" to try_to_merge(), - * since it merges towards right _only_. - */ - ret = ext4_ext_try_to_merge(inode, path, ex - 1); - if (ret) { - err = ext4_ext_correct_indexes(handle, inode, path); - if (err) - goto out; - depth = ext_depth(inode); - ex--; - } - } - /* - * Try to Merge towards right. + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed */ - ret = ext4_ext_try_to_merge(inode, path, ex); - if (ret) { - err = ext4_ext_correct_indexes(handle, inode, path); - if (err) - goto out; - depth = ext_depth(inode); - } + ext4_ext_try_to_merge(inode, path, ex); + /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + depth); out: @@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock = 0; int err = 0, depth, ret; unsigned int allocated = 0; + unsigned int punched_out = 0; + unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + struct ext4_map_blocks punch_map; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ - if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { + if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && + ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* @@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - /* Do not put uninitialized extent in the cache */ - if (!ext4_ext_is_uninitialized(ex)) { - ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start); - goto out; + if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { + /* + * Do not put uninitialized extent + * in the cache + */ + if (!ext4_ext_is_uninitialized(ex)) { + ext4_ext_put_in_cache(inode, ee_block, + ee_len, ee_start); + goto out; + } + ret = ext4_ext_handle_uninitialized_extents( + handle, inode, map, path, flags, + allocated, newblock); + return ret; } - ret = ext4_ext_handle_uninitialized_extents(handle, - inode, map, path, flags, allocated, - newblock); - return ret; + + /* + * Punch out the map length, but only to the + * end of the extent + */ + punched_out = allocated < map->m_len ? + allocated : map->m_len; + + /* + * Sense extents need to be converted to + * uninitialized, they must fit in an + * uninitialized extent + */ + if (punched_out > EXT_UNINIT_MAX_LEN) + punched_out = EXT_UNINIT_MAX_LEN; + + punch_map.m_lblk = map->m_lblk; + punch_map.m_pblk = newblock; + punch_map.m_len = punched_out; + punch_map.m_flags = 0; + + /* Check to see if the extent needs to be split */ + if (punch_map.m_len != ee_len || + punch_map.m_lblk != ee_block) { + + ret = ext4_split_extent(handle, inode, + path, &punch_map, 0, + EXT4_GET_BLOCKS_PUNCH_OUT_EXT | + EXT4_GET_BLOCKS_PRE_IO); + + if (ret < 0) { + err = ret; + goto out2; + } + /* + * find extent for the block at + * the start of the hole + */ + ext4_ext_drop_refs(path); + kfree(path); + + path = ext4_ext_find_extent(inode, + map->m_lblk, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out2; + } + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + + } + + ext4_ext_mark_uninitialized(ex); + + err = ext4_ext_remove_space(inode, map->m_lblk, + map->m_lblk + punched_out); + + goto out2; } } @@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, else /* disable in-core preallocation for non-regular files */ ar.flags = 0; + if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) + ar.flags |= EXT4_MB_HINT_NOPREALLOC; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; @@ -3529,7 +3647,11 @@ out2: } trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, newblock, map->m_len, err ? err : allocated); - return err ? err : allocated; + + result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? + punched_out : allocated; + + return err ? err : result; } void ext4_ext_truncate(struct inode *inode) @@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block); + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode) if (IS_SYNC(inode)) ext4_handle_sync(handle); -out_stop: up_write(&EXT4_I(inode)->i_data_sem); + +out_stop: /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. @@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) - return -EOPNOTSUPP; - /* * currently supporting (pre)allocate mode for extent-based * files _only_ @@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + /* Return error if mode is not supported */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return ext4_punch_hole(file, offset, len); + trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* @@ -3691,7 +3817,8 @@ retry: break; } ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | + EXT4_GET_BLOCKS_NO_NORMALIZE); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); @@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, pgoff_t last_offset; pgoff_t offset; pgoff_t index; + pgoff_t start_index = 0; struct page **pages = NULL; struct buffer_head *bh = NULL; struct buffer_head *head = NULL; @@ -3848,39 +3976,57 @@ out: kfree(pages); return EXT_CONTINUE; } + index = 0; +next_page: /* Try to find the 1st mapped buffer. */ - end = ((__u64)pages[0]->index << PAGE_SHIFT) >> + end = ((__u64)pages[index]->index << PAGE_SHIFT) >> blksize_bits; - if (!page_has_buffers(pages[0])) + if (!page_has_buffers(pages[index])) goto out; - head = page_buffers(pages[0]); + head = page_buffers(pages[index]); if (!head) goto out; + index++; bh = head; do { - if (buffer_mapped(bh)) { + if (end >= newex->ec_block + + newex->ec_len) + /* The buffer is out of + * the request range. + */ + goto out; + + if (buffer_mapped(bh) && + end >= newex->ec_block) { + start_index = index - 1; /* get the 1st mapped buffer. */ - if (end > newex->ec_block + - newex->ec_len) - /* The buffer is out of - * the request range. - */ - goto out; goto found_mapped_buffer; } + bh = bh->b_this_page; end++; } while (bh != head); - /* No mapped buffer found. */ - goto out; + /* No mapped buffer in the range found in this page, + * We need to look up next page. + */ + if (index >= ret) { + /* There is no page left, but we need to limit + * newex->ec_len. + */ + newex->ec_len = end - newex->ec_block; + goto out; + } + goto next_page; } else { /*Find contiguous delayed buffers. */ if (ret > 0 && pages[0]->index == last_offset) head = page_buffers(pages[0]); bh = head; + index = 1; + start_index = 0; } found_mapped_buffer: @@ -3903,7 +4049,7 @@ found_mapped_buffer: end++; } while (bh != head); - for (index = 1; index < ret; index++) { + for (; index < ret; index++) { if (!page_has_buffers(pages[index])) { bh = NULL; break; @@ -3913,8 +4059,10 @@ found_mapped_buffer: bh = NULL; break; } + if (pages[index]->index != - pages[0]->index + index) { + pages[start_index]->index + index + - start_index) { /* Blocks are not contiguous. */ bh = NULL; break; @@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? error : 0); } +/* + * ext4_ext_punch_hole + * + * Punches a hole of "length" bytes in a file starting + * at byte "offset" + * + * @inode: The inode of the file to punch a hole in + * @offset: The starting byte offset of the hole + * @length: The length of the hole + * + * Returns the number of blocks removed or negative on err + */ +int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ext4_ext_cache cache_ex; + ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; + struct address_space *mapping = inode->i_mapping; + struct ext4_map_blocks map; + handle_t *handle; + loff_t first_block_offset, last_block_offset, block_len; + loff_t first_page, last_page, first_page_offset, last_page_offset; + int ret, credits, blocks_released, err = 0; + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb); + last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb); + + first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + last_page = (offset + length) >> PAGE_CACHE_SHIFT; + + first_page_offset = first_page << PAGE_CACHE_SHIFT; + last_page_offset = last_page << PAGE_CACHE_SHIFT; + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + err = filemap_write_and_wait_range(mapping, + first_page_offset == 0 ? 0 : first_page_offset-1, + last_page_offset); + + if (err) + return err; + } + + /* Now release the pages */ + if (last_page_offset > first_page_offset) { + truncate_inode_pages_range(mapping, first_page_offset, + last_page_offset-1); + } + + /* finish any pending end_io work */ + ext4_flush_completed_IO(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_orphan_add(handle, inode); + if (err) + goto out; + + /* + * Now we need to zero out the un block aligned data. + * If the file is smaller than a block, just + * zero out the middle + */ + if (first_block > last_block) + ext4_block_zero_page_range(handle, mapping, offset, length); + else { + /* zero out the head of the hole before the first block */ + block_len = first_block_offset - offset; + if (block_len > 0) + ext4_block_zero_page_range(handle, mapping, + offset, block_len); + + /* zero out the tail of the hole after the last block */ + block_len = offset + length - last_block_offset; + if (block_len > 0) { + ext4_block_zero_page_range(handle, mapping, + last_block_offset, block_len); + } + } + + /* If there are no blocks to remove, return now */ + if (first_block >= last_block) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); + + /* + * Loop over all the blocks and identify blocks + * that need to be punched out + */ + iblock = first_block; + blocks_released = 0; + while (iblock < last_block) { + max_blocks = last_block - iblock; + num_blocks = 1; + memset(&map, 0, sizeof(map)); + map.m_lblk = iblock; + map.m_len = max_blocks; + ret = ext4_ext_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + + if (ret > 0) { + blocks_released += ret; + num_blocks = ret; + } else if (ret == 0) { + /* + * If map blocks could not find the block, + * then it is in a hole. If the hole was + * not already cached, then map blocks should + * put it in the cache. So we can get the hole + * out of the cache + */ + memset(&cache_ex, 0, sizeof(cache_ex)); + if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && + !cache_ex.ec_start) { + + /* The hole is cached */ + num_blocks = cache_ex.ec_block + + cache_ex.ec_len - iblock; + + } else { + /* The block could not be identified */ + err = -EIO; + break; + } + } else { + /* Map blocks error */ + err = ret; + break; + } + + if (num_blocks == 0) { + /* This condition should never happen */ + ext_debug("Block lookup failed"); + err = -EIO; + break; + } + + iblock += num_blocks; + } + + if (blocks_released > 0) { + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); + } + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + up_write(&EXT4_I(inode)->i_data_sem); + +out: + ext4_orphan_del(handle, inode); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + return err; +} int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { @@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return error; } - diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7b80d543b89e..2c0972322009 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = { }; const struct inode_operations ext4_file_inode_operations = { - .truncate = ext4_truncate, .setattr = ext4_setattr, .getattr = ext4_getattr, #ifdef CONFIG_EXT4_FS_XATTR diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e9473cbe80df..ce66d2fe826c 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -36,7 +36,7 @@ static void dump_completed_IO(struct inode * inode) { -#ifdef EXT4_DEBUG +#ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; unsigned long flags; @@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync) journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret; tid_t commit_tid; + bool needs_barrier = false; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync) } commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; - if (jbd2_log_start_commit(journal, commit_tid)) { - /* - * When the journal is on a different device than the - * fs data disk, we need to issue the barrier in - * writeback mode. (In ordered mode, the jbd2 layer - * will take care of issuing the barrier. In - * data=journal, all of the data blocks are written to - * the journal device.) - */ - if (ext4_should_writeback_data(inode) && - (journal->j_fs_dev != journal->j_dev) && - (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, - NULL); - ret = jbd2_log_wait_commit(journal, commit_tid); - } else if (journal->j_flags & JBD2_BARRIER) + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + jbd2_log_start_commit(journal, commit_tid); + ret = jbd2_log_wait_commit(journal, commit_tid); + if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); out: trace_ext4_sync_file_exit(inode, ret); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f2fa5e8a582c..50d0e9c64584 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, while (target > 0) { count = target; /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, - goal, &count, err); + current_block = ext4_new_meta_blocks(handle, inode, goal, + 0, &count, err); if (*err) goto failed_out; @@ -1930,7 +1930,7 @@ repeat: * We do still charge estimated metadata to the sb though; * we cannot afford to run out of free blocks. */ - if (ext4_claim_free_blocks(sbi, md_needed + 1)) { + if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { dquot_release_reservation_block(inode, 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); @@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping, continue; } - if (PageWriteback(page)) - wait_on_page_writeback(page); - + wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); if (mpd->next_page != page->index) @@ -3513,7 +3511,7 @@ retry: loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - vmtruncate(inode, isize); + ext4_truncate_failed_write(inode); } } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -3916,9 +3914,30 @@ void ext4_set_aops(struct inode *inode) int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + + return ext4_block_zero_page_range(handle, mapping, from, length); +} + +/* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' + */ +int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, length, pos; + unsigned blocksize, max, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle, return -EINVAL; blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); + max = blocksize - (offset & (blocksize - 1)); + + /* + * correct length if it does not fall between + * 'from' and the end of the block + */ + if (length > max || length < 0) + length = max; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); if (!page_has_buffers(page)) @@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, int ext4_can_truncate(struct inode *inode) { - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return 0; if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) @@ -4392,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode) } /* + * ext4_punch_hole: punches a hole in a file by releaseing the blocks + * associated with the given offset and length + * + * @inode: File inode + * @offset: The offset where the hole will begin + * @len: The length of the hole + * + * Returns: 0 on sucess or negative on failure + */ + +int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file->f_path.dentry->d_inode; + if (!S_ISREG(inode->i_mode)) + return -ENOTSUPP; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + /* TODO: Add support for non extent hole punching */ + return -ENOTSUPP; + } + + return ext4_ext_punch_hole(file, offset, length); +} + +/* * ext4_truncate() * * We block out ext4_get_block() block instantiations across the entire @@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode, /* * Figure out the offset within the block group inode table */ - inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); + inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((inode->i_ino - 1) % EXT4_INODES_PER_GROUP(sb)); block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); @@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE && - (attr->ia_size < inode->i_size || - (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) { + (attr->ia_size < inode->i_size)) { handle_t *handle; handle = ext4_journal_start(inode, 3); @@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) goto err_out; } } - /* ext4_truncate will clear the flag */ - if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) - ext4_truncate(inode); } - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) - rc = vmtruncate(inode, attr->ia_size); + if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, attr->ia_size); + ext4_truncate(inode); + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + ext4_truncate(inode); + } if (!rc) { setattr_copy(inode, attr); @@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out_unlock; } ret = 0; - if (PageMappedToDisk(page)) - goto out_unlock; + + lock_page(page); + wait_on_page_writeback(page); + if (PageMappedToDisk(page)) { + up_read(&inode->i_alloc_sem); + return VM_FAULT_LOCKED; + } if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; else len = PAGE_CACHE_SIZE; - lock_page(page); /* * return if we have all the buffers mapped. This avoid * the need to call write_begin/write_end which does a @@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (page_has_buffers(page)) { if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, ext4_bh_unmapped)) { - unlock_page(page); - goto out_unlock; + up_read(&inode->i_alloc_sem); + return VM_FAULT_LOCKED; } } unlock_page(page); @@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret < 0) goto out_unlock; ret = 0; + + /* + * write_begin/end might have created a dirty page and someone + * could wander in and start the IO. Make sure that hasn't + * happened. + */ + lock_page(page); + wait_on_page_writeback(page); + up_read(&inode->i_alloc_sem); + return VM_FAULT_LOCKED; out_unlock: if (ret) ret = VM_FAULT_SIGBUS; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d8a16eecf1d5..859f2ae8864e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) struct inode *inode; char *data; char *bitmap; + struct ext4_group_info *grinfo; mb_debug(1, "init page %lu\n", page->index); @@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (first_group + i >= ngroups) break; + grinfo = ext4_get_group_info(sb, first_group + i); + /* + * If page is uptodate then we came here after online resize + * which added some new uninitialized group info structs, so + * we must skip all initialized uptodate buddies on the page, + * which may be currently in use by an allocating task. + */ + if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + bh[i] = NULL; + continue; + } + err = -EIO; desc = ext4_get_group_desc(sb, first_group + i, NULL); if (desc == NULL) @@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore) } /* wait for I/O completion */ - for (i = 0; i < groups_per_page && bh[i]; i++) - wait_on_buffer(bh[i]); + for (i = 0; i < groups_per_page; i++) + if (bh[i]) + wait_on_buffer(bh[i]); err = -EIO; - for (i = 0; i < groups_per_page && bh[i]; i++) - if (!buffer_uptodate(bh[i])) + for (i = 0; i < groups_per_page; i++) + if (bh[i] && !buffer_uptodate(bh[i])) goto out; err = 0; first_block = page->index * blocks_per_page; - /* init the page */ - memset(page_address(page), 0xff, PAGE_CACHE_SIZE); for (i = 0; i < blocks_per_page; i++) { int group; - struct ext4_group_info *grinfo; group = (first_block + i) >> 1; if (group >= ngroups) break; + if (!bh[group - first_group]) + /* skip initialized uptodate buddy */ + continue; + /* * data carry information regarding this * particular group in the format specified @@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) * incore got set to the group block bitmap below */ ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group); ext4_unlock_group(sb, group); incore = NULL; @@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) out: if (bh) { - for (i = 0; i < groups_per_page && bh[i]; i++) + for (i = 0; i < groups_per_page; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); @@ -957,22 +974,21 @@ out: } /* - * lock the group_info alloc_sem of all the groups - * belonging to the same buddy cache page. This - * make sure other parallel operation on the buddy - * cache doesn't happen whild holding the buddy cache - * lock + * Lock the buddy and bitmap pages. This make sure other parallel init_group + * on the same buddy page doesn't happen whild holding the buddy page lock. + * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap + * are on the same page e4b->bd_buddy_page is NULL and return value is 0. */ -static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, - ext4_group_t group) +static int ext4_mb_get_buddy_page_lock(struct super_block *sb, + ext4_group_t group, struct ext4_buddy *e4b) { - int i; - int block, pnum; + struct inode *inode = EXT4_SB(sb)->s_buddy_cache; + int block, pnum, poff; int blocks_per_page; - int groups_per_page; - ext4_group_t ngroups = ext4_get_groups_count(sb); - ext4_group_t first_group; - struct ext4_group_info *grp; + struct page *page; + + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; /* @@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, */ block = group * 2; pnum = block / blocks_per_page; - first_group = pnum * blocks_per_page / 2; - - groups_per_page = blocks_per_page >> 1; - if (groups_per_page == 0) - groups_per_page = 1; - /* read all groups the page covers into the cache */ - for (i = 0; i < groups_per_page; i++) { + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -EIO; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); - if ((first_group + i) >= ngroups) - break; - grp = ext4_get_group_info(sb, first_group + i); - /* take all groups write allocation - * semaphore. This make sure there is - * no block allocation going on in any - * of that groups - */ - down_write_nested(&grp->alloc_sem, i); + if (blocks_per_page >= 2) { + /* buddy and bitmap are on the same page */ + return 0; } - return i; + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -EIO; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_buddy_page = page; + return 0; } -static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, - ext4_group_t group, int locked_group) +static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { - int i; - int block, pnum; - int blocks_per_page; - ext4_group_t first_group; - struct ext4_group_info *grp; - - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - first_group = pnum * blocks_per_page / 2; - /* release locks on all the groups */ - for (i = 0; i < locked_group; i++) { - - grp = ext4_get_group_info(sb, first_group + i); - /* take all groups write allocation - * semaphore. This make sure there is - * no block allocation going on in any - * of that groups - */ - up_write(&grp->alloc_sem); + if (e4b->bd_bitmap_page) { + unlock_page(e4b->bd_bitmap_page); + page_cache_release(e4b->bd_bitmap_page); + } + if (e4b->bd_buddy_page) { + unlock_page(e4b->bd_buddy_page); + page_cache_release(e4b->bd_buddy_page); } - } /* @@ -1044,93 +1043,60 @@ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) { - int ret = 0; - void *bitmap; - int blocks_per_page; - int block, pnum, poff; - int num_grp_locked = 0; struct ext4_group_info *this_grp; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - struct page *page = NULL, *bitmap_page = NULL; + struct ext4_buddy e4b; + struct page *page; + int ret = 0; mb_debug(1, "init group %u\n", group); - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; this_grp = ext4_get_group_info(sb, group); /* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that - * would have taken the alloc_sem lock. + * would have pinned buddy page to page cache. */ - num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); - if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ - ret = 0; goto err; } - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page) { - BUG_ON(page->mapping != inode->i_mapping); - ret = ext4_mb_init_cache(page, NULL); - if (ret) { - unlock_page(page); - goto err; - } - unlock_page(page); - } - if (page == NULL || !PageUptodate(page)) { + + page = e4b.bd_bitmap_page; + ret = ext4_mb_init_cache(page, NULL); + if (ret) + goto err; + if (!PageUptodate(page)) { ret = -EIO; goto err; } mark_page_accessed(page); - bitmap_page = page; - bitmap = page_address(page) + (poff * sb->s_blocksize); - /* init buddy cache */ - block++; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page == bitmap_page) { + if (e4b.bd_buddy_page == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force * init the buddy */ - unlock_page(page); - } else if (page) { - BUG_ON(page->mapping != inode->i_mapping); - ret = ext4_mb_init_cache(page, bitmap); - if (ret) { - unlock_page(page); - goto err; - } - unlock_page(page); + ret = 0; + goto err; } - if (page == NULL || !PageUptodate(page)) { + /* init buddy cache */ + page = e4b.bd_buddy_page; + ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + if (ret) + goto err; + if (!PageUptodate(page)) { ret = -EIO; goto err; } mark_page_accessed(page); err: - ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); - if (bitmap_page) - page_cache_release(bitmap_page); - if (page) - page_cache_release(page); + ext4_mb_put_buddy_page_lock(&e4b); return ret; } @@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; - e4b->alloc_semp = &grp->alloc_sem; - - /* Take the read lock on the group alloc - * sem. This would make sure a parallel - * ext4_mb_init_group happening on other - * groups mapped by the page is blocked - * till we are done with allocation - */ -repeat_load_buddy: - down_read(e4b->alloc_semp); if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - /* we need to check for group need init flag - * with alloc_semp held so that we can be sure - * that new blocks didn't get added to the group - * when we are loading the buddy cache - */ - up_read(e4b->alloc_semp); /* * we need full data about the group * to make a good selection @@ -1189,7 +1139,6 @@ repeat_load_buddy: ret = ext4_mb_init_group(sb, group); if (ret) return ret; - goto repeat_load_buddy; } /* @@ -1273,15 +1222,14 @@ repeat_load_buddy: return 0; err: + if (page) + page_cache_release(page); if (e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; - - /* Done with the buddy cache */ - up_read(e4b->alloc_semp); return ret; } @@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); - /* Done with the buddy cache */ - if (e4b->alloc_semp) - up_read(e4b->alloc_semp); } @@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; get_page(ac->ac_buddy_page); - /* on allocation we use ac to track the held semaphore */ - ac->alloc_semp = e4b->alloc_semp; - e4b->alloc_semp = NULL; /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); @@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct super_block *sb = journal->j_private; struct ext4_buddy e4b; struct ext4_group_info *db; - int err, ret, count = 0, count2 = 0; + int err, count = 0, count2 = 0; struct ext4_free_data *entry; struct list_head *l, *ltmp; @@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); - if (test_opt(sb, DISCARD)) { - ret = ext4_issue_discard(sb, entry->group, - entry->start_blk, entry->count); - if (unlikely(ret == -EOPNOTSUPP)) { - ext4_warning(sb, "discard not supported, " - "disabling"); - clear_opt(sb, DISCARD); - } - } + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->group, + entry->start_blk, entry->count); err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) spin_unlock(&pa->pa_lock); } } - if (ac->alloc_semp) - up_read(ac->alloc_semp); if (pa) { /* * We want to add the pa to the right bucket. * Remove it from the list and while adding * make sure the list to which we are adding - * doesn't grow big. We need to release - * alloc_semp before calling ext4_mb_add_n_trim() + * doesn't grow big. */ if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { spin_lock(pa->pa_obj_lock); @@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. */ - while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { + while (ar->len && + ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { + /* let others to free the space */ yield(); ar->len = ar->len >> 1; @@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, return 0; } reserv_blks = ar->len; - while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { - ar->flags |= EXT4_MB_HINT_NOPREALLOC; - ar->len--; + if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { + dquot_alloc_block_nofail(ar->inode, ar->len); + } else { + while (ar->len && + dquot_alloc_block(ar->inode, ar->len)) { + + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; + } } inquota = ar->len; if (ar->len == 0) { @@ -4704,6 +4645,127 @@ error_return: } /** + * ext4_add_groupblocks() -- Add given blocks to an existing group + * @handle: handle to this transaction + * @sb: super block + * @block: start physcial block to add to the block group + * @count: number of blocks to free + * + * This marks the blocks as free in the bitmap and buddy. + */ +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + ext4_group_t block_group; + ext4_grpblk_t bit; + unsigned int i; + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_buddy e4b; + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + struct ext4_group_info *grp; + + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + grp = ext4_get_group_info(sb, block_group); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) + goto error_return; + + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + desc = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!desc) + goto error_return; + + if (in_range(ext4_block_bitmap(sb, desc), block, count) || + in_range(ext4_inode_bitmap(sb, desc), block, count) || + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, desc), + sbi->s_itb_per_group)) { + ext4_error(sb, "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + for (i = 0, blocks_freed = 0; i < count; i++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { + ext4_error(sb, "bit already cleared for block %llu", + (ext4_fsblk_t)(block + i)); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + blocks_freed++; + } + } + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + /* + * need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_free_blocks(NULL, &e4b, bit, count); + blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); + ext4_free_blks_set(sb, desc, blk_free_count); + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic_add(blocks_freed, + &sbi->s_flex_groups[flex_group].free_blocks); + } + + ext4_mb_unload_buddy(&e4b); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return; +} + +/** * ext4_trim_extent -- function to TRIM one single free extent in the group * @sb: super block for the file system * @start: starting block of the free extent in the alloc. group @@ -4715,11 +4777,10 @@ error_return: * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ -static int ext4_trim_extent(struct super_block *sb, int start, int count, - ext4_group_t group, struct ext4_buddy *e4b) +static void ext4_trim_extent(struct super_block *sb, int start, int count, + ext4_group_t group, struct ext4_buddy *e4b) { struct ext4_free_extent ex; - int ret = 0; assert_spin_locked(ext4_group_lock_ptr(sb, group)); @@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - - ret = ext4_issue_discard(sb, group, start, count); - + ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); - return ret; } /** @@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, * the group buddy bitmap. This is done until whole group is scanned. */ static ext4_grpblk_t -ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, - ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) +ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + ext4_grpblk_t start, ext4_grpblk_t max, + ext4_grpblk_t minblocks) { void *bitmap; ext4_grpblk_t next, count = 0; - ext4_group_t group; - int ret = 0; + struct ext4_buddy e4b; + int ret; - BUG_ON(e4b == NULL); + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_error(sb, "Error in loading buddy " + "information for %u", group); + return ret; + } + bitmap = e4b.bd_bitmap; - bitmap = e4b->bd_bitmap; - group = e4b->bd_group; - start = (e4b->bd_info->bb_first_free > start) ? - e4b->bd_info->bb_first_free : start; ext4_lock_group(sb, group); + start = (e4b.bd_info->bb_first_free > start) ? + e4b.bd_info->bb_first_free : start; while (start < max) { start = mb_find_next_zero_bit(bitmap, max, start); @@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, next = mb_find_next_bit(bitmap, max, start); if ((next - start) >= minblocks) { - ret = ext4_trim_extent(sb, start, - next - start, group, e4b); - if (ret < 0) - break; + ext4_trim_extent(sb, start, + next - start, group, &e4b); count += next - start; } start = next + 1; @@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, ext4_lock_group(sb, group); } - if ((e4b->bd_info->bb_free - count) < minblocks) + if ((e4b.bd_info->bb_free - count) < minblocks) break; } ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", count, group); - if (ret < 0) - count = ret; - return count; } @@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { - struct ext4_buddy e4b; + struct ext4_group_info *grp; ext4_group_t first_group, last_group; ext4_group_t group, ngroups = ext4_get_groups_count(sb); ext4_grpblk_t cnt = 0, first_block, last_block; - uint64_t start, len, minlen, trimmed; + uint64_t start, len, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); int ret = 0; @@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) start = range->start >> sb->s_blocksize_bits; len = range->len >> sb->s_blocksize_bits; minlen = range->minlen >> sb->s_blocksize_bits; - trimmed = 0; if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; @@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) return -EINVAL; for (group = first_group; group <= last_group; group++) { - ret = ext4_mb_load_buddy(sb, group, &e4b); - if (ret) { - ext4_error(sb, "Error in loading buddy " - "information for %u", group); - break; + grp = ext4_get_group_info(sb, group); + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(sb, group); + if (ret) + break; } /* @@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) last_block = first_block + len; len -= last_block - first_block; - if (e4b.bd_info->bb_free >= minlen) { - cnt = ext4_trim_all_free(sb, &e4b, first_block, + if (grp->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, group, first_block, last_block, minlen); if (cnt < 0) { ret = cnt; - ext4_mb_unload_buddy(&e4b); break; } } - ext4_mb_unload_buddy(&e4b); trimmed += cnt; first_block = 0; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 22bd4d7f289b..20b5e7bfebd1 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -193,11 +193,6 @@ struct ext4_allocation_context { __u8 ac_op; /* operation, for history only */ struct page *ac_bitmap_page; struct page *ac_buddy_page; - /* - * pointer to the held semaphore upon successful - * block allocation - */ - struct rw_semaphore *alloc_semp; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; @@ -215,7 +210,6 @@ struct ext4_buddy { struct super_block *bd_sb; __u16 bd_blkbits; ext4_group_t bd_group; - struct rw_semaphore *alloc_semp; }; #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 92816b4e0f16..b57b98fb44d1 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * We have the extent map build with the tmp inode. * Now copy the i_data across */ - ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); /* diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c new file mode 100644 index 000000000000..9bdef3f537c5 --- /dev/null +++ b/fs/ext4/mmp.c @@ -0,0 +1,351 @@ +#include <linux/fs.h> +#include <linux/random.h> +#include <linux/buffer_head.h> +#include <linux/utsname.h> +#include <linux/kthread.h> + +#include "ext4.h" + +/* + * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * faster. + */ +static int write_mmp_block(struct buffer_head *bh) +{ + mark_buffer_dirty(bh); + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(WRITE_SYNC, bh); + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + return 1; + + return 0; +} + +/* + * Read the MMP block. It _must_ be read from disk and hence we clear the + * uptodate flag on the buffer. + */ +static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, + ext4_fsblk_t mmp_block) +{ + struct mmp_struct *mmp; + + if (*bh) + clear_buffer_uptodate(*bh); + + /* This would be sb_bread(sb, mmp_block), except we need to be sure + * that the MD RAID device cache has been bypassed, and that the read + * is not blocked in the elevator. */ + if (!*bh) + *bh = sb_getblk(sb, mmp_block); + if (*bh) { + get_bh(*bh); + lock_buffer(*bh); + (*bh)->b_end_io = end_buffer_read_sync; + submit_bh(READ_SYNC, *bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + brelse(*bh); + *bh = NULL; + } + } + if (!*bh) { + ext4_warning(sb, "Error while reading MMP block %llu", + mmp_block); + return -EIO; + } + + mmp = (struct mmp_struct *)((*bh)->b_data); + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + return -EINVAL; + + return 0; +} + +/* + * Dump as much information as possible to help the admin. + */ +void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, + const char *function, unsigned int line, const char *msg) +{ + __ext4_warning(sb, function, line, msg); + __ext4_warning(sb, function, line, + "MMP failure info: last update time: %llu, last update " + "node: %s, last update device: %s\n", + (long long unsigned int) le64_to_cpu(mmp->mmp_time), + mmp->mmp_nodename, mmp->mmp_bdevname); +} + +/* + * kmmpd will update the MMP sequence every s_mmp_update_interval seconds + */ +static int kmmpd(void *data) +{ + struct super_block *sb = ((struct mmpd_data *) data)->sb; + struct buffer_head *bh = ((struct mmpd_data *) data)->bh; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct mmp_struct *mmp; + ext4_fsblk_t mmp_block; + u32 seq = 0; + unsigned long failed_writes = 0; + int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned mmp_check_interval; + unsigned long last_update_time; + unsigned long diff; + int retval; + + mmp_block = le64_to_cpu(es->s_mmp_block); + mmp = (struct mmp_struct *)(bh->b_data); + mmp->mmp_time = cpu_to_le64(get_seconds()); + /* + * Start with the higher mmp_check_interval and reduce it if + * the MMP block is being updated on time. + */ + mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + bdevname(bh->b_bdev, mmp->mmp_bdevname); + + memcpy(mmp->mmp_nodename, init_utsname()->sysname, + sizeof(mmp->mmp_nodename)); + + while (!kthread_should_stop()) { + if (++seq > EXT4_MMP_SEQ_MAX) + seq = 1; + + mmp->mmp_seq = cpu_to_le32(seq); + mmp->mmp_time = cpu_to_le64(get_seconds()); + last_update_time = jiffies; + + retval = write_mmp_block(bh); + /* + * Don't spew too many error messages. Print one every + * (s_mmp_update_interval * 60) seconds. + */ + if (retval && (failed_writes % 60) == 0) { + ext4_error(sb, "Error writing to MMP block"); + failed_writes++; + } + + if (!(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_MMP)) { + ext4_warning(sb, "kmmpd being stopped since MMP feature" + " has been disabled."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + if (sb->s_flags & MS_RDONLY) { + ext4_warning(sb, "kmmpd being stopped since filesystem " + "has been remounted as readonly."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + diff = jiffies - last_update_time; + if (diff < mmp_update_interval * HZ) + schedule_timeout_interruptible(mmp_update_interval * + HZ - diff); + + /* + * We need to make sure that more than mmp_check_interval + * seconds have not passed since writing. If that has happened + * we need to check if the MMP block is as we left it. + */ + diff = jiffies - last_update_time; + if (diff > mmp_check_interval * HZ) { + struct buffer_head *bh_check = NULL; + struct mmp_struct *mmp_check; + + retval = read_mmp_block(sb, &bh_check, mmp_block); + if (retval) { + ext4_error(sb, "error reading MMP data: %d", + retval); + + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + mmp_check = (struct mmp_struct *)(bh_check->b_data); + if (mmp->mmp_seq != mmp_check->mmp_seq || + memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, + sizeof(mmp->mmp_nodename))) { + dump_mmp_msg(sb, mmp_check, + "Error while updating MMP info. " + "The filesystem seems to have been" + " multiply mounted."); + ext4_error(sb, "abort"); + goto failed; + } + put_bh(bh_check); + } + + /* + * Adjust the mmp_check_interval depending on how much time + * it took for the MMP block to be written. + */ + mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, + EXT4_MMP_MAX_CHECK_INTERVAL), + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + } + + /* + * Unmount seems to be clean. + */ + mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); + mmp->mmp_time = cpu_to_le64(get_seconds()); + + retval = write_mmp_block(bh); + +failed: + kfree(data); + brelse(bh); + return retval; +} + +/* + * Get a random new sequence number but make sure it is not greater than + * EXT4_MMP_SEQ_MAX. + */ +static unsigned int mmp_new_seq(void) +{ + u32 new_seq; + + do { + get_random_bytes(&new_seq, sizeof(u32)); + } while (new_seq > EXT4_MMP_SEQ_MAX); + + return new_seq; +} + +/* + * Protect the filesystem from being mounted more than once. + */ +int ext4_multi_mount_protect(struct super_block *sb, + ext4_fsblk_t mmp_block) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = NULL; + struct mmp_struct *mmp = NULL; + struct mmpd_data *mmpd_data; + u32 seq; + unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned int wait_time = 0; + int retval; + + if (mmp_block < le32_to_cpu(es->s_first_data_block) || + mmp_block >= ext4_blocks_count(es)) { + ext4_warning(sb, "Invalid MMP block in superblock"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + + mmp = (struct mmp_struct *)(bh->b_data); + + if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) + mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; + + /* + * If check_interval in MMP block is larger, use that instead of + * update_interval from the superblock. + */ + if (mmp->mmp_check_interval > mmp_check_interval) + mmp_check_interval = mmp->mmp_check_interval; + + seq = le32_to_cpu(mmp->mmp_seq); + if (seq == EXT4_MMP_SEQ_CLEAN) + goto skip; + + if (seq == EXT4_MMP_SEQ_FSCK) { + dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); + goto failed; + } + + wait_time = min(mmp_check_interval * 2 + 1, + mmp_check_interval + 60); + + /* Print MMP interval if more than 20 secs. */ + if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) + ext4_warning(sb, "MMP interval %u higher than expected, please" + " wait.\n", wait_time * 2); + + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + +skip: + /* + * write a new random sequence number. + */ + mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); + + retval = write_mmp_block(bh); + if (retval) + goto failed; + + /* + * wait for MMP interval and check mmp_seq. + */ + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + + mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); + if (!mmpd_data) { + ext4_warning(sb, "not enough memory for mmpd_data"); + goto failed; + } + mmpd_data->sb = sb; + mmpd_data->bh = bh; + + /* + * Start a kernel thread to update the MMP block periodically. + */ + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", + bdevname(bh->b_bdev, + mmp->mmp_bdevname)); + if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { + EXT4_SB(sb)->s_mmp_tsk = NULL; + kfree(mmpd_data); + ext4_warning(sb, "Unable to create kmmpd thread for %s.", + sb->s_id); + goto failed; + } + + return 0; + +failed: + brelse(bh); + return 1; +} + + diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index b9f3e7862f13..2b8304bf3c50 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, * It needs to call wait_on_page_writeback() to wait for the * writeback of the page. */ - if (PageWriteback(page)) - wait_on_page_writeback(page); + wait_on_page_writeback(page); /* Release old bh and drop refs */ try_to_release_page(page, 0); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 67fd0b025858..b754b7721f51 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->at = entries; frame->bh = bh; bh = bh2; + + ext4_handle_dirty_metadata(handle, dir, frame->bh); + ext4_handle_dirty_metadata(handle, dir, bh); + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - dx_release (frames); - if (!(de)) + if (!de) { + /* + * Even if the block split failed, we have to properly write + * out all the changes we did so far. Otherwise we can end up + * with corrupted filesystem. + */ + ext4_mark_inode_dirty(handle, dir); + dx_release(frames); return retval; + } + dx_release(frames); retval = add_dirent_to_buf(handle, dentry, inode, de, bh); brelse(bh); @@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir, handle_t *handle; struct inode *inode; int l, err, retries = 0; + int credits; l = strlen(symname)+1; if (l > dir->i_sb->s_blocksize) @@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir, dquot_initialize(dir); + if (l > EXT4_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks. + */ + credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + handle = ext4_journal_start(dir, credits); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2263,21 +2292,44 @@ retry: if (IS_ERR(inode)) goto out_stop; - if (l > sizeof(EXT4_I(inode)->i_data)) { + if (l > EXT4_N_BLOCKS * 4) { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); /* - * page_symlink() calls into ext4_prepare/commit_write. - * We have a transaction open. All is sweetness. It also sets - * i_size in generic_commit_write(). + * We cannot call page_symlink() with transaction started + * because it calls into ext4_write_begin() which can wait + * for transaction commit if we are running out of space + * and thus we deadlock. So we have to stop transaction now + * and restart it when symlink contents is written. + * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. */ + drop_nlink(inode); + err = ext4_orphan_add(handle, inode); + ext4_journal_stop(handle); + if (err) + goto err_drop_inode; err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS + * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified + */ + handle = ext4_journal_start(dir, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + inc_nlink(inode); + err = ext4_orphan_del(handle, inode); if (err) { + ext4_journal_stop(handle); clear_nlink(inode); - unlock_new_inode(inode); - ext4_mark_inode_dirty(handle, inode); - iput(inode); - goto out_stop; + goto err_drop_inode; } } else { /* clear the extent format for fast symlink */ @@ -2293,6 +2345,10 @@ out_stop: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; +err_drop_inode: + unlock_new_inode(inode); + iput(inode); + return err; } static int ext4_link(struct dentry *old_dentry, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index b6dbd056fcb1..7bb8f76d470a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error) for (i = 0; i < io_end->num_io_pages; i++) { struct page *page = io_end->pages[i]->p_page; struct buffer_head *bh, *head; - int partial_write = 0; + loff_t offset; + loff_t io_end_offset; - head = page_buffers(page); - if (error) + if (error) { SetPageError(page); - BUG_ON(!head); - if (head->b_size != PAGE_CACHE_SIZE) { - loff_t offset; - loff_t io_end_offset = io_end->offset + io_end->size; + set_bit(AS_EIO, &page->mapping->flags); + head = page_buffers(page); + BUG_ON(!head); + + io_end_offset = io_end->offset + io_end->size; offset = (sector_t) page->index << PAGE_CACHE_SHIFT; bh = head; do { if ((offset >= io_end->offset) && - (offset+bh->b_size <= io_end_offset)) { - if (error) - buffer_io_error(bh); - - } - if (buffer_delay(bh)) - partial_write = 1; - else if (!buffer_mapped(bh)) - clear_buffer_dirty(bh); - else if (buffer_dirty(bh)) - partial_write = 1; + (offset+bh->b_size <= io_end_offset)) + buffer_io_error(bh); + offset += bh->b_size; bh = bh->b_this_page; } while (bh != head); } - /* - * If this is a partial write which happened to make - * all buffers uptodate then we can optimize away a - * bogus readpage() for the next read(). Here we - * 'discover' whether the page went uptodate as a - * result of this (potentially partial) write. - */ - if (!partial_write) - SetPageUptodate(page); - put_io_page(io_end->pages[i]); } io_end->num_io_pages = 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8553dfb310af..d9937df7f5cf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -75,11 +75,27 @@ static void ext4_write_super(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); +static inline int ext2_feature_set_ok(struct super_block *sb); +static inline int ext3_feature_set_ok(struct super_block *sb); static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) +#else +#define IS_EXT2_SB(sb) (0) +#endif + + #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, @@ -806,6 +822,8 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the @@ -1096,7 +1114,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (!test_opt(sb, INIT_INODE_TABLE)) seq_puts(seq, ",noinit_inode_table"); - else if (sbi->s_li_wait_mult) + else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) seq_printf(seq, ",init_inode_table=%u", (unsigned) sbi->s_li_wait_mult); @@ -1187,9 +1205,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext4_quota_operations = { -#ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, -#endif .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, @@ -1900,7 +1916,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); - else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ext4_msg(sb, KERN_WARNING, @@ -2425,6 +2441,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, EXT4_SB(sb)->s_sectors_written_start) >> 1))); } +static ssize_t extent_cache_hits_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits); +} + +static ssize_t extent_cache_misses_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses); +} + static ssize_t inode_readahead_blks_store(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) @@ -2482,6 +2510,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_RO_ATTR(extent_cache_hits); +EXT4_RO_ATTR(extent_cache_misses); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); @@ -2497,6 +2527,8 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(extent_cache_hits), + ATTR_LIST(extent_cache_misses), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), @@ -2659,12 +2691,6 @@ static void print_daily_error_info(unsigned long arg) mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } -static void ext4_lazyinode_timeout(unsigned long data) -{ - struct task_struct *p = (struct task_struct *)data; - wake_up_process(p); -} - /* Find next suitable group and run ext4_init_inode_table */ static int ext4_run_li_request(struct ext4_li_request *elr) { @@ -2696,11 +2722,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); if (elr->lr_timeout == 0) { - timeout = jiffies - timeout; - if (elr->lr_sbi->s_li_wait_mult) - timeout *= elr->lr_sbi->s_li_wait_mult; - else - timeout *= 20; + timeout = (jiffies - timeout) * + elr->lr_sbi->s_li_wait_mult; elr->lr_timeout = timeout; } elr->lr_next_sched = jiffies + elr->lr_timeout; @@ -2712,7 +2735,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) /* * Remove lr_request from the list_request and free the - * request tructure. Should be called with li_list_mtx held + * request structure. Should be called with li_list_mtx held */ static void ext4_remove_li_request(struct ext4_li_request *elr) { @@ -2730,14 +2753,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr) static void ext4_unregister_li_request(struct super_block *sb) { - struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request; - - if (!ext4_li_info) + mutex_lock(&ext4_li_mtx); + if (!ext4_li_info) { + mutex_unlock(&ext4_li_mtx); return; + } mutex_lock(&ext4_li_info->li_list_mtx); - ext4_remove_li_request(elr); + ext4_remove_li_request(EXT4_SB(sb)->s_li_request); mutex_unlock(&ext4_li_info->li_list_mtx); + mutex_unlock(&ext4_li_mtx); } static struct task_struct *ext4_lazyinit_task; @@ -2756,17 +2781,10 @@ static int ext4_lazyinit_thread(void *arg) struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; struct list_head *pos, *n; struct ext4_li_request *elr; - unsigned long next_wakeup; - DEFINE_WAIT(wait); + unsigned long next_wakeup, cur; BUG_ON(NULL == eli); - eli->li_timer.data = (unsigned long)current; - eli->li_timer.function = ext4_lazyinode_timeout; - - eli->li_task = current; - wake_up(&eli->li_wait_task); - cont_thread: while (true) { next_wakeup = MAX_JIFFY_OFFSET; @@ -2797,19 +2815,15 @@ cont_thread: if (freezing(current)) refrigerator(); - if ((time_after_eq(jiffies, next_wakeup)) || + cur = jiffies; + if ((time_after_eq(cur, next_wakeup)) || (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } - eli->li_timer.expires = next_wakeup; - add_timer(&eli->li_timer); - prepare_to_wait(&eli->li_wait_daemon, &wait, - TASK_INTERRUPTIBLE); - if (time_before(jiffies, next_wakeup)) - schedule(); - finish_wait(&eli->li_wait_daemon, &wait); + schedule_timeout_interruptible(next_wakeup - cur); + if (kthread_should_stop()) { ext4_clear_request_list(); goto exit_thread; @@ -2833,12 +2847,7 @@ exit_thread: goto cont_thread; } mutex_unlock(&eli->li_list_mtx); - del_timer_sync(&ext4_li_info->li_timer); - eli->li_task = NULL; - wake_up(&eli->li_wait_task); - kfree(ext4_li_info); - ext4_lazyinit_task = NULL; ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); @@ -2866,7 +2875,6 @@ static int ext4_run_lazyinit_thread(void) if (IS_ERR(ext4_lazyinit_task)) { int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); - del_timer_sync(&ext4_li_info->li_timer); kfree(ext4_li_info); ext4_li_info = NULL; printk(KERN_CRIT "EXT4: error %d creating inode table " @@ -2875,8 +2883,6 @@ static int ext4_run_lazyinit_thread(void) return err; } ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; - - wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL); return 0; } @@ -2911,13 +2917,9 @@ static int ext4_li_info_new(void) if (!eli) return -ENOMEM; - eli->li_task = NULL; INIT_LIST_HEAD(&eli->li_request_list); mutex_init(&eli->li_list_mtx); - init_waitqueue_head(&eli->li_wait_daemon); - init_waitqueue_head(&eli->li_wait_task); - init_timer(&eli->li_timer); eli->li_state |= EXT4_LAZYINIT_QUIT; ext4_li_info = eli; @@ -2960,20 +2962,19 @@ static int ext4_register_li_request(struct super_block *sb, ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; int ret = 0; - if (sbi->s_li_request != NULL) + if (sbi->s_li_request != NULL) { + /* + * Reset timeout so it can be computed again, because + * s_li_wait_mult might have changed. + */ + sbi->s_li_request->lr_timeout = 0; return 0; + } if (first_not_zeroed == ngroups || (sb->s_flags & MS_RDONLY) || - !test_opt(sb, INIT_INODE_TABLE)) { - sbi->s_li_request = NULL; + !test_opt(sb, INIT_INODE_TABLE)) return 0; - } - - if (first_not_zeroed == ngroups) { - sbi->s_li_request = NULL; - return 0; - } elr = ext4_li_request_new(sb, first_not_zeroed); if (!elr) @@ -3166,6 +3167,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); + /* + * set default s_li_wait_mult for lazyinit, for the case there is + * no mount option specified. + */ + sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, &journal_devnum, &journal_ioprio, NULL, 0)) { ext4_msg(sb, KERN_WARNING, @@ -3187,6 +3194,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "feature flags set on rev 0 fs, " "running e2fsck is recommended"); + if (IS_EXT2_SB(sb)) { + if (ext2_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext2 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + + if (IS_EXT3_SB(sb)) { + if (ext3_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext3 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, @@ -3459,6 +3488,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && + !(sb->s_flags & MS_RDONLY)) + if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + goto failed_mount3; + /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! @@ -3474,7 +3508,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } else { clear_opt(sb, DATA_FLAGS); - set_opt(sb, WRITEBACK_DATA); sbi->s_journal = NULL; needs_recovery = 0; goto no_journal; @@ -3707,6 +3740,8 @@ failed_mount3: percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -4242,7 +4277,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int enable_quota = 0; ext4_group_t g; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; - int err; + int err = 0; #ifdef CONFIG_QUOTA int i; #endif @@ -4368,6 +4403,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_MMP)) + if (ext4_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block))) { + err = -EROFS; + goto restore_opts; + } enable_quota = 1; } } @@ -4432,6 +4474,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; u64 fsid; + s64 bfree; if (test_opt(sb, MINIX_DF)) { sbi->s_overhead_last = 0; @@ -4475,8 +4518,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; - buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - + bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); + /* prevent underflow in case that few free space is available */ + buf->f_bfree = max_t(s64, bfree, 0); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; @@ -4652,6 +4697,9 @@ static int ext4_quota_off(struct super_block *sb, int type) if (test_opt(sb, DELALLOC)) sync_filesystem(sb); + if (!inode) + goto out; + /* Update modification times of quota files when userspace can * start looking at them */ handle = ext4_journal_start(inode, 1); @@ -4772,14 +4820,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, } #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) -static struct file_system_type ext2_fs_type = { - .owner = THIS_MODULE, - .name = "ext2", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; - static inline void register_as_ext2(void) { int err = register_filesystem(&ext2_fs_type); @@ -4792,10 +4832,22 @@ static inline void unregister_as_ext2(void) { unregister_filesystem(&ext2_fs_type); } + +static inline int ext2_feature_set_ok(struct super_block *sb) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} MODULE_ALIAS("ext2"); #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } +static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } #endif #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) @@ -4811,10 +4863,24 @@ static inline void unregister_as_ext3(void) { unregister_filesystem(&ext3_fs_type); } + +static inline int ext3_feature_set_ok(struct super_block *sb) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) + return 0; + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} MODULE_ALIAS("ext3"); #else static inline void register_as_ext3(void) { } static inline void unregister_as_ext3(void) { } +static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } #endif static struct file_system_type ext4_fs_type = { @@ -4898,8 +4964,8 @@ static int __init ext4_init_fs(void) err = init_inodecache(); if (err) goto out1; - register_as_ext2(); register_as_ext3(); + register_as_ext2(); err = register_filesystem(&ext4_fs_type); if (err) goto out; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b545ca1c459c..c757adc97250 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -820,8 +820,8 @@ inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - block = ext4_new_meta_blocks(handle, inode, - goal, NULL, &error); + block = ext4_new_meta_blocks(handle, inode, goal, 0, + NULL, &error); if (error) goto cleanup; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 29148a81c783..7f21cf3aaf92 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal, ret = err; spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); - commit_transaction->t_flushed_data_blocks = 1; clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); smp_mb__after_clear_bit(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); @@ -672,12 +671,16 @@ start_journal_io: err = 0; } + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_DFLUSH; + write_unlock(&journal->j_state_lock); /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue * the commit record */ - if (commit_transaction->t_flushed_data_blocks && + if (commit_transaction->t_need_data_flush && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); @@ -754,8 +757,13 @@ wait_for_iobuf: required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* Wake up any transactions which were waiting for this - IO to complete */ + /* + * Wake up any transactions which were waiting for this IO to + * complete. The barrier must be here so that changes by + * jbd2_journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); @@ -794,6 +802,10 @@ wait_for_iobuf: jbd2_journal_abort(journal, err); jbd_debug(3, "JBD: commit phase 5\n"); + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); + commit_transaction->t_state = T_COMMIT_JFLUSH; + write_unlock(&journal->j_state_lock); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -949,7 +961,7 @@ restart_loop: jbd_debug(3, "JBD: commit phase 7\n"); - J_ASSERT(commit_transaction->t_state == T_COMMIT); + J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); commit_transaction->t_start = jiffies; stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e0ec3db1c395..9a7826990304 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal) int __jbd2_log_start_commit(journal_t *journal, tid_t target) { /* - * Are we already doing a recent enough commit? + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. */ - if (!tid_geq(journal->j_commit_request, target)) { + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { /* * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. @@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) journal->j_commit_sequence); wake_up(&journal->j_wait_commit); return 1; - } + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, + journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); return 0; } @@ -577,6 +588,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) } /* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. + */ +int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JBD2_BARRIER)) + return 0; + read_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + commit_trans = journal->j_committing_transaction; + if (!commit_trans || commit_trans->t_tid != tid) { + ret = 1; + goto out; + } + /* + * Transaction is being committed and we already proceeded to + * submitting a flush to fs partition? + */ + if (journal->j_fs_dev != journal->j_dev) { + if (!commit_trans->t_need_data_flush || + commit_trans->t_state >= T_COMMIT_DFLUSH) + goto out; + } else { + if (commit_trans->t_state >= T_COMMIT_JFLUSH) + goto out; + } + ret = 1; +out: + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier); + +/* * Wait for a specified commit to complete. * The caller may not hold the journal lock. */ diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 05fa77a23711..3eec82d32fd4 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) */ /* - * Update transiaction's maximum wait time, if debugging is enabled. + * Update transaction's maximum wait time, if debugging is enabled. * * In order for t_max_wait to be reliable, it must be protected by a * lock. But doing so will mean that start_this_handle() can not be @@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) * means that maximum wait time reported by the jbd2_run_stats * tracepoint will always be zero. */ -static inline void update_t_max_wait(transaction_t *transaction) +static inline void update_t_max_wait(transaction_t *transaction, + unsigned long ts) { #ifdef CONFIG_JBD2_DEBUG - unsigned long ts = jiffies; - if (jbd2_journal_enable_debug && time_after(transaction->t_start, ts)) { ts = jbd2_time_diff(ts, transaction->t_start); @@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle, tid_t tid; int needed, need_to_start; int nblocks = handle->h_buffer_credits; + unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", @@ -271,7 +271,7 @@ repeat: /* OK, account for the buffers that this operation expects to * use and add the handle to the running transaction. */ - update_t_max_wait(transaction); + update_t_max_wait(transaction, ts); handle->h_transaction = transaction; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); @@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks) * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. * - * Return a pointer to a newly allocated handle, or NULL on failure + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. */ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) { @@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); jbd2_journal_cancel_revoke(handle, jh); - jbd2_journal_put_journal_head(jh); out: + jbd2_journal_put_journal_head(jh); return err; } @@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) jinode->i_next_transaction == transaction) goto done; + /* + * We only ever set this variable to 1 so the test is safe. Since + * t_need_data_flush is likely to be set, we do the test to save some + * cacheline bouncing + */ + if (!transaction->t_need_data_flush) + transaction->t_need_data_flush = 1; /* On some different transaction's list - should be * the committing one */ if (jinode->i_transaction) { diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a32dcaec04e1..4ecb7b16b278 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -529,9 +529,10 @@ struct transaction_s enum { T_RUNNING, T_LOCKED, - T_RUNDOWN, T_FLUSH, T_COMMIT, + T_COMMIT_DFLUSH, + T_COMMIT_JFLUSH, T_FINISHED } t_state; @@ -658,7 +659,9 @@ struct transaction_s * waiting for it to finish. */ unsigned int t_synchronous_commit:1; - unsigned int t_flushed_data_blocks:1; + + /* Disk flush needs to be sent to fs partition [no locking] */ + int t_need_data_flush; /* * For use by the filesystem to store fs-specific data @@ -1228,6 +1231,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_journal_force_commit_nested(journal_t *journal); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); +int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); |