From 6662a8d03104516c53170257b837d8a03e86db39 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:25:49 -0500 Subject: ext4: fix potential buffer head leak when add_dirent_to_buf() returns ENOSPC (cherry picked from commit 2de770a406b06dfc619faabbf5d85c835ed3f2e1) Previously add_dirent_to_buf() did not free its passed-in buffer head in the case of ENOSPC, since in some cases the caller still needed it. However, this led to potential buffer head leaks since not all callers dealt with this correctly. Fix this by making simplifying the freeing convention; now add_dirent_to_buf() *never* frees the passed-in buffer head, and leaves that to the responsibility of its caller. This makes things cleaner and easier to prove that the code is neither leaking buffer heads or calling brelse() one time too many. Signed-off-by: "Theodore Ts'o" Cc: Curt Wohlgemuth Signed-off-by: Greg Kroah-Hartman --- fs/ext4/namei.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6d2c1b897fc7..fde08c919d12 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1292,9 +1292,6 @@ errout: * add_dirent_to_buf will attempt search the directory block for * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. - * - * NOTE! bh is NOT released in the case where ENOSPC is returned. In - * all other cases bh is released. */ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext4_dir_entry_2 *de, @@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { if (!ext4_check_dir_entry("ext4_add_entry", dir, de, - bh, offset)) { - brelse(bh); + bh, offset)) return -EIO; - } - if (ext4_match(namelen, name, de)) { - brelse(bh); + if (ext4_match(namelen, name, de)) return -EEXIST; - } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); if ((de->inode? rlen - nlen: rlen) >= reclen) @@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, err = ext4_journal_get_write_access(handle, bh); if (err) { ext4_std_error(dir->i_sb, err); - brelse(bh); return err; } @@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, err = ext4_handle_dirty_metadata(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); - brelse(bh); return 0; } @@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, if (!(de)) return retval; - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; } /* @@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if(!bh) return retval; retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (retval != -ENOSPC) + if (retval != -ENOSPC) { + brelse(bh); return retval; + } if (blocks == 1 && !dx_fallback && EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) @@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; } /* @@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto journal_error; err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (err != -ENOSPC) { - bh = NULL; + if (err != -ENOSPC) goto cleanup; - } /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", @@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!de) goto cleanup; err = add_dirent_to_buf(handle, dentry, inode, de, bh); - bh = NULL; goto cleanup; journal_error: -- cgit v1.2.3 From 8ed33ff5203300ac8878042bc4d4954e2f40c488 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:24:46 -0500 Subject: ext4: avoid divide by zero when trying to mount a corrupted file system (cherry picked from commit 503358ae01b70ce6909d19dd01287093f6b6271c) If s_log_groups_per_flex is greater than 31, then groups_per_flex will will overflow and cause a divide by zero error. This can cause kernel BUG if such a file system is mounted. Thanks to Nageswara R Sastry for analyzing the failure and providing an initial patch. http://bugzilla.kernel.org/show_bug.cgi?id=14287 Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d4ca92aab514..8662b2e6e9f9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1673,14 +1673,14 @@ static int ext4_fill_flex_info(struct super_block *sb) size_t size; int i; - if (!sbi->s_es->s_log_groups_per_flex) { + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + if (groups_per_flex < 2) { sbi->s_log_groups_per_flex = 0; return 1; } - sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << -- cgit v1.2.3 From a4a87a7f39020ea18f220c9e9c8ab112ebd6764a Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Mon, 23 Nov 2009 07:25:48 -0500 Subject: ext4: fix the returned block count if EXT4_IOC_MOVE_EXT fails (cherry picked from commit f868a48d06f8886cb0367568a12367fa4f21ea0d) If the EXT4_IOC_MOVE_EXT ioctl fails, the number of blocks that were exchanged before the failure should be returned to the userspace caller. Unfortunately, currently if the block size is not the same as the page size, the returned block count that is returned is the page-aligned block count instead of the actual block count. This commit addresses this bug. Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/move_extent.c | 139 ++++++++++++++++++++++++++------------------------ 1 file changed, 73 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 25b6b1457360..83f8c9e47c60 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -661,6 +661,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * @donor_inode: donor inode * @from: block offset of orig_inode * @count: block count to be replaced + * @err: pointer to save return value * * Replace original inode extents and donor inode extents page by page. * We implement this replacement in the following three steps: @@ -671,19 +672,18 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * 3. Change the block information of donor inode to point at the saved * original inode blocks in the dummy extents. * - * Return 0 on success, or a negative error value on failure. + * Return replaced block count. */ static int mext_replace_branches(handle_t *handle, struct inode *orig_inode, struct inode *donor_inode, ext4_lblk_t from, - ext4_lblk_t count) + ext4_lblk_t count, int *err) { struct ext4_ext_path *orig_path = NULL; struct ext4_ext_path *donor_path = NULL; struct ext4_extent *oext, *dext; struct ext4_extent tmp_dext, tmp_oext; ext4_lblk_t orig_off = from, donor_off = from; - int err = 0; int depth; int replaced_count = 0; int dext_alen; @@ -691,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, mext_double_down_write(orig_inode, donor_inode); /* Get the original extent for the block "orig_off" */ - err = get_ext_path(orig_inode, orig_off, &orig_path); - if (err) + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) goto out; /* Get the donor extent for the head */ - err = get_ext_path(donor_inode, donor_off, &donor_path); - if (err) + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; @@ -707,9 +707,9 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, dext = donor_path[depth].p_ext; tmp_dext = *dext; - err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, donor_off, count); - if (err) + if (*err) goto out; /* Loop for the donor extents */ @@ -718,7 +718,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (!dext) { ext4_error(donor_inode->i_sb, __func__, "The extent for donor must be found"); - err = -EIO; + *err = -EIO; goto out; } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { ext4_error(donor_inode->i_sb, __func__, @@ -726,20 +726,20 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, "extent(%u) should be equal", donor_off, le32_to_cpu(tmp_dext.ee_block)); - err = -EIO; + *err = -EIO; goto out; } /* Set donor extent to orig extent */ - err = mext_leaf_block(handle, orig_inode, + *err = mext_leaf_block(handle, orig_inode, orig_path, &tmp_dext, &orig_off); - if (err < 0) + if (*err) goto out; /* Set orig extent to donor extent */ - err = mext_leaf_block(handle, donor_inode, + *err = mext_leaf_block(handle, donor_inode, donor_path, &tmp_oext, &donor_off); - if (err < 0) + if (*err) goto out; dext_alen = ext4_ext_get_actual_len(&tmp_dext); @@ -753,35 +753,25 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (orig_path) ext4_ext_drop_refs(orig_path); - err = get_ext_path(orig_inode, orig_off, &orig_path); - if (err) + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; - if (le32_to_cpu(oext->ee_block) + - ext4_ext_get_actual_len(oext) <= orig_off) { - err = 0; - goto out; - } tmp_oext = *oext; if (donor_path) ext4_ext_drop_refs(donor_path); - err = get_ext_path(donor_inode, donor_off, &donor_path); - if (err) + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) goto out; depth = ext_depth(donor_inode); dext = donor_path[depth].p_ext; - if (le32_to_cpu(dext->ee_block) + - ext4_ext_get_actual_len(dext) <= donor_off) { - err = 0; - goto out; - } tmp_dext = *dext; - err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, donor_off, count - replaced_count); - if (err) + if (*err) goto out; } @@ -796,7 +786,7 @@ out: } mext_double_up_write(orig_inode, donor_inode); - return err; + return replaced_count; } /** @@ -808,16 +798,17 @@ out: * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped * @uninit: orig extent is uninitialized or not + * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents * with donor inode extents by calling mext_replace_branches(). - * Finally, write out the saved data in new original inode blocks. Return 0 - * on success, or a negative error value on failure. + * Finally, write out the saved data in new original inode blocks. Return + * replaced block count. */ static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int uninit) + int block_len_in_page, int uninit, int *err) { struct inode *orig_inode = o_filp->f_dentry->d_inode; struct address_space *mapping = orig_inode->i_mapping; @@ -829,9 +820,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, long long offs = orig_page_offset << PAGE_CACHE_SHIFT; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; - unsigned int tmp_data_len, data_len; + unsigned int tmp_data_size, data_size, replaced_size; void *fsdata; - int ret, i, jblocks; + int i, jblocks; + int err2 = 0; + int replaced_count = 0; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; /* @@ -841,8 +834,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; handle = ext4_journal_start(orig_inode, jblocks); if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; + *err = PTR_ERR(handle); + return 0; } if (segment_eq(get_fs(), KERNEL_DS)) @@ -858,9 +851,9 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, * Just swap data blocks between orig and donor. */ if (uninit) { - ret = mext_replace_branches(handle, orig_inode, - donor_inode, orig_blk_offset, - block_len_in_page); + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); /* Clear the inode cache not to refer to the old data */ ext4_ext_invalidate_cache(orig_inode); @@ -870,27 +863,28 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, offs = (long long)orig_blk_offset << orig_inode->i_blkbits; - /* Calculate data_len */ + /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { /* Replace the last block */ - tmp_data_len = orig_inode->i_size & (blocksize - 1); + tmp_data_size = orig_inode->i_size & (blocksize - 1); /* - * If data_len equal zero, it shows data_len is multiples of + * If data_size equal zero, it shows data_size is multiples of * blocksize. So we set appropriate value. */ - if (tmp_data_len == 0) - tmp_data_len = blocksize; + if (tmp_data_size == 0) + tmp_data_size = blocksize; - data_len = tmp_data_len + + data_size = tmp_data_size + ((block_len_in_page - 1) << orig_inode->i_blkbits); - } else { - data_len = block_len_in_page << orig_inode->i_blkbits; - } + } else + data_size = block_len_in_page << orig_inode->i_blkbits; + + replaced_size = data_size; - ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, &page, &fsdata); - if (unlikely(ret < 0)) + if (unlikely(*err < 0)) goto out; if (!PageUptodate(page)) { @@ -911,10 +905,17 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, /* Release old bh and drop refs */ try_to_release_page(page, 0); - ret = mext_replace_branches(handle, orig_inode, donor_inode, - orig_blk_offset, block_len_in_page); - if (ret < 0) - goto out; + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, block_len_in_page, + &err2); + if (err2) { + if (replaced_count) { + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; + } else + goto out; + } /* Clear the inode cache not to refer to the old data */ ext4_ext_invalidate_cache(orig_inode); @@ -928,16 +929,16 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { - ret = ext4_get_block(orig_inode, + *err = ext4_get_block(orig_inode, (sector_t)(orig_blk_offset + i), bh, 0); - if (ret < 0) + if (*err < 0) goto out; if (bh->b_this_page != NULL) bh = bh->b_this_page; } - ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, page, fsdata); page = NULL; @@ -951,7 +952,10 @@ out: out2: ext4_journal_stop(handle); - return ret < 0 ? ret : 0; + if (err2) + *err = err2; + + return replaced_count; } /** @@ -1367,15 +1371,17 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, while (orig_page_offset <= seq_end_page) { /* Swap original branches with new branches */ - ret1 = move_extent_per_page(o_filp, donor_inode, + block_len_in_page = move_extent_per_page( + o_filp, donor_inode, orig_page_offset, data_offset_in_page, - block_len_in_page, uninit); - if (ret1 < 0) - goto out; - orig_page_offset++; + block_len_in_page, uninit, + &ret1); + /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; + if (ret1 < 0) + goto out; if (*moved_len > len) { ext4_error(orig_inode->i_sb, __func__, "We replaced blocks too much! " @@ -1385,6 +1391,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, goto out; } + orig_page_offset++; data_offset_in_page = 0; rest_blocks -= block_len_in_page; if (rest_blocks > blocks_per_page) -- cgit v1.2.3 From 52a4345d3d82b77bea320bf223716298468be3de Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Mon, 23 Nov 2009 07:24:43 -0500 Subject: ext4: fix lock order problem in ext4_move_extents() (cherry picked from commit fc04cb49a898c372a22b21fffc47f299d8710801) ext4_move_extents() checks the logical block contiguousness of original file with ext4_find_extent() and mext_next_extent(). Therefore the extent which ext4_ext_path structure indicates must not be changed between above functions. But in current implementation, there is no i_data_sem protection between ext4_ext_find_extent() and mext_next_extent(). So the extent which ext4_ext_path structure indicates may be overwritten by delalloc. As a result, ext4_move_extents() will exchange wrong blocks between original and donor files. I change the place where acquire/release i_data_sem to solve this problem. Moreover, I changed move_extent_per_page() to start transaction first, and then acquire i_data_sem. Without this change, there is a possibility of the deadlock between mmap() and ext4_move_extents(): * NOTE: "A", "B" and "C" mean different processes A-1: ext4_ext_move_extents() acquires i_data_sem of two inodes. B: do_page_fault() starts the transaction (T), and then tries to acquire i_data_sem. But process "A" is already holding it, so it is kept waiting. C: While "A" and "B" running, kjournald2 tries to commit transaction (T) but it is under updating, so kjournald2 waits for it. A-2: Call ext4_journal_start with holding i_data_sem, but transaction (T) is locked. Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/move_extent.c | 117 +++++++++++++++++++++++--------------------------- 1 file changed, 53 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 83f8c9e47c60..a7410b34c5ed 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -77,12 +77,14 @@ static int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent **extent) { + struct ext4_extent_header *eh; int ppos, leaf_ppos = path->p_depth; ppos = leaf_ppos; if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { /* leaf block */ *extent = ++path[ppos].p_ext; + path[ppos].p_block = ext_pblock(path[ppos].p_ext); return 0; } @@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, ext_block_hdr(path[cur_ppos+1].p_bh); } + path[leaf_ppos].p_ext = *extent = NULL; + + eh = path[leaf_ppos].p_hdr; + if (le16_to_cpu(eh->eh_entries) == 0) + /* empty leaf is found */ + return -ENODATA; + /* leaf block */ path[leaf_ppos].p_ext = *extent = EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + path[leaf_ppos].p_block = + ext_pblock(path[leaf_ppos].p_ext); return 0; } } @@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2, } /** - * mext_double_down_read - Acquire two inodes' read semaphore - * - * @orig_inode: original inode structure - * @donor_inode: donor inode structure - * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. - */ -static void -mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) -{ - struct inode *first = orig_inode, *second = donor_inode; - - /* - * Use the inode number to provide the stable locking order instead - * of its address, because the C language doesn't guarantee you can - * compare pointers that don't come from the same array. - */ - if (donor_inode->i_ino < orig_inode->i_ino) { - first = donor_inode; - second = orig_inode; - } - - down_read(&EXT4_I(first)->i_data_sem); - down_read(&EXT4_I(second)->i_data_sem); -} - -/** - * mext_double_down_write - Acquire two inodes' write semaphore + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem * * @orig_inode: original inode structure * @donor_inode: donor inode structure - * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by + * i_ino order. */ static void -mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { struct inode *first = orig_inode, *second = donor_inode; @@ -207,28 +193,14 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) } /** - * mext_double_up_read - Release two inodes' read semaphore + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second - * Release read semaphore of two inodes (orig and donor). + * Release write lock of i_data_sem of two inodes (orig and donor). */ static void -mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) -{ - up_read(&EXT4_I(orig_inode)->i_data_sem); - up_read(&EXT4_I(donor_inode)->i_data_sem); -} - -/** - * mext_double_up_write - Release two inodes' write semaphore - * - * @orig_inode: original inode structure to be released its lock first - * @donor_inode: donor inode structure to be released its lock second - * Release write semaphore of two inodes (orig and donor). - */ -static void -mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); @@ -688,8 +660,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; - mext_double_down_write(orig_inode, donor_inode); - /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) @@ -785,7 +755,6 @@ out: kfree(donor_path); } - mext_double_up_write(orig_inode, donor_inode); return replaced_count; } @@ -851,6 +820,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, * Just swap data blocks between orig and donor. */ if (uninit) { + /* + * Protect extent trees against block allocations + * via delalloc + */ + double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, orig_blk_offset, block_len_in_page, err); @@ -858,6 +832,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, /* Clear the inode cache not to refer to the old data */ ext4_ext_invalidate_cache(orig_inode); ext4_ext_invalidate_cache(donor_inode); + double_up_write_data_sem(orig_inode, donor_inode); goto out2; } @@ -905,6 +880,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, /* Release old bh and drop refs */ try_to_release_page(page, 0); + /* Protect extent trees against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, orig_blk_offset, block_len_in_page, &err2); @@ -913,14 +890,18 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, block_len_in_page = replaced_count; replaced_size = block_len_in_page << orig_inode->i_blkbits; - } else + } else { + double_up_write_data_sem(orig_inode, donor_inode); goto out; + } } /* Clear the inode cache not to refer to the old data */ ext4_ext_invalidate_cache(orig_inode); ext4_ext_invalidate_cache(donor_inode); + double_up_write_data_sem(orig_inode, donor_inode); + if (!page_has_buffers(page)) create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); @@ -1236,16 +1217,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, return -EINVAL; } - /* protect orig and donor against a truncate */ + /* Protect orig and donor inodes against a truncate */ ret1 = mext_inode_double_lock(orig_inode, donor_inode); if (ret1 < 0) return ret1; - mext_double_down_read(orig_inode, donor_inode); + /* Protect extent tree against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, donor_start, &len, *moved_len); - mext_double_up_read(orig_inode, donor_inode); if (ret1) goto out; @@ -1308,6 +1289,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, ext4_ext_get_actual_len(ext_cur), block_end + 1) - max(le32_to_cpu(ext_cur->ee_block), block_start); + /* Discard preallocations of two inodes */ + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { seq_blocks += add_blocks; @@ -1359,14 +1344,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, seq_start = le32_to_cpu(ext_cur->ee_block); rest_blocks = seq_blocks; - /* Discard preallocations of two inodes */ - down_write(&EXT4_I(orig_inode)->i_data_sem); - ext4_discard_preallocations(orig_inode); - up_write(&EXT4_I(orig_inode)->i_data_sem); - - down_write(&EXT4_I(donor_inode)->i_data_sem); - ext4_discard_preallocations(donor_inode); - up_write(&EXT4_I(donor_inode)->i_data_sem); + /* + * Up semaphore to avoid following problems: + * a. transaction deadlock among ext4_journal_start, + * ->write_begin via pagefault, and jbd2_journal_commit + * b. racing with ->readpage, ->write_begin, and ext4_get_block + * in move_extent_per_page + */ + double_up_write_data_sem(orig_inode, donor_inode); while (orig_page_offset <= seq_end_page) { @@ -1381,14 +1366,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; if (ret1 < 0) - goto out; + break; if (*moved_len > len) { ext4_error(orig_inode->i_sb, __func__, "We replaced blocks too much! " "sum of replaced: %llu requested: %llu", *moved_len, len); ret1 = -EIO; - goto out; + break; } orig_page_offset++; @@ -1400,6 +1385,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, block_len_in_page = rest_blocks; } + double_down_write_data_sem(orig_inode, donor_inode); + if (ret1 < 0) + break; + /* Decrease buffer counter */ if (holecheck_path) ext4_ext_drop_refs(holecheck_path); @@ -1429,7 +1418,7 @@ out: ext4_ext_drop_refs(holecheck_path); kfree(holecheck_path); } - + double_up_write_data_sem(orig_inode, donor_inode); ret2 = mext_inode_double_unlock(orig_inode, donor_inode); if (ret1) -- cgit v1.2.3 From a752b960d4b57e0d307e1c40b1b47ab849832c0f Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Mon, 23 Nov 2009 07:24:41 -0500 Subject: ext4: fix possible recursive locking warning in EXT4_IOC_MOVE_EXT (cherry picked from commit 49bd22bc4d603a2a4fc2a6a60e156cbea52eb494) If CONFIG_PROVE_LOCKING is enabled, the double_down_write_data_sem() will trigger a false-positive warning of a recursive lock. Since we take i_data_sem for the two inodes ordered by their inode numbers, this isn't a problem. Use of down_write_nested() will notify the lock dependency checker machinery that there is no problem here. This problem was reported by Brian Rogers: http://marc.info/?l=linux-ext4&m=125115356928011&w=1 Reported-by: Brian Rogers Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/move_extent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a7410b34c5ed..2ca6aa3f34e6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -189,7 +189,7 @@ double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) } down_write(&EXT4_I(first)->i_data_sem); - down_write(&EXT4_I(second)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); } /** -- cgit v1.2.3 From 21a4b3aaa2180ca6748446c4b06e91f3da244dca Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 14 Nov 2009 08:19:05 -0500 Subject: ext4: plug a buffer_head leak in an error path of ext4_iget() (cherry picked from commit 567f3e9a70d71e5c9be03701b8578be77857293b) One of the invalid error paths in ext4_iget() forgot to brelse() the inode buffer head. Fix it by adding a brelse() in the common error return path, which also simplifies function. Thanks to Andi Kleen reporting the problem. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2c8caa51addb..554c6798597c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4781,7 +4781,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; - struct buffer_head *bh; struct inode *inode; long ret; int block; @@ -4793,11 +4792,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT4_I(inode); + iloc.bh = 0; ret = __ext4_get_inode_loc(inode, &iloc, 0); if (ret < 0) goto bad_inode; - bh = iloc.bh; raw_inode = ext4_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); @@ -4820,7 +4819,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (inode->i_mode == 0 || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { /* this inode is deleted */ - brelse(bh); ret = -ESTALE; goto bad_inode; } @@ -4852,7 +4850,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb)) { - brelse(bh); ret = -EIO; goto bad_inode; } @@ -4905,10 +4902,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) /* Validate block references which are part of inode */ ret = ext4_check_inode_blockref(inode); } - if (ret) { - brelse(bh); + if (ret) goto bad_inode; - } if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; @@ -4936,7 +4931,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else { - brelse(bh); ret = -EIO; ext4_error(inode->i_sb, __func__, "bogus i_mode (%o) for inode=%lu", @@ -4949,6 +4943,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; bad_inode: + brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } -- cgit v1.2.3 From 0f9036c7eed145cdd8c8ed9e899c61f499278259 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:17:34 -0500 Subject: ext4: make sure directory and symlink blocks are revoked (cherry picked from commit 50689696867d95b38d9c7be640a311494a04fb86) When an inode gets unlinked, the functions ext4_clear_blocks() and ext4_remove_blocks() call ext4_forget() for all the buffer heads corresponding to the deleted inode's data blocks. If the inode is a directory or a symlink, the is_metadata parameter must be non-zero so ext4_forget() will revoke them via jbd2_journal_revoke(). Otherwise, if these blocks are reused for a data file, and the system crashes before a journal checkpoint, the journal replay could end up corrupting these data blocks. Thanks to Curt Wohlgemuth for pointing out potential problems in this area. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 715264b4bae4..74dcff84c3a8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2074,7 +2074,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext_debug("free last %u blocks starting %llu\n", num, start); for (i = 0; i < num; i++) { bh = sb_find_get_block(inode->i_sb, start + i); - ext4_forget(handle, 0, inode, bh, start + i); + ext4_forget(handle, metadata, inode, bh, start + i); } ext4_free_blocks(handle, inode, start, num, metadata); } else if (from == le32_to_cpu(ex->ee_block) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 554c6798597c..a8dd8028229a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4120,6 +4120,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *last) { __le32 *p; + int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode); + if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); @@ -4150,11 +4152,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, *p = 0; tbh = sb_find_get_block(inode->i_sb, nr); - ext4_forget(handle, 0, inode, tbh, nr); + ext4_forget(handle, is_metadata, inode, tbh, nr); } } - ext4_free_blocks(handle, inode, block_to_free, count, 0); + ext4_free_blocks(handle, inode, block_to_free, count, is_metadata); } /** -- cgit v1.2.3 From 70095d96ced4dbe2ebc7cc0d248313e507dda31a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 15 Nov 2009 15:30:58 -0500 Subject: ext4: fix i_flags access in ext4_da_writepages_trans_blocks() (cherry picked from commit 30c6e07a92ea4cb87160d32ffa9bce172576ae4c) We need to be testing the i_flags field in the ext4 specific portion of the inode, instead of the (confusingly aliased) i_flags field in the generic struct inode. Signed-off-by: Julia Lawall Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a8dd8028229a..1842a72b72b1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2788,7 +2788,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * number of contiguous block. So we will limit * number of contiguous block to a sane value */ - if (!(inode->i_flags & EXT4_EXTENTS_FL) && + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && (max_blocks > EXT4_MAX_TRANS_DATA)) max_blocks = EXT4_MAX_TRANS_DATA; -- cgit v1.2.3 From 023c1d20ed1d29d5f5e40e79fde1f8f6b6a42e6f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 15 Nov 2009 15:30:52 -0500 Subject: ext4: journal all modifications in ext4_xattr_set_handle (cherry picked from commit 86ebfd08a1930ccedb8eac0aeb1ed4b8b6a41dbc) ext4_xattr_set_handle() was zeroing out an inode outside of journaling constraints; this is one of the accesses that was causing the crc errors in journal replay as seen in kernel.org bugzilla #14354. Reviewed-by: Andreas Dilger Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/xattr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fed5b01d7a8d..025701926f9a 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -988,6 +988,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); @@ -1013,9 +1017,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } - error = ext4_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto cleanup; if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); -- cgit v1.2.3 From 9e9ddddfe74189e2f2514ed2d8eb9ef3f015118f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:24:52 -0500 Subject: ext4: don't update the superblock in ext4_statfs() (cherry picked from commit 3f8fb9490efbd300887470a2a880a64e04dcc3f5) commit a71ce8c6c9bf269b192f352ea555217815cf027e updated ext4_statfs() to update the on-disk superblock counters, but modified this buffer directly without any journaling of the change. This is one of the accesses that was causing the crc errors in journal replay as seen in kernel.org bugzilla #14354. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8662b2e6e9f9..3cd519b8fb1c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3668,13 +3668,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); - ext4_free_blocks_count_set(es, buf->f_bfree); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT4_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); -- cgit v1.2.3 From 58ffefbe7bf2bdfc6f91b79eef9d670df6eab4d3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:24:38 -0500 Subject: ext4: fix uninit block bitmap initialization when s_meta_first_bg is non-zero (cherry picked from commit 8dadb198cb70ef811916668fe67eeec82e8858dd) The number of old-style block group descriptor blocks is s_meta_first_bg when the meta_bg feature flag is set. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/balloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1d0418980f8d..f3032c919a22 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -761,7 +761,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, ext4_group_t group) { - return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0; + if (!ext4_bg_has_super(sb, group)) + return 0; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + else + return EXT4_SB(sb)->s_gdb_count; } /** -- cgit v1.2.3 From 31299e22be94e77b6df41d022e8f0bd075537d58 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 15 Nov 2009 15:29:56 -0500 Subject: ext4: fix block validity checks so they work correctly with meta_bg (cherry picked from commit 1032988c71f3f85483b2b4319684d1205a704c02) The block validity checks used by ext4_data_block_valid() wasn't correctly written to check file systems with the meta_bg feature. Fix this. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/block_validity.c | 2 +- fs/ext4/inode.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 50784ef07563..dc79b75d8f70 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb) if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) add_system_zone(sbi, ext4_group_first_block_no(sb, i), - sbi->s_gdb_count + 1); + ext4_bg_num_gdb(sb, i) + 1); gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); if (ret) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1842a72b72b1..68c9055a4d91 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4883,10 +4883,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && - ((ei->i_file_acl < - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + - EXT4_SB(sb)->s_gdb_count)) || - (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { ext4_error(sb, __func__, "bad extended attribute block %llu in inode #%lu", ei->i_file_acl, inode->i_ino); -- cgit v1.2.3 From 43e932d3116f3a74f1b5ff0a38cca5e0bac1099a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:24:57 -0500 Subject: ext4: avoid issuing unnecessary barriers (cherry picked from commit 6b17d902fdd241adfa4ce780df20547b28bf5801) We don't to issue an I/O barrier on an error or if we force commit because we are doing data journaling. Signed-off-by: "Theodore Ts'o" Cc: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/fsync.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 2b1531266ee2..a3c25076aef1 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -60,7 +60,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) ret = flush_aio_dio_completed_IO(inode); if (ret < 0) - goto out; + return ret; /* * data=writeback: * The caller's filemap_fdatawrite()/wait will sync the data. @@ -79,10 +79,8 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - goto out; - } + if (ext4_should_journal_data(inode)) + return ext4_force_commit(inode->i_sb); if (!journal) ret = sync_mapping_buffers(inode->i_mapping); -- cgit v1.2.3 From cd9c823a6830d6c84f50fe7dabaf281c542d13ef Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 23 Nov 2009 07:24:48 -0500 Subject: ext4: fix error handling in ext4_ind_get_blocks() (cherry picked from commit 2bba702d4f88d7b010ec37e2527b552588404ae7) When an error happened in ext4_splice_branch we failed to notice that in ext4_ind_get_blocks and mapped the buffer anyway. Fix the problem by checking for error properly. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 68c9055a4d91..c27816a2c4fd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1021,7 +1021,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, if (!err) err = ext4_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - else + if (err) goto cleanup; set_buffer_new(bh_result); -- cgit v1.2.3 From feac39ba7f50e851f4e20466e5d390f046932287 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 19 Nov 2009 14:25:42 -0500 Subject: ext4: make trim/discard optional (and off by default) (cherry picked from commit 5328e635315734d42080de9a5a1ee87bf4cae0a4) It is anticipated that when sb_issue_discard starts doing real work on trim-capable devices, we may see issues. Make this mount-time optional, and default it to off until we know that things are working out OK. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 1 + fs/ext4/mballoc.c | 21 +++++++++++++-------- fs/ext4/super.c | 14 +++++++++++++- 3 files changed, 27 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8825515eeddd..05ce38b981cb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -750,6 +750,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt #define set_opt(o, opt) o |= EXT4_MOUNT_##opt diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bba12824defa..6e5a23a2cc25 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2529,7 +2529,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct ext4_group_info *db; int err, count = 0, count2 = 0; struct ext4_free_data *entry; - ext4_fsblk_t discard_block; struct list_head *l, *ltmp; list_for_each_safe(l, ltmp, &txn->t_private_list) { @@ -2559,13 +2558,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) page_cache_release(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->group); - discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) - + entry->start_blk - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, - entry->count); - sb_issue_discard(sb, discard_block, entry->count); - + if (test_opt(sb, DISCARD)) { + ext4_fsblk_t discard_block; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + discard_block = (ext4_fsblk_t)entry->group * + EXT4_BLOCKS_PER_GROUP(sb) + + entry->start_blk + + le32_to_cpu(es->s_first_data_block); + trace_ext4_discard_blocks(sb, + (unsigned long long)discard_block, + entry->count); + sb_issue_discard(sb, discard_block, entry->count); + } kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3cd519b8fb1c..86edc41a8284 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -899,6 +899,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NO_AUTO_DA_ALLOC)) seq_puts(seq, ",noauto_da_alloc"); + if (test_opt(sb, DISCARD)) + seq_puts(seq, ",discard"); + ext4_show_quota_options(seq, sb); return 0; @@ -1079,7 +1082,8 @@ enum { Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_block_validity, Opt_noblock_validity, - Opt_inode_readahead_blks, Opt_journal_ioprio + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_discard, Opt_nodiscard, }; static const match_table_t tokens = { @@ -1144,6 +1148,8 @@ static const match_table_t tokens = { {Opt_auto_da_alloc, "auto_da_alloc=%u"}, {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL}, }; @@ -1565,6 +1571,12 @@ set_qf_format: else set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); break; + case Opt_discard: + set_opt(sbi->s_mount_opt, DISCARD); + break; + case Opt_nodiscard: + clear_opt(sbi->s_mount_opt, DISCARD); + break; default: ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " -- cgit v1.2.3 From 6011d0baad8d5229445b1e0f7b6bdfcaf9cbff26 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 19 Nov 2009 14:28:50 -0500 Subject: ext4: make "norecovery" an alias for "noload" (cherry picked from commit e3bb52ae2bb9573e84c17b8e3560378d13a5c798) Users on the linux-ext4 list recently complained about differences across filesystems w.r.t. how to mount without a journal replay. In the discussion it was noted that xfs's "norecovery" option is perhaps more descriptively accurate than "noload," so let's make that an alias for ext4. Also show this status in /proc/mounts Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 86edc41a8284..dad64a1a23aa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -902,6 +902,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, DISCARD)) seq_puts(seq, ",discard"); + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); + ext4_show_quota_options(seq, sb); return 0; @@ -1108,6 +1111,7 @@ static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, -- cgit v1.2.3 From 74920c74ad3802bc81f5ccbb737d351073dd3ff9 Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Tue, 24 Nov 2009 10:19:57 -0500 Subject: ext4: Fix double-free of blocks with EXT4_IOC_MOVE_EXT (cherry picked from commit 94d7c16cbbbd0e03841fcf272bcaf0620ad39618) At the beginning of ext4_move_extent(), we call ext4_discard_preallocations() to discard inode PAs of orig and donor inodes. But in the following case, blocks can be double freed, so move ext4_discard_preallocations() to the end of ext4_move_extents(). 1. Discard inode PAs of orig and donor inodes with ext4_discard_preallocations() in ext4_move_extents(). orig : [ DATA1 ] donor: [ DATA2 ] 2. While data blocks are exchanging between orig and donor inodes, new inode PAs is created to orig by other process's block allocation. (Since there are semaphore gaps in ext4_move_extents().) And new inode PAs is used partially (2-1). 2-1 Create new inode PAs to orig inode orig : [ DATA1 | used PA1 | free PA1 ] donor: [ DATA2 ] 3. Donor inode which has old orig inode's blocks is deleted after EXT4_IOC_MOVE_EXT finished (3-1, 3-2). So the block bitmap corresponds to old orig inode's blocks are freed. 3-1 After EXT4_IOC_MOVE_EXT finished orig : [ DATA2 | free PA1 ] donor: [ DATA1 | used PA1 ] 3-2 Delete donor inode orig : [ DATA2 | free PA1 ] donor: [ FREE SPACE(DATA1) | FREE SPACE(used PA1) ] 4. The double-free of blocks is occurred, when close() is called to orig inode. Because ext4_discard_preallocations() for orig inode frees used PA1 and free PA1, though used PA1 is already freed in 3. 4-1 Double-free of blocks is occurred orig : [ DATA2 | FREE SPACE(free PA1) ] donor: [ FREE SPACE(DATA1) | DOUBLE FREE(used PA1) ] Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/move_extent.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2ca6aa3f34e6..71a62312fff6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1289,10 +1289,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, ext4_ext_get_actual_len(ext_cur), block_end + 1) - max(le32_to_cpu(ext_cur->ee_block), block_start); - /* Discard preallocations of two inodes */ - ext4_discard_preallocations(orig_inode); - ext4_discard_preallocations(donor_inode); - while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { seq_blocks += add_blocks; @@ -1410,6 +1406,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, } out: + if (*moved_len) { + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + } + if (orig_path) { ext4_ext_drop_refs(orig_path); kfree(orig_path); -- cgit v1.2.3 From 3369cbb6bedae7e8c7600b33803a8e69b2a6e99f Mon Sep 17 00:00:00 2001 From: Kazuya Mio Date: Tue, 24 Nov 2009 10:28:48 -0500 Subject: ext4: initialize moved_len before calling ext4_move_extents() (cherry picked from commit 446aaa6e7e993b38a6f21c6acfa68f3f1af3dbe3) The move_extent.moved_len is used to pass back the number of exchanged blocks count to user space. Currently the caller must clear this field; but we spend more code space checking for this requirement than simply zeroing the field ourselves, so let's just make life easier for everyone all around. Signed-off-by: Kazuya Mio Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ioctl.c | 1 + fs/ext4/move_extent.c | 14 +++----------- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c1cdf613e725..31e5ee0c858f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -239,6 +239,7 @@ setversion_out: } } + me.moved_len = 0; err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); fput(donor_filp); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 71a62312fff6..5f3774232470 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -947,7 +947,6 @@ out2: * @orig_start: logical start offset in block for orig * @donor_start: logical start offset in block for donor * @len: the number of blocks to be moved - * @moved_len: moved block length * * Check the arguments of ext4_move_extents() whether the files can be * exchanged with each other. @@ -955,8 +954,8 @@ out2: */ static int mext_check_arguments(struct inode *orig_inode, - struct inode *donor_inode, __u64 orig_start, - __u64 donor_start, __u64 *len, __u64 moved_len) + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len) { ext4_lblk_t orig_blocks, donor_blocks; unsigned int blkbits = orig_inode->i_blkbits; @@ -1010,13 +1009,6 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - if (moved_len) { - ext4_debug("ext4 move extent: moved_len should be 0 " - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, - donor_inode->i_ino); - return -EINVAL; - } - if ((orig_start > EXT_MAX_BLOCK) || (donor_start > EXT_MAX_BLOCK) || (*len > EXT_MAX_BLOCK) || @@ -1226,7 +1218,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, - donor_start, &len, *moved_len); + donor_start, &len); if (ret1) goto out; -- cgit v1.2.3 From ce5cf38ef17272ad2641c723c6d0d8eaf1c34eab Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Tue, 24 Nov 2009 10:31:56 -0500 Subject: ext4: move_extent_per_page() cleanup (cherry picked from commit ac48b0a1d068887141581bea8285de5fcab182b0) Integrate duplicate lines (acquire/release semaphore and invalidate extent cache in move_extent_per_page()) into mext_replace_branches(), to reduce source and object code size. Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/move_extent.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 5f3774232470..bcc41e63965c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -660,6 +660,9 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; + /* Protect extent trees against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); + /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) @@ -755,6 +758,11 @@ out: kfree(donor_path); } + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + + double_up_write_data_sem(orig_inode, donor_inode); + return replaced_count; } @@ -820,19 +828,9 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, * Just swap data blocks between orig and donor. */ if (uninit) { - /* - * Protect extent trees against block allocations - * via delalloc - */ - double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, orig_blk_offset, block_len_in_page, err); - - /* Clear the inode cache not to refer to the old data */ - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); - double_up_write_data_sem(orig_inode, donor_inode); goto out2; } @@ -880,8 +878,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, /* Release old bh and drop refs */ try_to_release_page(page, 0); - /* Protect extent trees against block allocations via delalloc */ - double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, orig_blk_offset, block_len_in_page, &err2); @@ -890,18 +886,10 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, block_len_in_page = replaced_count; replaced_size = block_len_in_page << orig_inode->i_blkbits; - } else { - double_up_write_data_sem(orig_inode, donor_inode); + } else goto out; - } } - /* Clear the inode cache not to refer to the old data */ - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); - - double_up_write_data_sem(orig_inode, donor_inode); - if (!page_has_buffers(page)) create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); -- cgit v1.2.3 From 86d291a39650e05c9db7c0d849e48beaf39efcdd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 1 Dec 2009 09:04:42 -0500 Subject: jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer() (cherry picked from commit e6ec116b67f46e0e7808276476554727b2e6240b) OOM happens. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/jbd2/commit.c | 4 ++++ fs/jbd2/journal.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index d4cfd6d2779e..8896c1d4febe 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) JBUFFER_TRACE(jh, "ph3: write metadata"); flags = jbd2_journal_write_metadata_buffer(commit_transaction, jh, &new_jh, blocknr); + if (flags < 0) { + jbd2_journal_abort(journal, flags); + continue; + } set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); wbuf[bufs++] = jh2bh(new_jh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fed85388ee86..c974077915fa 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -358,6 +358,10 @@ repeat: jbd_unlock_bh_state(bh_in); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); + if (!tmp) { + jbd2_journal_put_journal_head(new_jh); + return -ENOMEM; + } jbd_lock_bh_state(bh_in); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); -- cgit v1.2.3 From 66c3a718335ccd37ac14c4f0f2d0a7551af562ed Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Mon, 7 Dec 2009 10:38:16 -0500 Subject: ext4: Return the PTR_ERR of the correct pointer in setup_new_group_blocks() (cherry picked from commit c09eef305dd43846360944ad072f051f964fa383) Signed-off-by: Roel Kluin Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3cfc343c41b5..3b2c5541d8a6 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; if (IS_ERR(gdb = bclean(handle, sb, block))) { - err = PTR_ERR(bh); + err = PTR_ERR(gdb); goto exit_bh; } ext4_handle_dirty_metadata(handle, NULL, gdb); -- cgit v1.2.3 From 35a6f7824919816fca466997531885044d290b59 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Dec 2009 21:24:33 -0500 Subject: ext4: Avoid data / filesystem corruption when write fails to copy data (cherry picked from commit b9a4207d5e911b938f73079a83cc2ae10524ec7f) When ext4_write_begin fails after allocating some blocks or generic_perform_write fails to copy data to write, we truncate blocks already instantiated beyond i_size. Although these blocks were never inside i_size, we have to truncate the pagecache of these blocks so that corresponding buffers get unmapped. Otherwise subsequent __block_prepare_write (called because we are retrying the write) will find the buffers mapped, not call ->get_block, and thus the page will be backed by already freed blocks leading to filesystem and data corruption. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c27816a2c4fd..87406c61eca1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1534,6 +1534,16 @@ static int do_journal_get_write_access(handle_t *handle, return ext4_journal_get_write_access(handle, bh); } +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1599,7 +1609,7 @@ retry: ext4_journal_stop(handle); if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to @@ -1709,7 +1719,7 @@ static int ext4_ordered_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1751,7 +1761,7 @@ static int ext4_writeback_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1814,7 +1824,7 @@ static int ext4_journalled_write_end(struct file *file, if (!ret) ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -3091,7 +3101,7 @@ retry: * i_size_read because we hold i_mutex. */ if (pos + len > inode->i_size) - ext4_truncate(inode); + ext4_truncate_failed_write(inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) -- cgit v1.2.3 From 1cd3f1980ce02bd814879ce1ac9cde5eaceb5f13 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 8 Dec 2009 21:48:58 -0500 Subject: ext4: wait for log to commit when umounting (cherry picked from commit d4edac314e9ad0b21ba20ba8bc61b61f186f79e1) There is a potential race when a transaction is committing right when the file system is being umounting. This could reduce in a race because EXT4_SB(sb)->s_group_info could be freed in ext4_put_super before the commit code calls a callback so the mballoc code can release freed blocks in the transaction, resulting in a panic trying to access the freed s_group_info. The fix is to wait for the transaction to finish committing before we shutdown the multiblock allocator. Signed-off-by: Josef Bacik Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dad64a1a23aa..803fb252c111 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -603,10 +603,6 @@ static void ext4_put_super(struct super_block *sb) if (sb->s_dirt) ext4_commit_super(sb, 1); - ext4_release_system_zone(sb); - ext4_mb_release(sb); - ext4_ext_release(sb); - ext4_xattr_put_super(sb); if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -614,6 +610,12 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, __func__, "Couldn't clean up the journal"); } + + ext4_release_system_zone(sb); + ext4_mb_release(sb); + ext4_ext_release(sb); + ext4_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); -- cgit v1.2.3 From 637b13106b744398b530fe916eb1556aeb7f6bca Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Tue, 8 Dec 2009 22:18:25 -0500 Subject: ext4: remove blocks from inode prealloc list on failure (cherry picked from commit b844167edc7fcafda9623955c05e4c1b3c32ebc7) This fixes a leak of blocks in an inode prealloc list if device failures cause ext4_mb_mark_diskspace_used() to fail. Signed-off-by: Curt Wohlgemuth Acked-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/mballoc.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6e5a23a2cc25..7d7114818f8d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3010,6 +3010,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) trace_ext4_mballoc_prealloc(ac); } +/* + * Called on failure; free up any blocks from the inode PA for this + * context. We don't need this for MB_GROUP_PA because we only change + * pa_free in ext4_mb_release_context(), but on failure, we've already + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. + */ +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + int len; + + if (pa && pa->pa_type == MB_INODE_PA) { + len = ac->ac_b_ex.fe_len; + pa->pa_free += len; + } + +} + /* * use blocks preallocated to inode */ @@ -4295,6 +4313,7 @@ repeat: ac->ac_status = AC_STATUS_CONTINUE; goto repeat; } else if (*errp) { + ext4_discard_allocated_blocks(ac); ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); -- cgit v1.2.3 From 6798788a72ee430761aa41c02f770ee3afb9c212 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 8 Dec 2009 22:41:52 -0500 Subject: ext4: ext4_get_reserved_space() must return bytes instead of blocks (cherry picked from commit 8aa6790f876e81f5a2211fe1711a5fe3fe2d7b20) Signed-off-by: Dmitry Monakhov Reviewed-by: Eric Sandeen Acked-by: Mingming Cao Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 87406c61eca1..86469775cca4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1052,7 +1052,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode) EXT4_I(inode)->i_reserved_meta_blocks; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return total; + return (total << inode->i_blkbits); } /* * Calculate the number of metadata blocks need to reserve -- cgit v1.2.3 From da2068b384bbfaae98ff55f1424be88c65bf801b Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 8 Dec 2009 22:42:15 -0500 Subject: ext4: quota macros cleanup (cherry picked from commit 5aca07eb7d8f14d90c740834d15ca15277f4820c) Currently all quota block reservation macros contains hard-coded "2" aka MAXQUOTAS value. This is no good because in some places it is not obvious to understand what does this digit represent. Let's introduce new macro with self descriptive name. Signed-off-by: Dmitry Monakhov Acked-by: Mingming Cao Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4_jbd2.h | 8 ++++++-- fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 2 +- fs/ext4/migrate.c | 4 ++-- fs/ext4/namei.c | 8 ++++---- 5 files changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index a2865980342f..404ab6c824d3 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -49,7 +49,7 @@ #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT4_XATTR_TRANS_BLOCKS - 2 + \ - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* * Define the number of metadata blocks we need to account to modify data. @@ -57,7 +57,7 @@ * This include super block, inode block, quota blocks and xattr blocks */ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* Delete operations potentially hit one directory's namespace plus an * entire inode, plus arbitrary amounts of bitmap/indirection data. Be @@ -92,6 +92,7 @@ * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) + #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) #else @@ -99,6 +100,9 @@ #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) int ext4_mark_iloc_dirty(handle_t *handle, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74dcff84c3a8..499d6bb9be94 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2167,7 +2167,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, correct_index = 1; credits += (ext_depth(inode)) + 1; } - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); err = ext4_ext_truncate_extend_restart(handle, inode, credits); if (err) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 86469775cca4..e08e81b4f7d1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5231,7 +5231,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a93d5b80f3e2..864614974536 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) * So allocate a credit of 3. We may update * quota (user and group). */ - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); if (ext4_journal_extend(handle, needed) != 0) retval = ext4_journal_restart(handle, needed); @@ -477,7 +477,7 @@ int ext4_ext_migrate(struct inode *inode) handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index fde08c919d12..17a17e10dd60 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1769,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1803,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1840,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2253,7 +2253,7 @@ static int ext4_symlink(struct inode *dir, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); -- cgit v1.2.3 From caa305aa349212c285ad9564b9ff2ffa040b193c Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 8 Dec 2009 22:42:28 -0500 Subject: ext4: fix incorrect block reservation on quota transfer. (cherry picked from commit 194074acacebc169ded90a4657193f5180015051) Inside ->setattr() call both ATTR_UID and ATTR_GID may be valid This means that we may end-up with transferring all quotas. Add we have to reserve QUOTA_DEL_BLOCKS for all quotas, as we do in case of QUOTA_INIT_BLOCKS. Signed-off-by: Dmitry Monakhov Reviewed-by: Mingming Cao Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e08e81b4f7d1..64086fb96e47 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5232,7 +5232,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; -- cgit v1.2.3 From eebb744d30006474a8f63af098bc71f0cb209677 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Dec 2009 23:51:10 -0500 Subject: ext4: Wait for proper transaction commit on fsync (cherry picked from commit b436b9bef84de6893e86346d8fbf7104bc520645) We cannot rely on buffer dirty bits during fsync because pdflush can come before fsync is called and clear dirty bits without forcing a transaction commit. What we do is that we track which transaction has last changed the inode and which transaction last changed allocation and force it to disk on fsync. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 7 +++++++ fs/ext4/ext4_jbd2.h | 13 +++++++++++++ fs/ext4/extents.c | 14 ++++++++++++-- fs/ext4/fsync.c | 46 +++++++++++++++++----------------------------- fs/ext4/inode.c | 29 +++++++++++++++++++++++++++++ fs/ext4/super.c | 2 ++ fs/jbd2/journal.c | 1 + 7 files changed, 81 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 05ce38b981cb..bd2a9dd1464c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -703,6 +703,13 @@ struct ext4_inode_info { struct list_head i_aio_dio_complete_list; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; }; /* diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 404ab6c824d3..1892a7763426 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -258,6 +258,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) return 0; } +static inline void ext4_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + ei->i_datasync_tid = handle->h_transaction->t_tid; + } +} + /* super.c */ int ext4_force_commit(struct super_block *sb); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 499d6bb9be94..43c04ff653c6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3064,6 +3064,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { ret = ext4_convert_unwritten_extents_dio(handle, inode, path); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); goto out2; } /* buffered IO case */ @@ -3091,6 +3093,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_ext_convert_to_initialized(handle, inode, path, iblock, max_blocks); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out: if (ret <= 0) { err = ret; @@ -3329,10 +3333,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, allocated = ext4_ext_get_actual_len(&newex); set_buffer_new(bh_result); - /* Cache only when it is _not_ an uninitialized extent */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) + /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an uninitialized extent. + */ + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { ext4_ext_put_in_cache(inode, iblock, allocated, newblock, EXT4_EXT_CACHE_EXTENT); + ext4_update_inode_fsync_trans(handle, inode, 1); + } else + ext4_update_inode_fsync_trans(handle, inode, 0); out: if (allocated > max_blocks) allocated = max_blocks; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index a3c25076aef1..d6049e40b161 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -51,25 +51,30 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int err, ret = 0; + int ret; + tid_t commit_tid; J_ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file(file, dentry, datasync); + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + ret = flush_aio_dio_completed_IO(inode); if (ret < 0) return ret; + + if (!journal) + return simple_fsync(file, dentry, datasync); + /* - * data=writeback: + * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. - * sync_inode() will sync the metadata - * - * data=ordered: - * The caller's filemap_fdatawrite() will write the data and - * sync_inode() will write the inode if it is dirty. Then the caller's - * filemap_fdatawait() will wait on the pages. + * Metadata is in the journal, we wait for proper transaction to + * commit here. * * data=journal: * filemap_fdatawrite won't do anything (the buffers are clean). @@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) if (ext4_should_journal_data(inode)) return ext4_force_commit(inode->i_sb); - if (!journal) - ret = sync_mapping_buffers(inode->i_mapping); - - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; - - /* - * The VFS has written the file data. If the inode is unaltered - * then we need not start a commit. - */ - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - err = sync_inode(inode, &wbc); - if (ret == 0) - ret = err; - } -out: - if (journal && (journal->j_flags & JBD2_BARRIER)) + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + if (jbd2_log_start_commit(journal, commit_tid)) + jbd2_log_wait_commit(journal, commit_tid); + else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, NULL); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 64086fb96e47..1dae9a4dbb03 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1025,6 +1025,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, goto cleanup; set_buffer_new(bh_result); + + ext4_update_inode_fsync_trans(handle, inode, 1); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) @@ -4794,6 +4796,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; long ret; int block; @@ -4858,6 +4861,31 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + ei->i_sync_tid = tid; + ei->i_datasync_tid = tid; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > @@ -5112,6 +5140,7 @@ static int ext4_do_update_inode(handle_t *handle, err = rc; ei->i_state &= ~EXT4_STATE_NEW; + ext4_update_inode_fsync_trans(handle, inode, 0); out_brelse: brelse(bh); ext4_std_error(inode->i_sb, err); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 803fb252c111..9ae521749b90 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -706,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&(ei->i_block_reservation_lock)); INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); ei->cur_aio_dio = NULL; + ei->i_sync_tid = 0; + ei->i_datasync_tid = 0; return &ei->vfs_inode; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c974077915fa..82c295d2a7c5 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno); EXPORT_SYMBOL(jbd2_journal_ack_err); EXPORT_SYMBOL(jbd2_journal_clear_err); EXPORT_SYMBOL(jbd2_log_wait_commit); +EXPORT_SYMBOL(jbd2_log_start_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); -- cgit v1.2.3 From 0fd023ecf102ab0bd070d5affd73b18e6704ff0f Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Sun, 6 Dec 2009 23:38:31 -0500 Subject: ext4: Fix insufficient checks in EXT4_IOC_MOVE_EXT (cherry picked from commit 4a58579b9e4e2a35d57e6c9c8483e52f6f1b7fd6) This patch fixes three problems in the handling of the EXT4_IOC_MOVE_EXT ioctl: 1. In current EXT4_IOC_MOVE_EXT, there are read access mode checks for original and donor files, but they allow the illegal write access to donor file, since donor file is overwritten by original file data. To fix this problem, change access mode checks of original (r->r/w) and donor (r->w) files. 2. Disallow the use of donor files that have a setuid or setgid bits. 3. Call mnt_want_write() and mnt_drop_write() before and after ext4_move_extents() calling to get write access to a mount. Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ioctl.c | 30 ++++++++++++++++++------------ fs/ext4/move_extent.c | 7 +++++++ 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 31e5ee0c858f..b63d193126db 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -221,32 +221,38 @@ setversion_out: struct file *donor_filp; int err; + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + if (copy_from_user(&me, (struct move_extent __user *)arg, sizeof(me))) return -EFAULT; + me.moved_len = 0; donor_filp = fget(me.donor_fd); if (!donor_filp) return -EBADF; - if (!capable(CAP_DAC_OVERRIDE)) { - if ((current->real_cred->fsuid != inode->i_uid) || - !(inode->i_mode & S_IRUSR) || - !(donor_filp->f_dentry->d_inode->i_mode & - S_IRUSR)) { - fput(donor_filp); - return -EACCES; - } + if (!(donor_filp->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto mext_out; } - me.moved_len = 0; + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto mext_out; + err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); - fput(donor_filp); + mnt_drop_write(filp->f_path.mnt); + if (me.moved_len > 0) + file_remove_suid(donor_filp); if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) - return -EFAULT; - + err = -EFAULT; +mext_out: + fput(donor_filp); return err; } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index bcc41e63965c..f5b03a132a2a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -957,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { + ext4_debug("ext4 move extent: suid or sgid is set" + " to donor file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + /* Ext4 move extent does not support swapfile */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should " -- cgit v1.2.3 From abb247066f9769e90df87ab65e5c2bb4dbdb529c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 9 Dec 2009 21:30:02 -0500 Subject: ext4: Fix potential fiemap deadlock (mmap_sem vs. i_data_sem) (cherry picked from commit fab3a549e204172236779f502eccb4f9bf0dc87d) Fix the following potential circular locking dependency between mm->mmap_sem and ei->i_data_sem: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.32-04115-gec044c5 #37 ------------------------------------------------------- ureadahead/1855 is trying to acquire lock: (&mm->mmap_sem){++++++}, at: [] might_fault+0x5c/0xac but task is already holding lock: (&ei->i_data_sem){++++..}, at: [] ext4_fiemap+0x11b/0x159 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&ei->i_data_sem){++++..}: [] __lock_acquire+0xb67/0xd0f [] lock_acquire+0xdc/0x102 [] down_read+0x51/0x84 [] ext4_get_blocks+0x50/0x2a5 [] ext4_get_block+0xab/0xef [] do_mpage_readpage+0x198/0x48d [] mpage_readpages+0xd0/0x114 [] ext4_readpages+0x1d/0x1f [] __do_page_cache_readahead+0x12f/0x1bc [] ra_submit+0x21/0x25 [] filemap_fault+0x19f/0x32c [] __do_fault+0x55/0x3a2 [] handle_mm_fault+0x327/0x734 [] do_page_fault+0x292/0x2aa [] page_fault+0x25/0x30 [] clear_user+0x38/0x3c [] padzero+0x20/0x31 [] load_elf_binary+0x8bc/0x17ed [] search_binary_handler+0xc2/0x259 [] load_script+0x1b8/0x1cc [] search_binary_handler+0xc2/0x259 [] do_execve+0x1ce/0x2cf [] sys_execve+0x43/0x5a [] stub_execve+0x6a/0xc0 -> #0 (&mm->mmap_sem){++++++}: [] __lock_acquire+0xa11/0xd0f [] lock_acquire+0xdc/0x102 [] might_fault+0x89/0xac [] fiemap_fill_next_extent+0x95/0xda [] ext4_ext_fiemap_cb+0x138/0x157 [] ext4_ext_walk_space+0x178/0x1f1 [] ext4_fiemap+0x13c/0x159 [] do_vfs_ioctl+0x348/0x4d6 [] sys_ioctl+0x56/0x79 [] system_call_fastpath+0x16/0x1b other info that might help us debug this: 1 lock held by ureadahead/1855: #0: (&ei->i_data_sem){++++..}, at: [] ext4_fiemap+0x11b/0x159 stack backtrace: Pid: 1855, comm: ureadahead Not tainted 2.6.32-04115-gec044c5 #37 Call Trace: [] print_circular_bug+0xa8/0xb7 [] __lock_acquire+0xa11/0xd0f [] ? sched_clock+0x9/0xd [] lock_acquire+0xdc/0x102 [] ? might_fault+0x5c/0xac [] might_fault+0x89/0xac [] ? might_fault+0x5c/0xac [] ? __kmalloc+0x13b/0x18c [] fiemap_fill_next_extent+0x95/0xda [] ext4_ext_fiemap_cb+0x138/0x157 [] ? ext4_ext_fiemap_cb+0x0/0x157 [] ext4_ext_walk_space+0x178/0x1f1 [] ext4_fiemap+0x13c/0x159 [] ? might_fault+0x5c/0xac [] do_vfs_ioctl+0x348/0x4d6 [] ? __up_read+0x8d/0x95 [] ? retint_swapgs+0x13/0x1b [] sys_ioctl+0x56/0x79 [] system_call_fastpath+0x16/0x1b Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 43c04ff653c6..8b8bae4c0cf4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1761,7 +1761,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, while (block < last && block != EXT_MAX_BLOCK) { num = last - block; /* find extent for this block */ + down_read(&EXT4_I(inode)->i_data_sem); path = ext4_ext_find_extent(inode, block, path); + up_read(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -3730,10 +3732,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Walk the extent tree gathering extent information. * ext4_ext_fiemap_cb will push extents back to user. */ - down_read(&EXT4_I(inode)->i_data_sem); error = ext4_ext_walk_space(inode, start_blk, len_blks, ext4_ext_fiemap_cb, fieinfo); - up_read(&EXT4_I(inode)->i_data_sem); } return error; -- cgit v1.2.3 From b71a005bac02d0f68ba2d93f47cd58afcd563149 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 15 Nov 2009 15:31:37 -0500 Subject: jbd2: don't wipe the journal on a failed journal checksum commit e6a47428de84e19fda52f21ab73fde2906c40d09 upstream. If there is a failed journal checksum, don't reset the journal. This allows for userspace programs to decide how to recover from this situation. It may be that ignoring the journal checksum failure might be a better way of recovering the file system. Once we add per-block checksums, we can definitely do better. Until then, a system administrator can try backing up the file system image (or taking a snapshot) and and trying to determine experimentally whether ignoring the checksum failure or aborting the journal replay results in less data loss. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/jbd2/journal.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 82c295d2a7c5..b7ca3a92a4db 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1253,6 +1253,13 @@ int jbd2_journal_load(journal_t *journal) if (jbd2_journal_recover(journal)) goto recovery_error; + if (journal->j_failed_commit) { + printk(KERN_ERR "JBD2: journal transaction %u on %s " + "is corrupt.\n", journal->j_failed_commit, + journal->j_devname); + return -EIO; + } + /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ -- cgit v1.2.3 From 037b7867ec53baa0dea5d05d4201e4fbf2405fc3 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 14 Dec 2009 17:57:37 -0800 Subject: hfs: fix a potential buffer overflow commit ec81aecb29668ad71f699f4e7b96ec46691895b6 upstream. A specially-crafted Hierarchical File System (HFS) filesystem could cause a buffer overflow to occur in a process's kernel stack during a memcpy() call within the hfs_bnode_read() function (at fs/hfs/bnode.c:24). The attacker can provide the source buffer and length, and the destination buffer is a local variable of a fixed length. This local variable (passed as "&entry" from fs/hfs/dir.c:112 and allocated on line 60) is stored in the stack frame of hfs_bnode_read()'s caller, which is hfs_readdir(). Because the hfs_readdir() function executes upon any attempt to read a directory on the filesystem, it gets called whenever a user attempts to inspect any filesystem contents. [amwang@redhat.com: modify this patch and fix coding style problems] Signed-off-by: WANG Cong Cc: Eugene Teo Cc: Roman Zippel Cc: Al Viro Cc: Christoph Hellwig Cc: Alexey Dobriyan Cc: Dave Anderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/hfs/catalog.c | 4 ++++ fs/hfs/dir.c | 11 +++++++++++ fs/hfs/super.c | 7 ++++++- 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index 6d98f116ca03..424b0337f524 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -289,6 +289,10 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name, err = hfs_brec_find(&src_fd); if (err) goto out; + if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { + err = -EIO; + goto out; + } hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, src_fd.entrylength); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 7c69b98a2e45..2b3b8611b41b 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -79,6 +79,11 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) filp->f_pos++; /* fall through */ case 1: + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (entry.type != HFS_CDR_THD) { printk(KERN_ERR "hfs: bad catalog folder thread\n"); @@ -109,6 +114,12 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } + + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); type = entry.type; len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index f7fcbe49da72..5ed7252b7b23 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -409,8 +409,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) /* try to get the root inode */ hfs_find_init(HFS_SB(sb)->cat_tree, &fd); res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); - if (!res) + if (!res) { + if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { + res = -EIO; + goto bail; + } hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); + } if (res) { hfs_find_exit(&fd); goto bail_no_root; -- cgit v1.2.3 From e4f676cea796e170fcd8561c8e2a234724d05f35 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 10 Dec 2009 09:05:55 -0500 Subject: NFS: Fix nfs_migrate_page() commit 190f38e5cedc910940b1da9015f00458c18f97b4 upstream. The call to migrate_page() will cause the page->private field to be cleared. Also fix up the locking around the page->private transfer, so that we ensure that calls to nfs_page_find_request() don't end up racing. Finally, fix up a double free bug: nfs_unlock_request() already calls nfs_release_request() for us... Reported-by: Wu Fengguang Tested-by: Andi Kleen Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/write.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 53eb26c16b50..6fc37762c495 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1612,15 +1612,16 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, if (ret) goto out_unlock; page_cache_get(newpage); + spin_lock(&mapping->host->i_lock); req->wb_page = newpage; SetPagePrivate(newpage); - set_page_private(newpage, page_private(page)); + set_page_private(newpage, (unsigned long)req); ClearPagePrivate(page); set_page_private(page, 0); + spin_unlock(&mapping->host->i_lock); page_cache_release(page); out_unlock: nfs_clear_page_tag_locked(req); - nfs_release_request(req); out: return ret; } -- cgit v1.2.3 From 5451f13711a2af1fffdfe0d1e2faf02449fdab28 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 17 Nov 2009 18:35:43 -0800 Subject: devpts_get_tty() should validate inode commit edfacdd6f81119b9005615593f2cbd94b8c7e2d8 upstream. devpts_get_tty() assumes that the inode passed in is associated with a valid pty. But if the only reference to the pty is via a bind-mount, the inode passed to devpts_get_tty() while valid, would refer to a pty that no longer exists. With a lot of debug effort, Grzegorz Nosek developed a small program (see below) to reproduce a crash on recent kernels. This crash is a regression introduced by the commit: commit 527b3e4773628b30d03323a2cb5fb0d84441990f Author: Sukadev Bhattiprolu Date: Mon Oct 13 10:43:08 2008 +0100 To fix, ensure that the dentry associated with the inode has not yet been deleted/unhashed by devpts_pty_kill(). See also: https://lists.linux-foundation.org/pipermail/containers/2009-July/019273.html tty-bug.c: #define _GNU_SOURCE #include #include #include #include #include #include #include #include void dummy(int sig) { } static int child(void *unused) { int fd; signal(SIGINT, dummy); signal(SIGHUP, dummy); pause(); /* cheesy synchronisation to wait for /dev/pts/0 to appear */ mount("/dev/pts/0", "/dev/console", NULL, MS_BIND, NULL); sleep(2); fd = open("/dev/console", O_RDWR); dup(0); dup(0); write(1, "Hello world!\n", sizeof("Hello world!\n")-1); return 0; } int main(void) { pid_t pid; char *stack; stack = malloc(16384); pid = clone(child, stack+16384, CLONE_NEWNS|SIGCHLD, NULL); open("/dev/ptmx", O_RDWR|O_NOCTTY|O_NONBLOCK); unlockpt(fd); grantpt(fd); sleep(2); kill(pid, SIGHUP); sleep(1); return 0; /* exit before child opens /dev/console */ } Reported-by: Grzegorz Nosek Signed-off-by: Sukadev Bhattiprolu Tested-by: Serge Hallyn Signed-off-by: Greg Kroah-Hartman --- fs/devpts/inode.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index d5f8c96964be..8882ecc0f1bf 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -517,11 +517,23 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) { + struct dentry *dentry; + struct tty_struct *tty; + BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); + /* Ensure dentry has not been deleted by devpts_pty_kill() */ + dentry = d_find_alias(pts_inode); + if (!dentry) + return NULL; + + tty = NULL; if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) - return (struct tty_struct *)pts_inode->i_private; - return NULL; + tty = (struct tty_struct *)pts_inode->i_private; + + dput(dentry); + + return tty; } void devpts_pty_kill(struct tty_struct *tty) -- cgit v1.2.3 From 9f2813ec1f4868d07d8a16f4a9c4932819fc5d8e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 17 Nov 2009 14:40:26 -0800 Subject: debugfs: fix create mutex racy fops and private data commit d3a3b0adad0865c12e39b712ca89efbd0a3a0dbc upstream. Setting fops and private data outside of the mutex at debugfs file creation introduces a race where the files can be opened with the wrong file operations and private data. It is easy to trigger with a process waiting on file creation notification. Signed-off-by: Mathieu Desnoyers Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 55 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index d22438ef7674..39c6ee868d7b 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -32,7 +32,9 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; -static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) +static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev, + void *data, const struct file_operations *fops) + { struct inode *inode = new_inode(sb); @@ -44,14 +46,18 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d init_special_inode(inode, mode, dev); break; case S_IFREG: - inode->i_fop = &debugfs_file_operations; + inode->i_fop = fops ? fops : &debugfs_file_operations; + inode->i_private = data; break; case S_IFLNK: inode->i_op = &debugfs_link_operations; + inode->i_fop = fops; + inode->i_private = data; break; case S_IFDIR: inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + inode->i_fop = fops ? fops : &simple_dir_operations; + inode->i_private = data; /* directory inodes start off with i_nlink == 2 * (for "." entry) */ @@ -64,7 +70,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d /* SMP-safe */ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t dev) + int mode, dev_t dev, void *data, + const struct file_operations *fops) { struct inode *inode; int error = -EPERM; @@ -72,7 +79,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, if (dentry->d_inode) return -EEXIST; - inode = debugfs_get_inode(dir->i_sb, mode, dev); + inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops); if (inode) { d_instantiate(dentry, inode); dget(dentry); @@ -81,12 +88,13 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, return error; } -static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) { int res; mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; - res = debugfs_mknod(dir, dentry, mode, 0); + res = debugfs_mknod(dir, dentry, mode, 0, data, fops); if (!res) { inc_nlink(dir); fsnotify_mkdir(dir, dentry); @@ -94,18 +102,20 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return res; } -static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode) +static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) { mode = (mode & S_IALLUGO) | S_IFLNK; - return debugfs_mknod(dir, dentry, mode, 0); + return debugfs_mknod(dir, dentry, mode, 0, data, fops); } -static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode) +static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) { int res; mode = (mode & S_IALLUGO) | S_IFREG; - res = debugfs_mknod(dir, dentry, mode, 0); + res = debugfs_mknod(dir, dentry, mode, 0, data, fops); if (!res) fsnotify_create(dir, dentry); return res; @@ -139,7 +149,9 @@ static struct file_system_type debug_fs_type = { static int debugfs_create_by_name(const char *name, mode_t mode, struct dentry *parent, - struct dentry **dentry) + struct dentry **dentry, + void *data, + const struct file_operations *fops) { int error = 0; @@ -164,13 +176,16 @@ static int debugfs_create_by_name(const char *name, mode_t mode, if (!IS_ERR(*dentry)) { switch (mode & S_IFMT) { case S_IFDIR: - error = debugfs_mkdir(parent->d_inode, *dentry, mode); + error = debugfs_mkdir(parent->d_inode, *dentry, mode, + data, fops); break; case S_IFLNK: - error = debugfs_link(parent->d_inode, *dentry, mode); + error = debugfs_link(parent->d_inode, *dentry, mode, + data, fops); break; default: - error = debugfs_create(parent->d_inode, *dentry, mode); + error = debugfs_create(parent->d_inode, *dentry, mode, + data, fops); break; } dput(*dentry); @@ -221,19 +236,13 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode, if (error) goto exit; - error = debugfs_create_by_name(name, mode, parent, &dentry); + error = debugfs_create_by_name(name, mode, parent, &dentry, + data, fops); if (error) { dentry = NULL; simple_release_fs(&debugfs_mount, &debugfs_mount_count); goto exit; } - - if (dentry->d_inode) { - if (data) - dentry->d_inode->i_private = data; - if (fops) - dentry->d_inode->i_fop = fops; - } exit: return dentry; } -- cgit v1.2.3 From 62965d8347a071c72f4dae96e65be5fb68dd052f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 1 Dec 2009 16:53:06 +0100 Subject: ext3: Fix data / filesystem corruption when write fails to copy data commit 68eb3db08344286733adac48304d9fb7a0e53b27 upstream. When ext3_write_begin fails after allocating some blocks or generic_perform_write fails to copy data to write, we truncate blocks already instantiated beyond i_size. Although these blocks were never inside i_size, we have to truncate pagecache of these blocks so that corresponding buffers get unmapped. Otherwise subsequent __block_prepare_write (called because we are retrying the write) will find the buffers mapped, not call ->get_block, and thus the page will be backed by already freed blocks leading to filesystem and data corruption. Reported-by: James Y Knight Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext3/inode.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 354ed3b47b30..f9d69378caaa 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1151,6 +1151,16 @@ static int do_journal_get_write_access(handle_t *handle, return ext3_journal_get_write_access(handle, bh); } +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext3_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext3_truncate(inode); +} + static int ext3_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1209,7 +1219,7 @@ write_begin_failed: unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - ext3_truncate(inode); + ext3_truncate_failed_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1304,7 +1314,7 @@ static int ext3_ordered_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - ext3_truncate(inode); + ext3_truncate_failed_write(inode); return ret ? ret : copied; } @@ -1330,7 +1340,7 @@ static int ext3_writeback_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - ext3_truncate(inode); + ext3_truncate_failed_write(inode); return ret ? ret : copied; } @@ -1383,7 +1393,7 @@ static int ext3_journalled_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - ext3_truncate(inode); + ext3_truncate_failed_write(inode); return ret ? ret : copied; } -- cgit v1.2.3 From 3a03456dc34ff703e5b7c718b78713577f832116 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 16 Dec 2009 03:27:20 +0000 Subject: jffs2: Fix long-standing bug with symlink garbage collection. commit 2e16cfca6e17ae37ae21feca080a6f2eca9087dc upstream. Ever since jffs2_garbage_collect_metadata() was first half-written in February 2001, it's been broken on architectures where 'char' is signed. When garbage collecting a symlink with target length above 127, the payload length would end up negative, causing interesting and bad things to happen. Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- fs/jffs2/gc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 090c556ffed2..3b6f2fa12cff 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -700,7 +700,8 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_ struct jffs2_raw_inode ri; struct jffs2_node_frag *last_frag; union jffs2_device_node dev; - char *mdata = NULL, mdatalen = 0; + char *mdata = NULL; + int mdatalen = 0; uint32_t alloclen, ilen; int ret; -- cgit v1.2.3 From cd7bc18e0908a287e64cf4593a480b613b07fdce Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 3 Dec 2009 08:09:41 -0500 Subject: cifs: NULL out tcon, pSesInfo, and srvTcp pointers when chasing DFS referrals commit a2934c7b363ddcc001964f2444649f909e583bef upstream. The scenario is this: The kernel gets EREMOTE and starts chasing a DFS referral at mount time. The tcon reference is put, which puts the session reference too, but neither pointer is zeroed out. The mount gets retried (goto try_mount_again) with new mount info. Session setup fails fails and rc ends up being non-zero. The code then falls through to the end and tries to put the previously freed tcon pointer again. Oops at: cifs_put_smb_ses+0x14/0xd0 Fix this by moving the initialization of the rc variable and the tcon, pSesInfo and srvTcp pointers below the try_mount_again label. Also, add a FreeXid() before the goto to prevent xid "leaks". Signed-off-by: Jeff Layton Reported-by: Gustavo Carvalho Homem Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/connect.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 63ea83ff687f..3bbcaa716b3c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2287,12 +2287,12 @@ int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, char *mount_data_global, const char *devname) { - int rc = 0; + int rc; int xid; struct smb_vol *volume_info; - struct cifsSesInfo *pSesInfo = NULL; - struct cifsTconInfo *tcon = NULL; - struct TCP_Server_Info *srvTcp = NULL; + struct cifsSesInfo *pSesInfo; + struct cifsTconInfo *tcon; + struct TCP_Server_Info *srvTcp; char *full_path; char *mount_data = mount_data_global; #ifdef CONFIG_CIFS_DFS_UPCALL @@ -2301,6 +2301,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, int referral_walks_count = 0; try_mount_again: #endif + rc = 0; + tcon = NULL; + pSesInfo = NULL; + srvTcp = NULL; full_path = NULL; xid = GetXid(); @@ -2597,6 +2601,7 @@ remote_path_check: cleanup_volume_info(&volume_info); referral_walks_count++; + FreeXid(xid); goto try_mount_again; } #else /* No DFS support, return error on mount */ -- cgit v1.2.3 From 0aebc28083514af318c438fccc8a7ae23b4e1ab8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 30 Nov 2009 19:47:55 +0100 Subject: udf: Try harder when looking for VAT inode commit e971b0b9e0dd50d9ceecb67a6a6ab80a80906033 upstream. Some disks do not contain VAT inode in the last recorded block as required by the standard but a few blocks earlier (or the number of recorded blocks is wrong). So look for the VAT inode a bit before the end of the media. Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/udf/super.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index 9d1b8c2e6c45..1e4543cbcd27 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1078,21 +1078,39 @@ static int udf_fill_partdesc_info(struct super_block *sb, return 0; } -static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) +static void udf_find_vat_block(struct super_block *sb, int p_index, + int type1_index, sector_t start_block) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map = &sbi->s_partmaps[p_index]; + sector_t vat_block; struct kernel_lb_addr ino; + + /* + * VAT file entry is in the last recorded block. Some broken disks have + * it a few blocks before so try a bit harder... + */ + ino.partitionReferenceNum = type1_index; + for (vat_block = start_block; + vat_block >= map->s_partition_root && + vat_block >= start_block - 3 && + !sbi->s_vat_inode; vat_block--) { + ino.logicalBlockNum = vat_block - map->s_partition_root; + sbi->s_vat_inode = udf_iget(sb, &ino); + } +} + +static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map = &sbi->s_partmaps[p_index]; struct buffer_head *bh = NULL; struct udf_inode_info *vati; uint32_t pos; struct virtualAllocationTable20 *vat20; sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; - /* VAT file entry is in the last recorded block */ - ino.partitionReferenceNum = type1_index; - ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; - sbi->s_vat_inode = udf_iget(sb, &ino); + udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); if (!sbi->s_vat_inode && sbi->s_last_block != blocks - 1) { printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" @@ -1100,9 +1118,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) "block of the device (%lu).\n", (unsigned long)sbi->s_last_block, (unsigned long)blocks - 1); - ino.partitionReferenceNum = type1_index; - ino.logicalBlockNum = blocks - 1 - map->s_partition_root; - sbi->s_vat_inode = udf_iget(sb, &ino); + udf_find_vat_block(sb, p_index, type1_index, blocks - 1); } if (!sbi->s_vat_inode) return 1; -- cgit v1.2.3 From f07c88dd6cbcc8086e28759f3686068163d423ae Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 14 Dec 2009 15:21:12 +0300 Subject: Add unlocked version of inode_add_bytes() function commit b462707e7ccad058ae151e5c5b06eb5cadcb737f upstream. Quota code requires unlocked version of this function. Off course we can just copy-paste the code, but copy-pasting is always an evil. Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/stat.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/stat.c b/fs/stat.c index 075694e31d8b..c4ecd52c5737 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -401,9 +401,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, } #endif /* __ARCH_WANT_STAT64 */ -void inode_add_bytes(struct inode *inode, loff_t bytes) +/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ +void __inode_add_bytes(struct inode *inode, loff_t bytes) { - spin_lock(&inode->i_lock); inode->i_blocks += bytes >> 9; bytes &= 511; inode->i_bytes += bytes; @@ -411,6 +411,12 @@ void inode_add_bytes(struct inode *inode, loff_t bytes) inode->i_blocks++; inode->i_bytes -= 512; } +} + +void inode_add_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, bytes); spin_unlock(&inode->i_lock); } -- cgit v1.2.3 From bbf245072d81e512cc88535379ae6edb5d08f420 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 14 Dec 2009 15:21:13 +0300 Subject: quota: decouple fs reserved space from quota reservation commit fd8fbfc1709822bd94247c5b2ab15a5f5041e103 upstream. Currently inode_reservation is managed by fs itself and this reservation is transfered on dquot_transfer(). This means what inode_reservation must always be in sync with dquot->dq_dqb.dqb_rsvspace. Otherwise dquot_transfer() will result in incorrect quota(WARN_ON in dquot_claim_reserved_space() will be triggered) This is not easy because of complex locking order issues for example http://bugzilla.kernel.org/show_bug.cgi?id=14739 The patch introduce quota reservation field for each fs-inode (fs specific inode is used in order to prevent bloating generic vfs inode). This reservation is managed by quota code internally similar to i_blocks/i_bytes and may not be always in sync with internal fs reservation. Also perform some code rearrangement: - Unify dquot_reserve_space() and dquot_reserve_space() - Unify dquot_release_reserved_space() and dquot_free_space() - Also this patch add missing warning update to release_rsv() dquot_release_reserved_space() must call flush_warnings() as dquot_free_space() does. Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/quota/dquot.c | 213 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 119 insertions(+), 94 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 39b49c42a7ed..c4d07a803bd0 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1387,6 +1387,67 @@ void vfs_dq_drop(struct inode *inode) } EXPORT_SYMBOL(vfs_dq_drop); +/* + * inode_reserved_space is managed internally by quota, and protected by + * i_lock similar to i_blocks+i_bytes. + */ +static qsize_t *inode_reserved_space(struct inode * inode) +{ + /* Filesystem must explicitly define it's own method in order to use + * quota reservation interface */ + BUG_ON(!inode->i_sb->dq_op->get_reserved_space); + return inode->i_sb->dq_op->get_reserved_space(inode); +} + +static void inode_add_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + spin_unlock(&inode->i_lock); +} + + +static void inode_claim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); +} + +static void inode_sub_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + spin_unlock(&inode->i_lock); +} + +static qsize_t inode_get_rsv_space(struct inode *inode) +{ + qsize_t ret; + spin_lock(&inode->i_lock); + ret = *inode_reserved_space(inode); + spin_unlock(&inode->i_lock); + return ret; +} + +static void inode_incr_space(struct inode *inode, qsize_t number, + int reserve) +{ + if (reserve) + inode_add_rsv_space(inode, number); + else + inode_add_bytes(inode, number); +} + +static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) +{ + if (reserve) + inode_sub_rsv_space(inode, number); + else + inode_sub_bytes(inode, number); +} + /* * Following four functions update i_blocks+i_bytes fields and * quota information (together with appropriate checks) @@ -1405,6 +1466,21 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int cnt, ret = QUOTA_OK; char warntype[MAXQUOTAS]; + /* + * First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex + */ + if (IS_NOQUOTA(inode)) { + inode_incr_space(inode, number, reserve); + goto out; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) { + inode_incr_space(inode, number, reserve); + goto out_unlock; + } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1415,7 +1491,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) == NO_QUOTA) { ret = NO_QUOTA; - goto out_unlock; + spin_unlock(&dq_data_lock); + goto out_flush_warn; } } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1426,64 +1503,32 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, else dquot_incr_space(inode->i_dquot[cnt], number); } - if (!reserve) - inode_add_bytes(inode, number); -out_unlock: + inode_incr_space(inode, number, reserve); spin_unlock(&dq_data_lock); - flush_warnings(inode->i_dquot, warntype); - return ret; -} - -int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) -{ - int cnt, ret = QUOTA_OK; - - /* - * First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex - */ - if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); - goto out; - } - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); - goto out_unlock; - } - - ret = __dquot_alloc_space(inode, number, warn, 0); - if (ret == NO_QUOTA) - goto out_unlock; + if (reserve) + goto out_flush_warn; /* Dirtify all the dquots - this can block when journalling */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (inode->i_dquot[cnt]) mark_dquot_dirty(inode->i_dquot[cnt]); +out_flush_warn: + flush_warnings(inode->i_dquot, warntype); out_unlock: up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); out: return ret; } + +int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) +{ + return __dquot_alloc_space(inode, number, warn, 0); +} EXPORT_SYMBOL(dquot_alloc_space); int dquot_reserve_space(struct inode *inode, qsize_t number, int warn) { - int ret = QUOTA_OK; - - if (IS_NOQUOTA(inode)) - goto out; - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) - goto out_unlock; - - ret = __dquot_alloc_space(inode, number, warn, 1); -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return __dquot_alloc_space(inode, number, warn, 1); } EXPORT_SYMBOL(dquot_reserve_space); @@ -1540,14 +1585,14 @@ int dquot_claim_space(struct inode *inode, qsize_t number) int ret = QUOTA_OK; if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); goto out; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); if (IS_NOQUOTA(inode)) { up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); goto out; } @@ -1559,7 +1604,7 @@ int dquot_claim_space(struct inode *inode, qsize_t number) number); } /* Update inode bytes */ - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); spin_unlock(&dq_data_lock); /* Dirtify all the dquots - this can block when journalling */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) @@ -1571,39 +1616,10 @@ out: } EXPORT_SYMBOL(dquot_claim_space); -/* - * Release reserved quota space - */ -void dquot_release_reserved_space(struct inode *inode, qsize_t number) -{ - int cnt; - - if (IS_NOQUOTA(inode)) - goto out; - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) - goto out_unlock; - - spin_lock(&dq_data_lock); - /* Release reserved dquots */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt]) - dquot_free_reserved_space(inode->i_dquot[cnt], number); - } - spin_unlock(&dq_data_lock); - -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return; -} -EXPORT_SYMBOL(dquot_release_reserved_space); - /* * This operation can block, but only after everything is updated */ -int dquot_free_space(struct inode *inode, qsize_t number) +int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) { unsigned int cnt; char warntype[MAXQUOTAS]; @@ -1612,7 +1628,7 @@ int dquot_free_space(struct inode *inode, qsize_t number) * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) { out_sub: - inode_sub_bytes(inode, number); + inode_decr_space(inode, number, reserve); return QUOTA_OK; } @@ -1627,20 +1643,42 @@ out_sub: if (!inode->i_dquot[cnt]) continue; warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); - dquot_decr_space(inode->i_dquot[cnt], number); + if (reserve) + dquot_free_reserved_space(inode->i_dquot[cnt], number); + else + dquot_decr_space(inode->i_dquot[cnt], number); } - inode_sub_bytes(inode, number); + inode_decr_space(inode, number, reserve); spin_unlock(&dq_data_lock); + + if (reserve) + goto out_unlock; /* Dirtify all the dquots - this can block when journalling */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (inode->i_dquot[cnt]) mark_dquot_dirty(inode->i_dquot[cnt]); +out_unlock: flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; } + +int dquot_free_space(struct inode *inode, qsize_t number) +{ + return __dquot_free_space(inode, number, 0); +} EXPORT_SYMBOL(dquot_free_space); +/* + * Release reserved quota space + */ +void dquot_release_reserved_space(struct inode *inode, qsize_t number) +{ + __dquot_free_space(inode, number, 1); + +} +EXPORT_SYMBOL(dquot_release_reserved_space); + /* * This operation can block, but only after everything is updated */ @@ -1678,19 +1716,6 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) } EXPORT_SYMBOL(dquot_free_inode); -/* - * call back function, get reserved quota space from underlying fs - */ -qsize_t dquot_get_reserved_space(struct inode *inode) -{ - qsize_t reserved_space = 0; - - if (sb_any_quota_active(inode->i_sb) && - inode->i_sb->dq_op->get_reserved_space) - reserved_space = inode->i_sb->dq_op->get_reserved_space(inode); - return reserved_space; -} - /* * Transfer the number of inode and blocks from one diskquota to an other. * @@ -1734,7 +1759,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } spin_lock(&dq_data_lock); cur_space = inode_get_bytes(inode); - rsv_space = dquot_get_reserved_space(inode); + rsv_space = inode_get_rsv_space(inode); space = cur_space + rsv_space; /* Build the transfer_from list and check the limits */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -- cgit v1.2.3 From dbe5cc0045fafc8fc452fa90dba93ad957494e14 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 14 Dec 2009 15:21:14 +0300 Subject: ext4: Convert to generic reserved quota's space management. commit a9e7f4472075fb6937c545af3f6329e9946bbe66 upstream. This patch also fixes write vs chown race condition. Acked-by: "Theodore Ts'o" Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 6 +++++- fs/ext4/inode.c | 16 +++++++--------- fs/ext4/super.c | 5 +++++ 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bd2a9dd1464c..d0a2afbab009 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -698,6 +698,10 @@ struct ext4_inode_info { __u16 i_extra_isize; spinlock_t i_block_reservation_lock; +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif /* completed async DIOs that might need unwritten extents handling */ struct list_head i_aio_dio_complete_list; @@ -1432,7 +1436,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); -extern qsize_t ext4_get_reserved_space(struct inode *inode); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int flush_aio_dio_completed_IO(struct inode *inode); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1dae9a4dbb03..374d38cdb21c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1045,17 +1045,12 @@ out: return err; } -qsize_t ext4_get_reserved_space(struct inode *inode) +#ifdef CONFIG_QUOTA +qsize_t *ext4_get_reserved_space(struct inode *inode) { - unsigned long long total; - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - total = EXT4_I(inode)->i_reserved_data_blocks + - EXT4_I(inode)->i_reserved_meta_blocks; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - - return (total << inode->i_blkbits); + return &EXT4_I(inode)->i_reserved_quota; } +#endif /* * Calculate the number of metadata blocks need to reserve * to allocate @blocks for non extent file based file @@ -4850,6 +4845,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); ei->i_disksize = inode->i_size; +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9ae521749b90..92943f2ca2ab 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -704,6 +704,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_allocated_meta_blocks = 0; ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; @@ -1001,7 +1004,9 @@ static const struct dquot_operations ext4_quota_operations = { .reserve_space = dquot_reserve_space, .claim_space = dquot_claim_space, .release_rsv = dquot_release_reserved_space, +#ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, +#endif .alloc_inode = dquot_alloc_inode, .free_space = dquot_free_space, .free_inode = dquot_free_inode, -- cgit v1.2.3 From b2dbc4634227483a44732e73801b804915f04969 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Thu, 10 Dec 2009 16:36:27 +0000 Subject: ext4: fix sleep inside spinlock issue with quota and dealloc (#14739) commit 39bc680a8160bb9d6743f7873b535d553ff61058 upstream. Unlock i_block_reservation_lock before vfs_dq_reserve_block(). This patch fixes http://bugzilla.kernel.org/show_bug.cgi?id=14739 Cc: Theodore Ts'o Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 374d38cdb21c..e233879ebbcb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1853,19 +1853,17 @@ repeat: md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; total = md_needed + nrblocks; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); /* * Make quota reservation here to prevent quota overflow * later. Real quota accounting is done at pages writeout * time. */ - if (vfs_dq_reserve_block(inode, total)) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + if (vfs_dq_reserve_block(inode, total)) return -EDQUOT; - } if (ext4_claim_free_blocks(sbi, total)) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); vfs_dq_release_reservation_block(inode, total); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); @@ -1873,10 +1871,11 @@ repeat: } return -ENOSPC; } + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_reserved_data_blocks += nrblocks; - EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; - + EXT4_I(inode)->i_reserved_meta_blocks += md_needed; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return 0; /* success */ } -- cgit v1.2.3 From 84d330ec80dc467baf6cb393d9c1ee006d1c024a Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Tue, 29 Dec 2009 14:50:19 -0600 Subject: generic_permission: MAY_OPEN is not write access commit 7ea6600148c265b1fd53e521022b1d7aec81d974 upstream. generic_permission was refusing CAP_DAC_READ_SEARCH-enabled processes from opening DAC-protected files read-only, because do_filp_open adds MAY_OPEN to the open mask. Ignore MAY_OPEN. After this patch, CAP_DAC_READ_SEARCH is again sufficient to open(fname, O_RDONLY) on a file to which DAC otherwise refuses us read permission. Reported-by: Mike Kazantsev Signed-off-by: Serge E. Hallyn Tested-by: Mike Kazantsev Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index d11f404667e9..a2b3c28a499d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -234,6 +234,7 @@ int generic_permission(struct inode *inode, int mask, /* * Searching includes executable on directories, else just read. */ + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) if (capable(CAP_DAC_READ_SEARCH)) return 0; -- cgit v1.2.3 From e4dd8ca5be771aa706b0797b3284328bb595bfa1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 16 Dec 2009 08:23:37 -0800 Subject: fasync: split 'fasync_helper()' into separate add/remove functions commit 53281b6d34d44308372d16acb7fb5327609f68b6 upstream. Yes, the add and remove cases do share the same basic loop and the locking, but the compiler can inline and then CSE some of the end result anyway. And splitting it up makes the code way easier to follow, and makes it clearer exactly what the semantics are. In particular, we must make sure that the FASYNC flag in file->f_flags exactly matches the state of "is this file on any fasync list", since not only is that flag visible to user space (F_GETFL), but we also use that flag to check whether we need to remove any fasync entries on file close. We got that wrong for the case of a mixed use of file locking (which tries to remove any fasync entries for file leases) and fasync. Splitting the function up also makes it possible to do some future optimizations without making the function even messier. In particular, since the FASYNC flag has to match the state of "is this on a list", we can do the following future optimizations: - on remove, we don't even need to get the locks and traverse the list if FASYNC isn't set, since we can know a priori that there is no point (this is effectively the same optimization that we already do in __fput() wrt removing fasync on file close) - on add, we can use the FASYNC flag to decide whether we are changing an existing entry or need to allocate a new one. but this is just the cleanup + fix for the FASYNC flag. Acked-by: Al Viro Tested-by: Tavis Ormandy Cc: Jeff Dike Cc: Matt Mackall Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/fcntl.c | 102 +++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 2cf93ec40a67..97e01dc0d95f 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -618,60 +618,90 @@ static DEFINE_RWLOCK(fasync_lock); static struct kmem_cache *fasync_cache __read_mostly; /* - * fasync_helper() is used by almost all character device drivers - * to set up the fasync queue. It returns negative on error, 0 if it did - * no changes and positive if it added/deleted the entry. + * Remove a fasync entry. If successfully removed, return + * positive and clear the FASYNC flag. If no entry exists, + * do nothing and return 0. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". + * + * We always take the 'filp->f_lock', in since fasync_lock + * needs to be irq-safe. */ -int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) +static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *fa, **fp; - struct fasync_struct *new = NULL; int result = 0; - if (on) { - new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); - if (!new) - return -ENOMEM; + spin_lock(&filp->f_lock); + write_lock_irq(&fasync_lock); + for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { + if (fa->fa_file != filp) + continue; + *fp = fa->fa_next; + kmem_cache_free(fasync_cache, fa); + filp->f_flags &= ~FASYNC; + result = 1; + break; } + write_unlock_irq(&fasync_lock); + spin_unlock(&filp->f_lock); + return result; +} + +/* + * Add a fasync entry. Return negative on error, positive if + * added, and zero if did nothing but change an existing one. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". + */ +static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) +{ + struct fasync_struct *new, *fa, **fp; + int result = 0; + + new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); + if (!new) + return -ENOMEM; - /* - * We need to take f_lock first since it's not an IRQ-safe - * lock. - */ spin_lock(&filp->f_lock); write_lock_irq(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { - if (fa->fa_file == filp) { - if(on) { - fa->fa_fd = fd; - kmem_cache_free(fasync_cache, new); - } else { - *fp = fa->fa_next; - kmem_cache_free(fasync_cache, fa); - result = 1; - } - goto out; - } + if (fa->fa_file != filp) + continue; + fa->fa_fd = fd; + kmem_cache_free(fasync_cache, new); + goto out; } - if (on) { - new->magic = FASYNC_MAGIC; - new->fa_file = filp; - new->fa_fd = fd; - new->fa_next = *fapp; - *fapp = new; - result = 1; - } + new->magic = FASYNC_MAGIC; + new->fa_file = filp; + new->fa_fd = fd; + new->fa_next = *fapp; + *fapp = new; + result = 1; + filp->f_flags |= FASYNC; + out: - if (on) - filp->f_flags |= FASYNC; - else - filp->f_flags &= ~FASYNC; write_unlock_irq(&fasync_lock); spin_unlock(&filp->f_lock); return result; } +/* + * fasync_helper() is used by almost all character device drivers + * to set up the fasync queue, and for regular files by the file + * lease code. It returns negative on error, 0 if it did no changes + * and positive if it added/deleted the entry. + */ +int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) +{ + if (!on) + return fasync_remove_entry(filp, fapp); + return fasync_add_entry(fd, filp, fapp); +} + EXPORT_SYMBOL(fasync_helper); void __kill_fasync(struct fasync_struct *fa, int sig, int band) -- cgit v1.2.3 From 24488113f0ee3d3fdb64734cb2364cbe2ca1ba33 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 27 Dec 2009 17:01:42 +0200 Subject: exofs: simple_write_end does not mark_inode_dirty commit efd124b999fb4d426b30675f1684521af0872789 upstream. exofs uses simple_write_end() for it's .write_end handler. But it is not enough because simple_write_end() does not call mark_inode_dirty() when it extends i_size. So even if we do call mark_inode_dirty at beginning of write out, with a very long IO and a saturated system we might get the .write_inode() called while still extend-writing to file and miss out on the last i_size updates. So override .write_end, call simple_write_end(), and afterwords if i_size was changed call mark_inode_dirty(). It stands to logic that since simple_write_end() was the one extending i_size it should also call mark_inode_dirty(). But it looks like all users of simple_write_end() are memory-bound pseudo filesystems, who could careless about mark_inode_dirty(). I might submit a warning-comment patch to simple_write_end() in future. Signed-off-by: Boaz Harrosh Signed-off-by: Greg Kroah-Hartman --- fs/exofs/inode.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 6c10f7476699..6f7df0f7d282 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -731,13 +731,28 @@ static int exofs_write_begin_export(struct file *file, fsdata); } +static int exofs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + /* According to comment in simple_write_end i_mutex is held */ + loff_t i_size = inode->i_size; + int ret; + + ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); + if (i_size != inode->i_size) + mark_inode_dirty(inode); + return ret; +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, .writepage = exofs_writepage, .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, - .write_end = simple_write_end, + .write_end = exofs_write_end, }; /****************************************************************************** -- cgit v1.2.3 From 3aafc557edb43aa82d9a5651f071d9dae7a2638b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Dec 2009 17:44:45 +0100 Subject: nfsd: make sure data is on disk before calling ->fsync commit 7211a4e859ad070b28545c06e0a6cb60b3b8aa31 upstream. nfsd is not using vfs_fsync, so I missed it when changing the calling convention during the 2.6.32 window. This patch fixes it to not only start the data writeout, but also wait for it to complete before calling into ->fsync. Signed-off-by: Christoph Hellwig Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/vfs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a293f0273263..570dd1c103d5 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -774,12 +774,9 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp, int (*fsync) (struct file *, struct dentry *, int); int err; - err = filemap_fdatawrite(inode->i_mapping); + err = filemap_write_and_wait(inode->i_mapping); if (err == 0 && fop && (fsync = fop->fsync)) err = fsync(filp, dp, 0); - if (err == 0) - err = filemap_fdatawait(inode->i_mapping); - return err; } -- cgit v1.2.3 From 38c4d8d579c4e752e03edcdd2e96a49787d71a3a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Jan 2010 18:03:36 +0100 Subject: quota: Fix dquot_transfer for filesystems different from ext4 commit 05b5d898235401c489c68e1f3bc5706a29ad5713 upstream. Commit fd8fbfc1 modified the way we find amount of reserved space belonging to an inode. The amount of reserved space is checked from dquot_transfer and thus inode_reserved_space gets called even for filesystems that don't provide get_reserved_space callback which results in a BUG. Fix the problem by checking get_reserved_space callback and return 0 if the filesystem does not provide it. CC: Dmitry Monakhov Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/quota/dquot.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index c4d07a803bd0..253498739978 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1425,6 +1425,9 @@ static void inode_sub_rsv_space(struct inode *inode, qsize_t number) static qsize_t inode_get_rsv_space(struct inode *inode) { qsize_t ret; + + if (!inode->i_sb->dq_op->get_reserved_space) + return 0; spin_lock(&inode->i_lock); ret = *inode_reserved_space(inode); spin_unlock(&inode->i_lock); -- cgit v1.2.3 From cec3ad600c2395d2d018332dadea9b2fe33e6169 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 15 Jan 2010 12:12:24 -0500 Subject: inotify: do not reuse watch descriptors commit 9e572cc9877ee6c43af60778f6b8d5ba0692d935 upstream. Since commit 7e790dd5fc937bc8d2400c30a05e32a9e9eef276 ("inotify: fix error paths in inotify_update_watch") inotify changed the manor in which it gave watch descriptors back to userspace. Previous to this commit inotify acted like the following: inotify_add_watch(X, Y, Z) = 1 inotify_rm_watch(X, 1); inotify_add_watch(X, Y, Z) = 2 but after this patch inotify would return watch descriptors like so: inotify_add_watch(X, Y, Z) = 1 inotify_rm_watch(X, 1); inotify_add_watch(X, Y, Z) = 1 which I saw as equivalent to opening an fd where open(file) = 1; close(1); open(file) = 1; seemed perfectly reasonable. The issue is that quite a bit of userspace apparently relies on the behavior in which watch descriptors will not be quickly reused. KDE relies on it, I know some selinux packages rely on it, and I have heard complaints from other random sources such as debian bug 558981. Although the man page implies what we do is ok, we broke userspace so this patch almost reverts us to the old behavior. It is still slightly racey and I have patches that would fix that, but they are rather large and this will fix it for all real world cases. The race is as follows: - task1 creates a watch and blocks in idr_new_watch() before it updates the hint. - task2 creates a watch and updates the hint. - task1 updates the hint with it's older wd - task removes the watch created by task2 - task adds a new watch and will reuse the wd originally given to task2 it requires moving some locking around the hint (last_wd) but this should solve it for the real world and be -stable safe. As a side effect this patch papers over a bug in the lib/idr code which is causing a large number WARN's to pop on people's system and many reports in kerneloops.org. I'm working on the root cause of that idr bug seperately but this should make inotify immune to that issue. Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/notify/inotify/inotify_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index dcd2040d330c..ca44337b06ce 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -558,7 +558,7 @@ retry: spin_lock(&group->inotify_data.idr_lock); ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, - group->inotify_data.last_wd, + group->inotify_data.last_wd+1, &tmp_ientry->wd); spin_unlock(&group->inotify_data.idr_lock); if (ret) { @@ -638,7 +638,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 1; + group->inotify_data.last_wd = 0; group->inotify_data.user = user; group->inotify_data.fa = NULL; -- cgit v1.2.3 From 800c02837ef72e60fd0d61ed3c791263d9d0dc36 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 15 Jan 2010 12:12:25 -0500 Subject: inotify: only warn once for inotify problems commit 976ae32be45a736acd49215a7e4771ff91f161c3 upstream. inotify will WARN() if it finds that the idr and the fsnotify internals somehow got out of sync. It was only supposed to do this once but due to this stupid bug it would warn every single time a problem was detected. Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/notify/inotify/inotify_fsnotify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index c9ee67b442e1..1afb0a10229f 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -121,7 +121,7 @@ static int idr_callback(int id, void *p, void *data) if (warned) return 0; - warned = false; + warned = true; entry = p; ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); -- cgit v1.2.3 From 781d5c42490436e4e0b6b624088b109695cf6b00 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 17 Dec 2009 15:27:06 -0800 Subject: reiserfs: truncate blocks not used by a write commit ec8e2f7466ca370f5e09000ca40a71759afc9ac8 upstream. It can happen that write does not use all the blocks allocated in write_begin either because of some filesystem error (like ENOSPC) or because page with data to write has been removed from memory. We truncate these blocks so that we don't have dangling blocks beyond i_size. Cc: Jeff Mahoney Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/inode.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index a14d6cd9eeda..d240c15bf086 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2531,6 +2531,12 @@ static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) return reiserfs_write_full_page(page, wbc); } +static void reiserfs_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + reiserfs_truncate_file(inode, 0); +} + static int reiserfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -2597,6 +2603,8 @@ static int reiserfs_write_begin(struct file *file, if (ret) { unlock_page(page); page_cache_release(page); + /* Truncate allocated blocks */ + reiserfs_truncate_failed_write(inode); } return ret; } @@ -2689,8 +2697,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, ** transaction tracking stuff when the size changes. So, we have ** to do the i_size updates here. */ - pos += copied; - if (pos > inode->i_size) { + if (pos + copied > inode->i_size) { struct reiserfs_transaction_handle myth; reiserfs_write_lock(inode->i_sb); /* If the file have grown beyond the border where it @@ -2708,7 +2715,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, goto journal_error; } reiserfs_update_inode_transaction(inode); - inode->i_size = pos; + inode->i_size = pos + copied; /* * this will just nest into our transaction. It's important * to use mark_inode_dirty so the inode gets pushed around on the @@ -2735,6 +2742,10 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, out: unlock_page(page); page_cache_release(page); + + if (pos + len > inode->i_size) + reiserfs_truncate_failed_write(inode); + return ret == 0 ? copied : ret; journal_error: -- cgit v1.2.3 From 36212162584e722f432f531d7611099e31fe9f3e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Jan 2010 12:34:32 +0300 Subject: ecryptfs: use after free commit ece550f51ba175c14ec3ec047815927d7386ea1f upstream. The "full_alg_name" variable is used on a couple error paths, so we shouldn't free it until the end. Signed-off-by: Dan Carpenter Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index fbb6e5eed697..7cb0a59f4b9d 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1748,7 +1748,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, char *cipher_name, size_t *key_size) { char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; - char *full_alg_name; + char *full_alg_name = NULL; int rc; *key_tfm = NULL; @@ -1763,7 +1763,6 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, if (rc) goto out; *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); - kfree(full_alg_name); if (IS_ERR(*key_tfm)) { rc = PTR_ERR(*key_tfm); printk(KERN_ERR "Unable to allocate crypto cipher with name " @@ -1786,6 +1785,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, goto out; } out: + kfree(full_alg_name); return rc; } -- cgit v1.2.3 From 8c535426188c312914bebec8158324dbdbdca2d3 Mon Sep 17 00:00:00 2001 From: Erez Zadok Date: Thu, 3 Dec 2009 13:35:27 -0500 Subject: ecryptfs: initialize private persistent file before dereferencing pointer commit e27759d7a333d1f25d628c4f7caf845c51be51c2 upstream. Ecryptfs_open dereferences a pointer to the private lower file (the one stored in the ecryptfs inode), without checking if the pointer is NULL. Right afterward, it initializes that pointer if it is NULL. Swap order of statements to first initialize. Bug discovered by Duckjin Kang. Signed-off-by: Duckjin Kang Signed-off-by: Erez Zadok Cc: Dustin Kirkland Cc: Al Viro Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 9e944057001b..1744f17ce96e 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -191,13 +191,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) | ECRYPTFS_ENCRYPTED); } mutex_unlock(&crypt_stat->cs_mutex); - if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) - && !(file->f_flags & O_RDONLY)) { - rc = -EPERM; - printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " - "file must hence be opened RO\n", __func__); - goto out; - } if (!ecryptfs_inode_to_private(inode)->lower_file) { rc = ecryptfs_init_persistent_file(ecryptfs_dentry); if (rc) { @@ -208,6 +201,13 @@ static int ecryptfs_open(struct inode *inode, struct file *file) goto out; } } + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) + && !(file->f_flags & O_RDONLY)) { + rc = -EPERM; + printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " + "file must hence be opened RO\n", __func__); + goto out; + } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { -- cgit v1.2.3 From 4bc685e6b0d123ceca7bccb05ebce2e5e6a5ba21 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 18 Dec 2009 21:18:15 +0100 Subject: vfs: get_sb_single() - do not pass options twice commit 9329d1beaeed1a94f030c784dcec5ff973f402c4 upstream. Filesystem code usually destroys the option buffer while parsing it. This leads to errors when the same buffer is passed twice. In case we fill a new superblock do not call remount. This is needed to quite a warning that the debugfs code causes every boot. Cc: Miklos Szeredi Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 19eb70b374bc..aff046b0fe78 100644 --- a/fs/super.c +++ b/fs/super.c @@ -901,8 +901,9 @@ int get_sb_single(struct file_system_type *fs_type, return error; } s->s_flags |= MS_ACTIVE; + } else { + do_remount_sb(s, flags, data, 0); } - do_remount_sb(s, flags, data, 0); simple_set_mnt(mnt, s); return 0; } -- cgit v1.2.3 From 6f8de29a8ef0abd5800589f3f8f4927b9a72739a Mon Sep 17 00:00:00 2001 From: Karel Zak Date: Mon, 23 Nov 2009 09:29:58 +0100 Subject: partitions: read whole sector with EFI GPT header commit 87038c2d5bda2418fda8b1456a0ae81cc3ff5bd8 upstream. The size of EFI GPT header is not static, but whole sector is allocated for the header. The HeaderSize field must be greater than 92 (= sizeof(struct gpt_header) and must be less than or equal to the logical block size. It means we have to read whole sector with the header, because the header crc32 checksum is calculated according to HeaderSize. For more details see UEFI standard (version 2.3, May 2009): - 5.3.1 GUID Format overview, page 93 - Table 13. GUID Partition Table Header, page 96 Signed-off-by: Karel Zak Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/partitions/efi.c | 7 ++++--- fs/partitions/efi.h | 8 ++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 038a6022152f..8e5e454d94e2 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -257,15 +257,16 @@ static gpt_header * alloc_read_gpt_header(struct block_device *bdev, u64 lba) { gpt_header *gpt; + unsigned ssz = bdev_logical_block_size(bdev); + if (!bdev) return NULL; - gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL); + gpt = kzalloc(ssz, GFP_KERNEL); if (!gpt) return NULL; - if (read_lba(bdev, lba, (u8 *) gpt, - sizeof (gpt_header)) < sizeof (gpt_header)) { + if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { kfree(gpt); gpt=NULL; return NULL; diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h index 2cc89d0475bf..6998b589abf9 100644 --- a/fs/partitions/efi.h +++ b/fs/partitions/efi.h @@ -37,7 +37,6 @@ #define EFI_PMBR_OSTYPE_EFI 0xEF #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE -#define GPT_BLOCK_SIZE 512 #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL #define GPT_HEADER_REVISION_V1 0x00010000 #define GPT_PRIMARY_PARTITION_TABLE_LBA 1 @@ -79,7 +78,12 @@ typedef struct _gpt_header { __le32 num_partition_entries; __le32 sizeof_partition_entry; __le32 partition_entry_array_crc32; - u8 reserved2[GPT_BLOCK_SIZE - 92]; + + /* The rest of the logical block is reserved by UEFI and must be zero. + * EFI standard handles this by: + * + * uint8_t reserved2[ BlockSize - 92 ]; + */ } __attribute__ ((packed)) gpt_header; typedef struct _gpt_entry_attributes { -- cgit v1.2.3 From e96610c64809b253558c3706b02fe173b0b829d8 Mon Sep 17 00:00:00 2001 From: Karel Zak Date: Mon, 23 Nov 2009 09:29:13 +0100 Subject: partitions: use sector size for EFI GPT commit 7d13af3279985f554784a45cc961f706dbcdbdd1 upstream. Currently, kernel uses strictly 512-byte sectors for EFI GPT parsing. That's wrong. UEFI standard (version 2.3, May 2009, 5.3.1 GUID Format overview, page 95) defines that LBA is always based on the logical block size. It means bdev_logical_block_size() (aka BLKSSZGET) for Linux. This patch removes static sector size from EFI GPT parser. The problem is reproducible with the latest GNU Parted: # modprobe scsi_debug dev_size_mb=50 sector_size=4096 # ./parted /dev/sdb print Model: Linux scsi_debug (scsi) Disk /dev/sdb: 52.4MB Sector size (logical/physical): 4096B/4096B Partition Table: gpt Number Start End Size File system Name Flags 1 24.6kB 3002kB 2978kB primary 2 3002kB 6001kB 2998kB primary 3 6001kB 9003kB 3002kB primary # blockdev --rereadpt /dev/sdb # dmesg | tail -1 sdb: unknown partition table <---- !!! with this patch: # blockdev --rereadpt /dev/sdb # dmesg | tail -1 sdb: sdb1 sdb2 sdb3 Signed-off-by: Karel Zak Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/partitions/efi.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 8e5e454d94e2..49cfd5f54238 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -1,7 +1,9 @@ /************************************************************ * EFI GUID Partition Table handling - * Per Intel EFI Specification v1.02 - * http://developer.intel.com/technology/efi/efi.htm + * + * http://www.uefi.org/specs/ + * http://www.intel.com/technology/efi/ + * * efi.[ch] by Matt Domsch * Copyright 2000,2001,2002,2004 Dell Inc. * @@ -92,6 +94,7 @@ * ************************************************************/ #include +#include #include "check.h" #include "efi.h" @@ -141,7 +144,8 @@ last_lba(struct block_device *bdev) { if (!bdev || !bdev->bd_inode) return 0; - return (bdev->bd_inode->i_size >> 9) - 1ULL; + return div_u64(bdev->bd_inode->i_size, + bdev_logical_block_size(bdev)) - 1ULL; } static inline int @@ -188,6 +192,7 @@ static size_t read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) { size_t totalreadcount = 0; + sector_t n = lba * (bdev_logical_block_size(bdev) / 512); if (!bdev || !buffer || lba > last_lba(bdev)) return 0; @@ -195,7 +200,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) while (count) { int copied = 512; Sector sect; - unsigned char *data = read_dev_sector(bdev, lba++, §); + unsigned char *data = read_dev_sector(bdev, n++, §); if (!data) break; if (copied > count) @@ -602,6 +607,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; + unsigned ssz = bdev_logical_block_size(bdev) / 512; if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -612,13 +618,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) pr_debug("GUID Partition Table is valid! Yea!\n"); for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { + u64 start = le64_to_cpu(ptes[i].starting_lba); + u64 size = le64_to_cpu(ptes[i].ending_lba) - + le64_to_cpu(ptes[i].starting_lba) + 1ULL; + if (!is_pte_valid(&ptes[i], last_lba(bdev))) continue; - put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba), - (le64_to_cpu(ptes[i].ending_lba) - - le64_to_cpu(ptes[i].starting_lba) + - 1ULL)); + put_partition(state, i+1, start * ssz, size * ssz); /* If this is a RAID volume, tell md */ if (!efi_guidcmp(ptes[i].partition_type_guid, -- cgit v1.2.3 From 34911bf2037c212644cc6b3e2e3207af4e763df7 Mon Sep 17 00:00:00 2001 From: Frank Filz Date: Wed, 21 Oct 2009 16:45:02 -0700 Subject: nfsd: Fix sort_pacl in fs/nfsd/nf4acl.c to actually sort groups commit aba24d71580180dfdf6a1a83a5858a1c048fd785 upstream. We have been doing some extensive testing of Linux support for ACLs on NFDS v4. We have noticed that the server rejects ACLs where the groups are out of order, for example, the following ACL is rejected: A::OWNER@:rwaxtTcCy A::user101@domain:rwaxtcy A::GROUP@:rwaxtcy A:g:group102@domain:rwaxtcy A:g:group101@domain:rwaxtcy A::EVERYONE@:rwaxtcy Examining the server code, I found that after converting an NFS v4 ACL to POSIX, sort_pacl is called to sort the user ACEs and group ACEs. Unfortunately, a minor bug causes the group sort to be skipped. Signed-off-by: Frank Filz Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 725d02f210e2..6d9c6aabc85e 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -389,7 +389,7 @@ sort_pacl(struct posix_acl *pacl) sort_pacl_range(pacl, 1, i-1); BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ); - j = i++; + j = ++i; while (pacl->a_entries[j].e_tag == ACL_GROUP) j++; sort_pacl_range(pacl, i, j-1); -- cgit v1.2.3 From 5e057878f946d25bf3a3bc811f4e27099f41c24f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: NFS: Revert default r/wsize behavior commit dd47f96c077b4516727e497e4b6fd47a06778c0a upstream. When the "rsize=" or "wsize=" mount options are not specified, text-based mounts have slightly different behavior than legacy binary mounts. Text-based mounts use the smaller of the server's maximum and the client's maximum, but binary mounts use the smaller of the server's _preferred_ size and the client's maximum. This difference is actually pretty subtle. Most servers advertise the same value as their maximum and their preferred transfer size, so the end result is the same in most cases. The reason for this difference is that for text-based mounts, if r/wsize are not specified, they are set to the largest value supported by the client. For legacy mounts, the values are set to zero if these options are not specified. nfs_server_set_fsinfo() can negotiate the transfer size defaults correctly in any case. There's no need to specify any particular value as default in the text-based option parsing logic. Note that nfs4 doesn't use nfs_server_set_fsinfo(), but the mount.nfs4 command does set rsize and wsize to 0 if the user didn't specify these options. So, make the same change for text-based NFSv4 mounts. Thanks to James Pearson for reporting and diagnosing the problem. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust Cc: Neil Brown Signed-off-by: Greg Kroah-Hartman --- fs/nfs/super.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 90be551b80c1..e71f0fdff1da 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -734,8 +734,6 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { - data->rsize = NFS_MAX_FILE_IO_SIZE; - data->wsize = NFS_MAX_FILE_IO_SIZE; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; -- cgit v1.2.3 From 0a1c275a44db55b2624652b04d4ed9430e82957b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 26 Jan 2010 15:04:02 -0800 Subject: fnctl: f_modown should call write_lock_irqsave/restore commit b04da8bfdfbbd79544cab2fadfdc12e87eb01600 upstream. Commit 703625118069f9f8960d356676662d3db5a9d116 exposed that f_modown() should call write_lock_irqsave instead of just write_lock_irq so that because a caller could have a spinlock held and it would not be good to renable interrupts. Cc: Eric W. Biederman Cc: Al Viro Cc: Alan Cox Cc: Tavis Ormandy Signed-off-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- fs/fcntl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 97e01dc0d95f..5ef953e6f908 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -199,7 +199,9 @@ static int setfl(int fd, struct file * filp, unsigned long arg) static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - write_lock_irq(&filp->f_owner.lock); + unsigned long flags; + + write_lock_irqsave(&filp->f_owner.lock, flags); if (force || !filp->f_owner.pid) { put_pid(filp->f_owner.pid); filp->f_owner.pid = get_pid(pid); @@ -211,7 +213,7 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, filp->f_owner.euid = cred->euid; } } - write_unlock_irq(&filp->f_owner.lock); + write_unlock_irqrestore(&filp->f_owner.lock, flags); } int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, -- cgit v1.2.3 From 703c300c7415c0a2f25dbd7d2f8773a5abd84099 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Jan 2010 23:38:27 -0500 Subject: Fix a leak in affs_fill_super() commit afc70ed05a07bfe171f7a5b8fdc80bdb073d314f upstream. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/affs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/affs/super.c b/fs/affs/super.c index 104fdcb3a7fc..b2a5958c6191 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -316,6 +316,8 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) &blocksize,&sbi->s_prefix, sbi->s_volume, &mount_flags)) { printk(KERN_ERR "AFFS: Error parsing options\n"); + kfree(sbi->s_prefix); + kfree(sbi); return -EINVAL; } /* N.B. after this point s_prefix must be released */ -- cgit v1.2.3 From deb20f1c49d390d3fcb62fa9789486f170054004 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jan 2010 00:52:22 -0500 Subject: Fix failure exits in bfs_fill_super() commit 5998649f779b7148a8a0c10c46cfa99e27d34dfe upstream. double iput(), leaks... Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/bfs/inode.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 6f60336c6628..8f3d9fd89604 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -353,35 +353,35 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) struct inode *inode; unsigned i, imap_len; struct bfs_sb_info *info; - long ret = -EINVAL; + int ret = -EINVAL; unsigned long i_sblock, i_eblock, i_eoff, s_size; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; + mutex_init(&info->bfs_lock); s->s_fs_info = info; sb_set_blocksize(s, BFS_BSIZE); - bh = sb_bread(s, 0); - if(!bh) + info->si_sbh = sb_bread(s, 0); + if (!info->si_sbh) goto out; - bfs_sb = (struct bfs_super_block *)bh->b_data; + bfs_sb = (struct bfs_super_block *)info->si_sbh->b_data; if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { if (!silent) printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id, le32_to_cpu(bfs_sb->s_magic)); - goto out; + goto out1; } if (BFS_UNCLEAN(bfs_sb, s) && !silent) printf("%s is unclean, continuing\n", s->s_id); s->s_magic = BFS_MAGIC; - info->si_sbh = bh; if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { printf("Superblock is corrupted\n"); - goto out; + goto out1; } info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / @@ -390,7 +390,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) imap_len = (info->si_lasti / 8) + 1; info->si_imap = kzalloc(imap_len, GFP_KERNEL); if (!info->si_imap) - goto out; + goto out1; for (i = 0; i < BFS_ROOT_INO; i++) set_bit(i, info->si_imap); @@ -398,15 +398,13 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) inode = bfs_iget(s, BFS_ROOT_INO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - kfree(info->si_imap); - goto out; + goto out2; } s->s_root = d_alloc_root(inode); if (!s->s_root) { iput(inode); ret = -ENOMEM; - kfree(info->si_imap); - goto out; + goto out2; } info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; @@ -419,10 +417,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) bh = sb_bread(s, info->si_blocks - 1); if (!bh) { printf("Last block not available: %lu\n", info->si_blocks - 1); - iput(inode); ret = -EIO; - kfree(info->si_imap); - goto out; + goto out3; } brelse(bh); @@ -459,11 +455,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) printf("Inode 0x%08x corrupted\n", i); brelse(bh); - s->s_root = NULL; - kfree(info->si_imap); - kfree(info); - s->s_fs_info = NULL; - return -EIO; + ret = -EIO; + goto out3; } if (!di->i_ino) { @@ -483,11 +476,17 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_dirt = 1; } dump_imap("read_super", s); - mutex_init(&info->bfs_lock); return 0; +out3: + dput(s->s_root); + s->s_root = NULL; +out2: + kfree(info->si_imap); +out1: + brelse(info->si_sbh); out: - brelse(bh); + mutex_destroy(&info->bfs_lock); kfree(info); s->s_fs_info = NULL; return ret; -- cgit v1.2.3 From 26d2257801281a5bd2833d8f22cebead57a4b4c6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 25 Jan 2010 06:16:19 -0500 Subject: fix oops in fs/9p late mount failure commit 083c73c253c23c20359a344dfe1198ea628e6259 upstream. if 9P ->get_sb() fails late (at root inode or root dentry allocation), we'll hit its ->kill_sb() with NULL ->s_root Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 14a86448572c..69357c0d9899 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -188,7 +188,8 @@ static void v9fs_kill_super(struct super_block *s) P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); - v9fs_dentry_release(s->s_root); /* clunk root */ + if (s->s_root) + v9fs_dentry_release(s->s_root); /* clunk root */ kill_anon_super(s); -- cgit v1.2.3 From 36a0a4afb2e996f59651a845e5680d2f80cc6f23 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 25 Jan 2010 06:05:54 -0500 Subject: fix leak in romfs_fill_super() commit 7e32b7bb734047c5e3cecf2e896b9cf8fc35d1e8 upstream. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/romfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/romfs/super.c b/fs/romfs/super.c index c117fa80d1e9..42d213546894 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -544,6 +544,7 @@ error: error_rsb_inval: ret = -EINVAL; error_rsb: + kfree(rsb); return ret; } -- cgit v1.2.3 From d842c31e636e5dbc85c3c75d1b964a6ff6e4daeb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jan 2010 00:04:07 -0500 Subject: Fix remount races with symlink handling in affs commit 29333920a5a46edcc9b728e2cf0134d5a9b516ee upstream. A couple of fields in affs_sb_info is used in follow_link() and symlink() for handling AFFS "absolute" symlinks. Need locking against affs_remount() updates. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/affs/affs.h | 2 +- fs/affs/namei.c | 7 +++++-- fs/affs/super.c | 17 ++++++++++++++--- fs/affs/symlink.c | 7 +++++-- 4 files changed, 25 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/affs/affs.h b/fs/affs/affs.h index e511dc621a2e..0e40caaba456 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -106,8 +106,8 @@ struct affs_sb_info { u32 s_last_bmap; struct buffer_head *s_bmap_bh; char *s_prefix; /* Prefix for volumes and assigns. */ - int s_prefix_len; /* Length of prefix. */ char s_volume[32]; /* Volume prefix for absolute symlinks. */ + spinlock_t symlink_lock; /* protects the previous two */ }; #define SF_INTL 0x0001 /* International filesystem. */ diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 960d336ec694..d70bbbac6b7b 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -341,10 +341,13 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) p = (char *)AFFS_HEAD(bh)->table; lc = '/'; if (*symname == '/') { + struct affs_sb_info *sbi = AFFS_SB(sb); while (*symname == '/') symname++; - while (AFFS_SB(sb)->s_volume[i]) /* Cannot overflow */ - *p++ = AFFS_SB(sb)->s_volume[i++]; + spin_lock(&sbi->symlink_lock); + while (sbi->s_volume[i]) /* Cannot overflow */ + *p++ = sbi->s_volume[i++]; + spin_unlock(&sbi->symlink_lock); } while (i < maxlen && (c = *symname++)) { if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') { diff --git a/fs/affs/super.c b/fs/affs/super.c index b2a5958c6191..be6a6e8ed7d6 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -221,8 +221,6 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s *mount_opts |= SF_MUFS; break; case Opt_prefix: - /* Free any previous prefix */ - kfree(*prefix); *prefix = match_strdup(&args[0]); if (!*prefix) return 0; @@ -311,6 +309,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; sb->s_fs_info = sbi; mutex_init(&sbi->s_bmlock); + spin_lock_init(&sbi->symlink_lock); if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, &blocksize,&sbi->s_prefix, @@ -518,14 +517,18 @@ affs_remount(struct super_block *sb, int *flags, char *data) unsigned long mount_flags; int res = 0; char *new_opts = kstrdup(data, GFP_KERNEL); + char volume[32]; + char *prefix = NULL; pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); *flags |= MS_NODIRATIME; + memcpy(volume, sbi->s_volume, 32); if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, - &blocksize, &sbi->s_prefix, sbi->s_volume, + &blocksize, &prefix, volume, &mount_flags)) { + kfree(prefix); kfree(new_opts); return -EINVAL; } @@ -536,6 +539,14 @@ affs_remount(struct super_block *sb, int *flags, char *data) sbi->s_mode = mode; sbi->s_uid = uid; sbi->s_gid = gid; + /* protect against readers */ + spin_lock(&sbi->symlink_lock); + if (prefix) { + kfree(sbi->s_prefix); + sbi->s_prefix = prefix; + } + memcpy(sbi->s_volume, volume, 32); + spin_unlock(&sbi->symlink_lock); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { unlock_kernel(); diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index 41782539c907..ee00f08c4f53 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -20,7 +20,6 @@ static int affs_symlink_readpage(struct file *file, struct page *page) int i, j; char c; char lc; - char *pf; pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino); @@ -32,11 +31,15 @@ static int affs_symlink_readpage(struct file *file, struct page *page) j = 0; lf = (struct slink_front *)bh->b_data; lc = 0; - pf = AFFS_SB(inode->i_sb)->s_prefix ? AFFS_SB(inode->i_sb)->s_prefix : "/"; if (strchr(lf->symname,':')) { /* Handle assign or volume name */ + struct affs_sb_info *sbi = AFFS_SB(inode->i_sb); + char *pf; + spin_lock(&sbi->symlink_lock); + pf = sbi->s_prefix ? sbi->s_prefix : "/"; while (i < 1023 && (c = pf[i])) link[i++] = c; + spin_unlock(&sbi->symlink_lock); while (i < 1023 && lf->symname[j] != ':') link[i++] = lf->symname[j++]; if (i < 1023) -- cgit v1.2.3 From 30d3844a878b9d6adb2aa76961042f0c73c7de6c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jan 2010 00:06:22 -0500 Subject: fix affs parse_options() commit 217686e98321a4ff4c1a6cc535e511e37c5d2dbf upstream. Error handling in that sucker got broken back in 2003. If function returns 0 on failure, it's not nice to add return -EINVAL into it. Adding return 1 on other failure exits is also not a good thing (and yes, original success exits with 1 and some of failure exits with 0 are still there; so's the original logics in callers). Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/affs/super.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/affs/super.c b/fs/affs/super.c index be6a6e8ed7d6..d41e9673cd97 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -203,7 +203,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s switch (token) { case Opt_bs: if (match_int(&args[0], &n)) - return -EINVAL; + return 0; if (n != 512 && n != 1024 && n != 2048 && n != 4096) { printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); @@ -213,7 +213,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s break; case Opt_mode: if (match_octal(&args[0], &option)) - return 1; + return 0; *mode = option & 0777; *mount_opts |= SF_SETMODE; break; @@ -231,21 +231,21 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s break; case Opt_reserved: if (match_int(&args[0], reserved)) - return 1; + return 0; break; case Opt_root: if (match_int(&args[0], root)) - return 1; + return 0; break; case Opt_setgid: if (match_int(&args[0], &option)) - return 1; + return 0; *gid = option; *mount_opts |= SF_SETGID; break; case Opt_setuid: if (match_int(&args[0], &option)) - return -EINVAL; + return 0; *uid = option; *mount_opts |= SF_SETUID; break; -- cgit v1.2.3 From 944a638b7bb07f3ef9a33af60e5ea8465b7adfd1 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 6 Jan 2010 17:23:17 +0000 Subject: FDPIC: Respect PT_GNU_STACK exec protection markings when creating NOMMU stack commit 04e4f2b18c8de1389d1e00fef0f42a8099910daf upstream. The current code will load the stack size and protection markings, but then only use the markings in the MMU code path. The NOMMU code path always passes PROT_EXEC to the mmap() call. While this doesn't matter to most people whilst the code is running, it will cause a pointless icache flush when starting every FDPIC application. Typically this icache flush will be of a region on the order of 128KB in size, or may be the entire icache, depending on the facilities available on the CPU. In the case where the arch default behaviour seems to be desired (EXSTACK_DEFAULT), we probe VM_STACK_FLAGS for VM_EXEC to determine whether we should be setting PROT_EXEC or not. For arches that support an MPU (Memory Protection Unit - an MMU without the virtual mapping capability), setting PROT_EXEC or not will make an important difference. It should be noted that this change also affects the executability of the brk region, since ELF-FDPIC has that share with the stack. However, this is probably irrelevant as NOMMU programs aren't likely to use the brk region, preferring instead allocation via mmap(). Signed-off-by: Mike Frysinger Signed-off-by: David Howells Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/binfmt_elf_fdpic.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 38502c67987c..456cfcf7da0d 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -170,6 +170,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, unsigned long stack_size, entryaddr; #ifdef ELF_FDPIC_PLAT_INIT unsigned long dynaddr; +#endif +#ifndef CONFIG_MMU + unsigned long stack_prot; #endif struct file *interpreter = NULL; /* to shut gcc up */ char *interpreter_name = NULL; @@ -316,6 +319,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, * defunct, deceased, etc. after this point we have to exit via * error_kill */ set_personality(PER_LINUX_FDPIC); + if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; set_binfmt(&elf_fdpic_format); current->mm->start_code = 0; @@ -377,9 +382,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, if (stack_size < PAGE_SIZE * 2) stack_size = PAGE_SIZE * 2; + stack_prot = PROT_READ | PROT_WRITE; + if (executable_stack == EXSTACK_ENABLE_X || + (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC)) + stack_prot |= PROT_EXEC; + down_write(¤t->mm->mmap_sem); - current->mm->start_brk = do_mmap(NULL, 0, stack_size, - PROT_READ | PROT_WRITE | PROT_EXEC, + current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot, MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, 0); -- cgit v1.2.3 From 336ca4cc1f9d14edbb5d155b41aa301aaeb731c4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Jan 2010 22:14:42 -0800 Subject: Split 'flush_old_exec' into two functions commit 221af7f87b97431e3ee21ce4b0e77d5411cf1549 upstream. 'flush_old_exec()' is the point of no return when doing an execve(), and it is pretty badly misnamed. It doesn't just flush the old executable environment, it also starts up the new one. Which is very inconvenient for things like setting up the new personality, because we want the new personality to affect the starting of the new environment, but at the same time we do _not_ want the new personality to take effect if flushing the old one fails. As a result, the x86-64 '32-bit' personality is actually done using this insane "I'm going to change the ABI, but I haven't done it yet" bit (TIF_ABI_PENDING), with SET_PERSONALITY() not actually setting the personality, but just the "pending" bit, so that "flush_thread()" can do the actual personality magic. This patch in no way changes any of that insanity, but it does split the 'flush_old_exec()' function up into a preparatory part that can fail (still called flush_old_exec()), and a new part that will actually set up the new exec environment (setup_new_exec()). All callers are changed to trivially comply with the new world order. Signed-off-by: H. Peter Anvin Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/binfmt_aout.c | 1 + fs/binfmt_elf.c | 27 ++------------------------- fs/binfmt_elf_fdpic.c | 3 +++ fs/binfmt_flat.c | 1 + fs/binfmt_som.c | 1 + fs/exec.c | 26 ++++++++++++++++---------- 6 files changed, 24 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index b639dcf7c778..0133b5a30413 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -263,6 +263,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) #else set_personality(PER_LINUX); #endif + setup_new_exec(bprm); current->mm->end_code = ex.a_text + (current->mm->start_code = N_TXTADDR(ex)); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b9b3bb51b1e4..1ed37ba72343 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -662,27 +662,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') goto out_free_interp; - /* - * The early SET_PERSONALITY here is so that the lookup - * for the interpreter happens in the namespace of the - * to-be-execed image. SET_PERSONALITY can select an - * alternate root. - * - * However, SET_PERSONALITY is NOT allowed to switch - * this task into the new images's memory mapping - * policy - that is, TASK_SIZE must still evaluate to - * that which is appropriate to the execing application. - * This is because exit_mmap() needs to have TASK_SIZE - * evaluate to the size of the old image. - * - * So if (say) a 64-bit application is execing a 32-bit - * application it is the architecture's responsibility - * to defer changing the value of TASK_SIZE until the - * switch really is going to happen - do this in - * flush_thread(). - akpm - */ - SET_PERSONALITY(loc->elf_ex); - interpreter = open_exec(elf_interpreter); retval = PTR_ERR(interpreter); if (IS_ERR(interpreter)) @@ -730,9 +709,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* Verify the interpreter has a valid arch */ if (!elf_check_arch(&loc->interp_elf_ex)) goto out_free_dentry; - } else { - /* Executables without an interpreter also need a personality */ - SET_PERSONALITY(loc->elf_ex); } /* Flush all traces of the currently running executable */ @@ -752,7 +728,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) current->flags |= PF_RANDOMIZE; - arch_pick_mmap_layout(current->mm); + + setup_new_exec(bprm); /* Do this so that we can load the interpreter, if need be. We will change some of these later */ diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 456cfcf7da0d..e7a0bb47252c 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -321,6 +321,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, set_personality(PER_LINUX_FDPIC); if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) current->personality |= READ_IMPLIES_EXEC; + + setup_new_exec(bprm); + set_binfmt(&elf_fdpic_format); current->mm->start_code = 0; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a2796651e756..ca88c4687b84 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -519,6 +519,7 @@ static int load_flat_file(struct linux_binprm * bprm, /* OK, This is the point of no return */ set_personality(PER_LINUX_32BIT); + setup_new_exec(bprm); } /* diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index eff74b9c9e77..35cf0023ddb8 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -227,6 +227,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* OK, This is the point of no return */ current->flags &= ~PF_FORKNOEXEC; current->personality = PER_HPUX; + setup_new_exec(bprm); /* Set the task size for HP-UX processes such that * the gateway page is outside the address space. diff --git a/fs/exec.c b/fs/exec.c index ba112bd4a339..b2ad94e25da7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -931,9 +931,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) int flush_old_exec(struct linux_binprm * bprm) { - char * name; - int i, ch, retval; - char tcomm[sizeof(current->comm)]; + int retval; /* * Make sure we have a private signal table and that @@ -953,6 +951,20 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; bprm->mm = NULL; /* We're using it now */ + return 0; + +out: + return retval; +} +EXPORT_SYMBOL(flush_old_exec); + +void setup_new_exec(struct linux_binprm * bprm) +{ + int i, ch; + char * name; + char tcomm[sizeof(current->comm)]; + + arch_pick_mmap_layout(current->mm); /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; @@ -1009,14 +1021,8 @@ int flush_old_exec(struct linux_binprm * bprm) flush_signal_handlers(current, 0); flush_old_files(current->files); - - return 0; - -out: - return retval; } - -EXPORT_SYMBOL(flush_old_exec); +EXPORT_SYMBOL(setup_new_exec); /* * Prepare credentials and lock ->cred_guard_mutex. -- cgit v1.2.3 From cb723ba5d03bf719dbc7409b4d67572d4472ef8b Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 27 Jan 2010 22:44:36 +0300 Subject: block: fix bio_add_page for non trivial merge_bvec_fn case commit 1d6165851cd8e3f919d446cd6da35dee44e8837e upstream. We have to properly decrease bi_size in order to merge_bvec_fn return right result. Otherwise this result in false merge rejects for two absolutely valid bio_vecs. This may cause significant performance penalty for example fs_block_size == 1k and block device is raid0 with small chunk_size = 8k. Then it is impossible to merge 7-th fs-block in to bio which already has 6 fs-blocks. Signed-off-by: Dmitry Monakhov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/bio.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 12da5db8682c..e0c9e71cc404 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -542,13 +542,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { + unsigned int prev_bv_len = prev->bv_len; prev->bv_len += len; if (q->merge_bvec_fn) { struct bvec_merge_data bvm = { + /* prev_bvec is already charged in + bi_size, discharge it in order to + simulate merging updated prev_bvec + as new bvec. */ .bi_bdev = bio->bi_bdev, .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size, + .bi_size = bio->bi_size - prev_bv_len, .bi_rw = bio->bi_rw, }; -- cgit v1.2.3 From 94af44b66b66bf9c848f11dc12fcd1558e55f995 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 2 Feb 2010 12:37:44 -0800 Subject: Fix 'flush_old_exec()/setup_new_exec()' split commit 7ab02af428c2d312c0cf8fb0b01cc1eb21131a3d upstream. Commit 221af7f87b9 ("Split 'flush_old_exec' into two functions") split the function at the point of no return - ie right where there were no more error cases to check. That made sense from a technical standpoint, but when we then also combined it with the actual personality setting going in between flush_old_exec() and setup_new_exec(), it needs to be a bit more careful. In particular, we need to make sure that we really flush the old personality bits in the 'flush' stage, rather than later in the 'setup' stage, since otherwise we might be flushing the _new_ personality state that we're just setting up. So this moves the flags and personality flushing (and 'flush_thread()', which is the arch-specific function that generally resets lazy FP state etc) of the old process into flush_old_exec(), so that it doesn't affect any state that execve() is setting up for the new process environment. This was reported by Michal Simek as breaking his Microblaze qemu environment. Reported-and-tested-by: Michal Simek Cc: Peter Anvin Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/exec.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index b2ad94e25da7..7fa4efd50425 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -951,6 +951,11 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; bprm->mm = NULL; /* We're using it now */ + + current->flags &= ~PF_RANDOMIZE; + flush_thread(); + current->personality &= ~bprm->per_clear; + return 0; out: @@ -987,9 +992,6 @@ void setup_new_exec(struct linux_binprm * bprm) tcomm[i] = '\0'; set_task_comm(current, tcomm); - current->flags &= ~PF_RANDOMIZE; - flush_thread(); - /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc @@ -1005,8 +1007,6 @@ void setup_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } - current->personality &= ~bprm->per_clear; - /* * Flush performance counters when crossing a * security domain: -- cgit v1.2.3 From 0ae2b7de3957a477ec0f332c4da5633499b4d3aa Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Sat, 30 Jan 2010 20:28:19 +0100 Subject: block: fix bugs in bio-integrity mempool usage commit 9e9432c267e4047db98b9d4fba95099c6effcef9 upstream. Fix two bugs in the bio integrity code: use_bip_pool() always returns 0 because it checks against the wrong limit, causing the mempool to be used only when regular allocation fails. When the mempool is used as a fallback we don't free the data properly. Signed-Off-By: Chuck Ebbert Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/bio-integrity.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 49a34e7f7306..a16f29e888cd 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -61,7 +61,7 @@ static inline unsigned int vecs_to_idx(unsigned int nr) static inline int use_bip_pool(unsigned int idx) { - if (idx == BIOVEC_NR_POOLS) + if (idx == BIOVEC_MAX_IDX) return 1; return 0; @@ -95,6 +95,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, /* Use mempool if lower order alloc failed or max vecs were requested */ if (bip == NULL) { + idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); if (unlikely(bip == NULL)) { -- cgit v1.2.3 From 3a9353f232812dc9057cbee39c618f400a8c5f60 Mon Sep 17 00:00:00 2001 From: anfei zhou Date: Tue, 2 Feb 2010 13:44:02 -0800 Subject: mm: flush dcache before writing into page to avoid alias commit 931e80e4b3263db75c8e34f078d22f11bbabd3a3 upstream. The cache alias problem will happen if the changes of user shared mapping is not flushed before copying, then user and kernel mapping may be mapped into two different cache line, it is impossible to guarantee the coherence after iov_iter_copy_from_user_atomic. So the right steps should be: flush_dcache_page(page); kmap_atomic(page); write to page; kunmap_atomic(page); flush_dcache_page(page); More precisely, we might create two new APIs flush_dcache_user_page and flush_dcache_kern_page to replace the two flush_dcache_page accordingly. Here is a snippet tested on omap2430 with VIPT cache, and I think it is not ARM-specific: int val = 0x11111111; fd = open("abc", O_RDWR); addr = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); *(addr+0) = 0x44444444; tmp = *(addr+0); *(addr+1) = 0x77777777; write(fd, &val, sizeof(int)); close(fd); The results are not always 0x11111111 0x77777777 at the beginning as expected. Sometimes we see 0x44444444 0x77777777. Signed-off-by: Anfei Cc: Russell King Cc: Miklos Szeredi Cc: Nick Piggin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/fuse/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c18913a777ae..a9f5e137f1d3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -828,6 +828,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (!page) break; + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + pagefault_disable(); tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); pagefault_enable(); -- cgit v1.2.3 From 3520341ce308afaf676c961e8432ff5c87d3121b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 7 Feb 2010 10:11:23 -0800 Subject: Fix race in tty_fasync() properly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 80e1e823989ec44d8e35bdfddadbddcffec90424 upstream. This reverts commit 703625118069 ("tty: fix race in tty_fasync") and commit b04da8bfdfbb ("fnctl: f_modown should call write_lock_irqsave/ restore") that tried to fix up some of the fallout but was incomplete. It turns out that we really cannot hold 'tty->ctrl_lock' over calling __f_setown, because not only did that cause problems with interrupt disables (which the second commit fixed), it also causes a potential ABBA deadlock due to lock ordering. Thanks to Tetsuo Handa for following up on the issue, and running lockdep to show the problem. It goes roughly like this: - f_getown gets filp->f_owner.lock for reading without interrupts disabled, so an interrupt that happens while that lock is held can cause a lockdep chain from f_owner.lock -> sighand->siglock. - at the same time, the tty->ctrl_lock -> f_owner.lock chain that commit 703625118069 introduced, together with the pre-existing sighand->siglock -> tty->ctrl_lock chain means that we have a lock dependency the other way too. So instead of extending tty->ctrl_lock over the whole __f_setown() call, we now just take a reference to the 'pid' structure while holding the lock, and then release it after having done the __f_setown. That still guarantees that 'struct pid' won't go away from under us, which is all we really ever needed. Reported-and-tested-by: Tetsuo Handa Acked-by: Greg Kroah-Hartman Acked-by: Américo Wang Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/fcntl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 5ef953e6f908..97e01dc0d95f 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -199,9 +199,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - unsigned long flags; - - write_lock_irqsave(&filp->f_owner.lock, flags); + write_lock_irq(&filp->f_owner.lock); if (force || !filp->f_owner.pid) { put_pid(filp->f_owner.pid); filp->f_owner.pid = get_pid(pid); @@ -213,7 +211,7 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, filp->f_owner.euid = cred->euid; } } - write_unlock_irqrestore(&filp->f_owner.lock, flags); + write_unlock_irq(&filp->f_owner.lock); } int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, -- cgit v1.2.3 From e031c71282e8223cc8fc088e4c1bf86900ce9927 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Fri, 29 Jan 2010 09:56:22 +0900 Subject: freeze_bdev: don't deactivate successfully frozen MS_RDONLY sb commit 4b06e5b9ad8abb20105b2b25e42c509ebe9b2d76 upstream. Thanks Thomas and Christoph for testing and review. I removed 'smp_wmb()' before up_write from the previous patch, since up_write() should have necessary ordering constraints. (I.e. the change of s_frozen is visible to others after up_write) I'm quite sure the change is harmless but if you are uncomfortable with Tested-by/Reviewed-by on the modified patch, please remove them. If MS_RDONLY, freeze_bdev should just up_write(s_umount) instead of deactivate_locked_super(). Also, keep sb->s_frozen consistent so that remount can check the frozen state. Otherwise a crash reported here can happen: http://lkml.org/lkml/2010/1/16/37 http://lkml.org/lkml/2010/1/28/53 This patch should be applied for 2.6.32 stable series, too. Reviewed-by: Christoph Hellwig Tested-by: Thomas Backlund Signed-off-by: Jun'ichi Nomura Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/block_dev.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 8bed0557d88c..34e2d2033765 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -246,7 +246,8 @@ struct super_block *freeze_bdev(struct block_device *bdev) if (!sb) goto out; if (sb->s_flags & MS_RDONLY) { - deactivate_locked_super(sb); + sb->s_frozen = SB_FREEZE_TRANS; + up_write(&sb->s_umount); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; } @@ -307,7 +308,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) BUG_ON(sb->s_bdev != bdev); down_write(&sb->s_umount); if (sb->s_flags & MS_RDONLY) - goto out_deactivate; + goto out_unfrozen; if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); @@ -321,11 +322,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) } } +out_unfrozen: sb->s_frozen = SB_UNFROZEN; smp_wmb(); wake_up(&sb->s_wait_unfrozen); -out_deactivate: if (sb) deactivate_locked_super(sb); out_unlock: -- cgit v1.2.3 From 35e2093d5d7b632c083af3578c05876375828314 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 10 Feb 2010 13:56:42 -0800 Subject: fs/exec.c: restrict initial stack space expansion to rlimit commit 803bf5ec259941936262d10ecc84511b76a20921 upstream. When reserving stack space for a new process, make sure we're not attempting to expand the stack by more than rlimit allows. This fixes a bug caused by b6a2fea39318e43fee84fa7b0b90d68bed92d2ba ("mm: variable length argument support") and unmasked by fc63cf237078c86214abcb2ee9926d8ad289da9b ("exec: setup_arg_pages() fails to return errors"). This bug means that when limiting the stack to less the 20*PAGE_SIZE (eg. 80K on 4K pages or 'ulimit -s 79') all processes will be killed before they start. This is particularly bad with 64K pages, where a ulimit below 1280K will kill every process. To test, do: 'ulimit -s 15; ls' before and after the patch is applied. Before it's applied, 'ls' should be killed. After the patch is applied, 'ls' should no longer be killed. A stack limit of 15KB since it's small enough to trigger 20*PAGE_SIZE. Also 15KB not a multiple of PAGE_SIZE, which is a trickier case to handle correctly with this code. 4K pages should be fine to test with. [kosaki.motohiro@jp.fujitsu.com: cleanup] [akpm@linux-foundation.org: cleanup cleanup] Signed-off-by: Michael Neuling Signed-off-by: KOSAKI Motohiro Cc: Americo Wang Cc: Anton Blanchard Cc: Oleg Nesterov Cc: James Morris Cc: Ingo Molnar Cc: Serge Hallyn Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/exec.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 7fa4efd50425..da36c206f0f1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -572,6 +572,9 @@ int setup_arg_pages(struct linux_binprm *bprm, struct vm_area_struct *prev = NULL; unsigned long vm_flags; unsigned long stack_base; + unsigned long stack_size; + unsigned long stack_expand; + unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size to 1GB */ @@ -628,10 +631,24 @@ int setup_arg_pages(struct linux_binprm *bprm, goto out_unlock; } + stack_expand = EXTRA_STACK_VM_PAGES * PAGE_SIZE; + stack_size = vma->vm_end - vma->vm_start; + /* + * Align this down to a page boundary as expand_stack + * will align it up. + */ + rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; + rlim_stack = min(rlim_stack, stack_size); #ifdef CONFIG_STACK_GROWSUP - stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE; + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_start + rlim_stack; + else + stack_base = vma->vm_end + stack_expand; #else - stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE; + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_end - rlim_stack; + else + stack_base = vma->vm_start - stack_expand; #endif ret = expand_stack(vma, stack_base); if (ret) -- cgit v1.2.3 From ea332d671c5f6802d85572c2af13b9a671d8ea88 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Feb 2010 13:14:00 -0500 Subject: cifs: fix length calculation for converted unicode readdir names commit f12f98dba6ea1517cd7fbb912208893b9c014c15 upstream. cifs_from_ucs2 returns the length of the converted name, including the length of the NULL terminator. We don't want to include the NULL terminator in the dentry name length however since that'll throw off the hash calculation for the dentry cache. I believe that this is the root cause of several problems that have cropped up recently that seem to be papered over with the "noserverino" mount option. More confirmation of that would be good, but this is clearly a bug and it fixes at least one reproducible problem that was reported. This patch fixes at least this reproducer in this kernel.org bug: http://bugzilla.kernel.org/show_bug.cgi?id=15088#c12 Reported-by: Bjorn Tore Sund Acked-by: Dave Kleikamp Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/readdir.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index f84062f9a985..f5618f8cc462 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -666,6 +666,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, min(len, max_len), nlt, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + pqst->len -= nls_nullsize(nlt); } else { pqst->name = filename; pqst->len = len; -- cgit v1.2.3 From 1f9382ec17cc4fa5344b871658345e569fba8af7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 26 Jan 2010 15:41:34 -0500 Subject: NFS: Fix a reference leak in nfs_wb_cancel_page() commit c9edda7140ec6a22accf7f2f86da362dfbfd41fc upstream. Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfs/write.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6fc37762c495..cf6c06f66edc 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1542,6 +1542,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) break; } ret = nfs_wait_on_request(req); + nfs_release_request(req); if (ret < 0) goto out; } -- cgit v1.2.3 From 438b9c70b8800b0c12ec6bc784e00bbae24a75ca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 26 Jan 2010 15:41:53 -0500 Subject: NFS: Try to commit unstable writes in nfs_release_page() commit 82be934a59ff891cac598727e5a862ba2b9d1fac upstream. If someone calls nfs_release_page(), we presumably already know that the page is clean, however it may be holding an unstable write. Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfs/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f5fdd39e037a..393d40fd7eb9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -486,6 +486,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp) { dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); + if (gfp & __GFP_WAIT) + nfs_wb_page(page->mapping->host, page); /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; -- cgit v1.2.3 From 1236ff906f9d3044cd24d22463c3c943ee92e3a2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 26 Jan 2010 15:42:30 -0500 Subject: NFSv4: Don't allow posix locking against servers that don't support it commit 8e469ebd6dc32cbaf620e134d79f740bf0ebab79 upstream. Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 6ea07a3c75d4..b4a6b1a586d6 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -141,6 +141,7 @@ enum { NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ + NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ }; struct nfs4_state { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 741a562177fc..8dbee6291284 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1573,6 +1573,8 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in status = PTR_ERR(state); if (IS_ERR(state)) goto err_opendata_put; + if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); *res = state; @@ -4060,8 +4062,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock { struct nfs_inode *nfsi = NFS_I(state->inode); unsigned char fl_flags = request->fl_flags; - int status; + int status = -ENOLCK; + if ((fl_flags & FL_POSIX) && + !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) + goto out; /* Is this a delegated open? */ status = nfs4_set_lock_state(state, request); if (status != 0) -- cgit v1.2.3 From e7055c2366a5a5b507492905200d4327949b81ce Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 26 Jan 2010 15:42:21 -0500 Subject: NFSv4: Ensure that the NFSv4 locking can recover from stateid errors commit 2bee72a6aa1e6d0a4f5da56217f0d0bbbdd0d9a3 upstream. In most cases, we just want to mark the lock_stateid sequence id as being uninitialised. Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8dbee6291284..6c200595099f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3978,6 +3978,22 @@ static const struct rpc_call_ops nfs4_lock_ops = { .rpc_release = nfs4_lock_release, }; +static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state *state = lsp->ls_state; + + switch (error) { + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_EXPIRED: + if (new_lock_owner != 0 || + (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + nfs4_state_mark_reclaim_nograce(clp, state); + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + }; +} + static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int reclaim) { struct nfs4_lockdata *data; @@ -4013,6 +4029,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f ret = nfs4_wait_for_completion_rpc_task(task); if (ret == 0) { ret = data->rpc_status; + if (ret) + nfs4_handle_setlk_error(data->server, data->lsp, + data->arg.new_lock_owner, ret); } else data->cancelled = 1; rpc_put_task(task); -- cgit v1.2.3 From 04673da74dbd331040fa439bd72963a35f1b67cf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Feb 2010 08:27:22 -0500 Subject: NFS: Fix an Oops when truncating a file commit 9f557cd8073104b39528794d44e129331ded649f upstream. The VM/VFS does not allow mapping->a_ops->invalidatepage() to fail. Unfortunately, nfs_wb_page_cancel() may fail if a fatal signal occurs. Since the NFS code assumes that the page stays mapped for as long as the writeback is active, we can end up Oopsing (among other things). The only safe fix here is to convert nfs_wait_on_request(), so as to make it uninterruptible (as is already the case with wait_on_page_writeback()). Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/pagelist.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e2975939126a..a12c45b65dd4 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -176,6 +176,12 @@ void nfs_release_request(struct nfs_page *req) kref_put(&req->wb_kref, nfs_free_request); } +static int nfs_wait_bit_uninterruptible(void *word) +{ + io_schedule(); + return 0; +} + /** * nfs_wait_on_request - Wait for a request to complete. * @req: request to wait upon. @@ -186,14 +192,9 @@ void nfs_release_request(struct nfs_page *req) int nfs_wait_on_request(struct nfs_page *req) { - int ret = 0; - - if (!test_bit(PG_BUSY, &req->wb_flags)) - goto out; - ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY, - nfs_wait_bit_killable, TASK_KILLABLE); -out: - return ret; + return wait_on_bit(&req->wb_flags, PG_BUSY, + nfs_wait_bit_uninterruptible, + TASK_UNINTERRUPTIBLE); } /** -- cgit v1.2.3 From 16e63ecd5d683fe18d91fcd1e6a7f63468f7d805 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Feb 2010 08:27:35 -0500 Subject: NFS: Fix a umount race commit 387c149b54b4321cbc790dadbd4f8eedb5a90468 upstream. Ensure that we unregister the bdi before kill_anon_super() calls ida_remove() on our device name. Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/super.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e71f0fdff1da..4bf23f6f93cf 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -241,6 +241,7 @@ static int nfs_show_stats(struct seq_file *, struct vfsmount *); static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static void nfs_put_super(struct super_block *); static void nfs_kill_super(struct super_block *); static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); @@ -264,6 +265,7 @@ static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, + .put_super = nfs_put_super, .statfs = nfs_statfs, .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, @@ -333,6 +335,7 @@ static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, + .put_super = nfs_put_super, .statfs = nfs_statfs, .clear_inode = nfs4_clear_inode, .umount_begin = nfs_umount_begin, @@ -2195,6 +2198,17 @@ error_splat_super: goto out; } +/* + * Ensure that we unregister the bdi before kill_anon_super + * releases the device name + */ +static void nfs_put_super(struct super_block *s) +{ + struct nfs_server *server = NFS_SB(s); + + bdi_unregister(&server->backing_dev_info); +} + /* * Destroy an NFS2/3 superblock */ @@ -2203,7 +2217,6 @@ static void nfs_kill_super(struct super_block *s) struct nfs_server *server = NFS_SB(s); kill_anon_super(s); - bdi_unregister(&server->backing_dev_info); nfs_fscache_release_super_cookie(s); nfs_free_server(server); } -- cgit v1.2.3 From 5ba0facdbeb73ead6fe04d6bc5947e2fb4f29320 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 8 Feb 2010 09:32:27 -0500 Subject: NFS: Fix a bug in nfs_fscache_release_page() commit 2c1740098c708b465e87637b237feb2fd98f129a upstream. Not having an fscache cookie is perfectly valid if the user didn't mount with the fscache option. This patch fixes http://bugzilla.kernel.org/show_bug.cgi?id=15234 Signed-off-by: Trond Myklebust Acked-by: David Howells Signed-off-by: Greg Kroah-Hartman --- fs/nfs/fscache.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index fa588006588d..237874f1af23 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -354,12 +354,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode) */ int nfs_fscache_release_page(struct page *page, gfp_t gfp) { - struct nfs_inode *nfsi = NFS_I(page->mapping->host); - struct fscache_cookie *cookie = nfsi->fscache; - - BUG_ON(!cookie); - if (PageFsCache(page)) { + struct nfs_inode *nfsi = NFS_I(page->mapping->host); + struct fscache_cookie *cookie = nfsi->fscache; + + BUG_ON(!cookie); dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", cookie, page, nfsi); -- cgit v1.2.3 From 892faa5a39f6236bccbe6f48a5f736a8aec9b278 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 8 Feb 2010 09:32:40 -0500 Subject: NFS: Fix the mapping of the NFSERR_SERVERFAULT error commit fdcb45777a3d1689c5541e1f85ee3ebbd197d2c1 upstream. It was recently pointed out that the NFSERR_SERVERFAULT error, which is designed to inform the user of a serious internal error on the server, was being mapped to an error value that is internal to the kernel. This patch maps it to the error EREMOTEIO, which is exported to userland through errno.h. Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/mount_clnt.c | 2 +- fs/nfs/nfs2xdr.c | 2 +- fs/nfs/nfs4xdr.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 0adefc40cc89..59047f8d7d72 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -120,7 +120,7 @@ static struct { { .status = MNT3ERR_INVAL, .errno = -EINVAL, }, { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, }, { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, }, - { .status = MNT3ERR_SERVERFAULT, .errno = -ESERVERFAULT, }, + { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, }, }; struct mountres { diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 5e078b222b4e..7bc2da8efd4a 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -699,7 +699,7 @@ static struct { { NFSERR_BAD_COOKIE, -EBADCOOKIE }, { NFSERR_NOTSUPP, -ENOTSUPP }, { NFSERR_TOOSMALL, -ETOOSMALL }, - { NFSERR_SERVERFAULT, -ESERVERFAULT }, + { NFSERR_SERVERFAULT, -EREMOTEIO }, { NFSERR_BADTYPE, -EBADTYPE }, { NFSERR_JUKEBOX, -EJUKEBOX }, { -1, -EIO } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 20b4e30e6c82..a4cd1b79b71d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4554,7 +4554,7 @@ static int decode_sequence(struct xdr_stream *xdr, * If the server returns different values for sessionID, slotID or * sequence number, the server is looney tunes. */ - status = -ESERVERFAULT; + status = -EREMOTEIO; if (memcmp(id.data, res->sr_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { @@ -5678,7 +5678,7 @@ static struct { { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, { NFS4ERR_NOTSUPP, -ENOTSUPP }, { NFS4ERR_TOOSMALL, -ETOOSMALL }, - { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, + { NFS4ERR_SERVERFAULT, -EREMOTEIO }, { NFS4ERR_BADTYPE, -EBADTYPE }, { NFS4ERR_LOCKED, -EAGAIN }, { NFS4ERR_SYMLINK, -ELOOP }, @@ -5705,7 +5705,7 @@ nfs4_stat_to_errno(int stat) } if (stat <= 10000 || stat > 10100) { /* The server is looney tunes. */ - return -ESERVERFAULT; + return -EREMOTEIO; } /* If we cannot translate the error, the recovery routines should * handle it. -- cgit v1.2.3 From c5fc9aa0845fa2920e0f9d70844151832bfd9d1a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 28 Jan 2010 22:11:38 -0500 Subject: befs: fix leak commit 8dd5ca532c2d2c2b85f16bc038ebfff05b8853e1 upstream. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/befs/linuxvfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 33baf27fac78..34ddda888e63 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -873,6 +873,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); unacquire_priv_sbp: + kfree(befs_sb->mount_opts.iocharset); kfree(sb->s_fs_info); unacquire_none: -- cgit v1.2.3 From 0f102935a606e13e923905451c097acf8e87e4cd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 3 Feb 2010 23:13:24 -0800 Subject: sysfs: sysfs_sd_setattr set iattrs unconditionally commit 7c0ff870d1ed287504a61ed865f3d728c757436b upstream. There is currently a bug in sysfs_sd_setattr inherited from sysfs_setattr in 2.6.32 where the first time we set the attributes on a sysfs file we allocate backing store but do not set the backing store attributes. Resulting in overly restrictive permissions on sysfs files. The fix is to simply modify the code so that it always executes when we update the sysfs attributes, as we did in 2.6.31 and earlier. Signed-off-by: Eric W. Biederman Tested-by: Jean Delvare Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/inode.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e28cecf179f5..02a022a3d473 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -94,30 +94,29 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) if (!sd_attrs) return -ENOMEM; sd->s_iattr = sd_attrs; - } else { - /* attributes were changed at least once in past */ - iattrs = &sd_attrs->ia_iattr; - - if (ia_valid & ATTR_UID) - iattrs->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - iattrs->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - iattrs->ia_atime = timespec_trunc(iattr->ia_atime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MTIME) - iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_CTIME) - iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - iattrs->ia_mode = sd->s_mode = mode; - } + } + /* attributes were changed at least once in past */ + iattrs = &sd_attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + iattrs->ia_mode = sd->s_mode = mode; } return error; } -- cgit v1.2.3 From be6520201bc0c80ec869ec035607ace48a5f78c4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 15 Feb 2010 12:19:53 -0500 Subject: NFS: Too many GETATTR and ACCESS calls after direct I/O commit 65d269538a1129495ac45a14a777cd11cfe881d8 upstream. The cached read and write paths initialize fattr->time_start in their setup procedures. The value of fattr->time_start is propagated to read_cache_jiffies by nfs_update_inode(). Subsequent calls to nfs_attribute_timeout() will then use a good time stamp when computing the attribute cache timeout, and squelch unneeded GETATTR calls. Since the direct I/O paths erroneously leave the inode's fattr->time_start field set to zero, read_cache_jiffies for that inode is set to zero after any direct read or write operation. This triggers an otw GETATTR or ACCESS call to update the file's attribute and access caches properly, even when the NFS READ or WRITE replies have usable post-op attributes. Make sure the direct read and write setup code performs the same fattr initialization as the cached I/O paths to prevent unnecessary GETATTR calls. This was likely introduced by commit 0e574af1 in 2.6.15, which appears to add new nfs_fattr_init() call sites in the cached read and write paths, but not in the equivalent places in fs/nfs/direct.c. A subsequent commit in the same series, 33801147, introduces the fattr->time_start field. Interestingly, the direct write reschedule path already has a call to nfs_fattr_init() in the right place. Reported-by: Quentin Barnes Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/nfs/direct.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e1d415e97849..0d289823e856 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -342,6 +342,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->res.fattr = &data->fattr; data->res.eof = 0; data->res.count = bytes; + nfs_fattr_init(&data->fattr); msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; @@ -575,6 +576,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); NFS_PROTO(data->inode)->commit_setup(data, &msg); @@ -766,6 +768,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->res.fattr = &data->fattr; data->res.count = bytes; data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); task_setup_data.task = &data->task; task_setup_data.callback_data = data; -- cgit v1.2.3 From 081312e8c147a85df0f37dcc45d7e2fbaf79dfb2 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Wed, 4 Nov 2009 02:48:01 -0600 Subject: eCryptfs: Add getattr function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f8f484d1b6677dd5cd5e7e605db747e8c30bbd47 upstream. The i_blocks field of an eCryptfs inode cannot be trusted, but generic_fillattr() uses it to instantiate the blocks field of a stat() syscall when a filesystem doesn't implement its own getattr(). Users have noticed that the output of du is incorrect on newly created files. This patch creates ecryptfs_getattr() which calls into the lower filesystem's getattr() so that eCryptfs can use its kstat.blocks value after calling generic_fillattr(). It is important to note that the block count includes the eCryptfs metadata stored in the beginning of the lower file plus any padding used to fill an extent before encryption. https://bugs.launchpad.net/ecryptfs/+bug/390833 Reported-by: Dominic Sacré Signed-off-by: Tyler Hicks Cc: Tim Gardner Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/inode.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 056fed62d0de..728f07ebc593 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -971,6 +971,21 @@ out: return rc; } +int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct kstat lower_stat; + int rc; + + rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), + ecryptfs_dentry_to_lower(dentry), &lower_stat); + if (!rc) { + generic_fillattr(dentry->d_inode, stat); + stat->blocks = lower_stat.blocks; + } + return rc; +} + int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) @@ -1100,6 +1115,7 @@ const struct inode_operations ecryptfs_dir_iops = { const struct inode_operations ecryptfs_main_iops = { .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, + .getattr = ecryptfs_getattr, .setxattr = ecryptfs_setxattr, .getxattr = ecryptfs_getxattr, .listxattr = ecryptfs_listxattr, -- cgit v1.2.3