From a83fffb75d09cd3d44167b7fb9c1ab9e2269445f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 02:39:54 +0200 Subject: btrfs: sink blocksize parameter to btrfs_find_create_tree_block Finally it's clear that the requested blocksize is always equal to nodesize, with one exception, the superblock. Superblock has fixed size regardless of the metadata block size, but uses the same helpers to initialize sys array/chunk tree and to work with the chunk items. So it pretends to be an extent_buffer for a moment, btrfs_read_sys_array is full of special cases, we're adding one more. Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9a02da16f2be..4a42edc224a8 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2164,7 +2164,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); - next = btrfs_find_create_tree_block(root, bytenr, blocksize); + next = btrfs_find_create_tree_block(root, bytenr); if (!next) return -ENOMEM; -- cgit v1.2.3 From 381cf6587f8a8a8e981bc0c1aaaa8859b51dc756 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 2 Jan 2015 18:45:16 +0100 Subject: btrfs: fix leak of path in btrfs_find_item If btrfs_find_item is called with NULL path it allocates one locally but does not free it. Affected paths are inserting an orphan item for a file and for a subvol root. Move the path allocation to the callers. CC: # 3.14+ Fixes: 3f870c289900 ("btrfs: expand btrfs_find_item() to include find_orphan_item functionality") Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9a02da16f2be..5be45c12dd71 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1257,10 +1257,19 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { int ret; - ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_find_item(root, path, BTRFS_ORPHAN_OBJECTID, offset, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret > 0) ret = btrfs_insert_orphan_item(trans, root, offset); + + btrfs_free_path(path); + return ret; } -- cgit v1.2.3 From 9c4f61f01d269815bb7c37be3ede59c5587747c6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 2 Jan 2015 19:12:57 +0100 Subject: btrfs: simplify insert_orphan_item We can search and add the orphan item in one go, btrfs_insert_orphan_item will find out if the item already exists. Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5be45c12dd71..25a1c363a5f4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1254,21 +1254,13 @@ out: } static int insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset) + struct btrfs_root *root, u64 ino) { int ret; - struct btrfs_path *path; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = btrfs_find_item(root, path, BTRFS_ORPHAN_OBJECTID, - offset, BTRFS_ORPHAN_ITEM_KEY, NULL); - if (ret > 0) - ret = btrfs_insert_orphan_item(trans, root, offset); - - btrfs_free_path(path); + ret = btrfs_insert_orphan_item(trans, root, ino); + if (ret == -EEXIST) + ret = 0; return ret; } -- cgit v1.2.3 From d36808e0d4fc4802892dbd1dba964912cddf1323 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 10 Jan 2015 10:56:48 +0000 Subject: Btrfs: fix directory inconsistency after fsync log replay If we have an inode (file) with a link count greater than 1, remove one of its hard links, fsync the inode, power fail/crash and then replay the fsync log on the next mount, we end up getting the parent directory's metadata inconsistent - its i_size still reflects the deleted hard link and has dangling index entries (with no matching inode reference entries). This prevents the directory from ever being deletable, as its i_size can never decrease to BTRFS_EMPTY_DIR_SIZE even if all of its children inodes are deleted, and the dangling index entries can never be removed (as they point to an inode that does not exist anymore). This is easy to reproduce with the following excerpt from the test case for xfstests that I just made: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create a test file with 2 hard links in the same directory. mkdir -p $SCRATCH_MNT/a/b echo "hello world" > $SCRATCH_MNT/a/b/foo ln $SCRATCH_MNT/a/b/foo $SCRATCH_MNT/a/b/bar # Make sure all metadata and data are durably persisted. sync # Now remove one of the hard links and fsync the inode. rm -f $SCRATCH_MNT/a/b/bar $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/a/b/foo # Simulate a crash/power loss. This makes sure the next mount # will see an fsync log and will replay that log. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Remove the last hard link of the file and attempt to remove its parent # directory - this failed in btrfs because the fsync log and replay code # didn't decrement the parent directory's i_size and left dangling directory # index entries - this made the btrfs rmdir implementation always fail with # the error -ENOTEMPTY. # # The dangling directory index entries were visible to user space, but it was # impossible to do anything on them (unlink, open, read, write, stat, etc) # because the inode they pointed to did not exist anymore. # # The parent directory's metadata inconsistency (stale index entries) was # also detected by btrfs' fsck tool, which is run automatically by the fstests # framework when the test finishes. The error message reported by fsck was: # # root 5 inode 259 errors 2001, no inode item, link count wrong # unresolved ref dir 258 index 3 namelen 3 name bar filetype 1 errors 4, no inode ref # rm -f $SCRATCH_MNT/a/b/* rmdir $SCRATCH_MNT/a/b rmdir $SCRATCH_MNT/a To fix this just make sure that after an unlink, if the inode is fsync'ed, he parent inode is fully logged in the fsync log. A test case for xfstests follows soon. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 67e5bf709dca..60e1d0083faa 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4273,6 +4273,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; + const struct dentry * const first_parent = parent; + const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > + last_committed); sb = inode->i_sb; @@ -4328,7 +4331,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - inode_only = LOG_INODE_EXISTS; while (1) { if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) break; @@ -4337,8 +4339,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (root != BTRFS_I(inode)->root) break; + /* + * On unlink we must make sure our immediate parent directory + * inode is fully logged. This is to prevent leaving dangling + * directory index entries and a wrong directory inode's i_size. + * Not doing so can result in a directory being impossible to + * delete after log replay (rmdir will always fail with error + * -ENOTEMPTY). + */ + if (did_unlink && parent == first_parent) + inode_only = LOG_INODE_ALL; + else + inode_only = LOG_INODE_EXISTS; + if (BTRFS_I(inode)->generation > - root->fs_info->last_trans_committed) { + root->fs_info->last_trans_committed || + inode_only == LOG_INODE_ALL) { ret = btrfs_log_inode(trans, root, inode, inode_only, 0, LLONG_MAX, ctx); if (ret) -- cgit v1.2.3 From 2c2c452b0cafdc27442796c526ac929443654b0b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 13 Jan 2015 16:40:04 +0000 Subject: Btrfs: fix fsync when extend references are added to an inode If we added an extended reference to an inode and fsync'ed it, the log replay code would make our inode have an incorrect link count, which was lower then the expected/correct count. This resulted in stale directory index entries after deleting some of the hard links, and any access to the dangling directory entries resulted in -ESTALE errors because the entries pointed to inode items that don't exist anymore. This is easy to reproduce with the test case I made for xfstests, and the bulk of that test is: _scratch_mkfs "-O extref" >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create a test file with 3001 hard links. This number is large enough to # make btrfs start using extrefs at some point even if the fs has the maximum # possible leaf/node size (64Kb). echo "hello world" > $SCRATCH_MNT/foo for i in `seq 1 3000`; do ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_`printf "%04d" $i` done # Make sure all metadata and data are durably persisted. sync # Add one more link to the inode that ends up being a btrfs extref and fsync # the inode. ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3001 $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Simulate a crash/power loss. This makes sure the next mount # will see an fsync log and will replay that log. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Now after the fsync log replay btrfs left our inode with a wrong link count N, # which was smaller than the correct link count M (N < M). # So after removing N hard links, the remaining M - N directory entries were # still visible to user space but it was impossible to do anything with them # because they pointed to an inode that didn't exist anymore. This resulted in # stale file handle errors (-ESTALE) when accessing those dentries for example. # # So remove all hard links except the first one and then attempt to read the # file, to verify we don't get an -ESTALE error when accessing the inodel # # The btrfs fsck tool also detected the incorrect inode link count and it # reported an error message like the following: # # root 5 inode 257 errors 2001, no inode item, link count wrong # unresolved ref dir 256 index 2978 namelen 13 name foo_link_2976 filetype 1 errors 4, no inode ref # # The fstests framework automatically calls fsck after a test is run, so we # don't need to call fsck explicitly here. rm -f $SCRATCH_MNT/foo_link_* cat $SCRATCH_MNT/foo status=0 exit So make sure an fsync always flushes the delayed inode item, so that the fsync log contains it (needed in order to trigger the link count fixup code) and fix the extref counting function, which always return -ENOENT to its caller (and made it assume there were always 0 extrefs). This issue has been present since the introduction of the extrefs feature (2012). A test case for xfstests follows soon. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 60e1d0083faa..533cdb02978a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1288,6 +1288,7 @@ static int count_inode_extrefs(struct btrfs_root *root, leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + cur_offset = 0; while (cur_offset < item_size) { extref = (struct btrfs_inode_extref *) (ptr + cur_offset); @@ -1303,7 +1304,7 @@ static int count_inode_extrefs(struct btrfs_root *root, } btrfs_release_path(path); - if (ret < 0) + if (ret < 0 && ret != -ENOENT) return ret; return nlink; } @@ -1395,9 +1396,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, nlink = ret; ret = count_inode_extrefs(root, inode, path); - if (ret == -ENOENT) - ret = 0; - if (ret < 0) goto out; @@ -3966,15 +3964,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.type = (u8)-1; max_key.offset = (u64)-1; - /* Only run delayed items if we are a dir or a new file */ + /* + * Only run delayed items if we are a dir or a new file. + * Otherwise commit the delayed inode only, which is needed in + * order for the log replay code to mark inodes for link count + * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). + */ if (S_ISDIR(inode->i_mode) || - BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { + BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) ret = btrfs_commit_inode_delayed_items(trans, inode); - if (ret) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - return ret; - } + else + ret = btrfs_commit_inode_delayed_inode(inode); + + if (ret) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + return ret; } mutex_lock(&BTRFS_I(inode)->log_mutex); -- cgit v1.2.3 From df8d116ffa379f3ef09d7ff28da0f0c921cc9fa1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 14 Jan 2015 01:52:25 +0000 Subject: Btrfs: fix fsync log replay for inodes with a mix of regular refs and extrefs If we have an inode with a large number of hard links, some of which may be extrefs, turn a regular ref into an extref, fsync the inode and then replay the fsync log (after a crash/reboot), we can endup with an fsync log that makes the replay code always fail with -EOVERFLOW when processing the inode's references. This is easy to reproduce with the test case I made for xfstests. Its steps are the following: _scratch_mkfs "-O extref" >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create a test file with 3001 hard links. This number is large enough to # make btrfs start using extrefs at some point even if the fs has the maximum # possible leaf/node size (64Kb). echo "hello world" > $SCRATCH_MNT/foo for i in `seq 1 3000`; do ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_`printf "%04d" $i` done # Make sure all metadata and data are durably persisted. sync # Now remove one link, add a new one with a new name, add another new one with # the same name as the one we just removed and fsync the inode. rm -f $SCRATCH_MNT/foo_link_0001 ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3001 ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_0001 rm -f $SCRATCH_MNT/foo_link_0002 ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3002 ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3003 $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Simulate a crash/power loss. This makes sure the next mount # will see an fsync log and will replay that log. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Check that the number of hard links is correct, we are able to remove all # the hard links and read the file's data. This is just to verify we don't # get stale file handle errors (due to dangling directory index entries that # point to inodes that no longer exist). echo "Link count: $(stat --format=%h $SCRATCH_MNT/foo)" [ -f $SCRATCH_MNT/foo ] || echo "Link foo is missing" for ((i = 1; i <= 3003; i++)); do name=foo_link_`printf "%04d" $i` if [ $i -eq 2 ]; then [ -f $SCRATCH_MNT/$name ] && echo "Link $name found" else [ -f $SCRATCH_MNT/$name ] || echo "Link $name is missing" fi done rm -f $SCRATCH_MNT/foo_link_* cat $SCRATCH_MNT/foo rm -f $SCRATCH_MNT/foo status=0 exit The fix is simply to correct the overflow condition when overwriting a reference item because it was wrong, trying to increase the item in the fs/subvol tree by an impossible amount. Also ensure that we don't insert one normal ref and one ext ref for the same dentry - this happened because processing a dir index entry from the parent in the log happened when the normal ref item was full, which made the logic insert an extref and later when the normal ref had enough room, it would be inserted again when processing the ref item from the child inode in the log. This issue has been present since the introduction of the extrefs feature (2012). A test case for xfstests follows soon. This test only passes if the previous patch titled "Btrfs: fix fsync when extend references are added to an inode" is applied too. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 533cdb02978a..a26658756537 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, insert: btrfs_release_path(path); /* try to insert the key into the destination tree */ + path->skip_release_on_error = 1; ret = btrfs_insert_empty_item(trans, root, path, key, item_size); + path->skip_release_on_error = 0; /* make sure any existing item is the correct size */ - if (ret == -EEXIST) { + if (ret == -EEXIST || ret == -EOVERFLOW) { u32 found_size; found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); @@ -844,7 +846,7 @@ out: static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, - char *name, int namelen) + const char *name, int namelen) { struct btrfs_path *path; struct btrfs_inode_ref *ref; @@ -1555,6 +1557,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, return ret; } +/* + * Return true if an inode reference exists in the log for the given name, + * inode and parent inode. + */ +static bool name_in_log_ref(struct btrfs_root *log_root, + const char *name, const int name_len, + const u64 dirid, const u64 ino) +{ + struct btrfs_key search_key; + + search_key.objectid = ino; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = dirid; + if (backref_in_log(log_root, &search_key, dirid, name, name_len)) + return true; + + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = btrfs_extref_hash(dirid, name, name_len); + if (backref_in_log(log_root, &search_key, dirid, name, name_len)) + return true; + + return false; +} + /* * take a single entry in a log directory item and replay it into * the subvolume. @@ -1665,10 +1691,17 @@ out: return ret; insert: + if (name_in_log_ref(root->log_root, name, name_len, + key->objectid, log_key.objectid)) { + /* The dentry will be added later. */ + ret = 0; + update_size = false; + goto out; + } btrfs_release_path(path); ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - if (ret && ret != -ENOENT) + if (ret && ret != -ENOENT && ret != -EEXIST) goto out; update_size = false; ret = 0; -- cgit v1.2.3 From a937b9791ec2ee71d5e303d2c02c8c1ad8abff35 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 12 Dec 2014 17:39:12 +0100 Subject: btrfs: kill btrfs_inode_*time helpers They just opencode taking address of the timespec member. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a26658756537..79f05bc9d6ec 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3276,19 +3276,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_sec(leaf, &item->atime, inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_nsec(leaf, &item->atime, inode->i_atime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_sec(leaf, &item->mtime, inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_nsec(leaf, &item->mtime, inode->i_mtime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_sec(leaf, &item->ctime, inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_nsec(leaf, &item->ctime, inode->i_ctime.tv_nsec, &token); btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), -- cgit v1.2.3 From 575849ecf5d103ca9bbf0a6b9e89eba335d4e750 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 11 Feb 2015 11:12:39 +0000 Subject: Btrfs: fix scheduler warning when syncing log We try to lock a mutex while the current task state is not TASK_RUNNING, which results in the following warning when CONFIG_DEBUG_LOCK_ALLOC=y: [30736.772501] ------------[ cut here ]------------ [30736.774545] WARNING: CPU: 9 PID: 19972 at kernel/sched/core.c:7300 __might_sleep+0x8b/0xa8() [30736.783453] do not call blocking ops when !TASK_RUNNING; state=2 set at [] prepare_to_wait+0x43/0x89 [30736.786261] Modules linked in: dm_flakey dm_mod crc32c_generic btrfs xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop parport_pc psmouse parport pcspkr microcode serio_raw evdev processor thermal_sys i2c_piix4 i2c_core button ext4 crc16 jbd2 mbcache sg sr_mod cdrom sd_mod ata_generic virtio_scsi floppy ata_piix libata virtio_pci virtio_ring e1000 virtio scsi_mod [30736.794323] CPU: 9 PID: 19972 Comm: fsstress Not tainted 3.19.0-rc7-btrfs-next-5+ #1 [30736.795821] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [30736.798788] 0000000000000009 ffff88042743fbd8 ffffffff814248ed ffff88043d32f2d8 [30736.800504] ffff88042743fc28 ffff88042743fc18 ffffffff81045338 0000000000000001 [30736.802131] ffffffff81064514 ffffffff817c52d1 000000000000026d 0000000000000000 [30736.803676] Call Trace: [30736.804256] [] dump_stack+0x4c/0x65 [30736.805245] [] warn_slowpath_common+0xa1/0xbb [30736.806360] [] ? __might_sleep+0x8b/0xa8 [30736.807391] [] warn_slowpath_fmt+0x46/0x48 [30736.808511] [] ? prepare_to_wait+0x43/0x89 [30736.809620] [] ? prepare_to_wait+0x43/0x89 [30736.810691] [] __might_sleep+0x8b/0xa8 [30736.811703] [] mutex_lock_nested+0x2f/0x3a0 [30736.812889] [] ? trace_hardirqs_on_caller+0x18f/0x1ab [30736.814138] [] ? trace_hardirqs_on+0xd/0xf [30736.819878] [] wait_for_writer.isra.12+0x91/0xaa [btrfs] [30736.821260] [] ? signal_pending_state+0x31/0x31 [30736.822410] [] btrfs_sync_log+0x160/0x947 [btrfs] [30736.823574] [] ? trace_hardirqs_on_caller+0x18f/0x1ab [30736.824847] [] ? trace_hardirqs_on+0xd/0xf [30736.825972] [] btrfs_sync_file+0x2b0/0x319 [btrfs] [30736.827684] [] vfs_fsync_range+0x21/0x23 [30736.828932] [] vfs_fsync+0x1c/0x1e [30736.829917] [] do_fsync+0x34/0x4e [30736.830862] [] SyS_fsync+0x10/0x14 [30736.831819] [] system_call_fastpath+0x12/0x17 [30736.832982] ---[ end trace c0b57df60d32ae5c ]--- Fix this my acquiring the mutex after calling finish_wait(), which sets the task's state to TASK_RUNNING. Signed-off-by: Filipe Manana Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 79f05bc9d6ec..7870bdba26b7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2448,8 +2448,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); if (atomic_read(&root->log_writers)) schedule(); - mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); + mutex_lock(&root->log_mutex); } } -- cgit v1.2.3 From 1a4bcf470c886b955adf36486f4c86f2441d85cb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 13 Feb 2015 12:30:56 +0000 Subject: Btrfs: fix fsync data loss after adding hard link to inode We have a scenario where after the fsync log replay we can lose file data that had been previously fsync'ed if we added an hard link for our inode and after that we sync'ed the fsync log (for example by fsync'ing some other file or directory). This is because when adding an hard link we updated the inode item in the log tree with an i_size value of 0. At that point the new inode item was in memory only and a subsequent fsync log replay would not make us lose the file data. However if after adding the hard link we sync the log tree to disk, by fsync'ing some other file or directory for example, we ended up losing the file data after log replay, because the inode item in the persisted log tree had an an i_size of zero. This is easy to reproduce, and the following excerpt from my test for xfstests shows this: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create one file with data and fsync it. # This made the btrfs fsync log persist the data and the inode metadata with # a correct inode->i_size (4096 bytes). $XFS_IO_PROG -f -c "pwrite -S 0xaa -b 4K 0 4K" -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io # Now add one hard link to our file. This made the btrfs code update the fsync # log, in memory only, with an inode metadata having a size of 0. ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link # Now force persistence of the fsync log to disk, for example, by fsyncing some # other file. touch $SCRATCH_MNT/bar $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/bar # Before a power loss or crash, we could read the 4Kb of data from our file as # expected. echo "File content before:" od -t x1 $SCRATCH_MNT/foo # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # After the fsync log replay, because the fsync log had a value of 0 for our # inode's i_size, we couldn't read anymore the 4Kb of data that we previously # wrote and fsync'ed. The size of the file became 0 after the fsync log replay. echo "File content after:" od -t x1 $SCRATCH_MNT/foo Another alternative test, that doesn't need to fsync an inode in the same transaction it was created, is: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create our test file with some data. $XFS_IO_PROG -f -c "pwrite -S 0xaa -b 8K 0 8K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Make sure the file is durably persisted. sync # Append some data to our file, to increase its size. $XFS_IO_PROG -f -c "pwrite -S 0xcc -b 4K 8K 4K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Fsync the file, so from this point on if a crash/power failure happens, our # new data is guaranteed to be there next time the fs is mounted. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Add one hard link to our file. This made btrfs write into the in memory fsync # log a special inode with generation 0 and an i_size of 0 too. Note that this # didn't update the inode in the fsync log on disk. ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link # Now make sure the in memory fsync log is durably persisted. # Creating and fsync'ing another file will do it. touch $SCRATCH_MNT/bar $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/bar # As expected, before the crash/power failure, we should be able to read the # 12Kb of file data. echo "File content before:" od -t x1 $SCRATCH_MNT/foo # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # After mounting the fs again, the fsync log was replayed. # The btrfs fsync log replay code didn't update the i_size of the persisted # inode because the inode item in the log had a special generation with a # value of 0 (and it couldn't know the correct i_size, since that inode item # had a 0 i_size too). This made the last 4Kb of file data inaccessible and # effectively lost. echo "File content after:" od -t x1 $SCRATCH_MNT/foo This isn't a new issue/regression. This problem has been around since the log tree code was added in 2008: Btrfs: Add a write ahead tree log to optimize synchronous operations (commit e02119d5a7b4396c5a872582fddc8bd6d305a70a) Test cases for xfstests follow soon. CC: Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7870bdba26b7..5f649bb32bec 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -490,8 +490,20 @@ insert: src_item = (struct btrfs_inode_item *)src_ptr; dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(eb, src_item) == 0) + if (btrfs_inode_generation(eb, src_item) == 0) { + struct extent_buffer *dst_eb = path->nodes[0]; + + if (S_ISREG(btrfs_inode_mode(eb, src_item)) && + S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) { + struct btrfs_map_token token; + u64 ino_size = btrfs_inode_size(eb, src_item); + + btrfs_init_map_token(&token); + btrfs_set_token_inode_size(dst_eb, dst_item, + ino_size, &token); + } goto no_copy; + } if (overwrite_root && S_ISDIR(btrfs_inode_mode(eb, src_item)) && @@ -3250,7 +3262,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, - struct inode *inode, int log_inode_only) + struct inode *inode, int log_inode_only, + u64 logged_isize) { struct btrfs_map_token token; @@ -3263,7 +3276,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * to say 'update this inode with these values' */ btrfs_set_token_inode_generation(leaf, item, 0, &token); - btrfs_set_token_inode_size(leaf, item, 0, &token); + btrfs_set_token_inode_size(leaf, item, logged_isize, &token); } else { btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, @@ -3315,7 +3328,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, return ret; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); - fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0); btrfs_release_path(path); return 0; } @@ -3324,7 +3337,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, struct btrfs_path *src_path, u64 *last_extent, - int start_slot, int nr, int inode_only) + int start_slot, int nr, int inode_only, + u64 logged_isize) { unsigned long src_offset; unsigned long dst_offset; @@ -3381,7 +3395,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, dst_path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, dst_path->nodes[0], inode_item, - inode, inode_only == LOG_INODE_EXISTS); + inode, inode_only == LOG_INODE_EXISTS, + logged_isize); } else { copy_extent_buffer(dst_path->nodes[0], src, dst_offset, src_offset, ins_sizes[i]); @@ -3933,6 +3948,33 @@ process: return ret; } +static int logged_inode_size(struct btrfs_root *log, struct inode *inode, + struct btrfs_path *path, u64 *size_ret) +{ + struct btrfs_key key; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + *size_ret = i_size_read(inode); + } else { + struct btrfs_inode_item *item; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + *size_ret = btrfs_inode_size(path->nodes[0], item); + } + + btrfs_release_path(path); + return 0; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -3970,6 +4012,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 logged_isize = 0; path = btrfs_alloc_path(); if (!path) @@ -4030,6 +4073,25 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { + if (inode_only == LOG_INODE_EXISTS) { + /* + * Make sure the new inode item we write to the log has + * the same isize as the current one (if it exists). + * This is necessary to prevent data loss after log + * replay, and also to prevent doing a wrong expanding + * truncate - for e.g. create file, write 4K into offset + * 0, fsync, write 4K into offset 4096, add hard link, + * fsync some other file (to sync log), power fail - if + * we use the inode's current i_size, after log replay + * we get a 8Kb file, with the last 4Kb extent as a hole + * (zeroes), as if an expanding truncate happened, + * instead of getting a file of 4Kb only. + */ + err = logged_inode_size(log, inode, path, + &logged_isize); + if (err) + goto out_unlock; + } if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags)) { clear_bit(BTRFS_INODE_COPY_EVERYTHING, @@ -4085,7 +4147,8 @@ again: } ret = copy_items(trans, inode, dst_path, path, &last_extent, - ins_start_slot, ins_nr, inode_only); + ins_start_slot, ins_nr, inode_only, + logged_isize); if (ret < 0) { err = ret; goto out_unlock; @@ -4109,7 +4172,7 @@ next_slot: if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, &last_extent, ins_start_slot, - ins_nr, inode_only); + ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; @@ -4130,7 +4193,8 @@ next_slot: } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, &last_extent, - ins_start_slot, ins_nr, inode_only); + ins_start_slot, ins_nr, inode_only, + logged_isize); if (ret < 0) { err = ret; goto out_unlock; -- cgit v1.2.3 From a742994aa2e271eb8cd8e043d276515ec858ed73 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 13 Feb 2015 16:56:14 +0000 Subject: Btrfs: don't remove extents and xattrs when logging new names If we are recording in the tree log that an inode has new names (new hard links were added), we would drop items, belonging to the inode, that we shouldn't: 1) When the flag BTRFS_INODE_COPY_EVERYTHING is set in the inode's runtime flags, we ended up dropping all the extent and xattr items that were previously logged. This was done only in memory, since logging a new name doesn't imply syncing the log; 2) When the flag BTRFS_INODE_COPY_EVERYTHING is set in the inode's runtime flags, we ended up dropping all the xattr items that were previously logged. Like the case before, this was done only in memory because logging a new name doesn't imply syncing the log. This led to some surprises in scenarios such as the following: 1) write some extents to an inode; 2) fsync the inode; 3) truncate the inode or delete/modify some of its xattrs 4) add a new hard link for that inode 5) fsync some other file, to force the log tree to be durably persisted 6) power failure happens The next time the fs is mounted, the fsync log replay code is executed, and the resulting file doesn't have the content it had when the last fsync against it was performed, instead if has a content matching what it had when the last transaction commit happened. So change the behaviour such that when a new name is logged, only the inode item and reference items are processed. This is easy to reproduce with the test I just made for xfstests, whose main body is: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create our test file with some data. $XFS_IO_PROG -f -c "pwrite -S 0xaa -b 8K 0 8K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Make sure the file is durably persisted. sync # Append some data to our file, to increase its size. $XFS_IO_PROG -f -c "pwrite -S 0xcc -b 4K 8K 4K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Fsync the file, so from this point on if a crash/power failure happens, our # new data is guaranteed to be there next time the fs is mounted. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Now shrink our file to 5000 bytes. $XFS_IO_PROG -c "truncate 5000" $SCRATCH_MNT/foo # Now do an expanding truncate to a size larger than what we had when we last # fsync'ed our file. This is just to verify that after power failure and # replaying the fsync log, our file matches what it was when we last fsync'ed # it - 12Kb size, first 8Kb of data had a value of 0xaa and the last 4Kb of # data had a value of 0xcc. $XFS_IO_PROG -c "truncate 32K" $SCRATCH_MNT/foo # Add one hard link to our file. This made btrfs drop all of our file's # metadata from the fsync log, including the metadata relative to the # extent we just wrote and fsync'ed. This change was made only to the fsync # log in memory, so adding the hard link alone doesn't change the persisted # fsync log. This happened because the previous truncates set the runtime # flag BTRFS_INODE_NEEDS_FULL_SYNC in the btrfs inode structure. ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link # Now make sure the in memory fsync log is durably persisted. # Creating and fsync'ing another file will do it. # After this our persisted fsync log will no longer have metadata for our file # foo that points to the extent we wrote and fsync'ed before. touch $SCRATCH_MNT/bar $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/bar # As expected, before the crash/power failure, we should be able to see a file # with a size of 32Kb, with its first 5000 bytes having the value 0xaa and all # the remaining bytes with value 0x00. echo "File content before:" od -t x1 $SCRATCH_MNT/foo # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # After mounting the fs again, the fsync log was replayed. # The expected result is to see a file with a size of 12Kb, with its first 8Kb # of data having the value 0xaa and its last 4Kb of data having a value of 0xcc. # The btrfs bug used to leave the file as it used te be as of the last # transaction commit - that is, with a size of 8Kb with all bytes having a # value of 0xaa. echo "File content after:" od -t x1 $SCRATCH_MNT/foo The test case for xfstests follows soon. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5f649bb32bec..f96996a1b70c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4069,8 +4069,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; - if (inode_only == LOG_INODE_EXISTS) - max_key_type = BTRFS_XATTR_ITEM_KEY; + if (inode_only == LOG_INODE_EXISTS) { + max_key_type = BTRFS_INODE_EXTREF_KEY; + max_key.type = max_key_type; + } ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { if (inode_only == LOG_INODE_EXISTS) { @@ -4092,18 +4094,31 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (err) goto out_unlock; } - if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags); - ret = btrfs_truncate_inode_items(trans, log, - inode, 0, 0); - } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags) || + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + if (inode_only == LOG_INODE_EXISTS) { + max_key.type = BTRFS_INODE_EXTREF_KEY; + ret = drop_objectid_items(trans, log, path, ino, + max_key.type); + } else { + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); + ret = btrfs_truncate_inode_items(trans, log, + inode, 0, 0); + } + } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags) || inode_only == LOG_INODE_EXISTS) { - if (inode_only == LOG_INODE_ALL) + if (inode_only == LOG_INODE_ALL) { + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); fast_search = true; - max_key.type = BTRFS_XATTR_ITEM_KEY; + max_key.type = BTRFS_XATTR_ITEM_KEY; + } else { + max_key.type = BTRFS_INODE_EXTREF_KEY; + } ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { -- cgit v1.2.3