From 1ec9a1ae1e30c733077c0b288c4301b66b7a81f2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 10 Feb 2016 10:42:25 +0000 Subject: Btrfs: fix unreplayable log after snapshot delete + parent dir fsync If we delete a snapshot, fsync its parent directory and crash/power fail before the next transaction commit, on the next mount when we attempt to replay the log tree of the root containing the parent directory we will fail and prevent the filesystem from mounting, which is solvable by wiping out the log trees with the btrfs-zero-log tool but very inconvenient as we will lose any data and metadata fsynced before the parent directory was fsynced. For example: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt $ mkdir /mnt/testdir $ btrfs subvolume snapshot /mnt /mnt/testdir/snap $ btrfs subvolume delete /mnt/testdir/snap $ xfs_io -c "fsync" /mnt/testdir < crash / power failure and reboot > $ mount /dev/sdc /mnt mount: mount(2) failed: No such file or directory And in dmesg/syslog we get the following message and trace: [192066.361162] BTRFS info (device dm-0): failed to delete reference to snap, inode 257 parent 257 [192066.363010] ------------[ cut here ]------------ [192066.365268] WARNING: CPU: 4 PID: 5130 at fs/btrfs/inode.c:3986 __btrfs_unlink_inode+0x17a/0x354 [btrfs]() [192066.367250] BTRFS: Transaction aborted (error -2) [192066.368401] Modules linked in: btrfs dm_flakey dm_mod ppdev sha256_generic xor raid6_pq hmac drbg ansi_cprng aesni_intel acpi_cpufreq tpm_tis aes_x86_64 tpm ablk_helper evdev cryptd sg parport_pc i2c_piix4 psmouse lrw parport i2c_core pcspkr gf128mul processor serio_raw glue_helper button loop autofs4 ext4 crc16 mbcache jbd2 sd_mod sr_mod cdrom ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring crc32c_intel scsi_mod e1000 virtio floppy [last unloaded: btrfs] [192066.377154] CPU: 4 PID: 5130 Comm: mount Tainted: G W 4.4.0-rc6-btrfs-next-20+ #1 [192066.378875] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [192066.380889] 0000000000000000 ffff880143923670 ffffffff81257570 ffff8801439236b8 [192066.382561] ffff8801439236a8 ffffffff8104ec07 ffffffffa039dc2c 00000000fffffffe [192066.384191] ffff8801ed31d000 ffff8801b9fc9c88 ffff8801086875e0 ffff880143923710 [192066.385827] Call Trace: [192066.386373] [] dump_stack+0x4e/0x79 [192066.387387] [] warn_slowpath_common+0x99/0xb2 [192066.388429] [] ? __btrfs_unlink_inode+0x17a/0x354 [btrfs] [192066.389236] [] warn_slowpath_fmt+0x48/0x50 [192066.389884] [] __btrfs_unlink_inode+0x17a/0x354 [btrfs] [192066.390621] [] ? iput+0xb0/0x266 [192066.391200] [] btrfs_unlink_inode+0x1c/0x3d [btrfs] [192066.391930] [] check_item_in_log+0x1fe/0x29b [btrfs] [192066.392715] [] replay_dir_deletes+0x167/0x1cf [btrfs] [192066.393510] [] replay_one_buffer+0x417/0x570 [btrfs] [192066.394241] [] walk_up_log_tree+0x10e/0x1dc [btrfs] [192066.394958] [] walk_log_tree+0xa5/0x190 [btrfs] [192066.395628] [] btrfs_recover_log_trees+0x239/0x32c [btrfs] [192066.396790] [] ? replay_one_extent+0x50a/0x50a [btrfs] [192066.397891] [] open_ctree+0x1d8b/0x2167 [btrfs] [192066.398897] [] btrfs_mount+0x5ef/0x729 [btrfs] [192066.399823] [] ? trace_hardirqs_on+0xd/0xf [192066.400739] [] ? lockdep_init_map+0xb9/0x1b3 [192066.401700] [] mount_fs+0x67/0x131 [192066.402482] [] vfs_kern_mount+0x6c/0xde [192066.403930] [] btrfs_mount+0x1cb/0x729 [btrfs] [192066.404831] [] ? trace_hardirqs_on+0xd/0xf [192066.405726] [] ? lockdep_init_map+0xb9/0x1b3 [192066.406621] [] mount_fs+0x67/0x131 [192066.407401] [] vfs_kern_mount+0x6c/0xde [192066.408247] [] do_mount+0x893/0x9d2 [192066.409047] [] ? strndup_user+0x3f/0x8c [192066.409842] [] SyS_mount+0x75/0xa1 [192066.410621] [] entry_SYSCALL_64_fastpath+0x12/0x6b [192066.411572] ---[ end trace 2de42126c1e0a0f0 ]--- [192066.412344] BTRFS: error (device dm-0) in __btrfs_unlink_inode:3986: errno=-2 No such entry [192066.413748] BTRFS: error (device dm-0) in btrfs_replay_log:2464: errno=-2 No such entry (Failed to recover log tree) [192066.415458] BTRFS error (device dm-0): cleaner transaction attach returned -30 [192066.444613] BTRFS: open_ctree failed This happens because when we are replaying the log and processing the directory entry pointing to the snapshot in the subvolume tree, we treat its btrfs_dir_item item as having a location with a key type matching BTRFS_INODE_ITEM_KEY, which is wrong because the type matches BTRFS_ROOT_ITEM_KEY and therefore must be processed differently, as the object id refers to a root number and not to an inode in the root containing the parent directory. So fix this by triggering a transaction commit if an fsync against the parent directory is requested after deleting a snapshot. This is the simplest approach for a rare use case. Some alternative that avoids the transaction commit would require more code to explicitly delete the snapshot at log replay time (factoring out common code from ioctl.c: btrfs_ioctl_snap_destroy()), special care at fsync time to remove the log tree of the snapshot's root from the log root of the root of tree roots, amongst other steps. A test case for xfstests that triggers the issue follows. seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { _cleanup_flakey cd / rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/dmflakey # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_dm_target flakey _require_metadata_journaling $SCRATCH_DEV rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 _init_flakey _mount_flakey # Create a snapshot at the root of our filesystem (mount point path), delete it, # fsync the mount point path, crash and mount to replay the log. This should # succeed and after the filesystem is mounted the snapshot should not be visible # anymore. _run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT $SCRATCH_MNT/snap1 _run_btrfs_util_prog subvolume delete $SCRATCH_MNT/snap1 $XFS_IO_PROG -c "fsync" $SCRATCH_MNT _flakey_drop_and_remount [ -e $SCRATCH_MNT/snap1 ] && \ echo "Snapshot snap1 still exists after log replay" # Similar scenario as above, but this time the snapshot is created inside a # directory and not directly under the root (mount point path). mkdir $SCRATCH_MNT/testdir _run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT $SCRATCH_MNT/testdir/snap2 _run_btrfs_util_prog subvolume delete $SCRATCH_MNT/testdir/snap2 $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir _flakey_drop_and_remount [ -e $SCRATCH_MNT/testdir/snap2 ] && \ echo "Snapshot snap2 still exists after log replay" _unmount_flakey echo "Silence is golden" status=0 exit Signed-off-by: Filipe Manana Tested-by: Liu Bo Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 978c3a810893..43c6781af654 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5497,6 +5497,21 @@ record: BTRFS_I(dir)->last_unlink_trans = trans->transid; } +/* + * Make sure that if someone attempts to fsync the parent directory of a deleted + * snapshot, it ends up triggering a transaction commit. This is to guarantee + * that after replaying the log tree of the parent directory's root we will not + * see the snapshot anymore and at log replay time we will not see any log tree + * corresponding to the deleted snapshot's root, which could lead to replaying + * it after replaying the log tree of the parent directory (which would replay + * the snapshot delete operation). + */ +void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, + struct inode *dir) +{ + BTRFS_I(dir)->last_unlink_trans = trans->transid; +} + /* * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. -- cgit v1.2.3 From 2be63d5ce929603d4e7cedabd9e992eb34a0ff95 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 12 Feb 2016 11:34:23 +0000 Subject: Btrfs: fix file loss on log replay after renaming a file and fsync We have two cases where we end up deleting a file at log replay time when we should not. For this to happen the file must have been renamed and a directory inode must have been fsynced/logged. Two examples that exercise these two cases are listed below. Case 1) $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ mkdir -p /mnt/a/b $ mkdir /mnt/c $ touch /mnt/a/b/foo $ sync $ mv /mnt/a/b/foo /mnt/c/ # Create file bar just to make sure the fsync on directory a/ does # something and it's not a no-op. $ touch /mnt/a/bar $ xfs_io -c "fsync" /mnt/a < power fail / crash > The next time the filesystem is mounted, the log replay procedure deletes file foo. Case 2) $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ mkdir /mnt/a $ mkdir /mnt/b $ mkdir /mnt/c $ touch /mnt/a/foo $ ln /mnt/a/foo /mnt/b/foo_link $ touch /mnt/b/bar $ sync $ unlink /mnt/b/foo_link $ mv /mnt/b/bar /mnt/c/ $ xfs_io -c "fsync" /mnt/a/foo < power fail / crash > The next time the filesystem is mounted, the log replay procedure deletes file bar. The reason why the files are deleted is because when we log inodes other then the fsync target inode, we ignore their last_unlink_trans value and leave the log without enough information to later replay the rename operations. So we need to look at the last_unlink_trans values and fallback to a transaction commit if they are greater than the id of the last committed transaction. So fix this by looking at the last_unlink_trans values and fallback to transaction commits when needed. Also, when logging other inodes (for case 1 we logged descendants of the fsync target inode while for case 2 we logged ascendants) we need to care about concurrent tasks updating the last_unlink_trans of inodes we are logging (which was already an existing problem in check_parent_dirs_for_sync()). Since we can not acquire their inode mutex (vfs' struct inode ->i_mutex), as that causes deadlocks with other concurrent operations that acquire the i_mutex of 2 inodes (other fsyncs or renames for example), we need to serialize on the log_mutex of the inode we are logging. A task setting a new value for an inode's last_unlink_trans must acquire the inode's log_mutex and it must do this update before doing the actual unlink operation (which is already the case except when deleting a snapshot). Conversely the task logging the inode must first log the inode and then check the inode's last_unlink_trans value while holding its log_mutex, as if its value is not greater then the id of the last committed transaction it means it logged a safe state of the inode's items, while if its value is not smaller then the id of the last committed transaction it means the inode state it has logged might not be safe (the concurrent task might have just updated last_unlink_trans but hasn't done yet the unlink operation) and therefore a transaction commit must be done. Test cases for xfstests follow in separate patches. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 67 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 43c6781af654..9f6372dd0eab 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4771,6 +4771,42 @@ out_unlock: return err; } +/* + * Check if we must fallback to a transaction commit when logging an inode. + * This must be called after logging the inode and is used only in the context + * when fsyncing an inode requires the need to log some other inode - in which + * case we can't lock the i_mutex of each other inode we need to log as that + * can lead to deadlocks with concurrent fsync against other inodes (as we can + * log inodes up or down in the hierarchy) or rename operations for example. So + * we take the log_mutex of the inode after we have logged it and then check for + * its last_unlink_trans value - this is safe because any task setting + * last_unlink_trans must take the log_mutex and it must do this before it does + * the actual unlink operation, so if we do this check before a concurrent task + * sets last_unlink_trans it means we've logged a consistent version/state of + * all the inode items, otherwise we are not sure and must do a transaction + * commit (the concurrent task migth have only updated last_unlink_trans before + * we logged the inode or it might have also done the unlink). + */ +static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + bool ret = false; + + mutex_lock(&BTRFS_I(inode)->log_mutex); + if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) { + /* + * Make sure any commits to the log are forced to be full + * commits. + */ + btrfs_set_log_full_commit(fs_info, trans); + ret = true; + } + mutex_unlock(&BTRFS_I(inode)->log_mutex); + + return ret; +} + /* * follow the dentry parent pointers up the chain and see if any * of the directories in it require a full commit before they can @@ -4784,7 +4820,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, u64 last_committed) { int ret = 0; - struct btrfs_root *root; struct dentry *old_parent = NULL; struct inode *orig_inode = inode; @@ -4816,14 +4851,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, BTRFS_I(inode)->logged_trans = trans->transid; smp_mb(); - if (BTRFS_I(inode)->last_unlink_trans > last_committed) { - root = BTRFS_I(inode)->root; - - /* - * make sure any commits to the log are forced - * to be full commits - */ - btrfs_set_log_full_commit(root->fs_info, trans); + if (btrfs_must_commit_transaction(trans, inode)) { ret = 1; break; } @@ -4982,6 +5010,9 @@ process_leaf: btrfs_release_path(path); ret = btrfs_log_inode(trans, root, di_inode, log_mode, 0, LLONG_MAX, ctx); + if (!ret && + btrfs_must_commit_transaction(trans, di_inode)) + ret = 1; iput(di_inode); if (ret) goto next_dir_inode; @@ -5096,6 +5127,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, dir_inode, LOG_INODE_ALL, 0, LLONG_MAX, ctx); + if (!ret && + btrfs_must_commit_transaction(trans, dir_inode)) + ret = 1; iput(dir_inode); if (ret) goto out; @@ -5447,6 +5481,9 @@ error: * They revolve around files there were unlinked from the directory, and * this function updates the parent directory so that a full commit is * properly done if it is fsync'd later after the unlinks are done. + * + * Must be called before the unlink operations (updates to the subvolume tree, + * inodes, etc) are done. */ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct inode *dir, struct inode *inode, @@ -5462,8 +5499,11 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * into the file. When the file is logged we check it and * don't log the parents if the file is fully on disk. */ - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode)) { + mutex_lock(&BTRFS_I(inode)->log_mutex); BTRFS_I(inode)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); + } /* * if this directory was already logged any new @@ -5494,7 +5534,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, return; record: + mutex_lock(&BTRFS_I(dir)->log_mutex); BTRFS_I(dir)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(dir)->log_mutex); } /* @@ -5505,11 +5547,16 @@ record: * corresponding to the deleted snapshot's root, which could lead to replaying * it after replaying the log tree of the parent directory (which would replay * the snapshot delete operation). + * + * Must be called before the actual snapshot destroy operation (updates to the + * parent root and tree of tree roots trees, etc) are done. */ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct inode *dir) { + mutex_lock(&BTRFS_I(dir)->log_mutex); BTRFS_I(dir)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(dir)->log_mutex); } /* -- cgit v1.2.3 From 5e33a2bd7ca7fa687fb0965869196eea6815d1f3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 25 Feb 2016 23:19:38 +0000 Subject: Btrfs: do not collect ordered extents when logging that inode exists When logging that an inode exists, for example as part of a directory fsync operation, we were collecting any ordered extents for the inode but we ended up doing nothing with them except tagging them as processed, by setting the flag BTRFS_ORDERED_LOGGED on them, which prevented a subsequent fsync of that inode (using the LOG_INODE_ALL mode) from collecting and processing them. This created a time window where a second fsync against the inode, using the fast path, ended up not logging the checksums for the new extents but it logged the extents since they were part of the list of modified extents. This happened because the ordered extents were not collected and checksums were not yet added to the csum tree - the ordered extents have not gone through btrfs_finish_ordered_io() yet (which is where we add them to the csum tree by calling inode.c:add_pending_csums()). So fix this by not collecting an inode's ordered extents if we are logging it with the LOG_INODE_EXISTS mode. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9f6372dd0eab..9d2e8ecc5382 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4500,7 +4500,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - btrfs_get_logged_extents(inode, &logged_list, start, end); + /* + * Collect ordered extents only if we are logging data. This is to + * ensure a subsequent request to log this inode in LOG_INODE_ALL mode + * will process the ordered extents if they still exists at the time, + * because when we collect them we test and set for the flag + * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the + * same ordered extents. The consequence for the LOG_INODE_ALL log mode + * not processing the ordered extents is that we end up logging the + * corresponding file extent items, based on the extent maps in the + * inode's extent_map_tree's modified_list, without logging the + * respective checksums (since the may still be only attached to the + * ordered extents and have not been inserted in the csum tree by + * btrfs_finish_ordered_io() yet). + */ + if (inode_only == LOG_INODE_ALL) + btrfs_get_logged_extents(inode, &logged_list, start, end); /* * a brute force approach to making sure we get the most uptodate -- cgit v1.2.3 From ebb8765b2ded869b75bf5154b048119eb52571f7 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 10 Mar 2016 17:26:59 +0800 Subject: btrfs: move btrfs_compression_type to compression.h So that its better organized. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9d2e8ecc5382..eafdb965c206 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -26,6 +26,7 @@ #include "print-tree.h" #include "backref.h" #include "hash.h" +#include "compression.h" /* magic values for the inode_only field in btrfs_log_inode: * -- cgit v1.2.3 From bb7ab3b92e46da06b580c6f83abe7894dc449cca Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Fri, 4 Mar 2016 11:23:12 -0800 Subject: btrfs: Fix misspellings in comments. Signed-off-by: Adam Buchbinder Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index eafdb965c206..24d03c751149 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1046,7 +1046,7 @@ again: /* * NOTE: we have searched root tree and checked the - * coresponding ref, it does not need to check again. + * corresponding ref, it does not need to check again. */ *search_done = 1; } -- cgit v1.2.3