From f588d72bd95f748849685412b1f0c7959ca228cf Mon Sep 17 00:00:00 2001
From: Dai Ngo <dai.ngo@oracle.com>
Date: Mon, 18 Sep 2023 23:30:20 -0700
Subject: nfs42: client needs to strip file mode's suid/sgid bit after ALLOCATE
 op

The Linux NFS server strips the SUID and SGID from the file mode
on ALLOCATE op.

Modify _nfs42_proc_fallocate to add NFS_INO_REVAL_FORCED to
nfs_set_cache_invalid's argument to force update of the file
mode suid/sgid bit.

Suggested-by: Trond Myklebust <trondmy@hammerspace.com>
Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfs42proc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 063e00aff87e..28704f924612 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -81,7 +81,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
 	if (status == 0) {
 		if (nfs_should_remove_suid(inode)) {
 			spin_lock(&inode->i_lock);
-			nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
+			nfs_set_cache_invalid(inode,
+				NFS_INO_REVAL_FORCED | NFS_INO_INVALID_MODE);
 			spin_unlock(&inode->i_lock);
 		}
 		status = nfs_post_op_update_inode_force_wcc(inode,
-- 
cgit v1.2.3


From 6a6d4644ce935ddec4f76223ac0ca68da56bd2d3 Mon Sep 17 00:00:00 2001
From: Scott Mayhew <smayhew@redhat.com>
Date: Wed, 11 Oct 2023 10:43:26 -0400
Subject: NFS: Fix potential oops in nfs_inode_remove_request()

Once a folio's private data has been cleared, it's possible for another
process to clear the folio->mapping (e.g. via invalidate_complete_folio2
or evict_mapping_folio), so it wouldn't be safe to call
nfs_page_to_inode() after that.

Fixes: 0c493b5cf16e ("NFS: Convert buffered writes to use folios")
Signed-off-by: Scott Mayhew <smayhew@redhat.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/write.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7720b5e43014..9d82d50ce0b1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -788,6 +788,8 @@ static void nfs_inode_add_request(struct nfs_page *req)
  */
 static void nfs_inode_remove_request(struct nfs_page *req)
 {
+	struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req));
+
 	if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
 		struct folio *folio = nfs_page_to_folio(req->wb_head);
 		struct address_space *mapping = folio_file_mapping(folio);
@@ -802,7 +804,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 	}
 
 	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
-		atomic_long_dec(&NFS_I(nfs_page_to_inode(req))->nrequests);
+		atomic_long_dec(&nfsi->nrequests);
 		nfs_release_request(req);
 	}
 }
-- 
cgit v1.2.3


From 1aee9158bc978f91701c5992e395efbc6da2de3c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 14 Oct 2023 21:34:40 -0400
Subject: nfsd: lock_rename() needs both directories to live on the same fs

... checking that after lock_rename() is too late.  Incidentally,
NFSv2 had no nfserr_xdev...

Fixes: aa387d6ce153 "nfsd: fix EXDEV checking in rename"
Cc: stable@vger.kernel.org # v3.9+
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/vfs.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 48260cf68fde..02f5fcaad03f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1788,6 +1788,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
 		goto out;
 
+	err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
+	if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
+		goto out;
+	if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
+		goto out;
+
 retry:
 	host_err = fh_want_write(ffhp);
 	if (host_err) {
@@ -1823,12 +1829,6 @@ retry:
 	if (ndentry == trap)
 		goto out_dput_new;
 
-	host_err = -EXDEV;
-	if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
-		goto out_dput_new;
-	if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
-		goto out_dput_new;
-
 	if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) &&
 	    nfsd_has_cached_files(ndentry)) {
 		close_cached = true;
-- 
cgit v1.2.3


From f63955721a8020e979b99cc417dcb6da3106aa24 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 8 Oct 2023 14:20:19 -0400
Subject: pNFS: Fix a hang in nfs4_evict_inode()

We are not allowed to call pnfs_mark_matching_lsegs_return() without
also holding a reference to the layout header, since doing so could lead
to the reference count going to zero when we call
pnfs_layout_remove_lseg(). This again can lead to a hang when we get to
nfs4_evict_inode() and are unable to clear the layout pointer.

pnfs_layout_return_unused_byserver() is guilty of this behaviour, and
has been seen to trigger the refcount warning prior to a hang.

Fixes: b6d49ecd1081 ("NFSv4: Fix a pNFS layout related use-after-free race when freeing the inode")
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/pnfs.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 306cba0b9e69..84343aefbbd6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2634,31 +2634,44 @@ pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
 	return mode == 0;
 }
 
-static int
-pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data)
+static int pnfs_layout_return_unused_byserver(struct nfs_server *server,
+					      void *data)
 {
 	const struct pnfs_layout_range *range = data;
+	const struct cred *cred;
 	struct pnfs_layout_hdr *lo;
 	struct inode *inode;
+	nfs4_stateid stateid;
+	enum pnfs_iomode iomode;
+
 restart:
 	rcu_read_lock();
 	list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
-		if (!pnfs_layout_can_be_returned(lo) ||
+		inode = lo->plh_inode;
+		if (!inode || !pnfs_layout_can_be_returned(lo) ||
 		    test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
 			continue;
-		inode = lo->plh_inode;
 		spin_lock(&inode->i_lock);
-		if (!pnfs_should_return_unused_layout(lo, range)) {
+		if (!lo->plh_inode ||
+		    !pnfs_should_return_unused_layout(lo, range)) {
 			spin_unlock(&inode->i_lock);
 			continue;
 		}
+		pnfs_get_layout_hdr(lo);
+		pnfs_set_plh_return_info(lo, range->iomode, 0);
+		if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+						    range, 0) != 0 ||
+		    !pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) {
+			spin_unlock(&inode->i_lock);
+			rcu_read_unlock();
+			pnfs_put_layout_hdr(lo);
+			cond_resched();
+			goto restart;
+		}
 		spin_unlock(&inode->i_lock);
-		inode = pnfs_grab_inode_layout_hdr(lo);
-		if (!inode)
-			continue;
 		rcu_read_unlock();
-		pnfs_mark_layout_for_return(inode, range);
-		iput(inode);
+		pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
+		pnfs_put_layout_hdr(lo);
 		cond_resched();
 		goto restart;
 	}
-- 
cgit v1.2.3


From e1c6cfbb3bd1377e2ddcbe06cf8fb1ec323ea7d3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 8 Oct 2023 14:28:46 -0400
Subject: pNFS/flexfiles: Check the layout validity in
 ff_layout_mirror_prepare_stats

Ensure that we check the layout pointer and its validity before
dereferencing it in ff_layout_mirror_prepare_stats.

Fixes: 08e2e5bc6c9a ("pNFS/flexfiles: Clean up layoutstats")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index a1dc33864906..ef817a0475ff 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -2520,9 +2520,9 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
 	return i;
 }
 
-static int
-ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
+static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
 {
+	struct pnfs_layout_hdr *lo;
 	struct nfs4_flexfile_layout *ff_layout;
 	const int dev_count = PNFS_LAYOUTSTATS_MAXDEV;
 
@@ -2533,11 +2533,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
 		return -ENOMEM;
 
 	spin_lock(&args->inode->i_lock);
-	ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
-	args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
-						       &args->devinfo[0],
-						       dev_count,
-						       NFS4_FF_OP_LAYOUTSTATS);
+	lo = NFS_I(args->inode)->layout;
+	if (lo && pnfs_layout_is_valid(lo)) {
+		ff_layout = FF_LAYOUT_FROM_HDR(lo);
+		args->num_dev = ff_layout_mirror_prepare_stats(
+			&ff_layout->generic_hdr, &args->devinfo[0], dev_count,
+			NFS4_FF_OP_LAYOUTSTATS);
+	} else
+		args->num_dev = 0;
 	spin_unlock(&args->inode->i_lock);
 	if (!args->num_dev) {
 		kfree(args->devinfo);
-- 
cgit v1.2.3


From 379e4adfddd6a2f95a4f2029b8ddcbacf92b21f9 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Mon, 9 Oct 2023 10:59:01 -0400
Subject: NFSv4.1: fixup use EXCHGID4_FLAG_USE_PNFS_DS for DS server

This patch fixes commit 51d674a5e488 "NFSv4.1: use
EXCHGID4_FLAG_USE_PNFS_DS for DS server". The purpose of that
commit was to mark EXCHANGE_ID to the DS with the appropriate
flag.

However, a connection to the MDS can return both
EXCHGID4_FLAG_USE_PNFS_DS and EXCHGID4_FLAG_USE_PNFS_MDS set, but the
previous patch would only remember USE_PNFS_DS and then send just that
flag to the MDS in the 2nd EXCHANGE_ID.

Instead, just mark the pnfs path exclusively.

Fixes: 51d674a5e488 ("NFSv4.1: use EXCHGID4_FLAG_USE_PNFS_DS for DS server")
Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfs4proc.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7016eaadf555..5ee283eb9660 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8870,8 +8870,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre
 	/* Save the EXCHANGE_ID verifier session trunk tests */
 	memcpy(clp->cl_confirm.data, argp->verifier.data,
 	       sizeof(clp->cl_confirm.data));
-	if (resp->flags & EXCHGID4_FLAG_USE_PNFS_DS)
-		set_bit(NFS_CS_DS, &clp->cl_flags);
 out:
 	trace_nfs4_exchange_id(clp, status);
 	rpc_put_task(task);
-- 
cgit v1.2.3


From 97ac489775f26acfd46a8a60c2f84ce7cc79fa4b Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 18 Oct 2023 12:59:56 +0300
Subject: fanotify: limit reporting of event with non-decodeable file handles

Commit a95aef69a740 ("fanotify: support reporting non-decodeable file
handles") merged in v6.5-rc1, added the ability to use an fanotify group
with FAN_REPORT_FID mode to watch filesystems that do not support nfs
export, but do know how to encode non-decodeable file handles, with the
newly introduced AT_HANDLE_FID flag.

At the time that this commit was merged, there were no filesystems
in-tree with those traits.

Commit 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles"),
merged in v6.6-rc1, added this trait to overlayfs, thus allowing fanotify
watching of overlayfs with FAN_REPORT_FID mode.

In retrospect, allowing an fanotify filesystem/mount mark on such
filesystem in FAN_REPORT_FID mode will result in getting events with
file handles, without the ability to resolve the filesystem objects from
those file handles (i.e. no open_by_handle_at() support).

For v6.6, the safer option would be to allow this mode for inode marks
only, where the caller has the opportunity to use name_to_handle_at() at
the time of setting the mark. In the future we can revise this decision.

Fixes: a95aef69a740 ("fanotify: support reporting non-decodeable file handles")
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20231018100000.2453965-2-amir73il@gmail.com>
---
 fs/notify/fanotify/fanotify_user.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index f69c451018e3..62fe0b679e58 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1585,16 +1585,25 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
 }
 
 /* Check if filesystem can encode a unique fid */
-static int fanotify_test_fid(struct dentry *dentry)
+static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
 {
+	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+	const struct export_operations *nop = dentry->d_sb->s_export_op;
+
+	/*
+	 * We need to make sure that the filesystem supports encoding of
+	 * file handles so user can use name_to_handle_at() to compare fids
+	 * reported with events to the file handle of watched objects.
+	 */
+	if (!nop)
+		return -EOPNOTSUPP;
+
 	/*
-	 * We need to make sure that the file system supports at least
-	 * encoding a file handle so user can use name_to_handle_at() to
-	 * compare fid returned with event to the file handle of watched
-	 * objects. However, even the relaxed AT_HANDLE_FID flag requires
-	 * at least empty export_operations for ecoding unique file ids.
+	 * For sb/mount mark, we also need to make sure that the filesystem
+	 * supports decoding file handles, so user has a way to map back the
+	 * reported fids to filesystem objects.
 	 */
-	if (!dentry->d_sb->s_export_op)
+	if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry)
 		return -EOPNOTSUPP;
 
 	return 0;
@@ -1812,7 +1821,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 		if (ret)
 			goto path_put_and_out;
 
-		ret = fanotify_test_fid(path.dentry);
+		ret = fanotify_test_fid(path.dentry, flags);
 		if (ret)
 			goto path_put_and_out;
 
-- 
cgit v1.2.3


From 3ac974796e5d94509b85a403449132ea660127c2 Mon Sep 17 00:00:00 2001
From: Jan Stancek <jstancek@redhat.com>
Date: Thu, 19 Oct 2023 09:41:36 -0700
Subject: iomap: fix short copy in iomap_write_iter()

Starting with commit 5d8edfb900d5 ("iomap: Copy larger chunks from
userspace"), iomap_write_iter() can get into endless loop. This can
be reproduced with LTP writev07 which uses partially valid iovecs:
        struct iovec wr_iovec[] = {
                { buffer, 64 },
                { bad_addr, 64 },
                { buffer + 64, 64 },
                { buffer + 64 * 2, 64 },
        };

commit bc1bb416bbb9 ("generic_perform_write()/iomap_write_actor():
saner logics for short copy") previously introduced the logic, which
made short copy retry in next iteration with amount of "bytes" it
managed to copy:

                if (unlikely(status == 0)) {
                        /*
                         * A short copy made iomap_write_end() reject the
                         * thing entirely.  Might be memory poisoning
                         * halfway through, might be a race with munmap,
                         * might be severe memory pressure.
                         */
                        if (copied)
                                bytes = copied;

However, since 5d8edfb900d5 "bytes" is no longer carried into next
iteration, because it is now always initialized at the beginning of
the loop. And for iov_iter_count < PAGE_SIZE, "bytes" ends up with
same value as previous iteration, making the loop retry same copy
over and over, which leads to writev07 testcase hanging.

Make next iteration retry with amount of bytes we managed to copy.

Fixes: 5d8edfb900d5 ("iomap: Copy larger chunks from userspace")
Signed-off-by: Jan Stancek <jstancek@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/buffered-io.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 5db54ca29a35..2bc0aa23fde3 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -881,8 +881,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		size_t bytes;		/* Bytes to write to folio */
 		size_t copied;		/* Bytes copied from user */
 
+		bytes = iov_iter_count(i);
+retry:
 		offset = pos & (chunk - 1);
-		bytes = min(chunk - offset, iov_iter_count(i));
+		bytes = min(chunk - offset, bytes);
 		status = balance_dirty_pages_ratelimited_flags(mapping,
 							       bdp_flags);
 		if (unlikely(status))
@@ -933,10 +935,12 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 			 * halfway through, might be a race with munmap,
 			 * might be severe memory pressure.
 			 */
-			if (copied)
-				bytes = copied;
 			if (chunk > PAGE_SIZE)
 				chunk /= 2;
+			if (copied) {
+				bytes = copied;
+				goto retry;
+			}
 		} else {
 			pos += status;
 			written += status;
-- 
cgit v1.2.3


From eb96e221937af3c7bb8a63208dbab813ca5d3d7e Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 19 Oct 2023 13:19:28 +0100
Subject: btrfs: fix unwritten extent buffer after snapshotting a new subvolume

When creating a snapshot of a subvolume that was created in the current
transaction, we can end up not persisting a dirty extent buffer that is
referenced by the snapshot, resulting in IO errors due to checksum failures
when trying to read the extent buffer later from disk. A sequence of steps
that leads to this is the following:

1) At ioctl.c:create_subvol() we allocate an extent buffer, with logical
   address 36007936, for the leaf/root of a new subvolume that has an ID
   of 291. We mark the extent buffer as dirty, and at this point the
   subvolume tree has a single node/leaf which is also its root (level 0);

2) We no longer commit the transaction used to create the subvolume at
   create_subvol(). We used to, but that was recently removed in
   commit 1b53e51a4a8f ("btrfs: don't commit transaction for every subvol
   create");

3) The transaction used to create the subvolume has an ID of 33, so the
   extent buffer 36007936 has a generation of 33;

4) Several updates happen to subvolume 291 during transaction 33: several
   files are created and its tree height changes from 0 to 1, so we end up with
   a new root at level 1 and the extent buffer 36007936 is now a leaf of
   that new root node, which is extent buffer 36048896.

   The commit root remains as 36007936, since we are still at transaction
   33;

5) Creation of a snapshot of subvolume 291, with an ID of 292, starts at
   ioctl.c:create_snapshot(). This triggers a commit of transaction 33 and
   we end up at transaction.c:create_pending_snapshot(), in the critical
   section of a transaction commit.

   There we COW the root of subvolume 291, which is extent buffer 36048896.
   The COW operation returns extent buffer 36048896, since there's no need
   to COW because the extent buffer was created in this transaction and it
   was not written yet.

   Then we call btrfs_copy_root() against the root node 36048896. During
   this operation we allocate a new extent buffer to turn into the root
   node of the snapshot, copy the contents of the root node 36048896 into
   this snapshot root extent buffer, set the owner to 292 (the ID of the
   snapshot), etc, and then we call btrfs_inc_ref(). This will create a
   delayed reference for each leaf pointed by the root node with a
   reference root of 292 - this includes a reference for the leaf
   36007936.

   After that we set the bit BTRFS_ROOT_FORCE_COW in the root's state.

   Then we call btrfs_insert_dir_item(), to create the directory entry in
   the tree of subvolume 291 that points to the snapshot. This ends up
   needing to modify leaf 36007936 to insert the respective directory
   items. Because the bit BTRFS_ROOT_FORCE_COW is set for the root's state,
   we need to COW the leaf. We end up at btrfs_force_cow_block() and then
   at update_ref_for_cow().

   At update_ref_for_cow() we call btrfs_block_can_be_shared() which
   returns false, despite the fact the leaf 36007936 is shared - the
   subvolume's root and the snapshot's root point to that leaf. The
   reason that it incorrectly returns false is because the commit root
   of the subvolume is extent buffer 36007936 - it was the initial root
   of the subvolume when we created it. So btrfs_block_can_be_shared()
   which has the following logic:

   int btrfs_block_can_be_shared(struct btrfs_root *root,
                                 struct extent_buffer *buf)
   {
       if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
           buf != root->node && buf != root->commit_root &&
           (btrfs_header_generation(buf) <=
            btrfs_root_last_snapshot(&root->root_item) ||
            btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
               return 1;

       return 0;
   }

   Returns false (0) since 'buf' (extent buffer 36007936) matches the
   root's commit root.

   As a result, at update_ref_for_cow(), we don't check for the number
   of references for extent buffer 36007936, we just assume it's not
   shared and therefore that it has only 1 reference, so we set the local
   variable 'refs' to 1.

   Later on, in the final if-else statement at update_ref_for_cow():

   static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct extent_buffer *buf,
                                          struct extent_buffer *cow,
                                          int *last_ref)
   {
      (...)
      if (refs > 1) {
          (...)
      } else {
          (...)
          btrfs_clear_buffer_dirty(trans, buf);
          *last_ref = 1;
      }
   }

   So we mark the extent buffer 36007936 as not dirty, and as a result
   we don't write it to disk later in the transaction commit, despite the
   fact that the snapshot's root points to it.

   Attempting to access the leaf or dumping the tree for example shows
   that the extent buffer was not written:

   $ btrfs inspect-internal dump-tree -t 292 /dev/sdb
   btrfs-progs v6.2.2
   file tree key (292 ROOT_ITEM 33)
   node 36110336 level 1 items 2 free space 119 generation 33 owner 292
   node 36110336 flags 0x1(WRITTEN) backref revision 1
   checksum stored a8103e3e
   checksum calced a8103e3e
   fs uuid 90c9a46f-ae9f-4626-9aff-0cbf3e2e3a79
   chunk uuid e8c9c885-78f4-4d31-85fe-89e5f5fd4a07
           key (256 INODE_ITEM 0) block 36007936 gen 33
           key (257 EXTENT_DATA 0) block 36052992 gen 33
   checksum verify failed on 36007936 wanted 0x00000000 found 0x86005f29
   checksum verify failed on 36007936 wanted 0x00000000 found 0x86005f29
   total bytes 107374182400
   bytes used 38572032
   uuid 90c9a46f-ae9f-4626-9aff-0cbf3e2e3a79

   The respective on disk region is full of zeroes as the device was
   trimmed at mkfs time.

   Obviously 'btrfs check' also detects and complains about this:

   $ btrfs check /dev/sdb
   Opening filesystem to check...
   Checking filesystem on /dev/sdb
   UUID: 90c9a46f-ae9f-4626-9aff-0cbf3e2e3a79
   generation: 33 (33)
   [1/7] checking root items
   [2/7] checking extents
   checksum verify failed on 36007936 wanted 0x00000000 found 0x86005f29
   checksum verify failed on 36007936 wanted 0x00000000 found 0x86005f29
   checksum verify failed on 36007936 wanted 0x00000000 found 0x86005f29
   bad tree block 36007936, bytenr mismatch, want=36007936, have=0
   owner ref check failed [36007936 4096]
   ERROR: errors found in extent allocation tree or chunk allocation
   [3/7] checking free space tree
   [4/7] checking fs roots
   checksum verify failed on 36007936 wanted 0x00000000 found 0x86005f29
   checksum verify failed on 36007936 wanted 0x00000000 found 0x86005f29
   checksum verify failed on 36007936 wanted 0x00000000 found 0x86005f29
   bad tree block 36007936, bytenr mismatch, want=36007936, have=0
   The following tree block(s) is corrupted in tree 292:
        tree block bytenr: 36110336, level: 1, node key: (256, 1, 0)
   root 292 root dir 256 not found
   ERROR: errors found in fs roots
   found 38572032 bytes used, error(s) found
   total csum bytes: 16048
   total tree bytes: 1265664
   total fs tree bytes: 1118208
   total extent tree bytes: 65536
   btree space waste bytes: 562598
   file data blocks allocated: 65978368
    referenced 36569088

Fix this by updating btrfs_block_can_be_shared() to consider that an
extent buffer may be shared if it matches the commit root and if its
generation matches the current transaction's generation.

This can be reproduced with the following script:

   $ cat test.sh
   #!/bin/bash

   MNT=/mnt/sdi
   DEV=/dev/sdi

   # Use a filesystem with a 64K node size so that we have the same node
   # size on every machine regardless of its page size (on x86_64 default
   # node size is 16K due to the 4K page size, while on PPC it's 64K by
   # default). This way we can make sure we are able to create a btree for
   # the subvolume with a height of 2.
   mkfs.btrfs -f -n 64K $DEV
   mount $DEV $MNT

   btrfs subvolume create $MNT/subvol

   # Create a few empty files on the subvolume, this bumps its btree
   # height to 2 (root node at level 1 and 2 leaves).
   for ((i = 1; i <= 300; i++)); do
       echo -n > $MNT/subvol/file_$i
   done

   btrfs subvolume snapshot -r $MNT/subvol $MNT/subvol/snap

   umount $DEV

   btrfs check $DEV

Running it on a 6.5 kernel (or any 6.6-rc kernel at the moment):

   $ ./test.sh
   Create subvolume '/mnt/sdi/subvol'
   Create a readonly snapshot of '/mnt/sdi/subvol' in '/mnt/sdi/subvol/snap'
   Opening filesystem to check...
   Checking filesystem on /dev/sdi
   UUID: bbdde2ff-7d02-45ca-8a73-3c36f23755a1
   [1/7] checking root items
   [2/7] checking extents
   parent transid verify failed on 30539776 wanted 7 found 5
   parent transid verify failed on 30539776 wanted 7 found 5
   parent transid verify failed on 30539776 wanted 7 found 5
   Ignoring transid failure
   owner ref check failed [30539776 65536]
   ERROR: errors found in extent allocation tree or chunk allocation
   [3/7] checking free space tree
   [4/7] checking fs roots
   parent transid verify failed on 30539776 wanted 7 found 5
   Ignoring transid failure
   Wrong key of child node/leaf, wanted: (256, 1, 0), have: (2, 132, 0)
   Wrong generation of child node/leaf, wanted: 5, have: 7
   root 257 root dir 256 not found
   ERROR: errors found in fs roots
   found 917504 bytes used, error(s) found
   total csum bytes: 0
   total tree bytes: 851968
   total fs tree bytes: 393216
   total extent tree bytes: 65536
   btree space waste bytes: 736550
   file data blocks allocated: 0
    referenced 0

A test case for fstests will follow soon.

Fixes: 1b53e51a4a8f ("btrfs: don't commit transaction for every subvol create")
CC: stable@vger.kernel.org # 6.5+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c    | 14 +++++++++-----
 fs/btrfs/backref.h    |  3 ++-
 fs/btrfs/ctree.c      | 21 ++++++++++++++++-----
 fs/btrfs/ctree.h      |  3 ++-
 fs/btrfs/relocation.c |  7 ++++---
 5 files changed, 33 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b7d54efb4728..a4a809efc92f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -3196,12 +3196,14 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
  * We still need to do a tree search to find out the parents. This is for
  * TREE_BLOCK_REF backref (keyed or inlined).
  *
+ * @trans:	Transaction handle.
  * @ref_key:	The same as @ref_key in  handle_direct_tree_backref()
  * @tree_key:	The first key of this tree block.
  * @path:	A clean (released) path, to avoid allocating path every time
  *		the function get called.
  */
-static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
+static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
+					struct btrfs_backref_cache *cache,
 					struct btrfs_path *path,
 					struct btrfs_key *ref_key,
 					struct btrfs_key *tree_key,
@@ -3315,7 +3317,7 @@ static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
 			 * If we know the block isn't shared we can avoid
 			 * checking its backrefs.
 			 */
-			if (btrfs_block_can_be_shared(root, eb))
+			if (btrfs_block_can_be_shared(trans, root, eb))
 				upper->checked = 0;
 			else
 				upper->checked = 1;
@@ -3363,11 +3365,13 @@ out:
  *	 links aren't yet bi-directional. Needs to finish such links.
  *	 Use btrfs_backref_finish_upper_links() to finish such linkage.
  *
+ * @trans:	Transaction handle.
  * @path:	Released path for indirect tree backref lookup
  * @iter:	Released backref iter for extent tree search
  * @node_key:	The first key of the tree block
  */
-int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
+int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
+				struct btrfs_backref_cache *cache,
 				struct btrfs_path *path,
 				struct btrfs_backref_iter *iter,
 				struct btrfs_key *node_key,
@@ -3467,8 +3471,8 @@ int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
 			 * offset means the root objectid. We need to search
 			 * the tree to get its parent bytenr.
 			 */
-			ret = handle_indirect_tree_backref(cache, path, &key, node_key,
-							   cur);
+			ret = handle_indirect_tree_backref(trans, cache, path,
+							   &key, node_key, cur);
 			if (ret < 0)
 				goto out;
 		}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 1616e3e3f1e4..71d535e03dca 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -540,7 +540,8 @@ static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info,
 		    bytenr);
 }
 
-int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
+int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
+				struct btrfs_backref_cache *cache,
 				struct btrfs_path *path,
 				struct btrfs_backref_iter *iter,
 				struct btrfs_key *node_key,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index da519c1b6ad0..617d4827eec2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -367,7 +367,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 /*
  * check if the tree block can be shared by multiple trees
  */
-int btrfs_block_can_be_shared(struct btrfs_root *root,
+int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
 			      struct extent_buffer *buf)
 {
 	/*
@@ -376,11 +377,21 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
 	 * not allocated by tree relocation, we know the block is not shared.
 	 */
 	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
-	    buf != root->node && buf != root->commit_root &&
+	    buf != root->node &&
 	    (btrfs_header_generation(buf) <=
 	     btrfs_root_last_snapshot(&root->root_item) ||
-	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
-		return 1;
+	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
+		if (buf != root->commit_root)
+			return 1;
+		/*
+		 * An extent buffer that used to be the commit root may still be
+		 * shared because the tree height may have increased and it
+		 * became a child of a higher level root. This can happen when
+		 * snapshotting a subvolume created in the current transaction.
+		 */
+		if (btrfs_header_generation(buf) == trans->transid)
+			return 1;
+	}
 
 	return 0;
 }
@@ -415,7 +426,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 	 * are only allowed for blocks use full backrefs.
 	 */
 
-	if (btrfs_block_can_be_shared(root, buf)) {
+	if (btrfs_block_can_be_shared(trans, root, buf)) {
 		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
 					       btrfs_header_level(buf), 1,
 					       &refs, &flags);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9419f4e37a58..ff40acd63a37 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -540,7 +540,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
 		      struct extent_buffer **cow_ret, u64 new_root_objectid);
-int btrfs_block_can_be_shared(struct btrfs_root *root,
+int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
 			      struct extent_buffer *buf);
 int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct btrfs_path *path, int level, int slot);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 9951a0caf5bb..c6d4bb8cbe29 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -466,6 +466,7 @@ static bool handle_useless_nodes(struct reloc_control *rc,
  * cached.
  */
 static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
+			struct btrfs_trans_handle *trans,
 			struct reloc_control *rc, struct btrfs_key *node_key,
 			int level, u64 bytenr)
 {
@@ -499,8 +500,8 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
 
 	/* Breadth-first search to build backref cache */
 	do {
-		ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
-						  cur);
+		ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
+						  node_key, cur);
 		if (ret < 0) {
 			err = ret;
 			goto out;
@@ -2803,7 +2804,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 
 	/* Do tree relocation */
 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
-		node = build_backref_tree(rc, &block->key,
+		node = build_backref_tree(trans, rc, &block->key,
 					  block->level, block->bytenr);
 		if (IS_ERR(node)) {
 			err = PTR_ERR(node);
-- 
cgit v1.2.3