From 17c688c3dfffc274c87be00033da0050bb6eefc0 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Mon, 21 Jun 2010 16:12:26 -0700
Subject: ceph: delay umount until all mds requests drop inode+dentry refs

This fixes a race between handle_reply finishing an mds request,
signalling completion, and then dropping the request struct and its
dentry+inode refs, and the pre_umount function waiting for requests to
finish before letting the vfs tear down the dcache.  If umount was
delayed waiting for mds requests, we could race and BUG in
shrink_dcache_for_umount_subtree because of a slow dput.

This delays umount until the msgr queue flushes, which means
handle_reply will have exited and dropped the ceph_mds_request struct.
I'm assuming the VFS has already ensured that its calls have all
completed and those request refs have thus been dropped as well (I
haven't seen that race, at least).

Signed-off-by: Sage Weil
---
 fs/ceph/mds_client.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs/ceph/mds_client.c')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1766947fc07a..3ab79f6c4ce8 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2783,6 +2783,12 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 	drop_leases(mdsc);
 	ceph_flush_dirty_caps(mdsc);
 	wait_requests(mdsc);
+
+	/*
+	 * wait for reply handlers to drop their request refs and
+	 * their inode/dcache refs
+	 */
+	ceph_msgr_flush();
 }
 
 /*
-- cgit v1.2.3

From 01a92f174f8a3b99dbb5e02c86e7ee1e576737af Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Thu, 15 Jul 2010 13:24:32 -0700
Subject: ceph: reuse request message when replaying against recovering mds

Replayed rename operations (after an mds failure/recovery) were broken
because the request paths were regenerated from the dentry names, which
get mangled when d_move() is called.

Instead, resend the previous request message when replaying completed
operations.  Just make sure the REPLAY flag is set and the target ino
is filled in.

This fixes problems with workloads doing renames when the MDS restarts:
the rename operation appears to succeed, but then fails when it is
replayed against the recovering mds (leading to client confusion, app
breakage, etc.).

Signed-off-by: Sage Weil
---
 fs/ceph/mds_client.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

(limited to 'fs/ceph/mds_client.c')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3ab79f6c4ce8..23332bc44515 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1580,6 +1580,27 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
 	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
 
+	if (req->r_got_unsafe) {
+		/*
+		 * Replay.  Do not regenerate message (and rebuild
+		 * paths, etc.); just use the original message.
+		 * Rebuilding paths will break for renames because
+		 * d_move mangles the src name.
+		 */
+		msg = req->r_request;
+		rhead = msg->front.iov_base;
+
+		flags = le32_to_cpu(rhead->flags);
+		flags |= CEPH_MDS_FLAG_REPLAY;
+		rhead->flags = cpu_to_le32(flags);
+
+		if (req->r_target_inode)
+			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+
+		rhead->num_retry = req->r_attempts - 1;
+		return 0;
+	}
+
 	if (req->r_request) {
 		ceph_msg_put(req->r_request);
 		req->r_request = NULL;
@@ -1601,13 +1622,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	rhead->flags = cpu_to_le32(flags);
 	rhead->num_fwd = req->r_num_fwd;
 	rhead->num_retry = req->r_attempts - 1;
+	rhead->ino = 0;
 
 	dout(" r_locked_dir = %p\n", req->r_locked_dir);
-
-	if (req->r_target_inode && req->r_got_unsafe)
-		rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
-	else
-		rhead->ino = 0;
 	return 0;
 }
 
-- cgit v1.2.3

From e979cf50395e24c4bdd489f60e2d5dd5ae66d255 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Thu, 15 Jul 2010 14:58:39 -0700
Subject: ceph: do not include cap/dentry releases in replayed messages

Strip the cap and dentry releases from replayed messages.  They can
cause the shared state to get out of sync because they were generated
(with the request message) earlier, and no longer reflect the current
client state.

Signed-off-by: Sage Weil
---
 fs/ceph/mds_client.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs/ceph/mds_client.c')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 23332bc44515..416c08d315db 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 	ceph_encode_filepath(&p, end, ino1, path1);
 	ceph_encode_filepath(&p, end, ino2, path2);
 
+	/* make note of release offset, in case we need to replay */
+	req->r_request_release_offset = p - msg->front.iov_base;
+
 	/* cap releases */
 	releases = 0;
 	if (req->r_inode_drop)
@@ -1598,6 +1601,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
 
 		rhead->num_retry = req->r_attempts - 1;
+
+		/* remove cap/dentry releases from message */
+		rhead->num_releases = 0;
+		msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
+		msg->front.iov_len = req->r_request_release_offset;
 		return 0;
 	}
 
-- cgit v1.2.3
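
For readability, here is a consolidated sketch of the replay branch of
__prepare_send_request() with the last two patches both applied, stitched
together from the hunks above.  It is not a standalone unit: msg, rhead,
and flags are assumed to be the local variables declared earlier in the
enclosing function, as in the surrounding diff context.

	if (req->r_got_unsafe) {
		/*
		 * Replay.  Do not regenerate message (and rebuild
		 * paths, etc.); just use the original message.
		 * Rebuilding paths will break for renames because
		 * d_move mangles the src name.
		 */
		msg = req->r_request;
		rhead = msg->front.iov_base;

		/* mark the resent message as a replay of a completed op */
		flags = le32_to_cpu(rhead->flags);
		flags |= CEPH_MDS_FLAG_REPLAY;
		rhead->flags = cpu_to_le32(flags);

		/* tell the recovering mds which inode the op applied to */
		if (req->r_target_inode)
			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));

		rhead->num_retry = req->r_attempts - 1;

		/* remove cap/dentry releases from message */
		rhead->num_releases = 0;
		msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
		msg->front.iov_len = req->r_request_release_offset;
		return 0;
	}

Truncating the message front at r_request_release_offset drops only the
stale releases, because create_request_message() encodes the cap/dentry
releases last, immediately after that offset is recorded.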