From 17c688c3dfffc274c87be00033da0050bb6eefc0 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Mon, 21 Jun 2010 16:12:26 -0700
Subject: ceph: delay umount until all mds requests drop inode+dentry refs

This fixes a race between handle_reply finishing an mds request,
signalling completion, and then dropping the request struct and its
dentry+inode refs, and the pre_umount function waiting for requests to
finish before letting the vfs tear down the dcache.  If umount was
delayed waiting for mds requests, we could race and BUG in
shrink_dcache_for_umount_subtree because of a slow dput.

This delays umount until the msgr queue flushes, which means
handle_reply will have exited and dropped the ceph_mds_request struct.
I'm assuming the VFS has already ensured that its calls have all
completed and those request refs have thus been dropped as well (I
haven't seen that race, at least).

Signed-off-by: Sage Weil
---
 fs/ceph/mds_client.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs/ceph/mds_client.c')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1766947fc07a..3ab79f6c4ce8 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2783,6 +2783,12 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 	drop_leases(mdsc);
 	ceph_flush_dirty_caps(mdsc);
 	wait_requests(mdsc);
+
+	/*
+	 * wait for reply handlers to drop their request refs and
+	 * their inode/dcache refs
+	 */
+	ceph_msgr_flush();
 }
 
 /*
-- cgit v1.2.3

From 01a92f174f8a3b99dbb5e02c86e7ee1e576737af Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Thu, 15 Jul 2010 13:24:32 -0700
Subject: ceph: reuse request message when replaying against recovering mds

Replayed rename operations (after an mds failure/recovery) were broken
because the request paths were regenerated from the dentry names, which
get mangled when d_move() is called.

Instead, resend the previous request message when replaying completed
operations.  Just make sure the REPLAY flag is set and the target ino
is filled in.

This fixes problems with workloads doing renames when the MDS restarts:
the rename operation appears to succeed, but then fails when it is
replayed against the recovering mds (leading to client confusion, app
breakage, etc.).

Signed-off-by: Sage Weil
---
 fs/ceph/mds_client.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

(limited to 'fs/ceph/mds_client.c')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3ab79f6c4ce8..23332bc44515 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1580,6 +1580,27 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
 	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
 
+	if (req->r_got_unsafe) {
+		/*
+		 * Replay.  Do not regenerate message (and rebuild
+		 * paths, etc.); just use the original message.
+		 * Rebuilding paths will break for renames because
+		 * d_move mangles the src name.
+		 */
+		msg = req->r_request;
+		rhead = msg->front.iov_base;
+
+		flags = le32_to_cpu(rhead->flags);
+		flags |= CEPH_MDS_FLAG_REPLAY;
+		rhead->flags = cpu_to_le32(flags);
+
+		if (req->r_target_inode)
+			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+
+		rhead->num_retry = req->r_attempts - 1;
+		return 0;
+	}
+
 	if (req->r_request) {
 		ceph_msg_put(req->r_request);
 		req->r_request = NULL;
@@ -1601,13 +1622,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	rhead->flags = cpu_to_le32(flags);
 	rhead->num_fwd = req->r_num_fwd;
 	rhead->num_retry = req->r_attempts - 1;
+	rhead->ino = 0;
 
 	dout(" r_locked_dir = %p\n", req->r_locked_dir);
-
-	if (req->r_target_inode && req->r_got_unsafe)
-		rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
-	else
-		rhead->ino = 0;
 	return 0;
 }
 
-- cgit v1.2.3

From e979cf50395e24c4bdd489f60e2d5dd5ae66d255 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Thu, 15 Jul 2010 14:58:39 -0700
Subject: ceph: do not include cap/dentry releases in replayed messages

Strip the cap and dentry releases from replayed messages.  They can
cause the shared state to get out of sync because they were generated
(with the request message) earlier, and no longer reflect the current
client state.

Signed-off-by: Sage Weil
---
 fs/ceph/mds_client.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs/ceph/mds_client.c')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 23332bc44515..416c08d315db 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 	ceph_encode_filepath(&p, end, ino1, path1);
 	ceph_encode_filepath(&p, end, ino2, path2);
 
+	/* make note of release offset, in case we need to replay */
+	req->r_request_release_offset = p - msg->front.iov_base;
+
 	/* cap releases */
 	releases = 0;
 	if (req->r_inode_drop)
@@ -1598,6 +1601,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
 
 		rhead->num_retry = req->r_attempts - 1;
+
+		/* remove cap/dentry releases from message */
+		rhead->num_releases = 0;
+		msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
+		msg->front.iov_len = req->r_request_release_offset;
 		return 0;
 	}
 
-- cgit v1.2.3
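
For readability, here is a consolidated sketch of the replay branch of
__prepare_send_request() with the last two patches both applied, stitched
together from the hunks above.  It is not a standalone unit: msg, rhead,
and flags are assumed to be the local variables declared earlier in the
enclosing function, as in the surrounding diff context.

	if (req->r_got_unsafe) {
		/*
		 * Replay.  Do not regenerate message (and rebuild
		 * paths, etc.); just use the original message.
		 * Rebuilding paths will break for renames because
		 * d_move mangles the src name.
		 */
		msg = req->r_request;
		rhead = msg->front.iov_base;

		/* mark the resent message as a replay of a completed op */
		flags = le32_to_cpu(rhead->flags);
		flags |= CEPH_MDS_FLAG_REPLAY;
		rhead->flags = cpu_to_le32(flags);

		/* tell the recovering mds which inode the op applied to */
		if (req->r_target_inode)
			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));

		rhead->num_retry = req->r_attempts - 1;

		/* remove cap/dentry releases from message */
		rhead->num_releases = 0;
		msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
		msg->front.iov_len = req->r_request_release_offset;
		return 0;
	}

Truncating the message front at r_request_release_offset drops only the
stale releases, because create_request_message() encodes the cap/dentry
releases last, immediately after that offset is recorded.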