diff options
author | Sage Weil <sage@inktank.com> | 2012-07-10 11:53:34 -0700 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2012-11-26 11:38:10 -0800 |
commit | 49da293c7dc4401c2c7963a2c70f633b1c8fa8c5 (patch) | |
tree | 7e61b4bfd7e2c192ba47b8db4866f3f9e7c039c2 | |
parent | 21cbad59b07693104dda76ee4afef41302b2b8fb (diff) |
libceph: fix messenger retry
(cherry picked from commit 5bdca4e0768d3e0f4efa43d9a2cc8210aeb91ab9)
In ancient times, the messenger could both initiate and accept connections.
An artifact if that was data structures to store/process an incoming
ceph_msg_connect request and send an outgoing ceph_msg_connect_reply.
Sadly, the negotiation code was referencing those structures and ignoring
important information (like the peer's connect_seq) from the correct ones.
Among other things, this fixes tight reconnect loops where the server sends
RETRY_SESSION and we (the client) retries with the same connect_seq as last
time. This bug pretty easily triggered by injecting socket failures on the
MDS and running some fs workload like workunits/direct_io/test_sync_io.
Signed-off-by: Sage Weil <sage@inktank.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r-- | include/linux/ceph/messenger.h | 12 | ||||
-rw-r--r-- | net/ceph/messenger.c | 12 |
2 files changed, 8 insertions, 16 deletions
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 2521a95fa6d9..44c87e731e9d 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -163,16 +163,8 @@ struct ceph_connection { /* connection negotiation temps */ char in_banner[CEPH_BANNER_MAX_LEN]; - union { - struct { /* outgoing connection */ - struct ceph_msg_connect out_connect; - struct ceph_msg_connect_reply in_reply; - }; - struct { /* incoming */ - struct ceph_msg_connect in_connect; - struct ceph_msg_connect_reply out_reply; - }; - }; + struct ceph_msg_connect out_connect; + struct ceph_msg_connect_reply in_reply; struct ceph_entity_addr actual_peer_addr; /* message out temps */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 375ae3953e25..0e270166f72f 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1423,7 +1423,7 @@ static int process_connect(struct ceph_connection *con) * dropped messages. */ dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_connect.connect_seq)); + le32_to_cpu(con->in_reply.connect_seq)); pr_err("%s%lld %s connection reset\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr)); @@ -1450,10 +1450,10 @@ static int process_connect(struct ceph_connection *con) * If we sent a smaller connect_seq than the peer has, try * again with a larger value. */ - dout("process_connect got RETRY my seq = %u, peer_seq = %u\n", + dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_connect.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); + le32_to_cpu(con->in_reply.connect_seq)); + con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); ceph_con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) @@ -1468,9 +1468,9 @@ static int process_connect(struct ceph_connection *con) */ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", con->peer_global_seq, - le32_to_cpu(con->in_connect.global_seq)); + le32_to_cpu(con->in_reply.global_seq)); get_global_seq(con->msgr, - le32_to_cpu(con->in_connect.global_seq)); + le32_to_cpu(con->in_reply.global_seq)); ceph_con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) |