From 4d7ace02ba5c6ef1f8eeb32a86fef7c528bd7f36 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 26 Nov 2019 07:24:21 -0500 Subject: ceph: fix mdsmap cluster available check based on laggy number In case the max_mds > 1 in MDS cluster and there is no any standby MDS and all the max_mds MDSs are in up:active state, if one of the up:active MDSs is dead, the m->m_num_laggy in kclient will be 1. Then the mount will fail without considering other healthy MDSs. There manybe some MDSs still "in" the cluster but not in up:active state, we will ignore them. Only when all the up:active MDSs in the cluster are laggy will treat the cluster as not be available. In case decreasing the max_mds, the cluster will not stop the extra up:active MDSs immediately and there will be a latency. During it the up:active MDS number will be larger than the max_mds, so later the m_info memories will 100% be reallocated. Here will pick out the up:active MDSs as the m_num_mds and allocate the needed memories once. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/mdsmap.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 0067d767c9ae..3a66f4f926ce 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -25,8 +25,9 @@ struct ceph_mdsmap { u32 m_session_timeout; /* seconds */ u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; - u32 m_max_mds; /* size of m_addr, m_state arrays */ - int m_num_mds; + u32 m_max_mds; /* expected up:active mds number */ + int m_num_active_mds; /* actual up:active mds number */ + int m_num_mds; /* size of m_info array */ struct ceph_mds_info *m_info; /* which object pools file data can be stored in */ -- cgit v1.2.3 From b38c9eb4757d5bac1eb8634a9516ef918fca2525 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 4 Dec 2019 06:57:39 -0500 Subject: ceph: add possible_max_rank and make the code more readable The m_num_mds here is actually the number for MDSs which are in up:active status, and it will be duplicated to m_num_active_mds, so remove it. Add possible_max_rank to the mdsmap struct and this will be the correctly possible largest rank boundary. Remove the special case for one mds in __mdsmap_get_random_mds(), because the validate mds rank may not always be 0. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/mdsmap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 3a66f4f926ce..35d385296fbb 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -26,8 +26,8 @@ struct ceph_mdsmap { u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; u32 m_max_mds; /* expected up:active mds number */ - int m_num_active_mds; /* actual up:active mds number */ - int m_num_mds; /* size of m_info array */ + u32 m_num_active_mds; /* actual up:active mds number */ + u32 possible_max_rank; /* possible max rank index */ struct ceph_mds_info *m_info; /* which object pools file data can be stored in */ @@ -43,7 +43,7 @@ struct ceph_mdsmap { static inline struct ceph_entity_addr * ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) { - if (w >= m->m_num_mds) + if (w >= m->possible_max_rank) return NULL; return &m->m_info[w].addr; } @@ -51,14 +51,14 @@ ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) { BUG_ON(w < 0); - if (w >= m->m_num_mds) + if (w >= m->possible_max_rank) return CEPH_MDS_STATE_DNE; return m->m_info[w].state; } static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) { - if (w >= 0 && w < m->m_num_mds) + if (w >= 0 && w < m->possible_max_rank) return m->m_info[w].laggy; return false; } -- cgit v1.2.3 From 78beb0ff2feceb1d7568333f93195e1a4d95a49a Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Wed, 8 Jan 2020 10:03:53 +0000 Subject: ceph: use copy-from2 op in copy_file_range Instead of using the copy-from operation, switch copy_file_range to the new copy-from2 operation, which allows to send the truncate_seq and truncate_size parameters. If an OSD does not support the copy-from2 operation it will return -EOPNOTSUPP. In that case, the kernel client will stop trying to do remote object copies for this fs client and will always use the generic VFS copy_file_range. Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 1 + include/linux/ceph/rados.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index eaffbdddf89a..5a62dbd3f4c2 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -534,6 +534,7 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc, struct ceph_object_id *dst_oid, struct ceph_object_locator *dst_oloc, u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, u8 copy_from_flags); /* watch/notify */ diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 3eb0e55665b4..59bdfd470100 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -256,6 +256,7 @@ extern const char *ceph_osd_state_name(int s); \ /* tiering */ \ f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \ + f(COPY_FROM2, __CEPH_OSD_OP(WR, DATA, 45), "copy-from2") \ f(COPY_GET_CLASSIC, __CEPH_OSD_OP(RD, DATA, 27), "copy-get-classic") \ f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \ f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \ @@ -446,6 +447,7 @@ enum { CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to * cloneid */ CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */ + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32, /* send truncate_{seq,size} */ }; enum { -- cgit v1.2.3