From 4d7ace02ba5c6ef1f8eeb32a86fef7c528bd7f36 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Tue, 26 Nov 2019 07:24:21 -0500
Subject: ceph: fix mdsmap cluster available check based on laggy number

When max_mds > 1 in the MDS cluster, there is no standby MDS, and
all max_mds MDSs are in the up:active state, if one of the up:active
MDSs dies, m->m_num_laggy in the kclient will be 1. The mount will
then fail without considering the other healthy MDSs.

There may be some MDSs still "in" the cluster but not in the
up:active state; we will ignore them. Only when all the up:active
MDSs in the cluster are laggy will the cluster be treated as
unavailable.

When max_mds is decreased, the cluster does not stop the extra
up:active MDSs immediately; there is some latency. During that
window the number of up:active MDSs is larger than max_mds, so the
m_info memory will always have to be reallocated later.

Here we pick out the up:active MDSs to determine m_num_mds and
allocate the needed memory once.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/mdsmap.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 0067d767c9ae..3a66f4f926ce 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -25,8 +25,9 @@ struct ceph_mdsmap {
 	u32 m_session_timeout;          /* seconds */
 	u32 m_session_autoclose;        /* seconds */
 	u64 m_max_file_size;
-	u32 m_max_mds;                  /* size of m_addr, m_state arrays */
-	int m_num_mds;
+	u32 m_max_mds;			/* expected up:active mds number */
+	int m_num_active_mds;		/* actual up:active mds number */
+	int m_num_mds;                  /* size of m_info array */
 	struct ceph_mds_info *m_info;
 
 	/* which object pools file data can be stored in */
-- 
cgit v1.2.3


From b38c9eb4757d5bac1eb8634a9516ef918fca2525 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Wed, 4 Dec 2019 06:57:39 -0500
Subject: ceph: add possible_max_rank and make the code more readable

The m_num_mds here is actually the number of MDSs which are in the
up:active state, which duplicates m_num_active_mds, so remove it.

Add possible_max_rank to the mdsmap struct; this will be the
correct largest possible rank boundary.

Remove the special case for one mds in __mdsmap_get_random_mds(),
because the valid mds rank may not always be 0.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/mdsmap.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 3a66f4f926ce..35d385296fbb 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -26,8 +26,8 @@ struct ceph_mdsmap {
 	u32 m_session_autoclose;        /* seconds */
 	u64 m_max_file_size;
 	u32 m_max_mds;			/* expected up:active mds number */
-	int m_num_active_mds;		/* actual up:active mds number */
-	int m_num_mds;                  /* size of m_info array */
+	u32 m_num_active_mds;		/* actual up:active mds number */
+	u32 possible_max_rank;		/* possible max rank index */
 	struct ceph_mds_info *m_info;
 
 	/* which object pools file data can be stored in */
@@ -43,7 +43,7 @@ struct ceph_mdsmap {
 static inline struct ceph_entity_addr *
 ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
 {
-	if (w >= m->m_num_mds)
+	if (w >= m->possible_max_rank)
 		return NULL;
 	return &m->m_info[w].addr;
 }
@@ -51,14 +51,14 @@ ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
 static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
 {
 	BUG_ON(w < 0);
-	if (w >= m->m_num_mds)
+	if (w >= m->possible_max_rank)
 		return CEPH_MDS_STATE_DNE;
 	return m->m_info[w].state;
 }
 
 static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
 {
-	if (w >= 0 && w < m->m_num_mds)
+	if (w >= 0 && w < m->possible_max_rank)
 		return m->m_info[w].laggy;
 	return false;
 }
-- 
cgit v1.2.3


From 78beb0ff2feceb1d7568333f93195e1a4d95a49a Mon Sep 17 00:00:00 2001
From: Luis Henriques <lhenriques@suse.com>
Date: Wed, 8 Jan 2020 10:03:53 +0000
Subject: ceph: use copy-from2 op in copy_file_range

Instead of using the copy-from operation, switch copy_file_range to the
new copy-from2 operation, which allows sending the truncate_seq and
truncate_size parameters.

If an OSD does not support the copy-from2 operation it will return
-EOPNOTSUPP.  In that case, the kernel client will stop trying to do
remote object copies for this fs client and will always use the generic
VFS copy_file_range.

Signed-off-by: Luis Henriques <lhenriques@suse.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 1 +
 include/linux/ceph/rados.h      | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index eaffbdddf89a..5a62dbd3f4c2 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -534,6 +534,7 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
 			struct ceph_object_id *dst_oid,
 			struct ceph_object_locator *dst_oloc,
 			u32 dst_fadvise_flags,
+			u32 truncate_seq, u64 truncate_size,
 			u8 copy_from_flags);
 
 /* watch/notify */
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 3eb0e55665b4..59bdfd470100 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -256,6 +256,7 @@ extern const char *ceph_osd_state_name(int s);
 									    \
 	/* tiering */							    \
 	f(COPY_FROM,	__CEPH_OSD_OP(WR, DATA, 26),	"copy-from")	    \
+	f(COPY_FROM2,	__CEPH_OSD_OP(WR, DATA, 45),	"copy-from2")	    \
 	f(COPY_GET_CLASSIC, __CEPH_OSD_OP(RD, DATA, 27), "copy-get-classic") \
 	f(UNDIRTY,	__CEPH_OSD_OP(WR, DATA, 28),	"undirty")	    \
 	f(ISDIRTY,	__CEPH_OSD_OP(RD, DATA, 29),	"isdirty")	    \
@@ -446,6 +447,7 @@ enum {
 	CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to
 						     * cloneid */
 	CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16,     /* order with write */
+	CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32,  /* send truncate_{seq,size} */
 };
 
 enum {
-- 
cgit v1.2.3