diff options
| -rw-r--r-- | drivers/block/Kconfig | 3 | ||||
| -rw-r--r-- | fs/ceph/Kconfig | 3 | ||||
| -rw-r--r-- | fs/ceph/caps.c | 4 | ||||
| -rw-r--r-- | fs/ceph/mds_client.c | 20 | ||||
| -rw-r--r-- | fs/ceph/snap.c | 2 | ||||
| -rw-r--r-- | fs/ceph/super.c | 3 | ||||
| -rw-r--r-- | include/trace/events/ceph.h | 234 | ||||
| -rw-r--r-- | net/ceph/osd_client.c | 6 | ||||
| -rw-r--r-- | net/ceph/osdmap.c | 120 |
9 files changed, 316 insertions, 79 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 77d694448990..858320b6ebb7 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -316,9 +316,6 @@ config BLK_DEV_RBD tristate "Rados block device (RBD)" depends on INET && BLOCK select CEPH_LIB - select CRC32 - select CRYPTO_AES - select CRYPTO help Say Y here if you want include the Rados block device, which stripes a block device over objects stored in the Ceph distributed object diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 3e7def3d31c1..3d64a316ca31 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -3,9 +3,6 @@ config CEPH_FS tristate "Ceph distributed file system" depends on INET select CEPH_LIB - select CRC32 - select CRYPTO_AES - select CRYPTO select NETFS_SUPPORT select FS_ENCRYPTION_ALGS if FS_ENCRYPTION default n diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b1a8ff612c41..2f663972da99 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -18,6 +18,7 @@ #include "crypto.h" #include <linux/ceph/decode.h> #include <linux/ceph/messenger.h> +#include <trace/events/ceph.h> /* * Capability management @@ -4452,6 +4453,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode, seq, issue_seq, mseq); + trace_ceph_handle_caps(mdsc, session, op, &vino, ceph_inode(inode), + seq, issue_seq, mseq); + mutex_lock(&session->s_mutex); if (!inode) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1740047aef0f..7e4eab824dae 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -24,6 +24,7 @@ #include <linux/ceph/pagelist.h> #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> +#include <trace/events/ceph.h> #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) @@ -3288,6 +3289,8 @@ static void complete_request(struct ceph_mds_client *mdsc, { req->r_end_latency = ktime_get(); + trace_ceph_mdsc_complete_request(mdsc, req); + if (req->r_callback) req->r_callback(mdsc, req); complete_all(&req->r_completion); @@ -3419,6 +3422,8 @@ static int __send_request(struct ceph_mds_session *session, { int err; + trace_ceph_mdsc_send_request(session, req); + err = __prepare_send_request(session, req, drop_cap_releases); if (!err) { ceph_msg_get(req->r_request); @@ -3470,6 +3475,8 @@ static void __do_request(struct ceph_mds_client *mdsc, } if (mdsc->mdsmap->m_epoch == 0) { doutc(cl, "no mdsmap, waiting for map\n"); + trace_ceph_mdsc_suspend_request(mdsc, session, req, + ceph_mdsc_suspend_reason_no_mdsmap); list_add(&req->r_wait, &mdsc->waiting_for_map); return; } @@ -3491,6 +3498,8 @@ static void __do_request(struct ceph_mds_client *mdsc, goto finish; } doutc(cl, "no mds or not active, waiting for map\n"); + trace_ceph_mdsc_suspend_request(mdsc, session, req, + ceph_mdsc_suspend_reason_no_active_mds); list_add(&req->r_wait, &mdsc->waiting_for_map); return; } @@ -3536,9 +3545,11 @@ static void __do_request(struct ceph_mds_client *mdsc, * it to the mdsc queue. */ if (session->s_state == CEPH_MDS_SESSION_REJECTED) { - if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) + if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) { + trace_ceph_mdsc_suspend_request(mdsc, session, req, + ceph_mdsc_suspend_reason_rejected); list_add(&req->r_wait, &mdsc->waiting_for_map); - else + } else err = -EACCES; goto out_session; } @@ -3552,6 +3563,8 @@ static void __do_request(struct ceph_mds_client *mdsc, if (random) req->r_resend_mds = mds; } + trace_ceph_mdsc_suspend_request(mdsc, session, req, + ceph_mdsc_suspend_reason_session); list_add(&req->r_wait, &session->s_waiting); goto out_session; } @@ -3652,6 +3665,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc, list_del_init(&req->r_wait); doutc(cl, " wake request %p tid %llu\n", req, req->r_tid); + trace_ceph_mdsc_resume_request(mdsc, req); __do_request(mdsc, req); } } @@ -3678,6 +3692,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) req->r_session->s_mds == mds) { doutc(cl, " kicking tid %llu\n", req->r_tid); list_del_init(&req->r_wait); + trace_ceph_mdsc_resume_request(mdsc, req); __do_request(mdsc, req); } } @@ -3724,6 +3739,7 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, doutc(cl, "submit_request on %p for inode %p\n", req, dir); mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); + trace_ceph_mdsc_submit_request(mdsc, req); __do_request(mdsc, req); err = req->r_err; mutex_unlock(&mdsc->mutex); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index c65f2b202b2b..521507ea8260 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -374,7 +374,7 @@ static int build_snap_context(struct ceph_mds_client *mdsc, /* alloc new snap context */ err = -ENOMEM; - if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) + if ((size_t)num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) goto fail; snapc = ceph_create_snap_context(num, GFP_NOFS); if (!snapc) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f6bf24b5c683..7c1c1dac320d 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -30,6 +30,9 @@ #include <uapi/linux/magic.h> +#define CREATE_TRACE_POINTS +#include <trace/events/ceph.h> + static DEFINE_SPINLOCK(ceph_fsc_lock); static LIST_HEAD(ceph_fsc_list); diff --git a/include/trace/events/ceph.h b/include/trace/events/ceph.h new file mode 100644 index 000000000000..08cb0659fbfc --- /dev/null +++ b/include/trace/events/ceph.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Ceph filesystem support module tracepoints + * + * Copyright (C) 2025 IONOS SE. All Rights Reserved. + * Written by Max Kellermann (max.kellermann@ionos.com) + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ceph + +#if !defined(_TRACE_CEPH_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CEPH_H + +#include <linux/tracepoint.h> + +#define ceph_mdsc_suspend_reasons \ + EM(ceph_mdsc_suspend_reason_no_mdsmap, "no-mdsmap") \ + EM(ceph_mdsc_suspend_reason_no_active_mds, "no-active-mds") \ + EM(ceph_mdsc_suspend_reason_rejected, "rejected") \ + E_(ceph_mdsc_suspend_reason_session, "session") + +#ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY +#define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY + +#undef EM +#undef E_ +#define EM(a, b) a, +#define E_(a, b) a + +enum ceph_mdsc_suspend_reason { ceph_mdsc_suspend_reasons } __mode(byte); + +#endif + +/* + * Export enum symbols via userspace. + */ +#undef EM +#undef E_ +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define E_(a, b) TRACE_DEFINE_ENUM(a); + +ceph_mdsc_suspend_reasons; + +/* + * Now redefine the EM() and E_() macros to map the enums to the strings that + * will be printed in the output. + */ +#undef EM +#undef E_ +#define EM(a, b) { a, b }, +#define E_(a, b) { a, b } + +TRACE_EVENT(ceph_mdsc_submit_request, + TP_PROTO(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req), + + TP_ARGS(mdsc, req), + + TP_STRUCT__entry( + __field(u64, tid) + __field(int, op) + __field(u64, ino) + __field(u64, snap) + ), + + TP_fast_assign( + struct inode *inode; + + __entry->tid = req->r_tid; + __entry->op = req->r_op; + + inode = req->r_inode; + if (inode == NULL && req->r_dentry) + inode = d_inode(req->r_dentry); + + if (inode) { + __entry->ino = ceph_ino(inode); + __entry->snap = ceph_snap(inode); + } else { + __entry->ino = __entry->snap = 0; + } + ), + + TP_printk("R=%llu op=%s ino=%llx,%llx", + __entry->tid, + ceph_mds_op_name(__entry->op), + __entry->ino, __entry->snap) +); + +TRACE_EVENT(ceph_mdsc_suspend_request, + TP_PROTO(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_mds_request *req, + enum ceph_mdsc_suspend_reason reason), + + TP_ARGS(mdsc, session, req, reason), + + TP_STRUCT__entry( + __field(u64, tid) + __field(int, op) + __field(int, mds) + __field(enum ceph_mdsc_suspend_reason, reason) + ), + + TP_fast_assign( + __entry->tid = req->r_tid; + __entry->op = req->r_op; + __entry->mds = session ? session->s_mds : -1; + __entry->reason = reason; + ), + + TP_printk("R=%llu op=%s reason=%s", + __entry->tid, + ceph_mds_op_name(__entry->op), + __print_symbolic(__entry->reason, ceph_mdsc_suspend_reasons)) +); + +TRACE_EVENT(ceph_mdsc_resume_request, + TP_PROTO(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req), + + TP_ARGS(mdsc, req), + + TP_STRUCT__entry( + __field(u64, tid) + __field(int, op) + ), + + TP_fast_assign( + __entry->tid = req->r_tid; + __entry->op = req->r_op; + ), + + TP_printk("R=%llu op=%s", + __entry->tid, + ceph_mds_op_name(__entry->op)) +); + +TRACE_EVENT(ceph_mdsc_send_request, + TP_PROTO(struct ceph_mds_session *session, + struct ceph_mds_request *req), + + TP_ARGS(session, req), + + TP_STRUCT__entry( + __field(u64, tid) + __field(int, op) + __field(int, mds) + ), + + TP_fast_assign( + __entry->tid = req->r_tid; + __entry->op = req->r_op; + __entry->mds = session->s_mds; + ), + + TP_printk("R=%llu op=%s mds=%d", + __entry->tid, + ceph_mds_op_name(__entry->op), + __entry->mds) +); + +TRACE_EVENT(ceph_mdsc_complete_request, + TP_PROTO(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req), + + TP_ARGS(mdsc, req), + + TP_STRUCT__entry( + __field(u64, tid) + __field(int, op) + __field(int, err) + __field(unsigned long, latency_ns) + ), + + TP_fast_assign( + __entry->tid = req->r_tid; + __entry->op = req->r_op; + __entry->err = req->r_err; + __entry->latency_ns = req->r_end_latency - req->r_start_latency; + ), + + TP_printk("R=%llu op=%s err=%d latency_ns=%lu", + __entry->tid, + ceph_mds_op_name(__entry->op), + __entry->err, + __entry->latency_ns) +); + +TRACE_EVENT(ceph_handle_caps, + TP_PROTO(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int op, + const struct ceph_vino *vino, + struct ceph_inode_info *inode, + u32 seq, u32 mseq, u32 issue_seq), + + TP_ARGS(mdsc, session, op, vino, inode, seq, mseq, issue_seq), + + TP_STRUCT__entry( + __field(int, mds) + __field(int, op) + __field(u64, ino) + __field(u64, snap) + __field(u32, seq) + __field(u32, mseq) + __field(u32, issue_seq) + ), + + TP_fast_assign( + __entry->mds = session->s_mds; + __entry->op = op; + __entry->ino = vino->ino; + __entry->snap = vino->snap; + __entry->seq = seq; + __entry->mseq = mseq; + __entry->issue_seq = issue_seq; + ), + + TP_printk("mds=%d op=%s vino=%llx.%llx seq=%u iseq=%u mseq=%u", + __entry->mds, + ceph_cap_op_name(__entry->op), + __entry->ino, + __entry->snap, + __entry->seq, + __entry->issue_seq, + __entry->mseq) +); + +#undef EM +#undef E_ +#endif /* _TRACE_CEPH_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6664ea73ccf8..3667319b949d 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1280,8 +1280,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) static struct ceph_osd *get_osd(struct ceph_osd *osd) { if (refcount_inc_not_zero(&osd->o_ref)) { - dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1, - refcount_read(&osd->o_ref)); + dout("get_osd %p -> %d\n", osd, refcount_read(&osd->o_ref)); return osd; } else { dout("get_osd %p FAIL\n", osd); @@ -1291,8 +1290,7 @@ static struct ceph_osd *get_osd(struct ceph_osd *osd) static void put_osd(struct ceph_osd *osd) { - dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref), - refcount_read(&osd->o_ref) - 1); + dout("put_osd %p -> %d\n", osd, refcount_read(&osd->o_ref) - 1); if (refcount_dec_and_test(&osd->o_ref)) { osd_cleanup(osd); kfree(osd); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index d245fa508e1c..34b3ab59602f 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -806,51 +806,49 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) ceph_decode_need(p, end, len, bad); pool_end = *p + len; + ceph_decode_need(p, end, 4 + 4 + 4, bad); pi->type = ceph_decode_8(p); pi->size = ceph_decode_8(p); pi->crush_ruleset = ceph_decode_8(p); pi->object_hash = ceph_decode_8(p); - pi->pg_num = ceph_decode_32(p); pi->pgp_num = ceph_decode_32(p); - *p += 4 + 4; /* skip lpg* */ - *p += 4; /* skip last_change */ - *p += 8 + 4; /* skip snap_seq, snap_epoch */ + /* lpg*, last_change, snap_seq, snap_epoch */ + ceph_decode_skip_n(p, end, 8 + 4 + 8 + 4, bad); /* skip snaps */ - num = ceph_decode_32(p); + ceph_decode_32_safe(p, end, num, bad); while (num--) { - *p += 8; /* snapid key */ - *p += 1 + 1; /* versions */ - len = ceph_decode_32(p); - *p += len; + /* snapid key, pool snap (with versions) */ + ceph_decode_skip_n(p, end, 8 + 2, bad); + ceph_decode_skip_string(p, end, bad); } - /* skip removed_snaps */ - num = ceph_decode_32(p); - *p += num * (8 + 8); + /* removed_snaps */ + ceph_decode_skip_map(p, end, 64, 64, bad); + ceph_decode_need(p, end, 8 + 8 + 4, bad); *p += 8; /* skip auid */ pi->flags = ceph_decode_64(p); *p += 4; /* skip crash_replay_interval */ if (ev >= 7) - pi->min_size = ceph_decode_8(p); + ceph_decode_8_safe(p, end, pi->min_size, bad); else pi->min_size = pi->size - pi->size / 2; if (ev >= 8) - *p += 8 + 8; /* skip quota_max_* */ + /* quota_max_* */ + ceph_decode_skip_n(p, end, 8 + 8, bad); if (ev >= 9) { - /* skip tiers */ - num = ceph_decode_32(p); - *p += num * 8; + /* tiers */ + ceph_decode_skip_set(p, end, 64, bad); + ceph_decode_need(p, end, 8 + 1 + 8 + 8, bad); *p += 8; /* skip tier_of */ *p += 1; /* skip cache_mode */ - pi->read_tier = ceph_decode_64(p); pi->write_tier = ceph_decode_64(p); } else { @@ -858,86 +856,76 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) pi->write_tier = -1; } - if (ev >= 10) { - /* skip properties */ - num = ceph_decode_32(p); - while (num--) { - len = ceph_decode_32(p); - *p += len; /* key */ - len = ceph_decode_32(p); - *p += len; /* val */ - } - } + if (ev >= 10) + /* properties */ + ceph_decode_skip_map(p, end, string, string, bad); if (ev >= 11) { - /* skip hit_set_params */ - *p += 1 + 1; /* versions */ - len = ceph_decode_32(p); - *p += len; + /* hit_set_params (with versions) */ + ceph_decode_skip_n(p, end, 2, bad); + ceph_decode_skip_string(p, end, bad); - *p += 4; /* skip hit_set_period */ - *p += 4; /* skip hit_set_count */ + /* hit_set_period, hit_set_count */ + ceph_decode_skip_n(p, end, 4 + 4, bad); } if (ev >= 12) - *p += 4; /* skip stripe_width */ + /* stripe_width */ + ceph_decode_skip_32(p, end, bad); - if (ev >= 13) { - *p += 8; /* skip target_max_bytes */ - *p += 8; /* skip target_max_objects */ - *p += 4; /* skip cache_target_dirty_ratio_micro */ - *p += 4; /* skip cache_target_full_ratio_micro */ - *p += 4; /* skip cache_min_flush_age */ - *p += 4; /* skip cache_min_evict_age */ - } + if (ev >= 13) + /* target_max_*, cache_target_*, cache_min_* */ + ceph_decode_skip_n(p, end, 16 + 8 + 8, bad); - if (ev >= 14) { - /* skip erasure_code_profile */ - len = ceph_decode_32(p); - *p += len; - } + if (ev >= 14) + /* erasure_code_profile */ + ceph_decode_skip_string(p, end, bad); /* * last_force_op_resend_preluminous, will be overridden if the * map was encoded with RESEND_ON_SPLIT */ if (ev >= 15) - pi->last_force_request_resend = ceph_decode_32(p); + ceph_decode_32_safe(p, end, pi->last_force_request_resend, bad); else pi->last_force_request_resend = 0; if (ev >= 16) - *p += 4; /* skip min_read_recency_for_promote */ + /* min_read_recency_for_promote */ + ceph_decode_skip_32(p, end, bad); if (ev >= 17) - *p += 8; /* skip expected_num_objects */ + /* expected_num_objects */ + ceph_decode_skip_64(p, end, bad); if (ev >= 19) - *p += 4; /* skip cache_target_dirty_high_ratio_micro */ + /* cache_target_dirty_high_ratio_micro */ + ceph_decode_skip_32(p, end, bad); if (ev >= 20) - *p += 4; /* skip min_write_recency_for_promote */ + /* min_write_recency_for_promote */ + ceph_decode_skip_32(p, end, bad); if (ev >= 21) - *p += 1; /* skip use_gmt_hitset */ + /* use_gmt_hitset */ + ceph_decode_skip_8(p, end, bad); if (ev >= 22) - *p += 1; /* skip fast_read */ + /* fast_read */ + ceph_decode_skip_8(p, end, bad); - if (ev >= 23) { - *p += 4; /* skip hit_set_grade_decay_rate */ - *p += 4; /* skip hit_set_search_last_n */ - } + if (ev >= 23) + /* hit_set_grade_decay_rate, hit_set_search_last_n */ + ceph_decode_skip_n(p, end, 4 + 4, bad); if (ev >= 24) { - /* skip opts */ - *p += 1 + 1; /* versions */ - len = ceph_decode_32(p); - *p += len; + /* opts (with versions) */ + ceph_decode_skip_n(p, end, 2, bad); + ceph_decode_skip_string(p, end, bad); } if (ev >= 25) - pi->last_force_request_resend = ceph_decode_32(p); + ceph_decode_32_safe(p, end, pi->last_force_request_resend, bad); /* ignore the rest */ @@ -1438,7 +1426,7 @@ static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end, ceph_decode_32_safe(p, end, len, e_inval); if (len == 0 && incremental) return NULL; /* new_pg_temp: [] to remove */ - if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32)) + if ((size_t)len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32)) return ERR_PTR(-EINVAL); ceph_decode_need(p, end, len * sizeof(u32), e_inval); @@ -1619,7 +1607,7 @@ static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end, u32 len, i; ceph_decode_32_safe(p, end, len, e_inval); - if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32))) + if ((size_t)len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32))) return ERR_PTR(-EINVAL); ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval); |
