ceph: add trace points to the MDS client

This patch adds trace points to the Ceph filesystem MDS client:

- request submission (CEPH_MSG_CLIENT_REQUEST) and completion
(CEPH_MSG_CLIENT_REPLY)
- capabilities (CEPH_MSG_CLIENT_CAPS)

These are the central pieces that are useful for analyzing MDS
latency/performance problems from the client's perspective.

In the long run, all doutc() calls should be replaced with
tracepoints. This way, the Ceph filesystem can be traced at any time
(without spamming the kernel log). Additionally, trace points can be
used in BPF programs (which can even deference the pointer parameters
and extract more values).

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

authored by Max Kellermann and committed by Ilya Dryomov d927a595 d6c2f41e

+259 -2
+4
fs/ceph/caps.c
··· 18 18 #include "crypto.h" 19 19 #include <linux/ceph/decode.h> 20 20 #include <linux/ceph/messenger.h> 21 + #include <trace/events/ceph.h> 21 22 22 23 /* 23 24 * Capability management ··· 4452 4451 doutc(cl, " caps mds%d op %s ino %llx.%llx inode %p seq %u iseq %u mseq %u\n", 4453 4452 session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode, 4454 4453 seq, issue_seq, mseq); 4454 + 4455 + trace_ceph_handle_caps(mdsc, session, op, &vino, ceph_inode(inode), 4456 + seq, issue_seq, mseq); 4455 4457 4456 4458 mutex_lock(&session->s_mutex); 4457 4459
+18 -2
fs/ceph/mds_client.c
··· 24 24 #include <linux/ceph/pagelist.h> 25 25 #include <linux/ceph/auth.h> 26 26 #include <linux/ceph/debugfs.h> 27 + #include <trace/events/ceph.h> 27 28 28 29 #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) 29 30 ··· 3289 3288 { 3290 3289 req->r_end_latency = ktime_get(); 3291 3290 3291 + trace_ceph_mdsc_complete_request(mdsc, req); 3292 + 3292 3293 if (req->r_callback) 3293 3294 req->r_callback(mdsc, req); 3294 3295 complete_all(&req->r_completion); ··· 3422 3419 { 3423 3420 int err; 3424 3421 3422 + trace_ceph_mdsc_send_request(session, req); 3423 + 3425 3424 err = __prepare_send_request(session, req, drop_cap_releases); 3426 3425 if (!err) { 3427 3426 ceph_msg_get(req->r_request); ··· 3475 3470 } 3476 3471 if (mdsc->mdsmap->m_epoch == 0) { 3477 3472 doutc(cl, "no mdsmap, waiting for map\n"); 3473 + trace_ceph_mdsc_suspend_request(mdsc, session, req, 3474 + ceph_mdsc_suspend_reason_no_mdsmap); 3478 3475 list_add(&req->r_wait, &mdsc->waiting_for_map); 3479 3476 return; 3480 3477 } ··· 3498 3491 goto finish; 3499 3492 } 3500 3493 doutc(cl, "no mds or not active, waiting for map\n"); 3494 + trace_ceph_mdsc_suspend_request(mdsc, session, req, 3495 + ceph_mdsc_suspend_reason_no_active_mds); 3501 3496 list_add(&req->r_wait, &mdsc->waiting_for_map); 3502 3497 return; 3503 3498 } ··· 3545 3536 * it to the mdsc queue. 3546 3537 */ 3547 3538 if (session->s_state == CEPH_MDS_SESSION_REJECTED) { 3548 - if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) 3539 + if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) { 3540 + trace_ceph_mdsc_suspend_request(mdsc, session, req, 3541 + ceph_mdsc_suspend_reason_rejected); 3549 3542 list_add(&req->r_wait, &mdsc->waiting_for_map); 3550 - else 3543 + } else 3551 3544 err = -EACCES; 3552 3545 goto out_session; 3553 3546 } ··· 3563 3552 if (random) 3564 3553 req->r_resend_mds = mds; 3565 3554 } 3555 + trace_ceph_mdsc_suspend_request(mdsc, session, req, 3556 + ceph_mdsc_suspend_reason_session); 3566 3557 list_add(&req->r_wait, &session->s_waiting); 3567 3558 goto out_session; 3568 3559 } ··· 3665 3652 list_del_init(&req->r_wait); 3666 3653 doutc(cl, " wake request %p tid %llu\n", req, 3667 3654 req->r_tid); 3655 + trace_ceph_mdsc_resume_request(mdsc, req); 3668 3656 __do_request(mdsc, req); 3669 3657 } 3670 3658 } ··· 3692 3678 req->r_session->s_mds == mds) { 3693 3679 doutc(cl, " kicking tid %llu\n", req->r_tid); 3694 3680 list_del_init(&req->r_wait); 3681 + trace_ceph_mdsc_resume_request(mdsc, req); 3695 3682 __do_request(mdsc, req); 3696 3683 } 3697 3684 } ··· 3739 3724 doutc(cl, "submit_request on %p for inode %p\n", req, dir); 3740 3725 mutex_lock(&mdsc->mutex); 3741 3726 __register_request(mdsc, req, dir); 3727 + trace_ceph_mdsc_submit_request(mdsc, req); 3742 3728 __do_request(mdsc, req); 3743 3729 err = req->r_err; 3744 3730 mutex_unlock(&mdsc->mutex);
+3
fs/ceph/super.c
··· 30 30 31 31 #include <uapi/linux/magic.h> 32 32 33 + #define CREATE_TRACE_POINTS 34 + #include <trace/events/ceph.h> 35 + 33 36 static DEFINE_SPINLOCK(ceph_fsc_lock); 34 37 static LIST_HEAD(ceph_fsc_list); 35 38
+234
include/trace/events/ceph.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* Ceph filesystem support module tracepoints 3 + * 4 + * Copyright (C) 2025 IONOS SE. All Rights Reserved. 5 + * Written by Max Kellermann (max.kellermann@ionos.com) 6 + */ 7 + #undef TRACE_SYSTEM 8 + #define TRACE_SYSTEM ceph 9 + 10 + #if !defined(_TRACE_CEPH_H) || defined(TRACE_HEADER_MULTI_READ) 11 + #define _TRACE_CEPH_H 12 + 13 + #include <linux/tracepoint.h> 14 + 15 + #define ceph_mdsc_suspend_reasons \ 16 + EM(ceph_mdsc_suspend_reason_no_mdsmap, "no-mdsmap") \ 17 + EM(ceph_mdsc_suspend_reason_no_active_mds, "no-active-mds") \ 18 + EM(ceph_mdsc_suspend_reason_rejected, "rejected") \ 19 + E_(ceph_mdsc_suspend_reason_session, "session") 20 + 21 + #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY 22 + #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY 23 + 24 + #undef EM 25 + #undef E_ 26 + #define EM(a, b) a, 27 + #define E_(a, b) a 28 + 29 + enum ceph_mdsc_suspend_reason { ceph_mdsc_suspend_reasons } __mode(byte); 30 + 31 + #endif 32 + 33 + /* 34 + * Export enum symbols via userspace. 35 + */ 36 + #undef EM 37 + #undef E_ 38 + #define EM(a, b) TRACE_DEFINE_ENUM(a); 39 + #define E_(a, b) TRACE_DEFINE_ENUM(a); 40 + 41 + ceph_mdsc_suspend_reasons; 42 + 43 + /* 44 + * Now redefine the EM() and E_() macros to map the enums to the strings that 45 + * will be printed in the output. 46 + */ 47 + #undef EM 48 + #undef E_ 49 + #define EM(a, b) { a, b }, 50 + #define E_(a, b) { a, b } 51 + 52 + TRACE_EVENT(ceph_mdsc_submit_request, 53 + TP_PROTO(struct ceph_mds_client *mdsc, 54 + struct ceph_mds_request *req), 55 + 56 + TP_ARGS(mdsc, req), 57 + 58 + TP_STRUCT__entry( 59 + __field(u64, tid) 60 + __field(int, op) 61 + __field(u64, ino) 62 + __field(u64, snap) 63 + ), 64 + 65 + TP_fast_assign( 66 + struct inode *inode; 67 + 68 + __entry->tid = req->r_tid; 69 + __entry->op = req->r_op; 70 + 71 + inode = req->r_inode; 72 + if (inode == NULL && req->r_dentry) 73 + inode = d_inode(req->r_dentry); 74 + 75 + if (inode) { 76 + __entry->ino = ceph_ino(inode); 77 + __entry->snap = ceph_snap(inode); 78 + } else { 79 + __entry->ino = __entry->snap = 0; 80 + } 81 + ), 82 + 83 + TP_printk("R=%llu op=%s ino=%llx,%llx", 84 + __entry->tid, 85 + ceph_mds_op_name(__entry->op), 86 + __entry->ino, __entry->snap) 87 + ); 88 + 89 + TRACE_EVENT(ceph_mdsc_suspend_request, 90 + TP_PROTO(struct ceph_mds_client *mdsc, 91 + struct ceph_mds_session *session, 92 + struct ceph_mds_request *req, 93 + enum ceph_mdsc_suspend_reason reason), 94 + 95 + TP_ARGS(mdsc, session, req, reason), 96 + 97 + TP_STRUCT__entry( 98 + __field(u64, tid) 99 + __field(int, op) 100 + __field(int, mds) 101 + __field(enum ceph_mdsc_suspend_reason, reason) 102 + ), 103 + 104 + TP_fast_assign( 105 + __entry->tid = req->r_tid; 106 + __entry->op = req->r_op; 107 + __entry->mds = session ? session->s_mds : -1; 108 + __entry->reason = reason; 109 + ), 110 + 111 + TP_printk("R=%llu op=%s reason=%s", 112 + __entry->tid, 113 + ceph_mds_op_name(__entry->op), 114 + __print_symbolic(__entry->reason, ceph_mdsc_suspend_reasons)) 115 + ); 116 + 117 + TRACE_EVENT(ceph_mdsc_resume_request, 118 + TP_PROTO(struct ceph_mds_client *mdsc, 119 + struct ceph_mds_request *req), 120 + 121 + TP_ARGS(mdsc, req), 122 + 123 + TP_STRUCT__entry( 124 + __field(u64, tid) 125 + __field(int, op) 126 + ), 127 + 128 + TP_fast_assign( 129 + __entry->tid = req->r_tid; 130 + __entry->op = req->r_op; 131 + ), 132 + 133 + TP_printk("R=%llu op=%s", 134 + __entry->tid, 135 + ceph_mds_op_name(__entry->op)) 136 + ); 137 + 138 + TRACE_EVENT(ceph_mdsc_send_request, 139 + TP_PROTO(struct ceph_mds_session *session, 140 + struct ceph_mds_request *req), 141 + 142 + TP_ARGS(session, req), 143 + 144 + TP_STRUCT__entry( 145 + __field(u64, tid) 146 + __field(int, op) 147 + __field(int, mds) 148 + ), 149 + 150 + TP_fast_assign( 151 + __entry->tid = req->r_tid; 152 + __entry->op = req->r_op; 153 + __entry->mds = session->s_mds; 154 + ), 155 + 156 + TP_printk("R=%llu op=%s mds=%d", 157 + __entry->tid, 158 + ceph_mds_op_name(__entry->op), 159 + __entry->mds) 160 + ); 161 + 162 + TRACE_EVENT(ceph_mdsc_complete_request, 163 + TP_PROTO(struct ceph_mds_client *mdsc, 164 + struct ceph_mds_request *req), 165 + 166 + TP_ARGS(mdsc, req), 167 + 168 + TP_STRUCT__entry( 169 + __field(u64, tid) 170 + __field(int, op) 171 + __field(int, err) 172 + __field(unsigned long, latency_ns) 173 + ), 174 + 175 + TP_fast_assign( 176 + __entry->tid = req->r_tid; 177 + __entry->op = req->r_op; 178 + __entry->err = req->r_err; 179 + __entry->latency_ns = req->r_end_latency - req->r_start_latency; 180 + ), 181 + 182 + TP_printk("R=%llu op=%s err=%d latency_ns=%lu", 183 + __entry->tid, 184 + ceph_mds_op_name(__entry->op), 185 + __entry->err, 186 + __entry->latency_ns) 187 + ); 188 + 189 + TRACE_EVENT(ceph_handle_caps, 190 + TP_PROTO(struct ceph_mds_client *mdsc, 191 + struct ceph_mds_session *session, 192 + int op, 193 + const struct ceph_vino *vino, 194 + struct ceph_inode_info *inode, 195 + u32 seq, u32 mseq, u32 issue_seq), 196 + 197 + TP_ARGS(mdsc, session, op, vino, inode, seq, mseq, issue_seq), 198 + 199 + TP_STRUCT__entry( 200 + __field(int, mds) 201 + __field(int, op) 202 + __field(u64, ino) 203 + __field(u64, snap) 204 + __field(u32, seq) 205 + __field(u32, mseq) 206 + __field(u32, issue_seq) 207 + ), 208 + 209 + TP_fast_assign( 210 + __entry->mds = session->s_mds; 211 + __entry->op = op; 212 + __entry->ino = vino->ino; 213 + __entry->snap = vino->snap; 214 + __entry->seq = seq; 215 + __entry->mseq = mseq; 216 + __entry->issue_seq = issue_seq; 217 + ), 218 + 219 + TP_printk("mds=%d op=%s vino=%llx.%llx seq=%u iseq=%u mseq=%u", 220 + __entry->mds, 221 + ceph_cap_op_name(__entry->op), 222 + __entry->ino, 223 + __entry->snap, 224 + __entry->seq, 225 + __entry->issue_seq, 226 + __entry->mseq) 227 + ); 228 + 229 + #undef EM 230 + #undef E_ 231 + #endif /* _TRACE_CEPH_H */ 232 + 233 + /* This part must be outside protection */ 234 + #include <trace/define_trace.h>