ceph: add trace points to the MDS client

This patch adds trace points to the Ceph filesystem MDS client:

- request submission (CEPH_MSG_CLIENT_REQUEST) and completion
(CEPH_MSG_CLIENT_REPLY)
- capabilities (CEPH_MSG_CLIENT_CAPS)

These are the central pieces that are useful for analyzing MDS
latency/performance problems from the client's perspective.

In the long run, all doutc() calls should be replaced with
tracepoints. This way, the Ceph filesystem can be traced at any time
(without spamming the kernel log). Additionally, trace points can be
used in BPF programs (which can even deference the pointer parameters
and extract more values).

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

authored by Max Kellermann and committed by Ilya Dryomov d927a595 d6c2f41e

+259 -2
+4
fs/ceph/caps.c
··· 18 #include "crypto.h" 19 #include <linux/ceph/decode.h> 20 #include <linux/ceph/messenger.h> 21 22 /* 23 * Capability management ··· 4452 doutc(cl, " caps mds%d op %s ino %llx.%llx inode %p seq %u iseq %u mseq %u\n", 4453 session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode, 4454 seq, issue_seq, mseq); 4455 4456 mutex_lock(&session->s_mutex); 4457
··· 18 #include "crypto.h" 19 #include <linux/ceph/decode.h> 20 #include <linux/ceph/messenger.h> 21 + #include <trace/events/ceph.h> 22 23 /* 24 * Capability management ··· 4451 doutc(cl, " caps mds%d op %s ino %llx.%llx inode %p seq %u iseq %u mseq %u\n", 4452 session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode, 4453 seq, issue_seq, mseq); 4454 + 4455 + trace_ceph_handle_caps(mdsc, session, op, &vino, ceph_inode(inode), 4456 + seq, issue_seq, mseq); 4457 4458 mutex_lock(&session->s_mutex); 4459
+18 -2
fs/ceph/mds_client.c
··· 24 #include <linux/ceph/pagelist.h> 25 #include <linux/ceph/auth.h> 26 #include <linux/ceph/debugfs.h> 27 28 #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) 29 ··· 3289 { 3290 req->r_end_latency = ktime_get(); 3291 3292 if (req->r_callback) 3293 req->r_callback(mdsc, req); 3294 complete_all(&req->r_completion); ··· 3422 { 3423 int err; 3424 3425 err = __prepare_send_request(session, req, drop_cap_releases); 3426 if (!err) { 3427 ceph_msg_get(req->r_request); ··· 3475 } 3476 if (mdsc->mdsmap->m_epoch == 0) { 3477 doutc(cl, "no mdsmap, waiting for map\n"); 3478 list_add(&req->r_wait, &mdsc->waiting_for_map); 3479 return; 3480 } ··· 3498 goto finish; 3499 } 3500 doutc(cl, "no mds or not active, waiting for map\n"); 3501 list_add(&req->r_wait, &mdsc->waiting_for_map); 3502 return; 3503 } ··· 3545 * it to the mdsc queue. 3546 */ 3547 if (session->s_state == CEPH_MDS_SESSION_REJECTED) { 3548 - if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) 3549 list_add(&req->r_wait, &mdsc->waiting_for_map); 3550 - else 3551 err = -EACCES; 3552 goto out_session; 3553 } ··· 3563 if (random) 3564 req->r_resend_mds = mds; 3565 } 3566 list_add(&req->r_wait, &session->s_waiting); 3567 goto out_session; 3568 } ··· 3665 list_del_init(&req->r_wait); 3666 doutc(cl, " wake request %p tid %llu\n", req, 3667 req->r_tid); 3668 __do_request(mdsc, req); 3669 } 3670 } ··· 3692 req->r_session->s_mds == mds) { 3693 doutc(cl, " kicking tid %llu\n", req->r_tid); 3694 list_del_init(&req->r_wait); 3695 __do_request(mdsc, req); 3696 } 3697 } ··· 3739 doutc(cl, "submit_request on %p for inode %p\n", req, dir); 3740 mutex_lock(&mdsc->mutex); 3741 __register_request(mdsc, req, dir); 3742 __do_request(mdsc, req); 3743 err = req->r_err; 3744 mutex_unlock(&mdsc->mutex);
··· 24 #include <linux/ceph/pagelist.h> 25 #include <linux/ceph/auth.h> 26 #include <linux/ceph/debugfs.h> 27 + #include <trace/events/ceph.h> 28 29 #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) 30 ··· 3288 { 3289 req->r_end_latency = ktime_get(); 3290 3291 + trace_ceph_mdsc_complete_request(mdsc, req); 3292 + 3293 if (req->r_callback) 3294 req->r_callback(mdsc, req); 3295 complete_all(&req->r_completion); ··· 3419 { 3420 int err; 3421 3422 + trace_ceph_mdsc_send_request(session, req); 3423 + 3424 err = __prepare_send_request(session, req, drop_cap_releases); 3425 if (!err) { 3426 ceph_msg_get(req->r_request); ··· 3470 } 3471 if (mdsc->mdsmap->m_epoch == 0) { 3472 doutc(cl, "no mdsmap, waiting for map\n"); 3473 + trace_ceph_mdsc_suspend_request(mdsc, session, req, 3474 + ceph_mdsc_suspend_reason_no_mdsmap); 3475 list_add(&req->r_wait, &mdsc->waiting_for_map); 3476 return; 3477 } ··· 3491 goto finish; 3492 } 3493 doutc(cl, "no mds or not active, waiting for map\n"); 3494 + trace_ceph_mdsc_suspend_request(mdsc, session, req, 3495 + ceph_mdsc_suspend_reason_no_active_mds); 3496 list_add(&req->r_wait, &mdsc->waiting_for_map); 3497 return; 3498 } ··· 3536 * it to the mdsc queue. 3537 */ 3538 if (session->s_state == CEPH_MDS_SESSION_REJECTED) { 3539 + if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) { 3540 + trace_ceph_mdsc_suspend_request(mdsc, session, req, 3541 + ceph_mdsc_suspend_reason_rejected); 3542 list_add(&req->r_wait, &mdsc->waiting_for_map); 3543 + } else 3544 err = -EACCES; 3545 goto out_session; 3546 } ··· 3552 if (random) 3553 req->r_resend_mds = mds; 3554 } 3555 + trace_ceph_mdsc_suspend_request(mdsc, session, req, 3556 + ceph_mdsc_suspend_reason_session); 3557 list_add(&req->r_wait, &session->s_waiting); 3558 goto out_session; 3559 } ··· 3652 list_del_init(&req->r_wait); 3653 doutc(cl, " wake request %p tid %llu\n", req, 3654 req->r_tid); 3655 + trace_ceph_mdsc_resume_request(mdsc, req); 3656 __do_request(mdsc, req); 3657 } 3658 } ··· 3678 req->r_session->s_mds == mds) { 3679 doutc(cl, " kicking tid %llu\n", req->r_tid); 3680 list_del_init(&req->r_wait); 3681 + trace_ceph_mdsc_resume_request(mdsc, req); 3682 __do_request(mdsc, req); 3683 } 3684 } ··· 3724 doutc(cl, "submit_request on %p for inode %p\n", req, dir); 3725 mutex_lock(&mdsc->mutex); 3726 __register_request(mdsc, req, dir); 3727 + trace_ceph_mdsc_submit_request(mdsc, req); 3728 __do_request(mdsc, req); 3729 err = req->r_err; 3730 mutex_unlock(&mdsc->mutex);
+3
fs/ceph/super.c
··· 30 31 #include <uapi/linux/magic.h> 32 33 static DEFINE_SPINLOCK(ceph_fsc_lock); 34 static LIST_HEAD(ceph_fsc_list); 35
··· 30 31 #include <uapi/linux/magic.h> 32 33 + #define CREATE_TRACE_POINTS 34 + #include <trace/events/ceph.h> 35 + 36 static DEFINE_SPINLOCK(ceph_fsc_lock); 37 static LIST_HEAD(ceph_fsc_list); 38
+234
include/trace/events/ceph.h
···
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* Ceph filesystem support module tracepoints 3 + * 4 + * Copyright (C) 2025 IONOS SE. All Rights Reserved. 5 + * Written by Max Kellermann (max.kellermann@ionos.com) 6 + */ 7 + #undef TRACE_SYSTEM 8 + #define TRACE_SYSTEM ceph 9 + 10 + #if !defined(_TRACE_CEPH_H) || defined(TRACE_HEADER_MULTI_READ) 11 + #define _TRACE_CEPH_H 12 + 13 + #include <linux/tracepoint.h> 14 + 15 + #define ceph_mdsc_suspend_reasons \ 16 + EM(ceph_mdsc_suspend_reason_no_mdsmap, "no-mdsmap") \ 17 + EM(ceph_mdsc_suspend_reason_no_active_mds, "no-active-mds") \ 18 + EM(ceph_mdsc_suspend_reason_rejected, "rejected") \ 19 + E_(ceph_mdsc_suspend_reason_session, "session") 20 + 21 + #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY 22 + #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY 23 + 24 + #undef EM 25 + #undef E_ 26 + #define EM(a, b) a, 27 + #define E_(a, b) a 28 + 29 + enum ceph_mdsc_suspend_reason { ceph_mdsc_suspend_reasons } __mode(byte); 30 + 31 + #endif 32 + 33 + /* 34 + * Export enum symbols via userspace. 35 + */ 36 + #undef EM 37 + #undef E_ 38 + #define EM(a, b) TRACE_DEFINE_ENUM(a); 39 + #define E_(a, b) TRACE_DEFINE_ENUM(a); 40 + 41 + ceph_mdsc_suspend_reasons; 42 + 43 + /* 44 + * Now redefine the EM() and E_() macros to map the enums to the strings that 45 + * will be printed in the output. 46 + */ 47 + #undef EM 48 + #undef E_ 49 + #define EM(a, b) { a, b }, 50 + #define E_(a, b) { a, b } 51 + 52 + TRACE_EVENT(ceph_mdsc_submit_request, 53 + TP_PROTO(struct ceph_mds_client *mdsc, 54 + struct ceph_mds_request *req), 55 + 56 + TP_ARGS(mdsc, req), 57 + 58 + TP_STRUCT__entry( 59 + __field(u64, tid) 60 + __field(int, op) 61 + __field(u64, ino) 62 + __field(u64, snap) 63 + ), 64 + 65 + TP_fast_assign( 66 + struct inode *inode; 67 + 68 + __entry->tid = req->r_tid; 69 + __entry->op = req->r_op; 70 + 71 + inode = req->r_inode; 72 + if (inode == NULL && req->r_dentry) 73 + inode = d_inode(req->r_dentry); 74 + 75 + if (inode) { 76 + __entry->ino = ceph_ino(inode); 77 + __entry->snap = ceph_snap(inode); 78 + } else { 79 + __entry->ino = __entry->snap = 0; 80 + } 81 + ), 82 + 83 + TP_printk("R=%llu op=%s ino=%llx,%llx", 84 + __entry->tid, 85 + ceph_mds_op_name(__entry->op), 86 + __entry->ino, __entry->snap) 87 + ); 88 + 89 + TRACE_EVENT(ceph_mdsc_suspend_request, 90 + TP_PROTO(struct ceph_mds_client *mdsc, 91 + struct ceph_mds_session *session, 92 + struct ceph_mds_request *req, 93 + enum ceph_mdsc_suspend_reason reason), 94 + 95 + TP_ARGS(mdsc, session, req, reason), 96 + 97 + TP_STRUCT__entry( 98 + __field(u64, tid) 99 + __field(int, op) 100 + __field(int, mds) 101 + __field(enum ceph_mdsc_suspend_reason, reason) 102 + ), 103 + 104 + TP_fast_assign( 105 + __entry->tid = req->r_tid; 106 + __entry->op = req->r_op; 107 + __entry->mds = session ? session->s_mds : -1; 108 + __entry->reason = reason; 109 + ), 110 + 111 + TP_printk("R=%llu op=%s reason=%s", 112 + __entry->tid, 113 + ceph_mds_op_name(__entry->op), 114 + __print_symbolic(__entry->reason, ceph_mdsc_suspend_reasons)) 115 + ); 116 + 117 + TRACE_EVENT(ceph_mdsc_resume_request, 118 + TP_PROTO(struct ceph_mds_client *mdsc, 119 + struct ceph_mds_request *req), 120 + 121 + TP_ARGS(mdsc, req), 122 + 123 + TP_STRUCT__entry( 124 + __field(u64, tid) 125 + __field(int, op) 126 + ), 127 + 128 + TP_fast_assign( 129 + __entry->tid = req->r_tid; 130 + __entry->op = req->r_op; 131 + ), 132 + 133 + TP_printk("R=%llu op=%s", 134 + __entry->tid, 135 + ceph_mds_op_name(__entry->op)) 136 + ); 137 + 138 + TRACE_EVENT(ceph_mdsc_send_request, 139 + TP_PROTO(struct ceph_mds_session *session, 140 + struct ceph_mds_request *req), 141 + 142 + TP_ARGS(session, req), 143 + 144 + TP_STRUCT__entry( 145 + __field(u64, tid) 146 + __field(int, op) 147 + __field(int, mds) 148 + ), 149 + 150 + TP_fast_assign( 151 + __entry->tid = req->r_tid; 152 + __entry->op = req->r_op; 153 + __entry->mds = session->s_mds; 154 + ), 155 + 156 + TP_printk("R=%llu op=%s mds=%d", 157 + __entry->tid, 158 + ceph_mds_op_name(__entry->op), 159 + __entry->mds) 160 + ); 161 + 162 + TRACE_EVENT(ceph_mdsc_complete_request, 163 + TP_PROTO(struct ceph_mds_client *mdsc, 164 + struct ceph_mds_request *req), 165 + 166 + TP_ARGS(mdsc, req), 167 + 168 + TP_STRUCT__entry( 169 + __field(u64, tid) 170 + __field(int, op) 171 + __field(int, err) 172 + __field(unsigned long, latency_ns) 173 + ), 174 + 175 + TP_fast_assign( 176 + __entry->tid = req->r_tid; 177 + __entry->op = req->r_op; 178 + __entry->err = req->r_err; 179 + __entry->latency_ns = req->r_end_latency - req->r_start_latency; 180 + ), 181 + 182 + TP_printk("R=%llu op=%s err=%d latency_ns=%lu", 183 + __entry->tid, 184 + ceph_mds_op_name(__entry->op), 185 + __entry->err, 186 + __entry->latency_ns) 187 + ); 188 + 189 + TRACE_EVENT(ceph_handle_caps, 190 + TP_PROTO(struct ceph_mds_client *mdsc, 191 + struct ceph_mds_session *session, 192 + int op, 193 + const struct ceph_vino *vino, 194 + struct ceph_inode_info *inode, 195 + u32 seq, u32 mseq, u32 issue_seq), 196 + 197 + TP_ARGS(mdsc, session, op, vino, inode, seq, mseq, issue_seq), 198 + 199 + TP_STRUCT__entry( 200 + __field(int, mds) 201 + __field(int, op) 202 + __field(u64, ino) 203 + __field(u64, snap) 204 + __field(u32, seq) 205 + __field(u32, mseq) 206 + __field(u32, issue_seq) 207 + ), 208 + 209 + TP_fast_assign( 210 + __entry->mds = session->s_mds; 211 + __entry->op = op; 212 + __entry->ino = vino->ino; 213 + __entry->snap = vino->snap; 214 + __entry->seq = seq; 215 + __entry->mseq = mseq; 216 + __entry->issue_seq = issue_seq; 217 + ), 218 + 219 + TP_printk("mds=%d op=%s vino=%llx.%llx seq=%u iseq=%u mseq=%u", 220 + __entry->mds, 221 + ceph_cap_op_name(__entry->op), 222 + __entry->ino, 223 + __entry->snap, 224 + __entry->seq, 225 + __entry->issue_seq, 226 + __entry->mseq) 227 + ); 228 + 229 + #undef EM 230 + #undef E_ 231 + #endif /* _TRACE_CEPH_H */ 232 + 233 + /* This part must be outside protection */ 234 + #include <trace/define_trace.h>