Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

RDMA/core: Trace points for diagnosing completion queue issues

Sample trace events:

kworker/u29:0-300 [007] 120.042217: cq_alloc: cq.id=4 nr_cqe=161 comp_vector=2 poll_ctx=WORKQUEUE
<idle>-0 [002] 120.056292: cq_schedule: cq.id=4
kworker/2:1H-482 [002] 120.056402: cq_process: cq.id=4 wake-up took 109 [us] from interrupt
kworker/2:1H-482 [002] 120.056407: cq_poll: cq.id=4 requested 16, returned 1
<idle>-0 [002] 120.067503: cq_schedule: cq.id=4
kworker/2:1H-482 [002] 120.067537: cq_process: cq.id=4 wake-up took 34 [us] from interrupt
kworker/2:1H-482 [002] 120.067541: cq_poll: cq.id=4 requested 16, returned 1
<idle>-0 [002] 120.067657: cq_schedule: cq.id=4
kworker/2:1H-482 [002] 120.067672: cq_process: cq.id=4 wake-up took 15 [us] from interrupt
kworker/2:1H-482 [002] 120.067674: cq_poll: cq.id=4 requested 16, returned 1

...

systemd-1 [002] 122.392653: cq_schedule: cq.id=4
kworker/2:1H-482 [002] 122.392688: cq_process: cq.id=4 wake-up took 35 [us] from interrupt
kworker/2:1H-482 [002] 122.392693: cq_poll: cq.id=4 requested 16, returned 16
kworker/2:1H-482 [002] 122.392836: cq_poll: cq.id=4 requested 16, returned 16
kworker/2:1H-482 [002] 122.392970: cq_poll: cq.id=4 requested 16, returned 16
kworker/2:1H-482 [002] 122.393083: cq_poll: cq.id=4 requested 16, returned 16
kworker/2:1H-482 [002] 122.393195: cq_poll: cq.id=4 requested 16, returned 3

Several features to note in this output:
- The WCE count and context type are reported at allocation time
- The CPU and kworker for each CQ are evident
- The CQ's restracker ID is tagged on each trace event
- CQ poll scheduling latency is measured
- Details about how often single completions occur versus multiple
completions are evident
- The cost of the ULP's completion handler is recorded

Link: https://lore.kernel.org/r/20191218201815.30584.3481.stgit@manet.1015granger.net
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>

Authored by Chuck Lever; committed by Jason Gunthorpe
3e5901cb ed999f82

+320 -4
+2 -1
drivers/infiniband/core/Makefile
··· 11 11 device.o fmr_pool.o cache.o netlink.o \ 12 12 roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ 13 13 multicast.o mad.o smi.o agent.o mad_rmpp.o \ 14 - nldev.o restrack.o counters.o ib_core_uverbs.o 14 + nldev.o restrack.o counters.o ib_core_uverbs.o \ 15 + trace.o 15 16 16 17 ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o 17 18 ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
+24 -3
drivers/infiniband/core/cq.c
··· 7 7 #include <linux/slab.h> 8 8 #include <rdma/ib_verbs.h> 9 9 10 + #include <trace/events/rdma_core.h> 11 + 10 12 /* # of WCs to poll for with a single call to ib_poll_cq */ 11 13 #define IB_POLL_BATCH 16 12 14 #define IB_POLL_BATCH_DIRECT 8 ··· 43 41 44 42 dim->state = DIM_START_MEASURE; 45 43 44 + trace_cq_modify(cq, comps, usec); 46 45 cq->device->ops.modify_cq(cq, comps, usec); 47 46 } 48 47 ··· 68 65 INIT_WORK(&dim->work, ib_cq_rdma_dim_work); 69 66 } 70 67 68 + static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) 69 + { 70 + int rc; 71 + 72 + rc = ib_poll_cq(cq, num_entries, wc); 73 + trace_cq_poll(cq, num_entries, rc); 74 + return rc; 75 + } 76 + 71 77 static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, 72 78 int batch) 73 79 { 74 80 int i, n, completed = 0; 81 + 82 + trace_cq_process(cq); 75 83 76 84 /* 77 85 * budget might be (-1) if the caller does not 78 86 * want to bound this call, thus we need unsigned 79 87 * minimum here. 
80 88 */ 81 - while ((n = ib_poll_cq(cq, min_t(u32, batch, 82 - budget - completed), wcs)) > 0) { 89 + while ((n = __poll_cq(cq, min_t(u32, batch, 90 + budget - completed), wcs)) > 0) { 83 91 for (i = 0; i < n; i++) { 84 92 struct ib_wc *wc = &wcs[i]; 85 93 ··· 145 131 completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); 146 132 if (completed < budget) { 147 133 irq_poll_complete(&cq->iop); 148 - if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) 134 + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) { 135 + trace_cq_reschedule(cq); 149 136 irq_poll_sched(&cq->iop); 137 + } 150 138 } 151 139 152 140 if (dim) ··· 159 143 160 144 static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) 161 145 { 146 + trace_cq_schedule(cq); 162 147 irq_poll_sched(&cq->iop); 163 148 } 164 149 ··· 179 162 180 163 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) 181 164 { 165 + trace_cq_schedule(cq); 182 166 queue_work(cq->comp_wq, &cq->work); 183 167 } 184 168 ··· 257 239 goto out_destroy_cq; 258 240 } 259 241 242 + trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx); 260 243 return cq; 261 244 262 245 out_destroy_cq: ··· 267 248 kfree(cq->wc); 268 249 out_free_cq: 269 250 kfree(cq); 251 + trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret); 270 252 return ERR_PTR(ret); 271 253 } 272 254 EXPORT_SYMBOL(__ib_alloc_cq_user); ··· 324 304 WARN_ON_ONCE(1); 325 305 } 326 306 307 + trace_cq_free(cq); 327 308 rdma_restrack_del(&cq->res); 328 309 cq->device->ops.destroy_cq(cq, udata); 329 310 if (cq->dim)
+14
drivers/infiniband/core/trace.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Trace points for core RDMA functions. 4 + * 5 + * Author: Chuck Lever <chuck.lever@oracle.com> 6 + * 7 + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 8 + */ 9 + 10 + #define CREATE_TRACE_POINTS 11 + 12 + #include <rdma/ib_verbs.h> 13 + 14 + #include <trace/events/rdma_core.h>
+4
drivers/infiniband/core/verbs.c
··· 53 53 54 54 #include "core_priv.h" 55 55 56 + #include <trace/events/rdma_core.h> 57 + 56 58 static int ib_resolve_eth_dmac(struct ib_device *device, 57 59 struct rdma_ah_attr *ah_attr); 58 60 ··· 2746 2744 qp->device->ops.drain_sq(qp); 2747 2745 else 2748 2746 __ib_drain_sq(qp); 2747 + trace_cq_drain_complete(qp->send_cq); 2749 2748 } 2750 2749 EXPORT_SYMBOL(ib_drain_sq); 2751 2750 ··· 2775 2772 qp->device->ops.drain_rq(qp); 2776 2773 else 2777 2774 __ib_drain_rq(qp); 2775 + trace_cq_drain_complete(qp->recv_cq); 2778 2776 } 2779 2777 EXPORT_SYMBOL(ib_drain_rq); 2780 2778
+5
include/rdma/ib_verbs.h
··· 1558 1558 }; 1559 1559 struct workqueue_struct *comp_wq; 1560 1560 struct dim *dim; 1561 + 1562 + /* updated only by trace points */ 1563 + ktime_t timestamp; 1564 + bool interrupt; 1565 + 1561 1566 /* 1562 1567 * Implementation details of the RDMA core, don't use in drivers: 1563 1568 */
+271
include/trace/events/rdma_core.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Trace point definitions for core RDMA functions. 4 + * 5 + * Author: Chuck Lever <chuck.lever@oracle.com> 6 + * 7 + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 8 + */ 9 + 10 + #undef TRACE_SYSTEM 11 + #define TRACE_SYSTEM rdma_core 12 + 13 + #if !defined(_TRACE_RDMA_CORE_H) || defined(TRACE_HEADER_MULTI_READ) 14 + #define _TRACE_RDMA_CORE_H 15 + 16 + #include <linux/tracepoint.h> 17 + #include <rdma/ib_verbs.h> 18 + 19 + /* 20 + * enum ib_poll_context, from include/rdma/ib_verbs.h 21 + */ 22 + #define IB_POLL_CTX_LIST \ 23 + ib_poll_ctx(DIRECT) \ 24 + ib_poll_ctx(SOFTIRQ) \ 25 + ib_poll_ctx(WORKQUEUE) \ 26 + ib_poll_ctx_end(UNBOUND_WORKQUEUE) 27 + 28 + #undef ib_poll_ctx 29 + #undef ib_poll_ctx_end 30 + 31 + #define ib_poll_ctx(x) TRACE_DEFINE_ENUM(IB_POLL_##x); 32 + #define ib_poll_ctx_end(x) TRACE_DEFINE_ENUM(IB_POLL_##x); 33 + 34 + IB_POLL_CTX_LIST 35 + 36 + #undef ib_poll_ctx 37 + #undef ib_poll_ctx_end 38 + 39 + #define ib_poll_ctx(x) { IB_POLL_##x, #x }, 40 + #define ib_poll_ctx_end(x) { IB_POLL_##x, #x } 41 + 42 + #define rdma_show_ib_poll_ctx(x) \ 43 + __print_symbolic(x, IB_POLL_CTX_LIST) 44 + 45 + /** 46 + ** Completion Queue events 47 + **/ 48 + 49 + TRACE_EVENT(cq_schedule, 50 + TP_PROTO( 51 + struct ib_cq *cq 52 + ), 53 + 54 + TP_ARGS(cq), 55 + 56 + TP_STRUCT__entry( 57 + __field(u32, cq_id) 58 + ), 59 + 60 + TP_fast_assign( 61 + cq->timestamp = ktime_get(); 62 + cq->interrupt = true; 63 + 64 + __entry->cq_id = cq->res.id; 65 + ), 66 + 67 + TP_printk("cq.id=%u", __entry->cq_id) 68 + ); 69 + 70 + TRACE_EVENT(cq_reschedule, 71 + TP_PROTO( 72 + struct ib_cq *cq 73 + ), 74 + 75 + TP_ARGS(cq), 76 + 77 + TP_STRUCT__entry( 78 + __field(u32, cq_id) 79 + ), 80 + 81 + TP_fast_assign( 82 + cq->timestamp = ktime_get(); 83 + cq->interrupt = false; 84 + 85 + __entry->cq_id = cq->res.id; 86 + ), 87 + 88 + TP_printk("cq.id=%u", __entry->cq_id) 89 + ); 90 + 91 + 
TRACE_EVENT(cq_process, 92 + TP_PROTO( 93 + const struct ib_cq *cq 94 + ), 95 + 96 + TP_ARGS(cq), 97 + 98 + TP_STRUCT__entry( 99 + __field(u32, cq_id) 100 + __field(bool, interrupt) 101 + __field(s64, latency) 102 + ), 103 + 104 + TP_fast_assign( 105 + ktime_t latency = ktime_sub(ktime_get(), cq->timestamp); 106 + 107 + __entry->cq_id = cq->res.id; 108 + __entry->latency = ktime_to_us(latency); 109 + __entry->interrupt = cq->interrupt; 110 + ), 111 + 112 + TP_printk("cq.id=%u wake-up took %lld [us] from %s", 113 + __entry->cq_id, __entry->latency, 114 + __entry->interrupt ? "interrupt" : "reschedule" 115 + ) 116 + ); 117 + 118 + TRACE_EVENT(cq_poll, 119 + TP_PROTO( 120 + const struct ib_cq *cq, 121 + int requested, 122 + int rc 123 + ), 124 + 125 + TP_ARGS(cq, requested, rc), 126 + 127 + TP_STRUCT__entry( 128 + __field(u32, cq_id) 129 + __field(int, requested) 130 + __field(int, rc) 131 + ), 132 + 133 + TP_fast_assign( 134 + __entry->cq_id = cq->res.id; 135 + __entry->requested = requested; 136 + __entry->rc = rc; 137 + ), 138 + 139 + TP_printk("cq.id=%u requested %d, returned %d", 140 + __entry->cq_id, __entry->requested, __entry->rc 141 + ) 142 + ); 143 + 144 + TRACE_EVENT(cq_drain_complete, 145 + TP_PROTO( 146 + const struct ib_cq *cq 147 + ), 148 + 149 + TP_ARGS(cq), 150 + 151 + TP_STRUCT__entry( 152 + __field(u32, cq_id) 153 + ), 154 + 155 + TP_fast_assign( 156 + __entry->cq_id = cq->res.id; 157 + ), 158 + 159 + TP_printk("cq.id=%u", 160 + __entry->cq_id 161 + ) 162 + ); 163 + 164 + 165 + TRACE_EVENT(cq_modify, 166 + TP_PROTO( 167 + const struct ib_cq *cq, 168 + u16 comps, 169 + u16 usec 170 + ), 171 + 172 + TP_ARGS(cq, comps, usec), 173 + 174 + TP_STRUCT__entry( 175 + __field(u32, cq_id) 176 + __field(unsigned int, comps) 177 + __field(unsigned int, usec) 178 + ), 179 + 180 + TP_fast_assign( 181 + __entry->cq_id = cq->res.id; 182 + __entry->comps = comps; 183 + __entry->usec = usec; 184 + ), 185 + 186 + TP_printk("cq.id=%u comps=%u usec=%u", 187 + 
__entry->cq_id, __entry->comps, __entry->usec 188 + ) 189 + ); 190 + 191 + TRACE_EVENT(cq_alloc, 192 + TP_PROTO( 193 + const struct ib_cq *cq, 194 + int nr_cqe, 195 + int comp_vector, 196 + enum ib_poll_context poll_ctx 197 + ), 198 + 199 + TP_ARGS(cq, nr_cqe, comp_vector, poll_ctx), 200 + 201 + TP_STRUCT__entry( 202 + __field(u32, cq_id) 203 + __field(int, nr_cqe) 204 + __field(int, comp_vector) 205 + __field(unsigned long, poll_ctx) 206 + ), 207 + 208 + TP_fast_assign( 209 + __entry->cq_id = cq->res.id; 210 + __entry->nr_cqe = nr_cqe; 211 + __entry->comp_vector = comp_vector; 212 + __entry->poll_ctx = poll_ctx; 213 + ), 214 + 215 + TP_printk("cq.id=%u nr_cqe=%d comp_vector=%d poll_ctx=%s", 216 + __entry->cq_id, __entry->nr_cqe, __entry->comp_vector, 217 + rdma_show_ib_poll_ctx(__entry->poll_ctx) 218 + ) 219 + ); 220 + 221 + TRACE_EVENT(cq_alloc_error, 222 + TP_PROTO( 223 + int nr_cqe, 224 + int comp_vector, 225 + enum ib_poll_context poll_ctx, 226 + int rc 227 + ), 228 + 229 + TP_ARGS(nr_cqe, comp_vector, poll_ctx, rc), 230 + 231 + TP_STRUCT__entry( 232 + __field(int, rc) 233 + __field(int, nr_cqe) 234 + __field(int, comp_vector) 235 + __field(unsigned long, poll_ctx) 236 + ), 237 + 238 + TP_fast_assign( 239 + __entry->rc = rc; 240 + __entry->nr_cqe = nr_cqe; 241 + __entry->comp_vector = comp_vector; 242 + __entry->poll_ctx = poll_ctx; 243 + ), 244 + 245 + TP_printk("nr_cqe=%d comp_vector=%d poll_ctx=%s rc=%d", 246 + __entry->nr_cqe, __entry->comp_vector, 247 + rdma_show_ib_poll_ctx(__entry->poll_ctx), __entry->rc 248 + ) 249 + ); 250 + 251 + TRACE_EVENT(cq_free, 252 + TP_PROTO( 253 + const struct ib_cq *cq 254 + ), 255 + 256 + TP_ARGS(cq), 257 + 258 + TP_STRUCT__entry( 259 + __field(u32, cq_id) 260 + ), 261 + 262 + TP_fast_assign( 263 + __entry->cq_id = cq->res.id; 264 + ), 265 + 266 + TP_printk("cq.id=%u", __entry->cq_id) 267 + ); 268 + 269 + #endif /* _TRACE_RDMA_CORE_H */ 270 + 271 + #include <trace/define_trace.h>