drm/msm: add perf logging debugfs · tjh.dev/kernel@70c70f0

+1

drivers/gpu/drm/msm/Makefile

··· 34 34 msm_gem_submit.o \ 35 35 msm_gpu.o \ 36 36 msm_iommu.o \ 37 + msm_perf.o \ 37 38 msm_rd.o \ 38 39 msm_ringbuffer.o 39 40

+15 -5

drivers/gpu/drm/msm/adreno/a3xx_gpu.c

··· 207 207 /* Turn on performance counters: */ 208 208 gpu_write(gpu, REG_A3XX_RBBM_PERFCTR_CTL, 0x01); 209 209 210 - /* Set SP perfcounter 7 to count SP_FS_FULL_ALU_INSTRUCTIONS 211 - * we will use this to augment our hang detection: 212 - */ 213 - gpu_write(gpu, REG_A3XX_SP_PERFCOUNTER7_SELECT, 214 - SP_FS_FULL_ALU_INSTRUCTIONS); 210 + /* Enable the perfcntrs that we use.. */ 211 + for (i = 0; i < gpu->num_perfcntrs; i++) { 212 + const struct msm_gpu_perfcntr *perfcntr = &gpu->perfcntrs[i]; 213 + gpu_write(gpu, perfcntr->select_reg, perfcntr->select_val); 214 + } 215 215 216 216 gpu_write(gpu, REG_A3XX_RBBM_INT_0_MASK, A3XX_INT0_MASK); 217 217 ··· 465 465 }, 466 466 }; 467 467 468 + static const struct msm_gpu_perfcntr perfcntrs[] = { 469 + { REG_A3XX_SP_PERFCOUNTER6_SELECT, REG_A3XX_RBBM_PERFCTR_SP_6_LO, 470 + SP_ALU_ACTIVE_CYCLES, "ALUACTIVE" }, 471 + { REG_A3XX_SP_PERFCOUNTER7_SELECT, REG_A3XX_RBBM_PERFCTR_SP_7_LO, 472 + SP_FS_FULL_ALU_INSTRUCTIONS, "ALUFULL" }, 473 + }; 474 + 468 475 struct msm_gpu *a3xx_gpu_init(struct drm_device *dev) 469 476 { 470 477 struct a3xx_gpu *a3xx_gpu = NULL; ··· 510 503 511 504 DBG("fast_rate=%u, slow_rate=%u, bus_freq=%u", 512 505 gpu->fast_rate, gpu->slow_rate, gpu->bus_freq); 506 + 507 + gpu->perfcntrs = perfcntrs; 508 + gpu->num_perfcntrs = ARRAY_SIZE(perfcntrs); 513 509 514 510 ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, config->rev); 515 511 if (ret)

+7

drivers/gpu/drm/msm/msm_drv.c

··· 548 548 return ret; 549 549 } 550 550 551 + ret = msm_perf_debugfs_init(minor); 552 + if (ret) { 553 + dev_err(minor->dev->dev, "could not install perf debugfs\n"); 554 + return ret; 555 + } 556 + 551 557 return 0; 552 558 } 553 559 ··· 594 588 if (!minor->dev->dev_private) 595 589 return; 596 590 msm_rd_debugfs_cleanup(minor); 591 + msm_perf_debugfs_cleanup(minor); 597 592 } 598 593 #endif 599 594

+4

drivers/gpu/drm/msm/msm_drv.h

··· 56 56 struct msm_gpu; 57 57 struct msm_mmu; 58 58 struct msm_rd_state; 59 + struct msm_perf_state; 59 60 struct msm_gem_submit; 60 61 61 62 #define NUM_DOMAINS 2 /* one for KMS, then one per gpu core (?) */ ··· 86 85 wait_queue_head_t fence_event; 87 86 88 87 struct msm_rd_state *rd; 88 + struct msm_perf_state *perf; 89 89 90 90 /* list of GEM objects: */ 91 91 struct list_head inactive_list; ··· 214 212 int msm_rd_debugfs_init(struct drm_minor *minor); 215 213 void msm_rd_debugfs_cleanup(struct drm_minor *minor); 216 214 void msm_rd_dump_submit(struct msm_gem_submit *submit); 215 + int msm_perf_debugfs_init(struct drm_minor *minor); 216 + void msm_perf_debugfs_cleanup(struct drm_minor *minor); 217 217 #else 218 218 static inline int msm_debugfs_late_init(struct drm_device *dev) { return 0; } 219 219 static inline void msm_rd_dump_submit(struct msm_gem_submit *submit) {}

+103

drivers/gpu/drm/msm/msm_gpu.c

··· 320 320 } 321 321 322 322 /* 323 + * Performance Counters: 324 + */ 325 + 326 + /* called under perf_lock */ 327 + static int update_hw_cntrs(struct msm_gpu *gpu, uint32_t ncntrs, uint32_t *cntrs) 328 + { 329 + uint32_t current_cntrs[ARRAY_SIZE(gpu->last_cntrs)]; 330 + int i, n = min(ncntrs, gpu->num_perfcntrs); 331 + 332 + /* read current values: */ 333 + for (i = 0; i < gpu->num_perfcntrs; i++) 334 + current_cntrs[i] = gpu_read(gpu, gpu->perfcntrs[i].sample_reg); 335 + 336 + /* update cntrs: */ 337 + for (i = 0; i < n; i++) 338 + cntrs[i] = current_cntrs[i] - gpu->last_cntrs[i]; 339 + 340 + /* save current values: */ 341 + for (i = 0; i < gpu->num_perfcntrs; i++) 342 + gpu->last_cntrs[i] = current_cntrs[i]; 343 + 344 + return n; 345 + } 346 + 347 + static void update_sw_cntrs(struct msm_gpu *gpu) 348 + { 349 + ktime_t time; 350 + uint32_t elapsed; 351 + unsigned long flags; 352 + 353 + spin_lock_irqsave(&gpu->perf_lock, flags); 354 + if (!gpu->perfcntr_active) 355 + goto out; 356 + 357 + time = ktime_get(); 358 + elapsed = ktime_to_us(ktime_sub(time, gpu->last_sample.time)); 359 + 360 + gpu->totaltime += elapsed; 361 + if (gpu->last_sample.active) 362 + gpu->activetime += elapsed; 363 + 364 + gpu->last_sample.active = msm_gpu_active(gpu); 365 + gpu->last_sample.time = time; 366 + 367 + out: 368 + spin_unlock_irqrestore(&gpu->perf_lock, flags); 369 + } 370 + 371 + void msm_gpu_perfcntr_start(struct msm_gpu *gpu) 372 + { 373 + unsigned long flags; 374 + 375 + spin_lock_irqsave(&gpu->perf_lock, flags); 376 + /* we could dynamically enable/disable perfcntr registers too.. 
*/ 377 + gpu->last_sample.active = msm_gpu_active(gpu); 378 + gpu->last_sample.time = ktime_get(); 379 + gpu->activetime = gpu->totaltime = 0; 380 + gpu->perfcntr_active = true; 381 + update_hw_cntrs(gpu, 0, NULL); 382 + spin_unlock_irqrestore(&gpu->perf_lock, flags); 383 + } 384 + 385 + void msm_gpu_perfcntr_stop(struct msm_gpu *gpu) 386 + { 387 + gpu->perfcntr_active = false; 388 + } 389 + 390 + /* returns -errno or # of cntrs sampled */ 391 + int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime, 392 + uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs) 393 + { 394 + unsigned long flags; 395 + int ret; 396 + 397 + spin_lock_irqsave(&gpu->perf_lock, flags); 398 + 399 + if (!gpu->perfcntr_active) { 400 + ret = -EINVAL; 401 + goto out; 402 + } 403 + 404 + *activetime = gpu->activetime; 405 + *totaltime = gpu->totaltime; 406 + 407 + gpu->activetime = gpu->totaltime = 0; 408 + 409 + ret = update_hw_cntrs(gpu, ncntrs, cntrs); 410 + 411 + out: 412 + spin_unlock_irqrestore(&gpu->perf_lock, flags); 413 + 414 + return ret; 415 + } 416 + 417 + /* 323 418 * Cmdstream submission/retirement: 324 419 */ 325 420 ··· 456 361 { 457 362 struct msm_drm_private *priv = gpu->dev->dev_private; 458 363 queue_work(priv->wq, &gpu->retire_work); 364 + update_sw_cntrs(gpu); 459 365 } 460 366 461 367 /* add bo's to gpu's ring, and kick gpu: */ ··· 476 380 msm_rd_dump_submit(submit); 477 381 478 382 gpu->submitted_fence = submit->fence; 383 + 384 + update_sw_cntrs(gpu); 479 385 480 386 ret = gpu->funcs->submit(gpu, submit, ctx); 481 387 priv->lastctx = ctx; ··· 531 433 struct iommu_domain *iommu; 532 434 int i, ret; 533 435 436 + if (WARN_ON(gpu->num_perfcntrs > ARRAY_SIZE(gpu->last_cntrs))) 437 + gpu->num_perfcntrs = ARRAY_SIZE(gpu->last_cntrs); 438 + 534 439 gpu->dev = drm; 535 440 gpu->funcs = funcs; 536 441 gpu->name = name; ··· 548 447 (unsigned long)gpu); 549 448 setup_timer(&gpu->hangcheck_timer, hangcheck_handler, 550 449 (unsigned long)gpu); 450 + 451 + 
spin_lock_init(&gpu->perf_lock); 551 452 552 453 BUG_ON(ARRAY_SIZE(clk_names) != ARRAY_SIZE(gpu->grp_clks)); 553 454

+31

drivers/gpu/drm/msm/msm_gpu.h

··· 25 25 #include "msm_ringbuffer.h" 26 26 27 27 struct msm_gem_submit; 28 + struct msm_gpu_perfcntr; 28 29 29 30 /* So far, with hardware that I've seen to date, we can have: 30 31 * + zero, one, or two z180 2d cores ··· 64 63 const char *name; 65 64 struct drm_device *dev; 66 65 const struct msm_gpu_funcs *funcs; 66 + 67 + /* performance counters (hw & sw): */ 68 + spinlock_t perf_lock; 69 + bool perfcntr_active; 70 + struct { 71 + bool active; 72 + ktime_t time; 73 + } last_sample; 74 + uint32_t totaltime, activetime; /* sw counters */ 75 + uint32_t last_cntrs[5]; /* hw counters */ 76 + const struct msm_gpu_perfcntr *perfcntrs; 77 + uint32_t num_perfcntrs; 67 78 68 79 struct msm_ringbuffer *rb; 69 80 uint32_t rb_iova; ··· 126 113 return gpu->submitted_fence > gpu->funcs->last_fence(gpu); 127 114 } 128 115 116 + /* Perf-Counters: 117 + * The select_reg and select_val are just there for the benefit of the child 118 + * class that actually enables the perf counter.. but msm_gpu base class 119 + * will handle sampling/displaying the counters. 120 + */ 121 + 122 + struct msm_gpu_perfcntr { 123 + uint32_t select_reg; 124 + uint32_t sample_reg; 125 + uint32_t select_val; 126 + const char *name; 127 + }; 128 + 129 129 static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data) 130 130 { 131 131 msm_writel(data, gpu->mmio + (reg << 2)); ··· 151 125 152 126 int msm_gpu_pm_suspend(struct msm_gpu *gpu); 153 127 int msm_gpu_pm_resume(struct msm_gpu *gpu); 128 + 129 + void msm_gpu_perfcntr_start(struct msm_gpu *gpu); 130 + void msm_gpu_perfcntr_stop(struct msm_gpu *gpu); 131 + int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime, 132 + uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs); 154 133 155 134 void msm_gpu_retire(struct msm_gpu *gpu); 156 135 int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,

+275

drivers/gpu/drm/msm/msm_perf.c

/*
 * Copyright (C) 2013 Red Hat
 * Author: Rob Clark <robdclark@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/* For profiling, userspace can:
 *
 *   tail -f /sys/kernel/debug/dri/<minor>/perf
 *
 * This will enable performance counters/profiling to track the busy time
 * and any gpu specific performance counters that are supported.
 *
 * Only one reader may have the file open at a time.  Every 32nd line is a
 * header naming the columns; the other lines each block for up to
 * SAMPLE_TIME and then report one sample.
 */

#ifdef CONFIG_DEBUG_FS

#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/uaccess.h>

#include "msm_drv.h"
#include "msm_gpu.h"

/* State behind the single "perf" debugfs file of a device. */
struct msm_perf_state {
	struct drm_device *dev;

	bool open;			/* file currently held open by a reader */
	int cnt;			/* lines produced since open (header cadence) */
	struct mutex read_lock;		/* serializes perf_read() callers */

	char buf[256];			/* current formatted line */
	int buftot, bufpos;		/* bytes valid in buf / bytes already read */

	unsigned long next_jiffies;	/* when the next sample is due */

	struct dentry *ent;
	struct drm_info_node *node;
};

#define SAMPLE_TIME (HZ/4)

/*
 * Sleep until the next sample time.
 *
 * Returns 0 once the sample is due, or -ERESTARTSYS if the sleep ended
 * before the timeout expired.
 */
static int wait_sample(struct msm_perf_state *perf)
{
	unsigned long start_jiffies = jiffies;

	if (time_after(perf->next_jiffies, start_jiffies)) {
		unsigned long remaining_jiffies =
			perf->next_jiffies - start_jiffies;
		long ret = schedule_timeout_interruptible(remaining_jiffies);
		if (ret > 0) {
			/* interrupted */
			return -ERESTARTSYS;
		}
	}
	perf->next_jiffies += SAMPLE_TIME;
	return 0;
}

/*
 * Format the next output line (header or sample) into perf->buf and reset
 * the read cursor.
 *
 * Returns 0 on success, or the negative errno from wait_sample() /
 * msm_gpu_perfcntr_sample().  On error perf->buf is left untouched.
 */
static int refill_buf(struct msm_perf_state *perf)
{
	struct msm_drm_private *priv = perf->dev->dev_private;
	struct msm_gpu *gpu = priv->gpu;
	char *ptr = perf->buf;
	int rem = sizeof(perf->buf);
	int i, n;

	/*
	 * scnprintf() returns the number of bytes actually stored (never more
	 * than rem - 1), so 'rem' cannot go negative and 'buftot' can never
	 * exceed the buffer even if a line gets truncated.
	 */
	if ((perf->cnt++ % 32) == 0) {
		/* Header line: */
		n = scnprintf(ptr, rem, "%%BUSY");
		ptr += n;
		rem -= n;

		for (i = 0; i < gpu->num_perfcntrs; i++) {
			const struct msm_gpu_perfcntr *perfcntr = &gpu->perfcntrs[i];
			n = scnprintf(ptr, rem, "\t%s", perfcntr->name);
			ptr += n;
			rem -= n;
		}
	} else {
		/* Sample line: */
		uint32_t activetime = 0, totaltime = 0;
		uint32_t cntrs[5];
		uint32_t val;
		int ret;

		/* sleep until next sample time: */
		ret = wait_sample(perf);
		if (ret)
			return ret;

		ret = msm_gpu_perfcntr_sample(gpu, &activetime, &totaltime,
				ARRAY_SIZE(cntrs), cntrs);
		if (ret < 0)
			return ret;

		/*
		 * Busy percentage in tenths of a percent.  Widen to 64 bits
		 * before scaling: the times are in microseconds, so a 32-bit
		 * "1000 * activetime" wraps after ~4.29s of accumulated
		 * active time (e.g. a reader that stalls between reads).
		 */
		val = totaltime ?
			(uint32_t)div_u64((u64)activetime * 1000, totaltime) : 0;
		n = scnprintf(ptr, rem, "%3u.%u%%", val / 10, val % 10);
		ptr += n;
		rem -= n;

		for (i = 0; i < ret; i++) {
			/* cycle counters (I think).. convert to MHz.. */
			val = cntrs[i] / 10000;
			n = scnprintf(ptr, rem, "\t%5u.%02u",
					val / 100, val % 100);
			ptr += n;
			rem -= n;
		}
	}

	n = scnprintf(ptr, rem, "\n");
	ptr += n;
	rem -= n;

	perf->bufpos = 0;
	perf->buftot = ptr - perf->buf;

	return 0;
}

/*
 * read() handler: hand out the current line, generating a new one first if
 * the previous line has been fully consumed.
 *
 * Returns the number of bytes copied, or a negative errno.
 */
static ssize_t perf_read(struct file *file, char __user *buf,
		size_t sz, loff_t *ppos)
{
	struct msm_perf_state *perf = file->private_data;
	int n = 0, ret = 0;

	mutex_lock(&perf->read_lock);

	if (perf->bufpos >= perf->buftot) {
		ret = refill_buf(perf);
		if (ret)
			goto out;
	}

	/* compare as size_t so an oversized 'sz' cannot turn negative */
	n = min_t(size_t, sz, perf->buftot - perf->bufpos);

	/*
	 * copy_to_user() returns the number of bytes it could NOT copy, not
	 * an errno; translate any failure to -EFAULT rather than leaking a
	 * positive count to the caller as if it were the read length.
	 */
	if (copy_to_user(buf, &perf->buf[perf->bufpos], n)) {
		ret = -EFAULT;
		goto out;
	}

	perf->bufpos += n;
	*ppos += n;

out:
	mutex_unlock(&perf->read_lock);
	if (ret)
		return ret;
	return n;
}

/*
 * open() handler: claim the (single-reader) file and start sampling.
 *
 * Returns 0 on success, or -EBUSY if the file is already open or no gpu is
 * present.
 */
static int perf_open(struct inode *inode, struct file *file)
{
	struct msm_perf_state *perf = inode->i_private;
	struct drm_device *dev = perf->dev;
	struct msm_drm_private *priv = dev->dev_private;
	struct msm_gpu *gpu;
	int ret = 0;

	mutex_lock(&dev->struct_mutex);

	/* look the gpu up under the same lock that guards the check below */
	gpu = priv->gpu;

	if (perf->open || !gpu) {
		ret = -EBUSY;
		goto out;
	}

	file->private_data = perf;
	perf->open = true;
	perf->cnt = 0;
	perf->buftot = 0;
	perf->bufpos = 0;
	msm_gpu_perfcntr_start(gpu);
	perf->next_jiffies = jiffies + SAMPLE_TIME;

out:
	mutex_unlock(&dev->struct_mutex);
	return ret;
}

/* release() handler: stop sampling and let the next opener in. */
static int perf_release(struct inode *inode, struct file *file)
{
	struct msm_perf_state *perf = inode->i_private;
	struct msm_drm_private *priv = perf->dev->dev_private;
	msm_gpu_perfcntr_stop(priv->gpu);
	perf->open = false;
	return 0;
}


static const struct file_operations perf_debugfs_fops = {
	.owner = THIS_MODULE,
	.open = perf_open,
	.read = perf_read,
	.llseek = no_llseek,
	.release = perf_release,
};

/*
 * Create the "perf" debugfs file under the minor's debugfs root.
 *
 * Only the first minor gets the file; later calls are no-ops returning 0.
 * Returns 0 on success or -ENOMEM on failure, in which case everything
 * allocated here has been released again.
 */
int msm_perf_debugfs_init(struct drm_minor *minor)
{
	struct msm_drm_private *priv = minor->dev->dev_private;
	struct msm_perf_state *perf;

	/* only create on first minor: */
	if (priv->perf)
		return 0;

	perf = kzalloc(sizeof(*perf), GFP_KERNEL);
	if (!perf)
		return -ENOMEM;

	perf->dev = minor->dev;

	mutex_init(&perf->read_lock);
	priv->perf = perf;

	perf->node = kzalloc(sizeof(*perf->node), GFP_KERNEL);
	if (!perf->node)
		goto fail;

	/*
	 * Self-link the node so the list_del() in the cleanup path is safe
	 * even when we bail out before the node is added to
	 * minor->debugfs_list (a zeroed list_head would be dereferenced as
	 * NULL there).
	 */
	INIT_LIST_HEAD(&perf->node->list);

	perf->ent = debugfs_create_file("perf", S_IFREG | S_IRUGO,
			minor->debugfs_root, perf, &perf_debugfs_fops);
	if (!perf->ent) {
		DRM_ERROR("Cannot create /sys/kernel/debug/dri/%s/perf\n",
				minor->debugfs_root->d_name.name);
		goto fail;
	}

	perf->node->minor = minor;
	perf->node->dent = perf->ent;
	perf->node->info_ent = NULL;

	mutex_lock(&minor->debugfs_lock);
	list_add(&perf->node->list, &minor->debugfs_list);
	mutex_unlock(&minor->debugfs_lock);

	return 0;

fail:
	msm_perf_debugfs_cleanup(minor);
	return -ENOMEM;
}

/*
 * Tear down whatever msm_perf_debugfs_init() set up.  Does nothing if no
 * perf state is attached to the device, and also handles a partially
 * initialized state (it is the init error path).
 */
void msm_perf_debugfs_cleanup(struct drm_minor *minor)
{
	struct msm_drm_private *priv = minor->dev->dev_private;
	struct msm_perf_state *perf = priv->perf;

	if (!perf)
		return;

	priv->perf = NULL;

	debugfs_remove(perf->ent);

	if (perf->node) {
		mutex_lock(&minor->debugfs_lock);
		list_del(&perf->node->list);
		mutex_unlock(&minor->debugfs_lock);
		kfree(perf->node);
	}

	mutex_destroy(&perf->read_lock);

	kfree(perf);
}

#endif