Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/v3d: Add support for compute shader dispatch.

The compute shader dispatch interface is pretty simple -- just pass in
the regs that userspace has passed us, with no CLs to run. However,
with no CL to run, we need to do manual cache flushing of
the L2 after the HW execution completes (for SSBO, atomic, and
image_load_store writes that are the output of compute shaders).

This doesn't yet expose the L2 cache's ability to have a region of the
address space not write back to memory (which could be used for
shared_var storage).

So far, the Mesa side has been tested on V3D v4.2 simpenrose (passing
the ES31 tests), and on the kernel side on 7278 (failing atomic
compswap tests in a way that doesn't reproduce on simpenrose).

v2: Fix excessive allocation for the clean_job (reported by Dan
Carpenter). Keep refs on jobs until clean_job is finished, to
avoid spurious MMU errors if the output BOs are freed by userspace
before L2 cleaning is finished.

Signed-off-by: Eric Anholt <eric@anholt.net>
Link: https://patchwork.freedesktop.org/patch/msgid/20190416225856.20264-4-eric@anholt.net
Acked-by: Rob Clark <robdclark@gmail.com>

+531 -19
+22
drivers/gpu/drm/v3d/v3d_debugfs.c
··· 58 58 REGDEF(V3D_GMP_VIO_ADDR), 59 59 }; 60 60 61 + static const struct v3d_reg_def v3d_csd_reg_defs[] = { 62 + REGDEF(V3D_CSD_STATUS), 63 + REGDEF(V3D_CSD_CURRENT_CFG0), 64 + REGDEF(V3D_CSD_CURRENT_CFG1), 65 + REGDEF(V3D_CSD_CURRENT_CFG2), 66 + REGDEF(V3D_CSD_CURRENT_CFG3), 67 + REGDEF(V3D_CSD_CURRENT_CFG4), 68 + REGDEF(V3D_CSD_CURRENT_CFG5), 69 + REGDEF(V3D_CSD_CURRENT_CFG6), 70 + }; 71 + 61 72 static int v3d_v3d_debugfs_regs(struct seq_file *m, void *unused) 62 73 { 63 74 struct drm_info_node *node = (struct drm_info_node *)m->private; ··· 99 88 v3d_core_reg_defs[i].reg, 100 89 V3D_CORE_READ(core, 101 90 v3d_core_reg_defs[i].reg)); 91 + } 92 + 93 + if (v3d_has_csd(v3d)) { 94 + for (i = 0; i < ARRAY_SIZE(v3d_csd_reg_defs); i++) { 95 + seq_printf(m, "core %d %s (0x%04x): 0x%08x\n", 96 + core, 97 + v3d_csd_reg_defs[i].name, 98 + v3d_csd_reg_defs[i].reg, 99 + V3D_CORE_READ(core, 100 + v3d_csd_reg_defs[i].reg)); 101 + } 102 102 } 103 103 } 104 104
+7 -3
drivers/gpu/drm/v3d/v3d_drv.c
··· 7 7 * This driver supports the Broadcom V3D 3.3 and 4.1 OpenGL ES GPUs. 8 8 * For V3D 2.x support, see the VC4 driver. 9 9 * 10 - * Currently only single-core rendering using the binner and renderer, 11 - * along with TFU (texture formatting unit) rendering is supported. 12 - * V3D 4.x's CSD (compute shader dispatch) is not yet supported. 10 + * The V3D GPU includes a tiled render (composed of a bin and render 11 + * pipelines), the TFU (texture formatting unit), and the CSD (compute 12 + * shader dispatch). 13 13 */ 14 14 15 15 #include <linux/clk.h> ··· 120 120 case DRM_V3D_PARAM_SUPPORTS_TFU: 121 121 args->value = 1; 122 122 return 0; 123 + case DRM_V3D_PARAM_SUPPORTS_CSD: 124 + args->value = v3d_has_csd(v3d); 125 + return 0; 123 126 default: 124 127 DRM_DEBUG("Unknown parameter %d\n", args->param); 125 128 return -EINVAL; ··· 182 179 DRM_IOCTL_DEF_DRV(V3D_GET_PARAM, v3d_get_param_ioctl, DRM_RENDER_ALLOW), 183 180 DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW), 184 181 DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH), 182 + DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CSD, v3d_submit_csd_ioctl, DRM_RENDER_ALLOW | DRM_AUTH), 185 183 }; 186 184 187 185 static struct drm_driver v3d_drm_driver = {
+27 -1
drivers/gpu/drm/v3d/v3d_drv.h
··· 16 16 V3D_BIN, 17 17 V3D_RENDER, 18 18 V3D_TFU, 19 + V3D_CSD, 20 + V3D_CACHE_CLEAN, 19 21 }; 20 22 21 - #define V3D_MAX_QUEUES (V3D_TFU + 1) 23 + #define V3D_MAX_QUEUES (V3D_CACHE_CLEAN + 1) 22 24 23 25 struct v3d_queue_state { 24 26 struct drm_gpu_scheduler sched; ··· 72 70 struct v3d_bin_job *bin_job; 73 71 struct v3d_render_job *render_job; 74 72 struct v3d_tfu_job *tfu_job; 73 + struct v3d_csd_job *csd_job; 75 74 76 75 struct v3d_queue_state queue[V3D_MAX_QUEUES]; 77 76 ··· 95 92 */ 96 93 struct mutex sched_lock; 97 94 95 + /* Lock taken during a cache clean and when initiating an L2 96 + * flush, to keep L2 flushes from interfering with the 97 + * synchronous L2 cleans. 98 + */ 99 + struct mutex cache_clean_lock; 100 + 98 101 struct { 99 102 u32 num_allocated; 100 103 u32 pages_allocated; ··· 111 102 to_v3d_dev(struct drm_device *dev) 112 103 { 113 104 return (struct v3d_dev *)dev->dev_private; 105 + } 106 + 107 + static inline bool 108 + v3d_has_csd(struct v3d_dev *v3d) 109 + { 110 + return v3d->ver >= 41; 114 111 } 115 112 116 113 /* The per-fd struct, which tracks the MMU mappings. 
*/ ··· 237 222 struct drm_v3d_submit_tfu args; 238 223 }; 239 224 225 + struct v3d_csd_job { 226 + struct v3d_job base; 227 + 228 + u32 timedout_batches; 229 + 230 + struct drm_v3d_submit_csd args; 231 + }; 232 + 240 233 /** 241 234 * _wait_for - magic (register) wait macro 242 235 * ··· 306 283 struct drm_file *file_priv); 307 284 int v3d_submit_tfu_ioctl(struct drm_device *dev, void *data, 308 285 struct drm_file *file_priv); 286 + int v3d_submit_csd_ioctl(struct drm_device *dev, void *data, 287 + struct drm_file *file_priv); 309 288 int v3d_wait_bo_ioctl(struct drm_device *dev, void *data, 310 289 struct drm_file *file_priv); 311 290 void v3d_job_put(struct v3d_job *job); 312 291 void v3d_reset(struct v3d_dev *v3d); 313 292 void v3d_invalidate_caches(struct v3d_dev *v3d); 293 + void v3d_clean_caches(struct v3d_dev *v3d); 314 294 315 295 /* v3d_irq.c */ 316 296 int v3d_irq_init(struct v3d_dev *v3d);
+2
drivers/gpu/drm/v3d/v3d_fence.c
··· 36 36 return "v3d-render"; 37 37 case V3D_TFU: 38 38 return "v3d-tfu"; 39 + case V3D_CSD: 40 + return "v3d-csd"; 39 41 default: 40 42 return NULL; 41 43 }
+151 -5
drivers/gpu/drm/v3d/v3d_gem.c
··· 162 162 /* While there is a busy bit (V3D_L2TCACTL_L2TFLS), we don't 163 163 * need to wait for completion before dispatching the job -- 164 164 * L2T accesses will be stalled until the flush has completed. 165 + * However, we do need to make sure we don't try to trigger a 166 + * new flush while the L2_CLEAN queue is trying to 167 + * synchronously clean after a job. 165 168 */ 169 + mutex_lock(&v3d->cache_clean_lock); 166 170 V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, 167 171 V3D_L2TCACTL_L2TFLS | 168 172 V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM)); 173 + mutex_unlock(&v3d->cache_clean_lock); 174 + } 175 + 176 + /* Cleans texture L1 and L2 cachelines (writing back dirty data). 177 + * 178 + * For cleaning, which happens from the CACHE_CLEAN queue after CSD has 179 + * executed, we need to make sure that the clean is done before 180 + * signaling job completion. So, we synchronously wait before 181 + * returning, and we make sure that L2 invalidates don't happen in the 182 + * meantime to confuse our are-we-done checks. 183 + */ 184 + void 185 + v3d_clean_caches(struct v3d_dev *v3d) 186 + { 187 + struct drm_device *dev = &v3d->drm; 188 + int core = 0; 189 + 190 + trace_v3d_cache_clean_begin(dev); 191 + 192 + V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_TMUWCF); 193 + if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) & 194 + V3D_L2TCACTL_L2TFLS), 100)) { 195 + DRM_ERROR("Timeout waiting for L1T write combiner flush\n"); 196 + } 197 + 198 + mutex_lock(&v3d->cache_clean_lock); 199 + V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, 200 + V3D_L2TCACTL_L2TFLS | 201 + V3D_SET_FIELD(V3D_L2TCACTL_FLM_CLEAN, V3D_L2TCACTL_FLM)); 202 + 203 + if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) & 204 + V3D_L2TCACTL_L2TFLS), 100)) { 205 + DRM_ERROR("Timeout waiting for L2T clean\n"); 206 + } 207 + 208 + mutex_unlock(&v3d->cache_clean_lock); 209 + 210 + trace_v3d_cache_clean_end(dev); 169 211 } 170 212 171 213 /* Invalidates the slice caches. 
These are read-only caches. */ ··· 471 429 v3d_attach_fences_and_unlock_reservation(struct drm_file *file_priv, 472 430 struct v3d_job *job, 473 431 struct ww_acquire_ctx *acquire_ctx, 474 - u32 out_sync) 432 + u32 out_sync, 433 + struct dma_fence *done_fence) 475 434 { 476 435 struct drm_syncobj *sync_out; 477 436 int i; ··· 488 445 /* Update the return sync object for the job */ 489 446 sync_out = drm_syncobj_find(file_priv, out_sync); 490 447 if (sync_out) { 491 - drm_syncobj_replace_fence(sync_out, job->done_fence); 448 + drm_syncobj_replace_fence(sync_out, done_fence); 492 449 drm_syncobj_put(sync_out); 493 450 } 494 451 } ··· 584 541 mutex_unlock(&v3d->sched_lock); 585 542 586 543 v3d_attach_fences_and_unlock_reservation(file_priv, 587 - &render->base, &acquire_ctx, 588 - args->out_sync); 544 + &render->base, 545 + &acquire_ctx, 546 + args->out_sync, 547 + render->base.done_fence); 589 548 590 549 if (bin) 591 550 v3d_job_put(&bin->base); ··· 686 641 687 642 v3d_attach_fences_and_unlock_reservation(file_priv, 688 643 &job->base, &acquire_ctx, 689 - args->out_sync); 644 + args->out_sync, 645 + job->base.done_fence); 690 646 691 647 v3d_job_put(&job->base); 692 648 ··· 699 653 &acquire_ctx); 700 654 fail: 701 655 v3d_job_put(&job->base); 656 + 657 + return ret; 658 + } 659 + 660 + /** 661 + * v3d_submit_csd_ioctl() - Submits a CSD (compute shader dispatch) job to the V3D. 662 + * @dev: DRM device 663 + * @data: ioctl argument 664 + * @file_priv: DRM file for this fd 665 + * 666 + * Userspace provides the register setup for the CSD, which we don't 667 + * need to validate since the CSD is behind the MMU.
668 + */ 669 + int 670 + v3d_submit_csd_ioctl(struct drm_device *dev, void *data, 671 + struct drm_file *file_priv) 672 + { 673 + struct v3d_dev *v3d = to_v3d_dev(dev); 674 + struct v3d_file_priv *v3d_priv = file_priv->driver_priv; 675 + struct drm_v3d_submit_csd *args = data; 676 + struct v3d_csd_job *job; 677 + struct v3d_job *clean_job; 678 + struct ww_acquire_ctx acquire_ctx; 679 + int ret; 680 + 681 + trace_v3d_submit_csd_ioctl(&v3d->drm, args->cfg[5], args->cfg[6]); 682 + 683 + if (!v3d_has_csd(v3d)) { 684 + DRM_DEBUG("Attempting CSD submit on non-CSD hardware\n"); 685 + return -EINVAL; 686 + } 687 + 688 + job = kcalloc(1, sizeof(*job), GFP_KERNEL); 689 + if (!job) 690 + return -ENOMEM; 691 + 692 + ret = v3d_job_init(v3d, file_priv, &job->base, 693 + v3d_job_free, args->in_sync); 694 + if (ret) { 695 + kfree(job); 696 + return ret; 697 + } 698 + 699 + clean_job = kcalloc(1, sizeof(*clean_job), GFP_KERNEL); 700 + if (!clean_job) { 701 + v3d_job_put(&job->base); 702 + kfree(job); 703 + return -ENOMEM; 704 + } 705 + 706 + ret = v3d_job_init(v3d, file_priv, clean_job, v3d_job_free, 0); 707 + if (ret) { 708 + v3d_job_put(&job->base); 709 + kfree(clean_job); 710 + return ret; 711 + } 712 + 713 + job->args = *args; 714 + 715 + ret = v3d_lookup_bos(dev, file_priv, clean_job, 716 + args->bo_handles, args->bo_handle_count); 717 + if (ret) 718 + goto fail; 719 + 720 + ret = v3d_lock_bo_reservations(clean_job->base.bo, 721 + clean_job->base.bo_count, 722 + &acquire_ctx); 723 + if (ret) 724 + goto fail; 725 + 726 + mutex_lock(&v3d->sched_lock); 727 + ret = v3d_push_job(v3d_priv, &job->base, V3D_CSD); 728 + if (ret) 729 + goto fail_unreserve; 730 + 731 + clean_job->in_fence = dma_fence_get(job->base.done_fence); 732 + ret = v3d_push_job(v3d_priv, clean_job, V3D_CACHE_CLEAN); 733 + if (ret) 734 + goto fail_unreserve; 735 + mutex_unlock(&v3d->sched_lock); 736 + 737 + v3d_attach_fences_and_unlock_reservation(file_priv, 738 + clean_job, 739 + &acquire_ctx, 740 + 
args->out_sync, 741 + clean_job->done_fence); 742 + 743 + v3d_job_put(&job->base); 744 + v3d_job_put(clean_job); 745 + 746 + return 0; 747 + 748 + fail_unreserve: 749 + mutex_unlock(&v3d->sched_lock); 750 + drm_gem_unlock_reservations(clean_job->bo, clean_job->bo_count, 751 + &acquire_ctx); 752 + fail: 753 + v3d_job_put(&job->base); 754 + v3d_job_put(clean_job); 702 755 703 756 return ret; 704 757 } ··· 817 672 mutex_init(&v3d->bo_lock); 818 673 mutex_init(&v3d->reset_lock); 819 674 mutex_init(&v3d->sched_lock); 675 + mutex_init(&v3d->cache_clean_lock); 820 676 821 677 /* Note: We don't allocate address 0. Various bits of HW 822 678 * treat 0 as special, such as the occlusion query counters
+13 -3
drivers/gpu/drm/v3d/v3d_irq.c
··· 4 4 /** 5 5 * DOC: Interrupt management for the V3D engine 6 6 * 7 - * When we take a bin, render, or TFU done interrupt, we need to 8 - * signal the fence for that job so that the scheduler can queue up 9 - * the next one and unblock any waiters. 7 + * When we take a bin, render, TFU done, or CSD done interrupt, we 8 + * need to signal the fence for that job so that the scheduler can 9 + * queue up the next one and unblock any waiters. 10 10 * 11 11 * When we take the binner out of memory interrupt, we need to 12 12 * allocate some new memory and pass it to the binner so that the ··· 20 20 #define V3D_CORE_IRQS ((u32)(V3D_INT_OUTOMEM | \ 21 21 V3D_INT_FLDONE | \ 22 22 V3D_INT_FRDONE | \ 23 + V3D_INT_CSDDONE | \ 23 24 V3D_INT_GMPV)) 24 25 25 26 #define V3D_HUB_IRQS ((u32)(V3D_HUB_INT_MMU_WRV | \ ··· 109 108 to_v3d_fence(v3d->render_job->base.irq_fence); 110 109 111 110 trace_v3d_rcl_irq(&v3d->drm, fence->seqno); 111 + dma_fence_signal(&fence->base); 112 + status = IRQ_HANDLED; 113 + } 114 + 115 + if (intsts & V3D_INT_CSDDONE) { 116 + struct v3d_fence *fence = 117 + to_v3d_fence(v3d->csd_job->base.irq_fence); 118 + 119 + trace_v3d_csd_irq(&v3d->drm, fence->seqno); 112 120 dma_fence_signal(&fence->base); 113 121 status = IRQ_HANDLED; 114 122 }
+73
drivers/gpu/drm/v3d/v3d_regs.h
··· 238 238 #define V3D_CTL_L2TCACTL 0x00030 239 239 # define V3D_L2TCACTL_TMUWCF BIT(8) 240 240 # define V3D_L2TCACTL_L2T_NO_WM BIT(4) 241 + /* Invalidates cache lines. */ 241 242 # define V3D_L2TCACTL_FLM_FLUSH 0 243 + /* Removes cachelines without writing dirty lines back. */ 242 244 # define V3D_L2TCACTL_FLM_CLEAR 1 245 + /* Writes out dirty cachelines and marks them clean, but doesn't invalidate. */ 243 246 # define V3D_L2TCACTL_FLM_CLEAN 2 244 247 # define V3D_L2TCACTL_FLM_MASK V3D_MASK(2, 1) 245 248 # define V3D_L2TCACTL_FLM_SHIFT 1 ··· 258 255 #define V3D_CTL_INT_MSK_CLR 0x00064 259 256 # define V3D_INT_QPU_MASK V3D_MASK(27, 16) 260 257 # define V3D_INT_QPU_SHIFT 16 258 + # define V3D_INT_CSDDONE BIT(7) 259 + # define V3D_INT_PCTR BIT(6) 261 260 # define V3D_INT_GMPV BIT(5) 262 261 # define V3D_INT_TRFB BIT(4) 263 262 # define V3D_INT_SPILLUSE BIT(3) ··· 378 373 #define V3D_GMP_CLEAR_LOAD 0x00814 379 374 #define V3D_GMP_PRESERVE_LOAD 0x00818 380 375 #define V3D_GMP_VALID_LINES 0x00820 376 + 377 + #define V3D_CSD_STATUS 0x00900 378 + # define V3D_CSD_STATUS_NUM_COMPLETED_MASK V3D_MASK(11, 4) 379 + # define V3D_CSD_STATUS_NUM_COMPLETED_SHIFT 4 380 + # define V3D_CSD_STATUS_NUM_ACTIVE_MASK V3D_MASK(3, 2) 381 + # define V3D_CSD_STATUS_NUM_ACTIVE_SHIFT 2 382 + # define V3D_CSD_STATUS_HAVE_CURRENT_DISPATCH BIT(1) 383 + # define V3D_CSD_STATUS_HAVE_QUEUED_DISPATCH BIT(0) 384 + 385 + #define V3D_CSD_QUEUED_CFG0 0x00904 386 + # define V3D_CSD_QUEUED_CFG0_NUM_WGS_X_MASK V3D_MASK(31, 16) 387 + # define V3D_CSD_QUEUED_CFG0_NUM_WGS_X_SHIFT 16 388 + # define V3D_CSD_QUEUED_CFG0_WG_X_OFFSET_MASK V3D_MASK(15, 0) 389 + # define V3D_CSD_QUEUED_CFG0_WG_X_OFFSET_SHIFT 0 390 + 391 + #define V3D_CSD_QUEUED_CFG1 0x00908 392 + # define V3D_CSD_QUEUED_CFG1_NUM_WGS_Y_MASK V3D_MASK(31, 16) 393 + # define V3D_CSD_QUEUED_CFG1_NUM_WGS_Y_SHIFT 16 394 + # define V3D_CSD_QUEUED_CFG1_WG_Y_OFFSET_MASK V3D_MASK(15, 0) 395 + # define V3D_CSD_QUEUED_CFG1_WG_Y_OFFSET_SHIFT 0 396 + 397 + #define 
V3D_CSD_QUEUED_CFG2 0x0090c 398 + # define V3D_CSD_QUEUED_CFG2_NUM_WGS_Z_MASK V3D_MASK(31, 16) 399 + # define V3D_CSD_QUEUED_CFG2_NUM_WGS_Z_SHIFT 16 400 + # define V3D_CSD_QUEUED_CFG2_WG_Z_OFFSET_MASK V3D_MASK(15, 0) 401 + # define V3D_CSD_QUEUED_CFG2_WG_Z_OFFSET_SHIFT 0 402 + 403 + #define V3D_CSD_QUEUED_CFG3 0x00910 404 + # define V3D_CSD_QUEUED_CFG3_OVERLAP_WITH_PREV BIT(26) 405 + # define V3D_CSD_QUEUED_CFG3_MAX_SG_ID_MASK V3D_MASK(25, 20) 406 + # define V3D_CSD_QUEUED_CFG3_MAX_SG_ID_SHIFT 20 407 + # define V3D_CSD_QUEUED_CFG3_BATCHES_PER_SG_M1_MASK V3D_MASK(19, 12) 408 + # define V3D_CSD_QUEUED_CFG3_BATCHES_PER_SG_M1_SHIFT 12 409 + # define V3D_CSD_QUEUED_CFG3_WGS_PER_SG_MASK V3D_MASK(11, 8) 410 + # define V3D_CSD_QUEUED_CFG3_WGS_PER_SG_SHIFT 8 411 + # define V3D_CSD_QUEUED_CFG3_WG_SIZE_MASK V3D_MASK(7, 0) 412 + # define V3D_CSD_QUEUED_CFG3_WG_SIZE_SHIFT 0 413 + 414 + /* Number of batches, minus 1 */ 415 + #define V3D_CSD_QUEUED_CFG4 0x00914 416 + 417 + /* Shader address, pnan, singleseg, threading, like a shader record. 
*/ 418 + #define V3D_CSD_QUEUED_CFG5 0x00918 419 + 420 + /* Uniforms address (4 byte aligned) */ 421 + #define V3D_CSD_QUEUED_CFG6 0x0091c 422 + 423 + #define V3D_CSD_CURRENT_CFG0 0x00920 424 + #define V3D_CSD_CURRENT_CFG1 0x00924 425 + #define V3D_CSD_CURRENT_CFG2 0x00928 426 + #define V3D_CSD_CURRENT_CFG3 0x0092c 427 + #define V3D_CSD_CURRENT_CFG4 0x00930 428 + #define V3D_CSD_CURRENT_CFG5 0x00934 429 + #define V3D_CSD_CURRENT_CFG6 0x00938 430 + 431 + #define V3D_CSD_CURRENT_ID0 0x0093c 432 + # define V3D_CSD_CURRENT_ID0_WG_X_MASK V3D_MASK(31, 16) 433 + # define V3D_CSD_CURRENT_ID0_WG_X_SHIFT 16 434 + # define V3D_CSD_CURRENT_ID0_WG_IN_SG_MASK V3D_MASK(11, 8) 435 + # define V3D_CSD_CURRENT_ID0_WG_IN_SG_SHIFT 8 436 + # define V3D_CSD_CURRENT_ID0_L_IDX_MASK V3D_MASK(7, 0) 437 + # define V3D_CSD_CURRENT_ID0_L_IDX_SHIFT 0 438 + 439 + #define V3D_CSD_CURRENT_ID1 0x00940 440 + # define V3D_CSD_CURRENT_ID0_WG_Z_MASK V3D_MASK(31, 16) 441 + # define V3D_CSD_CURRENT_ID0_WG_Z_SHIFT 16 442 + # define V3D_CSD_CURRENT_ID0_WG_Y_MASK V3D_MASK(15, 0) 443 + # define V3D_CSD_CURRENT_ID0_WG_Y_SHIFT 0 381 444 382 445 #endif /* V3D_REGS_H */
+114 -7
drivers/gpu/drm/v3d/v3d_sched.c
··· 48 48 return container_of(sched_job, struct v3d_tfu_job, base.base); 49 49 } 50 50 51 + static struct v3d_csd_job * 52 + to_csd_job(struct drm_sched_job *sched_job) 53 + { 54 + return container_of(sched_job, struct v3d_csd_job, base.base); 55 + } 56 + 51 57 static void 52 58 v3d_job_free(struct drm_sched_job *sched_job) 53 59 { ··· 249 243 return fence; 250 244 } 251 245 246 + static struct dma_fence * 247 + v3d_csd_job_run(struct drm_sched_job *sched_job) 248 + { 249 + struct v3d_csd_job *job = to_csd_job(sched_job); 250 + struct v3d_dev *v3d = job->base.v3d; 251 + struct drm_device *dev = &v3d->drm; 252 + struct dma_fence *fence; 253 + int i; 254 + 255 + v3d->csd_job = job; 256 + 257 + v3d_invalidate_caches(v3d); 258 + 259 + fence = v3d_fence_create(v3d, V3D_CSD); 260 + if (IS_ERR(fence)) 261 + return NULL; 262 + 263 + if (job->base.irq_fence) 264 + dma_fence_put(job->base.irq_fence); 265 + job->base.irq_fence = dma_fence_get(fence); 266 + 267 + trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno); 268 + 269 + for (i = 1; i <= 6; i++) 270 + V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]); 271 + /* CFG0 write kicks off the job. 
*/ 272 + V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0, job->args.cfg[0]); 273 + 274 + return fence; 275 + } 276 + 277 + static struct dma_fence * 278 + v3d_cache_clean_job_run(struct drm_sched_job *sched_job) 279 + { 280 + struct v3d_job *job = to_v3d_job(sched_job); 281 + struct v3d_dev *v3d = job->v3d; 282 + 283 + v3d_clean_caches(v3d); 284 + 285 + return NULL; 286 + } 287 + 252 288 static void 253 289 v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job) 254 290 { ··· 361 313 } 362 314 363 315 static void 364 - v3d_tfu_job_timedout(struct drm_sched_job *sched_job) 316 + v3d_generic_job_timedout(struct drm_sched_job *sched_job) 365 317 { 366 318 struct v3d_job *job = to_v3d_job(sched_job); 367 319 368 320 v3d_gpu_reset_for_timeout(job->v3d, sched_job); 321 + } 322 + 323 + static void 324 + v3d_csd_job_timedout(struct drm_sched_job *sched_job) 325 + { 326 + struct v3d_csd_job *job = to_csd_job(sched_job); 327 + struct v3d_dev *v3d = job->base.v3d; 328 + u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4); 329 + 330 + /* If we've made progress, skip reset and let the timer get 331 + * rearmed. 
332 + */ 333 + if (job->timedout_batches != batches) { 334 + job->timedout_batches = batches; 335 + return; 336 + } 337 + 338 + v3d_gpu_reset_for_timeout(v3d, sched_job); 369 339 } 370 340 371 341 static const struct drm_sched_backend_ops v3d_bin_sched_ops = { ··· 403 337 static const struct drm_sched_backend_ops v3d_tfu_sched_ops = { 404 338 .dependency = v3d_job_dependency, 405 339 .run_job = v3d_tfu_job_run, 406 - .timedout_job = v3d_tfu_job_timedout, 340 + .timedout_job = v3d_generic_job_timedout, 407 341 .free_job = v3d_job_free, 342 + }; 343 + 344 + static const struct drm_sched_backend_ops v3d_csd_sched_ops = { 345 + .dependency = v3d_job_dependency, 346 + .run_job = v3d_csd_job_run, 347 + .timedout_job = v3d_csd_job_timedout, 348 + .free_job = v3d_job_free 349 + }; 350 + 351 + static const struct drm_sched_backend_ops v3d_cache_clean_sched_ops = { 352 + .dependency = v3d_job_dependency, 353 + .run_job = v3d_cache_clean_job_run, 354 + .timedout_job = v3d_generic_job_timedout, 355 + .free_job = v3d_job_free 408 356 }; 409 357 410 358 int ··· 447 367 if (ret) { 448 368 dev_err(v3d->dev, "Failed to create render scheduler: %d.", 449 369 ret); 450 - drm_sched_fini(&v3d->queue[V3D_BIN].sched); 370 + v3d_sched_fini(v3d); 451 371 return ret; 452 372 } 453 373 ··· 459 379 if (ret) { 460 380 dev_err(v3d->dev, "Failed to create TFU scheduler: %d.", 461 381 ret); 462 - drm_sched_fini(&v3d->queue[V3D_RENDER].sched); 463 - drm_sched_fini(&v3d->queue[V3D_BIN].sched); 382 + v3d_sched_fini(v3d); 464 383 return ret; 384 + } 385 + 386 + if (v3d_has_csd(v3d)) { 387 + ret = drm_sched_init(&v3d->queue[V3D_CSD].sched, 388 + &v3d_csd_sched_ops, 389 + hw_jobs_limit, job_hang_limit, 390 + msecs_to_jiffies(hang_limit_ms), 391 + "v3d_csd"); 392 + if (ret) { 393 + dev_err(v3d->dev, "Failed to create CSD scheduler: %d.", 394 + ret); 395 + v3d_sched_fini(v3d); 396 + return ret; 397 + } 398 + 399 + ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched, 400 + &v3d_cache_clean_sched_ops, 
401 + hw_jobs_limit, job_hang_limit, 402 + msecs_to_jiffies(hang_limit_ms), 403 + "v3d_cache_clean"); 404 + if (ret) { 405 + dev_err(v3d->dev, "Failed to create CACHE_CLEAN scheduler: %d.", 406 + ret); 407 + v3d_sched_fini(v3d); 408 + return ret; 409 + } 465 410 } 466 411 467 412 return 0; ··· 497 392 { 498 393 enum v3d_queue q; 499 394 500 - for (q = 0; q < V3D_MAX_QUEUES; q++) 501 - drm_sched_fini(&v3d->queue[q].sched); 395 + for (q = 0; q < V3D_MAX_QUEUES; q++) { 396 + if (v3d->queue[q].sched.ready) 397 + drm_sched_fini(&v3d->queue[q].sched); 398 + } 502 399 }
+94
drivers/gpu/drm/v3d/v3d_trace.h
··· 124 124 __entry->seqno) 125 125 ); 126 126 127 + TRACE_EVENT(v3d_csd_irq, 128 + TP_PROTO(struct drm_device *dev, 129 + uint64_t seqno), 130 + TP_ARGS(dev, seqno), 131 + 132 + TP_STRUCT__entry( 133 + __field(u32, dev) 134 + __field(u64, seqno) 135 + ), 136 + 137 + TP_fast_assign( 138 + __entry->dev = dev->primary->index; 139 + __entry->seqno = seqno; 140 + ), 141 + 142 + TP_printk("dev=%u, seqno=%llu", 143 + __entry->dev, 144 + __entry->seqno) 145 + ); 146 + 127 147 TRACE_EVENT(v3d_submit_tfu_ioctl, 128 148 TP_PROTO(struct drm_device *dev, u32 iia), 129 149 TP_ARGS(dev, iia), ··· 181 161 TP_printk("dev=%u, seqno=%llu", 182 162 __entry->dev, 183 163 __entry->seqno) 164 + ); 165 + 166 + TRACE_EVENT(v3d_submit_csd_ioctl, 167 + TP_PROTO(struct drm_device *dev, u32 cfg5, u32 cfg6), 168 + TP_ARGS(dev, cfg5, cfg6), 169 + 170 + TP_STRUCT__entry( 171 + __field(u32, dev) 172 + __field(u32, cfg5) 173 + __field(u32, cfg6) 174 + ), 175 + 176 + TP_fast_assign( 177 + __entry->dev = dev->primary->index; 178 + __entry->cfg5 = cfg5; 179 + __entry->cfg6 = cfg6; 180 + ), 181 + 182 + TP_printk("dev=%u, CFG5 0x%08x, CFG6 0x%08x", 183 + __entry->dev, 184 + __entry->cfg5, 185 + __entry->cfg6) 186 + ); 187 + 188 + TRACE_EVENT(v3d_submit_csd, 189 + TP_PROTO(struct drm_device *dev, 190 + uint64_t seqno), 191 + TP_ARGS(dev, seqno), 192 + 193 + TP_STRUCT__entry( 194 + __field(u32, dev) 195 + __field(u64, seqno) 196 + ), 197 + 198 + TP_fast_assign( 199 + __entry->dev = dev->primary->index; 200 + __entry->seqno = seqno; 201 + ), 202 + 203 + TP_printk("dev=%u, seqno=%llu", 204 + __entry->dev, 205 + __entry->seqno) 206 + ); 207 + 208 + TRACE_EVENT(v3d_cache_clean_begin, 209 + TP_PROTO(struct drm_device *dev), 210 + TP_ARGS(dev), 211 + 212 + TP_STRUCT__entry( 213 + __field(u32, dev) 214 + ), 215 + 216 + TP_fast_assign( 217 + __entry->dev = dev->primary->index; 218 + ), 219 + 220 + TP_printk("dev=%u", 221 + __entry->dev) 222 + ); 223 + 224 + TRACE_EVENT(v3d_cache_clean_end, 225 + TP_PROTO(struct 
drm_device *dev), 226 + TP_ARGS(dev), 227 + 228 + TP_STRUCT__entry( 229 + __field(u32, dev) 230 + ), 231 + 232 + TP_fast_assign( 233 + __entry->dev = dev->primary->index; 234 + ), 235 + 236 + TP_printk("dev=%u", 237 + __entry->dev) 184 238 ); 185 239 186 240 TRACE_EVENT(v3d_reset_begin,
+28
include/uapi/drm/v3d_drm.h
··· 37 37 #define DRM_V3D_GET_PARAM 0x04 38 38 #define DRM_V3D_GET_BO_OFFSET 0x05 39 39 #define DRM_V3D_SUBMIT_TFU 0x06 40 + #define DRM_V3D_SUBMIT_CSD 0x07 40 41 41 42 #define DRM_IOCTL_V3D_SUBMIT_CL DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl) 42 43 #define DRM_IOCTL_V3D_WAIT_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo) ··· 46 45 #define DRM_IOCTL_V3D_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param) 47 46 #define DRM_IOCTL_V3D_GET_BO_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset) 48 47 #define DRM_IOCTL_V3D_SUBMIT_TFU DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu) 48 + #define DRM_IOCTL_V3D_SUBMIT_CSD DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CSD, struct drm_v3d_submit_csd) 49 49 50 50 /** 51 51 * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D ··· 192 190 DRM_V3D_PARAM_V3D_CORE0_IDENT1, 193 191 DRM_V3D_PARAM_V3D_CORE0_IDENT2, 194 192 DRM_V3D_PARAM_SUPPORTS_TFU, 193 + DRM_V3D_PARAM_SUPPORTS_CSD, 195 194 }; 196 195 197 196 struct drm_v3d_get_param { ··· 230 227 */ 231 228 __u32 in_sync; 232 229 /* Sync object to signal when the TFU job is done. */ 230 + __u32 out_sync; 231 + }; 232 + 233 + /* Submits a compute shader for dispatch. This job will block on any 234 + * previous compute shaders submitted on this fd, and any other 235 + * synchronization must be performed with in_sync/out_sync. 236 + */ 237 + struct drm_v3d_submit_csd { 238 + __u32 cfg[7]; 239 + __u32 coef[4]; 240 + 241 + /* Pointer to a u32 array of the BOs that are referenced by the job. 242 + */ 243 + __u64 bo_handles; 244 + 245 + /* Number of BO handles passed in (size is that times 4). */ 246 + __u32 bo_handle_count; 247 + 248 + /* sync object to block on before running the CSD job. Each 249 + * CSD job will execute in the order submitted to its FD. 
250 + * Synchronization against rendering/TFU jobs or CSD from 251 + * other fds requires using sync objects. 252 + */ 253 + __u32 in_sync; 254 + /* Sync object to signal when the CSD job is done. */ 233 255 __u32 out_sync; 234 256 }; 235 257