Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/vmwgfx: Restart command buffers after errors

Previously we skipped the command buffer and added an extra fence to
avoid hangs due to skipped fence commands.
Now we instead restart the command buffer after the failing command,
if there are any commands left.
In addition, we print out some information about the failing command
and its location in the command buffer.

Testing Done: ran glxgears using mesa modified to send the NOP_ERROR
command before each 10th clear and verified that we detected the device
error properly and that there were no other device errors caused by
incorrectly ordered command buffers. Also ran the piglit "quick" test
suite which generates a couple of device errors and verified that
they were handled as intended.

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>

authored by

Thomas Hellstrom and committed by
Sinclair Yeh
65b97a2b ef369904

+206 -27
+158 -25
drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c
··· 51 51 struct list_head hw_submitted; 52 52 struct list_head preempted; 53 53 unsigned num_hw_submitted; 54 + bool block_submission; 54 55 }; 55 56 56 57 /** ··· 61 60 * kernel command submissions, @cur. 62 61 * @space_mutex: Mutex to protect against starvation when we allocate 63 62 * main pool buffer space. 63 + * @error_mutex: Mutex to serialize the work queue error handling. 64 + * Note this is not needed if the same workqueue handler 65 + * can't race with itself... 64 66 * @work: A struct work_struct implementeing command buffer error handling. 65 67 * Immutable. 66 68 * @dev_priv: Pointer to the device private struct. Immutable. ··· 105 101 struct vmw_cmdbuf_man { 106 102 struct mutex cur_mutex; 107 103 struct mutex space_mutex; 104 + struct mutex error_mutex; 108 105 struct work_struct work; 109 106 struct vmw_private *dev_priv; 110 107 struct vmw_cmdbuf_context ctx[SVGA_CB_CONTEXT_MAX]; ··· 184 179 }; 185 180 186 181 /* Loop over each context in the command buffer manager. */ 187 - #define for_each_cmdbuf_ctx(_man, _i, _ctx) \ 182 + #define for_each_cmdbuf_ctx(_man, _i, _ctx) \ 188 183 for (_i = 0, _ctx = &(_man)->ctx[0]; (_i) < SVGA_CB_CONTEXT_MAX; \ 189 184 ++(_i), ++(_ctx)) 190 185 191 - static int vmw_cmdbuf_startstop(struct vmw_cmdbuf_man *man, bool enable); 192 - 186 + static int vmw_cmdbuf_startstop(struct vmw_cmdbuf_man *man, u32 context, 187 + bool enable); 188 + static int vmw_cmdbuf_preempt(struct vmw_cmdbuf_man *man, u32 context); 193 189 194 190 /** 195 191 * vmw_cmdbuf_cur_lock - Helper to lock the cur_mutex. 
··· 335 329 struct vmw_cmdbuf_context *ctx) 336 330 { 337 331 while (ctx->num_hw_submitted < man->max_hw_submitted && 338 - !list_empty(&ctx->submitted)) { 332 + !list_empty(&ctx->submitted) && 333 + !ctx->block_submission) { 339 334 struct vmw_cmdbuf_header *entry; 340 335 SVGACBStatus status; 341 336 ··· 391 384 __vmw_cmdbuf_header_free(entry); 392 385 break; 393 386 case SVGA_CB_STATUS_COMMAND_ERROR: 394 - case SVGA_CB_STATUS_CB_HEADER_ERROR: 387 + entry->cb_header->status = SVGA_CB_STATUS_NONE; 395 388 list_add_tail(&entry->list, &man->error); 396 389 schedule_work(&man->work); 397 390 break; 398 391 case SVGA_CB_STATUS_PREEMPTED: 399 - list_add(&entry->list, &ctx->preempted); 392 + entry->cb_header->status = SVGA_CB_STATUS_NONE; 393 + list_add_tail(&entry->list, &ctx->preempted); 394 + break; 395 + case SVGA_CB_STATUS_CB_HEADER_ERROR: 396 + WARN_ONCE(true, "Command buffer header error.\n"); 397 + __vmw_cmdbuf_header_free(entry); 400 398 break; 401 399 default: 402 400 WARN_ONCE(true, "Undefined command buffer status.\n"); ··· 509 497 container_of(work, struct vmw_cmdbuf_man, work); 510 498 struct vmw_cmdbuf_header *entry, *next; 511 499 uint32_t dummy; 512 - bool restart = false; 500 + bool restart[SVGA_CB_CONTEXT_MAX]; 501 + bool send_fence = false; 502 + struct list_head restart_head[SVGA_CB_CONTEXT_MAX]; 503 + int i; 504 + struct vmw_cmdbuf_context *ctx; 513 505 506 + for_each_cmdbuf_ctx(man, i, ctx) { 507 + INIT_LIST_HEAD(&restart_head[i]); 508 + restart[i] = false; 509 + } 510 + 511 + mutex_lock(&man->error_mutex); 514 512 spin_lock(&man->lock); 515 513 list_for_each_entry_safe(entry, next, &man->error, list) { 516 - restart = true; 517 - DRM_ERROR("Command buffer error.\n"); 514 + SVGACBHeader *cb_hdr = entry->cb_header; 515 + SVGA3dCmdHeader *header = (SVGA3dCmdHeader *) 516 + (entry->cmd + cb_hdr->errorOffset); 517 + u32 error_cmd_size, new_start_offset; 518 + const char *cmd_name; 518 519 519 - list_del(&entry->list); 520 - 
__vmw_cmdbuf_header_free(entry); 521 - wake_up_all(&man->idle_queue); 520 + list_del_init(&entry->list); 521 + restart[entry->cb_context] = true; 522 + 523 + if (!vmw_cmd_describe(header, &error_cmd_size, &cmd_name)) { 524 + DRM_ERROR("Unknown command causing device error.\n"); 525 + DRM_ERROR("Command buffer offset is %lu\n", 526 + (unsigned long) cb_hdr->errorOffset); 527 + __vmw_cmdbuf_header_free(entry); 528 + send_fence = true; 529 + continue; 530 + } 531 + 532 + DRM_ERROR("Command \"%s\" causing device error.\n", cmd_name); 533 + DRM_ERROR("Command buffer offset is %lu\n", 534 + (unsigned long) cb_hdr->errorOffset); 535 + DRM_ERROR("Command size is %lu\n", 536 + (unsigned long) error_cmd_size); 537 + 538 + new_start_offset = cb_hdr->errorOffset + error_cmd_size; 539 + 540 + if (new_start_offset >= cb_hdr->length) { 541 + __vmw_cmdbuf_header_free(entry); 542 + send_fence = true; 543 + continue; 544 + } 545 + 546 + if (man->using_mob) 547 + cb_hdr->ptr.mob.mobOffset += new_start_offset; 548 + else 549 + cb_hdr->ptr.pa += (u64) new_start_offset; 550 + 551 + entry->cmd += new_start_offset; 552 + cb_hdr->length -= new_start_offset; 553 + cb_hdr->errorOffset = 0; 554 + list_add_tail(&entry->list, &restart_head[entry->cb_context]); 555 + man->ctx[entry->cb_context].block_submission = true; 522 556 } 523 557 spin_unlock(&man->lock); 524 558 525 - if (restart && vmw_cmdbuf_startstop(man, true)) 526 - DRM_ERROR("Failed restarting command buffer context 0.\n"); 559 + /* Preempt all contexts with errors */ 560 + for_each_cmdbuf_ctx(man, i, ctx) { 561 + if (ctx->block_submission && vmw_cmdbuf_preempt(man, i)) 562 + DRM_ERROR("Failed preempting command buffer " 563 + "context %u.\n", i); 564 + } 565 + 566 + spin_lock(&man->lock); 567 + for_each_cmdbuf_ctx(man, i, ctx) { 568 + if (!ctx->block_submission) 569 + continue; 570 + 571 + /* Move preempted command buffers to the preempted queue. 
*/ 572 + vmw_cmdbuf_ctx_process(man, ctx, &dummy); 573 + 574 + /* 575 + * Add the preempted queue after the command buffer 576 + * that caused an error. 577 + */ 578 + list_splice_init(&ctx->preempted, restart_head[i].prev); 579 + 580 + /* 581 + * Finally add all command buffers first in the submitted 582 + * queue, to rerun them. 583 + */ 584 + list_splice_init(&restart_head[i], &ctx->submitted); 585 + 586 + ctx->block_submission = false; 587 + } 588 + 589 + vmw_cmdbuf_man_process(man); 590 + spin_unlock(&man->lock); 591 + 592 + for_each_cmdbuf_ctx(man, i, ctx) { 593 + if (restart[i] && vmw_cmdbuf_startstop(man, i, true)) 594 + DRM_ERROR("Failed restarting command buffer " 595 + "context %u.\n", i); 596 + } 527 597 528 598 /* Send a new fence in case one was removed */ 529 - vmw_fifo_send_fence(man->dev_priv, &dummy); 599 + if (send_fence) { 600 + vmw_fifo_send_fence(man->dev_priv, &dummy); 601 + wake_up_all(&man->idle_queue); 602 + } 603 + 604 + mutex_unlock(&man->error_mutex); 530 605 } 531 606 532 607 /** ··· 1156 1057 } 1157 1058 1158 1059 /** 1060 + * vmw_cmdbuf_preempt - Send a preempt command through the device 1061 + * context. 1062 + * 1063 + * @man: The command buffer manager. 1064 + * 1065 + * Synchronously sends a preempt command. 1066 + */ 1067 + static int vmw_cmdbuf_preempt(struct vmw_cmdbuf_man *man, u32 context) 1068 + { 1069 + struct { 1070 + uint32 id; 1071 + SVGADCCmdPreempt body; 1072 + } __packed cmd; 1073 + 1074 + cmd.id = SVGA_DC_CMD_PREEMPT; 1075 + cmd.body.context = SVGA_CB_CONTEXT_0 + context; 1076 + cmd.body.ignoreIDZero = 0; 1077 + 1078 + return vmw_cmdbuf_send_device_command(man, &cmd, sizeof(cmd)); 1079 + } 1080 + 1081 + 1082 + /** 1159 1083 * vmw_cmdbuf_startstop - Send a start / stop command through the device 1160 1084 * context. 1161 1085 * ··· 1187 1065 * 1188 1066 * Synchronously sends a device start / stop context command. 
1189 1067 */ 1190 - static int vmw_cmdbuf_startstop(struct vmw_cmdbuf_man *man, 1068 + static int vmw_cmdbuf_startstop(struct vmw_cmdbuf_man *man, u32 context, 1191 1069 bool enable) 1192 1070 { 1193 1071 struct { ··· 1197 1075 1198 1076 cmd.id = SVGA_DC_CMD_START_STOP_CONTEXT; 1199 1077 cmd.body.enable = (enable) ? 1 : 0; 1200 - cmd.body.context = SVGA_CB_CONTEXT_0; 1078 + cmd.body.context = SVGA_CB_CONTEXT_0 + context; 1201 1079 1202 1080 return vmw_cmdbuf_send_device_command(man, &cmd, sizeof(cmd)); 1203 1081 } ··· 1296 1174 { 1297 1175 struct vmw_cmdbuf_man *man; 1298 1176 struct vmw_cmdbuf_context *ctx; 1299 - int i; 1177 + unsigned int i; 1300 1178 int ret; 1301 1179 1302 1180 if (!(dev_priv->capabilities & SVGA_CAP_COMMAND_BUFFERS)) ··· 1331 1209 spin_lock_init(&man->lock); 1332 1210 mutex_init(&man->cur_mutex); 1333 1211 mutex_init(&man->space_mutex); 1212 + mutex_init(&man->error_mutex); 1334 1213 man->default_size = VMW_CMDBUF_INLINE_SIZE; 1335 1214 init_waitqueue_head(&man->alloc_queue); 1336 1215 init_waitqueue_head(&man->idle_queue); ··· 1340 1217 INIT_WORK(&man->work, &vmw_cmdbuf_work_func); 1341 1218 vmw_generic_waiter_add(dev_priv, SVGA_IRQFLAG_ERROR, 1342 1219 &dev_priv->error_waiters); 1343 - ret = vmw_cmdbuf_startstop(man, true); 1344 - if (ret) { 1345 - DRM_ERROR("Failed starting command buffer context 0.\n"); 1346 - vmw_cmdbuf_man_destroy(man); 1347 - return ERR_PTR(ret); 1220 + for_each_cmdbuf_ctx(man, i, ctx) { 1221 + ret = vmw_cmdbuf_startstop(man, i, true); 1222 + if (ret) { 1223 + DRM_ERROR("Failed starting command buffer " 1224 + "context %u.\n", i); 1225 + vmw_cmdbuf_man_destroy(man); 1226 + return ERR_PTR(ret); 1227 + } 1348 1228 } 1349 1229 1350 1230 return man; ··· 1397 1271 */ 1398 1272 void vmw_cmdbuf_man_destroy(struct vmw_cmdbuf_man *man) 1399 1273 { 1274 + struct vmw_cmdbuf_context *ctx; 1275 + unsigned int i; 1276 + 1400 1277 WARN_ON_ONCE(man->has_pool); 1401 1278 (void) vmw_cmdbuf_idle(man, false, 10*HZ); 1402 - if 
(vmw_cmdbuf_startstop(man, false)) 1403 - DRM_ERROR("Failed stopping command buffer context 0.\n"); 1279 + 1280 + for_each_cmdbuf_ctx(man, i, ctx) 1281 + if (vmw_cmdbuf_startstop(man, i, false)) 1282 + DRM_ERROR("Failed stopping command buffer " 1283 + "context %u.\n", i); 1404 1284 1405 1285 vmw_generic_waiter_remove(man->dev_priv, SVGA_IRQFLAG_ERROR, 1406 1286 &man->dev_priv->error_waiters); ··· 1415 1283 dma_pool_destroy(man->headers); 1416 1284 mutex_destroy(&man->cur_mutex); 1417 1285 mutex_destroy(&man->space_mutex); 1286 + mutex_destroy(&man->error_mutex); 1418 1287 kfree(man); 1419 1288 }
+1 -1
drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
··· 846 846 struct ttm_buffer_object *bo, 847 847 bool interruptible, 848 848 bool validate_as_mob); 849 - 849 + bool vmw_cmd_describe(const void *buf, u32 *size, char const **cmd); 850 850 851 851 /** 852 852 * IRQs and wating - vmwgfx_irq.c
+47 -1
drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
··· 112 112 bool user_allow; 113 113 bool gb_disable; 114 114 bool gb_enable; 115 + const char *cmd_name; 115 116 }; 116 117 117 118 #define VMW_CMD_DEF(_cmd, _func, _user_allow, _gb_disable, _gb_enable) \ 118 119 [(_cmd) - SVGA_3D_CMD_BASE] = {(_func), (_user_allow),\ 119 - (_gb_disable), (_gb_enable)} 120 + (_gb_disable), (_gb_enable), #_cmd} 120 121 121 122 static int vmw_resource_context_res_add(struct vmw_private *dev_priv, 122 123 struct vmw_sw_context *sw_context, ··· 3469 3468 &vmw_cmd_dx_transfer_from_buffer, 3470 3469 true, false, true), 3471 3470 }; 3471 + 3472 + bool vmw_cmd_describe(const void *buf, u32 *size, char const **cmd) 3473 + { 3474 + u32 cmd_id = ((u32 *) buf)[0]; 3475 + 3476 + if (cmd_id >= SVGA_CMD_MAX) { 3477 + SVGA3dCmdHeader *header = (SVGA3dCmdHeader *) buf; 3478 + const struct vmw_cmd_entry *entry; 3479 + 3480 + *size = header->size + sizeof(SVGA3dCmdHeader); 3481 + cmd_id = header->id; 3482 + if (cmd_id >= SVGA_3D_CMD_MAX) 3483 + return false; 3484 + 3485 + cmd_id -= SVGA_3D_CMD_BASE; 3486 + entry = &vmw_cmd_entries[cmd_id]; 3487 + *cmd = entry->cmd_name; 3488 + return true; 3489 + } 3490 + 3491 + switch (cmd_id) { 3492 + case SVGA_CMD_UPDATE: 3493 + *cmd = "SVGA_CMD_UPDATE"; 3494 + *size = sizeof(u32) + sizeof(SVGAFifoCmdUpdate); 3495 + break; 3496 + case SVGA_CMD_DEFINE_GMRFB: 3497 + *cmd = "SVGA_CMD_DEFINE_GMRFB"; 3498 + *size = sizeof(u32) + sizeof(SVGAFifoCmdDefineGMRFB); 3499 + break; 3500 + case SVGA_CMD_BLIT_GMRFB_TO_SCREEN: 3501 + *cmd = "SVGA_CMD_BLIT_GMRFB_TO_SCREEN"; 3502 + *size = sizeof(u32) + sizeof(SVGAFifoCmdBlitGMRFBToScreen); 3503 + break; 3504 + case SVGA_CMD_BLIT_SCREEN_TO_GMRFB: 3505 + *cmd = "SVGA_CMD_BLIT_SCREEN_TO_GMRFB"; 3506 + *size = sizeof(u32) + sizeof(SVGAFifoCmdBlitGMRFBToScreen); 3507 + break; 3508 + default: 3509 + *cmd = "UNKNOWN"; 3510 + *size = 0; 3511 + return false; 3512 + } 3513 + 3514 + return true; 3515 + } 3472 3516 3473 3517 static int vmw_cmd_check(struct vmw_private *dev_priv, 3474 3518 
struct vmw_sw_context *sw_context,