Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe/guc: Add capture size check in GuC log buffer

Capture nodes generated by GuC are placed in the GuC capture ring
buffer, which is a sub-region of the larger GuC log buffer.
Add a capture output size check before allocating the shared buffer.

Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
Reviewed-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241004193428.3311145-4-zhanjun.dong@intel.com

authored by

Zhanjun Dong and committed by
Matt Roper
84d15f42 b170d696

+174 -2
+20
drivers/gpu/drm/xe/abi/guc_log_abi.h
/* SPDX-License-Identifier: MIT */
/*
 * Copyright © 2024 Intel Corporation
 */

#ifndef _ABI_GUC_LOG_ABI_H
#define _ABI_GUC_LOG_ABI_H

#include <linux/types.h>

/*
 * GuC logging buffer types.
 *
 * The enumerators are used in declaration order by
 * xe_guc_get_log_buffer_offset() to walk the sections that precede a
 * given type, so their order is significant.
 */
enum guc_log_buffer_type {
	GUC_LOG_BUFFER_CRASH_DUMP,
	GUC_LOG_BUFFER_DEBUG,
	GUC_LOG_BUFFER_CAPTURE,
};

/*
 * Number of buffer types above; serves as the exclusive upper bound when
 * iterating over enum guc_log_buffer_type. Keep in sync with the enum.
 */
#define GUC_LOG_BUFFER_TYPE_MAX		3

#endif
+82 -1
drivers/gpu/drm/xe/xe_guc_capture.c
··· 22 22 #include "xe_gt_mcr.h" 23 23 #include "xe_gt_printk.h" 24 24 #include "xe_guc.h" 25 + #include "xe_guc_ads.h" 25 26 #include "xe_guc_capture.h" 26 27 #include "xe_guc_capture_types.h" 27 28 #include "xe_guc_ct.h" ··· 670 669 return PAGE_ALIGN(total_size); 671 670 } 672 671 672 + static int guc_capture_output_size_est(struct xe_guc *guc) 673 + { 674 + struct xe_gt *gt = guc_to_gt(guc); 675 + struct xe_hw_engine *hwe; 676 + enum xe_hw_engine_id id; 677 + 678 + int capture_size = 0; 679 + size_t tmp = 0; 680 + 681 + if (!guc->capture) 682 + return -ENODEV; 683 + 684 + /* 685 + * If every single engine-instance suffered a failure in quick succession but 686 + * were all unrelated, then a burst of multiple error-capture events would dump 687 + * registers for every one engine instance, one at a time. In this case, GuC 688 + * would even dump the global-registers repeatedly. 689 + * 690 + * For each engine instance, there would be 1 x guc_state_capture_group_t output 691 + * followed by 3 x guc_state_capture_t lists. The latter is how the register 692 + * dumps are split across different register types (where the '3' are global vs class 693 + * vs instance). 
694 + */ 695 + for_each_hw_engine(hwe, gt, id) { 696 + enum guc_capture_list_class_type capture_class; 697 + 698 + capture_class = xe_engine_class_to_guc_capture_class(hwe->class); 699 + capture_size += sizeof(struct guc_state_capture_group_header_t) + 700 + (3 * sizeof(struct guc_state_capture_header_t)); 701 + 702 + if (!guc_capture_getlistsize(guc, 0, GUC_STATE_CAPTURE_TYPE_GLOBAL, 703 + 0, &tmp, true)) 704 + capture_size += tmp; 705 + if (!guc_capture_getlistsize(guc, 0, GUC_STATE_CAPTURE_TYPE_ENGINE_CLASS, 706 + capture_class, &tmp, true)) 707 + capture_size += tmp; 708 + if (!guc_capture_getlistsize(guc, 0, GUC_STATE_CAPTURE_TYPE_ENGINE_INSTANCE, 709 + capture_class, &tmp, true)) 710 + capture_size += tmp; 711 + } 712 + 713 + return capture_size; 714 + } 715 + 716 + /* 717 + * Add on a 3x multiplier to allow for multiple back-to-back captures occurring 718 + * before the Xe can read the data out and process it 719 + */ 720 + #define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3 721 + 722 + static void check_guc_capture_size(struct xe_guc *guc) 723 + { 724 + int capture_size = guc_capture_output_size_est(guc); 725 + int spare_size = capture_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER; 726 + u32 buffer_size = xe_guc_log_section_size_capture(&guc->log); 727 + 728 + /* 729 + * NOTE: capture_size is much smaller than the capture region 730 + * allocation (DG2: <80K vs 1MB). 731 + * Additionally, its based on space needed to fit all engines getting 732 + * reset at once within the same G2H handler task slot. This is very 733 + * unlikely. However, if GuC really does run out of space for whatever 734 + * reason, we will see an separate warning message when processing the 735 + * G2H event capture-notification, search for: 736 + * xe_guc_STATE_CAPTURE_EVENT_STATUS_NOSPACE. 
737 + */ 738 + if (capture_size < 0) 739 + xe_gt_dbg(guc_to_gt(guc), 740 + "Failed to calculate error state capture buffer minimum size: %d!\n", 741 + capture_size); 742 + if (capture_size > buffer_size) 743 + xe_gt_dbg(guc_to_gt(guc), "Error state capture buffer maybe small: %d < %d\n", 744 + buffer_size, capture_size); 745 + else if (spare_size > buffer_size) 746 + xe_gt_dbg(guc_to_gt(guc), 747 + "Error state capture buffer lacks spare size: %d < %d (min = %d)\n", 748 + buffer_size, spare_size, capture_size); 749 + } 750 + 673 751 /* 674 752 * xe_guc_capture_steered_list_init - Init steering register list 675 753 * @guc: The GuC object ··· 764 684 * the end of the pre-populated render list. 765 685 */ 766 686 guc_capture_alloc_steered_lists(guc); 687 + check_guc_capture_size(guc); 767 688 } 768 689 769 - /** 690 + /* 770 691 * xe_guc_capture_init - Init for GuC register capture 771 692 * @guc: The GuC object 772 693 *
+66
drivers/gpu/drm/xe/xe_guc_log.c
··· 270 270 271 271 return 0; 272 272 } 273 + 273 274 ALLOW_ERROR_INJECTION(xe_guc_log_init, ERRNO); /* See xe_pci_probe() */ 275 + 276 + static u32 xe_guc_log_section_size_crash(struct xe_guc_log *log) 277 + { 278 + return CRASH_BUFFER_SIZE; 279 + } 280 + 281 + static u32 xe_guc_log_section_size_debug(struct xe_guc_log *log) 282 + { 283 + return DEBUG_BUFFER_SIZE; 284 + } 285 + 286 + /** 287 + * xe_guc_log_section_size_capture - Get capture buffer size within log sections. 288 + * @log: The log object. 289 + * 290 + * This function will return the capture buffer size within log sections. 291 + * 292 + * Return: capture buffer size. 293 + */ 294 + u32 xe_guc_log_section_size_capture(struct xe_guc_log *log) 295 + { 296 + return CAPTURE_BUFFER_SIZE; 297 + } 298 + 299 + /** 300 + * xe_guc_get_log_buffer_size - Get log buffer size for a type. 301 + * @log: The log object. 302 + * @type: The log buffer type 303 + * 304 + * Return: buffer size. 305 + */ 306 + u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type) 307 + { 308 + switch (type) { 309 + case GUC_LOG_BUFFER_CRASH_DUMP: 310 + return xe_guc_log_section_size_crash(log); 311 + case GUC_LOG_BUFFER_DEBUG: 312 + return xe_guc_log_section_size_debug(log); 313 + case GUC_LOG_BUFFER_CAPTURE: 314 + return xe_guc_log_section_size_capture(log); 315 + } 316 + return 0; 317 + } 318 + 319 + /** 320 + * xe_guc_get_log_buffer_offset - Get offset in log buffer for a type. 321 + * @log: The log object. 322 + * @type: The log buffer type 323 + * 324 + * This function will return the offset in the log buffer for a type. 325 + * Return: buffer offset. 
326 + */ 327 + u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type) 328 + { 329 + enum guc_log_buffer_type i; 330 + u32 offset = PAGE_SIZE;/* for the log_buffer_states */ 331 + 332 + for (i = GUC_LOG_BUFFER_CRASH_DUMP; i < GUC_LOG_BUFFER_TYPE_MAX; ++i) { 333 + if (i == type) 334 + break; 335 + offset += xe_guc_get_log_buffer_size(log, i); 336 + } 337 + 338 + return offset; 339 + }
+6 -1
drivers/gpu/drm/xe/xe_guc_log.h
··· 7 7 #define _XE_GUC_LOG_H_ 8 8 9 9 #include "xe_guc_log_types.h" 10 + #include "abi/guc_log_abi.h" 10 11 11 12 struct drm_printer; 12 13 struct xe_device; ··· 19 18 #else 20 19 #define CRASH_BUFFER_SIZE SZ_8K 21 20 #define DEBUG_BUFFER_SIZE SZ_64K 22 - #define CAPTURE_BUFFER_SIZE SZ_16K 21 + #define CAPTURE_BUFFER_SIZE SZ_1M 23 22 #endif 24 23 /* 25 24 * While we're using plain log level in i915, GuC controls are much more... ··· 50 49 { 51 50 return log->level; 52 51 } 52 + 53 + u32 xe_guc_log_section_size_capture(struct xe_guc_log *log); 54 + u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type); 55 + u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type); 53 56 54 57 #endif