Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe/vf: Add xe_gt_recovery_pending helper

Add xe_gt_recovery_pending helper.

This helper serves as the singular point to determine whether a GT
recovery is currently in progress. Expected callers include the GuC CT
layer and the GuC submission layer. Atomically visible as soon as vCPUs
are unhalted until VF recovery completes.

v3:
- Add GT layer xe_gt_recovery_inprogress (Michal)
- Don't blow up if memirq is not enabled (CI)
- Add __memirq_received with clear argument (Michal)
- xe_memirq_sw_int_0_irq_pending rename (Michal)
- Use offset in xe_memirq_sw_int_0_irq_pending (Michal)
v4:
- Refactor xe_gt_recovery_inprogress logic around memirq (Michal)
v5:
- s/inprogress/pending (Michal)
v7:
- Fix typos, adjust comment (Michal)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: Tomasz Lis <tomasz.lis@intel.com>
Link: https://lore.kernel.org/r/20251008214532.3442967-9-matthew.brost@intel.com

+99 -4
+13
drivers/gpu/drm/xe/xe_gt.h
··· 12 12 13 13 #include "xe_device.h" 14 14 #include "xe_device_types.h" 15 + #include "xe_gt_sriov_vf.h" 15 16 #include "xe_hw_engine.h" 16 17 17 18 #define for_each_hw_engine(hwe__, gt__, id__) \ ··· 123 122 124 123 return xe->info.has_usm && hwe->class == XE_ENGINE_CLASS_COPY && 125 124 hwe->instance == gt->usm.reserved_bcs_instance; 125 + } 126 + 127 + /** 128 + * xe_gt_recovery_pending() - GT recovery pending 129 + * @gt: the &xe_gt 130 + * 131 + * Return: True if GT recovery in pending, False otherwise 132 + */ 133 + static inline bool xe_gt_recovery_pending(struct xe_gt *gt) 134 + { 135 + return IS_SRIOV_VF(gt_to_xe(gt)) && 136 + xe_gt_sriov_vf_recovery_pending(gt); 126 137 } 127 138 128 139 #endif
+28
drivers/gpu/drm/xe/xe_gt_sriov_vf.c
··· 26 26 #include "xe_guc_hxg_helpers.h" 27 27 #include "xe_guc_relay.h" 28 28 #include "xe_lrc.h" 29 + #include "xe_memirq.h" 29 30 #include "xe_mmio.h" 30 31 #include "xe_sriov.h" 31 32 #include "xe_sriov_vf.h" ··· 777 776 struct xe_device *xe = gt_to_xe(gt); 778 777 779 778 xe_gt_assert(gt, IS_SRIOV_VF(xe)); 779 + xe_gt_assert(gt, xe_gt_sriov_vf_recovery_pending(gt)); 780 780 781 781 set_bit(gt->info.id, &xe->sriov.vf.migration.gt_flags); 782 782 /* ··· 1119 1117 GUC_RELAY_VERSION_LATEST_MAJOR, GUC_RELAY_VERSION_LATEST_MINOR); 1120 1118 drm_printf(p, "\thandshake:\t%u.%u\n", 1121 1119 pf_version->major, pf_version->minor); 1120 + } 1121 + 1122 + /** 1123 + * xe_gt_sriov_vf_recovery_pending() - VF post migration recovery pending 1124 + * @gt: the &xe_gt 1125 + * 1126 + * The return value of this function must be immediately visible upon vCPU 1127 + * unhalt and must persist until RESFIX_DONE is issued. This guarantee is 1128 + * currently implemented only for platforms that support memirq. If non-memirq 1129 + * platforms begin to support VF migration, this function will need to be 1130 + * updated accordingly. 1131 + * 1132 + * Return: True if VF post migration recovery is pending, False otherwise 1133 + */ 1134 + bool xe_gt_sriov_vf_recovery_pending(struct xe_gt *gt) 1135 + { 1136 + struct xe_memirq *memirq = &gt_to_tile(gt)->memirq; 1137 + 1138 + xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt))); 1139 + 1140 + /* early detection until recovery starts */ 1141 + if (xe_device_uses_memirq(gt_to_xe(gt)) && 1142 + xe_memirq_guc_sw_int_0_irq_pending(memirq, &gt->uc.guc)) 1143 + return true; 1144 + 1145 + return READ_ONCE(gt->sriov.vf.migration.recovery_inprogress); 1122 1146 }
+2
drivers/gpu/drm/xe/xe_gt_sriov_vf.h
··· 25 25 int xe_gt_sriov_vf_notify_resfix_done(struct xe_gt *gt); 26 26 void xe_gt_sriov_vf_migrated_event_handler(struct xe_gt *gt); 27 27 28 + bool xe_gt_sriov_vf_recovery_pending(struct xe_gt *gt); 29 + 28 30 u32 xe_gt_sriov_vf_gmdid(struct xe_gt *gt); 29 31 u16 xe_gt_sriov_vf_guc_ids(struct xe_gt *gt); 30 32 u64 xe_gt_sriov_vf_lmem(struct xe_gt *gt);
+10
drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h
··· 47 47 }; 48 48 49 49 /** 50 + * xe_gt_sriov_vf_migration - VF migration data. 51 + */ 52 + struct xe_gt_sriov_vf_migration { 53 + /** @recovery_inprogress: VF post migration recovery in progress */ 54 + bool recovery_inprogress; 55 + }; 56 + 57 + /** 50 58 * struct xe_gt_sriov_vf - GT level VF virtualization data. 51 59 */ 52 60 struct xe_gt_sriov_vf { ··· 66 58 struct xe_gt_sriov_vf_selfconfig self_config; 67 59 /** @runtime: runtime data retrieved from the PF. */ 68 60 struct xe_gt_sriov_vf_runtime runtime; 61 + /** @migration: migration data for the VF. */ 62 + struct xe_gt_sriov_vf_migration migration; 69 63 }; 70 64 71 65 #endif
+44 -4
drivers/gpu/drm/xe/xe_memirq.c
··· 397 397 memirq_set_enable(memirq, true); 398 398 } 399 399 400 - static bool memirq_received(struct xe_memirq *memirq, struct iosys_map *vector, 401 - u16 offset, const char *name) 400 + static bool __memirq_received(struct xe_memirq *memirq, 401 + struct iosys_map *vector, u16 offset, 402 + const char *name, bool clear) 402 403 { 403 404 u8 value; 404 405 ··· 409 408 memirq_err_ratelimited(memirq, 410 409 "Unexpected memirq value %#x from %s at %u\n", 411 410 value, name, offset); 412 - iosys_map_wr(vector, offset, u8, 0x00); 411 + if (clear) 412 + iosys_map_wr(vector, offset, u8, 0x00); 413 413 } 414 414 415 415 return value; 416 + } 417 + 418 + static bool memirq_received_noclear(struct xe_memirq *memirq, 419 + struct iosys_map *vector, 420 + u16 offset, const char *name) 421 + { 422 + return __memirq_received(memirq, vector, offset, name, false); 423 + } 424 + 425 + static bool memirq_received(struct xe_memirq *memirq, struct iosys_map *vector, 426 + u16 offset, const char *name) 427 + { 428 + return __memirq_received(memirq, vector, offset, name, true); 416 429 } 417 430 418 431 static void memirq_dispatch_engine(struct xe_memirq *memirq, struct iosys_map *status, ··· 448 433 if (memirq_received(memirq, status, ilog2(GUC_INTR_GUC2HOST), name)) 449 434 xe_guc_irq_handler(guc, GUC_INTR_GUC2HOST); 450 435 451 - if (memirq_received(memirq, status, ilog2(GUC_INTR_SW_INT_0), name)) 436 + /* 437 + * This is a software interrupt that must be cleared after it's consumed 438 + * to avoid race conditions where xe_gt_sriov_vf_recovery_pending() 439 + * returns false. 
440 + */ 441 + if (memirq_received_noclear(memirq, status, ilog2(GUC_INTR_SW_INT_0), 442 + name)) { 452 443 xe_guc_irq_handler(guc, GUC_INTR_SW_INT_0); 444 + iosys_map_wr(status, ilog2(GUC_INTR_SW_INT_0), u8, 0x00); 445 + } 453 446 } 454 447 455 448 /** ··· 480 457 XE_MEMIRQ_STATUS_OFFSET(instance) + offset * SZ_16); 481 458 memirq_dispatch_engine(memirq, &status_offset, hwe); 482 459 } 460 + } 461 + 462 + /** 463 + * xe_memirq_guc_sw_int_0_irq_pending() - SW_INT_0 IRQ is pending 464 + * @memirq: the &xe_memirq 465 + * @guc: the &xe_guc to check for IRQ 466 + * 467 + * Return: True if SW_INT_0 IRQ is pending on @guc, False otherwise 468 + */ 469 + bool xe_memirq_guc_sw_int_0_irq_pending(struct xe_memirq *memirq, struct xe_guc *guc) 470 + { 471 + struct xe_gt *gt = guc_to_gt(guc); 472 + u32 offset = xe_gt_is_media_type(gt) ? ilog2(INTR_MGUC) : ilog2(INTR_GUC); 473 + struct iosys_map map = IOSYS_MAP_INIT_OFFSET(&memirq->status, offset * SZ_16); 474 + 475 + return memirq_received_noclear(memirq, &map, ilog2(GUC_INTR_SW_INT_0), 476 + guc_name(guc)); 483 477 } 484 478 485 479 /**
+2
drivers/gpu/drm/xe/xe_memirq.h
··· 25 25 26 26 int xe_memirq_init_guc(struct xe_memirq *memirq, struct xe_guc *guc); 27 27 28 + bool xe_memirq_guc_sw_int_0_irq_pending(struct xe_memirq *memirq, struct xe_guc *guc); 29 + 28 30 #endif