Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: handle IH ring1 overflow

IH ring1 is used to process GPU retry faults; overflow is enabled to
drain retry faults because we want to receive other interrupts while
handling retry faults, to recover the range. There is no overflow flag
set when wptr passes rptr. Use the timestamps of rptr and wptr to handle
overflow and drain retry faults.

If the fault timestamp goes backward, the fault is filtered and should
not be processed. Draining faults is finished if processed_timestamp is
equal to or larger than the checkpoint timestamp.

Add the amdgpu_ih_funcs interface decode_iv_ts so that different chips
can extract the timestamp from an IV entry with differing IV sizes and
timestamp offsets. amdgpu_ih_decode_iv_ts_helper is used for vega10,
vega20 and navi10.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Philip Yang and committed by
Alex Deucher
3c2d6ea2 232d1d43

+60 -37
+7 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
··· 350 350 * amdgpu_gmc_filter_faults - filter VM faults 351 351 * 352 352 * @adev: amdgpu device structure 353 + * @ih: interrupt ring that the fault received from 353 354 * @addr: address of the VM fault 354 355 * @pasid: PASID of the process causing the fault 355 356 * @timestamp: timestamp of the fault ··· 359 358 * True if the fault was filtered and should not be processed further. 360 359 * False if the fault is a new one and needs to be handled. 361 360 */ 362 - bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, 361 + bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, 362 + struct amdgpu_ih_ring *ih, uint64_t addr, 363 363 uint16_t pasid, uint64_t timestamp) 364 364 { 365 365 struct amdgpu_gmc *gmc = &adev->gmc; 366 366 uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid); 367 367 struct amdgpu_gmc_fault *fault; 368 368 uint32_t hash; 369 + 370 + /* Stale retry fault if timestamp goes backward */ 371 + if (amdgpu_ih_ts_after(timestamp, ih->processed_timestamp)) 372 + return true; 369 373 370 374 /* If we don't have space left in the ring buffer return immediately */ 371 375 stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
+2 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
··· 316 316 struct amdgpu_gmc *mc); 317 317 void amdgpu_gmc_agp_location(struct amdgpu_device *adev, 318 318 struct amdgpu_gmc *mc); 319 - bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, 319 + bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, 320 + struct amdgpu_ih_ring *ih, uint64_t addr, 320 321 uint16_t pasid, uint64_t timestamp); 321 322 void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr, 322 323 uint16_t pasid);
+25 -30
drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
··· 164 164 } 165 165 } 166 166 167 - /* Waiter helper that checks current rptr matches or passes checkpoint wptr */ 168 - static bool amdgpu_ih_has_checkpoint_processed(struct amdgpu_device *adev, 169 - struct amdgpu_ih_ring *ih, 170 - uint32_t checkpoint_wptr, 171 - uint32_t *prev_rptr) 172 - { 173 - uint32_t cur_rptr = ih->rptr | (*prev_rptr & ~ih->ptr_mask); 174 - 175 - /* rptr has wrapped. */ 176 - if (cur_rptr < *prev_rptr) 177 - cur_rptr += ih->ptr_mask + 1; 178 - *prev_rptr = cur_rptr; 179 - 180 - /* check ring is empty to workaround missing wptr overflow flag */ 181 - return cur_rptr >= checkpoint_wptr || 182 - (cur_rptr & ih->ptr_mask) == amdgpu_ih_get_wptr(adev, ih); 183 - } 184 - 185 167 /** 186 - * amdgpu_ih_wait_on_checkpoint_process - wait to process IVs up to checkpoint 168 + * amdgpu_ih_wait_on_checkpoint_process_ts - wait to process IVs up to checkpoint 187 169 * 188 170 * @adev: amdgpu_device pointer 189 171 * @ih: ih ring to process 190 172 * 191 173 * Used to ensure ring has processed IVs up to the checkpoint write pointer. 192 174 */ 193 - int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev, 175 + int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev, 194 176 struct amdgpu_ih_ring *ih) 195 177 { 196 - uint32_t checkpoint_wptr, rptr; 178 + uint32_t checkpoint_wptr; 179 + uint64_t checkpoint_ts; 180 + long timeout = HZ; 197 181 198 182 if (!ih->enabled || adev->shutdown) 199 183 return -ENODEV; 200 184 201 185 checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih); 202 - /* Order wptr with rptr. */ 186 + /* Order wptr with ring data. */ 203 187 rmb(); 204 - rptr = READ_ONCE(ih->rptr); 188 + checkpoint_ts = amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1); 205 189 206 - /* wptr has wrapped. 
*/ 207 - if (rptr > checkpoint_wptr) 208 - checkpoint_wptr += ih->ptr_mask + 1; 209 - 210 - return wait_event_interruptible(ih->wait_process, 211 - amdgpu_ih_has_checkpoint_processed(adev, ih, 212 - checkpoint_wptr, &rptr)); 190 + return wait_event_interruptible_timeout(ih->wait_process, 191 + !amdgpu_ih_ts_after(ih->processed_timestamp, checkpoint_ts), 192 + timeout); 213 193 } 214 194 215 195 /** ··· 278 298 279 299 /* wptr/rptr are in bytes! */ 280 300 ih->rptr += 32; 301 + } 302 + 303 + uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr, 304 + signed int offset) 305 + { 306 + uint32_t iv_size = 32; 307 + uint32_t ring_index; 308 + uint32_t dw1, dw2; 309 + 310 + rptr += iv_size * offset; 311 + ring_index = (rptr & ih->ptr_mask) >> 2; 312 + 313 + dw1 = le32_to_cpu(ih->ring[ring_index + 1]); 314 + dw2 = le32_to_cpu(ih->ring[ring_index + 2]); 315 + return dw1 | ((u64)(dw2 & 0xffff) << 32); 281 316 }
+14 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
··· 68 68 69 69 /* For waiting on IH processing at checkpoint. */ 70 70 wait_queue_head_t wait_process; 71 + uint64_t processed_timestamp; 71 72 }; 73 + 74 + /* return true if time stamp t2 is after t1 with 48bit wrap around */ 75 + #define amdgpu_ih_ts_after(t1, t2) \ 76 + (((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) > 0LL) 72 77 73 78 /* provided by the ih block */ 74 79 struct amdgpu_ih_funcs { ··· 81 76 u32 (*get_wptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); 82 77 void (*decode_iv)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih, 83 78 struct amdgpu_iv_entry *entry); 79 + uint64_t (*decode_iv_ts)(struct amdgpu_ih_ring *ih, u32 rptr, 80 + signed int offset); 84 81 void (*set_rptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); 85 82 }; 86 83 87 84 #define amdgpu_ih_get_wptr(adev, ih) (adev)->irq.ih_funcs->get_wptr((adev), (ih)) 88 85 #define amdgpu_ih_decode_iv(adev, iv) \ 89 86 (adev)->irq.ih_funcs->decode_iv((adev), (ih), (iv)) 87 + #define amdgpu_ih_decode_iv_ts(adev, ih, rptr, offset) \ 88 + (WARN_ON_ONCE(!(adev)->irq.ih_funcs->decode_iv_ts) ? 
0 : \ 89 + (adev)->irq.ih_funcs->decode_iv_ts((ih), (rptr), (offset))) 90 90 #define amdgpu_ih_set_rptr(adev, ih) (adev)->irq.ih_funcs->set_rptr((adev), (ih)) 91 91 92 92 int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih, ··· 99 89 void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); 100 90 void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv, 101 91 unsigned int num_dw); 102 - int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev, 103 - struct amdgpu_ih_ring *ih); 92 + int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev, 93 + struct amdgpu_ih_ring *ih); 104 94 int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); 105 95 void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev, 106 96 struct amdgpu_ih_ring *ih, 107 97 struct amdgpu_iv_entry *entry); 98 + uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr, 99 + signed int offset); 108 100 #endif
+6
drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
··· 528 528 /* Send it to amdkfd as well if it isn't already handled */ 529 529 if (!handled) 530 530 amdgpu_amdkfd_interrupt(adev, entry.iv_entry); 531 + 532 + dev_WARN_ONCE(adev->dev, ih->processed_timestamp == entry.timestamp, 533 + "IH timestamps are not unique"); 534 + 535 + if (amdgpu_ih_ts_after(ih->processed_timestamp, entry.timestamp)) 536 + ih->processed_timestamp = entry.timestamp; 531 537 } 532 538 533 539 /**
+1 -1
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
··· 107 107 108 108 /* Process it onyl if it's the first fault for this address */ 109 109 if (entry->ih != &adev->irq.ih_soft && 110 - amdgpu_gmc_filter_faults(adev, addr, entry->pasid, 110 + amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid, 111 111 entry->timestamp)) 112 112 return 1; 113 113
+1 -1
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
··· 523 523 524 524 /* Process it onyl if it's the first fault for this address */ 525 525 if (entry->ih != &adev->irq.ih_soft && 526 - amdgpu_gmc_filter_faults(adev, addr, entry->pasid, 526 + amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid, 527 527 entry->timestamp)) 528 528 return 1; 529 529
+1
drivers/gpu/drm/amd/amdgpu/navi10_ih.c
··· 716 716 static const struct amdgpu_ih_funcs navi10_ih_funcs = { 717 717 .get_wptr = navi10_ih_get_wptr, 718 718 .decode_iv = amdgpu_ih_decode_iv_helper, 719 + .decode_iv_ts = amdgpu_ih_decode_iv_ts_helper, 719 720 .set_rptr = navi10_ih_set_rptr 720 721 }; 721 722
+1
drivers/gpu/drm/amd/amdgpu/vega10_ih.c
··· 640 640 static const struct amdgpu_ih_funcs vega10_ih_funcs = { 641 641 .get_wptr = vega10_ih_get_wptr, 642 642 .decode_iv = amdgpu_ih_decode_iv_helper, 643 + .decode_iv_ts = amdgpu_ih_decode_iv_ts_helper, 643 644 .set_rptr = vega10_ih_set_rptr 644 645 }; 645 646
+1
drivers/gpu/drm/amd/amdgpu/vega20_ih.c
··· 688 688 static const struct amdgpu_ih_funcs vega20_ih_funcs = { 689 689 .get_wptr = vega20_ih_get_wptr, 690 690 .decode_iv = amdgpu_ih_decode_iv_helper, 691 + .decode_iv_ts = amdgpu_ih_decode_iv_ts_helper, 691 692 .set_rptr = vega20_ih_set_rptr 692 693 }; 693 694
+1 -1
drivers/gpu/drm/amd/amdkfd/kfd_svm.c
··· 1974 1974 1975 1975 pr_debug("drain retry fault gpu %d svms %p\n", i, svms); 1976 1976 1977 - amdgpu_ih_wait_on_checkpoint_process(pdd->dev->adev, 1977 + amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, 1978 1978 &pdd->dev->adev->irq.ih1); 1979 1979 pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); 1980 1980 }