1/*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28#include <linux/power_supply.h>
29#include <linux/kthread.h>
30#include <linux/module.h>
31#include <linux/console.h>
32#include <linux/slab.h>
33#include <linux/iommu.h>
34#include <linux/pci.h>
35#include <linux/devcoredump.h>
36#include <generated/utsrelease.h>
37#include <linux/pci-p2pdma.h>
38#include <linux/apple-gmux.h>
39
40#include <drm/drm_aperture.h>
41#include <drm/drm_atomic_helper.h>
42#include <drm/drm_crtc_helper.h>
43#include <drm/drm_fb_helper.h>
44#include <drm/drm_probe_helper.h>
45#include <drm/amdgpu_drm.h>
46#include <linux/vgaarb.h>
47#include <linux/vga_switcheroo.h>
48#include <linux/efi.h>
49#include "amdgpu.h"
50#include "amdgpu_trace.h"
51#include "amdgpu_i2c.h"
52#include "atom.h"
53#include "amdgpu_atombios.h"
54#include "amdgpu_atomfirmware.h"
55#include "amd_pcie.h"
56#ifdef CONFIG_DRM_AMDGPU_SI
57#include "si.h"
58#endif
59#ifdef CONFIG_DRM_AMDGPU_CIK
60#include "cik.h"
61#endif
62#include "vi.h"
63#include "soc15.h"
64#include "nv.h"
65#include "bif/bif_4_1_d.h"
66#include <linux/firmware.h>
67#include "amdgpu_vf_error.h"
68
69#include "amdgpu_amdkfd.h"
70#include "amdgpu_pm.h"
71
72#include "amdgpu_xgmi.h"
73#include "amdgpu_ras.h"
74#include "amdgpu_pmu.h"
75#include "amdgpu_fru_eeprom.h"
76#include "amdgpu_reset.h"
77
78#include <linux/suspend.h>
79#include <drm/task_barrier.h>
80#include <linux/pm_runtime.h>
81
82#include <drm/drm_drv.h>
83
84#if IS_ENABLED(CONFIG_X86)
85#include <asm/intel-family.h>
86#endif
87
88MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
89MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
90MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
91MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
92MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
93MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
94MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
95
96#define AMDGPU_RESUME_MS 2000
97#define AMDGPU_MAX_RETRY_LIMIT 2
98#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
99
100static const struct drm_driver amdgpu_kms_driver;
101
102const char *amdgpu_asic_name[] = {
103 "TAHITI",
104 "PITCAIRN",
105 "VERDE",
106 "OLAND",
107 "HAINAN",
108 "BONAIRE",
109 "KAVERI",
110 "KABINI",
111 "HAWAII",
112 "MULLINS",
113 "TOPAZ",
114 "TONGA",
115 "FIJI",
116 "CARRIZO",
117 "STONEY",
118 "POLARIS10",
119 "POLARIS11",
120 "POLARIS12",
121 "VEGAM",
122 "VEGA10",
123 "VEGA12",
124 "VEGA20",
125 "RAVEN",
126 "ARCTURUS",
127 "RENOIR",
128 "ALDEBARAN",
129 "NAVI10",
130 "CYAN_SKILLFISH",
131 "NAVI14",
132 "NAVI12",
133 "SIENNA_CICHLID",
134 "NAVY_FLOUNDER",
135 "VANGOGH",
136 "DIMGREY_CAVEFISH",
137 "BEIGE_GOBY",
138 "YELLOW_CARP",
139 "IP DISCOVERY",
140 "LAST",
141};
142
143/**
144 * DOC: pcie_replay_count
145 *
146 * The amdgpu driver provides a sysfs API for reporting the total number
 147 * of PCIe replays (NAKs).
 148 * The file pcie_replay_count is used for this and returns the total
 149 * number of replays as a sum of the NAKs generated and NAKs received.
150 */
151
152static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
153 struct device_attribute *attr, char *buf)
154{
155 struct drm_device *ddev = dev_get_drvdata(dev);
156 struct amdgpu_device *adev = drm_to_adev(ddev);
157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
158
159 return sysfs_emit(buf, "%llu\n", cnt);
160}
161
162static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
163 amdgpu_device_get_pcie_replay_count, NULL);
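/*
 * Illustrative sketch (not part of this file): a read-only attribute such as
 * dev_attr_pcie_replay_count is typically made visible by registering it
 * against the underlying struct device somewhere in device init, roughly:
 *
 *	int r = device_create_file(adev->dev, &dev_attr_pcie_replay_count);
 *
 *	if (r)
 *		dev_err(adev->dev, "Could not create pcie_replay_count\n");
 *
 * Userspace can then read the count from sysfs, e.g.
 * /sys/class/drm/card0/device/pcie_replay_count.
 */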
164
165static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
166
167/**
168 * DOC: product_name
169 *
170 * The amdgpu driver provides a sysfs API for reporting the product name
 171 * for the device.
 172 * The file product_name is used for this and returns the product name
 173 * as returned from the FRU.
 174 * NOTE: This is only available for certain server cards.
175 */
176
177static ssize_t amdgpu_device_get_product_name(struct device *dev,
178 struct device_attribute *attr, char *buf)
179{
180 struct drm_device *ddev = dev_get_drvdata(dev);
181 struct amdgpu_device *adev = drm_to_adev(ddev);
182
183 return sysfs_emit(buf, "%s\n", adev->product_name);
184}
185
186static DEVICE_ATTR(product_name, S_IRUGO,
187 amdgpu_device_get_product_name, NULL);
188
189/**
190 * DOC: product_number
191 *
192 * The amdgpu driver provides a sysfs API for reporting the part number
 193 * for the device.
 194 * The file product_number is used for this and returns the part number
 195 * as returned from the FRU.
 196 * NOTE: This is only available for certain server cards.
197 */
198
199static ssize_t amdgpu_device_get_product_number(struct device *dev,
200 struct device_attribute *attr, char *buf)
201{
202 struct drm_device *ddev = dev_get_drvdata(dev);
203 struct amdgpu_device *adev = drm_to_adev(ddev);
204
205 return sysfs_emit(buf, "%s\n", adev->product_number);
206}
207
208static DEVICE_ATTR(product_number, S_IRUGO,
209 amdgpu_device_get_product_number, NULL);
210
211/**
212 * DOC: serial_number
213 *
214 * The amdgpu driver provides a sysfs API for reporting the serial number
 215 * for the device.
 216 * The file serial_number is used for this and returns the serial number
 217 * as returned from the FRU.
 218 * NOTE: This is only available for certain server cards.
219 */
220
221static ssize_t amdgpu_device_get_serial_number(struct device *dev,
222 struct device_attribute *attr, char *buf)
223{
224 struct drm_device *ddev = dev_get_drvdata(dev);
225 struct amdgpu_device *adev = drm_to_adev(ddev);
226
227 return sysfs_emit(buf, "%s\n", adev->serial);
228}
229
230static DEVICE_ATTR(serial_number, S_IRUGO,
231 amdgpu_device_get_serial_number, NULL);
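/*
 * Illustrative sketch (not part of this file): the three FRU derived
 * attributes above could equally be registered in a single call by grouping
 * them; the fru_attrs/fru_attr_group names here are hypothetical:
 *
 *	static struct attribute *fru_attrs[] = {
 *		&dev_attr_product_name.attr,
 *		&dev_attr_product_number.attr,
 *		&dev_attr_serial_number.attr,
 *		NULL
 *	};
 *
 *	static const struct attribute_group fru_attr_group = {
 *		.attrs = fru_attrs,
 *	};
 *
 *	r = devm_device_add_group(adev->dev, &fru_attr_group);
 */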
232
233/**
234 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
235 *
236 * @dev: drm_device pointer
237 *
238 * Returns true if the device is a dGPU with ATPX power control,
 239 * otherwise returns false.
240 */
241bool amdgpu_device_supports_px(struct drm_device *dev)
242{
243 struct amdgpu_device *adev = drm_to_adev(dev);
244
245 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
246 return true;
247 return false;
248}
249
250/**
251 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
252 *
253 * @dev: drm_device pointer
254 *
255 * Returns true if the device is a dGPU with ACPI power control,
 256 * otherwise returns false.
257 */
258bool amdgpu_device_supports_boco(struct drm_device *dev)
259{
260 struct amdgpu_device *adev = drm_to_adev(dev);
261
262 if (adev->has_pr3 ||
263 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
264 return true;
265 return false;
266}
267
268/**
269 * amdgpu_device_supports_baco - Does the device support BACO
270 *
271 * @dev: drm_device pointer
272 *
 273 * Returns true if the device supports BACO,
 274 * otherwise returns false.
275 */
276bool amdgpu_device_supports_baco(struct drm_device *dev)
277{
278 struct amdgpu_device *adev = drm_to_adev(dev);
279
280 return amdgpu_asic_supports_baco(adev);
281}
282
283/**
 284 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 285 * Smart Shift support
286 *
287 * @dev: drm_device pointer
288 *
289 * Returns true if the device is a dGPU with Smart Shift support,
290 * otherwise returns false.
291 */
292bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
293{
294 return (amdgpu_device_supports_boco(dev) &&
295 amdgpu_acpi_is_power_shift_control_supported());
296}
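/*
 * Illustrative sketch (not part of this file): a caller might use the three
 * helpers above to pick a runtime power-management strategy. The rpm_mode
 * variable and EXAMPLE_RPM_MODE_* values are hypothetical and only show the
 * usual decision order (PX, then BOCO, then BACO):
 *
 *	if (amdgpu_device_supports_px(dev))
 *		rpm_mode = EXAMPLE_RPM_MODE_PX;
 *	else if (amdgpu_device_supports_boco(dev))
 *		rpm_mode = EXAMPLE_RPM_MODE_BOCO;
 *	else if (amdgpu_device_supports_baco(dev))
 *		rpm_mode = EXAMPLE_RPM_MODE_BACO;
 *	else
 *		rpm_mode = EXAMPLE_RPM_MODE_NONE;
 */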
297
298/*
299 * VRAM access helper functions
300 */
301
302/**
303 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
304 *
305 * @adev: amdgpu_device pointer
306 * @pos: offset of the buffer in vram
307 * @buf: virtual address of the buffer in system memory
 308 * @size: read/write size; the buffer at @buf must be at least @size bytes
309 * @write: true - write to vram, otherwise - read from vram
310 */
311void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
312 void *buf, size_t size, bool write)
313{
314 unsigned long flags;
315 uint32_t hi = ~0, tmp = 0;
316 uint32_t *data = buf;
317 uint64_t last;
318 int idx;
319
320 if (!drm_dev_enter(adev_to_drm(adev), &idx))
321 return;
322
323 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
324
325 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
326 for (last = pos + size; pos < last; pos += 4) {
327 tmp = pos >> 31;
328
329 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
330 if (tmp != hi) {
331 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
332 hi = tmp;
333 }
334 if (write)
335 WREG32_NO_KIQ(mmMM_DATA, *data++);
336 else
337 *data++ = RREG32_NO_KIQ(mmMM_DATA);
338 }
339
340 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
341 drm_dev_exit(idx);
342}
343
344/**
 345 * amdgpu_device_aper_access - access vram by the vram aperture
346 *
347 * @adev: amdgpu_device pointer
348 * @pos: offset of the buffer in vram
349 * @buf: virtual address of the buffer in system memory
 350 * @size: read/write size; the buffer at @buf must be at least @size bytes
 351 * @write: true - write to vram, otherwise - read from vram
 352 *
 353 * Returns the number of bytes transferred.
354 */
355size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
356 void *buf, size_t size, bool write)
357{
358#ifdef CONFIG_64BIT
359 void __iomem *addr;
360 size_t count = 0;
361 uint64_t last;
362
363 if (!adev->mman.aper_base_kaddr)
364 return 0;
365
366 last = min(pos + size, adev->gmc.visible_vram_size);
367 if (last > pos) {
368 addr = adev->mman.aper_base_kaddr + pos;
369 count = last - pos;
370
371 if (write) {
372 memcpy_toio(addr, buf, count);
373 mb();
374 amdgpu_device_flush_hdp(adev, NULL);
375 } else {
376 amdgpu_device_invalidate_hdp(adev, NULL);
377 mb();
378 memcpy_fromio(buf, addr, count);
379 }
380
381 }
382
383 return count;
384#else
385 return 0;
386#endif
387}
388
389/**
390 * amdgpu_device_vram_access - read/write a buffer in vram
391 *
392 * @adev: amdgpu_device pointer
393 * @pos: offset of the buffer in vram
394 * @buf: virtual address of the buffer in system memory
 395 * @size: read/write size; the buffer at @buf must be at least @size bytes
396 * @write: true - write to vram, otherwise - read from vram
397 */
398void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
399 void *buf, size_t size, bool write)
400{
401 size_t count;
402
 403 /* try using the vram aperture to access vram first */
404 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
405 size -= count;
406 if (size) {
 407 /* use MM to access the rest of vram */
408 pos += count;
409 buf += count;
410 amdgpu_device_mm_access(adev, pos, buf, size, write);
411 }
412}
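/*
 * Illustrative sketch (not part of this file): reading a small buffer out of
 * VRAM with the helper above. The vram_offset value is hypothetical; both the
 * offset and the size must be 4-byte aligned for the MM_INDEX/MM_DATA
 * fallback path.
 *
 *	u32 header[4];
 *
 *	amdgpu_device_vram_access(adev, vram_offset, header,
 *				  sizeof(header), false);
 */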
413
414/*
415 * register access helper functions.
416 */
417
418/* Check if hw access should be skipped because of hotplug or device error */
419bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
420{
421 if (adev->no_hw_access)
422 return true;
423
424#ifdef CONFIG_LOCKDEP
425 /*
426 * This is a bit complicated to understand, so worth a comment. What we assert
427 * here is that the GPU reset is not running on another thread in parallel.
428 *
 429 * For this we trylock the read side of the reset semaphore; if that succeeds,
 430 * we know that the reset is not running in parallel.
431 *
432 * If the trylock fails we assert that we are either already holding the read
433 * side of the lock or are the reset thread itself and hold the write side of
434 * the lock.
435 */
436 if (in_task()) {
437 if (down_read_trylock(&adev->reset_domain->sem))
438 up_read(&adev->reset_domain->sem);
439 else
440 lockdep_assert_held(&adev->reset_domain->sem);
441 }
442#endif
443 return false;
444}
445
446/**
447 * amdgpu_device_rreg - read a memory mapped IO or indirect register
448 *
449 * @adev: amdgpu_device pointer
450 * @reg: dword aligned register offset
451 * @acc_flags: access flags which require special behavior
452 *
453 * Returns the 32 bit value from the offset specified.
454 */
455uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
456 uint32_t reg, uint32_t acc_flags)
457{
458 uint32_t ret;
459
460 if (amdgpu_device_skip_hw_access(adev))
461 return 0;
462
463 if ((reg * 4) < adev->rmmio_size) {
464 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
465 amdgpu_sriov_runtime(adev) &&
466 down_read_trylock(&adev->reset_domain->sem)) {
467 ret = amdgpu_kiq_rreg(adev, reg);
468 up_read(&adev->reset_domain->sem);
469 } else {
470 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
471 }
472 } else {
473 ret = adev->pcie_rreg(adev, reg * 4);
474 }
475
476 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
477
478 return ret;
479}
480
481/*
 482 * MMIO register read with byte offset helper function
 483 * @offset: byte offset from MMIO start
 484 *
 485 */
486
487/**
488 * amdgpu_mm_rreg8 - read a memory mapped IO register
489 *
490 * @adev: amdgpu_device pointer
491 * @offset: byte aligned register offset
492 *
493 * Returns the 8 bit value from the offset specified.
494 */
495uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
496{
497 if (amdgpu_device_skip_hw_access(adev))
498 return 0;
499
500 if (offset < adev->rmmio_size)
501 return (readb(adev->rmmio + offset));
502 BUG();
503}
504
505/*
 506 * MMIO register write with byte offset helper function
 507 * @offset: byte offset from MMIO start
 508 * @value: the value to be written to the register
 509 *
 510 */
511/**
 512 * amdgpu_mm_wreg8 - write a memory mapped IO register
513 *
514 * @adev: amdgpu_device pointer
515 * @offset: byte aligned register offset
516 * @value: 8 bit value to write
517 *
518 * Writes the value specified to the offset specified.
519 */
520void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
521{
522 if (amdgpu_device_skip_hw_access(adev))
523 return;
524
525 if (offset < adev->rmmio_size)
526 writeb(value, adev->rmmio + offset);
527 else
528 BUG();
529}
530
531/**
532 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
533 *
534 * @adev: amdgpu_device pointer
535 * @reg: dword aligned register offset
536 * @v: 32 bit value to write to the register
537 * @acc_flags: access flags which require special behavior
538 *
539 * Writes the value specified to the offset specified.
540 */
541void amdgpu_device_wreg(struct amdgpu_device *adev,
542 uint32_t reg, uint32_t v,
543 uint32_t acc_flags)
544{
545 if (amdgpu_device_skip_hw_access(adev))
546 return;
547
548 if ((reg * 4) < adev->rmmio_size) {
549 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
550 amdgpu_sriov_runtime(adev) &&
551 down_read_trylock(&adev->reset_domain->sem)) {
552 amdgpu_kiq_wreg(adev, reg, v);
553 up_read(&adev->reset_domain->sem);
554 } else {
555 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
556 }
557 } else {
558 adev->pcie_wreg(adev, reg * 4, v);
559 }
560
561 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
562}
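/*
 * Illustrative sketch (not part of this file): a read-modify-write sequence
 * built from the two helpers above. reg_offset, FIELD_MASK and FIELD_VALUE
 * are hypothetical; driver code normally uses the RREG32()/WREG32() wrappers,
 * which expand to these functions.
 *
 *	u32 tmp = amdgpu_device_rreg(adev, reg_offset, 0);
 *
 *	tmp &= ~FIELD_MASK;
 *	tmp |= FIELD_VALUE;
 *	amdgpu_device_wreg(adev, reg_offset, tmp, 0);
 */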
563
564/**
565 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
566 *
567 * @adev: amdgpu_device pointer
568 * @reg: mmio/rlc register
569 * @v: value to write
570 *
 571 * This function is invoked only for debugfs register access.
572 */
573void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
574 uint32_t reg, uint32_t v)
575{
576 if (amdgpu_device_skip_hw_access(adev))
577 return;
578
579 if (amdgpu_sriov_fullaccess(adev) &&
580 adev->gfx.rlc.funcs &&
581 adev->gfx.rlc.funcs->is_rlcg_access_range) {
582 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
583 return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
584 } else if ((reg * 4) >= adev->rmmio_size) {
585 adev->pcie_wreg(adev, reg * 4, v);
586 } else {
587 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
588 }
589}
590
591/**
592 * amdgpu_mm_rdoorbell - read a doorbell dword
593 *
594 * @adev: amdgpu_device pointer
595 * @index: doorbell index
596 *
597 * Returns the value in the doorbell aperture at the
598 * requested doorbell index (CIK).
599 */
600u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
601{
602 if (amdgpu_device_skip_hw_access(adev))
603 return 0;
604
605 if (index < adev->doorbell.num_kernel_doorbells) {
606 return readl(adev->doorbell.ptr + index);
607 } else {
608 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
609 return 0;
610 }
611}
612
613/**
614 * amdgpu_mm_wdoorbell - write a doorbell dword
615 *
616 * @adev: amdgpu_device pointer
617 * @index: doorbell index
618 * @v: value to write
619 *
620 * Writes @v to the doorbell aperture at the
621 * requested doorbell index (CIK).
622 */
623void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
624{
625 if (amdgpu_device_skip_hw_access(adev))
626 return;
627
628 if (index < adev->doorbell.num_kernel_doorbells) {
629 writel(v, adev->doorbell.ptr + index);
630 } else {
631 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
632 }
633}
634
635/**
636 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
637 *
638 * @adev: amdgpu_device pointer
639 * @index: doorbell index
640 *
641 * Returns the value in the doorbell aperture at the
642 * requested doorbell index (VEGA10+).
643 */
644u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
645{
646 if (amdgpu_device_skip_hw_access(adev))
647 return 0;
648
649 if (index < adev->doorbell.num_kernel_doorbells) {
650 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
651 } else {
652 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
653 return 0;
654 }
655}
656
657/**
658 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
659 *
660 * @adev: amdgpu_device pointer
661 * @index: doorbell index
662 * @v: value to write
663 *
664 * Writes @v to the doorbell aperture at the
665 * requested doorbell index (VEGA10+).
666 */
667void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
668{
669 if (amdgpu_device_skip_hw_access(adev))
670 return;
671
672 if (index < adev->doorbell.num_kernel_doorbells) {
673 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
674 } else {
675 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
676 }
677}
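/*
 * Illustrative sketch (not part of this file): ring code typically kicks the
 * hardware by writing its new write pointer to the ring's doorbell slot via
 * the helpers above; the exact value written (e.g. whether the wptr is
 * shifted) depends on the ring type, so treat this as a rough shape only:
 *
 *	if (ring->use_doorbell)
 *		amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);
 */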
678
679/**
680 * amdgpu_device_indirect_rreg - read an indirect register
681 *
682 * @adev: amdgpu_device pointer
683 * @reg_addr: indirect register address to read from
684 *
685 * Returns the value of indirect register @reg_addr
686 */
687u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
688 u32 reg_addr)
689{
690 unsigned long flags, pcie_index, pcie_data;
691 void __iomem *pcie_index_offset;
692 void __iomem *pcie_data_offset;
693 u32 r;
694
695 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
696 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
697
698 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
699 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
700 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
701
702 writel(reg_addr, pcie_index_offset);
703 readl(pcie_index_offset);
704 r = readl(pcie_data_offset);
705 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
706
707 return r;
708}
709
710u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
711 u64 reg_addr)
712{
713 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
714 u32 r;
715 void __iomem *pcie_index_offset;
716 void __iomem *pcie_index_hi_offset;
717 void __iomem *pcie_data_offset;
718
719 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
720 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
721 if (adev->nbio.funcs->get_pcie_index_hi_offset)
722 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
723 else
724 pcie_index_hi = 0;
725
726 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
727 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
728 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
729 if (pcie_index_hi != 0)
730 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
731 pcie_index_hi * 4;
732
733 writel(reg_addr, pcie_index_offset);
734 readl(pcie_index_offset);
735 if (pcie_index_hi != 0) {
736 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
737 readl(pcie_index_hi_offset);
738 }
739 r = readl(pcie_data_offset);
740
741 /* clear the high bits */
742 if (pcie_index_hi != 0) {
743 writel(0, pcie_index_hi_offset);
744 readl(pcie_index_hi_offset);
745 }
746
747 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
748
749 return r;
750}
751
752/**
 753 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
754 *
755 * @adev: amdgpu_device pointer
756 * @reg_addr: indirect register address to read from
757 *
758 * Returns the value of indirect register @reg_addr
759 */
760u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
761 u32 reg_addr)
762{
763 unsigned long flags, pcie_index, pcie_data;
764 void __iomem *pcie_index_offset;
765 void __iomem *pcie_data_offset;
766 u64 r;
767
768 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
769 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
770
771 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
772 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
773 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
774
775 /* read low 32 bits */
776 writel(reg_addr, pcie_index_offset);
777 readl(pcie_index_offset);
778 r = readl(pcie_data_offset);
779 /* read high 32 bits */
780 writel(reg_addr + 4, pcie_index_offset);
781 readl(pcie_index_offset);
782 r |= ((u64)readl(pcie_data_offset) << 32);
783 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
784
785 return r;
786}
787
788/**
 789 * amdgpu_device_indirect_wreg - write an indirect register
790 *
791 * @adev: amdgpu_device pointer
792 * @reg_addr: indirect register offset
793 * @reg_data: indirect register data
794 *
795 */
796void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
797 u32 reg_addr, u32 reg_data)
798{
799 unsigned long flags, pcie_index, pcie_data;
800 void __iomem *pcie_index_offset;
801 void __iomem *pcie_data_offset;
802
803 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
804 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
805
806 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
807 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
808 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
809
810 writel(reg_addr, pcie_index_offset);
811 readl(pcie_index_offset);
812 writel(reg_data, pcie_data_offset);
813 readl(pcie_data_offset);
814 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
815}
816
817void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
818 u64 reg_addr, u32 reg_data)
819{
820 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
821 void __iomem *pcie_index_offset;
822 void __iomem *pcie_index_hi_offset;
823 void __iomem *pcie_data_offset;
824
825 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
826 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
827 if (adev->nbio.funcs->get_pcie_index_hi_offset)
828 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
829 else
830 pcie_index_hi = 0;
831
832 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
833 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
834 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
835 if (pcie_index_hi != 0)
836 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
837 pcie_index_hi * 4;
838
839 writel(reg_addr, pcie_index_offset);
840 readl(pcie_index_offset);
841 if (pcie_index_hi != 0) {
842 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
843 readl(pcie_index_hi_offset);
844 }
845 writel(reg_data, pcie_data_offset);
846 readl(pcie_data_offset);
847
848 /* clear the high bits */
849 if (pcie_index_hi != 0) {
850 writel(0, pcie_index_hi_offset);
851 readl(pcie_index_hi_offset);
852 }
853
854 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
855}
856
857/**
 858 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
859 *
860 * @adev: amdgpu_device pointer
861 * @reg_addr: indirect register offset
862 * @reg_data: indirect register data
863 *
864 */
865void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
866 u32 reg_addr, u64 reg_data)
867{
868 unsigned long flags, pcie_index, pcie_data;
869 void __iomem *pcie_index_offset;
870 void __iomem *pcie_data_offset;
871
872 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
873 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
874
875 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
876 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
877 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
878
879 /* write low 32 bits */
880 writel(reg_addr, pcie_index_offset);
881 readl(pcie_index_offset);
882 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
883 readl(pcie_data_offset);
884 /* write high 32 bits */
885 writel(reg_addr + 4, pcie_index_offset);
886 readl(pcie_index_offset);
887 writel((u32)(reg_data >> 32), pcie_data_offset);
888 readl(pcie_data_offset);
889 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
890}
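/*
 * Illustrative sketch (not part of this file): the indirect accessors above
 * implement the classic index/data window pattern and are usually plugged
 * into the per-ASIC callback pointers by the SoC code, roughly like:
 *
 *	adev->pcie_rreg = &amdgpu_device_indirect_rreg;
 *	adev->pcie_wreg = &amdgpu_device_indirect_wreg;
 *	adev->pcie_rreg64 = &amdgpu_device_indirect_rreg64;
 *	adev->pcie_wreg64 = &amdgpu_device_indirect_wreg64;
 *
 * after which the RREG32_PCIE()/WREG32_PCIE() style wrappers route through
 * them.
 */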
891
892/**
893 * amdgpu_device_get_rev_id - query device rev_id
894 *
895 * @adev: amdgpu_device pointer
896 *
897 * Return device rev_id
898 */
899u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
900{
901 return adev->nbio.funcs->get_rev_id(adev);
902}
903
904/**
905 * amdgpu_invalid_rreg - dummy reg read function
906 *
907 * @adev: amdgpu_device pointer
908 * @reg: offset of register
909 *
910 * Dummy register read function. Used for register blocks
911 * that certain asics don't have (all asics).
912 * Returns the value in the register.
913 */
914static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
915{
916 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
917 BUG();
918 return 0;
919}
920
921static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
922{
923 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
924 BUG();
925 return 0;
926}
927
928/**
929 * amdgpu_invalid_wreg - dummy reg write function
930 *
931 * @adev: amdgpu_device pointer
932 * @reg: offset of register
933 * @v: value to write to the register
934 *
 935 * Dummy register write function. Used for register blocks
936 * that certain asics don't have (all asics).
937 */
938static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
939{
940 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
941 reg, v);
942 BUG();
943}
944
945static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
946{
947 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
948 reg, v);
949 BUG();
950}
951
952/**
953 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
954 *
955 * @adev: amdgpu_device pointer
956 * @reg: offset of register
957 *
958 * Dummy register read function. Used for register blocks
959 * that certain asics don't have (all asics).
960 * Returns the value in the register.
961 */
962static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
963{
964 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
965 BUG();
966 return 0;
967}
968
969/**
970 * amdgpu_invalid_wreg64 - dummy reg write function
971 *
972 * @adev: amdgpu_device pointer
973 * @reg: offset of register
974 * @v: value to write to the register
975 *
 976 * Dummy register write function. Used for register blocks
977 * that certain asics don't have (all asics).
978 */
979static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
980{
981 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
982 reg, v);
983 BUG();
984}
985
986/**
987 * amdgpu_block_invalid_rreg - dummy reg read function
988 *
989 * @adev: amdgpu_device pointer
990 * @block: offset of instance
991 * @reg: offset of register
992 *
993 * Dummy register read function. Used for register blocks
994 * that certain asics don't have (all asics).
995 * Returns the value in the register.
996 */
997static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
998 uint32_t block, uint32_t reg)
999{
1000 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1001 reg, block);
1002 BUG();
1003 return 0;
1004}
1005
1006/**
1007 * amdgpu_block_invalid_wreg - dummy reg write function
1008 *
1009 * @adev: amdgpu_device pointer
1010 * @block: offset of instance
1011 * @reg: offset of register
1012 * @v: value to write to the register
1013 *
1014 * Dummy register write function. Used for register blocks
1015 * that certain asics don't have (all asics).
1016 */
1017static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1018 uint32_t block,
1019 uint32_t reg, uint32_t v)
1020{
1021 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1022 reg, block, v);
1023 BUG();
1024}
1025
1026/**
1027 * amdgpu_device_asic_init - Wrapper for atom asic_init
1028 *
1029 * @adev: amdgpu_device pointer
1030 *
1031 * Does any asic specific work and then calls atom asic init.
1032 */
1033static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1034{
1035 amdgpu_asic_pre_asic_init(adev);
1036
1037 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
1038 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
1039 return amdgpu_atomfirmware_asic_init(adev, true);
1040 else
1041 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1042}
1043
1044/**
1045 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1046 *
1047 * @adev: amdgpu_device pointer
1048 *
1049 * Allocates a scratch page of VRAM for use by various things in the
1050 * driver.
1051 */
1052static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1053{
1054 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1055 AMDGPU_GEM_DOMAIN_VRAM |
1056 AMDGPU_GEM_DOMAIN_GTT,
1057 &adev->mem_scratch.robj,
1058 &adev->mem_scratch.gpu_addr,
1059 (void **)&adev->mem_scratch.ptr);
1060}
1061
1062/**
1063 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1064 *
1065 * @adev: amdgpu_device pointer
1066 *
1067 * Frees the VRAM scratch page.
1068 */
1069static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1070{
1071 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1072}
1073
1074/**
1075 * amdgpu_device_program_register_sequence - program an array of registers.
1076 *
1077 * @adev: amdgpu_device pointer
1078 * @registers: pointer to the register array
1079 * @array_size: size of the register array
1080 *
1081 * Programs an array of registers with AND and OR masks.
1082 * This is a helper for setting golden registers.
1083 */
1084void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1085 const u32 *registers,
1086 const u32 array_size)
1087{
1088 u32 tmp, reg, and_mask, or_mask;
1089 int i;
1090
1091 if (array_size % 3)
1092 return;
1093
1094 for (i = 0; i < array_size; i += 3) {
1095 reg = registers[i + 0];
1096 and_mask = registers[i + 1];
1097 or_mask = registers[i + 2];
1098
1099 if (and_mask == 0xffffffff) {
1100 tmp = or_mask;
1101 } else {
1102 tmp = RREG32(reg);
1103 tmp &= ~and_mask;
1104 if (adev->family >= AMDGPU_FAMILY_AI)
1105 tmp |= (or_mask & and_mask);
1106 else
1107 tmp |= or_mask;
1108 }
1109 WREG32(reg, tmp);
1110 }
1111}
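/*
 * Illustrative sketch (not part of this file): a golden register list is laid
 * out as {offset, and_mask, or_mask} triplets; an and_mask of 0xffffffff
 * means the or_mask is written as-is, anything else is a read-modify-write of
 * the masked bits. The offsets and values below are made up:
 *
 *	static const u32 example_golden_settings[] = {
 *		0x1234, 0xffffffff, 0x00000001,
 *		0x5678, 0x0000ff00, 0x00001200,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */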
1112
1113/**
1114 * amdgpu_device_pci_config_reset - reset the GPU
1115 *
1116 * @adev: amdgpu_device pointer
1117 *
1118 * Resets the GPU using the pci config reset sequence.
1119 * Only applicable to asics prior to vega10.
1120 */
1121void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1122{
1123 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1124}
1125
1126/**
1127 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1128 *
1129 * @adev: amdgpu_device pointer
1130 *
1131 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1132 */
1133int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1134{
1135 return pci_reset_function(adev->pdev);
1136}
1137
1138/*
1139 * GPU doorbell aperture helper functions.
1140 */
1141/**
1142 * amdgpu_device_doorbell_init - Init doorbell driver information.
1143 *
1144 * @adev: amdgpu_device pointer
1145 *
1146 * Init doorbell driver information (CIK)
1147 * Returns 0 on success, error on failure.
1148 */
1149static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
1150{
1151
1152 /* No doorbell on SI hardware generation */
1153 if (adev->asic_type < CHIP_BONAIRE) {
1154 adev->doorbell.base = 0;
1155 adev->doorbell.size = 0;
1156 adev->doorbell.num_kernel_doorbells = 0;
1157 adev->doorbell.ptr = NULL;
1158 return 0;
1159 }
1160
1161 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
1162 return -EINVAL;
1163
1164 amdgpu_asic_init_doorbell_index(adev);
1165
1166 /* doorbell bar mapping */
1167 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
1168 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
1169
1170 if (adev->enable_mes) {
1171 adev->doorbell.num_kernel_doorbells =
1172 adev->doorbell.size / sizeof(u32);
1173 } else {
1174 adev->doorbell.num_kernel_doorbells =
1175 min_t(u32, adev->doorbell.size / sizeof(u32),
1176 adev->doorbell_index.max_assignment+1);
1177 if (adev->doorbell.num_kernel_doorbells == 0)
1178 return -EINVAL;
1179
1180 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
1181 * paging queue doorbells use the second page. The
1182 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
1183 * doorbells are in the first page. So with the paging queue enabled,
1184 * the max num_kernel_doorbells should be increased by 1 page (0x400 in dwords).
1185 */
1186 if (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(4, 0, 0) &&
1187 adev->ip_versions[SDMA0_HWIP][0] < IP_VERSION(4, 2, 0))
1188 adev->doorbell.num_kernel_doorbells += 0x400;
1189 }
1190
1191 adev->doorbell.ptr = ioremap(adev->doorbell.base,
1192 adev->doorbell.num_kernel_doorbells *
1193 sizeof(u32));
1194 if (adev->doorbell.ptr == NULL)
1195 return -ENOMEM;
1196
1197 return 0;
1198}
1199
1200/**
1201 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1202 *
1203 * @adev: amdgpu_device pointer
1204 *
1205 * Tear down doorbell driver information (CIK)
1206 */
1207static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1208{
1209 iounmap(adev->doorbell.ptr);
1210 adev->doorbell.ptr = NULL;
1211}
1212
1213
1214
1215/*
1216 * amdgpu_device_wb_*()
1217 * Writeback is the method by which the GPU updates special pages in memory
1218 * with the status of certain GPU events (fences, ring pointers, etc.).
1219 */
1220
1221/**
1222 * amdgpu_device_wb_fini - Disable Writeback and free memory
1223 *
1224 * @adev: amdgpu_device pointer
1225 *
1226 * Disables Writeback and frees the Writeback memory (all asics).
1227 * Used at driver shutdown.
1228 */
1229static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1230{
1231 if (adev->wb.wb_obj) {
1232 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1233 &adev->wb.gpu_addr,
1234 (void **)&adev->wb.wb);
1235 adev->wb.wb_obj = NULL;
1236 }
1237}
1238
1239/**
1240 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1241 *
1242 * @adev: amdgpu_device pointer
1243 *
1244 * Initializes writeback and allocates writeback memory (all asics).
1245 * Used at driver startup.
1246 * Returns 0 on success or a negative error code on failure.
1247 */
1248static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1249{
1250 int r;
1251
1252 if (adev->wb.wb_obj == NULL) {
1253 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1254 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1255 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1256 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1257 (void **)&adev->wb.wb);
1258 if (r) {
1259 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1260 return r;
1261 }
1262
1263 adev->wb.num_wb = AMDGPU_MAX_WB;
1264 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1265
1266 /* clear wb memory */
1267 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1268 }
1269
1270 return 0;
1271}
1272
1273/**
1274 * amdgpu_device_wb_get - Allocate a wb entry
1275 *
1276 * @adev: amdgpu_device pointer
1277 * @wb: wb index
1278 *
1279 * Allocate a wb slot for use by the driver (all asics).
1280 * Returns 0 on success or -EINVAL on failure.
1281 */
1282int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1283{
1284 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1285
1286 if (offset < adev->wb.num_wb) {
1287 __set_bit(offset, adev->wb.used);
1288 *wb = offset << 3; /* convert to dw offset */
1289 return 0;
1290 } else {
1291 return -EINVAL;
1292 }
1293}
1294
1295/**
1296 * amdgpu_device_wb_free - Free a wb entry
1297 *
1298 * @adev: amdgpu_device pointer
1299 * @wb: wb index
1300 *
1301 * Free a wb slot allocated for use by the driver (all asics)
1302 */
1303void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1304{
1305 wb >>= 3;
1306 if (wb < adev->wb.num_wb)
1307 __clear_bit(wb, adev->wb.used);
1308}
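/*
 * Illustrative sketch (not part of this file): a typical user allocates a wb
 * slot, derives its CPU and GPU addresses (the returned index is a dword
 * offset), and frees it again on teardown; the local names are hypothetical:
 *
 *	u32 wb;
 *	int r = amdgpu_device_wb_get(adev, &wb);
 *
 *	if (!r) {
 *		volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *
 *		... use the slot ...
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */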
1309
1310/**
1311 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1312 *
1313 * @adev: amdgpu_device pointer
1314 *
1315 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1316 * to fail, but if any of the BARs is not accessible after the size we abort
1317 * driver loading by returning -ENODEV.
1318 */
1319int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1320{
1321 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1322 struct pci_bus *root;
1323 struct resource *res;
1324 unsigned i;
1325 u16 cmd;
1326 int r;
1327
1328 /* Bypass for VF */
1329 if (amdgpu_sriov_vf(adev))
1330 return 0;
1331
1332 /* skip if the bios has already enabled large BAR */
1333 if (adev->gmc.real_vram_size &&
1334 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1335 return 0;
1336
1337 /* Check if the root BUS has 64bit memory resources */
1338 root = adev->pdev->bus;
1339 while (root->parent)
1340 root = root->parent;
1341
1342 pci_bus_for_each_resource(root, res, i) {
1343 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1344 res->start > 0x100000000ull)
1345 break;
1346 }
1347
1348 /* Trying to resize is pointless without a root hub window above 4GB */
1349 if (!res)
1350 return 0;
1351
1352 /* Limit the BAR size to what is available */
1353 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1354 rbar_size);
1355
1356 /* Disable memory decoding while we change the BAR addresses and size */
1357 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1358 pci_write_config_word(adev->pdev, PCI_COMMAND,
1359 cmd & ~PCI_COMMAND_MEMORY);
1360
1361 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1362 amdgpu_device_doorbell_fini(adev);
1363 if (adev->asic_type >= CHIP_BONAIRE)
1364 pci_release_resource(adev->pdev, 2);
1365
1366 pci_release_resource(adev->pdev, 0);
1367
1368 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1369 if (r == -ENOSPC)
1370 DRM_INFO("Not enough PCI address space for a large BAR.");
1371 else if (r && r != -ENOTSUPP)
1372 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1373
1374 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1375
1376 /* When the doorbell or fb BAR isn't available we have no chance of
1377 * using the device.
1378 */
1379 r = amdgpu_device_doorbell_init(adev);
1380 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1381 return -ENODEV;
1382
1383 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1384
1385 return 0;
1386}
1387
1388static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1389{
1390 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) {
1391 return false;
1392 }
1393
1394 return true;
1395}
1396
1397/*
1398 * GPU helper functions.
1399 */
1400/**
1401 * amdgpu_device_need_post - check if the hw needs post or not
1402 *
1403 * @adev: amdgpu_device pointer
1404 *
1405 * Check if the asic has been initialized (all asics) at driver startup,
1406 * or if post is needed because a hw reset was performed.
1407 * Returns true if post is needed, false if not.
1408 */
1409bool amdgpu_device_need_post(struct amdgpu_device *adev)
1410{
1411 uint32_t reg;
1412
1413 if (amdgpu_sriov_vf(adev))
1414 return false;
1415
1416 if (!amdgpu_device_read_bios(adev))
1417 return false;
1418
1419 if (amdgpu_passthrough(adev)) {
1420 /* for FIJI: In the whole GPU pass-through virtualization case, after a VM
1421 * reboot some old SMC firmware still needs the driver to do vPost, otherwise
1422 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
1423 * force vPost for SMC versions below 22.15.
1424 */
1425 if (adev->asic_type == CHIP_FIJI) {
1426 int err;
1427 uint32_t fw_ver;
1428 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1429 /* force vPost if an error occurred */
1430 if (err)
1431 return true;
1432
1433 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1434 if (fw_ver < 0x00160e00)
1435 return true;
1436 }
1437 }
1438
1439 /* Don't post if we need to reset whole hive on init */
1440 if (adev->gmc.xgmi.pending_reset)
1441 return false;
1442
1443 if (adev->has_hw_reset) {
1444 adev->has_hw_reset = false;
1445 return true;
1446 }
1447
1448 /* bios scratch used on CIK+ */
1449 if (adev->asic_type >= CHIP_BONAIRE)
1450 return amdgpu_atombios_scratch_need_asic_init(adev);
1451
1452 /* check MEM_SIZE for older asics */
1453 reg = amdgpu_asic_get_config_memsize(adev);
1454
1455 if ((reg != 0) && (reg != 0xffffffff))
1456 return false;
1457
1458 return true;
1459}
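/*
 * Illustrative sketch (not part of this file): during early init this check
 * is usually paired with the atom post path, roughly:
 *
 *	if (amdgpu_device_need_post(adev)) {
 *		r = amdgpu_device_asic_init(adev);
 *		if (r)
 *			dev_err(adev->dev, "gpu post error!\n");
 *	}
 */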
1460
1461/*
1462 * On APUs with >= 64GB of RAM, white flickering has been observed with SG enabled.
1463 * Disable S/G on such systems until we have a proper fix.
1464 * https://gitlab.freedesktop.org/drm/amd/-/issues/2354
1465 * https://gitlab.freedesktop.org/drm/amd/-/issues/2735
1466 */
1467bool amdgpu_sg_display_supported(struct amdgpu_device *adev)
1468{
1469 switch (amdgpu_sg_display) {
1470 case -1:
1471 break;
1472 case 0:
1473 return false;
1474 case 1:
1475 return true;
1476 default:
1477 return false;
1478 }
1479 if ((totalram_pages() << (PAGE_SHIFT - 10)) +
1480 (adev->gmc.real_vram_size / 1024) >= 64000000) {
1481 DRM_WARN("Disabling S/G due to >=64GB RAM\n");
1482 return false;
1483 }
1484 return true;
1485}
1486
1487/*
1488 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1489 * speed switching. Until we have confirmation from Intel that a specific host
1490 * supports it, it's safer that we keep it disabled for all.
1491 *
1492 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1493 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1494 */
1495bool amdgpu_device_pcie_dynamic_switching_supported(void)
1496{
1497#if IS_ENABLED(CONFIG_X86)
1498 struct cpuinfo_x86 *c = &cpu_data(0);
1499
1500 if (c->x86_vendor == X86_VENDOR_INTEL)
1501 return false;
1502#endif
1503 return true;
1504}
1505
1506/**
1507 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1508 *
1509 * @adev: amdgpu_device pointer
1510 *
1511 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1512 * be set for this device.
1513 *
1514 * Returns true if it should be used or false if not.
1515 */
1516bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1517{
1518 switch (amdgpu_aspm) {
1519 case -1:
1520 break;
1521 case 0:
1522 return false;
1523 case 1:
1524 return true;
1525 default:
1526 return false;
1527 }
1528 return pcie_aspm_enabled(adev->pdev);
1529}
1530
1531bool amdgpu_device_aspm_support_quirk(void)
1532{
1533#if IS_ENABLED(CONFIG_X86)
1534 struct cpuinfo_x86 *c = &cpu_data(0);
1535
1536 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
1537#else
1538 return true;
1539#endif
1540}
1541
1542/* if we get transitioned to only one device, take VGA back */
1543/**
1544 * amdgpu_device_vga_set_decode - enable/disable vga decode
1545 *
1546 * @pdev: PCI device pointer
1547 * @state: enable/disable vga decode
1548 *
1549 * Enable/disable vga decode (all asics).
1550 * Returns VGA resource flags.
1551 */
1552static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1553 bool state)
1554{
1555 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1556 amdgpu_asic_set_vga_state(adev, state);
1557 if (state)
1558 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1559 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1560 else
1561 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1562}
1563
1564/**
1565 * amdgpu_device_check_block_size - validate the vm block size
1566 *
1567 * @adev: amdgpu_device pointer
1568 *
1569 * Validates the vm block size specified via module parameter.
1570 * The vm block size defines the number of bits in page table versus page directory;
1571 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1572 * page table and the remaining bits are in the page directory.
1573 */
1574static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1575{
1576 /* defines number of bits in page table versus page directory,
1577 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1578 * page table and the remaining bits are in the page directory */
1579 if (amdgpu_vm_block_size == -1)
1580 return;
1581
1582 if (amdgpu_vm_block_size < 9) {
1583 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1584 amdgpu_vm_block_size);
1585 amdgpu_vm_block_size = -1;
1586 }
1587}
1588
1589/**
1590 * amdgpu_device_check_vm_size - validate the vm size
1591 *
1592 * @adev: amdgpu_device pointer
1593 *
1594 * Validates the vm size in GB specified via module parameter.
1595 * The VM size is the size of the GPU virtual memory space in GB.
1596 */
1597static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1598{
1599 /* no need to check the default value */
1600 if (amdgpu_vm_size == -1)
1601 return;
1602
1603 if (amdgpu_vm_size < 1) {
1604 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1605 amdgpu_vm_size);
1606 amdgpu_vm_size = -1;
1607 }
1608}
1609
1610static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1611{
1612 struct sysinfo si;
1613 bool is_os_64 = (sizeof(void *) == 8);
1614 uint64_t total_memory;
1615 uint64_t dram_size_seven_GB = 0x1B8000000;
1616 uint64_t dram_size_three_GB = 0xB8000000;
1617
1618 if (amdgpu_smu_memory_pool_size == 0)
1619 return;
1620
1621 if (!is_os_64) {
1622 DRM_WARN("Not 64-bit OS, feature not supported\n");
1623 goto def_value;
1624 }
1625 si_meminfo(&si);
1626 total_memory = (uint64_t)si.totalram * si.mem_unit;
1627
1628 if ((amdgpu_smu_memory_pool_size == 1) ||
1629 (amdgpu_smu_memory_pool_size == 2)) {
1630 if (total_memory < dram_size_three_GB)
1631 goto def_value1;
1632 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1633 (amdgpu_smu_memory_pool_size == 8)) {
1634 if (total_memory < dram_size_seven_GB)
1635 goto def_value1;
1636 } else {
1637 DRM_WARN("Smu memory pool size not supported\n");
1638 goto def_value;
1639 }
1640 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1641
1642 return;
1643
1644def_value1:
1645 DRM_WARN("Not enough system memory\n");
1646def_value:
1647 adev->pm.smu_prv_buffer_size = 0;
1648}
1649
1650static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1651{
1652 if (!(adev->flags & AMD_IS_APU) ||
1653 adev->asic_type < CHIP_RAVEN)
1654 return 0;
1655
1656 switch (adev->asic_type) {
1657 case CHIP_RAVEN:
1658 if (adev->pdev->device == 0x15dd)
1659 adev->apu_flags |= AMD_APU_IS_RAVEN;
1660 if (adev->pdev->device == 0x15d8)
1661 adev->apu_flags |= AMD_APU_IS_PICASSO;
1662 break;
1663 case CHIP_RENOIR:
1664 if ((adev->pdev->device == 0x1636) ||
1665 (adev->pdev->device == 0x164c))
1666 adev->apu_flags |= AMD_APU_IS_RENOIR;
1667 else
1668 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1669 break;
1670 case CHIP_VANGOGH:
1671 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1672 break;
1673 case CHIP_YELLOW_CARP:
1674 break;
1675 case CHIP_CYAN_SKILLFISH:
1676 if ((adev->pdev->device == 0x13FE) ||
1677 (adev->pdev->device == 0x143F))
1678 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1679 break;
1680 default:
1681 break;
1682 }
1683
1684 return 0;
1685}
1686
1687/**
1688 * amdgpu_device_check_arguments - validate module params
1689 *
1690 * @adev: amdgpu_device pointer
1691 *
1692 * Validates certain module parameters and updates
1693 * the associated values used by the driver (all asics).
1694 */
1695static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1696{
1697 if (amdgpu_sched_jobs < 4) {
1698 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1699 amdgpu_sched_jobs);
1700 amdgpu_sched_jobs = 4;
1701 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1702 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1703 amdgpu_sched_jobs);
1704 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1705 }
1706
1707 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1708 /* gart size must be greater or equal to 32M */
1709 dev_warn(adev->dev, "gart size (%d) too small\n",
1710 amdgpu_gart_size);
1711 amdgpu_gart_size = -1;
1712 }
1713
1714 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1715 /* gtt size must be greater or equal to 32M */
1716 dev_warn(adev->dev, "gtt size (%d) too small\n",
1717 amdgpu_gtt_size);
1718 amdgpu_gtt_size = -1;
1719 }
1720
1721 /* valid range is between 4 and 9 inclusive */
1722 if (amdgpu_vm_fragment_size != -1 &&
1723 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1724 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1725 amdgpu_vm_fragment_size = -1;
1726 }
1727
1728 if (amdgpu_sched_hw_submission < 2) {
1729 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1730 amdgpu_sched_hw_submission);
1731 amdgpu_sched_hw_submission = 2;
1732 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1733 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1734 amdgpu_sched_hw_submission);
1735 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1736 }
1737
1738 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1739 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1740 amdgpu_reset_method = -1;
1741 }
1742
1743 amdgpu_device_check_smu_prv_buffer_size(adev);
1744
1745 amdgpu_device_check_vm_size(adev);
1746
1747 amdgpu_device_check_block_size(adev);
1748
1749 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1750
1751 return 0;
1752}
1753
1754/**
1755 * amdgpu_switcheroo_set_state - set switcheroo state
1756 *
1757 * @pdev: pci dev pointer
1758 * @state: vga_switcheroo state
1759 *
1760 * Callback for the switcheroo driver. Suspends or resumes
1761 * the asics before or after it is powered up using ACPI methods.
1762 */
1763static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1764 enum vga_switcheroo_state state)
1765{
1766 struct drm_device *dev = pci_get_drvdata(pdev);
1767 int r;
1768
1769 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1770 return;
1771
1772 if (state == VGA_SWITCHEROO_ON) {
1773 pr_info("switched on\n");
1774 /* don't suspend or resume card normally */
1775 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1776
1777 pci_set_power_state(pdev, PCI_D0);
1778 amdgpu_device_load_pci_state(pdev);
1779 r = pci_enable_device(pdev);
1780 if (r)
1781 DRM_WARN("pci_enable_device failed (%d)\n", r);
1782 amdgpu_device_resume(dev, true);
1783
1784 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1785 } else {
1786 pr_info("switched off\n");
1787 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1788 amdgpu_device_suspend(dev, true);
1789 amdgpu_device_cache_pci_state(pdev);
1790 /* Shut down the device */
1791 pci_disable_device(pdev);
1792 pci_set_power_state(pdev, PCI_D3cold);
1793 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1794 }
1795}
1796
1797/**
1798 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1799 *
1800 * @pdev: pci dev pointer
1801 *
1802 * Callback for the switcheroo driver. Check of the switcheroo
1803 * state can be changed.
1804 * Returns true if the state can be changed, false if not.
1805 */
1806static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1807{
1808 struct drm_device *dev = pci_get_drvdata(pdev);
1809
1810 /*
1811 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1812 * locking inversion with the driver load path. And the access here is
1813 * completely racy anyway. So don't bother with locking for now.
1814 */
1815 return atomic_read(&dev->open_count) == 0;
1816}
1817
1818static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1819 .set_gpu_state = amdgpu_switcheroo_set_state,
1820 .reprobe = NULL,
1821 .can_switch = amdgpu_switcheroo_can_switch,
1822};
1823
1824/**
1825 * amdgpu_device_ip_set_clockgating_state - set the CG state
1826 *
1827 * @dev: amdgpu_device pointer
1828 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1829 * @state: clockgating state (gate or ungate)
1830 *
1831 * Sets the requested clockgating state for all instances of
1832 * the hardware IP specified.
1833 * Returns the error code from the last instance.
1834 */
1835int amdgpu_device_ip_set_clockgating_state(void *dev,
1836 enum amd_ip_block_type block_type,
1837 enum amd_clockgating_state state)
1838{
1839 struct amdgpu_device *adev = dev;
1840 int i, r = 0;
1841
1842 for (i = 0; i < adev->num_ip_blocks; i++) {
1843 if (!adev->ip_blocks[i].status.valid)
1844 continue;
1845 if (adev->ip_blocks[i].version->type != block_type)
1846 continue;
1847 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1848 continue;
1849 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1850 (void *)adev, state);
1851 if (r)
1852 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1853 adev->ip_blocks[i].version->funcs->name, r);
1854 }
1855 return r;
1856}
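/*
 * Illustrative sketch (not part of this file): a typical caller gates or
 * ungates clocks for a single IP type, e.g. GFX:
 *
 *	r = amdgpu_device_ip_set_clockgating_state(adev,
 *						   AMD_IP_BLOCK_TYPE_GFX,
 *						   AMD_CG_STATE_GATE);
 *	if (r)
 *		dev_warn(adev->dev, "Failed to gate GFX clocks (%d)\n", r);
 */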
1857
1858/**
1859 * amdgpu_device_ip_set_powergating_state - set the PG state
1860 *
1861 * @dev: amdgpu_device pointer
1862 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1863 * @state: powergating state (gate or ungate)
1864 *
1865 * Sets the requested powergating state for all instances of
1866 * the hardware IP specified.
1867 * Returns the error code from the last instance.
1868 */
1869int amdgpu_device_ip_set_powergating_state(void *dev,
1870 enum amd_ip_block_type block_type,
1871 enum amd_powergating_state state)
1872{
1873 struct amdgpu_device *adev = dev;
1874 int i, r = 0;
1875
1876 for (i = 0; i < adev->num_ip_blocks; i++) {
1877 if (!adev->ip_blocks[i].status.valid)
1878 continue;
1879 if (adev->ip_blocks[i].version->type != block_type)
1880 continue;
1881 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1882 continue;
1883 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1884 (void *)adev, state);
1885 if (r)
1886 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1887 adev->ip_blocks[i].version->funcs->name, r);
1888 }
1889 return r;
1890}
1891
1892/**
1893 * amdgpu_device_ip_get_clockgating_state - get the CG state
1894 *
1895 * @adev: amdgpu_device pointer
1896 * @flags: clockgating feature flags
1897 *
1898 * Walks the list of IPs on the device and updates the clockgating
1899 * flags for each IP.
1900 * Updates @flags with the feature flags for each hardware IP where
1901 * clockgating is enabled.
1902 */
1903void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1904 u64 *flags)
1905{
1906 int i;
1907
1908 for (i = 0; i < adev->num_ip_blocks; i++) {
1909 if (!adev->ip_blocks[i].status.valid)
1910 continue;
1911 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1912 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1913 }
1914}
1915
1916/**
1917 * amdgpu_device_ip_wait_for_idle - wait for idle
1918 *
1919 * @adev: amdgpu_device pointer
1920 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1921 *
1922 * Waits for the requested hardware IP to be idle.
1923 * Returns 0 for success or a negative error code on failure.
1924 */
1925int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1926 enum amd_ip_block_type block_type)
1927{
1928 int i, r;
1929
1930 for (i = 0; i < adev->num_ip_blocks; i++) {
1931 if (!adev->ip_blocks[i].status.valid)
1932 continue;
1933 if (adev->ip_blocks[i].version->type == block_type) {
1934 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1935 if (r)
1936 return r;
1937 break;
1938 }
1939 }
1940 return 0;
1941
1942}
1943
1944/**
1945 * amdgpu_device_ip_is_idle - is the hardware IP idle
1946 *
1947 * @adev: amdgpu_device pointer
1948 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1949 *
1950 * Check if the hardware IP is idle or not.
1951 * Returns true if the IP is idle, false if not.
1952 */
1953bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1954 enum amd_ip_block_type block_type)
1955{
1956 int i;
1957
1958 for (i = 0; i < adev->num_ip_blocks; i++) {
1959 if (!adev->ip_blocks[i].status.valid)
1960 continue;
1961 if (adev->ip_blocks[i].version->type == block_type)
1962 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1963 }
1964 return true;
1965
1966}
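/*
 * Illustrative sketch (not part of the driver): a caller that needs the
 * GFX block quiescent before touching it could first poll and then block,
 * e.g.:
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GFX)) {
 *		int r = amdgpu_device_ip_wait_for_idle(adev,
 *						       AMD_IP_BLOCK_TYPE_GFX);
 *
 *		if (r)
 *			dev_warn(adev->dev, "GFX did not become idle (%d)\n", r);
 *	}
 */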
1967
1968/**
1969 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1970 *
1971 * @adev: amdgpu_device pointer
1972 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1973 *
1974 * Returns a pointer to the hardware IP block structure
1975 * if it exists for the asic, otherwise NULL.
1976 */
1977struct amdgpu_ip_block *
1978amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1979 enum amd_ip_block_type type)
1980{
1981 int i;
1982
1983 for (i = 0; i < adev->num_ip_blocks; i++)
1984 if (adev->ip_blocks[i].version->type == type)
1985 return &adev->ip_blocks[i];
1986
1987 return NULL;
1988}
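/*
 * Illustrative sketch (not part of the driver): looking up the GFX block
 * to report its version. The NULL check matters because the block may not
 * exist on a given asic.
 *
 *	struct amdgpu_ip_block *gfx =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *	if (gfx)
 *		DRM_INFO("GFX IP v%u.%u\n", gfx->version->major,
 *			 gfx->version->minor);
 */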
1989
1990/**
1991 * amdgpu_device_ip_block_version_cmp
1992 *
1993 * @adev: amdgpu_device pointer
1994 * @type: enum amd_ip_block_type
1995 * @major: major version
1996 * @minor: minor version
1997 *
1998 * Returns 0 if the IP block version is equal to or greater than the
1999 * requested version, or 1 if it is smaller or the ip_block doesn't exist.
2000 */
2001int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2002 enum amd_ip_block_type type,
2003 u32 major, u32 minor)
2004{
2005 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2006
2007 if (ip_block && ((ip_block->version->major > major) ||
2008 ((ip_block->version->major == major) &&
2009 (ip_block->version->minor >= minor))))
2010 return 0;
2011
2012 return 1;
2013}
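/*
 * Illustrative sketch (not part of the driver): gating a code path on a
 * minimum GFX IP version with the comparator above. Note the inverted
 * return convention: 0 means the block is at least the requested
 * major.minor, 1 means it is older or absent. The 9.0 value is just a
 * placeholder.
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *						9, 0)) {
 *		// GFX 9.0 or newer
 *	}
 */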
2014
2015/**
2016 * amdgpu_device_ip_block_add
2017 *
2018 * @adev: amdgpu_device pointer
2019 * @ip_block_version: pointer to the IP to add
2020 *
2021 * Adds the IP block driver information to the collection of IPs on the
2022 * asic. Returns 0 on success, or -EINVAL if @ip_block_version is NULL.
2023 */
2024int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2025 const struct amdgpu_ip_block_version *ip_block_version)
2026{
2027 if (!ip_block_version)
2028 return -EINVAL;
2029
2030 switch (ip_block_version->type) {
2031 case AMD_IP_BLOCK_TYPE_VCN:
2032 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2033 return 0;
2034 break;
2035 case AMD_IP_BLOCK_TYPE_JPEG:
2036 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2037 return 0;
2038 break;
2039 default:
2040 break;
2041 }
2042
2043 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
2044 ip_block_version->funcs->name);
2045
2046 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2047
2048 return 0;
2049}
2050
2051/**
2052 * amdgpu_device_enable_virtual_display - enable virtual display feature
2053 *
2054 * @adev: amdgpu_device pointer
2055 *
2056 * Enables the virtual display feature if the user has enabled it via
2057 * the module parameter virtual_display. This feature provides virtual
2058 * display hardware on headless boards or in virtualized environments.
2059 * This function parses and validates the configuration string specified by
2060 * the user and configures the virtual display settings (number of
2061 * virtual connectors, crtcs, etc.) specified.
2062 */
2063static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2064{
2065 adev->enable_virtual_display = false;
2066
2067 if (amdgpu_virtual_display) {
2068 const char *pci_address_name = pci_name(adev->pdev);
2069 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2070
2071 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2072 pciaddstr_tmp = pciaddstr;
2073 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2074 pciaddname = strsep(&pciaddname_tmp, ",");
2075 if (!strcmp("all", pciaddname)
2076 || !strcmp(pci_address_name, pciaddname)) {
2077 long num_crtc;
2078 int res = -1;
2079
2080 adev->enable_virtual_display = true;
2081
2082 if (pciaddname_tmp)
2083 res = kstrtol(pciaddname_tmp, 10,
2084 &num_crtc);
2085
2086 if (!res) {
2087 if (num_crtc < 1)
2088 num_crtc = 1;
2089 if (num_crtc > 6)
2090 num_crtc = 6;
2091 adev->mode_info.num_crtc = num_crtc;
2092 } else {
2093 adev->mode_info.num_crtc = 1;
2094 }
2095 break;
2096 }
2097 }
2098
2099 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2100 amdgpu_virtual_display, pci_address_name,
2101 adev->enable_virtual_display, adev->mode_info.num_crtc);
2102
2103 kfree(pciaddstr);
2104 }
2105}
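/*
 * Illustrative examples of the virtual_display module parameter parsed
 * above; the PCI addresses are placeholders. Entries are separated by ';',
 * and an optional ',N' suffix requests N virtual crtcs (clamped to 1..6,
 * default 1):
 *
 *	amdgpu.virtual_display=0000:03:00.0,2
 *		two virtual crtcs on the device at 0000:03:00.0
 *	amdgpu.virtual_display=all,1
 *		one virtual crtc on every amdgpu device
 *	amdgpu.virtual_display=0000:03:00.0;0000:04:00.0,4
 *		one crtc on the first device, four on the second
 */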
2106
2107void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2108{
2109 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2110 adev->mode_info.num_crtc = 1;
2111 adev->enable_virtual_display = true;
2112 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2113 adev->enable_virtual_display, adev->mode_info.num_crtc);
2114 }
2115}
2116
2117/**
2118 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2119 *
2120 * @adev: amdgpu_device pointer
2121 *
2122 * Parses the asic configuration parameters specified in the gpu info
2123 * firmware and makes them available to the driver for use in configuring
2124 * the asic.
2125 * Returns 0 on success, -EINVAL on failure.
2126 */
2127static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2128{
2129 const char *chip_name;
2130 char fw_name[40];
2131 int err;
2132 const struct gpu_info_firmware_header_v1_0 *hdr;
2133
2134 adev->firmware.gpu_info_fw = NULL;
2135
2136 if (adev->mman.discovery_bin) {
2137 /*
2138 * FIXME: The bounding box is still needed by Navi12, so
2139 * temporarily read it from gpu_info firmware. Should be dropped
2140 * when DAL no longer needs it.
2141 */
2142 if (adev->asic_type != CHIP_NAVI12)
2143 return 0;
2144 }
2145
2146 switch (adev->asic_type) {
2147 default:
2148 return 0;
2149 case CHIP_VEGA10:
2150 chip_name = "vega10";
2151 break;
2152 case CHIP_VEGA12:
2153 chip_name = "vega12";
2154 break;
2155 case CHIP_RAVEN:
2156 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2157 chip_name = "raven2";
2158 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2159 chip_name = "picasso";
2160 else
2161 chip_name = "raven";
2162 break;
2163 case CHIP_ARCTURUS:
2164 chip_name = "arcturus";
2165 break;
2166 case CHIP_NAVI12:
2167 chip_name = "navi12";
2168 break;
2169 }
2170
2171 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
2172 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
2173 if (err) {
2174 dev_err(adev->dev,
2175 "Failed to get gpu_info firmware \"%s\"\n",
2176 fw_name);
2177 goto out;
2178 }
2179
2180 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2181 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2182
2183 switch (hdr->version_major) {
2184 case 1:
2185 {
2186 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2187 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2188 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2189
2190 /*
2191 * Should be dropped when DAL no longer needs it.
2192 */
2193 if (adev->asic_type == CHIP_NAVI12)
2194 goto parse_soc_bounding_box;
2195
2196 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2197 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2198 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2199 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2200 adev->gfx.config.max_texture_channel_caches =
2201 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2202 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2203 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2204 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2205 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2206 adev->gfx.config.double_offchip_lds_buf =
2207 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2208 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2209 adev->gfx.cu_info.max_waves_per_simd =
2210 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2211 adev->gfx.cu_info.max_scratch_slots_per_cu =
2212 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2213 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2214 if (hdr->version_minor >= 1) {
2215 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2216 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2217 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2218 adev->gfx.config.num_sc_per_sh =
2219 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2220 adev->gfx.config.num_packer_per_sc =
2221 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2222 }
2223
2224parse_soc_bounding_box:
2225 /*
2226 * soc bounding box info is not integrated into the discovery table,
2227 * so we always need to parse it from the gpu info firmware if needed.
2228 */
2229 if (hdr->version_minor == 2) {
2230 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2231 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2232 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2233 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2234 }
2235 break;
2236 }
2237 default:
2238 dev_err(adev->dev,
2239 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2240 err = -EINVAL;
2241 goto out;
2242 }
2243out:
2244 return err;
2245}
2246
2247/**
2248 * amdgpu_device_ip_early_init - run early init for hardware IPs
2249 *
2250 * @adev: amdgpu_device pointer
2251 *
2252 * Early initialization pass for hardware IPs. The hardware IPs that make
2253 * up each asic are discovered and each IP's early_init callback is run. This
2254 * is the first stage in initializing the asic.
2255 * Returns 0 on success, negative error code on failure.
2256 */
2257static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2258{
2259 struct drm_device *dev = adev_to_drm(adev);
2260 struct pci_dev *parent;
2261 int i, r;
2262 bool total;
2263
2264 amdgpu_device_enable_virtual_display(adev);
2265
2266 if (amdgpu_sriov_vf(adev)) {
2267 r = amdgpu_virt_request_full_gpu(adev, true);
2268 if (r)
2269 return r;
2270 }
2271
2272 switch (adev->asic_type) {
2273#ifdef CONFIG_DRM_AMDGPU_SI
2274 case CHIP_VERDE:
2275 case CHIP_TAHITI:
2276 case CHIP_PITCAIRN:
2277 case CHIP_OLAND:
2278 case CHIP_HAINAN:
2279 adev->family = AMDGPU_FAMILY_SI;
2280 r = si_set_ip_blocks(adev);
2281 if (r)
2282 return r;
2283 break;
2284#endif
2285#ifdef CONFIG_DRM_AMDGPU_CIK
2286 case CHIP_BONAIRE:
2287 case CHIP_HAWAII:
2288 case CHIP_KAVERI:
2289 case CHIP_KABINI:
2290 case CHIP_MULLINS:
2291 if (adev->flags & AMD_IS_APU)
2292 adev->family = AMDGPU_FAMILY_KV;
2293 else
2294 adev->family = AMDGPU_FAMILY_CI;
2295
2296 r = cik_set_ip_blocks(adev);
2297 if (r)
2298 return r;
2299 break;
2300#endif
2301 case CHIP_TOPAZ:
2302 case CHIP_TONGA:
2303 case CHIP_FIJI:
2304 case CHIP_POLARIS10:
2305 case CHIP_POLARIS11:
2306 case CHIP_POLARIS12:
2307 case CHIP_VEGAM:
2308 case CHIP_CARRIZO:
2309 case CHIP_STONEY:
2310 if (adev->flags & AMD_IS_APU)
2311 adev->family = AMDGPU_FAMILY_CZ;
2312 else
2313 adev->family = AMDGPU_FAMILY_VI;
2314
2315 r = vi_set_ip_blocks(adev);
2316 if (r)
2317 return r;
2318 break;
2319 default:
2320 r = amdgpu_discovery_set_ip_blocks(adev);
2321 if (r)
2322 return r;
2323 break;
2324 }
2325
2326 if (amdgpu_has_atpx() &&
2327 (amdgpu_is_atpx_hybrid() ||
2328 amdgpu_has_atpx_dgpu_power_cntl()) &&
2329 ((adev->flags & AMD_IS_APU) == 0) &&
2330 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2331 adev->flags |= AMD_IS_PX;
2332
2333 if (!(adev->flags & AMD_IS_APU)) {
2334 parent = pci_upstream_bridge(adev->pdev);
2335 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2336 }
2337
2338
2339 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2340 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2341 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2342 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2343 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2344
2345 total = true;
2346 for (i = 0; i < adev->num_ip_blocks; i++) {
2347 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2348 DRM_WARN("disabled ip block: %d <%s>\n",
2349 i, adev->ip_blocks[i].version->funcs->name);
2350 adev->ip_blocks[i].status.valid = false;
2351 } else {
2352 if (adev->ip_blocks[i].version->funcs->early_init) {
2353 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2354 if (r == -ENOENT) {
2355 adev->ip_blocks[i].status.valid = false;
2356 } else if (r) {
2357 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2358 adev->ip_blocks[i].version->funcs->name, r);
2359 total = false;
2360 } else {
2361 adev->ip_blocks[i].status.valid = true;
2362 }
2363 } else {
2364 adev->ip_blocks[i].status.valid = true;
2365 }
2366 }
2367 /* get the vbios after the asic_funcs are set up */
2368 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2369 r = amdgpu_device_parse_gpu_info_fw(adev);
2370 if (r)
2371 return r;
2372
2373 /* Read BIOS */
2374 if (amdgpu_device_read_bios(adev)) {
2375 if (!amdgpu_get_bios(adev))
2376 return -EINVAL;
2377
2378 r = amdgpu_atombios_init(adev);
2379 if (r) {
2380 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2381 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2382 return r;
2383 }
2384 }
2385
2386 /* get pf2vf msg info at its earliest time */
2387 if (amdgpu_sriov_vf(adev))
2388 amdgpu_virt_init_data_exchange(adev);
2389
2390 }
2391 }
2392 if (!total)
2393 return -ENODEV;
2394
2395 amdgpu_amdkfd_device_probe(adev);
2396 adev->cg_flags &= amdgpu_cg_mask;
2397 adev->pg_flags &= amdgpu_pg_mask;
2398
2399 return 0;
2400}
2401
2402static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2403{
2404 int i, r;
2405
2406 for (i = 0; i < adev->num_ip_blocks; i++) {
2407 if (!adev->ip_blocks[i].status.sw)
2408 continue;
2409 if (adev->ip_blocks[i].status.hw)
2410 continue;
2411 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2412 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2413 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2414 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2415 if (r) {
2416 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2417 adev->ip_blocks[i].version->funcs->name, r);
2418 return r;
2419 }
2420 adev->ip_blocks[i].status.hw = true;
2421 }
2422 }
2423
2424 return 0;
2425}
2426
2427static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2428{
2429 int i, r;
2430
2431 for (i = 0; i < adev->num_ip_blocks; i++) {
2432 if (!adev->ip_blocks[i].status.sw)
2433 continue;
2434 if (adev->ip_blocks[i].status.hw)
2435 continue;
2436 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2437 if (r) {
2438 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2439 adev->ip_blocks[i].version->funcs->name, r);
2440 return r;
2441 }
2442 adev->ip_blocks[i].status.hw = true;
2443 }
2444
2445 return 0;
2446}
2447
2448static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2449{
2450 int r = 0;
2451 int i;
2452 uint32_t smu_version;
2453
2454 if (adev->asic_type >= CHIP_VEGA10) {
2455 for (i = 0; i < adev->num_ip_blocks; i++) {
2456 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2457 continue;
2458
2459 if (!adev->ip_blocks[i].status.sw)
2460 continue;
2461
2462 /* no need to do the fw loading again if already done */
2463 if (adev->ip_blocks[i].status.hw == true)
2464 break;
2465
2466 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2467 r = adev->ip_blocks[i].version->funcs->resume(adev);
2468 if (r) {
2469 DRM_ERROR("resume of IP block <%s> failed %d\n",
2470 adev->ip_blocks[i].version->funcs->name, r);
2471 return r;
2472 }
2473 } else {
2474 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2475 if (r) {
2476 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2477 adev->ip_blocks[i].version->funcs->name, r);
2478 return r;
2479 }
2480 }
2481
2482 adev->ip_blocks[i].status.hw = true;
2483 break;
2484 }
2485 }
2486
2487 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2488 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2489
2490 return r;
2491}
2492
2493static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2494{
2495 long timeout;
2496 int r, i;
2497
2498 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2499 struct amdgpu_ring *ring = adev->rings[i];
2500
2501 /* No need to setup the GPU scheduler for rings that don't need it */
2502 if (!ring || ring->no_scheduler)
2503 continue;
2504
2505 switch (ring->funcs->type) {
2506 case AMDGPU_RING_TYPE_GFX:
2507 timeout = adev->gfx_timeout;
2508 break;
2509 case AMDGPU_RING_TYPE_COMPUTE:
2510 timeout = adev->compute_timeout;
2511 break;
2512 case AMDGPU_RING_TYPE_SDMA:
2513 timeout = adev->sdma_timeout;
2514 break;
2515 default:
2516 timeout = adev->video_timeout;
2517 break;
2518 }
2519
2520 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2521 ring->num_hw_submission, 0,
2522 timeout, adev->reset_domain->wq,
2523 ring->sched_score, ring->name,
2524 adev->dev);
2525 if (r) {
2526 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2527 ring->name);
2528 return r;
2529 }
2530 }
2531
2532 amdgpu_xcp_update_partition_sched_list(adev);
2533
2534 return 0;
2535}
2536
2537
2538/**
2539 * amdgpu_device_ip_init - run init for hardware IPs
2540 *
2541 * @adev: amdgpu_device pointer
2542 *
2543 * Main initialization pass for hardware IPs. The list of all the hardware
2544 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2545 * are run. sw_init initializes the software state associated with each IP
2546 * and hw_init initializes the hardware associated with each IP.
2547 * Returns 0 on success, negative error code on failure.
2548 */
2549static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2550{
2551 int i, r;
2552
2553 r = amdgpu_ras_init(adev);
2554 if (r)
2555 return r;
2556
2557 for (i = 0; i < adev->num_ip_blocks; i++) {
2558 if (!adev->ip_blocks[i].status.valid)
2559 continue;
2560 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2561 if (r) {
2562 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2563 adev->ip_blocks[i].version->funcs->name, r);
2564 goto init_failed;
2565 }
2566 adev->ip_blocks[i].status.sw = true;
2567
2568 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2569 /* need to do common hw init early so everything is set up for gmc */
2570 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2571 if (r) {
2572 DRM_ERROR("hw_init %d failed %d\n", i, r);
2573 goto init_failed;
2574 }
2575 adev->ip_blocks[i].status.hw = true;
2576 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2577 /* need to do gmc hw init early so we can allocate gpu mem */
2578 /* Try to reserve bad pages early */
2579 if (amdgpu_sriov_vf(adev))
2580 amdgpu_virt_exchange_data(adev);
2581
2582 r = amdgpu_device_mem_scratch_init(adev);
2583 if (r) {
2584 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2585 goto init_failed;
2586 }
2587 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2588 if (r) {
2589 DRM_ERROR("hw_init %d failed %d\n", i, r);
2590 goto init_failed;
2591 }
2592 r = amdgpu_device_wb_init(adev);
2593 if (r) {
2594 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2595 goto init_failed;
2596 }
2597 adev->ip_blocks[i].status.hw = true;
2598
2599 /* right after GMC hw init, we create CSA */
2600 if (adev->gfx.mcbp) {
2601 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2602 AMDGPU_GEM_DOMAIN_VRAM |
2603 AMDGPU_GEM_DOMAIN_GTT,
2604 AMDGPU_CSA_SIZE);
2605 if (r) {
2606 DRM_ERROR("allocate CSA failed %d\n", r);
2607 goto init_failed;
2608 }
2609 }
2610 }
2611 }
2612
2613 if (amdgpu_sriov_vf(adev))
2614 amdgpu_virt_init_data_exchange(adev);
2615
2616 r = amdgpu_ib_pool_init(adev);
2617 if (r) {
2618 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2619 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2620 goto init_failed;
2621 }
2622
2623 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
2624 if (r)
2625 goto init_failed;
2626
2627 r = amdgpu_device_ip_hw_init_phase1(adev);
2628 if (r)
2629 goto init_failed;
2630
2631 r = amdgpu_device_fw_loading(adev);
2632 if (r)
2633 goto init_failed;
2634
2635 r = amdgpu_device_ip_hw_init_phase2(adev);
2636 if (r)
2637 goto init_failed;
2638
2639 /*
2640 * retired pages will be loaded from eeprom and reserved here;
2641 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2642 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2643 * functional for I2C communication, which is only true at this point.
2644 *
2645 * amdgpu_ras_recovery_init may fail, but the upper layers only care
2646 * about failures caused by a bad gpu situation and stop the amdgpu
2647 * init process accordingly. For other failure cases, it will still
2648 * release all the resources and print an error message, rather than
2649 * returning a negative value to the upper level.
2650 *
2651 * Note: theoretically, this should be called before all vram allocations
2652 * to protect retired pages from being abused.
2653 */
2654 r = amdgpu_ras_recovery_init(adev);
2655 if (r)
2656 goto init_failed;
2657
2658 /*
2659 * In case of XGMI, grab an extra reference on the reset domain for this device
2660 */
2661 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2662 if (amdgpu_xgmi_add_device(adev) == 0) {
2663 if (!amdgpu_sriov_vf(adev)) {
2664 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2665
2666 if (WARN_ON(!hive)) {
2667 r = -ENOENT;
2668 goto init_failed;
2669 }
2670
2671 if (!hive->reset_domain ||
2672 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2673 r = -ENOENT;
2674 amdgpu_put_xgmi_hive(hive);
2675 goto init_failed;
2676 }
2677
2678 /* Drop the early temporary reset domain we created for device */
2679 amdgpu_reset_put_reset_domain(adev->reset_domain);
2680 adev->reset_domain = hive->reset_domain;
2681 amdgpu_put_xgmi_hive(hive);
2682 }
2683 }
2684 }
2685
2686 r = amdgpu_device_init_schedulers(adev);
2687 if (r)
2688 goto init_failed;
2689
2690 /* Don't init kfd if the whole hive needs to be reset during init */
2691 if (!adev->gmc.xgmi.pending_reset) {
2692 kgd2kfd_init_zone_device(adev);
2693 amdgpu_amdkfd_device_init(adev);
2694 }
2695
2696 amdgpu_fru_get_product_info(adev);
2697
2698init_failed:
2699
2700 return r;
2701}
2702
2703/**
2704 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2705 *
2706 * @adev: amdgpu_device pointer
2707 *
2708 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2709 * this function before a GPU reset. If the value is retained after a
2710 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2711 */
2712static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2713{
2714 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2715}
2716
2717/**
2718 * amdgpu_device_check_vram_lost - check if vram is valid
2719 *
2720 * @adev: amdgpu_device pointer
2721 *
2722 * Checks the reset magic value written to the gart pointer in VRAM.
2723 * The driver calls this after a GPU reset to see if the contents of
2724 * VRAM have been lost or not.
2725 * Returns true if vram is lost, false if not.
2726 */
2727static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2728{
2729 if (memcmp(adev->gart.ptr, adev->reset_magic,
2730 AMDGPU_RESET_MAGIC_NUM))
2731 return true;
2732
2733 if (!amdgpu_in_reset(adev))
2734 return false;
2735
2736 /*
2737 * For all ASICs with baco/mode1 reset, the VRAM is
2738 * always assumed to be lost.
2739 */
2740 switch (amdgpu_asic_reset_method(adev)) {
2741 case AMD_RESET_METHOD_BACO:
2742 case AMD_RESET_METHOD_MODE1:
2743 return true;
2744 default:
2745 return false;
2746 }
2747}
2748
2749/**
2750 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2751 *
2752 * @adev: amdgpu_device pointer
2753 * @state: clockgating state (gate or ungate)
2754 *
2755 * The list of all the hardware IPs that make up the asic is walked and the
2756 * set_clockgating_state callbacks are run.
2757 * During late init this pass enables clockgating for the hardware IPs;
2758 * during fini or suspend it disables clockgating again.
2759 * Returns 0 on success, negative error code on failure.
2760 */
2761
2762int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2763 enum amd_clockgating_state state)
2764{
2765 int i, j, r;
2766
2767 if (amdgpu_emu_mode == 1)
2768 return 0;
2769
2770 for (j = 0; j < adev->num_ip_blocks; j++) {
2771 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2772 if (!adev->ip_blocks[i].status.late_initialized)
2773 continue;
2774 /* skip CG for GFX, SDMA on S0ix */
2775 if (adev->in_s0ix &&
2776 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2777 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2778 continue;
2779 /* skip CG for VCE/UVD, it's handled specially */
2780 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2781 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2782 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2783 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2784 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2785 /* enable clockgating to save power */
2786 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2787 state);
2788 if (r) {
2789 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2790 adev->ip_blocks[i].version->funcs->name, r);
2791 return r;
2792 }
2793 }
2794 }
2795
2796 return 0;
2797}
2798
2799int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2800 enum amd_powergating_state state)
2801{
2802 int i, j, r;
2803
2804 if (amdgpu_emu_mode == 1)
2805 return 0;
2806
2807 for (j = 0; j < adev->num_ip_blocks; j++) {
2808 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2809 if (!adev->ip_blocks[i].status.late_initialized)
2810 continue;
2811 /* skip PG for GFX, SDMA on S0ix */
2812 if (adev->in_s0ix &&
2813 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2814 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2815 continue;
2816 /* skip PG for VCE/UVD, it's handled specially */
2817 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2818 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2819 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2820 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2821 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2822 /* enable powergating to save power */
2823 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2824 state);
2825 if (r) {
2826 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2827 adev->ip_blocks[i].version->funcs->name, r);
2828 return r;
2829 }
2830 }
2831 }
2832 return 0;
2833}
2834
2835static int amdgpu_device_enable_mgpu_fan_boost(void)
2836{
2837 struct amdgpu_gpu_instance *gpu_ins;
2838 struct amdgpu_device *adev;
2839 int i, ret = 0;
2840
2841 mutex_lock(&mgpu_info.mutex);
2842
2843 /*
2844 * MGPU fan boost feature should be enabled
2845 * only when there are two or more dGPUs in
2846 * the system
2847 */
2848 if (mgpu_info.num_dgpu < 2)
2849 goto out;
2850
2851 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2852 gpu_ins = &(mgpu_info.gpu_ins[i]);
2853 adev = gpu_ins->adev;
2854 if (!(adev->flags & AMD_IS_APU) &&
2855 !gpu_ins->mgpu_fan_enabled) {
2856 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2857 if (ret)
2858 break;
2859
2860 gpu_ins->mgpu_fan_enabled = 1;
2861 }
2862 }
2863
2864out:
2865 mutex_unlock(&mgpu_info.mutex);
2866
2867 return ret;
2868}
2869
2870/**
2871 * amdgpu_device_ip_late_init - run late init for hardware IPs
2872 *
2873 * @adev: amdgpu_device pointer
2874 *
2875 * Late initialization pass for hardware IPs. The list of all the hardware
2876 * IPs that make up the asic is walked and the late_init callbacks are run.
2877 * late_init covers any special initialization that an IP requires
2878 * after all of the IPs have been initialized or something that needs to happen
2879 * late in the init process.
2880 * Returns 0 on success, negative error code on failure.
2881 */
2882static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2883{
2884 struct amdgpu_gpu_instance *gpu_instance;
2885 int i = 0, r;
2886
2887 for (i = 0; i < adev->num_ip_blocks; i++) {
2888 if (!adev->ip_blocks[i].status.hw)
2889 continue;
2890 if (adev->ip_blocks[i].version->funcs->late_init) {
2891 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2892 if (r) {
2893 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2894 adev->ip_blocks[i].version->funcs->name, r);
2895 return r;
2896 }
2897 }
2898 adev->ip_blocks[i].status.late_initialized = true;
2899 }
2900
2901 r = amdgpu_ras_late_init(adev);
2902 if (r) {
2903 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2904 return r;
2905 }
2906
2907 amdgpu_ras_set_error_query_ready(adev, true);
2908
2909 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2910 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2911
2912 amdgpu_device_fill_reset_magic(adev);
2913
2914 r = amdgpu_device_enable_mgpu_fan_boost();
2915 if (r)
2916 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2917
2918 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
2919 if (amdgpu_passthrough(adev) &&
2920 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2921 adev->asic_type == CHIP_ALDEBARAN))
2922 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2923
2924 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2925 mutex_lock(&mgpu_info.mutex);
2926
2927 /*
2928 * Reset device p-state to low as this was booted with high.
2929 *
2930 * This should be performed only after all devices from the same
2931 * hive get initialized.
2932 *
2933 * However, the number of devices in a hive is not known in advance;
2934 * they are counted one by one as the devices are initialized.
2935 *
2936 * So, we wait until all XGMI interlinked devices are initialized.
2937 * This may bring some delay as those devices may come from
2938 * different hives. But that should be OK.
2939 */
2940 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2941 for (i = 0; i < mgpu_info.num_gpu; i++) {
2942 gpu_instance = &(mgpu_info.gpu_ins[i]);
2943 if (gpu_instance->adev->flags & AMD_IS_APU)
2944 continue;
2945
2946 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2947 AMDGPU_XGMI_PSTATE_MIN);
2948 if (r) {
2949 DRM_ERROR("pstate setting failed (%d).\n", r);
2950 break;
2951 }
2952 }
2953 }
2954
2955 mutex_unlock(&mgpu_info.mutex);
2956 }
2957
2958 return 0;
2959}
2960
2961/**
2962 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2963 *
2964 * @adev: amdgpu_device pointer
2965 *
2966 * For ASICs that need to disable the SMC first
2967 */
2968static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2969{
2970 int i, r;
2971
2972 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2973 return;
2974
2975 for (i = 0; i < adev->num_ip_blocks; i++) {
2976 if (!adev->ip_blocks[i].status.hw)
2977 continue;
2978 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2979 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2980 /* XXX handle errors */
2981 if (r) {
2982 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2983 adev->ip_blocks[i].version->funcs->name, r);
2984 }
2985 adev->ip_blocks[i].status.hw = false;
2986 break;
2987 }
2988 }
2989}
2990
2991static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2992{
2993 int i, r;
2994
2995 for (i = 0; i < adev->num_ip_blocks; i++) {
2996 if (!adev->ip_blocks[i].version->funcs->early_fini)
2997 continue;
2998
2999 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
3000 if (r) {
3001 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3002 adev->ip_blocks[i].version->funcs->name, r);
3003 }
3004 }
3005
3006 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3007 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3008
3009 amdgpu_amdkfd_suspend(adev, false);
3010
3011 /* Workaround for ASICs that need to disable the SMC first */
3012 amdgpu_device_smu_fini_early(adev);
3013
3014 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3015 if (!adev->ip_blocks[i].status.hw)
3016 continue;
3017
3018 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3019 /* XXX handle errors */
3020 if (r) {
3021 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3022 adev->ip_blocks[i].version->funcs->name, r);
3023 }
3024
3025 adev->ip_blocks[i].status.hw = false;
3026 }
3027
3028 if (amdgpu_sriov_vf(adev)) {
3029 if (amdgpu_virt_release_full_gpu(adev, false))
3030 DRM_ERROR("failed to release exclusive mode on fini\n");
3031 }
3032
3033 return 0;
3034}
3035
3036/**
3037 * amdgpu_device_ip_fini - run fini for hardware IPs
3038 *
3039 * @adev: amdgpu_device pointer
3040 *
3041 * Main teardown pass for hardware IPs. The list of all the hardware
3042 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3043 * are run. hw_fini tears down the hardware associated with each IP
3044 * and sw_fini tears down any software state associated with each IP.
3045 * Returns 0 on success, negative error code on failure.
3046 */
3047static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3048{
3049 int i, r;
3050
3051 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3052 amdgpu_virt_release_ras_err_handler_data(adev);
3053
3054 if (adev->gmc.xgmi.num_physical_nodes > 1)
3055 amdgpu_xgmi_remove_device(adev);
3056
3057 amdgpu_amdkfd_device_fini_sw(adev);
3058
3059 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3060 if (!adev->ip_blocks[i].status.sw)
3061 continue;
3062
3063 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3064 amdgpu_ucode_free_bo(adev);
3065 amdgpu_free_static_csa(&adev->virt.csa_obj);
3066 amdgpu_device_wb_fini(adev);
3067 amdgpu_device_mem_scratch_fini(adev);
3068 amdgpu_ib_pool_fini(adev);
3069 }
3070
3071 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
3072 /* XXX handle errors */
3073 if (r) {
3074 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3075 adev->ip_blocks[i].version->funcs->name, r);
3076 }
3077 adev->ip_blocks[i].status.sw = false;
3078 adev->ip_blocks[i].status.valid = false;
3079 }
3080
3081 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3082 if (!adev->ip_blocks[i].status.late_initialized)
3083 continue;
3084 if (adev->ip_blocks[i].version->funcs->late_fini)
3085 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
3086 adev->ip_blocks[i].status.late_initialized = false;
3087 }
3088
3089 amdgpu_ras_fini(adev);
3090
3091 return 0;
3092}
3093
3094/**
3095 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3096 *
3097 * @work: work_struct.
3098 */
3099static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3100{
3101 struct amdgpu_device *adev =
3102 container_of(work, struct amdgpu_device, delayed_init_work.work);
3103 int r;
3104
3105 r = amdgpu_ib_ring_tests(adev);
3106 if (r)
3107 DRM_ERROR("ib ring test failed (%d).\n", r);
3108}
3109
3110static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3111{
3112 struct amdgpu_device *adev =
3113 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3114
3115 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3116 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3117
3118 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
3119 adev->gfx.gfx_off_state = true;
3120}
3121
3122/**
3123 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3124 *
3125 * @adev: amdgpu_device pointer
3126 *
3127 * Main suspend function for hardware IPs. The list of all the hardware
3128 * IPs that make up the asic is walked, clockgating is disabled and the
3129 * suspend callbacks are run. suspend puts the hardware and software state
3130 * in each IP into a state suitable for suspend.
3131 * Returns 0 on success, negative error code on failure.
3132 */
3133static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3134{
3135 int i, r;
3136
3137 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3138 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3139
3140 /*
3141 * Per the PMFW team's suggestion, the driver needs to disable the gfxoff
3142 * and df cstate features for the gpu reset (e.g. Mode1Reset)
3143 * scenario. Add the missing df cstate disablement here.
3144 */
3145 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3146 dev_warn(adev->dev, "Failed to disallow df cstate");
3147
3148 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3149 if (!adev->ip_blocks[i].status.valid)
3150 continue;
3151
3152 /* displays are handled separately */
3153 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3154 continue;
3155
3156 /* XXX handle errors */
3157 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3158 /* XXX handle errors */
3159 if (r) {
3160 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3161 adev->ip_blocks[i].version->funcs->name, r);
3162 return r;
3163 }
3164
3165 adev->ip_blocks[i].status.hw = false;
3166 }
3167
3168 return 0;
3169}
3170
3171/**
3172 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3173 *
3174 * @adev: amdgpu_device pointer
3175 *
3176 * Main suspend function for hardware IPs. The list of all the hardware
3177 * IPs that make up the asic is walked, clockgating is disabled and the
3178 * suspend callbacks are run. suspend puts the hardware and software state
3179 * in each IP into a state suitable for suspend.
3180 * Returns 0 on success, negative error code on failure.
3181 */
3182static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3183{
3184 int i, r;
3185
3186 if (adev->in_s0ix)
3187 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3188
3189 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3190 if (!adev->ip_blocks[i].status.valid)
3191 continue;
3192 /* displays are handled in phase1 */
3193 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3194 continue;
3195 /* PSP lost connection when err_event_athub occurs */
3196 if (amdgpu_ras_intr_triggered() &&
3197 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3198 adev->ip_blocks[i].status.hw = false;
3199 continue;
3200 }
3201
3202 /* skip unnecessary suspend if we have not initialized them yet */
3203 if (adev->gmc.xgmi.pending_reset &&
3204 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3205 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3206 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3207 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3208 adev->ip_blocks[i].status.hw = false;
3209 continue;
3210 }
3211
3212 /* skip suspend of gfx/mes and psp for S0ix
3213 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3214 * like at runtime. PSP is also part of the always on hardware
3215 * so no need to suspend it.
3216 */
3217 if (adev->in_s0ix &&
3218 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3219 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3220 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3221 continue;
3222
3223 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3224 if (adev->in_s0ix &&
3225 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3226 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3227 continue;
3228
3229 /* During cold boot, swPSP provides the IMU and RLC FW binaries to TOS.
3230 * These are kept in the TMR, hence PSP-TOS is expected to reuse them and
3231 * reload from this location, and RLC Autoload also gets loaded from here
3232 * based on the PMFW -> PSP message during the re-init sequence.
3233 * Therefore, the psp suspend & resume should be skipped to avoid destroying
3234 * the TMR and reloading the FWs again for IMU enabled APU ASICs.
3235 */
3236 if (amdgpu_in_reset(adev) &&
3237 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3238 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3239 continue;
3240
3241 /* XXX handle errors */
3242 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3243 /* XXX handle errors */
3244 if (r) {
3245 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3246 adev->ip_blocks[i].version->funcs->name, r);
3247 }
3248 adev->ip_blocks[i].status.hw = false;
3249 /* handle putting the SMC in the appropriate state */
3250 if (!amdgpu_sriov_vf(adev)) {
3251 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3252 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3253 if (r) {
3254 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3255 adev->mp1_state, r);
3256 return r;
3257 }
3258 }
3259 }
3260 }
3261
3262 return 0;
3263}
3264
3265/**
3266 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3267 *
3268 * @adev: amdgpu_device pointer
3269 *
3270 * Main suspend function for hardware IPs. The list of all the hardware
3271 * IPs that make up the asic is walked, clockgating is disabled and the
3272 * suspend callbacks are run. suspend puts the hardware and software state
3273 * in each IP into a state suitable for suspend.
3274 * Returns 0 on success, negative error code on failure.
3275 */
3276int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3277{
3278 int r;
3279
3280 if (amdgpu_sriov_vf(adev)) {
3281 amdgpu_virt_fini_data_exchange(adev);
3282 amdgpu_virt_request_full_gpu(adev, false);
3283 }
3284
3285 r = amdgpu_device_ip_suspend_phase1(adev);
3286 if (r)
3287 return r;
3288 r = amdgpu_device_ip_suspend_phase2(adev);
3289
3290 if (amdgpu_sriov_vf(adev))
3291 amdgpu_virt_release_full_gpu(adev, false);
3292
3293 return r;
3294}
3295
3296static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3297{
3298 int i, r;
3299
3300 static enum amd_ip_block_type ip_order[] = {
3301 AMD_IP_BLOCK_TYPE_COMMON,
3302 AMD_IP_BLOCK_TYPE_GMC,
3303 AMD_IP_BLOCK_TYPE_PSP,
3304 AMD_IP_BLOCK_TYPE_IH,
3305 };
3306
3307 for (i = 0; i < adev->num_ip_blocks; i++) {
3308 int j;
3309 struct amdgpu_ip_block *block;
3310
3311 block = &adev->ip_blocks[i];
3312 block->status.hw = false;
3313
3314 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3315
3316 if (block->version->type != ip_order[j] ||
3317 !block->status.valid)
3318 continue;
3319
3320 r = block->version->funcs->hw_init(adev);
3321 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3322 if (r)
3323 return r;
3324 block->status.hw = true;
3325 }
3326 }
3327
3328 return 0;
3329}
3330
3331static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3332{
3333 int i, r;
3334
3335 static enum amd_ip_block_type ip_order[] = {
3336 AMD_IP_BLOCK_TYPE_SMC,
3337 AMD_IP_BLOCK_TYPE_DCE,
3338 AMD_IP_BLOCK_TYPE_GFX,
3339 AMD_IP_BLOCK_TYPE_SDMA,
3340 AMD_IP_BLOCK_TYPE_MES,
3341 AMD_IP_BLOCK_TYPE_UVD,
3342 AMD_IP_BLOCK_TYPE_VCE,
3343 AMD_IP_BLOCK_TYPE_VCN,
3344 AMD_IP_BLOCK_TYPE_JPEG
3345 };
3346
3347 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3348 int j;
3349 struct amdgpu_ip_block *block;
3350
3351 for (j = 0; j < adev->num_ip_blocks; j++) {
3352 block = &adev->ip_blocks[j];
3353
3354 if (block->version->type != ip_order[i] ||
3355 !block->status.valid ||
3356 block->status.hw)
3357 continue;
3358
3359 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3360 r = block->version->funcs->resume(adev);
3361 else
3362 r = block->version->funcs->hw_init(adev);
3363
3364 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3365 if (r)
3366 return r;
3367 block->status.hw = true;
3368 }
3369 }
3370
3371 return 0;
3372}
3373
3374/**
3375 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3376 *
3377 * @adev: amdgpu_device pointer
3378 *
3379 * First resume function for hardware IPs. The list of all the hardware
3380 * IPs that make up the asic is walked and the resume callbacks are run for
3381 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3382 * after a suspend and updates the software state as necessary. This
3383 * function is also used for restoring the GPU after a GPU reset.
3384 * Returns 0 on success, negative error code on failure.
3385 */
3386static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3387{
3388 int i, r;
3389
3390 for (i = 0; i < adev->num_ip_blocks; i++) {
3391 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3392 continue;
3393 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3394 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3395 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3396 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3397
3398 r = adev->ip_blocks[i].version->funcs->resume(adev);
3399 if (r) {
3400 DRM_ERROR("resume of IP block <%s> failed %d\n",
3401 adev->ip_blocks[i].version->funcs->name, r);
3402 return r;
3403 }
3404 adev->ip_blocks[i].status.hw = true;
3405 }
3406 }
3407
3408 return 0;
3409}
3410
3411/**
3412 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3413 *
3414 * @adev: amdgpu_device pointer
3415 *
3416 * Second resume function for hardware IPs. The list of all the hardware
3417 * IPs that make up the asic is walked and the resume callbacks are run for
3418 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3419 * functional state after a suspend and updates the software state as
3420 * necessary. This function is also used for restoring the GPU after a GPU
3421 * reset.
3422 * Returns 0 on success, negative error code on failure.
3423 */
3424static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3425{
3426 int i, r;
3427
3428 for (i = 0; i < adev->num_ip_blocks; i++) {
3429 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3430 continue;
3431 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3432 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3433 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3434 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3435 continue;
3436 r = adev->ip_blocks[i].version->funcs->resume(adev);
3437 if (r) {
3438 DRM_ERROR("resume of IP block <%s> failed %d\n",
3439 adev->ip_blocks[i].version->funcs->name, r);
3440 return r;
3441 }
3442 adev->ip_blocks[i].status.hw = true;
3443 }
3444
3445 return 0;
3446}
3447
3448/**
3449 * amdgpu_device_ip_resume - run resume for hardware IPs
3450 *
3451 * @adev: amdgpu_device pointer
3452 *
3453 * Main resume function for hardware IPs. The hardware IPs
3454 * are split into two resume functions because they are
3455 * also used in recovering from a GPU reset and some additional
3456 * steps need to be taken between them. In this case (S3/S4) they are
3457 * run sequentially.
3458 * Returns 0 on success, negative error code on failure.
3459 */
3460static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3461{
3462 int r;
3463
3464 if (!adev->in_s0ix) {
3465 r = amdgpu_amdkfd_resume_iommu(adev);
3466 if (r)
3467 return r;
3468 }
3469
3470 r = amdgpu_device_ip_resume_phase1(adev);
3471 if (r)
3472 return r;
3473
3474 r = amdgpu_device_fw_loading(adev);
3475 if (r)
3476 return r;
3477
3478 r = amdgpu_device_ip_resume_phase2(adev);
3479
3480 return r;
3481}
3482
3483/**
3484 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3485 *
3486 * @adev: amdgpu_device pointer
3487 *
3488 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3489 */
3490static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3491{
3492 if (amdgpu_sriov_vf(adev)) {
3493 if (adev->is_atom_fw) {
3494 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3495 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3496 } else {
3497 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3498 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3499 }
3500
3501 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3502 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3503 }
3504}
3505
3506/**
3507 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3508 *
3509 * @asic_type: AMD asic type
3510 *
3511 * Check if there is DC (new modesetting infrastructure) support for an asic.
3512 * Returns true if DC has support, false if not.
3513 */
3514bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3515{
3516 switch (asic_type) {
3517#ifdef CONFIG_DRM_AMDGPU_SI
3518 case CHIP_HAINAN:
3519#endif
3520 case CHIP_TOPAZ:
3521 /* chips with no display hardware */
3522 return false;
3523#if defined(CONFIG_DRM_AMD_DC)
3524 case CHIP_TAHITI:
3525 case CHIP_PITCAIRN:
3526 case CHIP_VERDE:
3527 case CHIP_OLAND:
3528 /*
3529 * We have systems in the wild with these ASICs that require
3530 * LVDS and VGA support which is not supported with DC.
3531 *
3532 * Fallback to the non-DC driver here by default so as not to
3533 * cause regressions.
3534 */
3535#if defined(CONFIG_DRM_AMD_DC_SI)
3536 return amdgpu_dc > 0;
3537#else
3538 return false;
3539#endif
3540 case CHIP_BONAIRE:
3541 case CHIP_KAVERI:
3542 case CHIP_KABINI:
3543 case CHIP_MULLINS:
3544 /*
3545 * We have systems in the wild with these ASICs that require
3546 * VGA support which is not supported with DC.
3547 *
3548 * Fallback to the non-DC driver here by default so as not to
3549 * cause regressions.
3550 */
3551 return amdgpu_dc > 0;
3552 default:
3553 return amdgpu_dc != 0;
3554#else
3555 default:
3556 if (amdgpu_dc > 0)
3557 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3558 "but isn't supported by ASIC, ignoring\n");
3559 return false;
3560#endif
3561 }
3562}
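/*
 * Illustrative note: on the SI/CIK parts that fall back to the non-DC path
 * above, DC can still be requested explicitly with the amdgpu.dc module
 * parameter (assuming the kernel was built with the matching DC config):
 *
 *	amdgpu.dc=1
 */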
3563
3564/**
3565 * amdgpu_device_has_dc_support - check if dc is supported
3566 *
3567 * @adev: amdgpu_device pointer
3568 *
3569 * Returns true for supported, false for not supported
3570 */
3571bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3572{
3573 if (adev->enable_virtual_display ||
3574 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3575 return false;
3576
3577 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3578}
3579
3580static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3581{
3582 struct amdgpu_device *adev =
3583 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3584 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3585
3586 /* It's a bug to not have a hive within this function */
3587 if (WARN_ON(!hive))
3588 return;
3589
3590 /*
3591 * Use task barrier to synchronize all xgmi reset works across the
3592 * hive. task_barrier_enter and task_barrier_exit will block
3593 * until all the threads running the xgmi reset works reach
3594 * those points. task_barrier_full will do both blocks.
3595 */
3596 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3597
3598 task_barrier_enter(&hive->tb);
3599 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3600
3601 if (adev->asic_reset_res)
3602 goto fail;
3603
3604 task_barrier_exit(&hive->tb);
3605 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3606
3607 if (adev->asic_reset_res)
3608 goto fail;
3609
3610 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3611 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3612 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3613 } else {
3614
3615 task_barrier_full(&hive->tb);
3616 adev->asic_reset_res = amdgpu_asic_reset(adev);
3617 }
3618
3619fail:
3620 if (adev->asic_reset_res)
3621 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3622 adev->asic_reset_res, adev_to_drm(adev)->unique);
3623 amdgpu_put_xgmi_hive(hive);
3624}
3625
3626static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3627{
3628 char *input = amdgpu_lockup_timeout;
3629 char *timeout_setting = NULL;
3630 int index = 0;
3631 long timeout;
3632 int ret = 0;
3633
3634 /*
3635 * By default the timeout for non-compute jobs is 10000 ms
3636 * and 60000 ms for compute jobs.
3637 * In SR-IOV or passthrough mode, the timeout for compute
3638 * jobs is 60000 ms by default.
3639 */
3640 adev->gfx_timeout = msecs_to_jiffies(10000);
3641 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3642 if (amdgpu_sriov_vf(adev))
3643 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3644 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3645 else
3646 adev->compute_timeout = msecs_to_jiffies(60000);
3647
3648 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3649 while ((timeout_setting = strsep(&input, ",")) &&
3650 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3651 ret = kstrtol(timeout_setting, 0, &timeout);
3652 if (ret)
3653 return ret;
3654
3655 if (timeout == 0) {
3656 index++;
3657 continue;
3658 } else if (timeout < 0) {
3659 timeout = MAX_SCHEDULE_TIMEOUT;
3660 dev_warn(adev->dev, "lockup timeout disabled");
3661 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3662 } else {
3663 timeout = msecs_to_jiffies(timeout);
3664 }
3665
3666 switch (index++) {
3667 case 0:
3668 adev->gfx_timeout = timeout;
3669 break;
3670 case 1:
3671 adev->compute_timeout = timeout;
3672 break;
3673 case 2:
3674 adev->sdma_timeout = timeout;
3675 break;
3676 case 3:
3677 adev->video_timeout = timeout;
3678 break;
3679 default:
3680 break;
3681 }
3682 }
3683 /*
3684 * There is only one value specified and
3685 * it should apply to all non-compute jobs.
3686 */
3687 if (index == 1) {
3688 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3689 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3690 adev->compute_timeout = adev->gfx_timeout;
3691 }
3692 }
3693
3694 return ret;
3695}
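/*
 * Illustrative examples of the lockup_timeout module parameter parsed
 * above. Values are in milliseconds; 0 keeps the default and a negative
 * value disables the timeout. The order is gfx, compute, sdma, video:
 *
 *	amdgpu.lockup_timeout=10000,60000,10000,10000
 *		explicit per-engine timeouts
 *	amdgpu.lockup_timeout=20000
 *		a single value applies to all non-compute jobs; compute
 *		also takes it only in SR-IOV or passthrough mode
 */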
3696
3697/**
3698 * amdgpu_device_check_iommu_direct_map - check if RAM is directly mapped to the GPU
3699 *
3700 * @adev: amdgpu_device pointer
3701 *
3702 * RAM is directly mapped to the GPU if the IOMMU is not enabled or is in passthrough mode.
3703 */
3704static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3705{
3706 struct iommu_domain *domain;
3707
3708 domain = iommu_get_domain_for_dev(adev->dev);
3709 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3710 adev->ram_is_direct_mapped = true;
3711}
3712
3713static const struct attribute *amdgpu_dev_attributes[] = {
3714 &dev_attr_product_name.attr,
3715 &dev_attr_product_number.attr,
3716 &dev_attr_serial_number.attr,
3717 &dev_attr_pcie_replay_count.attr,
3718 NULL
3719};
3720
3721static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3722{
3723 if (amdgpu_mcbp == 1)
3724 adev->gfx.mcbp = true;
3725
3726 if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) &&
3727 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) &&
3728 adev->gfx.num_gfx_rings)
3729 adev->gfx.mcbp = true;
3730
3731 if (amdgpu_sriov_vf(adev))
3732 adev->gfx.mcbp = true;
3733
3734 if (adev->gfx.mcbp)
3735 DRM_INFO("MCBP is enabled\n");
3736}
3737
3738/**
3739 * amdgpu_device_init - initialize the driver
3740 *
3741 * @adev: amdgpu_device pointer
3742 * @flags: driver flags
3743 *
3744 * Initializes the driver info and hw (all asics).
3745 * Returns 0 for success or an error on failure.
3746 * Called at driver startup.
3747 */
3748int amdgpu_device_init(struct amdgpu_device *adev,
3749 uint32_t flags)
3750{
3751 struct drm_device *ddev = adev_to_drm(adev);
3752 struct pci_dev *pdev = adev->pdev;
3753 int r, i;
3754 bool px = false;
3755 u32 max_MBps;
3756 int tmp;
3757
3758 adev->shutdown = false;
3759 adev->flags = flags;
3760
3761 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3762 adev->asic_type = amdgpu_force_asic_type;
3763 else
3764 adev->asic_type = flags & AMD_ASIC_MASK;
3765
3766 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3767 if (amdgpu_emu_mode == 1)
3768 adev->usec_timeout *= 10;
3769 adev->gmc.gart_size = 512 * 1024 * 1024;
3770 adev->accel_working = false;
3771 adev->num_rings = 0;
3772 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3773 adev->mman.buffer_funcs = NULL;
3774 adev->mman.buffer_funcs_ring = NULL;
3775 adev->vm_manager.vm_pte_funcs = NULL;
3776 adev->vm_manager.vm_pte_num_scheds = 0;
3777 adev->gmc.gmc_funcs = NULL;
3778 adev->harvest_ip_mask = 0x0;
3779 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3780 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3781
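	/*
	 * Point every register access callback at the "invalid" stubs so a
	 * stray access before the real per-ASIC callbacks are installed is
	 * trapped instead of jumping through a NULL pointer.
	 */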
3782 adev->smc_rreg = &amdgpu_invalid_rreg;
3783 adev->smc_wreg = &amdgpu_invalid_wreg;
3784 adev->pcie_rreg = &amdgpu_invalid_rreg;
3785 adev->pcie_wreg = &amdgpu_invalid_wreg;
3786 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3787 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3788 adev->pciep_rreg = &amdgpu_invalid_rreg;
3789 adev->pciep_wreg = &amdgpu_invalid_wreg;
3790 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3791 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3792 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3793 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3794 adev->didt_rreg = &amdgpu_invalid_rreg;
3795 adev->didt_wreg = &amdgpu_invalid_wreg;
3796 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3797 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3798 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3799 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3800
3801 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3802 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3803 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3804
3805	/* mutex initialization is all done here so we
3806	 * can call these functions again without locking issues */
3807 mutex_init(&adev->firmware.mutex);
3808 mutex_init(&adev->pm.mutex);
3809 mutex_init(&adev->gfx.gpu_clock_mutex);
3810 mutex_init(&adev->srbm_mutex);
3811 mutex_init(&adev->gfx.pipe_reserve_mutex);
3812 mutex_init(&adev->gfx.gfx_off_mutex);
3813 mutex_init(&adev->gfx.partition_mutex);
3814 mutex_init(&adev->grbm_idx_mutex);
3815 mutex_init(&adev->mn_lock);
3816 mutex_init(&adev->virt.vf_errors.lock);
3817 hash_init(adev->mn_hash);
3818 mutex_init(&adev->psp.mutex);
3819 mutex_init(&adev->notifier_lock);
3820 mutex_init(&adev->pm.stable_pstate_ctx_lock);
3821 mutex_init(&adev->benchmark_mutex);
3822
3823 amdgpu_device_init_apu_flags(adev);
3824
3825 r = amdgpu_device_check_arguments(adev);
3826 if (r)
3827 return r;
3828
3829 spin_lock_init(&adev->mmio_idx_lock);
3830 spin_lock_init(&adev->smc_idx_lock);
3831 spin_lock_init(&adev->pcie_idx_lock);
3832 spin_lock_init(&adev->uvd_ctx_idx_lock);
3833 spin_lock_init(&adev->didt_idx_lock);
3834 spin_lock_init(&adev->gc_cac_idx_lock);
3835 spin_lock_init(&adev->se_cac_idx_lock);
3836 spin_lock_init(&adev->audio_endpt_idx_lock);
3837 spin_lock_init(&adev->mm_stats.lock);
3838
3839 INIT_LIST_HEAD(&adev->shadow_list);
3840 mutex_init(&adev->shadow_list_lock);
3841
3842 INIT_LIST_HEAD(&adev->reset_list);
3843
3844 INIT_LIST_HEAD(&adev->ras_list);
3845
3846 INIT_DELAYED_WORK(&adev->delayed_init_work,
3847 amdgpu_device_delayed_init_work_handler);
3848 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3849 amdgpu_device_delay_enable_gfx_off);
3850
3851 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3852
3853 adev->gfx.gfx_off_req_count = 1;
3854 adev->gfx.gfx_off_residency = 0;
3855 adev->gfx.gfx_off_entrycount = 0;
3856 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3857
3858 atomic_set(&adev->throttling_logging_enabled, 1);
3859 /*
3860 * If throttling continues, logging will be performed every minute
3861 * to avoid log flooding. "-1" is subtracted since the thermal
3862 * throttling interrupt comes every second. Thus, the total logging
3863	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3864	 * for the throttling interrupt) = 60 seconds.
3865 */
3866 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3867 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3868
3869 /* Registers mapping */
3870 /* TODO: block userspace mapping of io register */
3871 if (adev->asic_type >= CHIP_BONAIRE) {
3872 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3873 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3874 } else {
3875 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3876 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3877 }
3878
3879 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3880 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3881
3882 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3883 if (adev->rmmio == NULL) {
3884 return -ENOMEM;
3885 }
3886 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3887 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3888
3889 /*
3890	 * The reset domain needs to be present early, before the XGMI hive is
3891	 * discovered (if any) and initialized, so the reset sem and in_gpu_reset
3892	 * flag can be used early on during init and before any call to RREG32.
3893 */
3894 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3895 if (!adev->reset_domain)
3896 return -ENOMEM;
3897
3898 /* detect hw virtualization here */
3899 amdgpu_detect_virtualization(adev);
3900
3901 amdgpu_device_get_pcie_info(adev);
3902
3903 r = amdgpu_device_get_job_timeout_settings(adev);
3904 if (r) {
3905 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3906 return r;
3907 }
3908
3909 /* early init functions */
3910 r = amdgpu_device_ip_early_init(adev);
3911 if (r)
3912 return r;
3913
3914 amdgpu_device_set_mcbp(adev);
3915
3916 /* Get rid of things like offb */
3917 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3918 if (r)
3919 return r;
3920
3921 /* Enable TMZ based on IP_VERSION */
3922 amdgpu_gmc_tmz_set(adev);
3923
3924 amdgpu_gmc_noretry_set(adev);
3925	/* Need to get xgmi info early to decide the reset behavior */
3926 if (adev->gmc.xgmi.supported) {
3927 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3928 if (r)
3929 return r;
3930 }
3931
3932 /* enable PCIE atomic ops */
3933 if (amdgpu_sriov_vf(adev)) {
3934 if (adev->virt.fw_reserve.p_pf2vf)
3935 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3936 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3937 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3938		/* APUs with gfx9 onwards don't rely on PCIe atomics; rather, an
3939		 * internal path natively supports atomics, so set have_atomics_support to true.
3940 */
3941 } else if ((adev->flags & AMD_IS_APU) &&
3942 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3943 adev->have_atomics_support = true;
3944 } else {
3945 adev->have_atomics_support =
3946 !pci_enable_atomic_ops_to_root(adev->pdev,
3947 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3948 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3949 }
3950
3951 if (!adev->have_atomics_support)
3952 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3953
3954	/* doorbell bar mapping and doorbell index init */
3955 amdgpu_device_doorbell_init(adev);
3956
3957 if (amdgpu_emu_mode == 1) {
3958 /* post the asic on emulation mode */
3959 emu_soc_asic_init(adev);
3960 goto fence_driver_init;
3961 }
3962
3963 amdgpu_reset_init(adev);
3964
3965 /* detect if we are with an SRIOV vbios */
3966 if (adev->bios)
3967 amdgpu_device_detect_sriov_bios(adev);
3968
3969 /* check if we need to reset the asic
3970 * E.g., driver was not cleanly unloaded previously, etc.
3971 */
3972 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3973 if (adev->gmc.xgmi.num_physical_nodes) {
3974 dev_info(adev->dev, "Pending hive reset.\n");
3975 adev->gmc.xgmi.pending_reset = true;
3976 /* Only need to init necessary block for SMU to handle the reset */
3977 for (i = 0; i < adev->num_ip_blocks; i++) {
3978 if (!adev->ip_blocks[i].status.valid)
3979 continue;
3980 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3981 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3982 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3983 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3984 DRM_DEBUG("IP %s disabled for hw_init.\n",
3985 adev->ip_blocks[i].version->funcs->name);
3986 adev->ip_blocks[i].status.hw = true;
3987 }
3988 }
3989 } else {
3990 tmp = amdgpu_reset_method;
3991 /* It should do a default reset when loading or reloading the driver,
3992 * regardless of the module parameter reset_method.
3993 */
3994 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3995 r = amdgpu_asic_reset(adev);
3996 amdgpu_reset_method = tmp;
3997 if (r) {
3998 dev_err(adev->dev, "asic reset on init failed\n");
3999 goto failed;
4000 }
4001 }
4002 }
4003
4004 /* Post card if necessary */
4005 if (amdgpu_device_need_post(adev)) {
4006 if (!adev->bios) {
4007 dev_err(adev->dev, "no vBIOS found\n");
4008 r = -EINVAL;
4009 goto failed;
4010 }
4011 DRM_INFO("GPU posting now...\n");
4012 r = amdgpu_device_asic_init(adev);
4013 if (r) {
4014 dev_err(adev->dev, "gpu post error!\n");
4015 goto failed;
4016 }
4017 }
4018
4019 if (adev->bios) {
4020 if (adev->is_atom_fw) {
4021 /* Initialize clocks */
4022 r = amdgpu_atomfirmware_get_clock_info(adev);
4023 if (r) {
4024 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4025 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4026 goto failed;
4027 }
4028 } else {
4029 /* Initialize clocks */
4030 r = amdgpu_atombios_get_clock_info(adev);
4031 if (r) {
4032 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4033 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4034 goto failed;
4035 }
4036 /* init i2c buses */
4037 if (!amdgpu_device_has_dc_support(adev))
4038 amdgpu_atombios_i2c_init(adev);
4039 }
4040 }
4041
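/*
 * Emulation mode (amdgpu_emu_mode == 1) skips the vBIOS posting and clock
 * setup above and jumps straight here.
 */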
4042fence_driver_init:
4043 /* Fence driver */
4044 r = amdgpu_fence_driver_sw_init(adev);
4045 if (r) {
4046 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4047 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4048 goto failed;
4049 }
4050
4051 /* init the mode config */
4052 drm_mode_config_init(adev_to_drm(adev));
4053
4054 r = amdgpu_device_ip_init(adev);
4055 if (r) {
4056 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4057 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4058 goto release_ras_con;
4059 }
4060
4061 amdgpu_fence_driver_hw_init(adev);
4062
4063 dev_info(adev->dev,
4064 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4065 adev->gfx.config.max_shader_engines,
4066 adev->gfx.config.max_sh_per_se,
4067 adev->gfx.config.max_cu_per_sh,
4068 adev->gfx.cu_info.number);
4069
4070 adev->accel_working = true;
4071
4072 amdgpu_vm_check_compute_bug(adev);
4073
4074 /* Initialize the buffer migration limit. */
4075 if (amdgpu_moverate >= 0)
4076 max_MBps = amdgpu_moverate;
4077 else
4078 max_MBps = 8; /* Allow 8 MB/s. */
4079 /* Get a log2 for easy divisions. */
4080 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4081
4082 r = amdgpu_atombios_sysfs_init(adev);
4083 if (r)
4084 drm_err(&adev->ddev,
4085 "registering atombios sysfs failed (%d).\n", r);
4086
4087 r = amdgpu_pm_sysfs_init(adev);
4088 if (r)
4089 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4090
4091 r = amdgpu_ucode_sysfs_init(adev);
4092 if (r) {
4093 adev->ucode_sysfs_en = false;
4094 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4095 } else
4096 adev->ucode_sysfs_en = true;
4097
4098 r = amdgpu_psp_sysfs_init(adev);
4099 if (r) {
4100 adev->psp_sysfs_en = false;
4101 if (!amdgpu_sriov_vf(adev))
4102 DRM_ERROR("Creating psp sysfs failed\n");
4103 } else
4104 adev->psp_sysfs_en = true;
4105
4106 /*
4107 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4108	 * Otherwise the mgpu fan boost feature will be skipped because the
4109	 * gpu instance count would come up short.
4110 */
4111 amdgpu_register_gpu_instance(adev);
4112
4113 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4114 * explicit gating rather than handling it automatically.
4115 */
4116 if (!adev->gmc.xgmi.pending_reset) {
4117 r = amdgpu_device_ip_late_init(adev);
4118 if (r) {
4119 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4120 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4121 goto release_ras_con;
4122 }
4123 /* must succeed. */
4124 amdgpu_ras_resume(adev);
4125 queue_delayed_work(system_wq, &adev->delayed_init_work,
4126 msecs_to_jiffies(AMDGPU_RESUME_MS));
4127 }
4128
4129 if (amdgpu_sriov_vf(adev)) {
4130 amdgpu_virt_release_full_gpu(adev, true);
4131 flush_delayed_work(&adev->delayed_init_work);
4132 }
4133
4134 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
4135 if (r)
4136 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4137
4138	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
4139		r = amdgpu_pmu_init(adev);
4140		if (r)
4141			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4142	}

4143	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
4144 if (amdgpu_device_cache_pci_state(adev->pdev))
4145 pci_restore_state(pdev);
4146
4147 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4148 /* this will fail for cards that aren't VGA class devices, just
4149 * ignore it */
4150 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4151 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4152
4153 px = amdgpu_device_supports_px(ddev);
4154
4155 if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
4156 apple_gmux_detect(NULL, NULL)))
4157 vga_switcheroo_register_client(adev->pdev,
4158 &amdgpu_switcheroo_ops, px);
4159
4160 if (px)
4161 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4162
4163 if (adev->gmc.xgmi.pending_reset)
4164 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
4165 msecs_to_jiffies(AMDGPU_RESUME_MS));
4166
4167 amdgpu_device_check_iommu_direct_map(adev);
4168
4169 return 0;
4170
4171release_ras_con:
4172 if (amdgpu_sriov_vf(adev))
4173 amdgpu_virt_release_full_gpu(adev, true);
4174
4175 /* failed in exclusive mode due to timeout */
4176 if (amdgpu_sriov_vf(adev) &&
4177 !amdgpu_sriov_runtime(adev) &&
4178 amdgpu_virt_mmio_blocked(adev) &&
4179 !amdgpu_virt_wait_reset(adev)) {
4180 dev_err(adev->dev, "VF exclusive mode timeout\n");
4181 /* Don't send request since VF is inactive. */
4182 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4183 adev->virt.ops = NULL;
4184 r = -EAGAIN;
4185 }
4186 amdgpu_release_ras_context(adev);
4187
4188failed:
4189 amdgpu_vf_error_trans_all(adev);
4190
4191 return r;
4192}
4193
4194static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4195{
4196
4197 /* Clear all CPU mappings pointing to this device */
4198 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4199
4200 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4201 amdgpu_device_doorbell_fini(adev);
4202
4203 iounmap(adev->rmmio);
4204 adev->rmmio = NULL;
4205 if (adev->mman.aper_base_kaddr)
4206 iounmap(adev->mman.aper_base_kaddr);
4207 adev->mman.aper_base_kaddr = NULL;
4208
4209 /* Memory manager related */
4210 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4211 arch_phys_wc_del(adev->gmc.vram_mtrr);
4212 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4213 }
4214}
4215
4216/**
4217 * amdgpu_device_fini_hw - tear down the driver
4218 *
4219 * @adev: amdgpu_device pointer
4220 *
4221 * Tear down the driver info (all asics).
4222 * Called at driver shutdown.
4223 */
4224void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4225{
4226 dev_info(adev->dev, "amdgpu: finishing device.\n");
4227 flush_delayed_work(&adev->delayed_init_work);
4228 adev->shutdown = true;
4229
4230	/* make sure the IB test has finished before entering exclusive mode
4231	 * to avoid preemption on the IB test
4232	 */
4233 if (amdgpu_sriov_vf(adev)) {
4234 amdgpu_virt_request_full_gpu(adev, false);
4235 amdgpu_virt_fini_data_exchange(adev);
4236 }
4237
4238 /* disable all interrupts */
4239 amdgpu_irq_disable_all(adev);
4240 if (adev->mode_info.mode_config_initialized) {
4241 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4242 drm_helper_force_disable_all(adev_to_drm(adev));
4243 else
4244 drm_atomic_helper_shutdown(adev_to_drm(adev));
4245 }
4246 amdgpu_fence_driver_hw_fini(adev);
4247
4248 if (adev->mman.initialized)
4249 drain_workqueue(adev->mman.bdev.wq);
4250
4251 if (adev->pm.sysfs_initialized)
4252 amdgpu_pm_sysfs_fini(adev);
4253 if (adev->ucode_sysfs_en)
4254 amdgpu_ucode_sysfs_fini(adev);
4255 if (adev->psp_sysfs_en)
4256 amdgpu_psp_sysfs_fini(adev);
4257 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4258
4259 /* disable ras feature must before hw fini */
4260 amdgpu_ras_pre_fini(adev);
4261
4262 amdgpu_device_ip_fini_early(adev);
4263
4264 amdgpu_irq_fini_hw(adev);
4265
4266 if (adev->mman.initialized)
4267 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4268
4269 amdgpu_gart_dummy_page_fini(adev);
4270
4271 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4272 amdgpu_device_unmap_mmio(adev);
4273
4274}
4275
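/*
 * Second stage of teardown: release the remaining software state once the
 * hardware side has been quiesced by amdgpu_device_fini_hw().
 */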
4276void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4277{
4278 int idx;
4279 bool px;
4280
4281 amdgpu_fence_driver_sw_fini(adev);
4282 amdgpu_device_ip_fini(adev);
4283 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4284 adev->accel_working = false;
4285 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4286
4287 amdgpu_reset_fini(adev);
4288
4289 /* free i2c buses */
4290 if (!amdgpu_device_has_dc_support(adev))
4291 amdgpu_i2c_fini(adev);
4292
4293 if (amdgpu_emu_mode != 1)
4294 amdgpu_atombios_fini(adev);
4295
4296 kfree(adev->bios);
4297 adev->bios = NULL;
4298
4299 px = amdgpu_device_supports_px(adev_to_drm(adev));
4300
4301 if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
4302 apple_gmux_detect(NULL, NULL)))
4303 vga_switcheroo_unregister_client(adev->pdev);
4304
4305 if (px)
4306 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4307
4308 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4309 vga_client_unregister(adev->pdev);
4310
4311 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4312
4313 iounmap(adev->rmmio);
4314 adev->rmmio = NULL;
4315 amdgpu_device_doorbell_fini(adev);
4316 drm_dev_exit(idx);
4317 }
4318
4319 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4320 amdgpu_pmu_fini(adev);
4321 if (adev->mman.discovery_bin)
4322 amdgpu_discovery_fini(adev);
4323
4324 amdgpu_reset_put_reset_domain(adev->reset_domain);
4325 adev->reset_domain = NULL;
4326
4327 kfree(adev->pci_state);
4328
4329}
4330
4331/**
4332 * amdgpu_device_evict_resources - evict device resources
4333 * @adev: amdgpu device object
4334 *
4335 * Evicts all TTM device resources (VRAM BOs, GART table) from the LRU list
4336 * of the VRAM memory type. Mainly used for evicting device resources
4337 * at suspend time.
4338 *
4339 */
4340static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4341{
4342 int ret;
4343
4344 /* No need to evict vram on APUs for suspend to ram or s2idle */
4345 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4346 return 0;
4347
4348 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4349 if (ret)
4350 DRM_WARN("evicting device resources failed\n");
4351 return ret;
4352}
4353
4354/*
4355 * Suspend & resume.
4356 */
4357/**
4358 * amdgpu_device_suspend - initiate device suspend
4359 *
4360 * @dev: drm dev pointer
4361 * @fbcon: notify the fbdev of suspend
4362 *
4363 * Puts the hw in the suspend state (all asics).
4364 * Returns 0 for success or an error on failure.
4365 * Called at driver suspend.
4366 */
4367int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4368{
4369 struct amdgpu_device *adev = drm_to_adev(dev);
4370 int r = 0;
4371
4372 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4373 return 0;
4374
4375 adev->in_suspend = true;
4376
4377 /* Evict the majority of BOs before grabbing the full access */
4378 r = amdgpu_device_evict_resources(adev);
4379 if (r)
4380 return r;
4381
4382 if (amdgpu_sriov_vf(adev)) {
4383 amdgpu_virt_fini_data_exchange(adev);
4384 r = amdgpu_virt_request_full_gpu(adev, false);
4385 if (r)
4386 return r;
4387 }
4388
4389 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4390 DRM_WARN("smart shift update failed\n");
4391
4392 if (fbcon)
4393 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4394
4395 cancel_delayed_work_sync(&adev->delayed_init_work);
4396
4397 amdgpu_ras_suspend(adev);
4398
4399 amdgpu_device_ip_suspend_phase1(adev);
4400
4401 if (!adev->in_s0ix)
4402 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4403
4404 r = amdgpu_device_evict_resources(adev);
4405 if (r)
4406 return r;
4407
4408 amdgpu_fence_driver_hw_fini(adev);
4409
4410 amdgpu_device_ip_suspend_phase2(adev);
4411
4412 if (amdgpu_sriov_vf(adev))
4413 amdgpu_virt_release_full_gpu(adev, false);
4414
4415 return 0;
4416}
4417
4418/**
4419 * amdgpu_device_resume - initiate device resume
4420 *
4421 * @dev: drm dev pointer
4422 * @fbcon: notify the fbdev of resume
4423 *
4424 * Bring the hw back to operating state (all asics).
4425 * Returns 0 for success or an error on failure.
4426 * Called at driver resume.
4427 */
4428int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4429{
4430 struct amdgpu_device *adev = drm_to_adev(dev);
4431 int r = 0;
4432
4433 if (amdgpu_sriov_vf(adev)) {
4434 r = amdgpu_virt_request_full_gpu(adev, true);
4435 if (r)
4436 return r;
4437 }
4438
4439 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4440 return 0;
4441
4442 if (adev->in_s0ix)
4443 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4444
4445 /* post card */
4446 if (amdgpu_device_need_post(adev)) {
4447 r = amdgpu_device_asic_init(adev);
4448 if (r)
4449 dev_err(adev->dev, "amdgpu asic init failed\n");
4450 }
4451
4452 r = amdgpu_device_ip_resume(adev);
4453
4454 if (r) {
4455 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4456 goto exit;
4457 }
4458 amdgpu_fence_driver_hw_init(adev);
4459
4460 r = amdgpu_device_ip_late_init(adev);
4461 if (r)
4462 goto exit;
4463
4464 queue_delayed_work(system_wq, &adev->delayed_init_work,
4465 msecs_to_jiffies(AMDGPU_RESUME_MS));
4466
4467 if (!adev->in_s0ix) {
4468 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4469 if (r)
4470 goto exit;
4471 }
4472
4473exit:
4474 if (amdgpu_sriov_vf(adev)) {
4475 amdgpu_virt_init_data_exchange(adev);
4476 amdgpu_virt_release_full_gpu(adev, true);
4477 }
4478
4479 if (r)
4480 return r;
4481
4482 /* Make sure IB tests flushed */
4483 flush_delayed_work(&adev->delayed_init_work);
4484
4485 if (fbcon)
4486 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4487
4488 amdgpu_ras_resume(adev);
4489
4490 if (adev->mode_info.num_crtc) {
4491 /*
4492 * Most of the connector probing functions try to acquire runtime pm
4493 * refs to ensure that the GPU is powered on when connector polling is
4494 * performed. Since we're calling this from a runtime PM callback,
4495 * trying to acquire rpm refs will cause us to deadlock.
4496 *
4497 * Since we're guaranteed to be holding the rpm lock, it's safe to
4498 * temporarily disable the rpm helpers so this doesn't deadlock us.
4499 */
4500#ifdef CONFIG_PM
4501 dev->dev->power.disable_depth++;
4502#endif
4503 if (!adev->dc_enabled)
4504 drm_helper_hpd_irq_event(dev);
4505 else
4506 drm_kms_helper_hotplug_event(dev);
4507#ifdef CONFIG_PM
4508 dev->dev->power.disable_depth--;
4509#endif
4510 }
4511 adev->in_suspend = false;
4512
4513 if (adev->enable_mes)
4514 amdgpu_mes_self_test(adev);
4515
4516 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4517 DRM_WARN("smart shift update failed\n");
4518
4519 return 0;
4520}
4521
4522/**
4523 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4524 *
4525 * @adev: amdgpu_device pointer
4526 *
4527 * The list of all the hardware IPs that make up the asic is walked and
4528 * the check_soft_reset callbacks are run. check_soft_reset determines
4529 * if the asic is still hung or not.
4530 * Returns true if any of the IPs are still in a hung state, false if not.
4531 */
4532static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4533{
4534 int i;
4535 bool asic_hang = false;
4536
4537 if (amdgpu_sriov_vf(adev))
4538 return true;
4539
4540 if (amdgpu_asic_need_full_reset(adev))
4541 return true;
4542
4543 for (i = 0; i < adev->num_ip_blocks; i++) {
4544 if (!adev->ip_blocks[i].status.valid)
4545 continue;
4546 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4547 adev->ip_blocks[i].status.hang =
4548 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4549 if (adev->ip_blocks[i].status.hang) {
4550 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4551 asic_hang = true;
4552 }
4553 }
4554 return asic_hang;
4555}
4556
4557/**
4558 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4559 *
4560 * @adev: amdgpu_device pointer
4561 *
4562 * The list of all the hardware IPs that make up the asic is walked and the
4563 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4564 * handles any IP specific hardware or software state changes that are
4565 * necessary for a soft reset to succeed.
4566 * Returns 0 on success, negative error code on failure.
4567 */
4568static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4569{
4570 int i, r = 0;
4571
4572 for (i = 0; i < adev->num_ip_blocks; i++) {
4573 if (!adev->ip_blocks[i].status.valid)
4574 continue;
4575 if (adev->ip_blocks[i].status.hang &&
4576 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4577 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4578 if (r)
4579 return r;
4580 }
4581 }
4582
4583 return 0;
4584}
4585
4586/**
4587 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4588 *
4589 * @adev: amdgpu_device pointer
4590 *
4591 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4592 * reset is necessary to recover.
4593 * Returns true if a full asic reset is required, false if not.
4594 */
4595static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4596{
4597 int i;
4598
4599 if (amdgpu_asic_need_full_reset(adev))
4600 return true;
4601
4602 for (i = 0; i < adev->num_ip_blocks; i++) {
4603 if (!adev->ip_blocks[i].status.valid)
4604 continue;
4605 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4606 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4607 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4608 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4609 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4610 if (adev->ip_blocks[i].status.hang) {
4611 dev_info(adev->dev, "Some block need full reset!\n");
4612 return true;
4613 }
4614 }
4615 }
4616 return false;
4617}
4618
4619/**
4620 * amdgpu_device_ip_soft_reset - do a soft reset
4621 *
4622 * @adev: amdgpu_device pointer
4623 *
4624 * The list of all the hardware IPs that make up the asic is walked and the
4625 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4626 * IP specific hardware or software state changes that are necessary to soft
4627 * reset the IP.
4628 * Returns 0 on success, negative error code on failure.
4629 */
4630static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4631{
4632 int i, r = 0;
4633
4634 for (i = 0; i < adev->num_ip_blocks; i++) {
4635 if (!adev->ip_blocks[i].status.valid)
4636 continue;
4637 if (adev->ip_blocks[i].status.hang &&
4638 adev->ip_blocks[i].version->funcs->soft_reset) {
4639 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4640 if (r)
4641 return r;
4642 }
4643 }
4644
4645 return 0;
4646}
4647
4648/**
4649 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4650 *
4651 * @adev: amdgpu_device pointer
4652 *
4653 * The list of all the hardware IPs that make up the asic is walked and the
4654 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4655 * handles any IP specific hardware or software state changes that are
4656 * necessary after the IP has been soft reset.
4657 * Returns 0 on success, negative error code on failure.
4658 */
4659static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4660{
4661 int i, r = 0;
4662
4663 for (i = 0; i < adev->num_ip_blocks; i++) {
4664 if (!adev->ip_blocks[i].status.valid)
4665 continue;
4666 if (adev->ip_blocks[i].status.hang &&
4667 adev->ip_blocks[i].version->funcs->post_soft_reset)
4668 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4669 if (r)
4670 return r;
4671 }
4672
4673 return 0;
4674}
4675
4676/**
4677 * amdgpu_device_recover_vram - Recover some VRAM contents
4678 *
4679 * @adev: amdgpu_device pointer
4680 *
4681 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4682 * restore things like GPUVM page tables after a GPU reset where
4683 * the contents of VRAM might be lost.
4684 *
4685 * Returns:
4686 * 0 on success, negative error code on failure.
4687 */
4688static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4689{
4690 struct dma_fence *fence = NULL, *next = NULL;
4691 struct amdgpu_bo *shadow;
4692 struct amdgpu_bo_vm *vmbo;
4693 long r = 1, tmo;
4694
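	/* SR-IOV in runtime mode gets a longer fence wait for each shadow restore */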
4695 if (amdgpu_sriov_runtime(adev))
4696 tmo = msecs_to_jiffies(8000);
4697 else
4698 tmo = msecs_to_jiffies(100);
4699
4700 dev_info(adev->dev, "recover vram bo from shadow start\n");
4701 mutex_lock(&adev->shadow_list_lock);
4702 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4703 /* If vm is compute context or adev is APU, shadow will be NULL */
4704 if (!vmbo->shadow)
4705 continue;
4706 shadow = vmbo->shadow;
4707
4708 /* No need to recover an evicted BO */
4709 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4710 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4711 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4712 continue;
4713
4714 r = amdgpu_bo_restore_shadow(shadow, &next);
4715 if (r)
4716 break;
4717
4718 if (fence) {
4719 tmo = dma_fence_wait_timeout(fence, false, tmo);
4720 dma_fence_put(fence);
4721 fence = next;
4722 if (tmo == 0) {
4723 r = -ETIMEDOUT;
4724 break;
4725 } else if (tmo < 0) {
4726 r = tmo;
4727 break;
4728 }
4729 } else {
4730 fence = next;
4731 }
4732 }
4733 mutex_unlock(&adev->shadow_list_lock);
4734
4735 if (fence)
4736 tmo = dma_fence_wait_timeout(fence, false, tmo);
4737 dma_fence_put(fence);
4738
4739 if (r < 0 || tmo <= 0) {
4740 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4741 return -EIO;
4742 }
4743
4744 dev_info(adev->dev, "recover vram bo from shadow done\n");
4745 return 0;
4746}
4747
4748
4749/**
4750 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4751 *
4752 * @adev: amdgpu_device pointer
4753 * @from_hypervisor: request from hypervisor
4754 *
4755 * Do a VF FLR and reinitialize the ASIC.
4756 * Returns 0 if it succeeded, otherwise an error.
4757 */
4758static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4759 bool from_hypervisor)
4760{
4761 int r;
4762 struct amdgpu_hive_info *hive = NULL;
4763 int retry_limit = 0;
4764
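/*
 * A VF FLR can fail transiently; for the errno values covered by
 * AMDGPU_RETRY_SRIOV_RESET() the whole sequence below is retried up to
 * AMDGPU_MAX_RETRY_LIMIT times.
 */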
4765retry:
4766 amdgpu_amdkfd_pre_reset(adev);
4767
4768 if (from_hypervisor)
4769 r = amdgpu_virt_request_full_gpu(adev, true);
4770 else
4771 r = amdgpu_virt_reset_gpu(adev);
4772 if (r)
4773 return r;
4774
4775 /* Resume IP prior to SMC */
4776 r = amdgpu_device_ip_reinit_early_sriov(adev);
4777 if (r)
4778 goto error;
4779
4780 amdgpu_virt_init_data_exchange(adev);
4781
4782 r = amdgpu_device_fw_loading(adev);
4783 if (r)
4784 return r;
4785
4786 /* now we are okay to resume SMC/CP/SDMA */
4787 r = amdgpu_device_ip_reinit_late_sriov(adev);
4788 if (r)
4789 goto error;
4790
4791 hive = amdgpu_get_xgmi_hive(adev);
4792 /* Update PSP FW topology after reset */
4793 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4794 r = amdgpu_xgmi_update_topology(hive, adev);
4795
4796 if (hive)
4797 amdgpu_put_xgmi_hive(hive);
4798
4799 if (!r) {
4800 amdgpu_irq_gpu_reset_resume_helper(adev);
4801 r = amdgpu_ib_ring_tests(adev);
4802
4803 amdgpu_amdkfd_post_reset(adev);
4804 }
4805
4806error:
4807 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4808 amdgpu_inc_vram_lost(adev);
4809 r = amdgpu_device_recover_vram(adev);
4810 }
4811 amdgpu_virt_release_full_gpu(adev, true);
4812
4813 if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4814 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4815 retry_limit++;
4816 goto retry;
4817 } else
4818 DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4819 }
4820
4821 return r;
4822}
4823
4824/**
4825 * amdgpu_device_has_job_running - check if there is any job in the mirror list
4826 *
4827 * @adev: amdgpu_device pointer
4828 *
4829 * Check if there is any job in the mirror list.
4830 */
4831bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4832{
4833 int i;
4834 struct drm_sched_job *job;
4835
4836 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4837 struct amdgpu_ring *ring = adev->rings[i];
4838
4839 if (!ring || !ring->sched.thread)
4840 continue;
4841
4842 spin_lock(&ring->sched.job_list_lock);
4843 job = list_first_entry_or_null(&ring->sched.pending_list,
4844 struct drm_sched_job, list);
4845 spin_unlock(&ring->sched.job_list_lock);
4846 if (job)
4847 return true;
4848 }
4849 return false;
4850}
4851
4852/**
4853 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4854 *
4855 * @adev: amdgpu_device pointer
4856 *
4857 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4858 * a hung GPU.
4859 */
4860bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4861{
4862
4863 if (amdgpu_gpu_recovery == 0)
4864 goto disabled;
4865
4866 /* Skip soft reset check in fatal error mode */
4867 if (!amdgpu_ras_is_poison_mode_supported(adev))
4868 return true;
4869
4870 if (amdgpu_sriov_vf(adev))
4871 return true;
4872
4873 if (amdgpu_gpu_recovery == -1) {
4874 switch (adev->asic_type) {
4875#ifdef CONFIG_DRM_AMDGPU_SI
4876 case CHIP_VERDE:
4877 case CHIP_TAHITI:
4878 case CHIP_PITCAIRN:
4879 case CHIP_OLAND:
4880 case CHIP_HAINAN:
4881#endif
4882#ifdef CONFIG_DRM_AMDGPU_CIK
4883 case CHIP_KAVERI:
4884 case CHIP_KABINI:
4885 case CHIP_MULLINS:
4886#endif
4887 case CHIP_CARRIZO:
4888 case CHIP_STONEY:
4889 case CHIP_CYAN_SKILLFISH:
4890 goto disabled;
4891 default:
4892 break;
4893 }
4894 }
4895
4896 return true;
4897
4898disabled:
4899 dev_info(adev->dev, "GPU recovery disabled.\n");
4900 return false;
4901}
4902
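/**
 * amdgpu_device_mode1_reset - perform a full (mode 1) ASIC reset
 *
 * @adev: amdgpu_device pointer
 *
 * Disables bus mastering and caches the PCI config space, then resets the
 * whole ASIC either through the SMU, when it supports mode 1 reset, or
 * through the PSP, and finally waits for the ASIC to come back.
 * Returns 0 on success, an error code otherwise.
 */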
4903int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4904{
4905 u32 i;
4906 int ret = 0;
4907
4908 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4909
4910 dev_info(adev->dev, "GPU mode1 reset\n");
4911
4912 /* disable BM */
4913 pci_clear_master(adev->pdev);
4914
4915 amdgpu_device_cache_pci_state(adev->pdev);
4916
4917 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4918 dev_info(adev->dev, "GPU smu mode1 reset\n");
4919 ret = amdgpu_dpm_mode1_reset(adev);
4920 } else {
4921 dev_info(adev->dev, "GPU psp mode1 reset\n");
4922 ret = psp_gpu_reset(adev);
4923 }
4924
4925 if (ret)
4926 dev_err(adev->dev, "GPU mode1 reset failed\n");
4927
4928 amdgpu_device_load_pci_state(adev->pdev);
4929
4930 /* wait for asic to come out of reset */
4931 for (i = 0; i < adev->usec_timeout; i++) {
4932 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4933
4934 if (memsize != 0xffffffff)
4935 break;
4936 udelay(1);
4937 }
4938
4939 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4940 return ret;
4941}
4942
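/**
 * amdgpu_device_pre_asic_reset - prepare the device for an ASIC reset
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * Stops SR-IOV data exchange, clears and force-completes the hardware
 * fences of all rings, bumps the guilty job's karma and, on bare metal,
 * tries a soft reset first, suspending the IP blocks when a full reset
 * turns out to be required.
 * Returns 0 on success, an error code otherwise.
 */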
4943int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4944 struct amdgpu_reset_context *reset_context)
4945{
4946 int i, r = 0;
4947 struct amdgpu_job *job = NULL;
4948 bool need_full_reset =
4949 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4950
4951 if (reset_context->reset_req_dev == adev)
4952 job = reset_context->job;
4953
4954 if (amdgpu_sriov_vf(adev)) {
4955 /* stop the data exchange thread */
4956 amdgpu_virt_fini_data_exchange(adev);
4957 }
4958
4959 amdgpu_fence_driver_isr_toggle(adev, true);
4960
4961 /* block all schedulers and reset given job's ring */
4962 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4963 struct amdgpu_ring *ring = adev->rings[i];
4964
4965 if (!ring || !ring->sched.thread)
4966 continue;
4967
4968		/* Clear job fences from the fence drv to avoid force_completion;
4969		 * leave NULL and vm flush fences in the fence drv */
4970 amdgpu_fence_driver_clear_job_fences(ring);
4971
4972 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4973 amdgpu_fence_driver_force_completion(ring);
4974 }
4975
4976 amdgpu_fence_driver_isr_toggle(adev, false);
4977
4978 if (job && job->vm)
4979 drm_sched_increase_karma(&job->base);
4980
4981 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4982 /* If reset handler not implemented, continue; otherwise return */
4983 if (r == -ENOSYS)
4984 r = 0;
4985 else
4986 return r;
4987
4988 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4989 if (!amdgpu_sriov_vf(adev)) {
4990
4991 if (!need_full_reset)
4992 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4993
4994 if (!need_full_reset && amdgpu_gpu_recovery &&
4995 amdgpu_device_ip_check_soft_reset(adev)) {
4996 amdgpu_device_ip_pre_soft_reset(adev);
4997 r = amdgpu_device_ip_soft_reset(adev);
4998 amdgpu_device_ip_post_soft_reset(adev);
4999 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5000 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5001 need_full_reset = true;
5002 }
5003 }
5004
5005 if (need_full_reset)
5006 r = amdgpu_device_ip_suspend(adev);
5007 if (need_full_reset)
5008 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5009 else
5010 clear_bit(AMDGPU_NEED_FULL_RESET,
5011 &reset_context->flags);
5012 }
5013
5014 return r;
5015}
5016
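/*
 * Snapshot the configured reset dump registers so their pre-reset values
 * can be included in the devcoredump emitted below.
 */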
5017static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
5018{
5019 int i;
5020
5021 lockdep_assert_held(&adev->reset_domain->sem);
5022
5023 for (i = 0; i < adev->num_regs; i++) {
5024 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
5025 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
5026 adev->reset_dump_reg_value[i]);
5027 }
5028
5029 return 0;
5030}
5031
5032#ifdef CONFIG_DEV_COREDUMP
5033static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
5034 size_t count, void *data, size_t datalen)
5035{
5036 struct drm_printer p;
5037 struct amdgpu_device *adev = data;
5038 struct drm_print_iterator iter;
5039 int i;
5040
5041 iter.data = buffer;
5042 iter.offset = 0;
5043 iter.start = offset;
5044 iter.remain = count;
5045
5046 p = drm_coredump_printer(&iter);
5047
5048 drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
5049 drm_printf(&p, "kernel: " UTS_RELEASE "\n");
5050 drm_printf(&p, "module: " KBUILD_MODNAME "\n");
5051 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
5052 if (adev->reset_task_info.pid)
5053 drm_printf(&p, "process_name: %s PID: %d\n",
5054 adev->reset_task_info.process_name,
5055 adev->reset_task_info.pid);
5056
5057 if (adev->reset_vram_lost)
5058 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
5059 if (adev->num_regs) {
5060 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
5061
5062 for (i = 0; i < adev->num_regs; i++)
5063 drm_printf(&p, "0x%08x: 0x%08x\n",
5064 adev->reset_dump_reg_list[i],
5065 adev->reset_dump_reg_value[i]);
5066 }
5067
5068 return count - iter.remain;
5069}
5070
5071static void amdgpu_devcoredump_free(void *data)
5072{
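	/* The coredump data is the amdgpu_device itself, so there is nothing to free here. */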
5073}
5074
5075static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
5076{
5077 struct drm_device *dev = adev_to_drm(adev);
5078
5079 ktime_get_ts64(&adev->reset_time);
5080 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
5081 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
5082}
5083#endif
5084
5085int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5086 struct amdgpu_reset_context *reset_context)
5087{
5088 struct amdgpu_device *tmp_adev = NULL;
5089 bool need_full_reset, skip_hw_reset, vram_lost = false;
5090 int r = 0;
5091 bool gpu_reset_for_dev_remove = 0;
5092
5093 /* Try reset handler method first */
5094 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5095 reset_list);
5096 amdgpu_reset_reg_dumps(tmp_adev);
5097
5098 reset_context->reset_device_list = device_list_handle;
5099 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5100 /* If reset handler not implemented, continue; otherwise return */
5101 if (r == -ENOSYS)
5102 r = 0;
5103 else
5104 return r;
5105
5106 /* Reset handler not implemented, use the default method */
5107 need_full_reset =
5108 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5109 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5110
5111 gpu_reset_for_dev_remove =
5112 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5113 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5114
5115 /*
5116 * ASIC reset has to be done on all XGMI hive nodes ASAP
5117	 * to allow proper link negotiation in FW (within 1 sec)
5118 */
5119 if (!skip_hw_reset && need_full_reset) {
5120 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5121 /* For XGMI run all resets in parallel to speed up the process */
5122 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5123 tmp_adev->gmc.xgmi.pending_reset = false;
5124 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
5125 r = -EALREADY;
5126 } else
5127 r = amdgpu_asic_reset(tmp_adev);
5128
5129 if (r) {
5130 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
5131 r, adev_to_drm(tmp_adev)->unique);
5132 break;
5133 }
5134 }
5135
5136		/* For XGMI, wait for all resets to complete before proceeding */
5137 if (!r) {
5138 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5139 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5140 flush_work(&tmp_adev->xgmi_reset_work);
5141 r = tmp_adev->asic_reset_res;
5142 if (r)
5143 break;
5144 }
5145 }
5146 }
5147 }
5148
5149 if (!r && amdgpu_ras_intr_triggered()) {
5150 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5151 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
5152 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
5153 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
5154 }
5155
5156 amdgpu_ras_intr_cleared();
5157 }
5158
5159 /* Since the mode1 reset affects base ip blocks, the
5160 * phase1 ip blocks need to be resumed. Otherwise there
5161 * will be a BIOS signature error and the psp bootloader
5162 * can't load kdb on the next amdgpu install.
5163 */
5164 if (gpu_reset_for_dev_remove) {
5165 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
5166 amdgpu_device_ip_resume_phase1(tmp_adev);
5167
5168 goto end;
5169 }
5170
5171 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5172 if (need_full_reset) {
5173 /* post card */
5174 r = amdgpu_device_asic_init(tmp_adev);
5175 if (r) {
5176 dev_warn(tmp_adev->dev, "asic atom init failed!");
5177 } else {
5178 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5179 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
5180 if (r)
5181 goto out;
5182
5183 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5184 if (r)
5185 goto out;
5186
5187 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5188#ifdef CONFIG_DEV_COREDUMP
5189 tmp_adev->reset_vram_lost = vram_lost;
5190 memset(&tmp_adev->reset_task_info, 0,
5191 sizeof(tmp_adev->reset_task_info));
5192 if (reset_context->job && reset_context->job->vm)
5193 tmp_adev->reset_task_info =
5194 reset_context->job->vm->task_info;
5195 amdgpu_reset_capture_coredumpm(tmp_adev);
5196#endif
5197 if (vram_lost) {
5198 DRM_INFO("VRAM is lost due to GPU reset!\n");
5199 amdgpu_inc_vram_lost(tmp_adev);
5200 }
5201
5202 r = amdgpu_device_fw_loading(tmp_adev);
5203 if (r)
5204 return r;
5205
5206 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5207 if (r)
5208 goto out;
5209
5210 if (vram_lost)
5211 amdgpu_device_fill_reset_magic(tmp_adev);
5212
5213 /*
5214				 * Add this ASIC back as tracked since the reset
5215				 * already completed successfully.
5216 */
5217 amdgpu_register_gpu_instance(tmp_adev);
5218
5219 if (!reset_context->hive &&
5220 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5221 amdgpu_xgmi_add_device(tmp_adev);
5222
5223 r = amdgpu_device_ip_late_init(tmp_adev);
5224 if (r)
5225 goto out;
5226
5227 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5228
5229 /*
5230				 * The GPU enters a bad state once the number of
5231				 * faulty pages detected by ECC reaches the
5232				 * threshold, and RAS recovery is scheduled next.
5233				 * So add a check here to break recovery if it
5234				 * indeed exceeds the bad page threshold, and
5235				 * remind the user to retire this GPU or set a
5236				 * bigger bad_page_threshold value to fix this
5237				 * when probing the driver again.
5238 */
5239 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5240 /* must succeed. */
5241 amdgpu_ras_resume(tmp_adev);
5242 } else {
5243 r = -EINVAL;
5244 goto out;
5245 }
5246
5247 /* Update PSP FW topology after reset */
5248 if (reset_context->hive &&
5249 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5250 r = amdgpu_xgmi_update_topology(
5251 reset_context->hive, tmp_adev);
5252 }
5253 }
5254
5255out:
5256 if (!r) {
5257 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5258 r = amdgpu_ib_ring_tests(tmp_adev);
5259 if (r) {
5260 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5261 need_full_reset = true;
5262 r = -EAGAIN;
5263 goto end;
5264 }
5265 }
5266
5267 if (!r)
5268 r = amdgpu_device_recover_vram(tmp_adev);
5269 else
5270 tmp_adev->asic_reset_res = r;
5271 }
5272
5273end:
5274 if (need_full_reset)
5275 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5276 else
5277 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5278 return r;
5279}
5280
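/*
 * Tell the SMU (MP1) firmware which kind of reset is about to happen so it
 * can enter the matching state; amdgpu_device_unset_mp1_state() clears it
 * again once recovery is done.
 */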
5281static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5282{
5283
5284 switch (amdgpu_asic_reset_method(adev)) {
5285 case AMD_RESET_METHOD_MODE1:
5286 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5287 break;
5288 case AMD_RESET_METHOD_MODE2:
5289 adev->mp1_state = PP_MP1_STATE_RESET;
5290 break;
5291 default:
5292 adev->mp1_state = PP_MP1_STATE_NONE;
5293 break;
5294 }
5295}
5296
5297static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5298{
5299 amdgpu_vf_error_trans_all(adev);
5300 adev->mp1_state = PP_MP1_STATE_NONE;
5301}
5302
5303static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5304{
5305 struct pci_dev *p = NULL;
5306
5307 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5308 adev->pdev->bus->number, 1);
5309 if (p) {
5310 pm_runtime_enable(&(p->dev));
5311 pm_runtime_resume(&(p->dev));
5312 }
5313
5314 pci_dev_put(p);
5315}
5316
5317static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5318{
5319 enum amd_reset_method reset_method;
5320 struct pci_dev *p = NULL;
5321 u64 expires;
5322
5323 /*
5324 * For now, only BACO and mode1 reset are confirmed
5325	 * to suffer the audio issue if not properly suspended.
5326 */
5327 reset_method = amdgpu_asic_reset_method(adev);
5328 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5329 (reset_method != AMD_RESET_METHOD_MODE1))
5330 return -EINVAL;
5331
5332 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5333 adev->pdev->bus->number, 1);
5334 if (!p)
5335 return -ENODEV;
5336
5337 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5338 if (!expires)
5339 /*
5340 * If we cannot get the audio device autosuspend delay,
5341		 * a fixed 4s interval will be used. Since 3s is the audio
5342		 * controller's default autosuspend delay setting, the 4s
5343		 * used here is guaranteed to cover that.
5344 */
5345 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5346
5347 while (!pm_runtime_status_suspended(&(p->dev))) {
5348 if (!pm_runtime_suspend(&(p->dev)))
5349 break;
5350
5351 if (expires < ktime_get_mono_fast_ns()) {
5352 dev_warn(adev->dev, "failed to suspend display audio\n");
5353 pci_dev_put(p);
5354 /* TODO: abort the succeeding gpu reset? */
5355 return -ETIMEDOUT;
5356 }
5357 }
5358
5359 pm_runtime_disable(&(p->dev));
5360
5361 pci_dev_put(p);
5362 return 0;
5363}
5364
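/*
 * Cancel reset work queued from other paths (debugfs trigger, KFD, SR-IOV
 * FLR, RAS recovery) so it does not race with the reset already in progress.
 */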
5365static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5366{
5367 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5368
5369#if defined(CONFIG_DEBUG_FS)
5370 if (!amdgpu_sriov_vf(adev))
5371 cancel_work(&adev->reset_work);
5372#endif
5373
5374 if (adev->kfd.dev)
5375 cancel_work(&adev->kfd.reset_work);
5376
5377 if (amdgpu_sriov_vf(adev))
5378 cancel_work(&adev->virt.flr_work);
5379
5380 if (con && adev->ras_enabled)
5381 cancel_work(&con->recovery_work);
5382
5383}
5384
5385/**
5386 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5387 *
5388 * @adev: amdgpu_device pointer
5389 * @job: which job trigger hang
5390 * @reset_context: amdgpu reset context pointer
5391 *
5392 * Attempt to reset the GPU if it has hung (all asics).
5393 * Attempt to do a soft reset or full reset and reinitialize the ASIC.
5394 * Returns 0 for success or an error on failure.
5395 */
5396
5397int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5398 struct amdgpu_job *job,
5399 struct amdgpu_reset_context *reset_context)
5400{
5401 struct list_head device_list, *device_list_handle = NULL;
5402 bool job_signaled = false;
5403 struct amdgpu_hive_info *hive = NULL;
5404 struct amdgpu_device *tmp_adev = NULL;
5405 int i, r = 0;
5406 bool need_emergency_restart = false;
5407 bool audio_suspended = false;
5408 bool gpu_reset_for_dev_remove = false;
5409
5410 gpu_reset_for_dev_remove =
5411 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5412 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5413
5414 /*
5415 * Special case: RAS triggered and full reset isn't supported
5416 */
5417 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5418
5419 /*
5420 * Flush RAM to disk so that after reboot
5421	 * the user can read the log and see why the system rebooted.
5422 */
5423 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5424 DRM_WARN("Emergency reboot.");
5425
5426 ksys_sync_helper();
5427 emergency_restart();
5428 }
5429
5430 dev_info(adev->dev, "GPU %s begin!\n",
5431 need_emergency_restart ? "jobs stop":"reset");
5432
5433 if (!amdgpu_sriov_vf(adev))
5434 hive = amdgpu_get_xgmi_hive(adev);
5435 if (hive)
5436 mutex_lock(&hive->hive_lock);
5437
5438 reset_context->job = job;
5439 reset_context->hive = hive;
5440 /*
5441 * Build list of devices to reset.
5442 * In case we are in XGMI hive mode, resort the device list
5443 * to put adev in the 1st position.
5444 */
5445 INIT_LIST_HEAD(&device_list);
5446 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5447 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5448 list_add_tail(&tmp_adev->reset_list, &device_list);
5449 if (gpu_reset_for_dev_remove && adev->shutdown)
5450 tmp_adev->shutdown = true;
5451 }
5452 if (!list_is_first(&adev->reset_list, &device_list))
5453 list_rotate_to_front(&adev->reset_list, &device_list);
5454 device_list_handle = &device_list;
5455 } else {
5456 list_add_tail(&adev->reset_list, &device_list);
5457 device_list_handle = &device_list;
5458 }
5459
5460 /* We need to lock reset domain only once both for XGMI and single device */
5461 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5462 reset_list);
5463 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5464
5465 /* block all schedulers and reset given job's ring */
5466 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5467
5468 amdgpu_device_set_mp1_state(tmp_adev);
5469
5470 /*
5471 * Try to put the audio codec into suspend state
5472		 * before the gpu reset starts.
5473		 *
5474		 * The power domain of the graphics device is shared
5475		 * with the AZ power domain. Without this, we may
5476		 * change the audio hardware behind the audio
5477		 * driver's back, which will trigger some audio
5478		 * codec errors.
5479 */
5480 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5481 audio_suspended = true;
5482
5483 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5484
5485 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5486
5487 if (!amdgpu_sriov_vf(tmp_adev))
5488 amdgpu_amdkfd_pre_reset(tmp_adev);
5489
5490 /*
5491		 * Mark the ASICs to be reset as untracked first,
5492		 * and add them back after the reset has completed.
5493 */
5494 amdgpu_unregister_gpu_instance(tmp_adev);
5495
5496 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5497
5498 /* disable ras on ALL IPs */
5499 if (!need_emergency_restart &&
5500 amdgpu_device_ip_need_full_reset(tmp_adev))
5501 amdgpu_ras_suspend(tmp_adev);
5502
5503 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5504 struct amdgpu_ring *ring = tmp_adev->rings[i];
5505
5506 if (!ring || !ring->sched.thread)
5507 continue;
5508
5509 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5510
5511 if (need_emergency_restart)
5512 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5513 }
5514 atomic_inc(&tmp_adev->gpu_reset_counter);
5515 }
5516
5517 if (need_emergency_restart)
5518 goto skip_sched_resume;
5519
5520 /*
5521 * Must check guilty signal here since after this point all old
5522 * HW fences are force signaled.
5523 *
5524 * job->base holds a reference to parent fence
5525 */
5526 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5527 job_signaled = true;
5528 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5529 goto skip_hw_reset;
5530 }
5531
5532retry: /* Rest of adevs pre asic reset from XGMI hive. */
5533 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5534 if (gpu_reset_for_dev_remove) {
5535			/* Workaround for ASICs that need to disable the SMC first */
5536 amdgpu_device_smu_fini_early(tmp_adev);
5537 }
5538 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5539		/* TODO: Should we stop? */
5540 if (r) {
5541 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5542 r, adev_to_drm(tmp_adev)->unique);
5543 tmp_adev->asic_reset_res = r;
5544 }
5545
5546 /*
5547		 * Drop all pending non-scheduler resets. Scheduler resets
5548		 * were already dropped during drm_sched_stop.
5549 */
5550 amdgpu_device_stop_pending_resets(tmp_adev);
5551 }
5552
5553 /* Actual ASIC resets if needed.*/
5554 /* Host driver will handle XGMI hive reset for SRIOV */
5555 if (amdgpu_sriov_vf(adev)) {
5556 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5557 if (r)
5558 adev->asic_reset_res = r;
5559
5560		/* Aldebaran and gfx_11_0_3 support RAS in SR-IOV, so we need to resume RAS during reset */
5561 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5562 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5563 amdgpu_ras_resume(adev);
5564 } else {
5565 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5566 if (r && r == -EAGAIN)
5567 goto retry;
5568
5569 if (!r && gpu_reset_for_dev_remove)
5570 goto recover_end;
5571 }
5572
5573skip_hw_reset:
5574
5575	/* Post ASIC reset for all devs. */
5576 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5577
5578 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5579 struct amdgpu_ring *ring = tmp_adev->rings[i];
5580
5581 if (!ring || !ring->sched.thread)
5582 continue;
5583
5584 drm_sched_start(&ring->sched, true);
5585 }
5586
5587 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5588 amdgpu_mes_self_test(tmp_adev);
5589
5590 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5591 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5592 }
5593
5594 if (tmp_adev->asic_reset_res)
5595 r = tmp_adev->asic_reset_res;
5596
5597 tmp_adev->asic_reset_res = 0;
5598
5599 if (r) {
5600			/* bad news, how to tell it to userspace? */
5601 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5602 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5603 } else {
5604 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5605 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5606 DRM_WARN("smart shift update failed\n");
5607 }
5608 }
5609
5610skip_sched_resume:
5611 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5612 /* unlock kfd: SRIOV would do it separately */
5613 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5614 amdgpu_amdkfd_post_reset(tmp_adev);
5615
5616 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5617 * so bring up kfd here if it wasn't initialized before
5618 */
5619 if (!tmp_adev->kfd.init_complete)
5620 amdgpu_amdkfd_device_init(tmp_adev);
5621
5622 if (audio_suspended)
5623 amdgpu_device_resume_display_audio(tmp_adev);
5624
5625 amdgpu_device_unset_mp1_state(tmp_adev);
5626
5627 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5628 }
5629
5630recover_end:
5631 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5632 reset_list);
5633 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5634
5635 if (hive) {
5636 mutex_unlock(&hive->hive_lock);
5637 amdgpu_put_xgmi_hive(hive);
5638 }
5639
5640 if (r)
5641 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5642
5643 atomic_set(&adev->reset_domain->reset_res, r);
5644 return r;
5645}
5646
5647/**
5648 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5649 *
5650 * @adev: amdgpu_device pointer
5651 *
5652 * Fetches and stores in the driver the PCIE capabilities (gen speed
5653 * and lanes) of the slot the device is in. Handles APUs and
5654 * virtualized environments where PCIE config space may not be available.
5655 */
5656static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5657{
5658 struct pci_dev *pdev;
5659 enum pci_bus_speed speed_cap, platform_speed_cap;
5660 enum pcie_link_width platform_link_width;
5661
5662 if (amdgpu_pcie_gen_cap)
5663 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5664
5665 if (amdgpu_pcie_lane_cap)
5666 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5667
5668 /* covers APUs as well */
5669 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5670 if (adev->pm.pcie_gen_mask == 0)
5671 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5672 if (adev->pm.pcie_mlw_mask == 0)
5673 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5674 return;
5675 }
5676
5677 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5678 return;
5679
5680 pcie_bandwidth_available(adev->pdev, NULL,
5681 &platform_speed_cap, &platform_link_width);
5682
5683 if (adev->pm.pcie_gen_mask == 0) {
5684 /* asic caps */
5685 pdev = adev->pdev;
5686 speed_cap = pcie_get_speed_cap(pdev);
5687 if (speed_cap == PCI_SPEED_UNKNOWN) {
5688 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5689 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5690 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5691 } else {
5692 if (speed_cap == PCIE_SPEED_32_0GT)
5693 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5694 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5695 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5696 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5697 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5698 else if (speed_cap == PCIE_SPEED_16_0GT)
5699 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5700 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5701 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5702 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5703 else if (speed_cap == PCIE_SPEED_8_0GT)
5704 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5705 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5706 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5707 else if (speed_cap == PCIE_SPEED_5_0GT)
5708 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5709 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5710 else
5711 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5712 }
5713 /* platform caps */
5714 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5715 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5716 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5717 } else {
5718 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5719 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5720 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5721 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5722 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5723 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5724 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5725 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5726 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5727 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5728 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5729 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5730 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5731 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5732 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5733 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5734 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5735 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5736 else
5737 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5738
5739 }
5740 }
5741 if (adev->pm.pcie_mlw_mask == 0) {
5742 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5743 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5744 } else {
5745 switch (platform_link_width) {
5746 case PCIE_LNK_X32:
5747 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5748 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5749 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5750 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5751 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5752 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5753 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5754 break;
5755 case PCIE_LNK_X16:
5756 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5757 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5758 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5759 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5760 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5761 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5762 break;
5763 case PCIE_LNK_X12:
5764 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5765 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5766 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5767 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5768 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5769 break;
5770 case PCIE_LNK_X8:
5771 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5772 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5773 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5774 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5775 break;
5776 case PCIE_LNK_X4:
5777 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5778 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5779 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5780 break;
5781 case PCIE_LNK_X2:
5782 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5783 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5784 break;
5785 case PCIE_LNK_X1:
5786 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5787 break;
5788 default:
5789 break;
5790 }
5791 }
5792 }
5793}
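/*
 * Illustrative sketch (not part of the driver): once the masks above have
 * been populated, a caller could derive the highest PCIe gen supported by
 * the platform roughly like this. The helper name is hypothetical; the
 * CAIL_PCIE_LINK_SPEED_SUPPORT_GENx flags are the ones set above.
 *
 *	static unsigned int example_max_platform_pcie_gen(struct amdgpu_device *adev)
 *	{
 *		u32 mask = adev->pm.pcie_gen_mask;
 *
 *		if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5)
 *			return 5;
 *		if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4)
 *			return 4;
 *		if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
 *			return 3;
 *		if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2)
 *			return 2;
 *		return 1;
 *	}
 */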
5794
5795/**
5796 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5797 *
5798 * @adev: amdgpu_device pointer
5799 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5800 *
5801 * Return true if @peer_adev can access (DMA) @adev through the PCIe
5802 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5803 * @peer_adev.
5804 */
5805bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5806 struct amdgpu_device *peer_adev)
5807{
5808#ifdef CONFIG_HSA_AMD_P2P
5809 uint64_t address_mask = peer_adev->dev->dma_mask ?
5810 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5811 resource_size_t aper_limit =
5812 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5813 bool p2p_access =
5814 !adev->gmc.xgmi.connected_to_cpu &&
5815 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5816
5817 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5818 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5819 !(adev->gmc.aper_base & address_mask ||
5820 aper_limit & address_mask));
5821#else
5822 return false;
5823#endif
5824}
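/*
 * Illustrative usage (hypothetical caller, not part of the driver): a fast
 * peer-to-peer path would be gated on the check above and otherwise fall
 * back to staging through system memory.
 *
 *	if (amdgpu_device_is_peer_accessible(adev, peer_adev)) {
 *		// peer_adev can DMA directly into adev's fully visible VRAM BAR
 *	} else {
 *		// bounce the copy through system memory instead
 *	}
 */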
5825
5826int amdgpu_device_baco_enter(struct drm_device *dev)
5827{
5828 struct amdgpu_device *adev = drm_to_adev(dev);
5829 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5830
5831 if (!amdgpu_device_supports_baco(dev))
5832 return -ENOTSUPP;
5833
5834 if (ras && adev->ras_enabled &&
5835 adev->nbio.funcs->enable_doorbell_interrupt)
5836 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5837
5838 return amdgpu_dpm_baco_enter(adev);
5839}
5840
5841int amdgpu_device_baco_exit(struct drm_device *dev)
5842{
5843 struct amdgpu_device *adev = drm_to_adev(dev);
5844 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5845 int ret = 0;
5846
5847 if (!amdgpu_device_supports_baco(dev))
5848 return -ENOTSUPP;
5849
5850 ret = amdgpu_dpm_baco_exit(adev);
5851 if (ret)
5852 return ret;
5853
5854 if (ras && adev->ras_enabled &&
5855 adev->nbio.funcs->enable_doorbell_interrupt)
5856 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5857
5858 if (amdgpu_passthrough(adev) &&
5859 adev->nbio.funcs->clear_doorbell_interrupt)
5860 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5861
5862 return 0;
5863}
5864
5865/**
5866 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5867 * @pdev: PCI device struct
5868 * @state: PCI channel state
5869 *
5870 * Description: Called when a PCI error is detected.
5871 *
5872 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5873 */
5874pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5875{
5876 struct drm_device *dev = pci_get_drvdata(pdev);
5877 struct amdgpu_device *adev = drm_to_adev(dev);
5878 int i;
5879
5880 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5881
5882 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5883 DRM_WARN("No support for XGMI hive yet...");
5884 return PCI_ERS_RESULT_DISCONNECT;
5885 }
5886
5887 adev->pci_channel_state = state;
5888
5889 switch (state) {
5890 case pci_channel_io_normal:
5891 return PCI_ERS_RESULT_CAN_RECOVER;
5892 /* Fatal error, prepare for slot reset */
5893 case pci_channel_io_frozen:
5894 /*
5895 * Locking adev->reset_domain->sem will prevent any external access
5896 * to GPU during PCI error recovery
5897 */
5898 amdgpu_device_lock_reset_domain(adev->reset_domain);
5899 amdgpu_device_set_mp1_state(adev);
5900
5901 /*
5902 * Block any work scheduling as we do for regular GPU reset
5903 * for the duration of the recovery
5904 */
5905 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5906 struct amdgpu_ring *ring = adev->rings[i];
5907
5908 if (!ring || !ring->sched.thread)
5909 continue;
5910
5911 drm_sched_stop(&ring->sched, NULL);
5912 }
5913 atomic_inc(&adev->gpu_reset_counter);
5914 return PCI_ERS_RESULT_NEED_RESET;
5915 case pci_channel_io_perm_failure:
5916 /* Permanent error, prepare for device removal */
5917 return PCI_ERS_RESULT_DISCONNECT;
5918 }
5919
5920 return PCI_ERS_RESULT_NEED_RESET;
5921}
5922
5923/**
5924 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5925 * @pdev: pointer to PCI device
5926 */
5927pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5928{
5929
5930 DRM_INFO("PCI error: mmio enabled callback!!\n");
5931
5932 /* TODO - dump whatever for debugging purposes */
5933
5934 /* This is called only if amdgpu_pci_error_detected returns
5935 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5936 * works, so there is no need to reset the slot.
5937 */
5938
5939 return PCI_ERS_RESULT_RECOVERED;
5940}
5941
5942/**
5943 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5944 * @pdev: PCI device struct
5945 *
5946 * Description: This routine is called by the pci error recovery
5947 * code after the PCI slot has been reset, just before we
5948 * should resume normal operations.
5949 */
5950pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5951{
5952 struct drm_device *dev = pci_get_drvdata(pdev);
5953 struct amdgpu_device *adev = drm_to_adev(dev);
5954 int r, i;
5955 struct amdgpu_reset_context reset_context;
5956 u32 memsize;
5957 struct list_head device_list;
5958
5959 DRM_INFO("PCI error: slot reset callback!!\n");
5960
5961 memset(&reset_context, 0, sizeof(reset_context));
5962
5963 INIT_LIST_HEAD(&device_list);
5964 list_add_tail(&adev->reset_list, &device_list);
5965
5966 /* wait for asic to come out of reset */
5967 msleep(500);
5968
5969 /* Restore PCI config space */
5970 amdgpu_device_load_pci_state(pdev);
5971
5972 /* confirm ASIC came out of reset */
5973 for (i = 0; i < adev->usec_timeout; i++) {
5974 memsize = amdgpu_asic_get_config_memsize(adev);
5975
5976 if (memsize != 0xffffffff)
5977 break;
5978 udelay(1);
5979 }
5980 if (memsize == 0xffffffff) {
5981 r = -ETIME;
5982 goto out;
5983 }
5984
5985 reset_context.method = AMD_RESET_METHOD_NONE;
5986 reset_context.reset_req_dev = adev;
5987 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5988 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5989
5990 adev->no_hw_access = true;
5991 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5992 adev->no_hw_access = false;
5993 if (r)
5994 goto out;
5995
5996 r = amdgpu_do_asic_reset(&device_list, &reset_context);
5997
5998out:
5999 if (!r) {
6000 if (amdgpu_device_cache_pci_state(adev->pdev))
6001 pci_restore_state(adev->pdev);
6002
6003 DRM_INFO("PCIe error recovery succeeded\n");
6004 } else {
6005 DRM_ERROR("PCIe error recovery failed, err:%d", r);
6006 amdgpu_device_unset_mp1_state(adev);
6007 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6008 }
6009
6010 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6011}
6012
6013/**
6014 * amdgpu_pci_resume() - resume normal ops after PCI reset
6015 * @pdev: pointer to PCI device
6016 *
6017 * Called when the error recovery driver tells us that it's
6018 * OK to resume normal operation.
6019 */
6020void amdgpu_pci_resume(struct pci_dev *pdev)
6021{
6022 struct drm_device *dev = pci_get_drvdata(pdev);
6023 struct amdgpu_device *adev = drm_to_adev(dev);
6024 int i;
6025
6026
6027 DRM_INFO("PCI error: resume callback!!\n");
6028
6029 /* Only continue execution for the case of pci_channel_io_frozen */
6030 if (adev->pci_channel_state != pci_channel_io_frozen)
6031 return;
6032
6033 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6034 struct amdgpu_ring *ring = adev->rings[i];
6035
6036 if (!ring || !ring->sched.thread)
6037 continue;
6038
6039 drm_sched_start(&ring->sched, true);
6040 }
6041
6042 amdgpu_device_unset_mp1_state(adev);
6043 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6044}
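/*
 * For context: the four PCI error callbacks above are handed to the PCI core
 * through a struct pci_error_handlers instance referenced from the driver's
 * struct pci_driver (the actual hookup lives in amdgpu_drv.c). A minimal
 * sketch of that wiring:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 */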
6045
6046bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6047{
6048 struct drm_device *dev = pci_get_drvdata(pdev);
6049 struct amdgpu_device *adev = drm_to_adev(dev);
6050 int r;
6051
6052 r = pci_save_state(pdev);
6053 if (!r) {
6054 kfree(adev->pci_state);
6055
6056 adev->pci_state = pci_store_saved_state(pdev);
6057
6058 if (!adev->pci_state) {
6059 DRM_ERROR("Failed to store PCI saved state");
6060 return false;
6061 }
6062 } else {
6063 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6064 return false;
6065 }
6066
6067 return true;
6068}
6069
6070bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6071{
6072 struct drm_device *dev = pci_get_drvdata(pdev);
6073 struct amdgpu_device *adev = drm_to_adev(dev);
6074 int r;
6075
6076 if (!adev->pci_state)
6077 return false;
6078
6079 r = pci_load_saved_state(pdev, adev->pci_state);
6080
6081 if (!r) {
6082 pci_restore_state(pdev);
6083 } else {
6084 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6085 return false;
6086 }
6087
6088 return true;
6089}
6090
6091void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6092 struct amdgpu_ring *ring)
6093{
6094#ifdef CONFIG_X86_64
6095 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6096 return;
6097#endif
6098 if (adev->gmc.xgmi.connected_to_cpu)
6099 return;
6100
6101 if (ring && ring->funcs->emit_hdp_flush)
6102 amdgpu_ring_emit_hdp_flush(ring);
6103 else
6104 amdgpu_asic_flush_hdp(adev, ring);
6105}
6106
6107void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6108 struct amdgpu_ring *ring)
6109{
6110#ifdef CONFIG_X86_64
6111 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6112 return;
6113#endif
6114 if (adev->gmc.xgmi.connected_to_cpu)
6115 return;
6116
6117 amdgpu_asic_invalidate_hdp(adev, ring);
6118}
6119
6120int amdgpu_in_reset(struct amdgpu_device *adev)
6121{
6122 return atomic_read(&adev->reset_domain->in_gpu_reset);
6123}
6124
6125/**
6126 * amdgpu_device_halt() - bring hardware to some kind of halt state
6127 *
6128 * @adev: amdgpu_device pointer
6129 *
6130 * Bring hardware to some kind of halt state so that no one can touch it
6131 * any more. It helps to maintain the error context when an error occurs.
6132 * Compared to a simple hang, the system will stay stable at least for SSH
6133 * access. It should then be trivial to inspect the hardware state and
6134 * see what's going on. Implemented as follows:
6135 *
6136 * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc.),
6137 * clears all CPU mappings to device, disallows remappings through page faults
6138 * 2. amdgpu_irq_disable_all() disables all interrupts
6139 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6140 * 4. set adev->no_hw_access to avoid potential crashes after step 5
6141 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6142 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6143 * flush any in-flight DMA operations
6144 */
6145void amdgpu_device_halt(struct amdgpu_device *adev)
6146{
6147 struct pci_dev *pdev = adev->pdev;
6148 struct drm_device *ddev = adev_to_drm(adev);
6149
6150 amdgpu_xcp_dev_unplug(adev);
6151 drm_dev_unplug(ddev);
6152
6153 amdgpu_irq_disable_all(adev);
6154
6155 amdgpu_fence_driver_hw_fini(adev);
6156
6157 adev->no_hw_access = true;
6158
6159 amdgpu_device_unmap_mmio(adev);
6160
6161 pci_disable_device(pdev);
6162 pci_wait_for_pending_transaction(pdev);
6163}
6164
6165u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6166 u32 reg)
6167{
6168 unsigned long flags, address, data;
6169 u32 r;
6170
6171 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6172 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6173
6174 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6175 WREG32(address, reg * 4);
6176 (void)RREG32(address);
6177 r = RREG32(data);
6178 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6179 return r;
6180}
6181
6182void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6183 u32 reg, u32 v)
6184{
6185 unsigned long flags, address, data;
6186
6187 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6188 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6189
6190 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6191 WREG32(address, reg * 4);
6192 (void)RREG32(address);
6193 WREG32(data, v);
6194 (void)RREG32(data);
6195 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6196}
6197
6198/**
6199 * amdgpu_device_switch_gang - switch to a new gang
6200 * @adev: amdgpu_device pointer
6201 * @gang: the gang to switch to
6202 *
6203 * Try to switch to a new gang.
6204 * Returns: NULL if we switched to the new gang or a reference to the current
6205 * gang leader.
6206 */
6207struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6208 struct dma_fence *gang)
6209{
6210 struct dma_fence *old = NULL;
6211
6212 do {
6213 dma_fence_put(old);
6214 rcu_read_lock();
6215 old = dma_fence_get_rcu_safe(&adev->gang_submit);
6216 rcu_read_unlock();
6217
6218 if (old == gang)
6219 break;
6220
6221 if (!dma_fence_is_signaled(old))
6222 return old;
6223
6224 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6225 old, gang) != old);
6226
6227 dma_fence_put(old);
6228 return NULL;
6229}
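/*
 * Illustrative caller pattern (hypothetical, not part of the driver): when
 * the switch is refused because the previous gang is still running, the
 * returned fence must be waited on (or added as a scheduler dependency) and
 * then released before trying again.
 *
 *	struct dma_fence *old;
 *
 *	old = amdgpu_device_switch_gang(adev, gang);
 *	if (old) {
 *		// previous gang leader still active; wait for it first
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}
 */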
6230
6231bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6232{
6233 switch (adev->asic_type) {
6234#ifdef CONFIG_DRM_AMDGPU_SI
6235 case CHIP_HAINAN:
6236#endif
6237 case CHIP_TOPAZ:
6238 /* chips with no display hardware */
6239 return false;
6240#ifdef CONFIG_DRM_AMDGPU_SI
6241 case CHIP_TAHITI:
6242 case CHIP_PITCAIRN:
6243 case CHIP_VERDE:
6244 case CHIP_OLAND:
6245#endif
6246#ifdef CONFIG_DRM_AMDGPU_CIK
6247 case CHIP_BONAIRE:
6248 case CHIP_HAWAII:
6249 case CHIP_KAVERI:
6250 case CHIP_KABINI:
6251 case CHIP_MULLINS:
6252#endif
6253 case CHIP_TONGA:
6254 case CHIP_FIJI:
6255 case CHIP_POLARIS10:
6256 case CHIP_POLARIS11:
6257 case CHIP_POLARIS12:
6258 case CHIP_VEGAM:
6259 case CHIP_CARRIZO:
6260 case CHIP_STONEY:
6261 /* chips with display hardware */
6262 return true;
6263 default:
6264 /* IP discovery */
6265 if (!adev->ip_versions[DCE_HWIP][0] ||
6266 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6267 return false;
6268 return true;
6269 }
6270}
6271
6272uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6273 uint32_t inst, uint32_t reg_addr, char reg_name[],
6274 uint32_t expected_value, uint32_t mask)
6275{
6276 uint32_t ret = 0;
6277 uint32_t old_ = 0;
6278 uint32_t tmp_ = RREG32(reg_addr);
6279 uint32_t loop = adev->usec_timeout;
6280
6281 while ((tmp_ & (mask)) != (expected_value)) {
6282 if (old_ != tmp_) {
6283 loop = adev->usec_timeout;
6284 old_ = tmp_;
6285 } else
6286 udelay(1);
6287 tmp_ = RREG32(reg_addr);
6288 loop--;
6289 if (!loop) {
6290 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6291 inst, reg_name, (uint32_t)expected_value,
6292 (uint32_t)(tmp_ & (mask)));
6293 ret = -ETIMEDOUT;
6294 break;
6295 }
6296 }
6297 return ret;
6298}