1/*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28#include <linux/power_supply.h>
29#include <linux/kthread.h>
30#include <linux/module.h>
31#include <linux/console.h>
32#include <linux/slab.h>
33#include <linux/iommu.h>
34#include <linux/pci.h>
35#include <linux/pci-p2pdma.h>
36#include <linux/apple-gmux.h>
37
38#include <drm/drm_aperture.h>
39#include <drm/drm_atomic_helper.h>
40#include <drm/drm_crtc_helper.h>
41#include <drm/drm_fb_helper.h>
42#include <drm/drm_probe_helper.h>
43#include <drm/amdgpu_drm.h>
44#include <linux/device.h>
45#include <linux/vgaarb.h>
46#include <linux/vga_switcheroo.h>
47#include <linux/efi.h>
48#include "amdgpu.h"
49#include "amdgpu_trace.h"
50#include "amdgpu_i2c.h"
51#include "atom.h"
52#include "amdgpu_atombios.h"
53#include "amdgpu_atomfirmware.h"
54#include "amd_pcie.h"
55#ifdef CONFIG_DRM_AMDGPU_SI
56#include "si.h"
57#endif
58#ifdef CONFIG_DRM_AMDGPU_CIK
59#include "cik.h"
60#endif
61#include "vi.h"
62#include "soc15.h"
63#include "nv.h"
64#include "bif/bif_4_1_d.h"
65#include <linux/firmware.h>
66#include "amdgpu_vf_error.h"
67
68#include "amdgpu_amdkfd.h"
69#include "amdgpu_pm.h"
70
71#include "amdgpu_xgmi.h"
72#include "amdgpu_ras.h"
73#include "amdgpu_pmu.h"
74#include "amdgpu_fru_eeprom.h"
75#include "amdgpu_reset.h"
76#include "amdgpu_virt.h"
77#include "amdgpu_dev_coredump.h"
78
79#include <linux/suspend.h>
80#include <drm/task_barrier.h>
81#include <linux/pm_runtime.h>
82
83#include <drm/drm_drv.h>
84
85#if IS_ENABLED(CONFIG_X86)
86#include <asm/intel-family.h>
87#endif
88
89MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
90MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
91MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
92MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
93MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
94MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
95MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
96
97#define AMDGPU_RESUME_MS 2000
98#define AMDGPU_MAX_RETRY_LIMIT 2
99#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
100#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
101#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
102#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
103
104static const struct drm_driver amdgpu_kms_driver;
105
106const char *amdgpu_asic_name[] = {
107 "TAHITI",
108 "PITCAIRN",
109 "VERDE",
110 "OLAND",
111 "HAINAN",
112 "BONAIRE",
113 "KAVERI",
114 "KABINI",
115 "HAWAII",
116 "MULLINS",
117 "TOPAZ",
118 "TONGA",
119 "FIJI",
120 "CARRIZO",
121 "STONEY",
122 "POLARIS10",
123 "POLARIS11",
124 "POLARIS12",
125 "VEGAM",
126 "VEGA10",
127 "VEGA12",
128 "VEGA20",
129 "RAVEN",
130 "ARCTURUS",
131 "RENOIR",
132 "ALDEBARAN",
133 "NAVI10",
134 "CYAN_SKILLFISH",
135 "NAVI14",
136 "NAVI12",
137 "SIENNA_CICHLID",
138 "NAVY_FLOUNDER",
139 "VANGOGH",
140 "DIMGREY_CAVEFISH",
141 "BEIGE_GOBY",
142 "YELLOW_CARP",
143 "IP DISCOVERY",
144 "LAST",
145};
146
147static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
148
149/**
150 * DOC: pcie_replay_count
151 *
152 * The amdgpu driver provides a sysfs API for reporting the total number
 153 * of PCIe replays (NAKs).
 154 * The file pcie_replay_count is used for this and returns the total
 155 * number of replays as a sum of the NAKs generated and the NAKs received.
156 */
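/*
 * Illustrative sketch (not part of the original file): userspace would
 * typically read this attribute through sysfs, for example:
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count
 *   0
 *
 * The exact path (card index, PCI address) depends on how the device is
 * enumerated on a given system.
 */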
157
158static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
159 struct device_attribute *attr, char *buf)
160{
161 struct drm_device *ddev = dev_get_drvdata(dev);
162 struct amdgpu_device *adev = drm_to_adev(ddev);
163 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
164
165 return sysfs_emit(buf, "%llu\n", cnt);
166}
167
168static DEVICE_ATTR(pcie_replay_count, 0444,
169 amdgpu_device_get_pcie_replay_count, NULL);
170
171static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
172 struct bin_attribute *attr, char *buf,
173 loff_t ppos, size_t count)
174{
175 struct device *dev = kobj_to_dev(kobj);
176 struct drm_device *ddev = dev_get_drvdata(dev);
177 struct amdgpu_device *adev = drm_to_adev(ddev);
178 ssize_t bytes_read;
179
180 switch (ppos) {
181 case AMDGPU_SYS_REG_STATE_XGMI:
182 bytes_read = amdgpu_asic_get_reg_state(
183 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
184 break;
185 case AMDGPU_SYS_REG_STATE_WAFL:
186 bytes_read = amdgpu_asic_get_reg_state(
187 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
188 break;
189 case AMDGPU_SYS_REG_STATE_PCIE:
190 bytes_read = amdgpu_asic_get_reg_state(
191 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
192 break;
193 case AMDGPU_SYS_REG_STATE_USR:
194 bytes_read = amdgpu_asic_get_reg_state(
195 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
196 break;
197 case AMDGPU_SYS_REG_STATE_USR_1:
198 bytes_read = amdgpu_asic_get_reg_state(
199 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
200 break;
201 default:
202 return -EINVAL;
203 }
204
205 return bytes_read;
206}
207
208BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
209 AMDGPU_SYS_REG_STATE_END);
210
211int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
212{
213 int ret;
214
215 if (!amdgpu_asic_get_reg_state_supported(adev))
216 return 0;
217
218 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
219
220 return ret;
221}
222
223void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
224{
225 if (!amdgpu_asic_get_reg_state_supported(adev))
226 return;
227 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
228}
229
230/**
231 * DOC: board_info
232 *
 233 * The amdgpu driver provides a sysfs API for reporting board-related information.
234 * It provides the form factor information in the format
235 *
236 * type : form factor
237 *
238 * Possible form factor values
239 *
240 * - "cem" - PCIE CEM card
241 * - "oam" - Open Compute Accelerator Module
242 * - "unknown" - Not known
243 *
244 */
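/*
 * Illustrative sketch (not part of the original file): reading the attribute
 * from userspace would typically look like:
 *
 *   $ cat /sys/class/drm/card0/device/board_info
 *   type : oam
 *
 * The path and the reported form factor depend on the system and the board.
 */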
245
246static ssize_t amdgpu_device_get_board_info(struct device *dev,
247 struct device_attribute *attr,
248 char *buf)
249{
250 struct drm_device *ddev = dev_get_drvdata(dev);
251 struct amdgpu_device *adev = drm_to_adev(ddev);
252 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
253 const char *pkg;
254
255 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
256 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
257
258 switch (pkg_type) {
259 case AMDGPU_PKG_TYPE_CEM:
260 pkg = "cem";
261 break;
262 case AMDGPU_PKG_TYPE_OAM:
263 pkg = "oam";
264 break;
265 default:
266 pkg = "unknown";
267 break;
268 }
269
270 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
271}
272
273static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
274
275static struct attribute *amdgpu_board_attrs[] = {
276 &dev_attr_board_info.attr,
277 NULL,
278};
279
280static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
281 struct attribute *attr, int n)
282{
283 struct device *dev = kobj_to_dev(kobj);
284 struct drm_device *ddev = dev_get_drvdata(dev);
285 struct amdgpu_device *adev = drm_to_adev(ddev);
286
287 if (adev->flags & AMD_IS_APU)
288 return 0;
289
290 return attr->mode;
291}
292
293static const struct attribute_group amdgpu_board_attrs_group = {
294 .attrs = amdgpu_board_attrs,
295 .is_visible = amdgpu_board_attrs_is_visible
296};
297
298static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
299
300
301/**
302 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
303 *
304 * @dev: drm_device pointer
305 *
306 * Returns true if the device is a dGPU with ATPX power control,
 307 * otherwise returns false.
308 */
309bool amdgpu_device_supports_px(struct drm_device *dev)
310{
311 struct amdgpu_device *adev = drm_to_adev(dev);
312
313 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
314 return true;
315 return false;
316}
317
318/**
319 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
320 *
321 * @dev: drm_device pointer
322 *
323 * Returns true if the device is a dGPU with ACPI power control,
 324 * otherwise returns false.
325 */
326bool amdgpu_device_supports_boco(struct drm_device *dev)
327{
328 struct amdgpu_device *adev = drm_to_adev(dev);
329
330 if (adev->has_pr3 ||
331 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
332 return true;
333 return false;
334}
335
336/**
337 * amdgpu_device_supports_baco - Does the device support BACO
338 *
339 * @dev: drm_device pointer
340 *
341 * Return:
 342 * 1 if the device supports BACO;
 343 * 3 if the device supports MACO (only works if BACO is supported);
 344 * otherwise returns 0.
345 */
346int amdgpu_device_supports_baco(struct drm_device *dev)
347{
348 struct amdgpu_device *adev = drm_to_adev(dev);
349
350 return amdgpu_asic_supports_baco(adev);
351}
352
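/*
 * Summary of the amdgpu_runtime_pm module parameter values handled in the
 * switch below (derived from this function, not an authoritative parameter
 * reference):
 *   0     - runtime pm manually disabled
 *   1     - force BACO
 *   2     - force BAMACO, falling back to BACO if MACO is unavailable
 *   -1/-2 - automatic: prefer PX, then BOCO, then BACO/BAMACO when supported
 */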
353void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
354{
355 struct drm_device *dev;
356 int bamaco_support;
357
358 dev = adev_to_drm(adev);
359
360 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
361 bamaco_support = amdgpu_device_supports_baco(dev);
362
363 switch (amdgpu_runtime_pm) {
364 case 2:
365 if (bamaco_support & MACO_SUPPORT) {
366 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
367 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
368 } else if (bamaco_support == BACO_SUPPORT) {
369 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
370 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n");
371 }
372 break;
373 case 1:
374 if (bamaco_support & BACO_SUPPORT) {
375 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
376 dev_info(adev->dev, "Forcing BACO for runtime pm\n");
377 }
378 break;
379 case -1:
380 case -2:
381 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
382 adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
383 dev_info(adev->dev, "Using ATPX for runtime pm\n");
384 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
385 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
386 dev_info(adev->dev, "Using BOCO for runtime pm\n");
387 } else {
388 if (!bamaco_support)
389 goto no_runtime_pm;
390
391 switch (adev->asic_type) {
392 case CHIP_VEGA20:
393 case CHIP_ARCTURUS:
 394 /* BACO is not supported on vega20 and arcturus */
395 break;
396 case CHIP_VEGA10:
397 /* enable BACO as runpm mode if noretry=0 */
398 if (!adev->gmc.noretry)
399 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
400 break;
401 default:
402 /* enable BACO as runpm mode on CI+ */
403 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
404 break;
405 }
406
407 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
408 if (bamaco_support & MACO_SUPPORT) {
409 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
410 dev_info(adev->dev, "Using BAMACO for runtime pm\n");
411 } else {
412 dev_info(adev->dev, "Using BACO for runtime pm\n");
413 }
414 }
415 }
416 break;
417 case 0:
418 dev_info(adev->dev, "runtime pm is manually disabled\n");
419 break;
420 default:
421 break;
422 }
423
424no_runtime_pm:
425 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
426 dev_info(adev->dev, "Runtime PM not available\n");
427}
428/**
 429 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 430 * Smart Shift support
431 *
432 * @dev: drm_device pointer
433 *
434 * Returns true if the device is a dGPU with Smart Shift support,
435 * otherwise returns false.
436 */
437bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
438{
439 return (amdgpu_device_supports_boco(dev) &&
440 amdgpu_acpi_is_power_shift_control_supported());
441}
442
443/*
444 * VRAM access helper functions
445 */
446
447/**
448 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
449 *
450 * @adev: amdgpu_device pointer
451 * @pos: offset of the buffer in vram
452 * @buf: virtual address of the buffer in system memory
 453 * @size: read/write size, @buf must hold at least @size bytes
454 * @write: true - write to vram, otherwise - read from vram
455 */
456void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
457 void *buf, size_t size, bool write)
458{
459 unsigned long flags;
460 uint32_t hi = ~0, tmp = 0;
461 uint32_t *data = buf;
462 uint64_t last;
463 int idx;
464
465 if (!drm_dev_enter(adev_to_drm(adev), &idx))
466 return;
467
468 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
469
470 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
471 for (last = pos + size; pos < last; pos += 4) {
472 tmp = pos >> 31;
473
474 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
475 if (tmp != hi) {
476 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
477 hi = tmp;
478 }
479 if (write)
480 WREG32_NO_KIQ(mmMM_DATA, *data++);
481 else
482 *data++ = RREG32_NO_KIQ(mmMM_DATA);
483 }
484
485 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
486 drm_dev_exit(idx);
487}
488
489/**
 490 * amdgpu_device_aper_access - access vram via the vram aperture
491 *
492 * @adev: amdgpu_device pointer
493 * @pos: offset of the buffer in vram
494 * @buf: virtual address of the buffer in system memory
 495 * @size: read/write size, @buf must hold at least @size bytes
496 * @write: true - write to vram, otherwise - read from vram
497 *
 498 * Returns the number of bytes transferred.
499 */
500size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
501 void *buf, size_t size, bool write)
502{
503#ifdef CONFIG_64BIT
504 void __iomem *addr;
505 size_t count = 0;
506 uint64_t last;
507
508 if (!adev->mman.aper_base_kaddr)
509 return 0;
510
511 last = min(pos + size, adev->gmc.visible_vram_size);
512 if (last > pos) {
513 addr = adev->mman.aper_base_kaddr + pos;
514 count = last - pos;
515
516 if (write) {
517 memcpy_toio(addr, buf, count);
 518 /* Make sure the HDP write cache flush happens without any reordering
 519 * after the system memory contents are sent over PCIe to the device
520 */
521 mb();
522 amdgpu_device_flush_hdp(adev, NULL);
523 } else {
524 amdgpu_device_invalidate_hdp(adev, NULL);
525 /* Make sure HDP read cache is invalidated before issuing a read
526 * to the PCIe device
527 */
528 mb();
529 memcpy_fromio(buf, addr, count);
530 }
531
532 }
533
534 return count;
535#else
536 return 0;
537#endif
538}
539
540/**
541 * amdgpu_device_vram_access - read/write a buffer in vram
542 *
543 * @adev: amdgpu_device pointer
544 * @pos: offset of the buffer in vram
545 * @buf: virtual address of the buffer in system memory
 546 * @size: read/write size, @buf must hold at least @size bytes
547 * @write: true - write to vram, otherwise - read from vram
548 */
549void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
550 void *buf, size_t size, bool write)
551{
552 size_t count;
553
 554 /* try using the vram aperture to access vram first */
555 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
556 size -= count;
557 if (size) {
 558 /* use MM to access the rest of vram */
559 pos += count;
560 buf += count;
561 amdgpu_device_mm_access(adev, pos, buf, size, write);
562 }
563}
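/*
 * Minimal usage sketch (illustrative, not from the original file): reading a
 * small dword-aligned block of VRAM into a local buffer. @pos and @size are
 * assumed to be 4-byte aligned so the MM_INDEX/MM_DATA fallback path is valid.
 *
 *   u32 data[4];
 *
 *   amdgpu_device_vram_access(adev, pos, data, sizeof(data), false);
 */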
564
565/*
566 * register access helper functions.
567 */
568
569/* Check if hw access should be skipped because of hotplug or device error */
570bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
571{
572 if (adev->no_hw_access)
573 return true;
574
575#ifdef CONFIG_LOCKDEP
576 /*
577 * This is a bit complicated to understand, so worth a comment. What we assert
578 * here is that the GPU reset is not running on another thread in parallel.
579 *
 580 * For this we trylock the read side of the reset semaphore; if that succeeds
 581 * we know that the reset is not running in parallel.
582 *
583 * If the trylock fails we assert that we are either already holding the read
584 * side of the lock or are the reset thread itself and hold the write side of
585 * the lock.
586 */
587 if (in_task()) {
588 if (down_read_trylock(&adev->reset_domain->sem))
589 up_read(&adev->reset_domain->sem);
590 else
591 lockdep_assert_held(&adev->reset_domain->sem);
592 }
593#endif
594 return false;
595}
596
597/**
598 * amdgpu_device_rreg - read a memory mapped IO or indirect register
599 *
600 * @adev: amdgpu_device pointer
601 * @reg: dword aligned register offset
602 * @acc_flags: access flags which require special behavior
603 *
604 * Returns the 32 bit value from the offset specified.
605 */
606uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
607 uint32_t reg, uint32_t acc_flags)
608{
609 uint32_t ret;
610
611 if (amdgpu_device_skip_hw_access(adev))
612 return 0;
613
614 if ((reg * 4) < adev->rmmio_size) {
615 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
616 amdgpu_sriov_runtime(adev) &&
617 down_read_trylock(&adev->reset_domain->sem)) {
618 ret = amdgpu_kiq_rreg(adev, reg, 0);
619 up_read(&adev->reset_domain->sem);
620 } else {
621 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
622 }
623 } else {
624 ret = adev->pcie_rreg(adev, reg * 4);
625 }
626
627 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
628
629 return ret;
630}
631
632/*
 633 * MMIO register read with byte offset helper functions
 634 * @offset: byte offset from MMIO start
635 */
636
637/**
638 * amdgpu_mm_rreg8 - read a memory mapped IO register
639 *
640 * @adev: amdgpu_device pointer
641 * @offset: byte aligned register offset
642 *
643 * Returns the 8 bit value from the offset specified.
644 */
645uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
646{
647 if (amdgpu_device_skip_hw_access(adev))
648 return 0;
649
650 if (offset < adev->rmmio_size)
651 return (readb(adev->rmmio + offset));
652 BUG();
653}
654
655
656/**
657 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
658 *
659 * @adev: amdgpu_device pointer
660 * @reg: dword aligned register offset
661 * @acc_flags: access flags which require special behavior
662 * @xcc_id: xcc accelerated compute core id
663 *
664 * Returns the 32 bit value from the offset specified.
665 */
666uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
667 uint32_t reg, uint32_t acc_flags,
668 uint32_t xcc_id)
669{
670 uint32_t ret, rlcg_flag;
671
672 if (amdgpu_device_skip_hw_access(adev))
673 return 0;
674
675 if ((reg * 4) < adev->rmmio_size) {
676 if (amdgpu_sriov_vf(adev) &&
677 !amdgpu_sriov_runtime(adev) &&
678 adev->gfx.rlc.rlcg_reg_access_supported &&
679 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
680 GC_HWIP, false,
681 &rlcg_flag)) {
682 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
683 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
684 amdgpu_sriov_runtime(adev) &&
685 down_read_trylock(&adev->reset_domain->sem)) {
686 ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
687 up_read(&adev->reset_domain->sem);
688 } else {
689 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
690 }
691 } else {
692 ret = adev->pcie_rreg(adev, reg * 4);
693 }
694
695 return ret;
696}
697
698/*
 699 * MMIO register write with byte offset helper functions
 700 * @offset: byte offset from MMIO start
 701 * @value: the value to be written to the register
702 */
703
704/**
 705 * amdgpu_mm_wreg8 - write a memory mapped IO register
706 *
707 * @adev: amdgpu_device pointer
708 * @offset: byte aligned register offset
709 * @value: 8 bit value to write
710 *
711 * Writes the value specified to the offset specified.
712 */
713void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
714{
715 if (amdgpu_device_skip_hw_access(adev))
716 return;
717
718 if (offset < adev->rmmio_size)
719 writeb(value, adev->rmmio + offset);
720 else
721 BUG();
722}
723
724/**
725 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
726 *
727 * @adev: amdgpu_device pointer
728 * @reg: dword aligned register offset
729 * @v: 32 bit value to write to the register
730 * @acc_flags: access flags which require special behavior
731 *
732 * Writes the value specified to the offset specified.
733 */
734void amdgpu_device_wreg(struct amdgpu_device *adev,
735 uint32_t reg, uint32_t v,
736 uint32_t acc_flags)
737{
738 if (amdgpu_device_skip_hw_access(adev))
739 return;
740
741 if ((reg * 4) < adev->rmmio_size) {
742 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
743 amdgpu_sriov_runtime(adev) &&
744 down_read_trylock(&adev->reset_domain->sem)) {
745 amdgpu_kiq_wreg(adev, reg, v, 0);
746 up_read(&adev->reset_domain->sem);
747 } else {
748 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
749 }
750 } else {
751 adev->pcie_wreg(adev, reg * 4, v);
752 }
753
754 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
755}
756
757/**
758 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
759 *
760 * @adev: amdgpu_device pointer
761 * @reg: mmio/rlc register
762 * @v: value to write
763 * @xcc_id: xcc accelerated compute core id
764 *
765 * this function is invoked only for the debugfs register access
766 */
767void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
768 uint32_t reg, uint32_t v,
769 uint32_t xcc_id)
770{
771 if (amdgpu_device_skip_hw_access(adev))
772 return;
773
774 if (amdgpu_sriov_fullaccess(adev) &&
775 adev->gfx.rlc.funcs &&
776 adev->gfx.rlc.funcs->is_rlcg_access_range) {
777 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
778 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
779 } else if ((reg * 4) >= adev->rmmio_size) {
780 adev->pcie_wreg(adev, reg * 4, v);
781 } else {
782 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
783 }
784}
785
786/**
787 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
788 *
789 * @adev: amdgpu_device pointer
790 * @reg: dword aligned register offset
791 * @v: 32 bit value to write to the register
792 * @acc_flags: access flags which require special behavior
793 * @xcc_id: xcc accelerated compute core id
794 *
795 * Writes the value specified to the offset specified.
796 */
797void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
798 uint32_t reg, uint32_t v,
799 uint32_t acc_flags, uint32_t xcc_id)
800{
801 uint32_t rlcg_flag;
802
803 if (amdgpu_device_skip_hw_access(adev))
804 return;
805
806 if ((reg * 4) < adev->rmmio_size) {
807 if (amdgpu_sriov_vf(adev) &&
808 !amdgpu_sriov_runtime(adev) &&
809 adev->gfx.rlc.rlcg_reg_access_supported &&
810 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
811 GC_HWIP, true,
812 &rlcg_flag)) {
813 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
814 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
815 amdgpu_sriov_runtime(adev) &&
816 down_read_trylock(&adev->reset_domain->sem)) {
817 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
818 up_read(&adev->reset_domain->sem);
819 } else {
820 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
821 }
822 } else {
823 adev->pcie_wreg(adev, reg * 4, v);
824 }
825}
826
827/**
828 * amdgpu_device_indirect_rreg - read an indirect register
829 *
830 * @adev: amdgpu_device pointer
831 * @reg_addr: indirect register address to read from
832 *
833 * Returns the value of indirect register @reg_addr
834 */
835u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
836 u32 reg_addr)
837{
838 unsigned long flags, pcie_index, pcie_data;
839 void __iomem *pcie_index_offset;
840 void __iomem *pcie_data_offset;
841 u32 r;
842
843 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
844 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
845
846 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
847 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
848 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
849
850 writel(reg_addr, pcie_index_offset);
851 readl(pcie_index_offset);
852 r = readl(pcie_data_offset);
853 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
854
855 return r;
856}
857
858u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
859 u64 reg_addr)
860{
861 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
862 u32 r;
863 void __iomem *pcie_index_offset;
864 void __iomem *pcie_index_hi_offset;
865 void __iomem *pcie_data_offset;
866
867 if (unlikely(!adev->nbio.funcs)) {
868 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
869 pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
870 } else {
871 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
872 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
873 }
874
875 if (reg_addr >> 32) {
876 if (unlikely(!adev->nbio.funcs))
877 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
878 else
879 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
880 } else {
881 pcie_index_hi = 0;
882 }
883
884 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
885 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
886 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
887 if (pcie_index_hi != 0)
888 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
889 pcie_index_hi * 4;
890
891 writel(reg_addr, pcie_index_offset);
892 readl(pcie_index_offset);
893 if (pcie_index_hi != 0) {
894 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
895 readl(pcie_index_hi_offset);
896 }
897 r = readl(pcie_data_offset);
898
899 /* clear the high bits */
900 if (pcie_index_hi != 0) {
901 writel(0, pcie_index_hi_offset);
902 readl(pcie_index_hi_offset);
903 }
904
905 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
906
907 return r;
908}
909
910/**
911 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
912 *
913 * @adev: amdgpu_device pointer
914 * @reg_addr: indirect register address to read from
915 *
916 * Returns the value of indirect register @reg_addr
917 */
918u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
919 u32 reg_addr)
920{
921 unsigned long flags, pcie_index, pcie_data;
922 void __iomem *pcie_index_offset;
923 void __iomem *pcie_data_offset;
924 u64 r;
925
926 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
927 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
928
929 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
930 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
931 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
932
933 /* read low 32 bits */
934 writel(reg_addr, pcie_index_offset);
935 readl(pcie_index_offset);
936 r = readl(pcie_data_offset);
937 /* read high 32 bits */
938 writel(reg_addr + 4, pcie_index_offset);
939 readl(pcie_index_offset);
940 r |= ((u64)readl(pcie_data_offset) << 32);
941 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
942
943 return r;
944}
945
946u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
947 u64 reg_addr)
948{
949 unsigned long flags, pcie_index, pcie_data;
950 unsigned long pcie_index_hi = 0;
951 void __iomem *pcie_index_offset;
952 void __iomem *pcie_index_hi_offset;
953 void __iomem *pcie_data_offset;
954 u64 r;
955
956 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
957 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
958 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
959 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
960
961 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
962 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
963 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
964 if (pcie_index_hi != 0)
965 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
966 pcie_index_hi * 4;
967
968 /* read low 32 bits */
969 writel(reg_addr, pcie_index_offset);
970 readl(pcie_index_offset);
971 if (pcie_index_hi != 0) {
972 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
973 readl(pcie_index_hi_offset);
974 }
975 r = readl(pcie_data_offset);
976 /* read high 32 bits */
977 writel(reg_addr + 4, pcie_index_offset);
978 readl(pcie_index_offset);
979 if (pcie_index_hi != 0) {
980 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
981 readl(pcie_index_hi_offset);
982 }
983 r |= ((u64)readl(pcie_data_offset) << 32);
984
985 /* clear the high bits */
986 if (pcie_index_hi != 0) {
987 writel(0, pcie_index_hi_offset);
988 readl(pcie_index_hi_offset);
989 }
990
991 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
992
993 return r;
994}
995
996/**
997 * amdgpu_device_indirect_wreg - write an indirect register address
998 *
999 * @adev: amdgpu_device pointer
1000 * @reg_addr: indirect register offset
1001 * @reg_data: indirect register data
1002 *
1003 */
1004void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1005 u32 reg_addr, u32 reg_data)
1006{
1007 unsigned long flags, pcie_index, pcie_data;
1008 void __iomem *pcie_index_offset;
1009 void __iomem *pcie_data_offset;
1010
1011 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1012 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1013
1014 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1015 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1016 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1017
1018 writel(reg_addr, pcie_index_offset);
1019 readl(pcie_index_offset);
1020 writel(reg_data, pcie_data_offset);
1021 readl(pcie_data_offset);
1022 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1023}
1024
1025void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1026 u64 reg_addr, u32 reg_data)
1027{
1028 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1029 void __iomem *pcie_index_offset;
1030 void __iomem *pcie_index_hi_offset;
1031 void __iomem *pcie_data_offset;
1032
1033 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1034 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1035 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1036 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1037 else
1038 pcie_index_hi = 0;
1039
1040 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1041 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1042 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1043 if (pcie_index_hi != 0)
1044 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1045 pcie_index_hi * 4;
1046
1047 writel(reg_addr, pcie_index_offset);
1048 readl(pcie_index_offset);
1049 if (pcie_index_hi != 0) {
1050 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1051 readl(pcie_index_hi_offset);
1052 }
1053 writel(reg_data, pcie_data_offset);
1054 readl(pcie_data_offset);
1055
1056 /* clear the high bits */
1057 if (pcie_index_hi != 0) {
1058 writel(0, pcie_index_hi_offset);
1059 readl(pcie_index_hi_offset);
1060 }
1061
1062 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1063}
1064
1065/**
1066 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
1067 *
1068 * @adev: amdgpu_device pointer
1069 * @reg_addr: indirect register offset
1070 * @reg_data: indirect register data
1071 *
1072 */
1073void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1074 u32 reg_addr, u64 reg_data)
1075{
1076 unsigned long flags, pcie_index, pcie_data;
1077 void __iomem *pcie_index_offset;
1078 void __iomem *pcie_data_offset;
1079
1080 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1081 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1082
1083 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1084 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1085 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1086
1087 /* write low 32 bits */
1088 writel(reg_addr, pcie_index_offset);
1089 readl(pcie_index_offset);
1090 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1091 readl(pcie_data_offset);
1092 /* write high 32 bits */
1093 writel(reg_addr + 4, pcie_index_offset);
1094 readl(pcie_index_offset);
1095 writel((u32)(reg_data >> 32), pcie_data_offset);
1096 readl(pcie_data_offset);
1097 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1098}
1099
1100void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1101 u64 reg_addr, u64 reg_data)
1102{
1103 unsigned long flags, pcie_index, pcie_data;
1104 unsigned long pcie_index_hi = 0;
1105 void __iomem *pcie_index_offset;
1106 void __iomem *pcie_index_hi_offset;
1107 void __iomem *pcie_data_offset;
1108
1109 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1110 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1111 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1112 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1113
1114 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1115 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1116 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1117 if (pcie_index_hi != 0)
1118 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1119 pcie_index_hi * 4;
1120
1121 /* write low 32 bits */
1122 writel(reg_addr, pcie_index_offset);
1123 readl(pcie_index_offset);
1124 if (pcie_index_hi != 0) {
1125 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1126 readl(pcie_index_hi_offset);
1127 }
1128 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1129 readl(pcie_data_offset);
1130 /* write high 32 bits */
1131 writel(reg_addr + 4, pcie_index_offset);
1132 readl(pcie_index_offset);
1133 if (pcie_index_hi != 0) {
1134 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1135 readl(pcie_index_hi_offset);
1136 }
1137 writel((u32)(reg_data >> 32), pcie_data_offset);
1138 readl(pcie_data_offset);
1139
1140 /* clear the high bits */
1141 if (pcie_index_hi != 0) {
1142 writel(0, pcie_index_hi_offset);
1143 readl(pcie_index_hi_offset);
1144 }
1145
1146 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1147}
1148
1149/**
1150 * amdgpu_device_get_rev_id - query device rev_id
1151 *
1152 * @adev: amdgpu_device pointer
1153 *
1154 * Return device rev_id
1155 */
1156u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1157{
1158 return adev->nbio.funcs->get_rev_id(adev);
1159}
1160
1161/**
1162 * amdgpu_invalid_rreg - dummy reg read function
1163 *
1164 * @adev: amdgpu_device pointer
1165 * @reg: offset of register
1166 *
1167 * Dummy register read function. Used for register blocks
1168 * that certain asics don't have (all asics).
1169 * Returns the value in the register.
1170 */
1171static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1172{
1173 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1174 BUG();
1175 return 0;
1176}
1177
1178static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1179{
1180 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1181 BUG();
1182 return 0;
1183}
1184
1185/**
1186 * amdgpu_invalid_wreg - dummy reg write function
1187 *
1188 * @adev: amdgpu_device pointer
1189 * @reg: offset of register
1190 * @v: value to write to the register
1191 *
 1192 * Dummy register write function. Used for register blocks
1193 * that certain asics don't have (all asics).
1194 */
1195static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1196{
1197 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1198 reg, v);
1199 BUG();
1200}
1201
1202static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1203{
1204 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
1205 reg, v);
1206 BUG();
1207}
1208
1209/**
1210 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1211 *
1212 * @adev: amdgpu_device pointer
1213 * @reg: offset of register
1214 *
1215 * Dummy register read function. Used for register blocks
1216 * that certain asics don't have (all asics).
1217 * Returns the value in the register.
1218 */
1219static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1220{
1221 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1222 BUG();
1223 return 0;
1224}
1225
1226static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1227{
1228 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1229 BUG();
1230 return 0;
1231}
1232
1233/**
1234 * amdgpu_invalid_wreg64 - dummy reg write function
1235 *
1236 * @adev: amdgpu_device pointer
1237 * @reg: offset of register
1238 * @v: value to write to the register
1239 *
 1240 * Dummy register write function. Used for register blocks
1241 * that certain asics don't have (all asics).
1242 */
1243static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1244{
1245 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1246 reg, v);
1247 BUG();
1248}
1249
1250static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1251{
1252 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1253 reg, v);
1254 BUG();
1255}
1256
1257/**
1258 * amdgpu_block_invalid_rreg - dummy reg read function
1259 *
1260 * @adev: amdgpu_device pointer
1261 * @block: offset of instance
1262 * @reg: offset of register
1263 *
1264 * Dummy register read function. Used for register blocks
1265 * that certain asics don't have (all asics).
1266 * Returns the value in the register.
1267 */
1268static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1269 uint32_t block, uint32_t reg)
1270{
1271 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1272 reg, block);
1273 BUG();
1274 return 0;
1275}
1276
1277/**
1278 * amdgpu_block_invalid_wreg - dummy reg write function
1279 *
1280 * @adev: amdgpu_device pointer
1281 * @block: offset of instance
1282 * @reg: offset of register
1283 * @v: value to write to the register
1284 *
 1285 * Dummy register write function. Used for register blocks
1286 * that certain asics don't have (all asics).
1287 */
1288static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1289 uint32_t block,
1290 uint32_t reg, uint32_t v)
1291{
1292 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1293 reg, block, v);
1294 BUG();
1295}
1296
1297/**
1298 * amdgpu_device_asic_init - Wrapper for atom asic_init
1299 *
1300 * @adev: amdgpu_device pointer
1301 *
1302 * Does any asic specific work and then calls atom asic init.
1303 */
1304static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1305{
1306 int ret;
1307
1308 amdgpu_asic_pre_asic_init(adev);
1309
1310 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1311 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1312 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1313 amdgpu_psp_wait_for_bootloader(adev);
1314 ret = amdgpu_atomfirmware_asic_init(adev, true);
1315 return ret;
1316 } else {
1317 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1318 }
1319
1320 return 0;
1321}
1322
1323/**
1324 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1325 *
1326 * @adev: amdgpu_device pointer
1327 *
1328 * Allocates a scratch page of VRAM for use by various things in the
1329 * driver.
1330 */
1331static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1332{
1333 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1334 AMDGPU_GEM_DOMAIN_VRAM |
1335 AMDGPU_GEM_DOMAIN_GTT,
1336 &adev->mem_scratch.robj,
1337 &adev->mem_scratch.gpu_addr,
1338 (void **)&adev->mem_scratch.ptr);
1339}
1340
1341/**
1342 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1343 *
1344 * @adev: amdgpu_device pointer
1345 *
1346 * Frees the VRAM scratch page.
1347 */
1348static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1349{
1350 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1351}
1352
1353/**
1354 * amdgpu_device_program_register_sequence - program an array of registers.
1355 *
1356 * @adev: amdgpu_device pointer
1357 * @registers: pointer to the register array
1358 * @array_size: size of the register array
1359 *
 1360 * Programs an array of registers with AND and OR masks.
1361 * This is a helper for setting golden registers.
1362 */
1363void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1364 const u32 *registers,
1365 const u32 array_size)
1366{
1367 u32 tmp, reg, and_mask, or_mask;
1368 int i;
1369
1370 if (array_size % 3)
1371 return;
1372
1373 for (i = 0; i < array_size; i += 3) {
1374 reg = registers[i + 0];
1375 and_mask = registers[i + 1];
1376 or_mask = registers[i + 2];
1377
1378 if (and_mask == 0xffffffff) {
1379 tmp = or_mask;
1380 } else {
1381 tmp = RREG32(reg);
1382 tmp &= ~and_mask;
1383 if (adev->family >= AMDGPU_FAMILY_AI)
1384 tmp |= (or_mask & and_mask);
1385 else
1386 tmp |= or_mask;
1387 }
1388 WREG32(reg, tmp);
1389 }
1390}
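/*
 * Illustrative sketch (hypothetical register names, not from the original
 * file): golden register tables are flat {offset, and_mask, or_mask} triples.
 * An and_mask of 0xffffffff writes or_mask directly; any other mask does a
 * read-modify-write of the masked bits.
 *
 *   static const u32 example_golden_settings[] = {
 *           mmEXAMPLE_REG_A, 0xffffffff, 0x00000100,
 *           mmEXAMPLE_REG_B, 0x0000000f, 0x00000002,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                           ARRAY_SIZE(example_golden_settings));
 */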
1391
1392/**
1393 * amdgpu_device_pci_config_reset - reset the GPU
1394 *
1395 * @adev: amdgpu_device pointer
1396 *
1397 * Resets the GPU using the pci config reset sequence.
1398 * Only applicable to asics prior to vega10.
1399 */
1400void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1401{
1402 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1403}
1404
1405/**
1406 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1407 *
1408 * @adev: amdgpu_device pointer
1409 *
1410 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1411 */
1412int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1413{
1414 return pci_reset_function(adev->pdev);
1415}
1416
1417/*
1418 * amdgpu_device_wb_*()
1419 * Writeback is the method by which the GPU updates special pages in memory
 1420 * with the status of certain GPU events (fences, ring pointers, etc.).
1421 */
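/*
 * Minimal usage sketch (illustrative, not from the original file): a caller
 * allocates a writeback slot, derives the CPU and GPU addresses from the
 * returned dword offset, and frees the slot when done.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *           u32 value = adev->wb.wb[wb];
 *
 *           ...
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */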
1422
1423/**
1424 * amdgpu_device_wb_fini - Disable Writeback and free memory
1425 *
1426 * @adev: amdgpu_device pointer
1427 *
1428 * Disables Writeback and frees the Writeback memory (all asics).
1429 * Used at driver shutdown.
1430 */
1431static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1432{
1433 if (adev->wb.wb_obj) {
1434 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1435 &adev->wb.gpu_addr,
1436 (void **)&adev->wb.wb);
1437 adev->wb.wb_obj = NULL;
1438 }
1439}
1440
1441/**
1442 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1443 *
1444 * @adev: amdgpu_device pointer
1445 *
1446 * Initializes writeback and allocates writeback memory (all asics).
1447 * Used at driver startup.
 1448 * Returns 0 on success or a negative error code on failure.
1449 */
1450static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1451{
1452 int r;
1453
1454 if (adev->wb.wb_obj == NULL) {
1455 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1456 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1457 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1458 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1459 (void **)&adev->wb.wb);
1460 if (r) {
1461 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1462 return r;
1463 }
1464
1465 adev->wb.num_wb = AMDGPU_MAX_WB;
1466 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1467
1468 /* clear wb memory */
1469 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1470 }
1471
1472 return 0;
1473}
1474
1475/**
1476 * amdgpu_device_wb_get - Allocate a wb entry
1477 *
1478 * @adev: amdgpu_device pointer
1479 * @wb: wb index
1480 *
1481 * Allocate a wb slot for use by the driver (all asics).
1482 * Returns 0 on success or -EINVAL on failure.
1483 */
1484int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1485{
1486 unsigned long flags, offset;
1487
1488 spin_lock_irqsave(&adev->wb.lock, flags);
1489 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1490 if (offset < adev->wb.num_wb) {
1491 __set_bit(offset, adev->wb.used);
1492 spin_unlock_irqrestore(&adev->wb.lock, flags);
1493 *wb = offset << 3; /* convert to dw offset */
1494 return 0;
1495 } else {
1496 spin_unlock_irqrestore(&adev->wb.lock, flags);
1497 return -EINVAL;
1498 }
1499}
1500
1501/**
1502 * amdgpu_device_wb_free - Free a wb entry
1503 *
1504 * @adev: amdgpu_device pointer
1505 * @wb: wb index
1506 *
1507 * Free a wb slot allocated for use by the driver (all asics)
1508 */
1509void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1510{
1511 unsigned long flags;
1512
1513 wb >>= 3;
1514 spin_lock_irqsave(&adev->wb.lock, flags);
1515 if (wb < adev->wb.num_wb)
1516 __clear_bit(wb, adev->wb.used);
1517 spin_unlock_irqrestore(&adev->wb.lock, flags);
1518}
1519
1520/**
1521 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1522 *
1523 * @adev: amdgpu_device pointer
1524 *
1525 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 1526 * to fail, but if any of the BARs is not accessible after the resize we abort
1527 * driver loading by returning -ENODEV.
1528 */
1529int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1530{
1531 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1532 struct pci_bus *root;
1533 struct resource *res;
1534 unsigned int i;
1535 u16 cmd;
1536 int r;
1537
1538 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1539 return 0;
1540
1541 /* Bypass for VF */
1542 if (amdgpu_sriov_vf(adev))
1543 return 0;
1544
1545 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1546 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1547 DRM_WARN("System can't access extended configuration space, please check!!\n");
1548
1549 /* skip if the bios has already enabled large BAR */
1550 if (adev->gmc.real_vram_size &&
1551 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1552 return 0;
1553
1554 /* Check if the root BUS has 64bit memory resources */
1555 root = adev->pdev->bus;
1556 while (root->parent)
1557 root = root->parent;
1558
1559 pci_bus_for_each_resource(root, res, i) {
1560 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1561 res->start > 0x100000000ull)
1562 break;
1563 }
1564
1565 /* Trying to resize is pointless without a root hub window above 4GB */
1566 if (!res)
1567 return 0;
1568
1569 /* Limit the BAR size to what is available */
1570 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1571 rbar_size);
1572
1573 /* Disable memory decoding while we change the BAR addresses and size */
1574 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1575 pci_write_config_word(adev->pdev, PCI_COMMAND,
1576 cmd & ~PCI_COMMAND_MEMORY);
1577
1578 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1579 amdgpu_doorbell_fini(adev);
1580 if (adev->asic_type >= CHIP_BONAIRE)
1581 pci_release_resource(adev->pdev, 2);
1582
1583 pci_release_resource(adev->pdev, 0);
1584
1585 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1586 if (r == -ENOSPC)
1587 DRM_INFO("Not enough PCI address space for a large BAR.");
1588 else if (r && r != -ENOTSUPP)
1589 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1590
1591 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1592
1593 /* When the doorbell or fb BAR isn't available we have no chance of
1594 * using the device.
1595 */
1596 r = amdgpu_doorbell_init(adev);
1597 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1598 return -ENODEV;
1599
1600 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1601
1602 return 0;
1603}
1604
1605static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1606{
1607 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1608 return false;
1609
1610 return true;
1611}
1612
1613/*
1614 * GPU helpers function.
1615 */
1616/**
1617 * amdgpu_device_need_post - check if the hw need post or not
1618 *
1619 * @adev: amdgpu_device pointer
1620 *
 1621 * Check if the asic needs to be posted, either because it has not been
 1622 * initialized at driver startup or because a hw reset was performed (all asics).
 1623 * Returns true if post is needed, false if not.
1624 */
1625bool amdgpu_device_need_post(struct amdgpu_device *adev)
1626{
1627 uint32_t reg;
1628
1629 if (amdgpu_sriov_vf(adev))
1630 return false;
1631
1632 if (!amdgpu_device_read_bios(adev))
1633 return false;
1634
1635 if (amdgpu_passthrough(adev)) {
 1636 /* for FIJI: in the whole-GPU pass-through virtualization case, after a VM
 1637 * reboot some old SMC firmware still needs the driver to do a vPost or the
 1638 * GPU hangs; SMC firmware versions 22.15 and above don't have this flaw, so
 1639 * force a vPost for SMC versions below 22.15
1640 */
1641 if (adev->asic_type == CHIP_FIJI) {
1642 int err;
1643 uint32_t fw_ver;
1644
1645 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
 1646 /* force vPost if an error occurred */
1647 if (err)
1648 return true;
1649
1650 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1651 release_firmware(adev->pm.fw);
1652 if (fw_ver < 0x00160e00)
1653 return true;
1654 }
1655 }
1656
1657 /* Don't post if we need to reset whole hive on init */
1658 if (adev->gmc.xgmi.pending_reset)
1659 return false;
1660
1661 if (adev->has_hw_reset) {
1662 adev->has_hw_reset = false;
1663 return true;
1664 }
1665
1666 /* bios scratch used on CIK+ */
1667 if (adev->asic_type >= CHIP_BONAIRE)
1668 return amdgpu_atombios_scratch_need_asic_init(adev);
1669
1670 /* check MEM_SIZE for older asics */
1671 reg = amdgpu_asic_get_config_memsize(adev);
1672
1673 if ((reg != 0) && (reg != 0xffffffff))
1674 return false;
1675
1676 return true;
1677}
1678
1679/*
1680 * Check whether seamless boot is supported.
1681 *
1682 * So far we only support seamless boot on DCE 3.0 or later.
 1683 * If users report that it works on older ASICs as well, we may
1684 * loosen this.
1685 */
1686bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1687{
1688 switch (amdgpu_seamless) {
1689 case -1:
1690 break;
1691 case 1:
1692 return true;
1693 case 0:
1694 return false;
1695 default:
1696 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1697 amdgpu_seamless);
1698 return false;
1699 }
1700
1701 if (!(adev->flags & AMD_IS_APU))
1702 return false;
1703
1704 if (adev->mman.keep_stolen_vga_memory)
1705 return false;
1706
1707 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1708}
1709
1710/*
1711 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1712 * don't support dynamic speed switching. Until we have confirmation from Intel
1713 * that a specific host supports it, it's safer that we keep it disabled for all.
1714 *
1715 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1716 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1717 */
1718static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1719{
1720#if IS_ENABLED(CONFIG_X86)
1721 struct cpuinfo_x86 *c = &cpu_data(0);
1722
1723 /* eGPU change speeds based on USB4 fabric conditions */
1724 if (dev_is_removable(adev->dev))
1725 return true;
1726
1727 if (c->x86_vendor == X86_VENDOR_INTEL)
1728 return false;
1729#endif
1730 return true;
1731}
1732
1733/**
1734 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1735 *
1736 * @adev: amdgpu_device pointer
1737 *
1738 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1739 * be set for this device.
1740 *
1741 * Returns true if it should be used or false if not.
1742 */
1743bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1744{
1745 switch (amdgpu_aspm) {
1746 case -1:
1747 break;
1748 case 0:
1749 return false;
1750 case 1:
1751 return true;
1752 default:
1753 return false;
1754 }
1755 if (adev->flags & AMD_IS_APU)
1756 return false;
1757 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
1758 return false;
1759 return pcie_aspm_enabled(adev->pdev);
1760}
1761
1762/* if we get transitioned to only one device, take VGA back */
1763/**
1764 * amdgpu_device_vga_set_decode - enable/disable vga decode
1765 *
1766 * @pdev: PCI device pointer
1767 * @state: enable/disable vga decode
1768 *
1769 * Enable/disable vga decode (all asics).
1770 * Returns VGA resource flags.
1771 */
1772static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1773 bool state)
1774{
1775 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1776
1777 amdgpu_asic_set_vga_state(adev, state);
1778 if (state)
1779 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1780 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1781 else
1782 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1783}
1784
1785/**
1786 * amdgpu_device_check_block_size - validate the vm block size
1787 *
1788 * @adev: amdgpu_device pointer
1789 *
1790 * Validates the vm block size specified via module parameter.
 1791 * The vm block size defines the number of bits in page table versus page directory,
1792 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1793 * page table and the remaining bits are in the page directory.
1794 */
1795static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1796{
1797 /* defines number of bits in page table versus page directory,
1798 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1799 * page table and the remaining bits are in the page directory
1800 */
1801 if (amdgpu_vm_block_size == -1)
1802 return;
1803
1804 if (amdgpu_vm_block_size < 9) {
1805 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1806 amdgpu_vm_block_size);
1807 amdgpu_vm_block_size = -1;
1808 }
1809}
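/*
 * Worked example (illustrative): with 4KB pages there is a 12-bit in-page
 * offset, so a block size of 9 gives 9 bits of page table index, i.e. each
 * page directory entry then covers 2^(12+9) = 2MB of address space.
 */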
1810
1811/**
1812 * amdgpu_device_check_vm_size - validate the vm size
1813 *
1814 * @adev: amdgpu_device pointer
1815 *
1816 * Validates the vm size in GB specified via module parameter.
1817 * The VM size is the size of the GPU virtual memory space in GB.
1818 */
1819static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1820{
1821 /* no need to check the default value */
1822 if (amdgpu_vm_size == -1)
1823 return;
1824
1825 if (amdgpu_vm_size < 1) {
1826 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1827 amdgpu_vm_size);
1828 amdgpu_vm_size = -1;
1829 }
1830}
1831
1832static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1833{
1834 struct sysinfo si;
1835 bool is_os_64 = (sizeof(void *) == 8);
1836 uint64_t total_memory;
1837 uint64_t dram_size_seven_GB = 0x1B8000000;
1838 uint64_t dram_size_three_GB = 0xB8000000;
1839
1840 if (amdgpu_smu_memory_pool_size == 0)
1841 return;
1842
1843 if (!is_os_64) {
1844 DRM_WARN("Not 64-bit OS, feature not supported\n");
1845 goto def_value;
1846 }
1847 si_meminfo(&si);
1848 total_memory = (uint64_t)si.totalram * si.mem_unit;
1849
1850 if ((amdgpu_smu_memory_pool_size == 1) ||
1851 (amdgpu_smu_memory_pool_size == 2)) {
1852 if (total_memory < dram_size_three_GB)
1853 goto def_value1;
1854 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1855 (amdgpu_smu_memory_pool_size == 8)) {
1856 if (total_memory < dram_size_seven_GB)
1857 goto def_value1;
1858 } else {
1859 DRM_WARN("Smu memory pool size not supported\n");
1860 goto def_value;
1861 }
1862 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1863
1864 return;
1865
1866def_value1:
1867 DRM_WARN("No enough system memory\n");
1868def_value:
1869 adev->pm.smu_prv_buffer_size = 0;
1870}
1871
1872static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1873{
1874 if (!(adev->flags & AMD_IS_APU) ||
1875 adev->asic_type < CHIP_RAVEN)
1876 return 0;
1877
1878 switch (adev->asic_type) {
1879 case CHIP_RAVEN:
1880 if (adev->pdev->device == 0x15dd)
1881 adev->apu_flags |= AMD_APU_IS_RAVEN;
1882 if (adev->pdev->device == 0x15d8)
1883 adev->apu_flags |= AMD_APU_IS_PICASSO;
1884 break;
1885 case CHIP_RENOIR:
1886 if ((adev->pdev->device == 0x1636) ||
1887 (adev->pdev->device == 0x164c))
1888 adev->apu_flags |= AMD_APU_IS_RENOIR;
1889 else
1890 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1891 break;
1892 case CHIP_VANGOGH:
1893 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1894 break;
1895 case CHIP_YELLOW_CARP:
1896 break;
1897 case CHIP_CYAN_SKILLFISH:
1898 if ((adev->pdev->device == 0x13FE) ||
1899 (adev->pdev->device == 0x143F))
1900 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1901 break;
1902 default:
1903 break;
1904 }
1905
1906 return 0;
1907}
1908
1909/**
1910 * amdgpu_device_check_arguments - validate module params
1911 *
1912 * @adev: amdgpu_device pointer
1913 *
1914 * Validates certain module parameters and updates
1915 * the associated values used by the driver (all asics).
1916 */
1917static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1918{
1919 if (amdgpu_sched_jobs < 4) {
1920 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1921 amdgpu_sched_jobs);
1922 amdgpu_sched_jobs = 4;
1923 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1924 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1925 amdgpu_sched_jobs);
1926 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1927 }
1928
1929 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1930 /* gart size must be greater or equal to 32M */
1931 dev_warn(adev->dev, "gart size (%d) too small\n",
1932 amdgpu_gart_size);
1933 amdgpu_gart_size = -1;
1934 }
1935
1936 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1937 /* gtt size must be greater or equal to 32M */
1938 dev_warn(adev->dev, "gtt size (%d) too small\n",
1939 amdgpu_gtt_size);
1940 amdgpu_gtt_size = -1;
1941 }
1942
1943 /* valid range is between 4 and 9 inclusive */
1944 if (amdgpu_vm_fragment_size != -1 &&
1945 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1946 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1947 amdgpu_vm_fragment_size = -1;
1948 }
1949
1950 if (amdgpu_sched_hw_submission < 2) {
1951 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1952 amdgpu_sched_hw_submission);
1953 amdgpu_sched_hw_submission = 2;
1954 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1955 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1956 amdgpu_sched_hw_submission);
1957 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1958 }
1959
1960 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1961 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1962 amdgpu_reset_method = -1;
1963 }
1964
1965 amdgpu_device_check_smu_prv_buffer_size(adev);
1966
1967 amdgpu_device_check_vm_size(adev);
1968
1969 amdgpu_device_check_block_size(adev);
1970
1971 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1972
1973 return 0;
1974}
1975
1976/**
1977 * amdgpu_switcheroo_set_state - set switcheroo state
1978 *
1979 * @pdev: pci dev pointer
1980 * @state: vga_switcheroo state
1981 *
1982 * Callback for the switcheroo driver. Suspends or resumes
1983 * the asics before or after it is powered up using ACPI methods.
1984 */
1985static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1986 enum vga_switcheroo_state state)
1987{
1988 struct drm_device *dev = pci_get_drvdata(pdev);
1989 int r;
1990
1991 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1992 return;
1993
1994 if (state == VGA_SWITCHEROO_ON) {
1995 pr_info("switched on\n");
1996 /* don't suspend or resume card normally */
1997 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1998
1999 pci_set_power_state(pdev, PCI_D0);
2000 amdgpu_device_load_pci_state(pdev);
2001 r = pci_enable_device(pdev);
2002 if (r)
2003 DRM_WARN("pci_enable_device failed (%d)\n", r);
2004 amdgpu_device_resume(dev, true);
2005
2006 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2007 } else {
2008 pr_info("switched off\n");
2009 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2010 amdgpu_device_prepare(dev);
2011 amdgpu_device_suspend(dev, true);
2012 amdgpu_device_cache_pci_state(pdev);
2013 /* Shut down the device */
2014 pci_disable_device(pdev);
2015 pci_set_power_state(pdev, PCI_D3cold);
2016 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2017 }
2018}
2019
2020/**
2021 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2022 *
2023 * @pdev: pci dev pointer
2024 *
2025 * Callback for the switcheroo driver. Checks whether the switcheroo
2026 * state can be changed.
2027 * Returns true if the state can be changed, false if not.
2028 */
2029static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2030{
2031 struct drm_device *dev = pci_get_drvdata(pdev);
2032
2033 /*
2034 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2035 * locking inversion with the driver load path. And the access here is
2036 * completely racy anyway. So don't bother with locking for now.
2037 */
2038 return atomic_read(&dev->open_count) == 0;
2039}
2040
2041static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2042 .set_gpu_state = amdgpu_switcheroo_set_state,
2043 .reprobe = NULL,
2044 .can_switch = amdgpu_switcheroo_can_switch,
2045};
2046
2047/**
2048 * amdgpu_device_ip_set_clockgating_state - set the CG state
2049 *
2050 * @dev: amdgpu_device pointer
2051 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2052 * @state: clockgating state (gate or ungate)
2053 *
2054 * Sets the requested clockgating state for all instances of
2055 * the hardware IP specified.
2056 * Returns the error code from the last instance.
2057 */
2058int amdgpu_device_ip_set_clockgating_state(void *dev,
2059 enum amd_ip_block_type block_type,
2060 enum amd_clockgating_state state)
2061{
2062 struct amdgpu_device *adev = dev;
2063 int i, r = 0;
2064
2065 for (i = 0; i < adev->num_ip_blocks; i++) {
2066 if (!adev->ip_blocks[i].status.valid)
2067 continue;
2068 if (adev->ip_blocks[i].version->type != block_type)
2069 continue;
2070 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2071 continue;
2072 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2073 (void *)adev, state);
2074 if (r)
2075 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2076 adev->ip_blocks[i].version->funcs->name, r);
2077 }
2078 return r;
2079}
2080
2081/**
2082 * amdgpu_device_ip_set_powergating_state - set the PG state
2083 *
2084 * @dev: amdgpu_device pointer
2085 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2086 * @state: powergating state (gate or ungate)
2087 *
2088 * Sets the requested powergating state for all instances of
2089 * the hardware IP specified.
2090 * Returns the error code from the last instance.
2091 */
2092int amdgpu_device_ip_set_powergating_state(void *dev,
2093 enum amd_ip_block_type block_type,
2094 enum amd_powergating_state state)
2095{
2096 struct amdgpu_device *adev = dev;
2097 int i, r = 0;
2098
2099 for (i = 0; i < adev->num_ip_blocks; i++) {
2100 if (!adev->ip_blocks[i].status.valid)
2101 continue;
2102 if (adev->ip_blocks[i].version->type != block_type)
2103 continue;
2104 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2105 continue;
2106 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2107 (void *)adev, state);
2108 if (r)
2109 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2110 adev->ip_blocks[i].version->funcs->name, r);
2111 }
2112 return r;
2113}
2114
2115/**
2116 * amdgpu_device_ip_get_clockgating_state - get the CG state
2117 *
2118 * @adev: amdgpu_device pointer
2119 * @flags: clockgating feature flags
2120 *
2121 * Walks the list of IPs on the device and updates the clockgating
2122 * flags for each IP.
2123 * Updates @flags with the feature flags for each hardware IP where
2124 * clockgating is enabled.
2125 */
2126void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2127 u64 *flags)
2128{
2129 int i;
2130
2131 for (i = 0; i < adev->num_ip_blocks; i++) {
2132 if (!adev->ip_blocks[i].status.valid)
2133 continue;
2134 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2135 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
2136 }
2137}
2138
2139/**
2140 * amdgpu_device_ip_wait_for_idle - wait for idle
2141 *
2142 * @adev: amdgpu_device pointer
2143 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2144 *
2145 * Waits for the requested hardware IP to be idle.
2146 * Returns 0 for success or a negative error code on failure.
2147 */
2148int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2149 enum amd_ip_block_type block_type)
2150{
2151 int i, r;
2152
2153 for (i = 0; i < adev->num_ip_blocks; i++) {
2154 if (!adev->ip_blocks[i].status.valid)
2155 continue;
2156 if (adev->ip_blocks[i].version->type == block_type) {
2157 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
2158 if (r)
2159 return r;
2160 break;
2161 }
2162 }
2163 return 0;
2164
2165}
2166
2167/**
2168 * amdgpu_device_ip_is_idle - is the hardware IP idle
2169 *
2170 * @adev: amdgpu_device pointer
2171 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2172 *
2173 * Check if the hardware IP is idle or not.
2174 * Returns true if the IP is idle, false if not.
2175 */
2176bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
2177 enum amd_ip_block_type block_type)
2178{
2179 int i;
2180
2181 for (i = 0; i < adev->num_ip_blocks; i++) {
2182 if (!adev->ip_blocks[i].status.valid)
2183 continue;
2184 if (adev->ip_blocks[i].version->type == block_type)
2185 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
2186 }
2187 return true;
2188
2189}
2190
2191/**
2192 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2193 *
2194 * @adev: amdgpu_device pointer
2195 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2196 *
2197 * Returns a pointer to the hardware IP block structure
2198 * if it exists for the asic, otherwise NULL.
2199 */
2200struct amdgpu_ip_block *
2201amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2202 enum amd_ip_block_type type)
2203{
2204 int i;
2205
2206 for (i = 0; i < adev->num_ip_blocks; i++)
2207 if (adev->ip_blocks[i].version->type == type)
2208 return &adev->ip_blocks[i];
2209
2210 return NULL;
2211}
2212
2213/**
2214 * amdgpu_device_ip_block_version_cmp
2215 *
2216 * @adev: amdgpu_device pointer
2217 * @type: enum amd_ip_block_type
2218 * @major: major version
2219 * @minor: minor version
2220 *
2221 * return 0 if equal or greater
2222 * return 1 if smaller or the ip_block doesn't exist
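 * For example, amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 9, 0)
 * returns 0 when the GFX IP block is present at version 9.0 or newer.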
2223 */
2224int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2225 enum amd_ip_block_type type,
2226 u32 major, u32 minor)
2227{
2228 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2229
2230 if (ip_block && ((ip_block->version->major > major) ||
2231 ((ip_block->version->major == major) &&
2232 (ip_block->version->minor >= minor))))
2233 return 0;
2234
2235 return 1;
2236}
2237
2238/**
2239 * amdgpu_device_ip_block_add
2240 *
2241 * @adev: amdgpu_device pointer
2242 * @ip_block_version: pointer to the IP to add
2243 *
2244 * Adds the IP block driver information to the collection of IPs
2245 * on the asic.
2246 */
2247int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2248 const struct amdgpu_ip_block_version *ip_block_version)
2249{
2250 if (!ip_block_version)
2251 return -EINVAL;
2252
2253 switch (ip_block_version->type) {
2254 case AMD_IP_BLOCK_TYPE_VCN:
2255 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2256 return 0;
2257 break;
2258 case AMD_IP_BLOCK_TYPE_JPEG:
2259 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2260 return 0;
2261 break;
2262 default:
2263 break;
2264 }
2265
2266 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
2267 ip_block_version->funcs->name);
2268
2269 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2270
2271 return 0;
2272}
2273
2274/**
2275 * amdgpu_device_enable_virtual_display - enable virtual display feature
2276 *
2277 * @adev: amdgpu_device pointer
2278 *
2279 * Enables the virtual display feature if the user has enabled it via
2280 * the module parameter virtual_display. This feature provides virtual
2281 * display hardware on headless boards or in virtualized environments.
2282 * This function parses and validates the configuration string specified by
2283 * the user and configures the virtual display configuration (number of
2284 * virtual connectors, crtcs, etc.) specified.
2285 */
2286static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2287{
2288 adev->enable_virtual_display = false;
2289
2290 if (amdgpu_virtual_display) {
2291 const char *pci_address_name = pci_name(adev->pdev);
2292 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2293
2294 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2295 pciaddstr_tmp = pciaddstr;
2296 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2297 pciaddname = strsep(&pciaddname_tmp, ",");
2298 if (!strcmp("all", pciaddname)
2299 || !strcmp(pci_address_name, pciaddname)) {
2300 long num_crtc;
2301 int res = -1;
2302
2303 adev->enable_virtual_display = true;
2304
2305 if (pciaddname_tmp)
2306 res = kstrtol(pciaddname_tmp, 10,
2307 &num_crtc);
2308
2309 if (!res) {
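					/* clamp the requested number of virtual crtcs to the supported range [1, 6] */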
2310 if (num_crtc < 1)
2311 num_crtc = 1;
2312 if (num_crtc > 6)
2313 num_crtc = 6;
2314 adev->mode_info.num_crtc = num_crtc;
2315 } else {
2316 adev->mode_info.num_crtc = 1;
2317 }
2318 break;
2319 }
2320 }
2321
2322 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2323 amdgpu_virtual_display, pci_address_name,
2324 adev->enable_virtual_display, adev->mode_info.num_crtc);
2325
2326 kfree(pciaddstr);
2327 }
2328}
2329
2330void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2331{
2332 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2333 adev->mode_info.num_crtc = 1;
2334 adev->enable_virtual_display = true;
2335 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2336 adev->enable_virtual_display, adev->mode_info.num_crtc);
2337 }
2338}
2339
2340/**
2341 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2342 *
2343 * @adev: amdgpu_device pointer
2344 *
2345 * Parses the asic configuration parameters specified in the gpu info
2346 * firmware and makes them available to the driver for use in configuring
2347 * the asic.
2348 * Returns 0 on success, -EINVAL on failure.
2349 */
2350static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2351{
2352 const char *chip_name;
2353 int err;
2354 const struct gpu_info_firmware_header_v1_0 *hdr;
2355
2356 adev->firmware.gpu_info_fw = NULL;
2357
2358 if (adev->mman.discovery_bin)
2359 return 0;
2360
2361 switch (adev->asic_type) {
2362 default:
2363 return 0;
2364 case CHIP_VEGA10:
2365 chip_name = "vega10";
2366 break;
2367 case CHIP_VEGA12:
2368 chip_name = "vega12";
2369 break;
2370 case CHIP_RAVEN:
2371 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2372 chip_name = "raven2";
2373 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2374 chip_name = "picasso";
2375 else
2376 chip_name = "raven";
2377 break;
2378 case CHIP_ARCTURUS:
2379 chip_name = "arcturus";
2380 break;
2381 case CHIP_NAVI12:
2382 chip_name = "navi12";
2383 break;
2384 }
2385
2386 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2387 "amdgpu/%s_gpu_info.bin", chip_name);
2388 if (err) {
2389 dev_err(adev->dev,
2390 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2391 chip_name);
2392 goto out;
2393 }
2394
2395 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2396 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2397
2398 switch (hdr->version_major) {
2399 case 1:
2400 {
2401 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2402 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2403 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2404
2405 /*
2406			 * Should be dropped when DAL no longer needs it.
2407 */
2408 if (adev->asic_type == CHIP_NAVI12)
2409 goto parse_soc_bounding_box;
2410
2411 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2412 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2413 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2414 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2415 adev->gfx.config.max_texture_channel_caches =
2416 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2417 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2418 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2419 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2420 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2421 adev->gfx.config.double_offchip_lds_buf =
2422 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2423 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2424 adev->gfx.cu_info.max_waves_per_simd =
2425 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2426 adev->gfx.cu_info.max_scratch_slots_per_cu =
2427 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2428 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2429 if (hdr->version_minor >= 1) {
2430 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2431 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2432 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2433 adev->gfx.config.num_sc_per_sh =
2434 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2435 adev->gfx.config.num_packer_per_sc =
2436 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2437 }
2438
2439parse_soc_bounding_box:
2440 /*
2441			 * The soc bounding box info is not integrated into the discovery table,
2442			 * so we always need to parse it from the gpu info firmware when needed.
2443 */
2444 if (hdr->version_minor == 2) {
2445 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2446 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2447 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2448 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2449 }
2450 break;
2451 }
2452 default:
2453 dev_err(adev->dev,
2454 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2455 err = -EINVAL;
2456 goto out;
2457 }
2458out:
2459 return err;
2460}
2461
2462/**
2463 * amdgpu_device_ip_early_init - run early init for hardware IPs
2464 *
2465 * @adev: amdgpu_device pointer
2466 *
2467 * Early initialization pass for hardware IPs. The hardware IPs that make
2468 * up each asic are discovered and each IP's early_init callback is run. This
2469 * is the first stage in initializing the asic.
2470 * Returns 0 on success, negative error code on failure.
2471 */
2472static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2473{
2474 struct pci_dev *parent;
2475 int i, r;
2476 bool total;
2477
2478 amdgpu_device_enable_virtual_display(adev);
2479
2480 if (amdgpu_sriov_vf(adev)) {
2481 r = amdgpu_virt_request_full_gpu(adev, true);
2482 if (r)
2483 return r;
2484 }
2485
2486 switch (adev->asic_type) {
2487#ifdef CONFIG_DRM_AMDGPU_SI
2488 case CHIP_VERDE:
2489 case CHIP_TAHITI:
2490 case CHIP_PITCAIRN:
2491 case CHIP_OLAND:
2492 case CHIP_HAINAN:
2493 adev->family = AMDGPU_FAMILY_SI;
2494 r = si_set_ip_blocks(adev);
2495 if (r)
2496 return r;
2497 break;
2498#endif
2499#ifdef CONFIG_DRM_AMDGPU_CIK
2500 case CHIP_BONAIRE:
2501 case CHIP_HAWAII:
2502 case CHIP_KAVERI:
2503 case CHIP_KABINI:
2504 case CHIP_MULLINS:
2505 if (adev->flags & AMD_IS_APU)
2506 adev->family = AMDGPU_FAMILY_KV;
2507 else
2508 adev->family = AMDGPU_FAMILY_CI;
2509
2510 r = cik_set_ip_blocks(adev);
2511 if (r)
2512 return r;
2513 break;
2514#endif
2515 case CHIP_TOPAZ:
2516 case CHIP_TONGA:
2517 case CHIP_FIJI:
2518 case CHIP_POLARIS10:
2519 case CHIP_POLARIS11:
2520 case CHIP_POLARIS12:
2521 case CHIP_VEGAM:
2522 case CHIP_CARRIZO:
2523 case CHIP_STONEY:
2524 if (adev->flags & AMD_IS_APU)
2525 adev->family = AMDGPU_FAMILY_CZ;
2526 else
2527 adev->family = AMDGPU_FAMILY_VI;
2528
2529 r = vi_set_ip_blocks(adev);
2530 if (r)
2531 return r;
2532 break;
2533 default:
2534 r = amdgpu_discovery_set_ip_blocks(adev);
2535 if (r)
2536 return r;
2537 break;
2538 }
2539
2540 if (amdgpu_has_atpx() &&
2541 (amdgpu_is_atpx_hybrid() ||
2542 amdgpu_has_atpx_dgpu_power_cntl()) &&
2543 ((adev->flags & AMD_IS_APU) == 0) &&
2544 !dev_is_removable(&adev->pdev->dev))
2545 adev->flags |= AMD_IS_PX;
2546
2547 if (!(adev->flags & AMD_IS_APU)) {
2548 parent = pcie_find_root_port(adev->pdev);
2549 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2550 }
2551
2552
2553 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2554 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2555 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2556 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2557 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2558 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2559 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2560
2561 total = true;
2562 for (i = 0; i < adev->num_ip_blocks; i++) {
2563 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2564 DRM_WARN("disabled ip block: %d <%s>\n",
2565 i, adev->ip_blocks[i].version->funcs->name);
2566 adev->ip_blocks[i].status.valid = false;
2567 } else {
2568 if (adev->ip_blocks[i].version->funcs->early_init) {
2569 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2570 if (r == -ENOENT) {
2571 adev->ip_blocks[i].status.valid = false;
2572 } else if (r) {
2573 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2574 adev->ip_blocks[i].version->funcs->name, r);
2575 total = false;
2576 } else {
2577 adev->ip_blocks[i].status.valid = true;
2578 }
2579 } else {
2580 adev->ip_blocks[i].status.valid = true;
2581 }
2582 }
2583 /* get the vbios after the asic_funcs are set up */
2584 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2585 r = amdgpu_device_parse_gpu_info_fw(adev);
2586 if (r)
2587 return r;
2588
2589 /* Read BIOS */
2590 if (amdgpu_device_read_bios(adev)) {
2591 if (!amdgpu_get_bios(adev))
2592 return -EINVAL;
2593
2594 r = amdgpu_atombios_init(adev);
2595 if (r) {
2596 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2597 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2598 return r;
2599 }
2600 }
2601
2602		/* get pf2vf msg info at its earliest time */
2603 if (amdgpu_sriov_vf(adev))
2604 amdgpu_virt_init_data_exchange(adev);
2605
2606 }
2607 }
2608 if (!total)
2609 return -ENODEV;
2610
2611 amdgpu_amdkfd_device_probe(adev);
2612 adev->cg_flags &= amdgpu_cg_mask;
2613 adev->pg_flags &= amdgpu_pg_mask;
2614
2615 return 0;
2616}
2617
2618static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2619{
2620 int i, r;
2621
2622 for (i = 0; i < adev->num_ip_blocks; i++) {
2623 if (!adev->ip_blocks[i].status.sw)
2624 continue;
2625 if (adev->ip_blocks[i].status.hw)
2626 continue;
2627 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2628 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2629 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2630 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2631 if (r) {
2632 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2633 adev->ip_blocks[i].version->funcs->name, r);
2634 return r;
2635 }
2636 adev->ip_blocks[i].status.hw = true;
2637 }
2638 }
2639
2640 return 0;
2641}
2642
2643static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2644{
2645 int i, r;
2646
2647 for (i = 0; i < adev->num_ip_blocks; i++) {
2648 if (!adev->ip_blocks[i].status.sw)
2649 continue;
2650 if (adev->ip_blocks[i].status.hw)
2651 continue;
2652 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2653 if (r) {
2654 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2655 adev->ip_blocks[i].version->funcs->name, r);
2656 return r;
2657 }
2658 adev->ip_blocks[i].status.hw = true;
2659 }
2660
2661 return 0;
2662}
2663
2664static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2665{
2666 int r = 0;
2667 int i;
2668 uint32_t smu_version;
2669
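	/* PSP-based firmware loading only applies to VEGA10 and later ASICs */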
2670 if (adev->asic_type >= CHIP_VEGA10) {
2671 for (i = 0; i < adev->num_ip_blocks; i++) {
2672 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2673 continue;
2674
2675 if (!adev->ip_blocks[i].status.sw)
2676 continue;
2677
2678			/* no need to do the fw loading again if already done */
2679 if (adev->ip_blocks[i].status.hw == true)
2680 break;
2681
2682 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2683 r = adev->ip_blocks[i].version->funcs->resume(adev);
2684 if (r) {
2685 DRM_ERROR("resume of IP block <%s> failed %d\n",
2686 adev->ip_blocks[i].version->funcs->name, r);
2687 return r;
2688 }
2689 } else {
2690 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2691 if (r) {
2692 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2693 adev->ip_blocks[i].version->funcs->name, r);
2694 return r;
2695 }
2696 }
2697
2698 adev->ip_blocks[i].status.hw = true;
2699 break;
2700 }
2701 }
2702
2703 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2704 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2705
2706 return r;
2707}
2708
2709static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2710{
2711 long timeout;
2712 int r, i;
2713
2714 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2715 struct amdgpu_ring *ring = adev->rings[i];
2716
2717		/* No need to set up the GPU scheduler for rings that don't need it */
2718 if (!ring || ring->no_scheduler)
2719 continue;
2720
2721 switch (ring->funcs->type) {
2722 case AMDGPU_RING_TYPE_GFX:
2723 timeout = adev->gfx_timeout;
2724 break;
2725 case AMDGPU_RING_TYPE_COMPUTE:
2726 timeout = adev->compute_timeout;
2727 break;
2728 case AMDGPU_RING_TYPE_SDMA:
2729 timeout = adev->sdma_timeout;
2730 break;
2731 default:
2732 timeout = adev->video_timeout;
2733 break;
2734 }
2735
2736 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
2737 DRM_SCHED_PRIORITY_COUNT,
2738 ring->num_hw_submission, 0,
2739 timeout, adev->reset_domain->wq,
2740 ring->sched_score, ring->name,
2741 adev->dev);
2742 if (r) {
2743 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2744 ring->name);
2745 return r;
2746 }
2747 r = amdgpu_uvd_entity_init(adev, ring);
2748 if (r) {
2749 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2750 ring->name);
2751 return r;
2752 }
2753 r = amdgpu_vce_entity_init(adev, ring);
2754 if (r) {
2755 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
2756 ring->name);
2757 return r;
2758 }
2759 }
2760
2761 amdgpu_xcp_update_partition_sched_list(adev);
2762
2763 return 0;
2764}
2765
2766
2767/**
2768 * amdgpu_device_ip_init - run init for hardware IPs
2769 *
2770 * @adev: amdgpu_device pointer
2771 *
2772 * Main initialization pass for hardware IPs. The list of all the hardware
2773 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2774 * are run. sw_init initializes the software state associated with each IP
2775 * and hw_init initializes the hardware associated with each IP.
2776 * Returns 0 on success, negative error code on failure.
2777 */
2778static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2779{
2780 int i, r;
2781
2782 r = amdgpu_ras_init(adev);
2783 if (r)
2784 return r;
2785
2786 for (i = 0; i < adev->num_ip_blocks; i++) {
2787 if (!adev->ip_blocks[i].status.valid)
2788 continue;
2789 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2790 if (r) {
2791 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2792 adev->ip_blocks[i].version->funcs->name, r);
2793 goto init_failed;
2794 }
2795 adev->ip_blocks[i].status.sw = true;
2796
2797 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2798 /* need to do common hw init early so everything is set up for gmc */
2799 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2800 if (r) {
2801 DRM_ERROR("hw_init %d failed %d\n", i, r);
2802 goto init_failed;
2803 }
2804 adev->ip_blocks[i].status.hw = true;
2805 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2806 /* need to do gmc hw init early so we can allocate gpu mem */
2807 /* Try to reserve bad pages early */
2808 if (amdgpu_sriov_vf(adev))
2809 amdgpu_virt_exchange_data(adev);
2810
2811 r = amdgpu_device_mem_scratch_init(adev);
2812 if (r) {
2813 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2814 goto init_failed;
2815 }
2816 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2817 if (r) {
2818 DRM_ERROR("hw_init %d failed %d\n", i, r);
2819 goto init_failed;
2820 }
2821 r = amdgpu_device_wb_init(adev);
2822 if (r) {
2823 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2824 goto init_failed;
2825 }
2826 adev->ip_blocks[i].status.hw = true;
2827
2828 /* right after GMC hw init, we create CSA */
2829 if (adev->gfx.mcbp) {
2830 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2831 AMDGPU_GEM_DOMAIN_VRAM |
2832 AMDGPU_GEM_DOMAIN_GTT,
2833 AMDGPU_CSA_SIZE);
2834 if (r) {
2835 DRM_ERROR("allocate CSA failed %d\n", r);
2836 goto init_failed;
2837 }
2838 }
2839
2840 r = amdgpu_seq64_init(adev);
2841 if (r) {
2842 DRM_ERROR("allocate seq64 failed %d\n", r);
2843 goto init_failed;
2844 }
2845 }
2846 }
2847
2848 if (amdgpu_sriov_vf(adev))
2849 amdgpu_virt_init_data_exchange(adev);
2850
2851 r = amdgpu_ib_pool_init(adev);
2852 if (r) {
2853 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2854 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2855 goto init_failed;
2856 }
2857
2858 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2859 if (r)
2860 goto init_failed;
2861
2862 r = amdgpu_device_ip_hw_init_phase1(adev);
2863 if (r)
2864 goto init_failed;
2865
2866 r = amdgpu_device_fw_loading(adev);
2867 if (r)
2868 goto init_failed;
2869
2870 r = amdgpu_device_ip_hw_init_phase2(adev);
2871 if (r)
2872 goto init_failed;
2873
2874 /*
2875	 * Retired pages will be loaded from eeprom and reserved here.
2876	 * This must be called after amdgpu_device_ip_hw_init_phase2, since
2877	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2878	 * functional for I2C communication, which is only true at this point.
2879	 *
2880	 * amdgpu_ras_recovery_init may fail, but the caller only cares about
2881	 * failures caused by a bad gpu situation and stops the amdgpu init
2882	 * process accordingly. For other failures, it still releases all
2883	 * the resources and prints an error message, rather than returning a
2884	 * negative value to the upper level.
2885	 *
2886	 * Note: theoretically, this should be called before all vram allocations
2887	 * to prevent the retired pages from being allocated and used.
2888 */
2889 r = amdgpu_ras_recovery_init(adev);
2890 if (r)
2891 goto init_failed;
2892
2893	/*
2894	 * In the case of XGMI, grab an extra reference on the reset domain for this device.
2895	 */
2896 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2897 if (amdgpu_xgmi_add_device(adev) == 0) {
2898 if (!amdgpu_sriov_vf(adev)) {
2899 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2900
2901 if (WARN_ON(!hive)) {
2902 r = -ENOENT;
2903 goto init_failed;
2904 }
2905
2906 if (!hive->reset_domain ||
2907 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2908 r = -ENOENT;
2909 amdgpu_put_xgmi_hive(hive);
2910 goto init_failed;
2911 }
2912
2913 /* Drop the early temporary reset domain we created for device */
2914 amdgpu_reset_put_reset_domain(adev->reset_domain);
2915 adev->reset_domain = hive->reset_domain;
2916 amdgpu_put_xgmi_hive(hive);
2917 }
2918 }
2919 }
2920
2921 r = amdgpu_device_init_schedulers(adev);
2922 if (r)
2923 goto init_failed;
2924
2925 if (adev->mman.buffer_funcs_ring->sched.ready)
2926 amdgpu_ttm_set_buffer_funcs_status(adev, true);
2927
2928	/* Don't init kfd if the whole hive needs to be reset during init */
2929 if (!adev->gmc.xgmi.pending_reset) {
2930 kgd2kfd_init_zone_device(adev);
2931 amdgpu_amdkfd_device_init(adev);
2932 }
2933
2934 amdgpu_fru_get_product_info(adev);
2935
2936init_failed:
2937
2938 return r;
2939}
2940
2941/**
2942 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2943 *
2944 * @adev: amdgpu_device pointer
2945 *
2946 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2947 * this function before a GPU reset. If the value is retained after a
2948 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2949 */
2950static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2951{
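	/* snapshot the start of the GART table so amdgpu_device_check_vram_lost()
	 * can compare against it after a reset
	 */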
2952 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2953}
2954
2955/**
2956 * amdgpu_device_check_vram_lost - check if vram is valid
2957 *
2958 * @adev: amdgpu_device pointer
2959 *
2960 * Checks the reset magic value written to the gart pointer in VRAM.
2961 * The driver calls this after a GPU reset to see if the contents of
2962 * VRAM have been lost or not.
2963 * returns true if vram is lost, false if not.
2964 */
2965static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2966{
2967 if (memcmp(adev->gart.ptr, adev->reset_magic,
2968 AMDGPU_RESET_MAGIC_NUM))
2969 return true;
2970
2971 if (!amdgpu_in_reset(adev))
2972 return false;
2973
2974 /*
2975 * For all ASICs with baco/mode1 reset, the VRAM is
2976 * always assumed to be lost.
2977 */
2978 switch (amdgpu_asic_reset_method(adev)) {
2979 case AMD_RESET_METHOD_BACO:
2980 case AMD_RESET_METHOD_MODE1:
2981 return true;
2982 default:
2983 return false;
2984 }
2985}
2986
2987/**
2988 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2989 *
2990 * @adev: amdgpu_device pointer
2991 * @state: clockgating state (gate or ungate)
2992 *
2993 * The list of all the hardware IPs that make up the asic is walked and the
2994 * set_clockgating_state callbacks are run.
2995 * During late init this is used to enable clockgating for hardware IPs;
2996 * during fini or suspend it is used to disable clockgating.
2997 * Returns 0 on success, negative error code on failure.
2998 */
2999
3000int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3001 enum amd_clockgating_state state)
3002{
3003 int i, j, r;
3004
3005 if (amdgpu_emu_mode == 1)
3006 return 0;
3007
3008 for (j = 0; j < adev->num_ip_blocks; j++) {
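		/* walk the IP list forward when gating and in reverse when ungating */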
3009 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3010 if (!adev->ip_blocks[i].status.late_initialized)
3011 continue;
3012 /* skip CG for GFX, SDMA on S0ix */
3013 if (adev->in_s0ix &&
3014 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3015 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3016 continue;
3017 /* skip CG for VCE/UVD, it's handled specially */
3018 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3019 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3020 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3021 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3022 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3023 /* enable clockgating to save power */
3024 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
3025 state);
3026 if (r) {
3027 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3028 adev->ip_blocks[i].version->funcs->name, r);
3029 return r;
3030 }
3031 }
3032 }
3033
3034 return 0;
3035}
3036
3037int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3038 enum amd_powergating_state state)
3039{
3040 int i, j, r;
3041
3042 if (amdgpu_emu_mode == 1)
3043 return 0;
3044
3045 for (j = 0; j < adev->num_ip_blocks; j++) {
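		/* as with clockgating: gate in forward order, ungate in reverse order */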
3046 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3047 if (!adev->ip_blocks[i].status.late_initialized)
3048 continue;
3049 /* skip PG for GFX, SDMA on S0ix */
3050 if (adev->in_s0ix &&
3051 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3052 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3053 continue;
3054		/* skip PG for VCE/UVD, it's handled specially */
3055 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3056 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3057 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3058 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3059 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3060 /* enable powergating to save power */
3061 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
3062 state);
3063 if (r) {
3064 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3065 adev->ip_blocks[i].version->funcs->name, r);
3066 return r;
3067 }
3068 }
3069 }
3070 return 0;
3071}
3072
3073static int amdgpu_device_enable_mgpu_fan_boost(void)
3074{
3075 struct amdgpu_gpu_instance *gpu_ins;
3076 struct amdgpu_device *adev;
3077 int i, ret = 0;
3078
3079 mutex_lock(&mgpu_info.mutex);
3080
3081 /*
3082 * MGPU fan boost feature should be enabled
3083 * only when there are two or more dGPUs in
3084 * the system
3085 */
3086 if (mgpu_info.num_dgpu < 2)
3087 goto out;
3088
3089 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3090 gpu_ins = &(mgpu_info.gpu_ins[i]);
3091 adev = gpu_ins->adev;
3092 if (!(adev->flags & AMD_IS_APU) &&
3093 !gpu_ins->mgpu_fan_enabled) {
3094 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3095 if (ret)
3096 break;
3097
3098 gpu_ins->mgpu_fan_enabled = 1;
3099 }
3100 }
3101
3102out:
3103 mutex_unlock(&mgpu_info.mutex);
3104
3105 return ret;
3106}
3107
3108/**
3109 * amdgpu_device_ip_late_init - run late init for hardware IPs
3110 *
3111 * @adev: amdgpu_device pointer
3112 *
3113 * Late initialization pass for hardware IPs. The list of all the hardware
3114 * IPs that make up the asic is walked and the late_init callbacks are run.
3115 * late_init covers any special initialization that an IP requires
3116 * after all of the IPs have been initialized or something that needs to happen
3117 * late in the init process.
3118 * Returns 0 on success, negative error code on failure.
3119 */
3120static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3121{
3122 struct amdgpu_gpu_instance *gpu_instance;
3123 int i = 0, r;
3124
3125 for (i = 0; i < adev->num_ip_blocks; i++) {
3126 if (!adev->ip_blocks[i].status.hw)
3127 continue;
3128 if (adev->ip_blocks[i].version->funcs->late_init) {
3129 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
3130 if (r) {
3131 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3132 adev->ip_blocks[i].version->funcs->name, r);
3133 return r;
3134 }
3135 }
3136 adev->ip_blocks[i].status.late_initialized = true;
3137 }
3138
3139 r = amdgpu_ras_late_init(adev);
3140 if (r) {
3141		DRM_ERROR("amdgpu_ras_late_init failed %d\n", r);
3142 return r;
3143 }
3144
3145 if (!amdgpu_in_reset(adev))
3146 amdgpu_ras_set_error_query_ready(adev, true);
3147
3148 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3149 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3150
3151 amdgpu_device_fill_reset_magic(adev);
3152
3153 r = amdgpu_device_enable_mgpu_fan_boost();
3154 if (r)
3155 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3156
3157	/* For passthrough configurations on arcturus and aldebaran, enable special handling for SBR */
3158 if (amdgpu_passthrough(adev) &&
3159 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3160 adev->asic_type == CHIP_ALDEBARAN))
3161 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3162
3163 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3164 mutex_lock(&mgpu_info.mutex);
3165
3166 /*
3167		 * Reset the device p-state to low, as this was booted with high.
3168		 *
3169		 * This should be performed only after all devices from the same
3170		 * hive have been initialized.
3171		 *
3172		 * However, the number of devices in the hive is not known in
3173		 * advance; it is counted one by one as the devices are initialized.
3174		 *
3175		 * So we wait until all XGMI-interlinked devices have been initialized.
3176		 * This may add some delay, as those devices may come from
3177		 * different hives, but that should be OK.
3178 */
3179 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3180 for (i = 0; i < mgpu_info.num_gpu; i++) {
3181 gpu_instance = &(mgpu_info.gpu_ins[i]);
3182 if (gpu_instance->adev->flags & AMD_IS_APU)
3183 continue;
3184
3185 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3186 AMDGPU_XGMI_PSTATE_MIN);
3187 if (r) {
3188 DRM_ERROR("pstate setting failed (%d).\n", r);
3189 break;
3190 }
3191 }
3192 }
3193
3194 mutex_unlock(&mgpu_info.mutex);
3195 }
3196
3197 return 0;
3198}
3199
3200/**
3201 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3202 *
3203 * @adev: amdgpu_device pointer
3204 *
3205 * For ASICs that need to disable the SMC first
3206 */
3207static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3208{
3209 int i, r;
3210
3211 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3212 return;
3213
3214 for (i = 0; i < adev->num_ip_blocks; i++) {
3215 if (!adev->ip_blocks[i].status.hw)
3216 continue;
3217 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3218 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3219 /* XXX handle errors */
3220 if (r) {
3221 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3222 adev->ip_blocks[i].version->funcs->name, r);
3223 }
3224 adev->ip_blocks[i].status.hw = false;
3225 break;
3226 }
3227 }
3228}
3229
3230static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3231{
3232 int i, r;
3233
3234 for (i = 0; i < adev->num_ip_blocks; i++) {
3235 if (!adev->ip_blocks[i].version->funcs->early_fini)
3236 continue;
3237
3238 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
3239 if (r) {
3240 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3241 adev->ip_blocks[i].version->funcs->name, r);
3242 }
3243 }
3244
3245 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3246 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3247
3248 amdgpu_amdkfd_suspend(adev, false);
3249
3250	/* Workaround for ASICs that need to disable the SMC first */
3251 amdgpu_device_smu_fini_early(adev);
3252
3253 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3254 if (!adev->ip_blocks[i].status.hw)
3255 continue;
3256
3257 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3258 /* XXX handle errors */
3259 if (r) {
3260 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3261 adev->ip_blocks[i].version->funcs->name, r);
3262 }
3263
3264 adev->ip_blocks[i].status.hw = false;
3265 }
3266
3267 if (amdgpu_sriov_vf(adev)) {
3268 if (amdgpu_virt_release_full_gpu(adev, false))
3269 DRM_ERROR("failed to release exclusive mode on fini\n");
3270 }
3271
3272 return 0;
3273}
3274
3275/**
3276 * amdgpu_device_ip_fini - run fini for hardware IPs
3277 *
3278 * @adev: amdgpu_device pointer
3279 *
3280 * Main teardown pass for hardware IPs. The list of all the hardware
3281 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3282 * are run. hw_fini tears down the hardware associated with each IP
3283 * and sw_fini tears down any software state associated with each IP.
3284 * Returns 0 on success, negative error code on failure.
3285 */
3286static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3287{
3288 int i, r;
3289
3290 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3291 amdgpu_virt_release_ras_err_handler_data(adev);
3292
3293 if (adev->gmc.xgmi.num_physical_nodes > 1)
3294 amdgpu_xgmi_remove_device(adev);
3295
3296 amdgpu_amdkfd_device_fini_sw(adev);
3297
3298 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3299 if (!adev->ip_blocks[i].status.sw)
3300 continue;
3301
3302 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3303 amdgpu_ucode_free_bo(adev);
3304 amdgpu_free_static_csa(&adev->virt.csa_obj);
3305 amdgpu_device_wb_fini(adev);
3306 amdgpu_device_mem_scratch_fini(adev);
3307 amdgpu_ib_pool_fini(adev);
3308 amdgpu_seq64_fini(adev);
3309 }
3310
3311 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
3312 /* XXX handle errors */
3313 if (r) {
3314 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3315 adev->ip_blocks[i].version->funcs->name, r);
3316 }
3317 adev->ip_blocks[i].status.sw = false;
3318 adev->ip_blocks[i].status.valid = false;
3319 }
3320
3321 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3322 if (!adev->ip_blocks[i].status.late_initialized)
3323 continue;
3324 if (adev->ip_blocks[i].version->funcs->late_fini)
3325 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
3326 adev->ip_blocks[i].status.late_initialized = false;
3327 }
3328
3329 amdgpu_ras_fini(adev);
3330
3331 return 0;
3332}
3333
3334/**
3335 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3336 *
3337 * @work: work_struct.
3338 */
3339static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3340{
3341 struct amdgpu_device *adev =
3342 container_of(work, struct amdgpu_device, delayed_init_work.work);
3343 int r;
3344
3345 r = amdgpu_ib_ring_tests(adev);
3346 if (r)
3347 DRM_ERROR("ib ring test failed (%d).\n", r);
3348}
3349
3350static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3351{
3352 struct amdgpu_device *adev =
3353 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3354
3355 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3356 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3357
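	/* only latch gfx_off_state if the SMU accepted the powergating request */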
3358 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
3359 adev->gfx.gfx_off_state = true;
3360}
3361
3362/**
3363 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3364 *
3365 * @adev: amdgpu_device pointer
3366 *
3367 * Main suspend function for hardware IPs. The list of all the hardware
3368 * IPs that make up the asic is walked, clockgating is disabled and the
3369 * suspend callbacks are run. suspend puts the hardware and software state
3370 * in each IP into a state suitable for suspend.
3371 * Returns 0 on success, negative error code on failure.
3372 */
3373static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3374{
3375 int i, r;
3376
3377 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3378 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3379
3380 /*
3381	 * Per the PMFW team's suggestion, the driver needs to handle disabling the
3382	 * gfxoff and df cstate features in gpu reset (e.g. Mode1Reset)
3383	 * scenarios. Add the missing df cstate disablement here.
3384 */
3385 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3386 dev_warn(adev->dev, "Failed to disallow df cstate");
3387
3388 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3389 if (!adev->ip_blocks[i].status.valid)
3390 continue;
3391
3392 /* displays are handled separately */
3393 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3394 continue;
3395
3396 /* XXX handle errors */
3397 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3398 /* XXX handle errors */
3399 if (r) {
3400 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3401 adev->ip_blocks[i].version->funcs->name, r);
3402 return r;
3403 }
3404
3405 adev->ip_blocks[i].status.hw = false;
3406 }
3407
3408 return 0;
3409}
3410
3411/**
3412 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3413 *
3414 * @adev: amdgpu_device pointer
3415 *
3416 * Main suspend function for hardware IPs. The list of all the hardware
3417 * IPs that make up the asic is walked, clockgating is disabled and the
3418 * suspend callbacks are run. suspend puts the hardware and software state
3419 * in each IP into a state suitable for suspend.
3420 * Returns 0 on success, negative error code on failure.
3421 */
3422static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3423{
3424 int i, r;
3425
3426 if (adev->in_s0ix)
3427 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3428
3429 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3430 if (!adev->ip_blocks[i].status.valid)
3431 continue;
3432 /* displays are handled in phase1 */
3433 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3434 continue;
3435 /* PSP lost connection when err_event_athub occurs */
3436 if (amdgpu_ras_intr_triggered() &&
3437 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3438 adev->ip_blocks[i].status.hw = false;
3439 continue;
3440 }
3441
3442		/* skip unnecessary suspend if we have not initialized them yet */
3443 if (adev->gmc.xgmi.pending_reset &&
3444 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3445 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3446 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3447 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3448 adev->ip_blocks[i].status.hw = false;
3449 continue;
3450 }
3451
3452		/* skip suspend of gfx/mes and psp for S0ix:
3453		 * gfx is in the gfxoff state, so on resume it will exit gfxoff just
3454		 * like at runtime. PSP is also part of the always-on hardware,
3455		 * so there is no need to suspend it.
3456 */
3457 if (adev->in_s0ix &&
3458 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3459 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3460 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3461 continue;
3462
3463 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3464 if (adev->in_s0ix &&
3465 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3466 IP_VERSION(5, 0, 0)) &&
3467 (adev->ip_blocks[i].version->type ==
3468 AMD_IP_BLOCK_TYPE_SDMA))
3469 continue;
3470
3471		/* During cold boot, swPSP provides the IMU and RLC FW binaries to the TOS.
3472		 * These live in the TMR, hence they are expected to be reused by PSP-TOS to
3473		 * reload from this location, and RLC autoload also gets loaded from here
3474		 * automatically based on the PMFW -> PSP message during the re-init sequence.
3475		 * Therefore, the psp suspend & resume should be skipped to avoid destroying
3476		 * the TMR and reloading the FWs again for IMU-enabled APU ASICs.
3477 */
3478 if (amdgpu_in_reset(adev) &&
3479 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3480 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3481 continue;
3482
3483 /* XXX handle errors */
3484 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3485 /* XXX handle errors */
3486 if (r) {
3487 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3488 adev->ip_blocks[i].version->funcs->name, r);
3489 }
3490 adev->ip_blocks[i].status.hw = false;
3491 /* handle putting the SMC in the appropriate state */
3492 if (!amdgpu_sriov_vf(adev)) {
3493 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3494 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3495 if (r) {
3496 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3497 adev->mp1_state, r);
3498 return r;
3499 }
3500 }
3501 }
3502 }
3503
3504 return 0;
3505}
3506
3507/**
3508 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3509 *
3510 * @adev: amdgpu_device pointer
3511 *
3512 * Main suspend function for hardware IPs. The list of all the hardware
3513 * IPs that make up the asic is walked, clockgating is disabled and the
3514 * suspend callbacks are run. suspend puts the hardware and software state
3515 * in each IP into a state suitable for suspend.
3516 * Returns 0 on success, negative error code on failure.
3517 */
3518int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3519{
3520 int r;
3521
3522 if (amdgpu_sriov_vf(adev)) {
3523 amdgpu_virt_fini_data_exchange(adev);
3524 amdgpu_virt_request_full_gpu(adev, false);
3525 }
3526
3527 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3528
3529 r = amdgpu_device_ip_suspend_phase1(adev);
3530 if (r)
3531 return r;
3532 r = amdgpu_device_ip_suspend_phase2(adev);
3533
3534 if (amdgpu_sriov_vf(adev))
3535 amdgpu_virt_release_full_gpu(adev, false);
3536
3537 return r;
3538}
3539
3540static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3541{
3542 int i, r;
3543
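	/* under SR-IOV, these blocks are brought back up first, in this fixed order */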
3544 static enum amd_ip_block_type ip_order[] = {
3545 AMD_IP_BLOCK_TYPE_COMMON,
3546 AMD_IP_BLOCK_TYPE_GMC,
3547 AMD_IP_BLOCK_TYPE_PSP,
3548 AMD_IP_BLOCK_TYPE_IH,
3549 };
3550
3551 for (i = 0; i < adev->num_ip_blocks; i++) {
3552 int j;
3553 struct amdgpu_ip_block *block;
3554
3555 block = &adev->ip_blocks[i];
3556 block->status.hw = false;
3557
3558 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3559
3560 if (block->version->type != ip_order[j] ||
3561 !block->status.valid)
3562 continue;
3563
3564 r = block->version->funcs->hw_init(adev);
3565 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3566 if (r)
3567 return r;
3568 block->status.hw = true;
3569 }
3570 }
3571
3572 return 0;
3573}
3574
3575static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3576{
3577 int i, r;
3578
3579 static enum amd_ip_block_type ip_order[] = {
3580 AMD_IP_BLOCK_TYPE_SMC,
3581 AMD_IP_BLOCK_TYPE_DCE,
3582 AMD_IP_BLOCK_TYPE_GFX,
3583 AMD_IP_BLOCK_TYPE_SDMA,
3584 AMD_IP_BLOCK_TYPE_MES,
3585 AMD_IP_BLOCK_TYPE_UVD,
3586 AMD_IP_BLOCK_TYPE_VCE,
3587 AMD_IP_BLOCK_TYPE_VCN,
3588 AMD_IP_BLOCK_TYPE_JPEG
3589 };
3590
3591 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3592 int j;
3593 struct amdgpu_ip_block *block;
3594
3595 for (j = 0; j < adev->num_ip_blocks; j++) {
3596 block = &adev->ip_blocks[j];
3597
3598 if (block->version->type != ip_order[i] ||
3599 !block->status.valid ||
3600 block->status.hw)
3601 continue;
3602
3603 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3604 r = block->version->funcs->resume(adev);
3605 else
3606 r = block->version->funcs->hw_init(adev);
3607
3608 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3609 if (r)
3610 return r;
3611 block->status.hw = true;
3612 }
3613 }
3614
3615 return 0;
3616}
3617
3618/**
3619 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3620 *
3621 * @adev: amdgpu_device pointer
3622 *
3623 * First resume function for hardware IPs. The list of all the hardware
3624 * IPs that make up the asic is walked and the resume callbacks are run for
3625 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3626 * after a suspend and updates the software state as necessary. This
3627 * function is also used for restoring the GPU after a GPU reset.
3628 * Returns 0 on success, negative error code on failure.
3629 */
3630static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3631{
3632 int i, r;
3633
3634 for (i = 0; i < adev->num_ip_blocks; i++) {
3635 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3636 continue;
3637 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3638 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3639 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3640 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3641
3642 r = adev->ip_blocks[i].version->funcs->resume(adev);
3643 if (r) {
3644 DRM_ERROR("resume of IP block <%s> failed %d\n",
3645 adev->ip_blocks[i].version->funcs->name, r);
3646 return r;
3647 }
3648 adev->ip_blocks[i].status.hw = true;
3649 }
3650 }
3651
3652 return 0;
3653}
3654
3655/**
3656 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3657 *
3658 * @adev: amdgpu_device pointer
3659 *
3660 * Second resume function for hardware IPs. The list of all the hardware
3661 * IPs that make up the asic is walked and the resume callbacks are run for
3662 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3663 * functional state after a suspend and updates the software state as
3664 * necessary. This function is also used for restoring the GPU after a GPU
3665 * reset.
3666 * Returns 0 on success, negative error code on failure.
3667 */
3668static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3669{
3670 int i, r;
3671
3672 for (i = 0; i < adev->num_ip_blocks; i++) {
3673 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3674 continue;
3675 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3676 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3677 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3678 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3679 continue;
3680 r = adev->ip_blocks[i].version->funcs->resume(adev);
3681 if (r) {
3682 DRM_ERROR("resume of IP block <%s> failed %d\n",
3683 adev->ip_blocks[i].version->funcs->name, r);
3684 return r;
3685 }
3686 adev->ip_blocks[i].status.hw = true;
3687 }
3688
3689 return 0;
3690}
3691
3692/**
3693 * amdgpu_device_ip_resume - run resume for hardware IPs
3694 *
3695 * @adev: amdgpu_device pointer
3696 *
3697 * Main resume function for hardware IPs. The hardware IPs
3698 * are split into two resume functions because they are
3699 * also used in recovering from a GPU reset and some additional
3700 * steps need to be taken between them. In this case (S3/S4) they are
3701 * run sequentially.
3702 * Returns 0 on success, negative error code on failure.
3703 */
3704static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3705{
3706 int r;
3707
3708 r = amdgpu_device_ip_resume_phase1(adev);
3709 if (r)
3710 return r;
3711
3712 r = amdgpu_device_fw_loading(adev);
3713 if (r)
3714 return r;
3715
3716 r = amdgpu_device_ip_resume_phase2(adev);
3717
3718 if (adev->mman.buffer_funcs_ring->sched.ready)
3719 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3720
3721 return r;
3722}
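/*
 * A rough sketch of the resume ordering implemented above (and reused on the
 * GPU-reset path): phase1 brings up COMMON, GMC and IH (plus PSP under
 * SR-IOV), firmware loading runs in between, phase2 then resumes all
 * remaining blocks, and finally the TTM buffer functions are re-enabled once
 * the transfer ring is ready.
 */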
3723
3724/**
3725 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3726 *
3727 * @adev: amdgpu_device pointer
3728 *
3729 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3730 */
3731static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3732{
3733 if (amdgpu_sriov_vf(adev)) {
3734 if (adev->is_atom_fw) {
3735 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3736 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3737 } else {
3738 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3739 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3740 }
3741
3742 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3743 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3744 }
3745}
3746
3747/**
3748 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3749 *
3750 * @asic_type: AMD asic type
3751 *
3752 * Check if there is DC (new modesetting infrastructure) support for an asic.
3753 * Returns true if DC has support, false if not.
3754 */
3755bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3756{
3757 switch (asic_type) {
3758#ifdef CONFIG_DRM_AMDGPU_SI
3759 case CHIP_HAINAN:
3760#endif
3761 case CHIP_TOPAZ:
3762 /* chips with no display hardware */
3763 return false;
3764#if defined(CONFIG_DRM_AMD_DC)
3765 case CHIP_TAHITI:
3766 case CHIP_PITCAIRN:
3767 case CHIP_VERDE:
3768 case CHIP_OLAND:
3769 /*
3770 * We have systems in the wild with these ASICs that require
3771 * LVDS and VGA support which is not supported with DC.
3772 *
3773 * Fall back to the non-DC driver here by default so as not to
3774 * cause regressions.
3775 */
3776#if defined(CONFIG_DRM_AMD_DC_SI)
3777 return amdgpu_dc > 0;
3778#else
3779 return false;
3780#endif
3781 case CHIP_BONAIRE:
3782 case CHIP_KAVERI:
3783 case CHIP_KABINI:
3784 case CHIP_MULLINS:
3785 /*
3786 * We have systems in the wild with these ASICs that require
3787 * VGA support which is not supported with DC.
3788 *
3789 * Fall back to the non-DC driver here by default so as not to
3790 * cause regressions.
3791 */
3792 return amdgpu_dc > 0;
3793 default:
3794 return amdgpu_dc != 0;
3795#else
3796 default:
3797 if (amdgpu_dc > 0)
3798 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3799 return false;
3800#endif
3801 }
3802}
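/*
 * For reference, the amdgpu_dc checks above follow the usual amdgpu.dc
 * module parameter convention: -1 selects the per-ASIC default, 0 forces the
 * legacy display path, and 1 requests DC. For example, booting with
 * amdgpu.dc=1 opts the SI/CIK parts listed above into DC despite the
 * LVDS/VGA limitations noted in the comments.
 */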
3803
3804/**
3805 * amdgpu_device_has_dc_support - check if dc is supported
3806 *
3807 * @adev: amdgpu_device pointer
3808 *
3809 * Returns true for supported, false for not supported
3810 */
3811bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3812{
3813 if (adev->enable_virtual_display ||
3814 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3815 return false;
3816
3817 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3818}
3819
3820static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3821{
3822 struct amdgpu_device *adev =
3823 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3824 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3825
3826 /* It's a bug to not have a hive within this function */
3827 if (WARN_ON(!hive))
3828 return;
3829
3830 /*
3831 * Use task barrier to synchronize all xgmi reset works across the
3832 * hive. task_barrier_enter and task_barrier_exit will block
3833 * until all the threads running the xgmi reset works reach
3834 * those points. task_barrier_full will do both blocks.
3835 */
3836 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3837
3838 task_barrier_enter(&hive->tb);
3839 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3840
3841 if (adev->asic_reset_res)
3842 goto fail;
3843
3844 task_barrier_exit(&hive->tb);
3845 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3846
3847 if (adev->asic_reset_res)
3848 goto fail;
3849
3850 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
3851 } else {
3852
3853 task_barrier_full(&hive->tb);
3854 adev->asic_reset_res = amdgpu_asic_reset(adev);
3855 }
3856
3857fail:
3858 if (adev->asic_reset_res)
3859 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3860 adev->asic_reset_res, adev_to_drm(adev)->unique);
3861 amdgpu_put_xgmi_hive(hive);
3862}
3863
3864static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3865{
3866 char *input = amdgpu_lockup_timeout;
3867 char *timeout_setting = NULL;
3868 int index = 0;
3869 long timeout;
3870 int ret = 0;
3871
3872 /*
3873 * By default the timeout for non-compute jobs is 10000 ms
3874 * and 60000 ms for compute jobs.
3875 * Under SR-IOV the compute timeout is 60000 ms only when one VF
3876 * owns all compute resources (pp_one_vf); otherwise it is 10000 ms.
3877 */
3878 adev->gfx_timeout = msecs_to_jiffies(10000);
3879 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3880 if (amdgpu_sriov_vf(adev))
3881 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3882 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3883 else
3884 adev->compute_timeout = msecs_to_jiffies(60000);
3885
3886 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3887 while ((timeout_setting = strsep(&input, ",")) &&
3888 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3889 ret = kstrtol(timeout_setting, 0, &timeout);
3890 if (ret)
3891 return ret;
3892
3893 if (timeout == 0) {
3894 index++;
3895 continue;
3896 } else if (timeout < 0) {
3897 timeout = MAX_SCHEDULE_TIMEOUT;
3898 dev_warn(adev->dev, "lockup timeout disabled");
3899 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3900 } else {
3901 timeout = msecs_to_jiffies(timeout);
3902 }
3903
3904 switch (index++) {
3905 case 0:
3906 adev->gfx_timeout = timeout;
3907 break;
3908 case 1:
3909 adev->compute_timeout = timeout;
3910 break;
3911 case 2:
3912 adev->sdma_timeout = timeout;
3913 break;
3914 case 3:
3915 adev->video_timeout = timeout;
3916 break;
3917 default:
3918 break;
3919 }
3920 }
3921 /*
3922 * There is only one value specified and
3923 * it should apply to all non-compute jobs.
3924 */
3925 if (index == 1) {
3926 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3927 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3928 adev->compute_timeout = adev->gfx_timeout;
3929 }
3930 }
3931
3932 return ret;
3933}
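/*
 * Illustrative examples of the amdgpu.lockup_timeout parsing above:
 *   amdgpu.lockup_timeout=10000,60000,10000,10000
 *     sets the gfx, compute, sdma and video timeouts (in ms) in that order;
 *   amdgpu.lockup_timeout=20000
 *     a single value applies to all non-compute queues, and to compute as
 *     well under SR-IOV or passthrough.
 * A value of 0 keeps the default for that slot, and a negative value
 * disables the timeout (MAX_SCHEDULE_TIMEOUT).
 */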
3934
3935/**
3936 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3937 *
3938 * @adev: amdgpu_device pointer
3939 *
3940 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3941 */
3942static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3943{
3944 struct iommu_domain *domain;
3945
3946 domain = iommu_get_domain_for_dev(adev->dev);
3947 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3948 adev->ram_is_direct_mapped = true;
3949}
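/*
 * Example: with no IOMMU, or with the IOMMU left in passthrough/identity
 * mode (e.g. booting with iommu.passthrough=1), DMA addresses equal physical
 * addresses, which is what ram_is_direct_mapped records here.
 */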
3950
3951static const struct attribute *amdgpu_dev_attributes[] = {
3952 &dev_attr_pcie_replay_count.attr,
3953 NULL
3954};
3955
3956static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3957{
3958 if (amdgpu_mcbp == 1)
3959 adev->gfx.mcbp = true;
3960 else if (amdgpu_mcbp == 0)
3961 adev->gfx.mcbp = false;
3962
3963 if (amdgpu_sriov_vf(adev))
3964 adev->gfx.mcbp = true;
3965
3966 if (adev->gfx.mcbp)
3967 DRM_INFO("MCBP is enabled\n");
3968}
3969
3970/**
3971 * amdgpu_device_init - initialize the driver
3972 *
3973 * @adev: amdgpu_device pointer
3974 * @flags: driver flags
3975 *
3976 * Initializes the driver info and hw (all asics).
3977 * Returns 0 for success or an error on failure.
3978 * Called at driver startup.
3979 */
3980int amdgpu_device_init(struct amdgpu_device *adev,
3981 uint32_t flags)
3982{
3983 struct drm_device *ddev = adev_to_drm(adev);
3984 struct pci_dev *pdev = adev->pdev;
3985 int r, i;
3986 bool px = false;
3987 u32 max_MBps;
3988 int tmp;
3989
3990 adev->shutdown = false;
3991 adev->flags = flags;
3992
3993 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3994 adev->asic_type = amdgpu_force_asic_type;
3995 else
3996 adev->asic_type = flags & AMD_ASIC_MASK;
3997
3998 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3999 if (amdgpu_emu_mode == 1)
4000 adev->usec_timeout *= 10;
4001 adev->gmc.gart_size = 512 * 1024 * 1024;
4002 adev->accel_working = false;
4003 adev->num_rings = 0;
4004 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4005 adev->mman.buffer_funcs = NULL;
4006 adev->mman.buffer_funcs_ring = NULL;
4007 adev->vm_manager.vm_pte_funcs = NULL;
4008 adev->vm_manager.vm_pte_num_scheds = 0;
4009 adev->gmc.gmc_funcs = NULL;
4010 adev->harvest_ip_mask = 0x0;
4011 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4012 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4013
4014 adev->smc_rreg = &amdgpu_invalid_rreg;
4015 adev->smc_wreg = &amdgpu_invalid_wreg;
4016 adev->pcie_rreg = &amdgpu_invalid_rreg;
4017 adev->pcie_wreg = &amdgpu_invalid_wreg;
4018 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4019 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4020 adev->pciep_rreg = &amdgpu_invalid_rreg;
4021 adev->pciep_wreg = &amdgpu_invalid_wreg;
4022 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4023 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4024 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4025 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4026 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4027 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4028 adev->didt_rreg = &amdgpu_invalid_rreg;
4029 adev->didt_wreg = &amdgpu_invalid_wreg;
4030 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4031 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4032 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4033 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4034
4035 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4036 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4037 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4038
4039 /* all mutex initialization is done here so we
4040 * can recall these functions without locking issues
4041 */
4042 mutex_init(&adev->firmware.mutex);
4043 mutex_init(&adev->pm.mutex);
4044 mutex_init(&adev->gfx.gpu_clock_mutex);
4045 mutex_init(&adev->srbm_mutex);
4046 mutex_init(&adev->gfx.pipe_reserve_mutex);
4047 mutex_init(&adev->gfx.gfx_off_mutex);
4048 mutex_init(&adev->gfx.partition_mutex);
4049 mutex_init(&adev->grbm_idx_mutex);
4050 mutex_init(&adev->mn_lock);
4051 mutex_init(&adev->virt.vf_errors.lock);
4052 mutex_init(&adev->virt.rlcg_reg_lock);
4053 hash_init(adev->mn_hash);
4054 mutex_init(&adev->psp.mutex);
4055 mutex_init(&adev->notifier_lock);
4056 mutex_init(&adev->pm.stable_pstate_ctx_lock);
4057 mutex_init(&adev->benchmark_mutex);
4058
4059 amdgpu_device_init_apu_flags(adev);
4060
4061 r = amdgpu_device_check_arguments(adev);
4062 if (r)
4063 return r;
4064
4065 spin_lock_init(&adev->mmio_idx_lock);
4066 spin_lock_init(&adev->smc_idx_lock);
4067 spin_lock_init(&adev->pcie_idx_lock);
4068 spin_lock_init(&adev->uvd_ctx_idx_lock);
4069 spin_lock_init(&adev->didt_idx_lock);
4070 spin_lock_init(&adev->gc_cac_idx_lock);
4071 spin_lock_init(&adev->se_cac_idx_lock);
4072 spin_lock_init(&adev->audio_endpt_idx_lock);
4073 spin_lock_init(&adev->mm_stats.lock);
4074 spin_lock_init(&adev->wb.lock);
4075
4076 INIT_LIST_HEAD(&adev->shadow_list);
4077 mutex_init(&adev->shadow_list_lock);
4078
4079 INIT_LIST_HEAD(&adev->reset_list);
4080
4081 INIT_LIST_HEAD(&adev->ras_list);
4082
4083 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4084
4085 INIT_DELAYED_WORK(&adev->delayed_init_work,
4086 amdgpu_device_delayed_init_work_handler);
4087 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4088 amdgpu_device_delay_enable_gfx_off);
4089
4090 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4091
4092 adev->gfx.gfx_off_req_count = 1;
4093 adev->gfx.gfx_off_residency = 0;
4094 adev->gfx.gfx_off_entrycount = 0;
4095 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4096
4097 atomic_set(&adev->throttling_logging_enabled, 1);
4098 /*
4099 * If throttling continues, logging will be performed every minute
4100 * to avoid log flooding. "-1" is subtracted since the thermal
4101 * throttling interrupt comes every second. Thus, the total logging
4102 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4103 * for throttling interrupt) = 60 seconds.
4104 */
4105 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4106 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4107
4108 /* Registers mapping */
4109 /* TODO: block userspace mapping of io register */
4110 if (adev->asic_type >= CHIP_BONAIRE) {
4111 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4112 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4113 } else {
4114 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4115 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4116 }
4117
4118 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4119 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4120
4121 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4122 if (!adev->rmmio)
4123 return -ENOMEM;
4124
4125 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4126 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4127
4128 /*
4129 * The reset domain needs to be present early, before the XGMI hive is
4130 * discovered (if any) and initialized, so the reset sem and in_gpu_reset
4131 * flag can be used early during init and before any call to RREG32.
4132 */
4133 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4134 if (!adev->reset_domain)
4135 return -ENOMEM;
4136
4137 /* detect hw virtualization here */
4138 amdgpu_detect_virtualization(adev);
4139
4140 amdgpu_device_get_pcie_info(adev);
4141
4142 r = amdgpu_device_get_job_timeout_settings(adev);
4143 if (r) {
4144 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4145 return r;
4146 }
4147
4148 amdgpu_device_set_mcbp(adev);
4149
4150 /* early init functions */
4151 r = amdgpu_device_ip_early_init(adev);
4152 if (r)
4153 return r;
4154
4155 /* Get rid of things like offb */
4156 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
4157 if (r)
4158 return r;
4159
4160 /* Enable TMZ based on IP_VERSION */
4161 amdgpu_gmc_tmz_set(adev);
4162
4163 if (amdgpu_sriov_vf(adev) &&
4164 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4165 /* VF MMIO access (except mailbox range) from CPU
4166 * will be blocked during sriov runtime
4167 */
4168 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4169
4170 amdgpu_gmc_noretry_set(adev);
4171 /* Need to get xgmi info early to decide the reset behavior */
4172 if (adev->gmc.xgmi.supported) {
4173 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4174 if (r)
4175 return r;
4176 }
4177
4178 /* enable PCIE atomic ops */
4179 if (amdgpu_sriov_vf(adev)) {
4180 if (adev->virt.fw_reserve.p_pf2vf)
4181 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4182 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4183 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4184 /* APUs with gfx9 onwards don't rely on PCIe atomics; rather, the
4185 * internal path natively supports atomics, so set have_atomics_support to true.
4186 */
4187 } else if ((adev->flags & AMD_IS_APU) &&
4188 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4189 IP_VERSION(9, 0, 0))) {
4190 adev->have_atomics_support = true;
4191 } else {
4192 adev->have_atomics_support =
4193 !pci_enable_atomic_ops_to_root(adev->pdev,
4194 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4195 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4196 }
4197
4198 if (!adev->have_atomics_support)
4199 dev_info(adev->dev, "PCIE atomic ops are not supported\n");
4200
4201 /* doorbell bar mapping and doorbell index init */
4202 amdgpu_doorbell_init(adev);
4203
4204 if (amdgpu_emu_mode == 1) {
4205 /* post the asic on emulation mode */
4206 emu_soc_asic_init(adev);
4207 goto fence_driver_init;
4208 }
4209
4210 amdgpu_reset_init(adev);
4211
4212 /* detect if we have an SR-IOV vbios */
4213 if (adev->bios)
4214 amdgpu_device_detect_sriov_bios(adev);
4215
4216 /* check if we need to reset the asic
4217 * E.g., driver was not cleanly unloaded previously, etc.
4218 */
4219 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4220 if (adev->gmc.xgmi.num_physical_nodes) {
4221 dev_info(adev->dev, "Pending hive reset.\n");
4222 adev->gmc.xgmi.pending_reset = true;
4223 /* Only need to init the blocks necessary for SMU to handle the reset */
4224 for (i = 0; i < adev->num_ip_blocks; i++) {
4225 if (!adev->ip_blocks[i].status.valid)
4226 continue;
4227 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
4228 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
4229 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
4230 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
4231 DRM_DEBUG("IP %s disabled for hw_init.\n",
4232 adev->ip_blocks[i].version->funcs->name);
4233 adev->ip_blocks[i].status.hw = true;
4234 }
4235 }
4236 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4237 !amdgpu_device_has_display_hardware(adev)) {
4238 r = psp_gpu_reset(adev);
4239 } else {
4240 tmp = amdgpu_reset_method;
4241 /* Do a default reset when loading or reloading the driver,
4242 * regardless of the module parameter reset_method.
4243 */
4244 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4245 r = amdgpu_asic_reset(adev);
4246 amdgpu_reset_method = tmp;
4247 }
4248
4249 if (r) {
4250 dev_err(adev->dev, "asic reset on init failed\n");
4251 goto failed;
4252 }
4253 }
4254
4255 /* Post card if necessary */
4256 if (amdgpu_device_need_post(adev)) {
4257 if (!adev->bios) {
4258 dev_err(adev->dev, "no vBIOS found\n");
4259 r = -EINVAL;
4260 goto failed;
4261 }
4262 DRM_INFO("GPU posting now...\n");
4263 r = amdgpu_device_asic_init(adev);
4264 if (r) {
4265 dev_err(adev->dev, "gpu post error!\n");
4266 goto failed;
4267 }
4268 }
4269
4270 if (adev->bios) {
4271 if (adev->is_atom_fw) {
4272 /* Initialize clocks */
4273 r = amdgpu_atomfirmware_get_clock_info(adev);
4274 if (r) {
4275 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4276 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4277 goto failed;
4278 }
4279 } else {
4280 /* Initialize clocks */
4281 r = amdgpu_atombios_get_clock_info(adev);
4282 if (r) {
4283 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4284 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4285 goto failed;
4286 }
4287 /* init i2c buses */
4288 if (!amdgpu_device_has_dc_support(adev))
4289 amdgpu_atombios_i2c_init(adev);
4290 }
4291 }
4292
4293fence_driver_init:
4294 /* Fence driver */
4295 r = amdgpu_fence_driver_sw_init(adev);
4296 if (r) {
4297 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4298 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4299 goto failed;
4300 }
4301
4302 /* init the mode config */
4303 drm_mode_config_init(adev_to_drm(adev));
4304
4305 r = amdgpu_device_ip_init(adev);
4306 if (r) {
4307 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4308 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4309 goto release_ras_con;
4310 }
4311
4312 amdgpu_fence_driver_hw_init(adev);
4313
4314 dev_info(adev->dev,
4315 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4316 adev->gfx.config.max_shader_engines,
4317 adev->gfx.config.max_sh_per_se,
4318 adev->gfx.config.max_cu_per_sh,
4319 adev->gfx.cu_info.number);
4320
4321 adev->accel_working = true;
4322
4323 amdgpu_vm_check_compute_bug(adev);
4324
4325 /* Initialize the buffer migration limit. */
4326 if (amdgpu_moverate >= 0)
4327 max_MBps = amdgpu_moverate;
4328 else
4329 max_MBps = 8; /* Allow 8 MB/s. */
4330 /* Get a log2 for easy divisions. */
4331 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
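	/* e.g. the default of 8 MB/s gives log2_max_MBps = 3, so the
	 * migration-rate accounting can divide by shifting.
	 */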
4332
4333 /*
4334 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4335 * Otherwise the mgpu fan boost feature will be skipped because the
4336 * gpu instance count would be too low.
4337 */
4338 amdgpu_register_gpu_instance(adev);
4339
4340 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4341 * explicit gating rather than handling it automatically.
4342 */
4343 if (!adev->gmc.xgmi.pending_reset) {
4344 r = amdgpu_device_ip_late_init(adev);
4345 if (r) {
4346 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4347 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4348 goto release_ras_con;
4349 }
4350 /* must succeed. */
4351 amdgpu_ras_resume(adev);
4352 queue_delayed_work(system_wq, &adev->delayed_init_work,
4353 msecs_to_jiffies(AMDGPU_RESUME_MS));
4354 }
4355
4356 if (amdgpu_sriov_vf(adev)) {
4357 amdgpu_virt_release_full_gpu(adev, true);
4358 flush_delayed_work(&adev->delayed_init_work);
4359 }
4360
4361 /*
4362 * Register these sysfs interfaces after `late_init`, since some of
4363 * the operations performed in `late_init` might affect the creation
4364 * of the sysfs interfaces.
4365 */
4366 r = amdgpu_atombios_sysfs_init(adev);
4367 if (r)
4368 drm_err(&adev->ddev,
4369 "registering atombios sysfs failed (%d).\n", r);
4370
4371 r = amdgpu_pm_sysfs_init(adev);
4372 if (r)
4373 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4374
4375 r = amdgpu_ucode_sysfs_init(adev);
4376 if (r) {
4377 adev->ucode_sysfs_en = false;
4378 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4379 } else
4380 adev->ucode_sysfs_en = true;
4381
4382 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
4383 if (r)
4384 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4385
4386 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4387 if (r)
4388 dev_err(adev->dev,
4389 "Could not create amdgpu board attributes\n");
4390
4391 amdgpu_fru_sysfs_init(adev);
4392 amdgpu_reg_state_sysfs_init(adev);
4393
4394 if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
4395 r = amdgpu_pmu_init(adev);
4396 if (r)
4397 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}
4398
4399 /* Keep the stored pci config space at hand for restore after a sudden PCI error */
4400 if (amdgpu_device_cache_pci_state(adev->pdev))
4401 pci_restore_state(pdev);
4402
4403 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4404 /* this will fail for cards that aren't VGA class devices, just
4405 * ignore it
4406 */
4407 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4408 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4409
4410 px = amdgpu_device_supports_px(ddev);
4411
4412 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4413 apple_gmux_detect(NULL, NULL)))
4414 vga_switcheroo_register_client(adev->pdev,
4415 &amdgpu_switcheroo_ops, px);
4416
4417 if (px)
4418 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4419
4420 if (adev->gmc.xgmi.pending_reset)
4421 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
4422 msecs_to_jiffies(AMDGPU_RESUME_MS));
4423
4424 amdgpu_device_check_iommu_direct_map(adev);
4425
4426 return 0;
4427
4428release_ras_con:
4429 if (amdgpu_sriov_vf(adev))
4430 amdgpu_virt_release_full_gpu(adev, true);
4431
4432 /* failed in exclusive mode due to timeout */
4433 if (amdgpu_sriov_vf(adev) &&
4434 !amdgpu_sriov_runtime(adev) &&
4435 amdgpu_virt_mmio_blocked(adev) &&
4436 !amdgpu_virt_wait_reset(adev)) {
4437 dev_err(adev->dev, "VF exclusive mode timeout\n");
4438 /* Don't send request since VF is inactive. */
4439 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4440 adev->virt.ops = NULL;
4441 r = -EAGAIN;
4442 }
4443 amdgpu_release_ras_context(adev);
4444
4445failed:
4446 amdgpu_vf_error_trans_all(adev);
4447
4448 return r;
4449}
4450
4451static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4452{
4453
4454 /* Clear all CPU mappings pointing to this device */
4455 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4456
4457 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4458 amdgpu_doorbell_fini(adev);
4459
4460 iounmap(adev->rmmio);
4461 adev->rmmio = NULL;
4462 if (adev->mman.aper_base_kaddr)
4463 iounmap(adev->mman.aper_base_kaddr);
4464 adev->mman.aper_base_kaddr = NULL;
4465
4466 /* Memory manager related */
4467 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4468 arch_phys_wc_del(adev->gmc.vram_mtrr);
4469 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4470 }
4471}
4472
4473/**
4474 * amdgpu_device_fini_hw - tear down the driver
4475 *
4476 * @adev: amdgpu_device pointer
4477 *
4478 * Tear down the driver info (all asics).
4479 * Called at driver shutdown.
4480 */
4481void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4482{
4483 dev_info(adev->dev, "amdgpu: finishing device.\n");
4484 flush_delayed_work(&adev->delayed_init_work);
4485 adev->shutdown = true;
4486
4487 /* make sure the IB test has finished before entering exclusive mode
4488 * to avoid preemption during the IB test
4489 */
4490 if (amdgpu_sriov_vf(adev)) {
4491 amdgpu_virt_request_full_gpu(adev, false);
4492 amdgpu_virt_fini_data_exchange(adev);
4493 }
4494
4495 /* disable all interrupts */
4496 amdgpu_irq_disable_all(adev);
4497 if (adev->mode_info.mode_config_initialized) {
4498 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4499 drm_helper_force_disable_all(adev_to_drm(adev));
4500 else
4501 drm_atomic_helper_shutdown(adev_to_drm(adev));
4502 }
4503 amdgpu_fence_driver_hw_fini(adev);
4504
4505 if (adev->mman.initialized)
4506 drain_workqueue(adev->mman.bdev.wq);
4507
4508 if (adev->pm.sysfs_initialized)
4509 amdgpu_pm_sysfs_fini(adev);
4510 if (adev->ucode_sysfs_en)
4511 amdgpu_ucode_sysfs_fini(adev);
4512 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4513 amdgpu_fru_sysfs_fini(adev);
4514
4515 amdgpu_reg_state_sysfs_fini(adev);
4516
4517 /* ras features must be disabled before hw fini */
4518 amdgpu_ras_pre_fini(adev);
4519
4520 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4521
4522 amdgpu_device_ip_fini_early(adev);
4523
4524 amdgpu_irq_fini_hw(adev);
4525
4526 if (adev->mman.initialized)
4527 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4528
4529 amdgpu_gart_dummy_page_fini(adev);
4530
4531 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4532 amdgpu_device_unmap_mmio(adev);
4533
4534}
4535
4536void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4537{
4538 int idx;
4539 bool px;
4540
4541 amdgpu_fence_driver_sw_fini(adev);
4542 amdgpu_device_ip_fini(adev);
4543 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4544 adev->accel_working = false;
4545 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4546
4547 amdgpu_reset_fini(adev);
4548
4549 /* free i2c buses */
4550 if (!amdgpu_device_has_dc_support(adev))
4551 amdgpu_i2c_fini(adev);
4552
4553 if (amdgpu_emu_mode != 1)
4554 amdgpu_atombios_fini(adev);
4555
4556 kfree(adev->bios);
4557 adev->bios = NULL;
4558
4559 kfree(adev->fru_info);
4560 adev->fru_info = NULL;
4561
4562 px = amdgpu_device_supports_px(adev_to_drm(adev));
4563
4564 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4565 apple_gmux_detect(NULL, NULL)))
4566 vga_switcheroo_unregister_client(adev->pdev);
4567
4568 if (px)
4569 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4570
4571 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4572 vga_client_unregister(adev->pdev);
4573
4574 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4575
4576 iounmap(adev->rmmio);
4577 adev->rmmio = NULL;
4578 amdgpu_doorbell_fini(adev);
4579 drm_dev_exit(idx);
4580 }
4581
4582 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4583 amdgpu_pmu_fini(adev);
4584 if (adev->mman.discovery_bin)
4585 amdgpu_discovery_fini(adev);
4586
4587 amdgpu_reset_put_reset_domain(adev->reset_domain);
4588 adev->reset_domain = NULL;
4589
4590 kfree(adev->pci_state);
4591
4592}
4593
4594/**
4595 * amdgpu_device_evict_resources - evict device resources
4596 * @adev: amdgpu device object
4597 *
4598 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4599 * of the vram memory type. Mainly used for evicting device resources
4600 * at suspend time.
4601 *
4602 */
4603static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4604{
4605 int ret;
4606
4607 /* No need to evict vram on APUs for suspend to ram or s2idle */
4608 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4609 return 0;
4610
4611 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4612 if (ret)
4613 DRM_WARN("evicting device resources failed\n");
4614 return ret;
4615}
4616
4617/*
4618 * Suspend & resume.
4619 */
4620/**
4621 * amdgpu_device_prepare - prepare for device suspend
4622 *
4623 * @dev: drm dev pointer
4624 *
4625 * Prepare to put the hw in the suspend state (all asics).
4626 * Returns 0 for success or an error on failure.
4627 * Called at driver suspend.
4628 */
4629int amdgpu_device_prepare(struct drm_device *dev)
4630{
4631 struct amdgpu_device *adev = drm_to_adev(dev);
4632 int i, r;
4633
4634 amdgpu_choose_low_power_state(adev);
4635
4636 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4637 return 0;
4638
4639 /* Evict the majority of BOs before starting suspend sequence */
4640 r = amdgpu_device_evict_resources(adev);
4641 if (r)
4642 goto unprepare;
4643
4644 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4645
4646 for (i = 0; i < adev->num_ip_blocks; i++) {
4647 if (!adev->ip_blocks[i].status.valid)
4648 continue;
4649 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4650 continue;
4651 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4652 if (r)
4653 goto unprepare;
4654 }
4655
4656 return 0;
4657
4658unprepare:
4659 adev->in_s0ix = adev->in_s3 = false;
4660
4661 return r;
4662}
4663
4664/**
4665 * amdgpu_device_suspend - initiate device suspend
4666 *
4667 * @dev: drm dev pointer
4668 * @fbcon: notify the fbdev of suspend
4669 *
4670 * Puts the hw in the suspend state (all asics).
4671 * Returns 0 for success or an error on failure.
4672 * Called at driver suspend.
4673 */
4674int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4675{
4676 struct amdgpu_device *adev = drm_to_adev(dev);
4677 int r = 0;
4678
4679 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4680 return 0;
4681
4682 adev->in_suspend = true;
4683
4684 if (amdgpu_sriov_vf(adev)) {
4685 amdgpu_virt_fini_data_exchange(adev);
4686 r = amdgpu_virt_request_full_gpu(adev, false);
4687 if (r)
4688 return r;
4689 }
4690
4691 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4692 DRM_WARN("smart shift update failed\n");
4693
4694 if (fbcon)
4695 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4696
4697 cancel_delayed_work_sync(&adev->delayed_init_work);
4698
4699 amdgpu_ras_suspend(adev);
4700
4701 amdgpu_device_ip_suspend_phase1(adev);
4702
4703 if (!adev->in_s0ix)
4704 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4705
4706 r = amdgpu_device_evict_resources(adev);
4707 if (r)
4708 return r;
4709
4710 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4711
4712 amdgpu_fence_driver_hw_fini(adev);
4713
4714 amdgpu_device_ip_suspend_phase2(adev);
4715
4716 if (amdgpu_sriov_vf(adev))
4717 amdgpu_virt_release_full_gpu(adev, false);
4718
4719 r = amdgpu_dpm_notify_rlc_state(adev, false);
4720 if (r)
4721 return r;
4722
4723 return 0;
4724}
4725
4726/**
4727 * amdgpu_device_resume - initiate device resume
4728 *
4729 * @dev: drm dev pointer
4730 * @fbcon: notify the fbdev of resume
4731 *
4732 * Bring the hw back to operating state (all asics).
4733 * Returns 0 for success or an error on failure.
4734 * Called at driver resume.
4735 */
4736int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4737{
4738 struct amdgpu_device *adev = drm_to_adev(dev);
4739 int r = 0;
4740
4741 if (amdgpu_sriov_vf(adev)) {
4742 r = amdgpu_virt_request_full_gpu(adev, true);
4743 if (r)
4744 return r;
4745 }
4746
4747 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4748 return 0;
4749
4750 if (adev->in_s0ix)
4751 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4752
4753 /* post card */
4754 if (amdgpu_device_need_post(adev)) {
4755 r = amdgpu_device_asic_init(adev);
4756 if (r)
4757 dev_err(adev->dev, "amdgpu asic init failed\n");
4758 }
4759
4760 r = amdgpu_device_ip_resume(adev);
4761
4762 if (r) {
4763 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4764 goto exit;
4765 }
4766 amdgpu_fence_driver_hw_init(adev);
4767
4768 if (!adev->in_s0ix) {
4769 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4770 if (r)
4771 goto exit;
4772 }
4773
4774 r = amdgpu_device_ip_late_init(adev);
4775 if (r)
4776 goto exit;
4777
4778 queue_delayed_work(system_wq, &adev->delayed_init_work,
4779 msecs_to_jiffies(AMDGPU_RESUME_MS));
4780exit:
4781 if (amdgpu_sriov_vf(adev)) {
4782 amdgpu_virt_init_data_exchange(adev);
4783 amdgpu_virt_release_full_gpu(adev, true);
4784 }
4785
4786 if (r)
4787 return r;
4788
4789 /* Make sure IB tests flushed */
4790 flush_delayed_work(&adev->delayed_init_work);
4791
4792 if (fbcon)
4793 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4794
4795 amdgpu_ras_resume(adev);
4796
4797 if (adev->mode_info.num_crtc) {
4798 /*
4799 * Most of the connector probing functions try to acquire runtime pm
4800 * refs to ensure that the GPU is powered on when connector polling is
4801 * performed. Since we're calling this from a runtime PM callback,
4802 * trying to acquire rpm refs will cause us to deadlock.
4803 *
4804 * Since we're guaranteed to be holding the rpm lock, it's safe to
4805 * temporarily disable the rpm helpers so this doesn't deadlock us.
4806 */
4807#ifdef CONFIG_PM
4808 dev->dev->power.disable_depth++;
4809#endif
4810 if (!adev->dc_enabled)
4811 drm_helper_hpd_irq_event(dev);
4812 else
4813 drm_kms_helper_hotplug_event(dev);
4814#ifdef CONFIG_PM
4815 dev->dev->power.disable_depth--;
4816#endif
4817 }
4818 adev->in_suspend = false;
4819
4820 if (adev->enable_mes)
4821 amdgpu_mes_self_test(adev);
4822
4823 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4824 DRM_WARN("smart shift update failed\n");
4825
4826 return 0;
4827}
4828
4829/**
4830 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4831 *
4832 * @adev: amdgpu_device pointer
4833 *
4834 * The list of all the hardware IPs that make up the asic is walked and
4835 * the check_soft_reset callbacks are run. check_soft_reset determines
4836 * if the asic is still hung or not.
4837 * Returns true if any of the IPs are still in a hung state, false if not.
4838 */
4839static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4840{
4841 int i;
4842 bool asic_hang = false;
4843
4844 if (amdgpu_sriov_vf(adev))
4845 return true;
4846
4847 if (amdgpu_asic_need_full_reset(adev))
4848 return true;
4849
4850 for (i = 0; i < adev->num_ip_blocks; i++) {
4851 if (!adev->ip_blocks[i].status.valid)
4852 continue;
4853 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4854 adev->ip_blocks[i].status.hang =
4855 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4856 if (adev->ip_blocks[i].status.hang) {
4857 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4858 asic_hang = true;
4859 }
4860 }
4861 return asic_hang;
4862}
4863
4864/**
4865 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4866 *
4867 * @adev: amdgpu_device pointer
4868 *
4869 * The list of all the hardware IPs that make up the asic is walked and the
4870 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4871 * handles any IP specific hardware or software state changes that are
4872 * necessary for a soft reset to succeed.
4873 * Returns 0 on success, negative error code on failure.
4874 */
4875static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4876{
4877 int i, r = 0;
4878
4879 for (i = 0; i < adev->num_ip_blocks; i++) {
4880 if (!adev->ip_blocks[i].status.valid)
4881 continue;
4882 if (adev->ip_blocks[i].status.hang &&
4883 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4884 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4885 if (r)
4886 return r;
4887 }
4888 }
4889
4890 return 0;
4891}
4892
4893/**
4894 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4895 *
4896 * @adev: amdgpu_device pointer
4897 *
4898 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4899 * reset is necessary to recover.
4900 * Returns true if a full asic reset is required, false if not.
4901 */
4902static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4903{
4904 int i;
4905
4906 if (amdgpu_asic_need_full_reset(adev))
4907 return true;
4908
4909 for (i = 0; i < adev->num_ip_blocks; i++) {
4910 if (!adev->ip_blocks[i].status.valid)
4911 continue;
4912 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4913 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4914 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4915 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4916 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4917 if (adev->ip_blocks[i].status.hang) {
4918 dev_info(adev->dev, "Some blocks need a full reset!\n");
4919 return true;
4920 }
4921 }
4922 }
4923 return false;
4924}
4925
4926/**
4927 * amdgpu_device_ip_soft_reset - do a soft reset
4928 *
4929 * @adev: amdgpu_device pointer
4930 *
4931 * The list of all the hardware IPs that make up the asic is walked and the
4932 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4933 * IP specific hardware or software state changes that are necessary to soft
4934 * reset the IP.
4935 * Returns 0 on success, negative error code on failure.
4936 */
4937static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4938{
4939 int i, r = 0;
4940
4941 for (i = 0; i < adev->num_ip_blocks; i++) {
4942 if (!adev->ip_blocks[i].status.valid)
4943 continue;
4944 if (adev->ip_blocks[i].status.hang &&
4945 adev->ip_blocks[i].version->funcs->soft_reset) {
4946 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4947 if (r)
4948 return r;
4949 }
4950 }
4951
4952 return 0;
4953}
4954
4955/**
4956 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4957 *
4958 * @adev: amdgpu_device pointer
4959 *
4960 * The list of all the hardware IPs that make up the asic is walked and the
4961 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4962 * handles any IP specific hardware or software state changes that are
4963 * necessary after the IP has been soft reset.
4964 * Returns 0 on success, negative error code on failure.
4965 */
4966static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4967{
4968 int i, r = 0;
4969
4970 for (i = 0; i < adev->num_ip_blocks; i++) {
4971 if (!adev->ip_blocks[i].status.valid)
4972 continue;
4973 if (adev->ip_blocks[i].status.hang &&
4974 adev->ip_blocks[i].version->funcs->post_soft_reset)
4975 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4976 if (r)
4977 return r;
4978 }
4979
4980 return 0;
4981}
4982
4983/**
4984 * amdgpu_device_recover_vram - Recover some VRAM contents
4985 *
4986 * @adev: amdgpu_device pointer
4987 *
4988 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4989 * restore things like GPUVM page tables after a GPU reset where
4990 * the contents of VRAM might be lost.
4991 *
4992 * Returns:
4993 * 0 on success, negative error code on failure.
4994 */
4995static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4996{
4997 struct dma_fence *fence = NULL, *next = NULL;
4998 struct amdgpu_bo *shadow;
4999 struct amdgpu_bo_vm *vmbo;
5000 long r = 1, tmo;
5001
5002 if (amdgpu_sriov_runtime(adev))
5003 tmo = msecs_to_jiffies(8000);
5004 else
5005 tmo = msecs_to_jiffies(100);
5006
5007 dev_info(adev->dev, "recover vram bo from shadow start\n");
5008 mutex_lock(&adev->shadow_list_lock);
5009 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
5010 /* If vm is compute context or adev is APU, shadow will be NULL */
5011 if (!vmbo->shadow)
5012 continue;
5013 shadow = vmbo->shadow;
5014
5015 /* No need to recover an evicted BO */
5016 if (!shadow->tbo.resource ||
5017 shadow->tbo.resource->mem_type != TTM_PL_TT ||
5018 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
5019 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
5020 continue;
5021
5022 r = amdgpu_bo_restore_shadow(shadow, &next);
5023 if (r)
5024 break;
5025
5026 if (fence) {
5027 tmo = dma_fence_wait_timeout(fence, false, tmo);
5028 dma_fence_put(fence);
5029 fence = next;
5030 if (tmo == 0) {
5031 r = -ETIMEDOUT;
5032 break;
5033 } else if (tmo < 0) {
5034 r = tmo;
5035 break;
5036 }
5037 } else {
5038 fence = next;
5039 }
5040 }
5041 mutex_unlock(&adev->shadow_list_lock);
5042
5043 if (fence)
5044 tmo = dma_fence_wait_timeout(fence, false, tmo);
5045 dma_fence_put(fence);
5046
5047 if (r < 0 || tmo <= 0) {
5048 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
5049 return -EIO;
5050 }
5051
5052 dev_info(adev->dev, "recover vram bo from shadow done\n");
5053 return 0;
5054}
5055
5056
5057/**
5058 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5059 *
5060 * @adev: amdgpu_device pointer
5061 * @reset_context: amdgpu reset context pointer
5062 *
5063 * Do a VF FLR and reinitialize the asic.
5064 * Returns 0 on success, negative error code on failure.
5065 */
5066static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5067 struct amdgpu_reset_context *reset_context)
5068{
5069 int r;
5070 struct amdgpu_hive_info *hive = NULL;
5071
5072 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5073 if (!amdgpu_ras_get_fed_status(adev))
5074 amdgpu_virt_ready_to_reset(adev);
5075 amdgpu_virt_wait_reset(adev);
5076 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5077 r = amdgpu_virt_request_full_gpu(adev, true);
5078 } else {
5079 r = amdgpu_virt_reset_gpu(adev);
5080 }
5081 if (r)
5082 return r;
5083
5084 amdgpu_ras_set_fed(adev, false);
5085 amdgpu_irq_gpu_reset_resume_helper(adev);
5086
5087 /* some sw cleanup the VF needs to do before recovery */
5088 amdgpu_virt_post_reset(adev);
5089
5090 /* Resume IP prior to SMC */
5091 r = amdgpu_device_ip_reinit_early_sriov(adev);
5092 if (r)
5093 return r;
5094
5095 amdgpu_virt_init_data_exchange(adev);
5096
5097 r = amdgpu_device_fw_loading(adev);
5098 if (r)
5099 return r;
5100
5101 /* now we are okay to resume SMC/CP/SDMA */
5102 r = amdgpu_device_ip_reinit_late_sriov(adev);
5103 if (r)
5104 return r;
5105
5106 hive = amdgpu_get_xgmi_hive(adev);
5107 /* Update PSP FW topology after reset */
5108 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5109 r = amdgpu_xgmi_update_topology(hive, adev);
5110 if (hive)
5111 amdgpu_put_xgmi_hive(hive);
5112 if (r)
5113 return r;
5114
5115 r = amdgpu_ib_ring_tests(adev);
5116 if (r)
5117 return r;
5118
5119 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
5120 amdgpu_inc_vram_lost(adev);
5121 r = amdgpu_device_recover_vram(adev);
5122 }
5123 if (r)
5124 return r;
5125
5126 /* This needs to be called during full access, so we can't do it later
5127 * like bare-metal does.
5128 */
5129 amdgpu_amdkfd_post_reset(adev);
5130 amdgpu_virt_release_full_gpu(adev, true);
5131
5132 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so we need to resume ras during reset */
5133 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5134 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5135 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5136 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5137 amdgpu_ras_resume(adev);
5138 return 0;
5139}
5140
5141/**
5142 * amdgpu_device_has_job_running - check if there is any job in the pending list
5143 *
5144 * @adev: amdgpu_device pointer
5145 *
5146 * Check if there is any job in the pending list of any ring.
5147 */
5148bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5149{
5150 int i;
5151 struct drm_sched_job *job;
5152
5153 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5154 struct amdgpu_ring *ring = adev->rings[i];
5155
5156 if (!amdgpu_ring_sched_ready(ring))
5157 continue;
5158
5159 spin_lock(&ring->sched.job_list_lock);
5160 job = list_first_entry_or_null(&ring->sched.pending_list,
5161 struct drm_sched_job, list);
5162 spin_unlock(&ring->sched.job_list_lock);
5163 if (job)
5164 return true;
5165 }
5166 return false;
5167}
5168
5169/**
5170 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5171 *
5172 * @adev: amdgpu_device pointer
5173 *
5174 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5175 * a hung GPU.
5176 */
5177bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5178{
5179
5180 if (amdgpu_gpu_recovery == 0)
5181 goto disabled;
5182
5183 /* Skip soft reset check in fatal error mode */
5184 if (!amdgpu_ras_is_poison_mode_supported(adev))
5185 return true;
5186
5187 if (amdgpu_sriov_vf(adev))
5188 return true;
5189
5190 if (amdgpu_gpu_recovery == -1) {
5191 switch (adev->asic_type) {
5192#ifdef CONFIG_DRM_AMDGPU_SI
5193 case CHIP_VERDE:
5194 case CHIP_TAHITI:
5195 case CHIP_PITCAIRN:
5196 case CHIP_OLAND:
5197 case CHIP_HAINAN:
5198#endif
5199#ifdef CONFIG_DRM_AMDGPU_CIK
5200 case CHIP_KAVERI:
5201 case CHIP_KABINI:
5202 case CHIP_MULLINS:
5203#endif
5204 case CHIP_CARRIZO:
5205 case CHIP_STONEY:
5206 case CHIP_CYAN_SKILLFISH:
5207 goto disabled;
5208 default:
5209 break;
5210 }
5211 }
5212
5213 return true;
5214
5215disabled:
5216 dev_info(adev->dev, "GPU recovery disabled.\n");
5217 return false;
5218}
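/*
 * For reference, amdgpu.gpu_recovery follows the usual convention here:
 * 0 disables recovery, 1 enables it, and the default of -1 ("auto") lets the
 * driver decide per ASIC; in auto mode the legacy chips listed in the switch
 * above fall through to "disabled".
 */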
5219
5220int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5221{
5222 u32 i;
5223 int ret = 0;
5224
5225 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5226
5227 dev_info(adev->dev, "GPU mode1 reset\n");
5228
5229 /* Cache the state before bus master disable. The saved config space
5230 * values are used in other cases like restore after mode-2 reset.
5231 */
5232 amdgpu_device_cache_pci_state(adev->pdev);
5233
5234 /* disable BM */
5235 pci_clear_master(adev->pdev);
5236
5237 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5238 dev_info(adev->dev, "GPU smu mode1 reset\n");
5239 ret = amdgpu_dpm_mode1_reset(adev);
5240 } else {
5241 dev_info(adev->dev, "GPU psp mode1 reset\n");
5242 ret = psp_gpu_reset(adev);
5243 }
5244
5245 if (ret)
5246 goto mode1_reset_failed;
5247
5248 amdgpu_device_load_pci_state(adev->pdev);
5249 ret = amdgpu_psp_wait_for_bootloader(adev);
5250 if (ret)
5251 goto mode1_reset_failed;
5252
5253 /* wait for asic to come out of reset */
5254 for (i = 0; i < adev->usec_timeout; i++) {
5255 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5256
5257 if (memsize != 0xffffffff)
5258 break;
5259 udelay(1);
5260 }
5261
5262 if (i >= adev->usec_timeout) {
5263 ret = -ETIMEDOUT;
5264 goto mode1_reset_failed;
5265 }
5266
5267 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5268
5269 return 0;
5270
5271mode1_reset_failed:
5272 dev_err(adev->dev, "GPU mode1 reset failed\n");
5273 return ret;
5274}
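/*
 * Sketch of the mode1 reset flow above: cache the PCI config space, disable
 * bus mastering, trigger the reset through the SMU when supported (PSP
 * otherwise), restore the cached config space, wait for the bootloader, and
 * poll the nbio memsize register until it reads something other than
 * 0xffffffff as the sign that the asic is back.
 */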
5275
5276int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5277 struct amdgpu_reset_context *reset_context)
5278{
5279 int i, r = 0;
5280 struct amdgpu_job *job = NULL;
5281 bool need_full_reset =
5282 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5283
5284 if (reset_context->reset_req_dev == adev)
5285 job = reset_context->job;
5286
5287 if (amdgpu_sriov_vf(adev)) {
5288 /* stop the data exchange thread */
5289 amdgpu_virt_fini_data_exchange(adev);
5290 }
5291
5292 amdgpu_fence_driver_isr_toggle(adev, true);
5293
5294 /* block all schedulers and reset given job's ring */
5295 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5296 struct amdgpu_ring *ring = adev->rings[i];
5297
5298 if (!amdgpu_ring_sched_ready(ring))
5299 continue;
5300
5301 /* Clear job fence from fence drv to avoid force_completion
5302 * leave NULL and vm flush fence in fence drv
5303 */
5304 amdgpu_fence_driver_clear_job_fences(ring);
5305
5306 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5307 amdgpu_fence_driver_force_completion(ring);
5308 }
5309
5310 amdgpu_fence_driver_isr_toggle(adev, false);
5311
5312 if (job && job->vm)
5313 drm_sched_increase_karma(&job->base);
5314
5315 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5316 /* If reset handler not implemented, continue; otherwise return */
5317 if (r == -EOPNOTSUPP)
5318 r = 0;
5319 else
5320 return r;
5321
5322 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5323 if (!amdgpu_sriov_vf(adev)) {
5324
5325 if (!need_full_reset)
5326 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5327
5328 if (!need_full_reset && amdgpu_gpu_recovery &&
5329 amdgpu_device_ip_check_soft_reset(adev)) {
5330 amdgpu_device_ip_pre_soft_reset(adev);
5331 r = amdgpu_device_ip_soft_reset(adev);
5332 amdgpu_device_ip_post_soft_reset(adev);
5333 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5334 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5335 need_full_reset = true;
5336 }
5337 }
5338
5339 if (need_full_reset)
5340 r = amdgpu_device_ip_suspend(adev);
5341 if (need_full_reset)
5342 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5343 else
5344 clear_bit(AMDGPU_NEED_FULL_RESET,
5345 &reset_context->flags);
5346 }
5347
5348 return r;
5349}
5350
5351static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
5352{
5353 int i;
5354
5355 lockdep_assert_held(&adev->reset_domain->sem);
5356
5357 for (i = 0; i < adev->reset_info.num_regs; i++) {
5358 adev->reset_info.reset_dump_reg_value[i] =
5359 RREG32(adev->reset_info.reset_dump_reg_list[i]);
5360
5361 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
5362 adev->reset_info.reset_dump_reg_value[i]);
5363 }
5364
5365 return 0;
5366}
5367
5368int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5369 struct amdgpu_reset_context *reset_context)
5370{
5371 struct amdgpu_device *tmp_adev = NULL;
5372 bool need_full_reset, skip_hw_reset, vram_lost = false;
5373 int r = 0;
5374 uint32_t i;
5375
5376 /* Try reset handler method first */
5377 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5378 reset_list);
5379
5380 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5381 amdgpu_reset_reg_dumps(tmp_adev);
5382
5383 dev_info(tmp_adev->dev, "Dumping IP State\n");
5384 /* Trigger ip dump before we reset the asic */
5385 for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5386 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5387 tmp_adev->ip_blocks[i].version->funcs
5388 ->dump_ip_state((void *)tmp_adev);
5389 dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5390 }
5391
5392 reset_context->reset_device_list = device_list_handle;
5393 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5394 /* If reset handler not implemented, continue; otherwise return */
5395 if (r == -EOPNOTSUPP)
5396 r = 0;
5397 else
5398 return r;
5399
5400 /* Reset handler not implemented, use the default method */
5401 need_full_reset =
5402 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5403 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5404
5405 /*
5406 * ASIC reset has to be done on all XGMI hive nodes ASAP
5407 * to allow proper link negotiation in FW (within 1 sec)
5408 */
5409 if (!skip_hw_reset && need_full_reset) {
5410 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5411 /* For XGMI run all resets in parallel to speed up the process */
5412 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5413 tmp_adev->gmc.xgmi.pending_reset = false;
5414 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
5415 r = -EALREADY;
5416 } else
5417 r = amdgpu_asic_reset(tmp_adev);
5418
5419 if (r) {
5420 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
5421 r, adev_to_drm(tmp_adev)->unique);
5422 goto out;
5423 }
5424 }
5425
5426 /* For XGMI wait for all resets to complete before proceeding */
5427 if (!r) {
5428 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5429 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5430 flush_work(&tmp_adev->xgmi_reset_work);
5431 r = tmp_adev->asic_reset_res;
5432 if (r)
5433 break;
5434 }
5435 }
5436 }
5437 }
5438
5439 if (!r && amdgpu_ras_intr_triggered()) {
5440 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5441 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB);
5442 }
5443
5444 amdgpu_ras_intr_cleared();
5445 }
5446
5447 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5448 if (need_full_reset) {
5449 /* post card */
5450 amdgpu_ras_set_fed(tmp_adev, false);
5451 r = amdgpu_device_asic_init(tmp_adev);
5452 if (r) {
5453 dev_warn(tmp_adev->dev, "asic atom init failed!");
5454 } else {
5455 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5456
5457 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5458 if (r)
5459 goto out;
5460
5461 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5462
5463 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5464 amdgpu_coredump(tmp_adev, vram_lost, reset_context);
5465
5466 if (vram_lost) {
5467 DRM_INFO("VRAM is lost due to GPU reset!\n");
5468 amdgpu_inc_vram_lost(tmp_adev);
5469 }
5470
5471 r = amdgpu_device_fw_loading(tmp_adev);
5472 if (r)
5473 return r;
5474
5475 r = amdgpu_xcp_restore_partition_mode(
5476 tmp_adev->xcp_mgr);
5477 if (r)
5478 goto out;
5479
5480 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5481 if (r)
5482 goto out;
5483
5484 if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5485 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5486
5487 if (vram_lost)
5488 amdgpu_device_fill_reset_magic(tmp_adev);
5489
5490 /*
5491 * Add this ASIC as tracked, as the reset was already
5492 * completed successfully.
5493 */
5494 amdgpu_register_gpu_instance(tmp_adev);
5495
5496 if (!reset_context->hive &&
5497 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5498 amdgpu_xgmi_add_device(tmp_adev);
5499
5500 r = amdgpu_device_ip_late_init(tmp_adev);
5501 if (r)
5502 goto out;
5503
5504 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5505
5506 /*
5507 * The GPU enters a bad state once the number of faulty
5508 * pages reported by ECC has reached the threshold, and ras
5509 * recovery is scheduled next. Add a check here to break
5510 * recovery if the bad page threshold has indeed been
5511 * exceeded, and remind the user to either retire this GPU
5512 * or set a bigger bad_page_threshold value to fix this
5513 * the next time the driver is probed.
5515 */
5516 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5517 /* must succeed. */
5518 amdgpu_ras_resume(tmp_adev);
5519 } else {
5520 r = -EINVAL;
5521 goto out;
5522 }
5523
5524 /* Update PSP FW topology after reset */
5525 if (reset_context->hive &&
5526 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5527 r = amdgpu_xgmi_update_topology(
5528 reset_context->hive, tmp_adev);
5529 }
5530 }
5531
5532out:
5533 if (!r) {
5534 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5535 r = amdgpu_ib_ring_tests(tmp_adev);
5536 if (r) {
5537 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5538 need_full_reset = true;
5539 r = -EAGAIN;
5540 goto end;
5541 }
5542 }
5543
5544 if (!r)
5545 r = amdgpu_device_recover_vram(tmp_adev);
5546 else
5547 tmp_adev->asic_reset_res = r;
5548 }
5549
5550end:
5551 if (need_full_reset)
5552 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5553 else
5554 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5555 return r;
5556}
5557
5558static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5559{
5560
5561 switch (amdgpu_asic_reset_method(adev)) {
5562 case AMD_RESET_METHOD_MODE1:
5563 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5564 break;
5565 case AMD_RESET_METHOD_MODE2:
5566 adev->mp1_state = PP_MP1_STATE_RESET;
5567 break;
5568 default:
5569 adev->mp1_state = PP_MP1_STATE_NONE;
5570 break;
5571 }
5572}
5573
5574static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5575{
5576 amdgpu_vf_error_trans_all(adev);
5577 adev->mp1_state = PP_MP1_STATE_NONE;
5578}
5579
5580static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5581{
5582 struct pci_dev *p = NULL;
5583
5584 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5585 adev->pdev->bus->number, 1);
5586 if (p) {
5587 pm_runtime_enable(&(p->dev));
5588 pm_runtime_resume(&(p->dev));
5589 }
5590
5591 pci_dev_put(p);
5592}
5593
5594static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5595{
5596 enum amd_reset_method reset_method;
5597 struct pci_dev *p = NULL;
5598 u64 expires;
5599
5600 /*
5601 * For now, only BACO and mode1 reset are confirmed to
5602 * suffer from the audio issue if the audio device is not properly suspended.
5603 */
5604 reset_method = amdgpu_asic_reset_method(adev);
5605 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5606 (reset_method != AMD_RESET_METHOD_MODE1))
5607 return -EINVAL;
5608
5609 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5610 adev->pdev->bus->number, 1);
5611 if (!p)
5612 return -ENODEV;
5613
5614 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5615 if (!expires)
5616		/*
5617		 * If we cannot get the audio device autosuspend delay,
5618		 * a fixed 4s interval is used. Since 3s is the audio
5619		 * controller's default autosuspend delay setting, the 4s
5620		 * used here is guaranteed to cover it.
5621		 */
5622 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5623
5624 while (!pm_runtime_status_suspended(&(p->dev))) {
5625 if (!pm_runtime_suspend(&(p->dev)))
5626 break;
5627
5628 if (expires < ktime_get_mono_fast_ns()) {
5629 dev_warn(adev->dev, "failed to suspend display audio\n");
5630 pci_dev_put(p);
5631 /* TODO: abort the succeeding gpu reset? */
5632 return -ETIMEDOUT;
5633 }
5634 }
5635
5636 pm_runtime_disable(&(p->dev));
5637
5638 pci_dev_put(p);
5639 return 0;
5640}
5641
5642static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5643{
5644 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5645
5646#if defined(CONFIG_DEBUG_FS)
5647 if (!amdgpu_sriov_vf(adev))
5648 cancel_work(&adev->reset_work);
5649#endif
5650
5651 if (adev->kfd.dev)
5652 cancel_work(&adev->kfd.reset_work);
5653
5654 if (amdgpu_sriov_vf(adev))
5655 cancel_work(&adev->virt.flr_work);
5656
5657 if (con && adev->ras_enabled)
5658 cancel_work(&con->recovery_work);
5659
5660}
5661
5662static int amdgpu_device_health_check(struct list_head *device_list_handle)
5663{
5664 struct amdgpu_device *tmp_adev;
5665 int ret = 0;
5666 u32 status;
5667
5668 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5669 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
5670 if (PCI_POSSIBLE_ERROR(status)) {
5671 dev_err(tmp_adev->dev, "device lost from bus!");
5672 ret = -ENODEV;
5673 }
5674 }
5675
5676 return ret;
5677}
5678
5679/**
5680 * amdgpu_device_gpu_recover - reset the ASIC and recover the scheduler
5681 *
5682 * @adev: amdgpu_device pointer
5683 * @job: the job that triggered the hang
5684 * @reset_context: amdgpu reset context pointer
5685 *
5686 * Attempt to reset the GPU if it has hung (all ASICs).
5687 * Attempts a soft reset or a full reset and reinitializes the ASIC.
5688 * Returns 0 for success or an error on failure.
5689 */
5690
5691int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5692 struct amdgpu_job *job,
5693 struct amdgpu_reset_context *reset_context)
5694{
5695 struct list_head device_list, *device_list_handle = NULL;
5696 bool job_signaled = false;
5697 struct amdgpu_hive_info *hive = NULL;
5698 struct amdgpu_device *tmp_adev = NULL;
5699 int i, r = 0;
5700 bool need_emergency_restart = false;
5701 bool audio_suspended = false;
5702 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5703
5704 /*
5705 * Special case: RAS triggered and full reset isn't supported
5706 */
5707 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5708
5709	/*
5710	 * Flush RAM to disk so that after reboot
5711	 * the user can read the log and see why the system rebooted.
5712	 */
5713 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5714 amdgpu_ras_get_context(adev)->reboot) {
5715 DRM_WARN("Emergency reboot.");
5716
5717 ksys_sync_helper();
5718 emergency_restart();
5719 }
5720
5721 dev_info(adev->dev, "GPU %s begin!\n",
5722 need_emergency_restart ? "jobs stop":"reset");
5723
5724 if (!amdgpu_sriov_vf(adev))
5725 hive = amdgpu_get_xgmi_hive(adev);
5726 if (hive)
5727 mutex_lock(&hive->hive_lock);
5728
5729 reset_context->job = job;
5730 reset_context->hive = hive;
5731	/*
5732	 * Build the list of devices to reset.
5733	 * In case we are in XGMI hive mode, re-sort the device list
5734	 * to put adev in the 1st position.
5735	 */
5736 INIT_LIST_HEAD(&device_list);
5737 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5738 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5739 list_add_tail(&tmp_adev->reset_list, &device_list);
5740 if (adev->shutdown)
5741 tmp_adev->shutdown = true;
5742 }
5743 if (!list_is_first(&adev->reset_list, &device_list))
5744 list_rotate_to_front(&adev->reset_list, &device_list);
5745 device_list_handle = &device_list;
5746 } else {
5747 list_add_tail(&adev->reset_list, &device_list);
5748 device_list_handle = &device_list;
5749 }
5750
5751 if (!amdgpu_sriov_vf(adev)) {
5752 r = amdgpu_device_health_check(device_list_handle);
5753 if (r)
5754 goto end_reset;
5755 }
5756
5757	/* We need to lock the reset domain only once, for both XGMI and single device */
5758 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5759 reset_list);
5760 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5761
5762 /* block all schedulers and reset given job's ring */
5763 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5764
5765 amdgpu_device_set_mp1_state(tmp_adev);
5766
5767		/*
5768		 * Try to put the audio codec into the suspend state
5769		 * before the GPU reset starts.
5770		 *
5771		 * The power domain of the graphics device is shared
5772		 * with the AZ (audio) power domain. Without this, we
5773		 * may change the audio hardware behind the audio
5774		 * driver's back, which would trigger audio codec
5775		 * errors.
5776		 */
5777 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5778 audio_suspended = true;
5779
5780 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5781
5782 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5783
5784 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
5785
5786		/*
5787		 * Mark the ASICs to be reset as untracked first,
5788		 * and add them back after the reset completes.
5789		 */
5790 amdgpu_unregister_gpu_instance(tmp_adev);
5791
5792 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5793
5794 /* disable ras on ALL IPs */
5795 if (!need_emergency_restart &&
5796 amdgpu_device_ip_need_full_reset(tmp_adev))
5797 amdgpu_ras_suspend(tmp_adev);
5798
5799 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5800 struct amdgpu_ring *ring = tmp_adev->rings[i];
5801
5802 if (!amdgpu_ring_sched_ready(ring))
5803 continue;
5804
5805 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5806
5807 if (need_emergency_restart)
5808 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5809 }
5810 atomic_inc(&tmp_adev->gpu_reset_counter);
5811 }
5812
5813 if (need_emergency_restart)
5814 goto skip_sched_resume;
5815
5816 /*
5817 * Must check guilty signal here since after this point all old
5818 * HW fences are force signaled.
5819 *
5820 * job->base holds a reference to parent fence
5821 */
5822 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5823 job_signaled = true;
5824 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5825 goto skip_hw_reset;
5826 }
5827
5828retry: /* Rest of adevs pre asic reset from XGMI hive. */
5829 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5830 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5831		/* TODO: should we stop? */
5832 if (r) {
5833 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5834 r, adev_to_drm(tmp_adev)->unique);
5835 tmp_adev->asic_reset_res = r;
5836 }
5837 }
5838
5839	/* Actual ASIC resets if needed. */
5840 /* Host driver will handle XGMI hive reset for SRIOV */
5841 if (amdgpu_sriov_vf(adev)) {
5842 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
5843 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
5844 amdgpu_ras_set_fed(adev, true);
5845 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5846 }
5847
5848 r = amdgpu_device_reset_sriov(adev, reset_context);
5849 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
5850 amdgpu_virt_release_full_gpu(adev, true);
5851 goto retry;
5852 }
5853 if (r)
5854 adev->asic_reset_res = r;
5855 } else {
5856 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5857 if (r && r == -EAGAIN)
5858 goto retry;
5859 }
5860
5861 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5862 /*
5863		 * Drop any pending non-scheduler resets queued before the reset is done.
5864 * Any reset scheduled after this point would be valid. Scheduler resets
5865 * were already dropped during drm_sched_stop and no new ones can come
5866 * in before drm_sched_start.
5867 */
5868 amdgpu_device_stop_pending_resets(tmp_adev);
5869 }
5870
5871skip_hw_reset:
5872
5873	/* Post ASIC reset for all devs. */
5874 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5875
5876 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5877 struct amdgpu_ring *ring = tmp_adev->rings[i];
5878
5879 if (!amdgpu_ring_sched_ready(ring))
5880 continue;
5881
5882 drm_sched_start(&ring->sched, true);
5883 }
5884
5885 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5886 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5887
5888 if (tmp_adev->asic_reset_res)
5889 r = tmp_adev->asic_reset_res;
5890
5891 tmp_adev->asic_reset_res = 0;
5892
5893 if (r) {
5894			/* bad news, how do we tell it to userspace? */
5895 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5896 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5897 } else {
5898 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5899 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5900 DRM_WARN("smart shift update failed\n");
5901 }
5902 }
5903
5904skip_sched_resume:
5905 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5906 /* unlock kfd: SRIOV would do it separately */
5907 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5908 amdgpu_amdkfd_post_reset(tmp_adev);
5909
5910		/* kfd_post_reset will do nothing if the kfd device is not initialized;
5911		 * bring up kfd here if it was not initialized before.
5912		 */
5913 if (!adev->kfd.init_complete)
5914 amdgpu_amdkfd_device_init(adev);
5915
5916 if (audio_suspended)
5917 amdgpu_device_resume_display_audio(tmp_adev);
5918
5919 amdgpu_device_unset_mp1_state(tmp_adev);
5920
5921 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5922 }
5923
5924 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5925 reset_list);
5926 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5927
5928end_reset:
5929 if (hive) {
5930 mutex_unlock(&hive->hive_lock);
5931 amdgpu_put_xgmi_hive(hive);
5932 }
5933
5934 if (r)
5935 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5936
5937 atomic_set(&adev->reset_domain->reset_res, r);
5938 return r;
5939}
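
/*
 * Hedged usage sketch (illustration only, not part of the driver): how a
 * hypothetical hang-detection path might request a full recovery through
 * amdgpu_device_gpu_recover(). The wrapper name is made up; the
 * reset-context fields mirror the ones used by amdgpu_pci_slot_reset()
 * later in this file.
 */
#if 0
static int amdgpu_example_request_recovery(struct amdgpu_device *adev,
					   struct amdgpu_job *job)
{
	struct amdgpu_reset_context reset_context;

	memset(&reset_context, 0, sizeof(reset_context));
	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	/* 0 on success, negative error code if recovery failed */
	return amdgpu_device_gpu_recover(adev, job, &reset_context);
}
#endif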
5940
5941/**
5942 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
5943 *
5944 * @adev: amdgpu_device pointer
5945 * @speed: pointer to the speed of the link
5946 * @width: pointer to the width of the link
5947 *
5948 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
5949 * first physical partner to an AMD dGPU.
5950 * This will exclude any virtual switches and links.
5951 */
5952static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
5953 enum pci_bus_speed *speed,
5954 enum pcie_link_width *width)
5955{
5956 struct pci_dev *parent = adev->pdev;
5957
5958 if (!speed || !width)
5959 return;
5960
5961 *speed = PCI_SPEED_UNKNOWN;
5962 *width = PCIE_LNK_WIDTH_UNKNOWN;
5963
5964 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
5965 while ((parent = pci_upstream_bridge(parent))) {
5966			/* skip upstream/downstream switches internal to dGPU */
5967 if (parent->vendor == PCI_VENDOR_ID_ATI)
5968 continue;
5969 *speed = pcie_get_speed_cap(parent);
5970 *width = pcie_get_width_cap(parent);
5971 break;
5972 }
5973 } else {
5974 /* use the current speeds rather than max if switching is not supported */
5975 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
5976 }
5977}
5978
5979/**
5980 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5981 *
5982 * @adev: amdgpu_device pointer
5983 *
5984 * Fetches and stores in the driver the PCIe capabilities (gen speed
5985 * and lanes) of the slot the device is in. Handles APUs and
5986 * virtualized environments where PCIe config space may not be available.
5987 */
5988static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5989{
5990 struct pci_dev *pdev;
5991 enum pci_bus_speed speed_cap, platform_speed_cap;
5992 enum pcie_link_width platform_link_width;
5993
5994 if (amdgpu_pcie_gen_cap)
5995 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5996
5997 if (amdgpu_pcie_lane_cap)
5998 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5999
6000 /* covers APUs as well */
6001 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6002 if (adev->pm.pcie_gen_mask == 0)
6003 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6004 if (adev->pm.pcie_mlw_mask == 0)
6005 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6006 return;
6007 }
6008
6009 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6010 return;
6011
6012 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6013 &platform_link_width);
6014
6015 if (adev->pm.pcie_gen_mask == 0) {
6016 /* asic caps */
6017 pdev = adev->pdev;
6018 speed_cap = pcie_get_speed_cap(pdev);
6019 if (speed_cap == PCI_SPEED_UNKNOWN) {
6020 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6021 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6022 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6023 } else {
6024 if (speed_cap == PCIE_SPEED_32_0GT)
6025 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6026 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6027 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6028 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6029 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6030 else if (speed_cap == PCIE_SPEED_16_0GT)
6031 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6032 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6033 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6034 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6035 else if (speed_cap == PCIE_SPEED_8_0GT)
6036 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6037 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6038 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6039 else if (speed_cap == PCIE_SPEED_5_0GT)
6040 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6041 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6042 else
6043 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6044 }
6045 /* platform caps */
6046 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6047 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6048 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6049 } else {
6050 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6051 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6052 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6053 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6054 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6055 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6056 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6057 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6058 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6059 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6060 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6061 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6062 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6063 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6064 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6065 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6066 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6067 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6068 else
6069 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6070
6071 }
6072 }
6073 if (adev->pm.pcie_mlw_mask == 0) {
6074 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6075 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6076 } else {
6077 switch (platform_link_width) {
6078 case PCIE_LNK_X32:
6079 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6080 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6081 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6082 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6083 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6084 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6085 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6086 break;
6087 case PCIE_LNK_X16:
6088 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6089 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6090 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6091 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6092 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6093 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6094 break;
6095 case PCIE_LNK_X12:
6096 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6097 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6098 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6099 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6100 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6101 break;
6102 case PCIE_LNK_X8:
6103 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6104 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6105 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6106 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6107 break;
6108 case PCIE_LNK_X4:
6109 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6110 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6111 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6112 break;
6113 case PCIE_LNK_X2:
6114 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6115 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6116 break;
6117 case PCIE_LNK_X1:
6118 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6119 break;
6120 default:
6121 break;
6122 }
6123 }
6124 }
6125}
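
/*
 * Hedged sketch (illustration only): the gen/width masks filled in above are
 * consumed as plain bit tests; the helper name below is hypothetical.
 */
#if 0
static bool amdgpu_example_platform_supports_gen4(struct amdgpu_device *adev)
{
	/* true if the platform-side link advertises at least PCIe Gen4 */
	return !!(adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
}
#endif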
6126
6127/**
6128 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6129 *
6130 * @adev: amdgpu_device pointer
6131 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6132 *
6133 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6134 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6135 * @peer_adev.
6136 */
6137bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6138 struct amdgpu_device *peer_adev)
6139{
6140#ifdef CONFIG_HSA_AMD_P2P
6141 uint64_t address_mask = peer_adev->dev->dma_mask ?
6142 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6143 resource_size_t aper_limit =
6144 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6145 bool p2p_access =
6146 !adev->gmc.xgmi.connected_to_cpu &&
6147 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6148
6149 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
6150 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
6151 !(adev->gmc.aper_base & address_mask ||
6152 aper_limit & address_mask));
6153#else
6154 return false;
6155#endif
6156}
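
/*
 * Hedged sketch (illustration only): peer accessibility is direction
 * specific, so a hypothetical caller setting up P2P DMA between two devices
 * would typically check both directions before relying on BAR access.
 */
#if 0
static bool amdgpu_example_can_use_p2p(struct amdgpu_device *a,
				       struct amdgpu_device *b)
{
	return amdgpu_device_is_peer_accessible(a, b) &&
	       amdgpu_device_is_peer_accessible(b, a);
}
#endif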
6157
6158int amdgpu_device_baco_enter(struct drm_device *dev)
6159{
6160 struct amdgpu_device *adev = drm_to_adev(dev);
6161 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6162
6163 if (!amdgpu_device_supports_baco(dev))
6164 return -ENOTSUPP;
6165
6166 if (ras && adev->ras_enabled &&
6167 adev->nbio.funcs->enable_doorbell_interrupt)
6168 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6169
6170 return amdgpu_dpm_baco_enter(adev);
6171}
6172
6173int amdgpu_device_baco_exit(struct drm_device *dev)
6174{
6175 struct amdgpu_device *adev = drm_to_adev(dev);
6176 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6177 int ret = 0;
6178
6179 if (!amdgpu_device_supports_baco(dev))
6180 return -ENOTSUPP;
6181
6182 ret = amdgpu_dpm_baco_exit(adev);
6183 if (ret)
6184 return ret;
6185
6186 if (ras && adev->ras_enabled &&
6187 adev->nbio.funcs->enable_doorbell_interrupt)
6188 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6189
6190 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6191 adev->nbio.funcs->clear_doorbell_interrupt)
6192 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6193
6194 return 0;
6195}
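
/*
 * Hedged sketch (illustration only): BACO entry and exit are expected to be
 * paired; a hypothetical caller would bail out if entry fails and attempt
 * the matching exit once it is done with the low-power window.
 */
#if 0
static int amdgpu_example_baco_cycle(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);
	if (r)
		return r;

	/* ... the device sits in BACO here ... */

	return amdgpu_device_baco_exit(dev);
}
#endif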
6196
6197/**
6198 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6199 * @pdev: PCI device struct
6200 * @state: PCI channel state
6201 *
6202 * Description: Called when a PCI error is detected.
6203 *
6204 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6205 */
6206pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6207{
6208 struct drm_device *dev = pci_get_drvdata(pdev);
6209 struct amdgpu_device *adev = drm_to_adev(dev);
6210 int i;
6211
6212 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
6213
6214 if (adev->gmc.xgmi.num_physical_nodes > 1) {
6215 DRM_WARN("No support for XGMI hive yet...");
6216 return PCI_ERS_RESULT_DISCONNECT;
6217 }
6218
6219 adev->pci_channel_state = state;
6220
6221 switch (state) {
6222 case pci_channel_io_normal:
6223 return PCI_ERS_RESULT_CAN_RECOVER;
6224 /* Fatal error, prepare for slot reset */
6225 case pci_channel_io_frozen:
6226 /*
6227 * Locking adev->reset_domain->sem will prevent any external access
6228 * to GPU during PCI error recovery
6229 */
6230 amdgpu_device_lock_reset_domain(adev->reset_domain);
6231 amdgpu_device_set_mp1_state(adev);
6232
6233 /*
6234 * Block any work scheduling as we do for regular GPU reset
6235 * for the duration of the recovery
6236 */
6237 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6238 struct amdgpu_ring *ring = adev->rings[i];
6239
6240 if (!amdgpu_ring_sched_ready(ring))
6241 continue;
6242
6243 drm_sched_stop(&ring->sched, NULL);
6244 }
6245 atomic_inc(&adev->gpu_reset_counter);
6246 return PCI_ERS_RESULT_NEED_RESET;
6247 case pci_channel_io_perm_failure:
6248 /* Permanent error, prepare for device removal */
6249 return PCI_ERS_RESULT_DISCONNECT;
6250 }
6251
6252 return PCI_ERS_RESULT_NEED_RESET;
6253}
6254
6255/**
6256 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6257 * @pdev: pointer to PCI device
6258 */
6259pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6260{
6261
6262 DRM_INFO("PCI error: mmio enabled callback!!\n");
6263
6264 /* TODO - dump whatever for debugging purposes */
6265
6266	/* This is called only if amdgpu_pci_error_detected returns
6267	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
6268	 * works, so there is no need to reset the slot.
6269	 */
6270
6271 return PCI_ERS_RESULT_RECOVERED;
6272}
6273
6274/**
6275 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6276 * @pdev: PCI device struct
6277 *
6278 * Description: This routine is called by the pci error recovery
6279 * code after the PCI slot has been reset, just before we
6280 * should resume normal operations.
6281 */
6282pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6283{
6284 struct drm_device *dev = pci_get_drvdata(pdev);
6285 struct amdgpu_device *adev = drm_to_adev(dev);
6286 int r, i;
6287 struct amdgpu_reset_context reset_context;
6288 u32 memsize;
6289 struct list_head device_list;
6290
6291	/* PCI error slot reset should be skipped during RAS recovery */
6292 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6293 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6294 amdgpu_ras_in_recovery(adev))
6295 return PCI_ERS_RESULT_RECOVERED;
6296
6297 DRM_INFO("PCI error: slot reset callback!!\n");
6298
6299 memset(&reset_context, 0, sizeof(reset_context));
6300
6301 INIT_LIST_HEAD(&device_list);
6302 list_add_tail(&adev->reset_list, &device_list);
6303
6304 /* wait for asic to come out of reset */
6305 msleep(500);
6306
6307	/* Restore PCI config space */
6308 amdgpu_device_load_pci_state(pdev);
6309
6310 /* confirm ASIC came out of reset */
6311 for (i = 0; i < adev->usec_timeout; i++) {
6312 memsize = amdgpu_asic_get_config_memsize(adev);
6313
6314 if (memsize != 0xffffffff)
6315 break;
6316 udelay(1);
6317 }
6318 if (memsize == 0xffffffff) {
6319 r = -ETIME;
6320 goto out;
6321 }
6322
6323 reset_context.method = AMD_RESET_METHOD_NONE;
6324 reset_context.reset_req_dev = adev;
6325 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6326 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6327
6328 adev->no_hw_access = true;
6329 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6330 adev->no_hw_access = false;
6331 if (r)
6332 goto out;
6333
6334 r = amdgpu_do_asic_reset(&device_list, &reset_context);
6335
6336out:
6337 if (!r) {
6338 if (amdgpu_device_cache_pci_state(adev->pdev))
6339 pci_restore_state(adev->pdev);
6340
6341 DRM_INFO("PCIe error recovery succeeded\n");
6342 } else {
6343 DRM_ERROR("PCIe error recovery failed, err:%d", r);
6344 amdgpu_device_unset_mp1_state(adev);
6345 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6346 }
6347
6348 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6349}
6350
6351/**
6352 * amdgpu_pci_resume() - resume normal ops after PCI reset
6353 * @pdev: pointer to PCI device
6354 *
6355 * Called when the error recovery driver tells us that it's
6356 * OK to resume normal operation.
6357 */
6358void amdgpu_pci_resume(struct pci_dev *pdev)
6359{
6360 struct drm_device *dev = pci_get_drvdata(pdev);
6361 struct amdgpu_device *adev = drm_to_adev(dev);
6362 int i;
6363
6364
6365 DRM_INFO("PCI error: resume callback!!\n");
6366
6367 /* Only continue execution for the case of pci_channel_io_frozen */
6368 if (adev->pci_channel_state != pci_channel_io_frozen)
6369 return;
6370
6371 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6372 struct amdgpu_ring *ring = adev->rings[i];
6373
6374 if (!amdgpu_ring_sched_ready(ring))
6375 continue;
6376
6377 drm_sched_start(&ring->sched, true);
6378 }
6379
6380 amdgpu_device_unset_mp1_state(adev);
6381 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6382}
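
/*
 * Hedged sketch (illustration only): the four callbacks above are the
 * standard PCI error-recovery hooks and are wired into the driver's
 * struct pci_driver through a struct pci_error_handlers, roughly as below.
 * The variable name is illustrative; the real registration lives outside
 * this file.
 */
#if 0
static const struct pci_error_handlers amdgpu_example_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};
#endif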
6383
6384bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6385{
6386 struct drm_device *dev = pci_get_drvdata(pdev);
6387 struct amdgpu_device *adev = drm_to_adev(dev);
6388 int r;
6389
6390 r = pci_save_state(pdev);
6391 if (!r) {
6392 kfree(adev->pci_state);
6393
6394 adev->pci_state = pci_store_saved_state(pdev);
6395
6396 if (!adev->pci_state) {
6397 DRM_ERROR("Failed to store PCI saved state");
6398 return false;
6399 }
6400 } else {
6401 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6402 return false;
6403 }
6404
6405 return true;
6406}
6407
6408bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6409{
6410 struct drm_device *dev = pci_get_drvdata(pdev);
6411 struct amdgpu_device *adev = drm_to_adev(dev);
6412 int r;
6413
6414 if (!adev->pci_state)
6415 return false;
6416
6417 r = pci_load_saved_state(pdev, adev->pci_state);
6418
6419 if (!r) {
6420 pci_restore_state(pdev);
6421 } else {
6422 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6423 return false;
6424 }
6425
6426 return true;
6427}
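
/*
 * Hedged sketch (illustration only): cache and load are used as a pair
 * around an ASIC reset, mirroring the pattern in amdgpu_pci_slot_reset()
 * above; the wrapper name is hypothetical.
 */
#if 0
static void amdgpu_example_pci_state_roundtrip(struct amdgpu_device *adev)
{
	if (!amdgpu_device_cache_pci_state(adev->pdev))
		return;

	/* ... reset happens here and clobbers PCI config space ... */

	amdgpu_device_load_pci_state(adev->pdev);
}
#endif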
6428
6429void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6430 struct amdgpu_ring *ring)
6431{
6432#ifdef CONFIG_X86_64
6433 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6434 return;
6435#endif
6436 if (adev->gmc.xgmi.connected_to_cpu)
6437 return;
6438
6439 if (ring && ring->funcs->emit_hdp_flush)
6440 amdgpu_ring_emit_hdp_flush(ring);
6441 else
6442 amdgpu_asic_flush_hdp(adev, ring);
6443}
6444
6445void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6446 struct amdgpu_ring *ring)
6447{
6448#ifdef CONFIG_X86_64
6449 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6450 return;
6451#endif
6452 if (adev->gmc.xgmi.connected_to_cpu)
6453 return;
6454
6455 amdgpu_asic_invalidate_hdp(adev, ring);
6456}
6457
6458int amdgpu_in_reset(struct amdgpu_device *adev)
6459{
6460 return atomic_read(&adev->reset_domain->in_gpu_reset);
6461}
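
/*
 * Hedged sketch (illustration only): a hypothetical hardware-access path
 * bailing out early while a GPU reset is in flight instead of touching
 * registers underneath the reset handler.
 */
#if 0
static int amdgpu_example_guarded_access(struct amdgpu_device *adev)
{
	if (amdgpu_in_reset(adev))
		return -EBUSY;

	/* ... safe to touch the hardware here ... */
	return 0;
}
#endif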
6462
6463/**
6464 * amdgpu_device_halt() - bring hardware to some kind of halt state
6465 *
6466 * @adev: amdgpu_device pointer
6467 *
6468 * Bring hardware to some kind of halt state so that no one can touch it
6469 * any more. This helps preserve the error context when an error occurs.
6470 * Compared to a simple hang, the system will stay stable at least for SSH
6471 * access, so it should be trivial to inspect the hardware state and
6472 * see what's going on. Implemented as follows:
6473 *
6474 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
6475 * clears all CPU mappings to device, disallows remappings through page faults
6476 * 2. amdgpu_irq_disable_all() disables all interrupts
6477 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6478 * 4. set adev->no_hw_access to avoid potential crashes after step 5
6479 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6480 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6481 * flush any in flight DMA operations
6482 */
6483void amdgpu_device_halt(struct amdgpu_device *adev)
6484{
6485 struct pci_dev *pdev = adev->pdev;
6486 struct drm_device *ddev = adev_to_drm(adev);
6487
6488 amdgpu_xcp_dev_unplug(adev);
6489 drm_dev_unplug(ddev);
6490
6491 amdgpu_irq_disable_all(adev);
6492
6493 amdgpu_fence_driver_hw_fini(adev);
6494
6495 adev->no_hw_access = true;
6496
6497 amdgpu_device_unmap_mmio(adev);
6498
6499 pci_disable_device(pdev);
6500 pci_wait_for_pending_transaction(pdev);
6501}
6502
6503u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6504 u32 reg)
6505{
6506 unsigned long flags, address, data;
6507 u32 r;
6508
6509 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6510 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6511
6512 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6513 WREG32(address, reg * 4);
6514 (void)RREG32(address);
6515 r = RREG32(data);
6516 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6517 return r;
6518}
6519
6520void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6521 u32 reg, u32 v)
6522{
6523 unsigned long flags, address, data;
6524
6525 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6526 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6527
6528 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6529 WREG32(address, reg * 4);
6530 (void)RREG32(address);
6531 WREG32(data, v);
6532 (void)RREG32(data);
6533 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6534}
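
/*
 * Hedged sketch (illustration only): the port rreg/wreg helpers above are
 * typically combined for a read-modify-write of a PCIe port register; the
 * wrapper name and the clear/set masks are placeholders.
 */
#if 0
static void amdgpu_example_pcie_port_rmw(struct amdgpu_device *adev,
					 u32 reg, u32 clear, u32 set)
{
	u32 v;

	v = amdgpu_device_pcie_port_rreg(adev, reg);
	v &= ~clear;
	v |= set;
	amdgpu_device_pcie_port_wreg(adev, reg, v);
}
#endif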
6535
6536/**
6537 * amdgpu_device_get_gang - return a reference to the current gang
6538 * @adev: amdgpu_device pointer
6539 *
6540 * Returns: A new reference to the current gang leader.
6541 */
6542struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
6543{
6544 struct dma_fence *fence;
6545
6546 rcu_read_lock();
6547 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
6548 rcu_read_unlock();
6549 return fence;
6550}
6551
6552/**
6553 * amdgpu_device_switch_gang - switch to a new gang
6554 * @adev: amdgpu_device pointer
6555 * @gang: the gang to switch to
6556 *
6557 * Try to switch to a new gang.
6558 * Returns: NULL if we switched to the new gang or a reference to the current
6559 * gang leader.
6560 */
6561struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6562 struct dma_fence *gang)
6563{
6564 struct dma_fence *old = NULL;
6565
6566 do {
6567 dma_fence_put(old);
6568 old = amdgpu_device_get_gang(adev);
6569 if (old == gang)
6570 break;
6571
6572 if (!dma_fence_is_signaled(old))
6573 return old;
6574
6575 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6576 old, gang) != old);
6577
6578 dma_fence_put(old);
6579 return NULL;
6580}
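
/*
 * Hedged sketch (illustration only): a hypothetical caller switching to a
 * new gang leader waits for the still-unsignaled previous leader returned
 * by amdgpu_device_switch_gang(), drops the reference, and retries.
 */
#if 0
static void amdgpu_example_switch_gang_sync(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old;

	while ((old = amdgpu_device_switch_gang(adev, gang))) {
		/* sketch: wait uninterruptibly and ignore wait errors */
		dma_fence_wait(old, false);
		dma_fence_put(old);
	}
}
#endif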
6581
6582bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6583{
6584 switch (adev->asic_type) {
6585#ifdef CONFIG_DRM_AMDGPU_SI
6586 case CHIP_HAINAN:
6587#endif
6588 case CHIP_TOPAZ:
6589 /* chips with no display hardware */
6590 return false;
6591#ifdef CONFIG_DRM_AMDGPU_SI
6592 case CHIP_TAHITI:
6593 case CHIP_PITCAIRN:
6594 case CHIP_VERDE:
6595 case CHIP_OLAND:
6596#endif
6597#ifdef CONFIG_DRM_AMDGPU_CIK
6598 case CHIP_BONAIRE:
6599 case CHIP_HAWAII:
6600 case CHIP_KAVERI:
6601 case CHIP_KABINI:
6602 case CHIP_MULLINS:
6603#endif
6604 case CHIP_TONGA:
6605 case CHIP_FIJI:
6606 case CHIP_POLARIS10:
6607 case CHIP_POLARIS11:
6608 case CHIP_POLARIS12:
6609 case CHIP_VEGAM:
6610 case CHIP_CARRIZO:
6611 case CHIP_STONEY:
6612 /* chips with display hardware */
6613 return true;
6614 default:
6615 /* IP discovery */
6616 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
6617 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6618 return false;
6619 return true;
6620 }
6621}
6622
6623uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6624 uint32_t inst, uint32_t reg_addr, char reg_name[],
6625 uint32_t expected_value, uint32_t mask)
6626{
6627 uint32_t ret = 0;
6628 uint32_t old_ = 0;
6629 uint32_t tmp_ = RREG32(reg_addr);
6630 uint32_t loop = adev->usec_timeout;
6631
6632 while ((tmp_ & (mask)) != (expected_value)) {
6633 if (old_ != tmp_) {
6634 loop = adev->usec_timeout;
6635 old_ = tmp_;
6636 } else
6637 udelay(1);
6638 tmp_ = RREG32(reg_addr);
6639 loop--;
6640 if (!loop) {
6641			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6642 inst, reg_name, (uint32_t)expected_value,
6643 (uint32_t)(tmp_ & (mask)));
6644 ret = -ETIMEDOUT;
6645 break;
6646 }
6647 }
6648 return ret;
6649}
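
/*
 * Hedged sketch (illustration only): waiting for a status bit to latch with
 * the helper above; the register offset, bit mask, and register name are
 * placeholders, not real amdgpu registers.
 */
#if 0
static int amdgpu_example_wait_idle(struct amdgpu_device *adev,
				    uint32_t status_reg)
{
	/* wait until bit 0 of the (hypothetical) status register reads 1 */
	return amdgpu_device_wait_on_rreg(adev, 0, status_reg,
					  "EXAMPLE_STATUS", 0x1, 0x1);
}
#endif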