1/*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28
29#include <linux/aperture.h>
30#include <linux/power_supply.h>
31#include <linux/kthread.h>
32#include <linux/module.h>
33#include <linux/console.h>
34#include <linux/slab.h>
35#include <linux/iommu.h>
36#include <linux/pci.h>
37#include <linux/pci-p2pdma.h>
38#include <linux/apple-gmux.h>
39
40#include <drm/drm_atomic_helper.h>
41#include <drm/drm_client_event.h>
42#include <drm/drm_crtc_helper.h>
43#include <drm/drm_probe_helper.h>
44#include <drm/amdgpu_drm.h>
45#include <linux/device.h>
46#include <linux/vgaarb.h>
47#include <linux/vga_switcheroo.h>
48#include <linux/efi.h>
49#include "amdgpu.h"
50#include "amdgpu_trace.h"
51#include "amdgpu_i2c.h"
52#include "atom.h"
53#include "amdgpu_atombios.h"
54#include "amdgpu_atomfirmware.h"
55#include "amd_pcie.h"
56#ifdef CONFIG_DRM_AMDGPU_SI
57#include "si.h"
58#endif
59#ifdef CONFIG_DRM_AMDGPU_CIK
60#include "cik.h"
61#endif
62#include "vi.h"
63#include "soc15.h"
64#include "nv.h"
65#include "bif/bif_4_1_d.h"
66#include <linux/firmware.h>
67#include "amdgpu_vf_error.h"
68
69#include "amdgpu_amdkfd.h"
70#include "amdgpu_pm.h"
71
72#include "amdgpu_xgmi.h"
73#include "amdgpu_ras.h"
74#include "amdgpu_pmu.h"
75#include "amdgpu_fru_eeprom.h"
76#include "amdgpu_reset.h"
77#include "amdgpu_virt.h"
78#include "amdgpu_dev_coredump.h"
79
80#include <linux/suspend.h>
81#include <drm/task_barrier.h>
82#include <linux/pm_runtime.h>
83
84#include <drm/drm_drv.h>
85
86#if IS_ENABLED(CONFIG_X86)
87#include <asm/intel-family.h>
88#endif
89
90MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
91MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
92MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
93MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
94MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
95MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
96MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
97
98#define AMDGPU_RESUME_MS 2000
99#define AMDGPU_MAX_RETRY_LIMIT 2
100#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
101#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
102#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
103#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
104
105static const struct drm_driver amdgpu_kms_driver;
106
107const char *amdgpu_asic_name[] = {
108 "TAHITI",
109 "PITCAIRN",
110 "VERDE",
111 "OLAND",
112 "HAINAN",
113 "BONAIRE",
114 "KAVERI",
115 "KABINI",
116 "HAWAII",
117 "MULLINS",
118 "TOPAZ",
119 "TONGA",
120 "FIJI",
121 "CARRIZO",
122 "STONEY",
123 "POLARIS10",
124 "POLARIS11",
125 "POLARIS12",
126 "VEGAM",
127 "VEGA10",
128 "VEGA12",
129 "VEGA20",
130 "RAVEN",
131 "ARCTURUS",
132 "RENOIR",
133 "ALDEBARAN",
134 "NAVI10",
135 "CYAN_SKILLFISH",
136 "NAVI14",
137 "NAVI12",
138 "SIENNA_CICHLID",
139 "NAVY_FLOUNDER",
140 "VANGOGH",
141 "DIMGREY_CAVEFISH",
142 "BEIGE_GOBY",
143 "YELLOW_CARP",
144 "IP DISCOVERY",
145 "LAST",
146};
147
148#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMDGPU_MAX_IP_NUM - 1, 0)
149/*
150 * Default init level where all blocks are expected to be initialized. This is
151 * the level of initialization expected by default and also after a full reset
152 * of the device.
153 */
154struct amdgpu_init_level amdgpu_init_default = {
155 .level = AMDGPU_INIT_LEVEL_DEFAULT,
156 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
157};
158
159struct amdgpu_init_level amdgpu_init_recovery = {
160 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
161 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
162};
163
164/*
165 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
166 * is used for cases like reset on initialization where the entire hive needs to
167 * be reset before first use.
168 */
169struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
170 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
171 .hwini_ip_block_mask =
172 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
173 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
174 BIT(AMD_IP_BLOCK_TYPE_PSP)
175};
176
177static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
178 enum amd_ip_block_type block)
179{
180 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
181}
182
183void amdgpu_set_init_level(struct amdgpu_device *adev,
184 enum amdgpu_init_lvl_id lvl)
185{
186 switch (lvl) {
187 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
188 adev->init_lvl = &amdgpu_init_minimal_xgmi;
189 break;
190 case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
191 adev->init_lvl = &amdgpu_init_recovery;
192 break;
193 case AMDGPU_INIT_LEVEL_DEFAULT:
194 fallthrough;
195 default:
196 adev->init_lvl = &amdgpu_init_default;
197 break;
198 }
199}
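/*
 * Illustrative sketch (not driver code) of how the selected level gates the
 * hardware init of individual IP blocks; the blocks picked here are arbitrary:
 *
 *   amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
 *
 *   amdgpu_ip_member_of_hwini(adev, AMD_IP_BLOCK_TYPE_PSP)  -> true
 *   amdgpu_ip_member_of_hwini(adev, AMD_IP_BLOCK_TYPE_GFX)  -> false
 *
 * PSP is part of the minimal XGMI mask, so its hardware init still runs at
 * this level; GFX is not, so its hardware init is skipped until the default
 * level is restored.
 */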
200
201static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
202
203/**
204 * DOC: pcie_replay_count
205 *
206 * The amdgpu driver provides a sysfs API for reporting the total number
207 * of PCIe replays (NAKs).
208 * The file pcie_replay_count is used for this and returns the total
209 * number of replays as a sum of the NAKs generated and NAKs received.
210 */
211
212static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
213 struct device_attribute *attr, char *buf)
214{
215 struct drm_device *ddev = dev_get_drvdata(dev);
216 struct amdgpu_device *adev = drm_to_adev(ddev);
217 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
218
219 return sysfs_emit(buf, "%llu\n", cnt);
220}
221
222static DEVICE_ATTR(pcie_replay_count, 0444,
223 amdgpu_device_get_pcie_replay_count, NULL);
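/*
 * Userspace usage sketch: the attribute shows up in the device's sysfs
 * directory, e.g. /sys/class/drm/card0/device/pcie_replay_count (the card
 * index and therefore the exact path are assumptions for illustration only):
 *
 *   #include <stdio.h>
 *
 *   int main(void)
 *   {
 *           unsigned long long replays = 0;
 *           FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *
 *           if (!f)
 *                   return 1;
 *           if (fscanf(f, "%llu", &replays) == 1)
 *                   printf("PCIe replays: %llu\n", replays);
 *           fclose(f);
 *           return 0;
 *   }
 */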
224
225static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
226 struct bin_attribute *attr, char *buf,
227 loff_t ppos, size_t count)
228{
229 struct device *dev = kobj_to_dev(kobj);
230 struct drm_device *ddev = dev_get_drvdata(dev);
231 struct amdgpu_device *adev = drm_to_adev(ddev);
232 ssize_t bytes_read;
233
234 switch (ppos) {
235 case AMDGPU_SYS_REG_STATE_XGMI:
236 bytes_read = amdgpu_asic_get_reg_state(
237 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
238 break;
239 case AMDGPU_SYS_REG_STATE_WAFL:
240 bytes_read = amdgpu_asic_get_reg_state(
241 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
242 break;
243 case AMDGPU_SYS_REG_STATE_PCIE:
244 bytes_read = amdgpu_asic_get_reg_state(
245 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
246 break;
247 case AMDGPU_SYS_REG_STATE_USR:
248 bytes_read = amdgpu_asic_get_reg_state(
249 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
250 break;
251 case AMDGPU_SYS_REG_STATE_USR_1:
252 bytes_read = amdgpu_asic_get_reg_state(
253 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
254 break;
255 default:
256 return -EINVAL;
257 }
258
259 return bytes_read;
260}
261
262BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
263 AMDGPU_SYS_REG_STATE_END);
264
265int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
266{
267 int ret;
268
269 if (!amdgpu_asic_get_reg_state_supported(adev))
270 return 0;
271
272 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
273
274 return ret;
275}
276
277void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
278{
279 if (!amdgpu_asic_get_reg_state_supported(adev))
280 return;
281 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
282}
283
284int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
285{
286 int r;
287
288 if (ip_block->version->funcs->suspend) {
289 r = ip_block->version->funcs->suspend(ip_block);
290 if (r) {
291 dev_err(ip_block->adev->dev,
292 "suspend of IP block <%s> failed %d\n",
293 ip_block->version->funcs->name, r);
294 return r;
295 }
296 }
297
298 ip_block->status.hw = false;
299 return 0;
300}
301
302int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
303{
304 int r;
305
306 if (ip_block->version->funcs->resume) {
307 r = ip_block->version->funcs->resume(ip_block);
308 if (r) {
309 dev_err(ip_block->adev->dev,
310 "resume of IP block <%s> failed %d\n",
311 ip_block->version->funcs->name, r);
312 return r;
313 }
314 }
315
316 ip_block->status.hw = true;
317 return 0;
318}
319
320/**
321 * DOC: board_info
322 *
323 * The amdgpu driver provides a sysfs API for giving board related information.
324 * It provides the form factor information in the format
325 *
326 * type : form factor
327 *
328 * Possible form factor values
329 *
330 * - "cem" - PCIE CEM card
331 * - "oam" - Open Compute Accelerator Module
332 * - "unknown" - Not known
333 *
334 */
335
336static ssize_t amdgpu_device_get_board_info(struct device *dev,
337 struct device_attribute *attr,
338 char *buf)
339{
340 struct drm_device *ddev = dev_get_drvdata(dev);
341 struct amdgpu_device *adev = drm_to_adev(ddev);
342 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
343 const char *pkg;
344
345 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
346 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
347
348 switch (pkg_type) {
349 case AMDGPU_PKG_TYPE_CEM:
350 pkg = "cem";
351 break;
352 case AMDGPU_PKG_TYPE_OAM:
353 pkg = "oam";
354 break;
355 default:
356 pkg = "unknown";
357 break;
358 }
359
360 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
361}
362
363static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
364
365static struct attribute *amdgpu_board_attrs[] = {
366 &dev_attr_board_info.attr,
367 NULL,
368};
369
370static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
371 struct attribute *attr, int n)
372{
373 struct device *dev = kobj_to_dev(kobj);
374 struct drm_device *ddev = dev_get_drvdata(dev);
375 struct amdgpu_device *adev = drm_to_adev(ddev);
376
377 if (adev->flags & AMD_IS_APU)
378 return 0;
379
380 return attr->mode;
381}
382
383static const struct attribute_group amdgpu_board_attrs_group = {
384 .attrs = amdgpu_board_attrs,
385 .is_visible = amdgpu_board_attrs_is_visible
386};
387
388static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
389
390
391/**
392 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
393 *
394 * @dev: drm_device pointer
395 *
396 * Returns true if the device is a dGPU with ATPX power control,
397 * otherwise return false.
398 */
399bool amdgpu_device_supports_px(struct drm_device *dev)
400{
401 struct amdgpu_device *adev = drm_to_adev(dev);
402
403 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
404 return true;
405 return false;
406}
407
408/**
409 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
410 *
411 * @dev: drm_device pointer
412 *
413 * Returns true if the device is a dGPU with ACPI power control,
414 * otherwise return false.
415 */
416bool amdgpu_device_supports_boco(struct drm_device *dev)
417{
418 struct amdgpu_device *adev = drm_to_adev(dev);
419
420 if (adev->has_pr3 ||
421 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
422 return true;
423 return false;
424}
425
426/**
427 * amdgpu_device_supports_baco - Does the device support BACO
428 *
429 * @dev: drm_device pointer
430 *
431 * Return:
432 * 1 if the device supports BACO;
433 * 3 if the device supports MACO (only works if BACO is supported);
434 * otherwise return 0.
435 */
436int amdgpu_device_supports_baco(struct drm_device *dev)
437{
438 struct amdgpu_device *adev = drm_to_adev(dev);
439
440 return amdgpu_asic_supports_baco(adev);
441}
442
443void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
444{
445 struct drm_device *dev;
446 int bamaco_support;
447
448 dev = adev_to_drm(adev);
449
450 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
451 bamaco_support = amdgpu_device_supports_baco(dev);
452
453 switch (amdgpu_runtime_pm) {
454 case 2:
455 if (bamaco_support & MACO_SUPPORT) {
456 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
457 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
458 } else if (bamaco_support == BACO_SUPPORT) {
459 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
460 dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
461 }
462 break;
463 case 1:
464 if (bamaco_support & BACO_SUPPORT) {
465 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
466 dev_info(adev->dev, "Forcing BACO for runtime pm\n");
467 }
468 break;
469 case -1:
470 case -2:
471 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
472 adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
473 dev_info(adev->dev, "Using ATPX for runtime pm\n");
474 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
475 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
476 dev_info(adev->dev, "Using BOCO for runtime pm\n");
477 } else {
478 if (!bamaco_support)
479 goto no_runtime_pm;
480
481 switch (adev->asic_type) {
482 case CHIP_VEGA20:
483 case CHIP_ARCTURUS:
484 /* BACO is not supported on vega20 and arcturus */
485 break;
486 case CHIP_VEGA10:
487 /* enable BACO as runpm mode if noretry=0 */
488 if (!adev->gmc.noretry)
489 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
490 break;
491 default:
492 /* enable BACO as runpm mode on CI+ */
493 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
494 break;
495 }
496
497 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
498 if (bamaco_support & MACO_SUPPORT) {
499 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
500 dev_info(adev->dev, "Using BAMACO for runtime pm\n");
501 } else {
502 dev_info(adev->dev, "Using BACO for runtime pm\n");
503 }
504 }
505 }
506 break;
507 case 0:
508 dev_info(adev->dev, "runtime pm is manually disabled\n");
509 break;
510 default:
511 break;
512 }
513
514no_runtime_pm:
515 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
516 dev_info(adev->dev, "Runtime PM not available\n");
517}
518/**
519 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
520 * Smart Shift support
521 *
522 * @dev: drm_device pointer
523 *
524 * Returns true if the device is a dGPU with Smart Shift support,
525 * otherwise returns false.
526 */
527bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
528{
529 return (amdgpu_device_supports_boco(dev) &&
530 amdgpu_acpi_is_power_shift_control_supported());
531}
532
533/*
534 * VRAM access helper functions
535 */
536
537/**
538 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
539 *
540 * @adev: amdgpu_device pointer
541 * @pos: offset of the buffer in vram
542 * @buf: virtual address of the buffer in system memory
543 * @size: read/write size; the buffer at @buf must be at least @size bytes
544 * @write: true - write to vram, otherwise - read from vram
545 */
546void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
547 void *buf, size_t size, bool write)
548{
549 unsigned long flags;
550 uint32_t hi = ~0, tmp = 0;
551 uint32_t *data = buf;
552 uint64_t last;
553 int idx;
554
555 if (!drm_dev_enter(adev_to_drm(adev), &idx))
556 return;
557
558 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
559
560 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
561 for (last = pos + size; pos < last; pos += 4) {
562 tmp = pos >> 31;
563
564 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
565 if (tmp != hi) {
566 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
567 hi = tmp;
568 }
569 if (write)
570 WREG32_NO_KIQ(mmMM_DATA, *data++);
571 else
572 *data++ = RREG32_NO_KIQ(mmMM_DATA);
573 }
574
575 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
576 drm_dev_exit(idx);
577}
578
579/**
580 * amdgpu_device_aper_access - access vram by the vram aperture
581 *
582 * @adev: amdgpu_device pointer
583 * @pos: offset of the buffer in vram
584 * @buf: virtual address of the buffer in system memory
585 * @size: read/write size; the buffer at @buf must be at least @size bytes
586 * @write: true - write to vram, otherwise - read from vram
587 *
588 * Returns the number of bytes transferred.
589 */
590size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
591 void *buf, size_t size, bool write)
592{
593#ifdef CONFIG_64BIT
594 void __iomem *addr;
595 size_t count = 0;
596 uint64_t last;
597
598 if (!adev->mman.aper_base_kaddr)
599 return 0;
600
601 last = min(pos + size, adev->gmc.visible_vram_size);
602 if (last > pos) {
603 addr = adev->mman.aper_base_kaddr + pos;
604 count = last - pos;
605
606 if (write) {
607 memcpy_toio(addr, buf, count);
608 /* Make sure HDP write cache flush happens without any reordering
609 * after the system memory contents are sent to the device over PCIe
610 */
611 mb();
612 amdgpu_device_flush_hdp(adev, NULL);
613 } else {
614 amdgpu_device_invalidate_hdp(adev, NULL);
615 /* Make sure HDP read cache is invalidated before issuing a read
616 * to the PCIe device
617 */
618 mb();
619 memcpy_fromio(buf, addr, count);
620 }
621
622 }
623
624 return count;
625#else
626 return 0;
627#endif
628}
629
630/**
631 * amdgpu_device_vram_access - read/write a buffer in vram
632 *
633 * @adev: amdgpu_device pointer
634 * @pos: offset of the buffer in vram
635 * @buf: virtual address of the buffer in system memory
636 * @size: read/write size; the buffer at @buf must be at least @size bytes
637 * @write: true - write to vram, otherwise - read from vram
638 */
639void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
640 void *buf, size_t size, bool write)
641{
642 size_t count;
643
644 /* try using the vram aperture to access vram first */
645 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
646 size -= count;
647 if (size) {
648 /* use MM access for the rest of vram */
649 pos += count;
650 buf += count;
651 amdgpu_device_mm_access(adev, pos, buf, size, write);
652 }
653}
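/*
 * Usage sketch: read a small, dword-aligned region of VRAM into a stack
 * buffer through the helper above. The offset and size are arbitrary
 * example values:
 *
 *   u32 hdr[4];
 *
 *   amdgpu_device_vram_access(adev, 0x1000, hdr, sizeof(hdr), false);
 */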
654
655/*
656 * register access helper functions.
657 */
658
659/* Check if hw access should be skipped because of hotplug or device error */
660bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
661{
662 if (adev->no_hw_access)
663 return true;
664
665#ifdef CONFIG_LOCKDEP
666 /*
667 * This is a bit complicated to understand, so worth a comment. What we assert
668 * here is that the GPU reset is not running on another thread in parallel.
669 *
670 * For this we trylock the read side of the reset semaphore, if that succeeds
671 * we know that the reset is not running in parallel.
672 *
673 * If the trylock fails we assert that we are either already holding the read
674 * side of the lock or are the reset thread itself and hold the write side of
675 * the lock.
676 */
677 if (in_task()) {
678 if (down_read_trylock(&adev->reset_domain->sem))
679 up_read(&adev->reset_domain->sem);
680 else
681 lockdep_assert_held(&adev->reset_domain->sem);
682 }
683#endif
684 return false;
685}
686
687/**
688 * amdgpu_device_rreg - read a memory mapped IO or indirect register
689 *
690 * @adev: amdgpu_device pointer
691 * @reg: dword aligned register offset
692 * @acc_flags: access flags which require special behavior
693 *
694 * Returns the 32 bit value from the offset specified.
695 */
696uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
697 uint32_t reg, uint32_t acc_flags)
698{
699 uint32_t ret;
700
701 if (amdgpu_device_skip_hw_access(adev))
702 return 0;
703
704 if ((reg * 4) < adev->rmmio_size) {
705 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
706 amdgpu_sriov_runtime(adev) &&
707 down_read_trylock(&adev->reset_domain->sem)) {
708 ret = amdgpu_kiq_rreg(adev, reg, 0);
709 up_read(&adev->reset_domain->sem);
710 } else {
711 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
712 }
713 } else {
714 ret = adev->pcie_rreg(adev, reg * 4);
715 }
716
717 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
718
719 return ret;
720}
721
722/*
723 * MMIO register read with byte offset helper functions
724 * @offset: byte offset from MMIO start
725 */
726
727/**
728 * amdgpu_mm_rreg8 - read a memory mapped IO register
729 *
730 * @adev: amdgpu_device pointer
731 * @offset: byte aligned register offset
732 *
733 * Returns the 8 bit value from the offset specified.
734 */
735uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
736{
737 if (amdgpu_device_skip_hw_access(adev))
738 return 0;
739
740 if (offset < adev->rmmio_size)
741 return (readb(adev->rmmio + offset));
742 BUG();
743}
744
745
746/**
747 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
748 *
749 * @adev: amdgpu_device pointer
750 * @reg: dword aligned register offset
751 * @acc_flags: access flags which require special behavior
752 * @xcc_id: xcc accelerated compute core id
753 *
754 * Returns the 32 bit value from the offset specified.
755 */
756uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
757 uint32_t reg, uint32_t acc_flags,
758 uint32_t xcc_id)
759{
760 uint32_t ret, rlcg_flag;
761
762 if (amdgpu_device_skip_hw_access(adev))
763 return 0;
764
765 if ((reg * 4) < adev->rmmio_size) {
766 if (amdgpu_sriov_vf(adev) &&
767 !amdgpu_sriov_runtime(adev) &&
768 adev->gfx.rlc.rlcg_reg_access_supported &&
769 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
770 GC_HWIP, false,
771 &rlcg_flag)) {
772 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
773 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
774 amdgpu_sriov_runtime(adev) &&
775 down_read_trylock(&adev->reset_domain->sem)) {
776 ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
777 up_read(&adev->reset_domain->sem);
778 } else {
779 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
780 }
781 } else {
782 ret = adev->pcie_rreg(adev, reg * 4);
783 }
784
785 return ret;
786}
787
788/*
789 * MMIO register write with byte offset helper functions
790 * @offset: byte offset from MMIO start
791 * @value: the value to be written to the register
792 */
793
794/**
795 * amdgpu_mm_wreg8 - write a memory mapped IO register
796 *
797 * @adev: amdgpu_device pointer
798 * @offset: byte aligned register offset
799 * @value: 8 bit value to write
800 *
801 * Writes the value specified to the offset specified.
802 */
803void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
804{
805 if (amdgpu_device_skip_hw_access(adev))
806 return;
807
808 if (offset < adev->rmmio_size)
809 writeb(value, adev->rmmio + offset);
810 else
811 BUG();
812}
813
814/**
815 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
816 *
817 * @adev: amdgpu_device pointer
818 * @reg: dword aligned register offset
819 * @v: 32 bit value to write to the register
820 * @acc_flags: access flags which require special behavior
821 *
822 * Writes the value specified to the offset specified.
823 */
824void amdgpu_device_wreg(struct amdgpu_device *adev,
825 uint32_t reg, uint32_t v,
826 uint32_t acc_flags)
827{
828 if (amdgpu_device_skip_hw_access(adev))
829 return;
830
831 if ((reg * 4) < adev->rmmio_size) {
832 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
833 amdgpu_sriov_runtime(adev) &&
834 down_read_trylock(&adev->reset_domain->sem)) {
835 amdgpu_kiq_wreg(adev, reg, v, 0);
836 up_read(&adev->reset_domain->sem);
837 } else {
838 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
839 }
840 } else {
841 adev->pcie_wreg(adev, reg * 4, v);
842 }
843
844 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
845}
846
847/**
848 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
849 *
850 * @adev: amdgpu_device pointer
851 * @reg: mmio/rlc register
852 * @v: value to write
853 * @xcc_id: xcc accelerated compute core id
854 *
855 * This function is invoked only for debugfs register access.
856 */
857void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
858 uint32_t reg, uint32_t v,
859 uint32_t xcc_id)
860{
861 if (amdgpu_device_skip_hw_access(adev))
862 return;
863
864 if (amdgpu_sriov_fullaccess(adev) &&
865 adev->gfx.rlc.funcs &&
866 adev->gfx.rlc.funcs->is_rlcg_access_range) {
867 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
868 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
869 } else if ((reg * 4) >= adev->rmmio_size) {
870 adev->pcie_wreg(adev, reg * 4, v);
871 } else {
872 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
873 }
874}
875
876/**
877 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
878 *
879 * @adev: amdgpu_device pointer
880 * @reg: dword aligned register offset
881 * @v: 32 bit value to write to the register
882 * @acc_flags: access flags which require special behavior
883 * @xcc_id: xcc accelerated compute core id
884 *
885 * Writes the value specified to the offset specified.
886 */
887void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
888 uint32_t reg, uint32_t v,
889 uint32_t acc_flags, uint32_t xcc_id)
890{
891 uint32_t rlcg_flag;
892
893 if (amdgpu_device_skip_hw_access(adev))
894 return;
895
896 if ((reg * 4) < adev->rmmio_size) {
897 if (amdgpu_sriov_vf(adev) &&
898 !amdgpu_sriov_runtime(adev) &&
899 adev->gfx.rlc.rlcg_reg_access_supported &&
900 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
901 GC_HWIP, true,
902 &rlcg_flag)) {
903 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
904 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
905 amdgpu_sriov_runtime(adev) &&
906 down_read_trylock(&adev->reset_domain->sem)) {
907 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
908 up_read(&adev->reset_domain->sem);
909 } else {
910 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
911 }
912 } else {
913 adev->pcie_wreg(adev, reg * 4, v);
914 }
915}
916
917/**
918 * amdgpu_device_indirect_rreg - read an indirect register
919 *
920 * @adev: amdgpu_device pointer
921 * @reg_addr: indirect register address to read from
922 *
923 * Returns the value of indirect register @reg_addr
924 */
925u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
926 u32 reg_addr)
927{
928 unsigned long flags, pcie_index, pcie_data;
929 void __iomem *pcie_index_offset;
930 void __iomem *pcie_data_offset;
931 u32 r;
932
933 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
934 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
935
936 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
937 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
938 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
939
940 writel(reg_addr, pcie_index_offset);
941 readl(pcie_index_offset);
942 r = readl(pcie_data_offset);
943 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
944
945 return r;
946}
947
948u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
949 u64 reg_addr)
950{
951 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
952 u32 r;
953 void __iomem *pcie_index_offset;
954 void __iomem *pcie_index_hi_offset;
955 void __iomem *pcie_data_offset;
956
957 if (unlikely(!adev->nbio.funcs)) {
958 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
959 pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
960 } else {
961 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
962 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
963 }
964
965 if (reg_addr >> 32) {
966 if (unlikely(!adev->nbio.funcs))
967 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
968 else
969 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
970 } else {
971 pcie_index_hi = 0;
972 }
973
974 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
975 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
976 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
977 if (pcie_index_hi != 0)
978 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
979 pcie_index_hi * 4;
980
981 writel(reg_addr, pcie_index_offset);
982 readl(pcie_index_offset);
983 if (pcie_index_hi != 0) {
984 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
985 readl(pcie_index_hi_offset);
986 }
987 r = readl(pcie_data_offset);
988
989 /* clear the high bits */
990 if (pcie_index_hi != 0) {
991 writel(0, pcie_index_hi_offset);
992 readl(pcie_index_hi_offset);
993 }
994
995 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
996
997 return r;
998}
999
1000/**
1001 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
1002 *
1003 * @adev: amdgpu_device pointer
1004 * @reg_addr: indirect register address to read from
1005 *
1006 * Returns the value of indirect register @reg_addr
1007 */
1008u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
1009 u32 reg_addr)
1010{
1011 unsigned long flags, pcie_index, pcie_data;
1012 void __iomem *pcie_index_offset;
1013 void __iomem *pcie_data_offset;
1014 u64 r;
1015
1016 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1017 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1018
1019 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1020 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1021 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1022
1023 /* read low 32 bits */
1024 writel(reg_addr, pcie_index_offset);
1025 readl(pcie_index_offset);
1026 r = readl(pcie_data_offset);
1027 /* read high 32 bits */
1028 writel(reg_addr + 4, pcie_index_offset);
1029 readl(pcie_index_offset);
1030 r |= ((u64)readl(pcie_data_offset) << 32);
1031 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1032
1033 return r;
1034}
1035
1036u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
1037 u64 reg_addr)
1038{
1039 unsigned long flags, pcie_index, pcie_data;
1040 unsigned long pcie_index_hi = 0;
1041 void __iomem *pcie_index_offset;
1042 void __iomem *pcie_index_hi_offset;
1043 void __iomem *pcie_data_offset;
1044 u64 r;
1045
1046 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1047 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1048 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1049 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1050
1051 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1052 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1053 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1054 if (pcie_index_hi != 0)
1055 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1056 pcie_index_hi * 4;
1057
1058 /* read low 32 bits */
1059 writel(reg_addr, pcie_index_offset);
1060 readl(pcie_index_offset);
1061 if (pcie_index_hi != 0) {
1062 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1063 readl(pcie_index_hi_offset);
1064 }
1065 r = readl(pcie_data_offset);
1066 /* read high 32 bits */
1067 writel(reg_addr + 4, pcie_index_offset);
1068 readl(pcie_index_offset);
1069 if (pcie_index_hi != 0) {
1070 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1071 readl(pcie_index_hi_offset);
1072 }
1073 r |= ((u64)readl(pcie_data_offset) << 32);
1074
1075 /* clear the high bits */
1076 if (pcie_index_hi != 0) {
1077 writel(0, pcie_index_hi_offset);
1078 readl(pcie_index_hi_offset);
1079 }
1080
1081 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1082
1083 return r;
1084}
1085
1086/**
1087 * amdgpu_device_indirect_wreg - write to an indirect register
1088 *
1089 * @adev: amdgpu_device pointer
1090 * @reg_addr: indirect register offset
1091 * @reg_data: indirect register data
1092 *
1093 */
1094void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1095 u32 reg_addr, u32 reg_data)
1096{
1097 unsigned long flags, pcie_index, pcie_data;
1098 void __iomem *pcie_index_offset;
1099 void __iomem *pcie_data_offset;
1100
1101 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1102 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1103
1104 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1105 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1106 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1107
1108 writel(reg_addr, pcie_index_offset);
1109 readl(pcie_index_offset);
1110 writel(reg_data, pcie_data_offset);
1111 readl(pcie_data_offset);
1112 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1113}
1114
1115void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1116 u64 reg_addr, u32 reg_data)
1117{
1118 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1119 void __iomem *pcie_index_offset;
1120 void __iomem *pcie_index_hi_offset;
1121 void __iomem *pcie_data_offset;
1122
1123 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1124 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1125 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1126 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1127 else
1128 pcie_index_hi = 0;
1129
1130 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1131 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1132 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1133 if (pcie_index_hi != 0)
1134 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1135 pcie_index_hi * 4;
1136
1137 writel(reg_addr, pcie_index_offset);
1138 readl(pcie_index_offset);
1139 if (pcie_index_hi != 0) {
1140 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1141 readl(pcie_index_hi_offset);
1142 }
1143 writel(reg_data, pcie_data_offset);
1144 readl(pcie_data_offset);
1145
1146 /* clear the high bits */
1147 if (pcie_index_hi != 0) {
1148 writel(0, pcie_index_hi_offset);
1149 readl(pcie_index_hi_offset);
1150 }
1151
1152 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1153}
1154
1155/**
1156 * amdgpu_device_indirect_wreg64 - write to a 64 bit indirect register
1157 *
1158 * @adev: amdgpu_device pointer
1159 * @reg_addr: indirect register offset
1160 * @reg_data: indirect register data
1161 *
1162 */
1163void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1164 u32 reg_addr, u64 reg_data)
1165{
1166 unsigned long flags, pcie_index, pcie_data;
1167 void __iomem *pcie_index_offset;
1168 void __iomem *pcie_data_offset;
1169
1170 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1171 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1172
1173 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1174 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1175 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1176
1177 /* write low 32 bits */
1178 writel(reg_addr, pcie_index_offset);
1179 readl(pcie_index_offset);
1180 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1181 readl(pcie_data_offset);
1182 /* write high 32 bits */
1183 writel(reg_addr + 4, pcie_index_offset);
1184 readl(pcie_index_offset);
1185 writel((u32)(reg_data >> 32), pcie_data_offset);
1186 readl(pcie_data_offset);
1187 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1188}
1189
1190void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1191 u64 reg_addr, u64 reg_data)
1192{
1193 unsigned long flags, pcie_index, pcie_data;
1194 unsigned long pcie_index_hi = 0;
1195 void __iomem *pcie_index_offset;
1196 void __iomem *pcie_index_hi_offset;
1197 void __iomem *pcie_data_offset;
1198
1199 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1200 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1201 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1202 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1203
1204 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1205 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1206 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1207 if (pcie_index_hi != 0)
1208 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1209 pcie_index_hi * 4;
1210
1211 /* write low 32 bits */
1212 writel(reg_addr, pcie_index_offset);
1213 readl(pcie_index_offset);
1214 if (pcie_index_hi != 0) {
1215 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1216 readl(pcie_index_hi_offset);
1217 }
1218 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1219 readl(pcie_data_offset);
1220 /* write high 32 bits */
1221 writel(reg_addr + 4, pcie_index_offset);
1222 readl(pcie_index_offset);
1223 if (pcie_index_hi != 0) {
1224 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1225 readl(pcie_index_hi_offset);
1226 }
1227 writel((u32)(reg_data >> 32), pcie_data_offset);
1228 readl(pcie_data_offset);
1229
1230 /* clear the high bits */
1231 if (pcie_index_hi != 0) {
1232 writel(0, pcie_index_hi_offset);
1233 readl(pcie_index_hi_offset);
1234 }
1235
1236 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1237}
1238
1239/**
1240 * amdgpu_device_get_rev_id - query device rev_id
1241 *
1242 * @adev: amdgpu_device pointer
1243 *
1244 * Return device rev_id
1245 */
1246u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1247{
1248 return adev->nbio.funcs->get_rev_id(adev);
1249}
1250
1251/**
1252 * amdgpu_invalid_rreg - dummy reg read function
1253 *
1254 * @adev: amdgpu_device pointer
1255 * @reg: offset of register
1256 *
1257 * Dummy register read function. Used for register blocks
1258 * that certain asics don't have (all asics).
1259 * Returns the value in the register.
1260 */
1261static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1262{
1263 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1264 BUG();
1265 return 0;
1266}
1267
1268static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1269{
1270 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1271 BUG();
1272 return 0;
1273}
1274
1275/**
1276 * amdgpu_invalid_wreg - dummy reg write function
1277 *
1278 * @adev: amdgpu_device pointer
1279 * @reg: offset of register
1280 * @v: value to write to the register
1281 *
1282 * Dummy register write function. Used for register blocks
1283 * that certain asics don't have (all asics).
1284 */
1285static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1286{
1287 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1288 reg, v);
1289 BUG();
1290}
1291
1292static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1293{
1294 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
1295 reg, v);
1296 BUG();
1297}
1298
1299/**
1300 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1301 *
1302 * @adev: amdgpu_device pointer
1303 * @reg: offset of register
1304 *
1305 * Dummy register read function. Used for register blocks
1306 * that certain asics don't have (all asics).
1307 * Returns the value in the register.
1308 */
1309static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1310{
1311 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1312 BUG();
1313 return 0;
1314}
1315
1316static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1317{
1318 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1319 BUG();
1320 return 0;
1321}
1322
1323/**
1324 * amdgpu_invalid_wreg64 - dummy reg write function
1325 *
1326 * @adev: amdgpu_device pointer
1327 * @reg: offset of register
1328 * @v: value to write to the register
1329 *
1330 * Dummy register write function. Used for register blocks
1331 * that certain asics don't have (all asics).
1332 */
1333static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1334{
1335 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1336 reg, v);
1337 BUG();
1338}
1339
1340static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1341{
1342 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1343 reg, v);
1344 BUG();
1345}
1346
1347/**
1348 * amdgpu_block_invalid_rreg - dummy reg read function
1349 *
1350 * @adev: amdgpu_device pointer
1351 * @block: offset of instance
1352 * @reg: offset of register
1353 *
1354 * Dummy register read function. Used for register blocks
1355 * that certain asics don't have (all asics).
1356 * Returns the value in the register.
1357 */
1358static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1359 uint32_t block, uint32_t reg)
1360{
1361 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1362 reg, block);
1363 BUG();
1364 return 0;
1365}
1366
1367/**
1368 * amdgpu_block_invalid_wreg - dummy reg write function
1369 *
1370 * @adev: amdgpu_device pointer
1371 * @block: offset of instance
1372 * @reg: offset of register
1373 * @v: value to write to the register
1374 *
1375 * Dummy register write function. Used for register blocks
1376 * that certain asics don't have (all asics).
1377 */
1378static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1379 uint32_t block,
1380 uint32_t reg, uint32_t v)
1381{
1382 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1383 reg, block, v);
1384 BUG();
1385}
1386
1387/**
1388 * amdgpu_device_asic_init - Wrapper for atom asic_init
1389 *
1390 * @adev: amdgpu_device pointer
1391 *
1392 * Does any asic specific work and then calls atom asic init.
1393 */
1394static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1395{
1396 int ret;
1397
1398 amdgpu_asic_pre_asic_init(adev);
1399
1400 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1401 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1402 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1403 amdgpu_psp_wait_for_bootloader(adev);
1404 ret = amdgpu_atomfirmware_asic_init(adev, true);
1405 return ret;
1406 } else {
1407 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1408 }
1409
1410 return 0;
1411}
1412
1413/**
1414 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1415 *
1416 * @adev: amdgpu_device pointer
1417 *
1418 * Allocates a scratch page of VRAM for use by various things in the
1419 * driver.
1420 */
1421static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1422{
1423 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1424 AMDGPU_GEM_DOMAIN_VRAM |
1425 AMDGPU_GEM_DOMAIN_GTT,
1426 &adev->mem_scratch.robj,
1427 &adev->mem_scratch.gpu_addr,
1428 (void **)&adev->mem_scratch.ptr);
1429}
1430
1431/**
1432 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1433 *
1434 * @adev: amdgpu_device pointer
1435 *
1436 * Frees the VRAM scratch page.
1437 */
1438static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1439{
1440 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1441}
1442
1443/**
1444 * amdgpu_device_program_register_sequence - program an array of registers.
1445 *
1446 * @adev: amdgpu_device pointer
1447 * @registers: pointer to the register array
1448 * @array_size: size of the register array
1449 *
1450 * Programs an array of registers with AND/OR masks.
1451 * This is a helper for setting golden registers.
1452 */
1453void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1454 const u32 *registers,
1455 const u32 array_size)
1456{
1457 u32 tmp, reg, and_mask, or_mask;
1458 int i;
1459
1460 if (array_size % 3)
1461 return;
1462
1463 for (i = 0; i < array_size; i += 3) {
1464 reg = registers[i + 0];
1465 and_mask = registers[i + 1];
1466 or_mask = registers[i + 2];
1467
1468 if (and_mask == 0xffffffff) {
1469 tmp = or_mask;
1470 } else {
1471 tmp = RREG32(reg);
1472 tmp &= ~and_mask;
1473 if (adev->family >= AMDGPU_FAMILY_AI)
1474 tmp |= (or_mask & and_mask);
1475 else
1476 tmp |= or_mask;
1477 }
1478 WREG32(reg, tmp);
1479 }
1480}
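/*
 * Usage sketch with a hypothetical golden register table; the offsets and
 * masks are illustrative only. Each entry is a {reg, and_mask, or_mask}
 * triple:
 *
 *   static const u32 example_golden_settings[] = {
 *           0x1234, 0xffffffff, 0x00000001,
 *           0x5678, 0x0000ff00, 0x00002100,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                           ARRAY_SIZE(example_golden_settings));
 *
 * The first entry overwrites the register completely (and_mask of all ones);
 * the second clears bits 8..15 and then ORs in the new field value.
 */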
1481
1482/**
1483 * amdgpu_device_pci_config_reset - reset the GPU
1484 *
1485 * @adev: amdgpu_device pointer
1486 *
1487 * Resets the GPU using the pci config reset sequence.
1488 * Only applicable to asics prior to vega10.
1489 */
1490void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1491{
1492 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1493}
1494
1495/**
1496 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1497 *
1498 * @adev: amdgpu_device pointer
1499 *
1500 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1501 */
1502int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1503{
1504 return pci_reset_function(adev->pdev);
1505}
1506
1507/*
1508 * amdgpu_device_wb_*()
1509 * Writeback is the method by which the GPU updates special pages in memory
1510 * with the status of certain GPU events (fences, ring pointers, etc.).
1511 */
1512
1513/**
1514 * amdgpu_device_wb_fini - Disable Writeback and free memory
1515 *
1516 * @adev: amdgpu_device pointer
1517 *
1518 * Disables Writeback and frees the Writeback memory (all asics).
1519 * Used at driver shutdown.
1520 */
1521static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1522{
1523 if (adev->wb.wb_obj) {
1524 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1525 &adev->wb.gpu_addr,
1526 (void **)&adev->wb.wb);
1527 adev->wb.wb_obj = NULL;
1528 }
1529}
1530
1531/**
1532 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1533 *
1534 * @adev: amdgpu_device pointer
1535 *
1536 * Initializes writeback and allocates writeback memory (all asics).
1537 * Used at driver startup.
1538 * Returns 0 on success or a negative error code on failure.
1539 */
1540static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1541{
1542 int r;
1543
1544 if (adev->wb.wb_obj == NULL) {
1545 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1546 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1547 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1548 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1549 (void **)&adev->wb.wb);
1550 if (r) {
1551 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1552 return r;
1553 }
1554
1555 adev->wb.num_wb = AMDGPU_MAX_WB;
1556 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1557
1558 /* clear wb memory */
1559 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1560 }
1561
1562 return 0;
1563}
1564
1565/**
1566 * amdgpu_device_wb_get - Allocate a wb entry
1567 *
1568 * @adev: amdgpu_device pointer
1569 * @wb: wb index
1570 *
1571 * Allocate a wb slot for use by the driver (all asics).
1572 * Returns 0 on success or -EINVAL on failure.
1573 */
1574int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1575{
1576 unsigned long flags, offset;
1577
1578 spin_lock_irqsave(&adev->wb.lock, flags);
1579 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1580 if (offset < adev->wb.num_wb) {
1581 __set_bit(offset, adev->wb.used);
1582 spin_unlock_irqrestore(&adev->wb.lock, flags);
1583 *wb = offset << 3; /* convert to dw offset */
1584 return 0;
1585 } else {
1586 spin_unlock_irqrestore(&adev->wb.lock, flags);
1587 return -EINVAL;
1588 }
1589}
1590
1591/**
1592 * amdgpu_device_wb_free - Free a wb entry
1593 *
1594 * @adev: amdgpu_device pointer
1595 * @wb: wb index
1596 *
1597 * Free a wb slot allocated for use by the driver (all asics)
1598 */
1599void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1600{
1601 unsigned long flags;
1602
1603 wb >>= 3;
1604 spin_lock_irqsave(&adev->wb.lock, flags);
1605 if (wb < adev->wb.num_wb)
1606 __clear_bit(wb, adev->wb.used);
1607 spin_unlock_irqrestore(&adev->wb.lock, flags);
1608}
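/*
 * Usage sketch: allocate a writeback slot, derive its CPU and GPU views and
 * release it again. use_slot() is a placeholder for whatever consumes the
 * slot, e.g. programming a ring to write status values to gpu_addr and
 * polling them through cpu_addr:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *           u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *           use_slot(cpu_addr, gpu_addr);
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */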
1609
1610/**
1611 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1612 *
1613 * @adev: amdgpu_device pointer
1614 *
1615 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1616 * to fail, but if any of the BARs is not accessible after the resize we abort
1617 * driver loading by returning -ENODEV.
1618 */
1619int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1620{
1621 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1622 struct pci_bus *root;
1623 struct resource *res;
1624 unsigned int i;
1625 u16 cmd;
1626 int r;
1627
1628 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1629 return 0;
1630
1631 /* Bypass for VF */
1632 if (amdgpu_sriov_vf(adev))
1633 return 0;
1634
1635 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1636 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1637 DRM_WARN("System can't access extended configuration space, please check!!\n");
1638
1639 /* skip if the bios has already enabled large BAR */
1640 if (adev->gmc.real_vram_size &&
1641 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1642 return 0;
1643
1644 /* Check if the root BUS has 64bit memory resources */
1645 root = adev->pdev->bus;
1646 while (root->parent)
1647 root = root->parent;
1648
1649 pci_bus_for_each_resource(root, res, i) {
1650 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1651 res->start > 0x100000000ull)
1652 break;
1653 }
1654
1655 /* Trying to resize is pointless without a root hub window above 4GB */
1656 if (!res)
1657 return 0;
1658
1659 /* Limit the BAR size to what is available */
1660 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1661 rbar_size);
1662
1663 /* Disable memory decoding while we change the BAR addresses and size */
1664 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1665 pci_write_config_word(adev->pdev, PCI_COMMAND,
1666 cmd & ~PCI_COMMAND_MEMORY);
1667
1668 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1669 amdgpu_doorbell_fini(adev);
1670 if (adev->asic_type >= CHIP_BONAIRE)
1671 pci_release_resource(adev->pdev, 2);
1672
1673 pci_release_resource(adev->pdev, 0);
1674
1675 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1676 if (r == -ENOSPC)
1677 DRM_INFO("Not enough PCI address space for a large BAR.");
1678 else if (r && r != -ENOTSUPP)
1679 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1680
1681 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1682
1683 /* When the doorbell or fb BAR isn't available we have no chance of
1684 * using the device.
1685 */
1686 r = amdgpu_doorbell_init(adev);
1687 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1688 return -ENODEV;
1689
1690 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1691
1692 return 0;
1693}
1694
1695static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1696{
1697 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1698 return false;
1699
1700 return true;
1701}
1702
1703/*
1704 * GPU helpers function.
1705 */
1706/**
1707 * amdgpu_device_need_post - check if the hw needs to be posted or not
1708 *
1709 * @adev: amdgpu_device pointer
1710 *
1711 * Check if the asic has been initialized (all asics) at driver startup,
1712 * or if a post is needed because a hw reset was performed.
1713 * Returns true if post is needed or false if not.
1714 */
1715bool amdgpu_device_need_post(struct amdgpu_device *adev)
1716{
1717 uint32_t reg;
1718
1719 if (amdgpu_sriov_vf(adev))
1720 return false;
1721
1722 if (!amdgpu_device_read_bios(adev))
1723 return false;
1724
1725 if (amdgpu_passthrough(adev)) {
1726 /* for FIJI: in the whole-GPU pass-through virtualization case, some old SMC
1727 * firmware still needs the driver to do a vPost after a VM reboot, otherwise
1728 * the GPU hangs. SMC firmware versions 22.15 and above do not have this flaw,
1729 * so force vPost only for SMC versions below 22.15.
1730 */
1731 if (adev->asic_type == CHIP_FIJI) {
1732 int err;
1733 uint32_t fw_ver;
1734
1735 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1736 /* force vPost if an error occurred */
1737 if (err)
1738 return true;
1739
1740 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1741 release_firmware(adev->pm.fw);
1742 if (fw_ver < 0x00160e00)
1743 return true;
1744 }
1745 }
1746
1747 /* Don't post if we need to reset whole hive on init */
1748 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
1749 return false;
1750
1751 if (adev->has_hw_reset) {
1752 adev->has_hw_reset = false;
1753 return true;
1754 }
1755
1756 /* bios scratch used on CIK+ */
1757 if (adev->asic_type >= CHIP_BONAIRE)
1758 return amdgpu_atombios_scratch_need_asic_init(adev);
1759
1760 /* check MEM_SIZE for older asics */
1761 reg = amdgpu_asic_get_config_memsize(adev);
1762
1763 if ((reg != 0) && (reg != 0xffffffff))
1764 return false;
1765
1766 return true;
1767}
1768
1769/*
1770 * Check whether seamless boot is supported.
1771 *
1772 * So far we only support seamless boot on DCE 3.0 or later.
1773 * If users report that it works on older ASICs as well, we may
1774 * loosen this.
1775 */
1776bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1777{
1778 switch (amdgpu_seamless) {
1779 case -1:
1780 break;
1781 case 1:
1782 return true;
1783 case 0:
1784 return false;
1785 default:
1786 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1787 amdgpu_seamless);
1788 return false;
1789 }
1790
1791 if (!(adev->flags & AMD_IS_APU))
1792 return false;
1793
1794 if (adev->mman.keep_stolen_vga_memory)
1795 return false;
1796
1797 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1798}
1799
1800/*
1801 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1802 * don't support dynamic speed switching. Until we have confirmation from Intel
1803 * that a specific host supports it, it's safer that we keep it disabled for all.
1804 *
1805 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1806 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1807 */
1808static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1809{
1810#if IS_ENABLED(CONFIG_X86)
1811 struct cpuinfo_x86 *c = &cpu_data(0);
1812
1813 /* eGPUs change speeds based on USB4 fabric conditions */
1814 if (dev_is_removable(adev->dev))
1815 return true;
1816
1817 if (c->x86_vendor == X86_VENDOR_INTEL)
1818 return false;
1819#endif
1820 return true;
1821}
1822
1823/**
1824 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1825 *
1826 * @adev: amdgpu_device pointer
1827 *
1828 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1829 * be set for this device.
1830 *
1831 * Returns true if it should be used or false if not.
1832 */
1833bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1834{
1835 switch (amdgpu_aspm) {
1836 case -1:
1837 break;
1838 case 0:
1839 return false;
1840 case 1:
1841 return true;
1842 default:
1843 return false;
1844 }
1845 if (adev->flags & AMD_IS_APU)
1846 return false;
1847 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
1848 return false;
1849 return pcie_aspm_enabled(adev->pdev);
1850}
1851
1852/* if we get transitioned to only one device, take VGA back */
1853/**
1854 * amdgpu_device_vga_set_decode - enable/disable vga decode
1855 *
1856 * @pdev: PCI device pointer
1857 * @state: enable/disable vga decode
1858 *
1859 * Enable/disable vga decode (all asics).
1860 * Returns VGA resource flags.
1861 */
1862static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1863 bool state)
1864{
1865 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1866
1867 amdgpu_asic_set_vga_state(adev, state);
1868 if (state)
1869 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1870 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1871 else
1872 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1873}
1874
1875/**
1876 * amdgpu_device_check_block_size - validate the vm block size
1877 *
1878 * @adev: amdgpu_device pointer
1879 *
1880 * Validates the vm block size specified via module parameter.
1881 * The vm block size defines number of bits in page table versus page directory,
1882 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1883 * page table and the remaining bits are in the page directory.
1884 */
1885static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1886{
1887 /* defines number of bits in page table versus page directory,
1888 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1889 * page table and the remaining bits are in the page directory
1890 */
1891 if (amdgpu_vm_block_size == -1)
1892 return;
1893
1894 if (amdgpu_vm_block_size < 9) {
1895 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1896 amdgpu_vm_block_size);
1897 amdgpu_vm_block_size = -1;
1898 }
1899}
1900
1901/**
1902 * amdgpu_device_check_vm_size - validate the vm size
1903 *
1904 * @adev: amdgpu_device pointer
1905 *
1906 * Validates the vm size in GB specified via module parameter.
1907 * The VM size is the size of the GPU virtual memory space in GB.
1908 */
1909static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1910{
1911 /* no need to check the default value */
1912 if (amdgpu_vm_size == -1)
1913 return;
1914
1915 if (amdgpu_vm_size < 1) {
1916 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1917 amdgpu_vm_size);
1918 amdgpu_vm_size = -1;
1919 }
1920}
1921
1922static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1923{
1924 struct sysinfo si;
1925 bool is_os_64 = (sizeof(void *) == 8);
1926 uint64_t total_memory;
1927 uint64_t dram_size_seven_GB = 0x1B8000000;
1928 uint64_t dram_size_three_GB = 0xB8000000;
1929
1930 if (amdgpu_smu_memory_pool_size == 0)
1931 return;
1932
1933 if (!is_os_64) {
1934 DRM_WARN("Not 64-bit OS, feature not supported\n");
1935 goto def_value;
1936 }
1937 si_meminfo(&si);
1938 total_memory = (uint64_t)si.totalram * si.mem_unit;
1939
1940 if ((amdgpu_smu_memory_pool_size == 1) ||
1941 (amdgpu_smu_memory_pool_size == 2)) {
1942 if (total_memory < dram_size_three_GB)
1943 goto def_value1;
1944 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1945 (amdgpu_smu_memory_pool_size == 8)) {
1946 if (total_memory < dram_size_seven_GB)
1947 goto def_value1;
1948 } else {
1949 DRM_WARN("Smu memory pool size not supported\n");
1950 goto def_value;
1951 }
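	/*
	 * amdgpu_smu_memory_pool_size is given in units of 256MB
	 * (1/2/4/8 => 256MB/512MB/1GB/2GB), hence the shift by 28 bits
	 * to convert it to bytes.
	 */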
1952 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1953
1954 return;
1955
1956def_value1:
1957 DRM_WARN("No enough system memory\n");
1958def_value:
1959 adev->pm.smu_prv_buffer_size = 0;
1960}
1961
1962static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1963{
1964 if (!(adev->flags & AMD_IS_APU) ||
1965 adev->asic_type < CHIP_RAVEN)
1966 return 0;
1967
1968 switch (adev->asic_type) {
1969 case CHIP_RAVEN:
1970 if (adev->pdev->device == 0x15dd)
1971 adev->apu_flags |= AMD_APU_IS_RAVEN;
1972 if (adev->pdev->device == 0x15d8)
1973 adev->apu_flags |= AMD_APU_IS_PICASSO;
1974 break;
1975 case CHIP_RENOIR:
1976 if ((adev->pdev->device == 0x1636) ||
1977 (adev->pdev->device == 0x164c))
1978 adev->apu_flags |= AMD_APU_IS_RENOIR;
1979 else
1980 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1981 break;
1982 case CHIP_VANGOGH:
1983 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1984 break;
1985 case CHIP_YELLOW_CARP:
1986 break;
1987 case CHIP_CYAN_SKILLFISH:
1988 if ((adev->pdev->device == 0x13FE) ||
1989 (adev->pdev->device == 0x143F))
1990 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1991 break;
1992 default:
1993 break;
1994 }
1995
1996 return 0;
1997}
1998
1999/**
2000 * amdgpu_device_check_arguments - validate module params
2001 *
2002 * @adev: amdgpu_device pointer
2003 *
2004 * Validates certain module parameters and updates
2005 * the associated values used by the driver (all asics).
2006 */
2007static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
2008{
2009 int i;
2010
2011 if (amdgpu_sched_jobs < 4) {
2012 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
2013 amdgpu_sched_jobs);
2014 amdgpu_sched_jobs = 4;
2015 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
2016 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
2017 amdgpu_sched_jobs);
2018 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
2019 }
2020
2021 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
2022 /* gart size must be greater or equal to 32M */
2023 dev_warn(adev->dev, "gart size (%d) too small\n",
2024 amdgpu_gart_size);
2025 amdgpu_gart_size = -1;
2026 }
2027
2028 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
2029 /* gtt size must be greater or equal to 32M */
2030 dev_warn(adev->dev, "gtt size (%d) too small\n",
2031 amdgpu_gtt_size);
2032 amdgpu_gtt_size = -1;
2033 }
2034
2035 /* valid range is between 4 and 9 inclusive */
2036 if (amdgpu_vm_fragment_size != -1 &&
2037 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
2038 dev_warn(adev->dev, "valid range is between 4 and 9\n");
2039 amdgpu_vm_fragment_size = -1;
2040 }
2041
2042 if (amdgpu_sched_hw_submission < 2) {
2043 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
2044 amdgpu_sched_hw_submission);
2045 amdgpu_sched_hw_submission = 2;
2046 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
2047 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
2048 amdgpu_sched_hw_submission);
2049 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
2050 }
2051
2052 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
2053 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
2054 amdgpu_reset_method = -1;
2055 }
2056
2057 amdgpu_device_check_smu_prv_buffer_size(adev);
2058
2059 amdgpu_device_check_vm_size(adev);
2060
2061 amdgpu_device_check_block_size(adev);
2062
2063 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2064
2065 for (i = 0; i < MAX_XCP; i++)
2066 adev->enforce_isolation[i] = !!enforce_isolation;
2067
2068 return 0;
2069}
2070
2071/**
2072 * amdgpu_switcheroo_set_state - set switcheroo state
2073 *
2074 * @pdev: pci dev pointer
2075 * @state: vga_switcheroo state
2076 *
2077 * Callback for the switcheroo driver. Suspends or resumes
2078 * the asics before or after it is powered up using ACPI methods.
2079 */
2080static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2081 enum vga_switcheroo_state state)
2082{
2083 struct drm_device *dev = pci_get_drvdata(pdev);
2084 int r;
2085
2086 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2087 return;
2088
2089 if (state == VGA_SWITCHEROO_ON) {
2090 pr_info("switched on\n");
2091 /* don't suspend or resume card normally */
2092 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2093
2094 pci_set_power_state(pdev, PCI_D0);
2095 amdgpu_device_load_pci_state(pdev);
2096 r = pci_enable_device(pdev);
2097 if (r)
2098 DRM_WARN("pci_enable_device failed (%d)\n", r);
2099 amdgpu_device_resume(dev, true);
2100
2101 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2102 } else {
2103 pr_info("switched off\n");
2104 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2105 amdgpu_device_prepare(dev);
2106 amdgpu_device_suspend(dev, true);
2107 amdgpu_device_cache_pci_state(pdev);
2108 /* Shut down the device */
2109 pci_disable_device(pdev);
2110 pci_set_power_state(pdev, PCI_D3cold);
2111 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2112 }
2113}
2114
2115/**
2116 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2117 *
2118 * @pdev: pci dev pointer
2119 *
2120 * Callback for the switcheroo driver. Checks whether the switcheroo
2121 * state can be changed.
2122 * Returns true if the state can be changed, false if not.
2123 */
2124static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2125{
2126 struct drm_device *dev = pci_get_drvdata(pdev);
2127
2128 /*
2129 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2130 * locking inversion with the driver load path. And the access here is
2131 * completely racy anyway. So don't bother with locking for now.
2132 */
2133 return atomic_read(&dev->open_count) == 0;
2134}
2135
2136static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2137 .set_gpu_state = amdgpu_switcheroo_set_state,
2138 .reprobe = NULL,
2139 .can_switch = amdgpu_switcheroo_can_switch,
2140};
2141
2142/**
2143 * amdgpu_device_ip_set_clockgating_state - set the CG state
2144 *
2145 * @dev: amdgpu_device pointer
2146 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2147 * @state: clockgating state (gate or ungate)
2148 *
2149 * Sets the requested clockgating state for all instances of
2150 * the hardware IP specified.
2151 * Returns the error code from the last instance.
2152 */
2153int amdgpu_device_ip_set_clockgating_state(void *dev,
2154 enum amd_ip_block_type block_type,
2155 enum amd_clockgating_state state)
2156{
2157 struct amdgpu_device *adev = dev;
2158 int i, r = 0;
2159
2160 for (i = 0; i < adev->num_ip_blocks; i++) {
2161 if (!adev->ip_blocks[i].status.valid)
2162 continue;
2163 if (adev->ip_blocks[i].version->type != block_type)
2164 continue;
2165 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2166 continue;
2167 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2168 (void *)adev, state);
2169 if (r)
2170 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2171 adev->ip_blocks[i].version->funcs->name, r);
2172 }
2173 return r;
2174}
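
/*
 * Illustrative use only (hypothetical caller, not a call site in this file):
 * gating VCN clocks on every instance would look roughly like
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *					       AMD_CG_STATE_GATE);
 */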
2175
2176/**
2177 * amdgpu_device_ip_set_powergating_state - set the PG state
2178 *
2179 * @dev: amdgpu_device pointer
2180 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2181 * @state: powergating state (gate or ungate)
2182 *
2183 * Sets the requested powergating state for all instances of
2184 * the hardware IP specified.
2185 * Returns the error code from the last instance.
2186 */
2187int amdgpu_device_ip_set_powergating_state(void *dev,
2188 enum amd_ip_block_type block_type,
2189 enum amd_powergating_state state)
2190{
2191 struct amdgpu_device *adev = dev;
2192 int i, r = 0;
2193
2194 for (i = 0; i < adev->num_ip_blocks; i++) {
2195 if (!adev->ip_blocks[i].status.valid)
2196 continue;
2197 if (adev->ip_blocks[i].version->type != block_type)
2198 continue;
2199 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2200 continue;
2201 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2202 (void *)adev, state);
2203 if (r)
2204 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2205 adev->ip_blocks[i].version->funcs->name, r);
2206 }
2207 return r;
2208}
2209
2210/**
2211 * amdgpu_device_ip_get_clockgating_state - get the CG state
2212 *
2213 * @adev: amdgpu_device pointer
2214 * @flags: clockgating feature flags
2215 *
2216 * Walks the list of IPs on the device and updates the clockgating
2217 * flags for each IP.
2218 * Updates @flags with the feature flags for each hardware IP where
2219 * clockgating is enabled.
2220 */
2221void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2222 u64 *flags)
2223{
2224 int i;
2225
2226 for (i = 0; i < adev->num_ip_blocks; i++) {
2227 if (!adev->ip_blocks[i].status.valid)
2228 continue;
2229 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2230 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
2231 }
2232}
2233
2234/**
2235 * amdgpu_device_ip_wait_for_idle - wait for idle
2236 *
2237 * @adev: amdgpu_device pointer
2238 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2239 *
2240 * Waits for the requested hardware IP to be idle.
2241 * Returns 0 for success or a negative error code on failure.
2242 */
2243int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2244 enum amd_ip_block_type block_type)
2245{
2246 int i, r;
2247
2248 for (i = 0; i < adev->num_ip_blocks; i++) {
2249 if (!adev->ip_blocks[i].status.valid)
2250 continue;
2251 if (adev->ip_blocks[i].version->type == block_type) {
2252 if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2253 r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2254 &adev->ip_blocks[i]);
2255 if (r)
2256 return r;
2257 }
2258 break;
2259 }
2260 }
2261 return 0;
2262
2263}
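
/*
 * Illustrative use (hypothetical caller): draining outstanding work in the
 * GFX block before reprogramming it could be written as
 *
 *	r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);
 *	if (r)
 *		return r;
 */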
2264
2265/**
2266 * amdgpu_device_ip_is_valid - is the hardware IP enabled
2267 *
2268 * @adev: amdgpu_device pointer
2269 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2270 *
2271 * Checks if the hardware IP is enabled or not.
2272 * Returns true if the IP is enabled, false if not.
2273 */
2274bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2275 enum amd_ip_block_type block_type)
2276{
2277 int i;
2278
2279 for (i = 0; i < adev->num_ip_blocks; i++) {
2280 if (adev->ip_blocks[i].version->type == block_type)
2281 return adev->ip_blocks[i].status.valid;
2282 }
2283 return false;
2284
2285}
2286
2287/**
2288 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2289 *
2290 * @adev: amdgpu_device pointer
2291 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2292 *
2293 * Returns a pointer to the hardware IP block structure
2294 * if it exists for the asic, otherwise NULL.
2295 */
2296struct amdgpu_ip_block *
2297amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2298 enum amd_ip_block_type type)
2299{
2300 int i;
2301
2302 for (i = 0; i < adev->num_ip_blocks; i++)
2303 if (adev->ip_blocks[i].version->type == type)
2304 return &adev->ip_blocks[i];
2305
2306 return NULL;
2307}
2308
2309/**
2310 * amdgpu_device_ip_block_version_cmp
2311 *
2312 * @adev: amdgpu_device pointer
2313 * @type: enum amd_ip_block_type
2314 * @major: major version
2315 * @minor: minor version
2316 *
2317 * Returns 0 if the IP block version is equal to or greater than the one specified,
2318 * or 1 if it is smaller or the ip_block doesn't exist.
2319 */
2320int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2321 enum amd_ip_block_type type,
2322 u32 major, u32 minor)
2323{
2324 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2325
2326 if (ip_block && ((ip_block->version->major > major) ||
2327 ((ip_block->version->major == major) &&
2328 (ip_block->version->minor >= minor))))
2329 return 0;
2330
2331 return 1;
2332}
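
/*
 * Illustrative use (hypothetical caller): requiring at least VCE 3.0 could be
 * written as
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_VCE, 3, 0))
 *		... the VCE block is version 3.0 or newer ...
 */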
2333
2334/**
2335 * amdgpu_device_ip_block_add
2336 *
2337 * @adev: amdgpu_device pointer
2338 * @ip_block_version: pointer to the IP to add
2339 *
2340 * Adds the IP block driver information to the collection of IPs
2341 * on the asic.
2342 */
2343int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2344 const struct amdgpu_ip_block_version *ip_block_version)
2345{
2346 if (!ip_block_version)
2347 return -EINVAL;
2348
2349 switch (ip_block_version->type) {
2350 case AMD_IP_BLOCK_TYPE_VCN:
2351 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2352 return 0;
2353 break;
2354 case AMD_IP_BLOCK_TYPE_JPEG:
2355 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2356 return 0;
2357 break;
2358 default:
2359 break;
2360 }
2361
2362 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
2363 ip_block_version->funcs->name);
2364
2365 adev->ip_blocks[adev->num_ip_blocks].adev = adev;
2366
2367 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2368
2369 return 0;
2370}
2371
2372/**
2373 * amdgpu_device_enable_virtual_display - enable virtual display feature
2374 *
2375 * @adev: amdgpu_device pointer
2376 *
2377 * Enables the virtual display feature if the user has enabled it via
2378 * the module parameter virtual_display. This feature provides a virtual
2379 * display hardware on headless boards or in virtualized environments.
2380 * This function parses and validates the configuration string specified by
2381 * the user and configures the virtual display configuration (number of
2382 * virtual connectors, crtcs, etc.) specified.
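 *
 * The configuration string is a ';'-separated list of entries of the form
 * "<pci address>[,<number of crtcs>]"; the literal "all" matches any device
 * and the crtc count is clamped to the range 1-6, e.g. "0000:03:00.0,2;all"
 * (the pci address here is only an example).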
2383 */
2384static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2385{
2386 adev->enable_virtual_display = false;
2387
2388 if (amdgpu_virtual_display) {
2389 const char *pci_address_name = pci_name(adev->pdev);
2390 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2391
2392 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2393 pciaddstr_tmp = pciaddstr;
2394 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2395 pciaddname = strsep(&pciaddname_tmp, ",");
2396 if (!strcmp("all", pciaddname)
2397 || !strcmp(pci_address_name, pciaddname)) {
2398 long num_crtc;
2399 int res = -1;
2400
2401 adev->enable_virtual_display = true;
2402
2403 if (pciaddname_tmp)
2404 res = kstrtol(pciaddname_tmp, 10,
2405 &num_crtc);
2406
2407 if (!res) {
2408 if (num_crtc < 1)
2409 num_crtc = 1;
2410 if (num_crtc > 6)
2411 num_crtc = 6;
2412 adev->mode_info.num_crtc = num_crtc;
2413 } else {
2414 adev->mode_info.num_crtc = 1;
2415 }
2416 break;
2417 }
2418 }
2419
2420 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2421 amdgpu_virtual_display, pci_address_name,
2422 adev->enable_virtual_display, adev->mode_info.num_crtc);
2423
2424 kfree(pciaddstr);
2425 }
2426}
2427
2428void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2429{
2430 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2431 adev->mode_info.num_crtc = 1;
2432 adev->enable_virtual_display = true;
2433 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2434 adev->enable_virtual_display, adev->mode_info.num_crtc);
2435 }
2436}
2437
2438/**
2439 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2440 *
2441 * @adev: amdgpu_device pointer
2442 *
2443 * Parses the asic configuration parameters specified in the gpu info
2444 * firmware and makes them available to the driver for use in configuring
2445 * the asic.
2446 * Returns 0 on success, -EINVAL on failure.
2447 */
2448static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2449{
2450 const char *chip_name;
2451 int err;
2452 const struct gpu_info_firmware_header_v1_0 *hdr;
2453
2454 adev->firmware.gpu_info_fw = NULL;
2455
2456 if (adev->mman.discovery_bin)
2457 return 0;
2458
2459 switch (adev->asic_type) {
2460 default:
2461 return 0;
2462 case CHIP_VEGA10:
2463 chip_name = "vega10";
2464 break;
2465 case CHIP_VEGA12:
2466 chip_name = "vega12";
2467 break;
2468 case CHIP_RAVEN:
2469 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2470 chip_name = "raven2";
2471 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2472 chip_name = "picasso";
2473 else
2474 chip_name = "raven";
2475 break;
2476 case CHIP_ARCTURUS:
2477 chip_name = "arcturus";
2478 break;
2479 case CHIP_NAVI12:
2480 chip_name = "navi12";
2481 break;
2482 }
2483
2484 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2485 "amdgpu/%s_gpu_info.bin", chip_name);
2486 if (err) {
2487 dev_err(adev->dev,
2488 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2489 chip_name);
2490 goto out;
2491 }
2492
2493 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2494 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2495
2496 switch (hdr->version_major) {
2497 case 1:
2498 {
2499 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2500 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2501 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2502
2503 /*
2504 * Should be dropped when DAL no longer needs it.
2505 */
2506 if (adev->asic_type == CHIP_NAVI12)
2507 goto parse_soc_bounding_box;
2508
2509 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2510 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2511 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2512 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2513 adev->gfx.config.max_texture_channel_caches =
2514 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2515 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2516 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2517 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2518 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2519 adev->gfx.config.double_offchip_lds_buf =
2520 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2521 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2522 adev->gfx.cu_info.max_waves_per_simd =
2523 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2524 adev->gfx.cu_info.max_scratch_slots_per_cu =
2525 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2526 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2527 if (hdr->version_minor >= 1) {
2528 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2529 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2530 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2531 adev->gfx.config.num_sc_per_sh =
2532 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2533 adev->gfx.config.num_packer_per_sc =
2534 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2535 }
2536
2537parse_soc_bounding_box:
2538 /*
2539 * soc bounding box info is not integrated in the discovery table;
2540 * we always need to parse it from the gpu info firmware if needed.
2541 */
2542 if (hdr->version_minor == 2) {
2543 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2544 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2545 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2546 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2547 }
2548 break;
2549 }
2550 default:
2551 dev_err(adev->dev,
2552 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2553 err = -EINVAL;
2554 goto out;
2555 }
2556out:
2557 return err;
2558}
2559
2560/**
2561 * amdgpu_device_ip_early_init - run early init for hardware IPs
2562 *
2563 * @adev: amdgpu_device pointer
2564 *
2565 * Early initialization pass for hardware IPs. The hardware IPs that make
2566 * up each asic are discovered and each IP's early_init callback is run. This
2567 * is the first stage in initializing the asic.
2568 * Returns 0 on success, negative error code on failure.
2569 */
2570static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2571{
2572 struct amdgpu_ip_block *ip_block;
2573 struct pci_dev *parent;
2574 int i, r;
2575 bool total;
2576
2577 amdgpu_device_enable_virtual_display(adev);
2578
2579 if (amdgpu_sriov_vf(adev)) {
2580 r = amdgpu_virt_request_full_gpu(adev, true);
2581 if (r)
2582 return r;
2583 }
2584
2585 switch (adev->asic_type) {
2586#ifdef CONFIG_DRM_AMDGPU_SI
2587 case CHIP_VERDE:
2588 case CHIP_TAHITI:
2589 case CHIP_PITCAIRN:
2590 case CHIP_OLAND:
2591 case CHIP_HAINAN:
2592 adev->family = AMDGPU_FAMILY_SI;
2593 r = si_set_ip_blocks(adev);
2594 if (r)
2595 return r;
2596 break;
2597#endif
2598#ifdef CONFIG_DRM_AMDGPU_CIK
2599 case CHIP_BONAIRE:
2600 case CHIP_HAWAII:
2601 case CHIP_KAVERI:
2602 case CHIP_KABINI:
2603 case CHIP_MULLINS:
2604 if (adev->flags & AMD_IS_APU)
2605 adev->family = AMDGPU_FAMILY_KV;
2606 else
2607 adev->family = AMDGPU_FAMILY_CI;
2608
2609 r = cik_set_ip_blocks(adev);
2610 if (r)
2611 return r;
2612 break;
2613#endif
2614 case CHIP_TOPAZ:
2615 case CHIP_TONGA:
2616 case CHIP_FIJI:
2617 case CHIP_POLARIS10:
2618 case CHIP_POLARIS11:
2619 case CHIP_POLARIS12:
2620 case CHIP_VEGAM:
2621 case CHIP_CARRIZO:
2622 case CHIP_STONEY:
2623 if (adev->flags & AMD_IS_APU)
2624 adev->family = AMDGPU_FAMILY_CZ;
2625 else
2626 adev->family = AMDGPU_FAMILY_VI;
2627
2628 r = vi_set_ip_blocks(adev);
2629 if (r)
2630 return r;
2631 break;
2632 default:
2633 r = amdgpu_discovery_set_ip_blocks(adev);
2634 if (r)
2635 return r;
2636 break;
2637 }
2638
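	/*
	 * An ATPX-controlled dGPU on a non-removable bus is treated as a
	 * PX (PowerXpress/hybrid graphics) platform.
	 */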
2639 if (amdgpu_has_atpx() &&
2640 (amdgpu_is_atpx_hybrid() ||
2641 amdgpu_has_atpx_dgpu_power_cntl()) &&
2642 ((adev->flags & AMD_IS_APU) == 0) &&
2643 !dev_is_removable(&adev->pdev->dev))
2644 adev->flags |= AMD_IS_PX;
2645
2646 if (!(adev->flags & AMD_IS_APU)) {
2647 parent = pcie_find_root_port(adev->pdev);
2648 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2649 }
2650
2651
2652 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2653 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2654 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2655 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2656 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2657 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2658 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2659
2660 total = true;
2661 for (i = 0; i < adev->num_ip_blocks; i++) {
2662 ip_block = &adev->ip_blocks[i];
2663
2664 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2665 DRM_WARN("disabled ip block: %d <%s>\n",
2666 i, adev->ip_blocks[i].version->funcs->name);
2667 adev->ip_blocks[i].status.valid = false;
2668 } else if (ip_block->version->funcs->early_init) {
2669 r = ip_block->version->funcs->early_init(ip_block);
2670 if (r == -ENOENT) {
2671 adev->ip_blocks[i].status.valid = false;
2672 } else if (r) {
2673 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2674 adev->ip_blocks[i].version->funcs->name, r);
2675 total = false;
2676 } else {
2677 adev->ip_blocks[i].status.valid = true;
2678 }
2679 } else {
2680 adev->ip_blocks[i].status.valid = true;
2681 }
2682 /* get the vbios after the asic_funcs are set up */
2683 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2684 r = amdgpu_device_parse_gpu_info_fw(adev);
2685 if (r)
2686 return r;
2687
2688 /* Read BIOS */
2689 if (amdgpu_device_read_bios(adev)) {
2690 if (!amdgpu_get_bios(adev))
2691 return -EINVAL;
2692
2693 r = amdgpu_atombios_init(adev);
2694 if (r) {
2695 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2696 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2697 return r;
2698 }
2699 }
2700
2701 /* get pf2vf msg info at its earliest time */
2702 if (amdgpu_sriov_vf(adev))
2703 amdgpu_virt_init_data_exchange(adev);
2704
2705 }
2706 }
2707 if (!total)
2708 return -ENODEV;
2709
2710 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2711 if (ip_block && ip_block->status.valid)
2712 amdgpu_amdkfd_device_probe(adev);
2713
2714 adev->cg_flags &= amdgpu_cg_mask;
2715 adev->pg_flags &= amdgpu_pg_mask;
2716
2717 return 0;
2718}
2719
2720static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2721{
2722 int i, r;
2723
2724 for (i = 0; i < adev->num_ip_blocks; i++) {
2725 if (!adev->ip_blocks[i].status.sw)
2726 continue;
2727 if (adev->ip_blocks[i].status.hw)
2728 continue;
2729 if (!amdgpu_ip_member_of_hwini(
2730 adev, adev->ip_blocks[i].version->type))
2731 continue;
2732 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2733 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2734 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2735 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2736 if (r) {
2737 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2738 adev->ip_blocks[i].version->funcs->name, r);
2739 return r;
2740 }
2741 adev->ip_blocks[i].status.hw = true;
2742 }
2743 }
2744
2745 return 0;
2746}
2747
2748static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2749{
2750 int i, r;
2751
2752 for (i = 0; i < adev->num_ip_blocks; i++) {
2753 if (!adev->ip_blocks[i].status.sw)
2754 continue;
2755 if (adev->ip_blocks[i].status.hw)
2756 continue;
2757 if (!amdgpu_ip_member_of_hwini(
2758 adev, adev->ip_blocks[i].version->type))
2759 continue;
2760 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2761 if (r) {
2762 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2763 adev->ip_blocks[i].version->funcs->name, r);
2764 return r;
2765 }
2766 adev->ip_blocks[i].status.hw = true;
2767 }
2768
2769 return 0;
2770}
2771
2772static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2773{
2774 int r = 0;
2775 int i;
2776 uint32_t smu_version;
2777
2778 if (adev->asic_type >= CHIP_VEGA10) {
2779 for (i = 0; i < adev->num_ip_blocks; i++) {
2780 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2781 continue;
2782
2783 if (!amdgpu_ip_member_of_hwini(adev,
2784 AMD_IP_BLOCK_TYPE_PSP))
2785 break;
2786
2787 if (!adev->ip_blocks[i].status.sw)
2788 continue;
2789
2790 /* no need to do the fw loading again if already done */
2791 if (adev->ip_blocks[i].status.hw)
2792 break;
2793
2794 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2795 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
2796 if (r)
2797 return r;
2798 } else {
2799 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2800 if (r) {
2801 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2802 adev->ip_blocks[i].version->funcs->name, r);
2803 return r;
2804 }
2805 adev->ip_blocks[i].status.hw = true;
2806 }
2807 break;
2808 }
2809 }
2810
2811 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2812 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2813
2814 return r;
2815}
2816
2817static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2818{
2819 long timeout;
2820 int r, i;
2821
2822 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2823 struct amdgpu_ring *ring = adev->rings[i];
2824
2825 /* No need to setup the GPU scheduler for rings that don't need it */
2826 if (!ring || ring->no_scheduler)
2827 continue;
2828
2829 switch (ring->funcs->type) {
2830 case AMDGPU_RING_TYPE_GFX:
2831 timeout = adev->gfx_timeout;
2832 break;
2833 case AMDGPU_RING_TYPE_COMPUTE:
2834 timeout = adev->compute_timeout;
2835 break;
2836 case AMDGPU_RING_TYPE_SDMA:
2837 timeout = adev->sdma_timeout;
2838 break;
2839 default:
2840 timeout = adev->video_timeout;
2841 break;
2842 }
2843
2844 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
2845 DRM_SCHED_PRIORITY_COUNT,
2846 ring->num_hw_submission, 0,
2847 timeout, adev->reset_domain->wq,
2848 ring->sched_score, ring->name,
2849 adev->dev);
2850 if (r) {
2851 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2852 ring->name);
2853 return r;
2854 }
2855 r = amdgpu_uvd_entity_init(adev, ring);
2856 if (r) {
2857 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2858 ring->name);
2859 return r;
2860 }
2861 r = amdgpu_vce_entity_init(adev, ring);
2862 if (r) {
2863 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
2864 ring->name);
2865 return r;
2866 }
2867 }
2868
2869 amdgpu_xcp_update_partition_sched_list(adev);
2870
2871 return 0;
2872}
2873
2874
2875/**
2876 * amdgpu_device_ip_init - run init for hardware IPs
2877 *
2878 * @adev: amdgpu_device pointer
2879 *
2880 * Main initialization pass for hardware IPs. The list of all the hardware
2881 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2882 * are run. sw_init initializes the software state associated with each IP
2883 * and hw_init initializes the hardware associated with each IP.
2884 * Returns 0 on success, negative error code on failure.
2885 */
2886static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2887{
2888 bool init_badpage;
2889 int i, r;
2890
2891 r = amdgpu_ras_init(adev);
2892 if (r)
2893 return r;
2894
2895 for (i = 0; i < adev->num_ip_blocks; i++) {
2896 if (!adev->ip_blocks[i].status.valid)
2897 continue;
2898 if (adev->ip_blocks[i].version->funcs->sw_init) {
2899 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
2900 if (r) {
2901 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2902 adev->ip_blocks[i].version->funcs->name, r);
2903 goto init_failed;
2904 }
2905 }
2906 adev->ip_blocks[i].status.sw = true;
2907
2908 if (!amdgpu_ip_member_of_hwini(
2909 adev, adev->ip_blocks[i].version->type))
2910 continue;
2911
2912 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2913 /* need to do common hw init early so everything is set up for gmc */
2914 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2915 if (r) {
2916 DRM_ERROR("hw_init %d failed %d\n", i, r);
2917 goto init_failed;
2918 }
2919 adev->ip_blocks[i].status.hw = true;
2920 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2921 /* need to do gmc hw init early so we can allocate gpu mem */
2922 /* Try to reserve bad pages early */
2923 if (amdgpu_sriov_vf(adev))
2924 amdgpu_virt_exchange_data(adev);
2925
2926 r = amdgpu_device_mem_scratch_init(adev);
2927 if (r) {
2928 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2929 goto init_failed;
2930 }
2931 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2932 if (r) {
2933 DRM_ERROR("hw_init %d failed %d\n", i, r);
2934 goto init_failed;
2935 }
2936 r = amdgpu_device_wb_init(adev);
2937 if (r) {
2938 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2939 goto init_failed;
2940 }
2941 adev->ip_blocks[i].status.hw = true;
2942
2943 /* right after GMC hw init, we create CSA */
2944 if (adev->gfx.mcbp) {
2945 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2946 AMDGPU_GEM_DOMAIN_VRAM |
2947 AMDGPU_GEM_DOMAIN_GTT,
2948 AMDGPU_CSA_SIZE);
2949 if (r) {
2950 DRM_ERROR("allocate CSA failed %d\n", r);
2951 goto init_failed;
2952 }
2953 }
2954
2955 r = amdgpu_seq64_init(adev);
2956 if (r) {
2957 DRM_ERROR("allocate seq64 failed %d\n", r);
2958 goto init_failed;
2959 }
2960 }
2961 }
2962
2963 if (amdgpu_sriov_vf(adev))
2964 amdgpu_virt_init_data_exchange(adev);
2965
2966 r = amdgpu_ib_pool_init(adev);
2967 if (r) {
2968 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2969 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2970 goto init_failed;
2971 }
2972
2973 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2974 if (r)
2975 goto init_failed;
2976
2977 r = amdgpu_device_ip_hw_init_phase1(adev);
2978 if (r)
2979 goto init_failed;
2980
2981 r = amdgpu_device_fw_loading(adev);
2982 if (r)
2983 goto init_failed;
2984
2985 r = amdgpu_device_ip_hw_init_phase2(adev);
2986 if (r)
2987 goto init_failed;
2988
2989 /*
2990 * retired pages will be loaded from eeprom and reserved here,
2991 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2992 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2993 * for I2C communication, which is only true at this point.
2994 *
2995 * amdgpu_ras_recovery_init may fail, but the upper layers only care about
2996 * the failure from a bad gpu situation and stop the amdgpu init process
2997 * accordingly. For other failure cases, it will still release all
2998 * the resources and print an error message, rather than returning a
2999 * negative value to the upper level.
3000 *
3001 * Note: theoretically, this should be called before all vram allocations
3002 * to keep retired pages from being allocated and reused
3003 */
3004 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
3005 r = amdgpu_ras_recovery_init(adev, init_badpage);
3006 if (r)
3007 goto init_failed;
3008
3009 /*
3010 * In case of XGMI, grab an extra reference on the reset domain for this device
3011 */
3012 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3013 if (amdgpu_xgmi_add_device(adev) == 0) {
3014 if (!amdgpu_sriov_vf(adev)) {
3015 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3016
3017 if (WARN_ON(!hive)) {
3018 r = -ENOENT;
3019 goto init_failed;
3020 }
3021
3022 if (!hive->reset_domain ||
3023 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
3024 r = -ENOENT;
3025 amdgpu_put_xgmi_hive(hive);
3026 goto init_failed;
3027 }
3028
3029 /* Drop the early temporary reset domain we created for device */
3030 amdgpu_reset_put_reset_domain(adev->reset_domain);
3031 adev->reset_domain = hive->reset_domain;
3032 amdgpu_put_xgmi_hive(hive);
3033 }
3034 }
3035 }
3036
3037 r = amdgpu_device_init_schedulers(adev);
3038 if (r)
3039 goto init_failed;
3040
3041 if (adev->mman.buffer_funcs_ring->sched.ready)
3042 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3043
3044 /* Don't init kfd if the whole hive needs to be reset during init */
3045 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
3046 kgd2kfd_init_zone_device(adev);
3047 amdgpu_amdkfd_device_init(adev);
3048 }
3049
3050 amdgpu_fru_get_product_info(adev);
3051
3052init_failed:
3053
3054 return r;
3055}
3056
3057/**
3058 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3059 *
3060 * @adev: amdgpu_device pointer
3061 *
3062 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
3063 * this function before a GPU reset. If the value is retained after a
3064 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3065 */
3066static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
3067{
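	/*
	 * Snapshot the first AMDGPU_RESET_MAGIC_NUM bytes at the GART CPU
	 * address; amdgpu_device_check_vram_lost() compares against this
	 * copy after a reset to detect VRAM loss.
	 */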
3068 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
3069}
3070
3071/**
3072 * amdgpu_device_check_vram_lost - check if vram is valid
3073 *
3074 * @adev: amdgpu_device pointer
3075 *
3076 * Checks the reset magic value written to the gart pointer in VRAM.
3077 * The driver calls this after a GPU reset to see if the contents of
3078 * VRAM is lost or not.
3079 * returns true if vram is lost, false if not.
3080 */
3081static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3082{
3083 if (memcmp(adev->gart.ptr, adev->reset_magic,
3084 AMDGPU_RESET_MAGIC_NUM))
3085 return true;
3086
3087 if (!amdgpu_in_reset(adev))
3088 return false;
3089
3090 /*
3091 * For all ASICs with baco/mode1 reset, the VRAM is
3092 * always assumed to be lost.
3093 */
3094 switch (amdgpu_asic_reset_method(adev)) {
3095 case AMD_RESET_METHOD_BACO:
3096 case AMD_RESET_METHOD_MODE1:
3097 return true;
3098 default:
3099 return false;
3100 }
3101}
3102
3103/**
3104 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3105 *
3106 * @adev: amdgpu_device pointer
3107 * @state: clockgating state (gate or ungate)
3108 *
3109 * The list of all the hardware IPs that make up the asic is walked and the
3110 * set_clockgating_state callbacks are run.
3111 * The late initialization pass enables clockgating for hardware IPs;
3112 * the fini or suspend pass disables clockgating for hardware IPs.
3113 * Returns 0 on success, negative error code on failure.
3114 */
3115
3116int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3117 enum amd_clockgating_state state)
3118{
3119 int i, j, r;
3120
3121 if (amdgpu_emu_mode == 1)
3122 return 0;
3123
3124 for (j = 0; j < adev->num_ip_blocks; j++) {
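		/* gate in IP init order, ungate in reverse (fini/suspend) order */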
3125 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3126 if (!adev->ip_blocks[i].status.late_initialized)
3127 continue;
3128 /* skip CG for GFX, SDMA on S0ix */
3129 if (adev->in_s0ix &&
3130 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3131 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3132 continue;
3133 /* skip CG for VCE/UVD, it's handled specially */
3134 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3135 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3136 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3137 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3138 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3139 /* enable clockgating to save power */
3140 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
3141 state);
3142 if (r) {
3143 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3144 adev->ip_blocks[i].version->funcs->name, r);
3145 return r;
3146 }
3147 }
3148 }
3149
3150 return 0;
3151}
3152
3153int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3154 enum amd_powergating_state state)
3155{
3156 int i, j, r;
3157
3158 if (amdgpu_emu_mode == 1)
3159 return 0;
3160
3161 for (j = 0; j < adev->num_ip_blocks; j++) {
3162 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3163 if (!adev->ip_blocks[i].status.late_initialized)
3164 continue;
3165 /* skip PG for GFX, SDMA on S0ix */
3166 if (adev->in_s0ix &&
3167 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3168 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3169 continue;
3170 /* skip PG for VCE/UVD, it's handled specially */
3171 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3172 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3173 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3174 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3175 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3176 /* enable powergating to save power */
3177 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
3178 state);
3179 if (r) {
3180 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3181 adev->ip_blocks[i].version->funcs->name, r);
3182 return r;
3183 }
3184 }
3185 }
3186 return 0;
3187}
3188
3189static int amdgpu_device_enable_mgpu_fan_boost(void)
3190{
3191 struct amdgpu_gpu_instance *gpu_ins;
3192 struct amdgpu_device *adev;
3193 int i, ret = 0;
3194
3195 mutex_lock(&mgpu_info.mutex);
3196
3197 /*
3198 * MGPU fan boost feature should be enabled
3199 * only when there are two or more dGPUs in
3200 * the system
3201 */
3202 if (mgpu_info.num_dgpu < 2)
3203 goto out;
3204
3205 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3206 gpu_ins = &(mgpu_info.gpu_ins[i]);
3207 adev = gpu_ins->adev;
3208 if (!(adev->flags & AMD_IS_APU) &&
3209 !gpu_ins->mgpu_fan_enabled) {
3210 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3211 if (ret)
3212 break;
3213
3214 gpu_ins->mgpu_fan_enabled = 1;
3215 }
3216 }
3217
3218out:
3219 mutex_unlock(&mgpu_info.mutex);
3220
3221 return ret;
3222}
3223
3224/**
3225 * amdgpu_device_ip_late_init - run late init for hardware IPs
3226 *
3227 * @adev: amdgpu_device pointer
3228 *
3229 * Late initialization pass for hardware IPs. The list of all the hardware
3230 * IPs that make up the asic is walked and the late_init callbacks are run.
3231 * late_init covers any special initialization that an IP requires
3232 * after all of the IPs have been initialized or something that needs to happen
3233 * late in the init process.
3234 * Returns 0 on success, negative error code on failure.
3235 */
3236static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3237{
3238 struct amdgpu_gpu_instance *gpu_instance;
3239 int i = 0, r;
3240
3241 for (i = 0; i < adev->num_ip_blocks; i++) {
3242 if (!adev->ip_blocks[i].status.hw)
3243 continue;
3244 if (adev->ip_blocks[i].version->funcs->late_init) {
3245 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3246 if (r) {
3247 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3248 adev->ip_blocks[i].version->funcs->name, r);
3249 return r;
3250 }
3251 }
3252 adev->ip_blocks[i].status.late_initialized = true;
3253 }
3254
3255 r = amdgpu_ras_late_init(adev);
3256 if (r) {
3257 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3258 return r;
3259 }
3260
3261 if (!amdgpu_reset_in_recovery(adev))
3262 amdgpu_ras_set_error_query_ready(adev, true);
3263
3264 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3265 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3266
3267 amdgpu_device_fill_reset_magic(adev);
3268
3269 r = amdgpu_device_enable_mgpu_fan_boost();
3270 if (r)
3271 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3272
3273 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
3274 if (amdgpu_passthrough(adev) &&
3275 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3276 adev->asic_type == CHIP_ALDEBARAN))
3277 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3278
3279 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3280 mutex_lock(&mgpu_info.mutex);
3281
3282 /*
3283 * Reset device p-state to low as this was booted with high.
3284 *
3285 * This should be performed only after all devices from the same
3286 * hive get initialized.
3287 *
3288 * However, the number of devices in the hive is not known in advance;
3289 * it is counted one by one as the devices are initialized.
3290 *
3291 * So, we wait for all XGMI interlinked devices initialized.
3292 * This may bring some delays as those devices may come from
3293 * different hives. But that should be OK.
3294 */
3295 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3296 for (i = 0; i < mgpu_info.num_gpu; i++) {
3297 gpu_instance = &(mgpu_info.gpu_ins[i]);
3298 if (gpu_instance->adev->flags & AMD_IS_APU)
3299 continue;
3300
3301 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3302 AMDGPU_XGMI_PSTATE_MIN);
3303 if (r) {
3304 DRM_ERROR("pstate setting failed (%d).\n", r);
3305 break;
3306 }
3307 }
3308 }
3309
3310 mutex_unlock(&mgpu_info.mutex);
3311 }
3312
3313 return 0;
3314}
3315
3316static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
3317{
3318 int r;
3319
3320 if (!ip_block->version->funcs->hw_fini) {
3321 DRM_ERROR("hw_fini of IP block <%s> not defined\n",
3322 ip_block->version->funcs->name);
3323 } else {
3324 r = ip_block->version->funcs->hw_fini(ip_block);
3325 /* XXX handle errors */
3326 if (r) {
3327 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3328 ip_block->version->funcs->name, r);
3329 }
3330 }
3331
3332 ip_block->status.hw = false;
3333}
3334
3335/**
3336 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3337 *
3338 * @adev: amdgpu_device pointer
3339 *
3340 * For ASICs that need to disable the SMC first
3341 */
3342static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3343{
3344 int i;
3345
3346 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3347 return;
3348
3349 for (i = 0; i < adev->num_ip_blocks; i++) {
3350 if (!adev->ip_blocks[i].status.hw)
3351 continue;
3352 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3353 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3354 break;
3355 }
3356 }
3357}
3358
3359static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3360{
3361 int i, r;
3362
3363 for (i = 0; i < adev->num_ip_blocks; i++) {
3364 if (!adev->ip_blocks[i].version->funcs->early_fini)
3365 continue;
3366
3367 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
3368 if (r) {
3369 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3370 adev->ip_blocks[i].version->funcs->name, r);
3371 }
3372 }
3373
3374 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3375 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3376
3377 amdgpu_amdkfd_suspend(adev, false);
3378
3379 /* Workaround for ASICs that need to disable the SMC first */
3380 amdgpu_device_smu_fini_early(adev);
3381
3382 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3383 if (!adev->ip_blocks[i].status.hw)
3384 continue;
3385
3386 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3387 }
3388
3389 if (amdgpu_sriov_vf(adev)) {
3390 if (amdgpu_virt_release_full_gpu(adev, false))
3391 DRM_ERROR("failed to release exclusive mode on fini\n");
3392 }
3393
3394 return 0;
3395}
3396
3397/**
3398 * amdgpu_device_ip_fini - run fini for hardware IPs
3399 *
3400 * @adev: amdgpu_device pointer
3401 *
3402 * Main teardown pass for hardware IPs. The list of all the hardware
3403 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3404 * are run. hw_fini tears down the hardware associated with each IP
3405 * and sw_fini tears down any software state associated with each IP.
3406 * Returns 0 on success, negative error code on failure.
3407 */
3408static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3409{
3410 int i, r;
3411
3412 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3413 amdgpu_virt_release_ras_err_handler_data(adev);
3414
3415 if (adev->gmc.xgmi.num_physical_nodes > 1)
3416 amdgpu_xgmi_remove_device(adev);
3417
3418 amdgpu_amdkfd_device_fini_sw(adev);
3419
3420 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3421 if (!adev->ip_blocks[i].status.sw)
3422 continue;
3423
3424 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3425 amdgpu_ucode_free_bo(adev);
3426 amdgpu_free_static_csa(&adev->virt.csa_obj);
3427 amdgpu_device_wb_fini(adev);
3428 amdgpu_device_mem_scratch_fini(adev);
3429 amdgpu_ib_pool_fini(adev);
3430 amdgpu_seq64_fini(adev);
3431 }
3432 if (adev->ip_blocks[i].version->funcs->sw_fini) {
3433 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
3434 /* XXX handle errors */
3435 if (r) {
3436 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3437 adev->ip_blocks[i].version->funcs->name, r);
3438 }
3439 }
3440 adev->ip_blocks[i].status.sw = false;
3441 adev->ip_blocks[i].status.valid = false;
3442 }
3443
3444 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3445 if (!adev->ip_blocks[i].status.late_initialized)
3446 continue;
3447 if (adev->ip_blocks[i].version->funcs->late_fini)
3448 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
3449 adev->ip_blocks[i].status.late_initialized = false;
3450 }
3451
3452 amdgpu_ras_fini(adev);
3453
3454 return 0;
3455}
3456
3457/**
3458 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3459 *
3460 * @work: work_struct.
3461 */
3462static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3463{
3464 struct amdgpu_device *adev =
3465 container_of(work, struct amdgpu_device, delayed_init_work.work);
3466 int r;
3467
3468 r = amdgpu_ib_ring_tests(adev);
3469 if (r)
3470 DRM_ERROR("ib ring test failed (%d).\n", r);
3471}
3472
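/*
 * Delayed work handler that lets the SMU actually enter GFXOFF once all
 * requests to keep the GFX block powered up (gfx_off_req_count) have been
 * dropped.
 */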
3473static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3474{
3475 struct amdgpu_device *adev =
3476 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3477
3478 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3479 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3480
3481 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
3482 adev->gfx.gfx_off_state = true;
3483}
3484
3485/**
3486 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3487 *
3488 * @adev: amdgpu_device pointer
3489 *
3490 * Main suspend function for hardware IPs. The list of all the hardware
3491 * IPs that make up the asic is walked, clockgating is disabled and the
3492 * suspend callbacks are run. suspend puts the hardware and software state
3493 * in each IP into a state suitable for suspend.
3494 * Returns 0 on success, negative error code on failure.
3495 */
3496static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3497{
3498 int i, r;
3499
3500 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3501 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3502
3503 /*
3504 * Per PMFW team's suggestion, driver needs to handle gfxoff
3505 * and df cstate features disablement for gpu reset (e.g. Mode1Reset)
3506 * scenario. Add the missing df cstate disablement here.
3507 */
3508 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3509 dev_warn(adev->dev, "Failed to disallow df cstate");
3510
3511 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3512 if (!adev->ip_blocks[i].status.valid)
3513 continue;
3514
3515 /* displays are handled separately */
3516 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3517 continue;
3518
3519 /* XXX handle errors */
3520 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3521 if (r)
3522 return r;
3523 }
3524
3525 return 0;
3526}
3527
3528/**
3529 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3530 *
3531 * @adev: amdgpu_device pointer
3532 *
3533 * Main suspend function for hardware IPs. The list of all the hardware
3534 * IPs that make up the asic is walked, clockgating is disabled and the
3535 * suspend callbacks are run. suspend puts the hardware and software state
3536 * in each IP into a state suitable for suspend.
3537 * Returns 0 on success, negative error code on failure.
3538 */
3539static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3540{
3541 int i, r;
3542
3543 if (adev->in_s0ix)
3544 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3545
3546 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3547 if (!adev->ip_blocks[i].status.valid)
3548 continue;
3549 /* displays are handled in phase1 */
3550 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3551 continue;
3552 /* PSP lost connection when err_event_athub occurs */
3553 if (amdgpu_ras_intr_triggered() &&
3554 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3555 adev->ip_blocks[i].status.hw = false;
3556 continue;
3557 }
3558
3559 /* skip unnecessary suspend if we have not initialized them yet */
3560 if (!amdgpu_ip_member_of_hwini(
3561 adev, adev->ip_blocks[i].version->type))
3562 continue;
3563
3564 /* skip suspend of gfx/mes and psp for S0ix
3565 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3566 * like at runtime. PSP is also part of the always on hardware
3567 * so no need to suspend it.
3568 */
3569 if (adev->in_s0ix &&
3570 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3571 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3572 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3573 continue;
3574
3575 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3576 if (adev->in_s0ix &&
3577 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3578 IP_VERSION(5, 0, 0)) &&
3579 (adev->ip_blocks[i].version->type ==
3580 AMD_IP_BLOCK_TYPE_SDMA))
3581 continue;
3582
3583 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
3584 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3585 * from this location and RLC Autoload automatically also gets loaded
3586 * from here based on PMFW -> PSP message during re-init sequence.
3587 * Therefore, the psp suspend & resume should be skipped to avoid destroying
3588 * the TMR and reloading FWs again for IMU enabled APU ASICs.
3589 */
3590 if (amdgpu_in_reset(adev) &&
3591 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3592 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3593 continue;
3594
3595 /* XXX handle errors */
3596 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3597 adev->ip_blocks[i].status.hw = false;
3598
3599 /* handle putting the SMC in the appropriate state */
3600 if (!amdgpu_sriov_vf(adev)) {
3601 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3602 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3603 if (r) {
3604 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3605 adev->mp1_state, r);
3606 return r;
3607 }
3608 }
3609 }
3610 }
3611
3612 return 0;
3613}
3614
3615/**
3616 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3617 *
3618 * @adev: amdgpu_device pointer
3619 *
3620 * Main suspend function for hardware IPs. The list of all the hardware
3621 * IPs that make up the asic is walked, clockgating is disabled and the
3622 * suspend callbacks are run. suspend puts the hardware and software state
3623 * in each IP into a state suitable for suspend.
3624 * Returns 0 on success, negative error code on failure.
3625 */
3626int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3627{
3628 int r;
3629
3630 if (amdgpu_sriov_vf(adev)) {
3631 amdgpu_virt_fini_data_exchange(adev);
3632 amdgpu_virt_request_full_gpu(adev, false);
3633 }
3634
3635 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3636
3637 r = amdgpu_device_ip_suspend_phase1(adev);
3638 if (r)
3639 return r;
3640 r = amdgpu_device_ip_suspend_phase2(adev);
3641
3642 if (amdgpu_sriov_vf(adev))
3643 amdgpu_virt_release_full_gpu(adev, false);
3644
3645 return r;
3646}
3647
3648static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3649{
3650 int i, r;
3651
3652 static enum amd_ip_block_type ip_order[] = {
3653 AMD_IP_BLOCK_TYPE_COMMON,
3654 AMD_IP_BLOCK_TYPE_GMC,
3655 AMD_IP_BLOCK_TYPE_PSP,
3656 AMD_IP_BLOCK_TYPE_IH,
3657 };
3658
3659 for (i = 0; i < adev->num_ip_blocks; i++) {
3660 int j;
3661 struct amdgpu_ip_block *block;
3662
3663 block = &adev->ip_blocks[i];
3664 block->status.hw = false;
3665
3666 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3667
3668 if (block->version->type != ip_order[j] ||
3669 !block->status.valid)
3670 continue;
3671
3672 r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
3673 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3674 if (r)
3675 return r;
3676 block->status.hw = true;
3677 }
3678 }
3679
3680 return 0;
3681}
3682
3683static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3684{
3685 int i, r;
3686
3687 static enum amd_ip_block_type ip_order[] = {
3688 AMD_IP_BLOCK_TYPE_SMC,
3689 AMD_IP_BLOCK_TYPE_DCE,
3690 AMD_IP_BLOCK_TYPE_GFX,
3691 AMD_IP_BLOCK_TYPE_SDMA,
3692 AMD_IP_BLOCK_TYPE_MES,
3693 AMD_IP_BLOCK_TYPE_UVD,
3694 AMD_IP_BLOCK_TYPE_VCE,
3695 AMD_IP_BLOCK_TYPE_VCN,
3696 AMD_IP_BLOCK_TYPE_JPEG
3697 };
3698
3699 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3700 int j;
3701 struct amdgpu_ip_block *block;
3702
3703 for (j = 0; j < adev->num_ip_blocks; j++) {
3704 block = &adev->ip_blocks[j];
3705
3706 if (block->version->type != ip_order[i] ||
3707 !block->status.valid ||
3708 block->status.hw)
3709 continue;
3710
3711 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
3712 r = amdgpu_ip_block_resume(block);
3713 if (r)
3714 return r;
3715 } else {
3716 r = block->version->funcs->hw_init(block);
3717 if (r) {
3718 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
3719 block->version->funcs->name, r);
3720 return r;
3721 }
3722 block->status.hw = true;
3723 }
3724 }
3725 }
3726
3727 return 0;
3728}
3729
3730/**
3731 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3732 *
3733 * @adev: amdgpu_device pointer
3734 *
3735 * First resume function for hardware IPs. The list of all the hardware
3736 * IPs that make up the asic is walked and the resume callbacks are run for
3737 * COMMON, GMC, IH and, for SR-IOV, PSP. resume puts the hardware into a functional state
3738 * after a suspend and updates the software state as necessary. This
3739 * function is also used for restoring the GPU after a GPU reset.
3740 * Returns 0 on success, negative error code on failure.
3741 */
3742static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3743{
3744 int i, r;
3745
3746 for (i = 0; i < adev->num_ip_blocks; i++) {
3747 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3748 continue;
3749 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3750 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3751 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3752 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3753
3754 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3755 if (r)
3756 return r;
3757 }
3758 }
3759
3760 return 0;
3761}
3762
3763/**
3764 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3765 *
3766 * @adev: amdgpu_device pointer
3767 *
3768  * Second resume function for hardware IPs. The list of all the hardware
3769  * IPs that make up the asic is walked and the resume callbacks are run for
3770  * all blocks except COMMON, GMC, IH, and PSP. resume puts the hardware into a
3771 * functional state after a suspend and updates the software state as
3772 * necessary. This function is also used for restoring the GPU after a GPU
3773 * reset.
3774 * Returns 0 on success, negative error code on failure.
3775 */
3776static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3777{
3778 int i, r;
3779
3780 for (i = 0; i < adev->num_ip_blocks; i++) {
3781 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3782 continue;
3783 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3784 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3785 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3786 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3787 continue;
3788 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3789 if (r)
3790 return r;
3791 }
3792
3793 return 0;
3794}
3795
3796/**
3797 * amdgpu_device_ip_resume - run resume for hardware IPs
3798 *
3799 * @adev: amdgpu_device pointer
3800 *
3801 * Main resume function for hardware IPs. The hardware IPs
3802 * are split into two resume functions because they are
3803 * also used in recovering from a GPU reset and some additional
3804  * steps need to be taken between them. In this case (S3/S4) they are
3805 * run sequentially.
3806 * Returns 0 on success, negative error code on failure.
3807 */
3808static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3809{
3810 int r;
3811
3812 r = amdgpu_device_ip_resume_phase1(adev);
3813 if (r)
3814 return r;
3815
3816 r = amdgpu_device_fw_loading(adev);
3817 if (r)
3818 return r;
3819
3820 r = amdgpu_device_ip_resume_phase2(adev);
3821
3822 if (adev->mman.buffer_funcs_ring->sched.ready)
3823 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3824
3825 return r;
3826}
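
/*
 * Condensed view of the resume ordering enforced above (illustrative
 * only, mirroring the calls in amdgpu_device_ip_resume()):
 *
 *	amdgpu_device_ip_resume_phase1(adev);	// COMMON, GMC, IH (+ PSP on VFs)
 *	amdgpu_device_fw_loading(adev);		// firmware loading must complete between the phases
 *	amdgpu_device_ip_resume_phase2(adev);	// all remaining blocks (GFX, SDMA, VCN, ...)
 *	amdgpu_ttm_set_buffer_funcs_status(adev, true);	// once the DMA ring is ready again
 */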
3827
3828/**
3829 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3830 *
3831 * @adev: amdgpu_device pointer
3832 *
3833 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3834 */
3835static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3836{
3837 if (amdgpu_sriov_vf(adev)) {
3838 if (adev->is_atom_fw) {
3839 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3840 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3841 } else {
3842 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3843 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3844 }
3845
3846 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3847 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3848 }
3849}
3850
3851/**
3852 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3853 *
3854 * @asic_type: AMD asic type
3855 *
3856  * Check if there is DC (new modesetting infrastructure) support for an asic.
3857 * returns true if DC has support, false if not.
3858 */
3859bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3860{
3861 switch (asic_type) {
3862#ifdef CONFIG_DRM_AMDGPU_SI
3863 case CHIP_HAINAN:
3864#endif
3865 case CHIP_TOPAZ:
3866 /* chips with no display hardware */
3867 return false;
3868#if defined(CONFIG_DRM_AMD_DC)
3869 case CHIP_TAHITI:
3870 case CHIP_PITCAIRN:
3871 case CHIP_VERDE:
3872 case CHIP_OLAND:
3873 /*
3874 * We have systems in the wild with these ASICs that require
3875 * LVDS and VGA support which is not supported with DC.
3876 *
3877 * Fallback to the non-DC driver here by default so as not to
3878 * cause regressions.
3879 */
3880#if defined(CONFIG_DRM_AMD_DC_SI)
3881 return amdgpu_dc > 0;
3882#else
3883 return false;
3884#endif
3885 case CHIP_BONAIRE:
3886 case CHIP_KAVERI:
3887 case CHIP_KABINI:
3888 case CHIP_MULLINS:
3889 /*
3890 * We have systems in the wild with these ASICs that require
3891 * VGA support which is not supported with DC.
3892 *
3893 * Fallback to the non-DC driver here by default so as not to
3894 * cause regressions.
3895 */
3896 return amdgpu_dc > 0;
3897 default:
3898 return amdgpu_dc != 0;
3899#else
3900 default:
3901 if (amdgpu_dc > 0)
3902 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3903 return false;
3904#endif
3905 }
3906}
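
/*
 * Summary (illustrative, derived from the switch above) of how the
 * amdgpu_dc module parameter feeds this check when DC is compiled in;
 * without CONFIG_DRM_AMD_DC the result is always false:
 *
 *	amdgpu.dc=-1 (default): DC on ASICs with full DC support; the legacy
 *	                        SI/CIK parts listed above stay on the non-DC path
 *	amdgpu.dc=0:            DC disabled
 *	amdgpu.dc=1:            DC forced on, including the legacy parts
 *	                        (SI additionally needs CONFIG_DRM_AMD_DC_SI)
 */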
3907
3908/**
3909 * amdgpu_device_has_dc_support - check if dc is supported
3910 *
3911 * @adev: amdgpu_device pointer
3912 *
3913 * Returns true for supported, false for not supported
3914 */
3915bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3916{
3917 if (adev->enable_virtual_display ||
3918 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3919 return false;
3920
3921 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3922}
3923
3924static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3925{
3926 struct amdgpu_device *adev =
3927 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3928 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3929
3930 /* It's a bug to not have a hive within this function */
3931 if (WARN_ON(!hive))
3932 return;
3933
3934 /*
3935 * Use task barrier to synchronize all xgmi reset works across the
3936 * hive. task_barrier_enter and task_barrier_exit will block
3937 * until all the threads running the xgmi reset works reach
3938 * those points. task_barrier_full will do both blocks.
3939 */
3940 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3941
3942 task_barrier_enter(&hive->tb);
3943 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3944
3945 if (adev->asic_reset_res)
3946 goto fail;
3947
3948 task_barrier_exit(&hive->tb);
3949 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3950
3951 if (adev->asic_reset_res)
3952 goto fail;
3953
3954 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
3955 } else {
3956
3957 task_barrier_full(&hive->tb);
3958 adev->asic_reset_res = amdgpu_asic_reset(adev);
3959 }
3960
3961fail:
3962 if (adev->asic_reset_res)
3963 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3964 adev->asic_reset_res, adev_to_drm(adev)->unique);
3965 amdgpu_put_xgmi_hive(hive);
3966}
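
/*
 * A condensed sketch of the task barrier usage above: each node in the
 * hive runs this work in parallel, and the barrier keeps the BACO
 * enter/exit steps lock-stepped across all nodes.
 *
 *	task_barrier_enter(&hive->tb);	// wait until every node reaches here
 *	...BACO enter on this node...
 *	task_barrier_exit(&hive->tb);	// wait again before anyone leaves BACO
 *	...BACO exit on this node...
 *
 * task_barrier_full() performs both waits back to back for the
 * non-BACO (full ASIC reset) path.
 */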
3967
3968static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3969{
3970 char *input = amdgpu_lockup_timeout;
3971 char *timeout_setting = NULL;
3972 int index = 0;
3973 long timeout;
3974 int ret = 0;
3975
3976 /*
3977 	 * By default the timeout for non-compute jobs is 10000 ms and
3978 	 * 60000 ms for compute jobs.
3979 	 * In SR-IOV, the compute timeout defaults to 60000 ms only in
3980 	 * pp_one_vf mode; otherwise it is 10000 ms.
3981 */
3982 adev->gfx_timeout = msecs_to_jiffies(10000);
3983 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3984 if (amdgpu_sriov_vf(adev))
3985 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3986 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3987 else
3988 adev->compute_timeout = msecs_to_jiffies(60000);
3989
3990 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3991 while ((timeout_setting = strsep(&input, ",")) &&
3992 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3993 ret = kstrtol(timeout_setting, 0, &timeout);
3994 if (ret)
3995 return ret;
3996
3997 if (timeout == 0) {
3998 index++;
3999 continue;
4000 } else if (timeout < 0) {
4001 timeout = MAX_SCHEDULE_TIMEOUT;
4002 dev_warn(adev->dev, "lockup timeout disabled");
4003 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4004 } else {
4005 timeout = msecs_to_jiffies(timeout);
4006 }
4007
4008 switch (index++) {
4009 case 0:
4010 adev->gfx_timeout = timeout;
4011 break;
4012 case 1:
4013 adev->compute_timeout = timeout;
4014 break;
4015 case 2:
4016 adev->sdma_timeout = timeout;
4017 break;
4018 case 3:
4019 adev->video_timeout = timeout;
4020 break;
4021 default:
4022 break;
4023 }
4024 }
4025 /*
4026 * There is only one value specified and
4027 * it should apply to all non-compute jobs.
4028 */
4029 if (index == 1) {
4030 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4031 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4032 adev->compute_timeout = adev->gfx_timeout;
4033 }
4034 }
4035
4036 return ret;
4037}
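
/*
 * Example (illustrative) of the comma-separated lockup_timeout parameter
 * parsed above; the values map to gfx, compute, sdma, video in that
 * order (milliseconds, 0 keeps the default, a negative value disables
 * the timeout):
 *
 *	amdgpu.lockup_timeout=10000,60000,10000,10000
 *
 * A single value applies to all non-compute engines; under SR-IOV or
 * passthrough it is applied to compute as well.
 */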
4038
4039/**
4040 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4041 *
4042 * @adev: amdgpu_device pointer
4043 *
4044  * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough (identity) mode.
4045 */
4046static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4047{
4048 struct iommu_domain *domain;
4049
4050 domain = iommu_get_domain_for_dev(adev->dev);
4051 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4052 adev->ram_is_direct_mapped = true;
4053}
4054
4055#if defined(CONFIG_HSA_AMD_P2P)
4056/**
4057 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4058 *
4059 * @adev: amdgpu_device pointer
4060 *
4061  * Returns true if the IOMMU remaps DMA (and thus BAR) addresses for the device.
4062 */
4063static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4064{
4065 struct iommu_domain *domain;
4066
4067 domain = iommu_get_domain_for_dev(adev->dev);
4068 if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4069 domain->type == IOMMU_DOMAIN_DMA_FQ))
4070 return true;
4071
4072 return false;
4073}
4074#endif
4075
4076static const struct attribute *amdgpu_dev_attributes[] = {
4077 &dev_attr_pcie_replay_count.attr,
4078 NULL
4079};
4080
4081static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4082{
4083 if (amdgpu_mcbp == 1)
4084 adev->gfx.mcbp = true;
4085 else if (amdgpu_mcbp == 0)
4086 adev->gfx.mcbp = false;
4087
4088 if (amdgpu_sriov_vf(adev))
4089 adev->gfx.mcbp = true;
4090
4091 if (adev->gfx.mcbp)
4092 DRM_INFO("MCBP is enabled\n");
4093}
4094
4095/**
4096 * amdgpu_device_init - initialize the driver
4097 *
4098 * @adev: amdgpu_device pointer
4099 * @flags: driver flags
4100 *
4101 * Initializes the driver info and hw (all asics).
4102 * Returns 0 for success or an error on failure.
4103 * Called at driver startup.
4104 */
4105int amdgpu_device_init(struct amdgpu_device *adev,
4106 uint32_t flags)
4107{
4108 struct drm_device *ddev = adev_to_drm(adev);
4109 struct pci_dev *pdev = adev->pdev;
4110 int r, i;
4111 bool px = false;
4112 u32 max_MBps;
4113 int tmp;
4114
4115 adev->shutdown = false;
4116 adev->flags = flags;
4117
4118 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4119 adev->asic_type = amdgpu_force_asic_type;
4120 else
4121 adev->asic_type = flags & AMD_ASIC_MASK;
4122
4123 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4124 if (amdgpu_emu_mode == 1)
4125 adev->usec_timeout *= 10;
4126 adev->gmc.gart_size = 512 * 1024 * 1024;
4127 adev->accel_working = false;
4128 adev->num_rings = 0;
4129 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4130 adev->mman.buffer_funcs = NULL;
4131 adev->mman.buffer_funcs_ring = NULL;
4132 adev->vm_manager.vm_pte_funcs = NULL;
4133 adev->vm_manager.vm_pte_num_scheds = 0;
4134 adev->gmc.gmc_funcs = NULL;
4135 adev->harvest_ip_mask = 0x0;
4136 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4137 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4138
4139 adev->smc_rreg = &amdgpu_invalid_rreg;
4140 adev->smc_wreg = &amdgpu_invalid_wreg;
4141 adev->pcie_rreg = &amdgpu_invalid_rreg;
4142 adev->pcie_wreg = &amdgpu_invalid_wreg;
4143 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4144 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4145 adev->pciep_rreg = &amdgpu_invalid_rreg;
4146 adev->pciep_wreg = &amdgpu_invalid_wreg;
4147 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4148 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4149 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4150 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4151 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4152 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4153 adev->didt_rreg = &amdgpu_invalid_rreg;
4154 adev->didt_wreg = &amdgpu_invalid_wreg;
4155 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4156 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4157 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4158 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4159
4160 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4161 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4162 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4163
4164 	/* mutex initialization is all done here so we
4165 	 * can recall the function without locking issues
4166 */
4167 mutex_init(&adev->firmware.mutex);
4168 mutex_init(&adev->pm.mutex);
4169 mutex_init(&adev->gfx.gpu_clock_mutex);
4170 mutex_init(&adev->srbm_mutex);
4171 mutex_init(&adev->gfx.pipe_reserve_mutex);
4172 mutex_init(&adev->gfx.gfx_off_mutex);
4173 mutex_init(&adev->gfx.partition_mutex);
4174 mutex_init(&adev->grbm_idx_mutex);
4175 mutex_init(&adev->mn_lock);
4176 mutex_init(&adev->virt.vf_errors.lock);
4177 mutex_init(&adev->virt.rlcg_reg_lock);
4178 hash_init(adev->mn_hash);
4179 mutex_init(&adev->psp.mutex);
4180 mutex_init(&adev->notifier_lock);
4181 mutex_init(&adev->pm.stable_pstate_ctx_lock);
4182 mutex_init(&adev->benchmark_mutex);
4183 mutex_init(&adev->gfx.reset_sem_mutex);
4184 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4185 mutex_init(&adev->enforce_isolation_mutex);
4186 mutex_init(&adev->gfx.kfd_sch_mutex);
4187
4188 amdgpu_device_init_apu_flags(adev);
4189
4190 r = amdgpu_device_check_arguments(adev);
4191 if (r)
4192 return r;
4193
4194 spin_lock_init(&adev->mmio_idx_lock);
4195 spin_lock_init(&adev->smc_idx_lock);
4196 spin_lock_init(&adev->pcie_idx_lock);
4197 spin_lock_init(&adev->uvd_ctx_idx_lock);
4198 spin_lock_init(&adev->didt_idx_lock);
4199 spin_lock_init(&adev->gc_cac_idx_lock);
4200 spin_lock_init(&adev->se_cac_idx_lock);
4201 spin_lock_init(&adev->audio_endpt_idx_lock);
4202 spin_lock_init(&adev->mm_stats.lock);
4203 spin_lock_init(&adev->wb.lock);
4204
4205 INIT_LIST_HEAD(&adev->reset_list);
4206
4207 INIT_LIST_HEAD(&adev->ras_list);
4208
4209 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4210
4211 INIT_DELAYED_WORK(&adev->delayed_init_work,
4212 amdgpu_device_delayed_init_work_handler);
4213 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4214 amdgpu_device_delay_enable_gfx_off);
4215 /*
4216 * Initialize the enforce_isolation work structures for each XCP
4217 * partition. This work handler is responsible for enforcing shader
4218 * isolation on AMD GPUs. It counts the number of emitted fences for
4219 * each GFX and compute ring. If there are any fences, it schedules
4220 * the `enforce_isolation_work` to be run after a delay. If there are
4221 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4222 * runqueue.
4223 */
4224 for (i = 0; i < MAX_XCP; i++) {
4225 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4226 amdgpu_gfx_enforce_isolation_handler);
4227 adev->gfx.enforce_isolation[i].adev = adev;
4228 adev->gfx.enforce_isolation[i].xcp_id = i;
4229 }
4230
4231 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4232
4233 adev->gfx.gfx_off_req_count = 1;
4234 adev->gfx.gfx_off_residency = 0;
4235 adev->gfx.gfx_off_entrycount = 0;
4236 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4237
4238 atomic_set(&adev->throttling_logging_enabled, 1);
4239 /*
4240 * If throttling continues, logging will be performed every minute
4241 * to avoid log flooding. "-1" is subtracted since the thermal
4242 * throttling interrupt comes every second. Thus, the total logging
4243 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4244 	 * for the throttling interrupt) = 60 seconds.
4245 */
4246 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4247 ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1);
4248
4249 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4250 ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE);
4251
4252 /* Registers mapping */
4253 /* TODO: block userspace mapping of io register */
4254 if (adev->asic_type >= CHIP_BONAIRE) {
4255 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4256 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4257 } else {
4258 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4259 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4260 }
4261
4262 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4263 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4264
4265 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4266 if (!adev->rmmio)
4267 return -ENOMEM;
4268
4269 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4270 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4271
4272 /*
4273 	 * The reset domain needs to be present early, before any XGMI hive is
4274 	 * discovered and initialized, so that the reset semaphore and in-GPU
4275 	 * reset flag can be used during early init and before the first RREG32.
4276 */
4277 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4278 if (!adev->reset_domain)
4279 return -ENOMEM;
4280
4281 /* detect hw virtualization here */
4282 amdgpu_detect_virtualization(adev);
4283
4284 amdgpu_device_get_pcie_info(adev);
4285
4286 r = amdgpu_device_get_job_timeout_settings(adev);
4287 if (r) {
4288 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4289 return r;
4290 }
4291
4292 amdgpu_device_set_mcbp(adev);
4293
4294 /*
4295 * By default, use default mode where all blocks are expected to be
4296 * initialized. At present a 'swinit' of blocks is required to be
4297 * completed before the need for a different level is detected.
4298 */
4299 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4300 /* early init functions */
4301 r = amdgpu_device_ip_early_init(adev);
4302 if (r)
4303 return r;
4304
4305 /* Get rid of things like offb */
4306 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4307 if (r)
4308 return r;
4309
4310 /* Enable TMZ based on IP_VERSION */
4311 amdgpu_gmc_tmz_set(adev);
4312
4313 if (amdgpu_sriov_vf(adev) &&
4314 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4315 /* VF MMIO access (except mailbox range) from CPU
4316 		 * will be blocked during SR-IOV runtime
4317 */
4318 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4319
4320 amdgpu_gmc_noretry_set(adev);
4321 /* Need to get xgmi info early to decide the reset behavior*/
4322 if (adev->gmc.xgmi.supported) {
4323 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4324 if (r)
4325 return r;
4326 }
4327
4328 /* enable PCIE atomic ops */
4329 if (amdgpu_sriov_vf(adev)) {
4330 if (adev->virt.fw_reserve.p_pf2vf)
4331 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4332 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4333 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4334 		/* APUs with gfx9 onwards don't rely on PCIe atomics; their
4335 		 * internal path natively supports atomics, so set have_atomics_support to true.
4336 */
4337 } else if ((adev->flags & AMD_IS_APU) &&
4338 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4339 IP_VERSION(9, 0, 0))) {
4340 adev->have_atomics_support = true;
4341 } else {
4342 adev->have_atomics_support =
4343 !pci_enable_atomic_ops_to_root(adev->pdev,
4344 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4345 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4346 }
4347
4348 if (!adev->have_atomics_support)
4349 		dev_info(adev->dev, "PCIe atomic ops are not supported\n");
4350
4351 /* doorbell bar mapping and doorbell index init*/
4352 amdgpu_doorbell_init(adev);
4353
4354 if (amdgpu_emu_mode == 1) {
4355 /* post the asic on emulation mode */
4356 emu_soc_asic_init(adev);
4357 goto fence_driver_init;
4358 }
4359
4360 amdgpu_reset_init(adev);
4361
4362 /* detect if we are with an SRIOV vbios */
4363 if (adev->bios)
4364 amdgpu_device_detect_sriov_bios(adev);
4365
4366 /* check if we need to reset the asic
4367 * E.g., driver was not cleanly unloaded previously, etc.
4368 */
4369 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4370 if (adev->gmc.xgmi.num_physical_nodes) {
4371 dev_info(adev->dev, "Pending hive reset.\n");
4372 amdgpu_set_init_level(adev,
4373 AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
4374 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4375 !amdgpu_device_has_display_hardware(adev)) {
4376 r = psp_gpu_reset(adev);
4377 } else {
4378 tmp = amdgpu_reset_method;
4379 /* It should do a default reset when loading or reloading the driver,
4380 * regardless of the module parameter reset_method.
4381 */
4382 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4383 r = amdgpu_asic_reset(adev);
4384 amdgpu_reset_method = tmp;
4385 }
4386
4387 if (r) {
4388 dev_err(adev->dev, "asic reset on init failed\n");
4389 goto failed;
4390 }
4391 }
4392
4393 /* Post card if necessary */
4394 if (amdgpu_device_need_post(adev)) {
4395 if (!adev->bios) {
4396 dev_err(adev->dev, "no vBIOS found\n");
4397 r = -EINVAL;
4398 goto failed;
4399 }
4400 DRM_INFO("GPU posting now...\n");
4401 r = amdgpu_device_asic_init(adev);
4402 if (r) {
4403 dev_err(adev->dev, "gpu post error!\n");
4404 goto failed;
4405 }
4406 }
4407
4408 if (adev->bios) {
4409 if (adev->is_atom_fw) {
4410 /* Initialize clocks */
4411 r = amdgpu_atomfirmware_get_clock_info(adev);
4412 if (r) {
4413 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4414 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4415 goto failed;
4416 }
4417 } else {
4418 /* Initialize clocks */
4419 r = amdgpu_atombios_get_clock_info(adev);
4420 if (r) {
4421 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4422 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4423 goto failed;
4424 }
4425 /* init i2c buses */
4426 if (!amdgpu_device_has_dc_support(adev))
4427 amdgpu_atombios_i2c_init(adev);
4428 }
4429 }
4430
4431fence_driver_init:
4432 /* Fence driver */
4433 r = amdgpu_fence_driver_sw_init(adev);
4434 if (r) {
4435 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4436 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4437 goto failed;
4438 }
4439
4440 /* init the mode config */
4441 drm_mode_config_init(adev_to_drm(adev));
4442
4443 r = amdgpu_device_ip_init(adev);
4444 if (r) {
4445 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4446 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4447 goto release_ras_con;
4448 }
4449
4450 amdgpu_fence_driver_hw_init(adev);
4451
4452 dev_info(adev->dev,
4453 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4454 adev->gfx.config.max_shader_engines,
4455 adev->gfx.config.max_sh_per_se,
4456 adev->gfx.config.max_cu_per_sh,
4457 adev->gfx.cu_info.number);
4458
4459 adev->accel_working = true;
4460
4461 amdgpu_vm_check_compute_bug(adev);
4462
4463 /* Initialize the buffer migration limit. */
4464 if (amdgpu_moverate >= 0)
4465 max_MBps = amdgpu_moverate;
4466 else
4467 max_MBps = 8; /* Allow 8 MB/s. */
4468 /* Get a log2 for easy divisions. */
4469 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4470
4471 /*
4472 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4473 	 * Otherwise the mgpu fan boost feature will be skipped because the
4474 	 * gpu instance count would be too low.
4475 */
4476 amdgpu_register_gpu_instance(adev);
4477
4478 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4479 * explicit gating rather than handling it automatically.
4480 */
4481 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4482 r = amdgpu_device_ip_late_init(adev);
4483 if (r) {
4484 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4485 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4486 goto release_ras_con;
4487 }
4488 /* must succeed. */
4489 amdgpu_ras_resume(adev);
4490 queue_delayed_work(system_wq, &adev->delayed_init_work,
4491 msecs_to_jiffies(AMDGPU_RESUME_MS));
4492 }
4493
4494 if (amdgpu_sriov_vf(adev)) {
4495 amdgpu_virt_release_full_gpu(adev, true);
4496 flush_delayed_work(&adev->delayed_init_work);
4497 }
4498
4499 /*
4500 	 * Register these sysfs interfaces after `late_init`, since some of the
4501 	 * operations performed in `late_init` might affect how the sysfs
4502 	 * interfaces are created.
4503 */
4504 r = amdgpu_atombios_sysfs_init(adev);
4505 if (r)
4506 drm_err(&adev->ddev,
4507 "registering atombios sysfs failed (%d).\n", r);
4508
4509 r = amdgpu_pm_sysfs_init(adev);
4510 if (r)
4511 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4512
4513 r = amdgpu_ucode_sysfs_init(adev);
4514 if (r) {
4515 adev->ucode_sysfs_en = false;
4516 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4517 } else
4518 adev->ucode_sysfs_en = true;
4519
4520 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
4521 if (r)
4522 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4523
4524 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4525 if (r)
4526 dev_err(adev->dev,
4527 "Could not create amdgpu board attributes\n");
4528
4529 amdgpu_fru_sysfs_init(adev);
4530 amdgpu_reg_state_sysfs_init(adev);
4531 amdgpu_xcp_cfg_sysfs_init(adev);
4532
4533 	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
4534 		r = amdgpu_pmu_init(adev);
4535 		if (r)
4536 			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}
4537
4538 	/* Keep the stored PCI config space at hand to restore after a sudden PCI error */
4539 if (amdgpu_device_cache_pci_state(adev->pdev))
4540 pci_restore_state(pdev);
4541
4542 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4543 /* this will fail for cards that aren't VGA class devices, just
4544 * ignore it
4545 */
4546 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4547 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4548
4549 px = amdgpu_device_supports_px(ddev);
4550
4551 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4552 apple_gmux_detect(NULL, NULL)))
4553 vga_switcheroo_register_client(adev->pdev,
4554 &amdgpu_switcheroo_ops, px);
4555
4556 if (px)
4557 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4558
4559 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4560 amdgpu_xgmi_reset_on_init(adev);
4561
4562 amdgpu_device_check_iommu_direct_map(adev);
4563
4564 return 0;
4565
4566release_ras_con:
4567 if (amdgpu_sriov_vf(adev))
4568 amdgpu_virt_release_full_gpu(adev, true);
4569
4570 /* failed in exclusive mode due to timeout */
4571 if (amdgpu_sriov_vf(adev) &&
4572 !amdgpu_sriov_runtime(adev) &&
4573 amdgpu_virt_mmio_blocked(adev) &&
4574 !amdgpu_virt_wait_reset(adev)) {
4575 dev_err(adev->dev, "VF exclusive mode timeout\n");
4576 /* Don't send request since VF is inactive. */
4577 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4578 adev->virt.ops = NULL;
4579 r = -EAGAIN;
4580 }
4581 amdgpu_release_ras_context(adev);
4582
4583failed:
4584 amdgpu_vf_error_trans_all(adev);
4585
4586 return r;
4587}
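
/*
 * A minimal, hypothetical sketch (not the driver's actual probe code) of
 * how a probe path might drive the init/teardown entry points above and
 * below, including the -EAGAIN case the SR-IOV exclusive-mode timeout
 * handling can return:
 *
 *	r = amdgpu_device_init(adev, flags);
 *	if (r == -EAGAIN)
 *		...retry init once the VF regains exclusive access...
 *	else if (r)
 *		goto err_free;
 *	...
 *	amdgpu_device_fini_hw(adev);
 *	amdgpu_device_fini_sw(adev);
 */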
4588
4589static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4590{
4591
4592 /* Clear all CPU mappings pointing to this device */
4593 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4594
4595 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4596 amdgpu_doorbell_fini(adev);
4597
4598 iounmap(adev->rmmio);
4599 adev->rmmio = NULL;
4600 if (adev->mman.aper_base_kaddr)
4601 iounmap(adev->mman.aper_base_kaddr);
4602 adev->mman.aper_base_kaddr = NULL;
4603
4604 /* Memory manager related */
4605 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4606 arch_phys_wc_del(adev->gmc.vram_mtrr);
4607 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4608 }
4609}
4610
4611/**
4612 * amdgpu_device_fini_hw - tear down the driver
4613 *
4614 * @adev: amdgpu_device pointer
4615 *
4616 * Tear down the driver info (all asics).
4617 * Called at driver shutdown.
4618 */
4619void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4620{
4621 dev_info(adev->dev, "amdgpu: finishing device.\n");
4622 flush_delayed_work(&adev->delayed_init_work);
4623
4624 if (adev->mman.initialized)
4625 drain_workqueue(adev->mman.bdev.wq);
4626 adev->shutdown = true;
4627
4628 /* make sure IB test finished before entering exclusive mode
4629 * to avoid preemption on IB test
4630 */
4631 if (amdgpu_sriov_vf(adev)) {
4632 amdgpu_virt_request_full_gpu(adev, false);
4633 amdgpu_virt_fini_data_exchange(adev);
4634 }
4635
4636 /* disable all interrupts */
4637 amdgpu_irq_disable_all(adev);
4638 if (adev->mode_info.mode_config_initialized) {
4639 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4640 drm_helper_force_disable_all(adev_to_drm(adev));
4641 else
4642 drm_atomic_helper_shutdown(adev_to_drm(adev));
4643 }
4644 amdgpu_fence_driver_hw_fini(adev);
4645
4646 if (adev->pm.sysfs_initialized)
4647 amdgpu_pm_sysfs_fini(adev);
4648 if (adev->ucode_sysfs_en)
4649 amdgpu_ucode_sysfs_fini(adev);
4650 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4651 amdgpu_fru_sysfs_fini(adev);
4652
4653 amdgpu_reg_state_sysfs_fini(adev);
4654 amdgpu_xcp_cfg_sysfs_fini(adev);
4655
4656 /* disable ras feature must before hw fini */
4657 amdgpu_ras_pre_fini(adev);
4658
4659 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4660
4661 amdgpu_device_ip_fini_early(adev);
4662
4663 amdgpu_irq_fini_hw(adev);
4664
4665 if (adev->mman.initialized)
4666 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4667
4668 amdgpu_gart_dummy_page_fini(adev);
4669
4670 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4671 amdgpu_device_unmap_mmio(adev);
4672
4673}
4674
4675void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4676{
4677 int idx;
4678 bool px;
4679
4680 amdgpu_device_ip_fini(adev);
4681 amdgpu_fence_driver_sw_fini(adev);
4682 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4683 adev->accel_working = false;
4684 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4685
4686 amdgpu_reset_fini(adev);
4687
4688 /* free i2c buses */
4689 if (!amdgpu_device_has_dc_support(adev))
4690 amdgpu_i2c_fini(adev);
4691
4692 if (amdgpu_emu_mode != 1)
4693 amdgpu_atombios_fini(adev);
4694
4695 kfree(adev->bios);
4696 adev->bios = NULL;
4697
4698 kfree(adev->fru_info);
4699 adev->fru_info = NULL;
4700
4701 px = amdgpu_device_supports_px(adev_to_drm(adev));
4702
4703 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4704 apple_gmux_detect(NULL, NULL)))
4705 vga_switcheroo_unregister_client(adev->pdev);
4706
4707 if (px)
4708 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4709
4710 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4711 vga_client_unregister(adev->pdev);
4712
4713 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4714
4715 iounmap(adev->rmmio);
4716 adev->rmmio = NULL;
4717 amdgpu_doorbell_fini(adev);
4718 drm_dev_exit(idx);
4719 }
4720
4721 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4722 amdgpu_pmu_fini(adev);
4723 if (adev->mman.discovery_bin)
4724 amdgpu_discovery_fini(adev);
4725
4726 amdgpu_reset_put_reset_domain(adev->reset_domain);
4727 adev->reset_domain = NULL;
4728
4729 kfree(adev->pci_state);
4730
4731}
4732
4733/**
4734 * amdgpu_device_evict_resources - evict device resources
4735 * @adev: amdgpu device object
4736 *
4737  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4738 * of the vram memory type. Mainly used for evicting device resources
4739 * at suspend time.
4740 *
4741 */
4742static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4743{
4744 int ret;
4745
4746 /* No need to evict vram on APUs for suspend to ram or s2idle */
4747 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4748 return 0;
4749
4750 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4751 if (ret)
4752 DRM_WARN("evicting device resources failed\n");
4753 return ret;
4754}
4755
4756/*
4757 * Suspend & resume.
4758 */
4759/**
4760 * amdgpu_device_prepare - prepare for device suspend
4761 *
4762 * @dev: drm dev pointer
4763 *
4764 * Prepare to put the hw in the suspend state (all asics).
4765 * Returns 0 for success or an error on failure.
4766 * Called at driver suspend.
4767 */
4768int amdgpu_device_prepare(struct drm_device *dev)
4769{
4770 struct amdgpu_device *adev = drm_to_adev(dev);
4771 int i, r;
4772
4773 amdgpu_choose_low_power_state(adev);
4774
4775 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4776 return 0;
4777
4778 /* Evict the majority of BOs before starting suspend sequence */
4779 r = amdgpu_device_evict_resources(adev);
4780 if (r)
4781 goto unprepare;
4782
4783 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4784
4785 for (i = 0; i < adev->num_ip_blocks; i++) {
4786 if (!adev->ip_blocks[i].status.valid)
4787 continue;
4788 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4789 continue;
4790 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
4791 if (r)
4792 goto unprepare;
4793 }
4794
4795 return 0;
4796
4797unprepare:
4798 adev->in_s0ix = adev->in_s3 = false;
4799
4800 return r;
4801}
4802
4803/**
4804 * amdgpu_device_suspend - initiate device suspend
4805 *
4806 * @dev: drm dev pointer
4807 * @notify_clients: notify in-kernel DRM clients
4808 *
4809 * Puts the hw in the suspend state (all asics).
4810 * Returns 0 for success or an error on failure.
4811 * Called at driver suspend.
4812 */
4813int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
4814{
4815 struct amdgpu_device *adev = drm_to_adev(dev);
4816 int r = 0;
4817
4818 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4819 return 0;
4820
4821 adev->in_suspend = true;
4822
4823 if (amdgpu_sriov_vf(adev)) {
4824 amdgpu_virt_fini_data_exchange(adev);
4825 r = amdgpu_virt_request_full_gpu(adev, false);
4826 if (r)
4827 return r;
4828 }
4829
4830 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4831 DRM_WARN("smart shift update failed\n");
4832
4833 if (notify_clients)
4834 drm_client_dev_suspend(adev_to_drm(adev), false);
4835
4836 cancel_delayed_work_sync(&adev->delayed_init_work);
4837
4838 amdgpu_ras_suspend(adev);
4839
4840 amdgpu_device_ip_suspend_phase1(adev);
4841
4842 if (!adev->in_s0ix)
4843 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4844
4845 r = amdgpu_device_evict_resources(adev);
4846 if (r)
4847 return r;
4848
4849 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4850
4851 amdgpu_fence_driver_hw_fini(adev);
4852
4853 amdgpu_device_ip_suspend_phase2(adev);
4854
4855 if (amdgpu_sriov_vf(adev))
4856 amdgpu_virt_release_full_gpu(adev, false);
4857
4858 r = amdgpu_dpm_notify_rlc_state(adev, false);
4859 if (r)
4860 return r;
4861
4862 return 0;
4863}
4864
4865/**
4866 * amdgpu_device_resume - initiate device resume
4867 *
4868 * @dev: drm dev pointer
4869 * @notify_clients: notify in-kernel DRM clients
4870 *
4871 * Bring the hw back to operating state (all asics).
4872 * Returns 0 for success or an error on failure.
4873 * Called at driver resume.
4874 */
4875int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
4876{
4877 struct amdgpu_device *adev = drm_to_adev(dev);
4878 int r = 0;
4879
4880 if (amdgpu_sriov_vf(adev)) {
4881 r = amdgpu_virt_request_full_gpu(adev, true);
4882 if (r)
4883 return r;
4884 }
4885
4886 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4887 return 0;
4888
4889 if (adev->in_s0ix)
4890 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4891
4892 /* post card */
4893 if (amdgpu_device_need_post(adev)) {
4894 r = amdgpu_device_asic_init(adev);
4895 if (r)
4896 dev_err(adev->dev, "amdgpu asic init failed\n");
4897 }
4898
4899 r = amdgpu_device_ip_resume(adev);
4900
4901 if (r) {
4902 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4903 goto exit;
4904 }
4905 amdgpu_fence_driver_hw_init(adev);
4906
4907 if (!adev->in_s0ix) {
4908 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4909 if (r)
4910 goto exit;
4911 }
4912
4913 r = amdgpu_device_ip_late_init(adev);
4914 if (r)
4915 goto exit;
4916
4917 queue_delayed_work(system_wq, &adev->delayed_init_work,
4918 msecs_to_jiffies(AMDGPU_RESUME_MS));
4919exit:
4920 if (amdgpu_sriov_vf(adev)) {
4921 amdgpu_virt_init_data_exchange(adev);
4922 amdgpu_virt_release_full_gpu(adev, true);
4923 }
4924
4925 if (r)
4926 return r;
4927
4928 /* Make sure IB tests flushed */
4929 flush_delayed_work(&adev->delayed_init_work);
4930
4931 if (notify_clients)
4932 drm_client_dev_resume(adev_to_drm(adev), false);
4933
4934 amdgpu_ras_resume(adev);
4935
4936 if (adev->mode_info.num_crtc) {
4937 /*
4938 * Most of the connector probing functions try to acquire runtime pm
4939 * refs to ensure that the GPU is powered on when connector polling is
4940 * performed. Since we're calling this from a runtime PM callback,
4941 * trying to acquire rpm refs will cause us to deadlock.
4942 *
4943 * Since we're guaranteed to be holding the rpm lock, it's safe to
4944 * temporarily disable the rpm helpers so this doesn't deadlock us.
4945 */
4946#ifdef CONFIG_PM
4947 dev->dev->power.disable_depth++;
4948#endif
4949 if (!adev->dc_enabled)
4950 drm_helper_hpd_irq_event(dev);
4951 else
4952 drm_kms_helper_hotplug_event(dev);
4953#ifdef CONFIG_PM
4954 dev->dev->power.disable_depth--;
4955#endif
4956 }
4957 adev->in_suspend = false;
4958
4959 if (adev->enable_mes)
4960 amdgpu_mes_self_test(adev);
4961
4962 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4963 DRM_WARN("smart shift update failed\n");
4964
4965 return 0;
4966}
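
/*
 * A minimal, hypothetical sketch (not the driver's actual PM glue, which
 * lives in the PCI driver code) of how the suspend/resume entry points
 * above are typically wired into dev_pm_ops, assuming the drm_device is
 * stored as the device's drvdata:
 *
 *	static int my_pm_suspend(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_suspend(drm_dev, true);
 *	}
 *
 *	static int my_pm_resume(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_resume(drm_dev, true);
 *	}
 */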
4967
4968/**
4969 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4970 *
4971 * @adev: amdgpu_device pointer
4972 *
4973 * The list of all the hardware IPs that make up the asic is walked and
4974 * the check_soft_reset callbacks are run. check_soft_reset determines
4975 * if the asic is still hung or not.
4976 * Returns true if any of the IPs are still in a hung state, false if not.
4977 */
4978static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4979{
4980 int i;
4981 bool asic_hang = false;
4982
4983 if (amdgpu_sriov_vf(adev))
4984 return true;
4985
4986 if (amdgpu_asic_need_full_reset(adev))
4987 return true;
4988
4989 for (i = 0; i < adev->num_ip_blocks; i++) {
4990 if (!adev->ip_blocks[i].status.valid)
4991 continue;
4992 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4993 adev->ip_blocks[i].status.hang =
4994 adev->ip_blocks[i].version->funcs->check_soft_reset(
4995 &adev->ip_blocks[i]);
4996 if (adev->ip_blocks[i].status.hang) {
4997 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4998 asic_hang = true;
4999 }
5000 }
5001 return asic_hang;
5002}
5003
5004/**
5005 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5006 *
5007 * @adev: amdgpu_device pointer
5008 *
5009 * The list of all the hardware IPs that make up the asic is walked and the
5010 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
5011 * handles any IP specific hardware or software state changes that are
5012 * necessary for a soft reset to succeed.
5013 * Returns 0 on success, negative error code on failure.
5014 */
5015static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5016{
5017 int i, r = 0;
5018
5019 for (i = 0; i < adev->num_ip_blocks; i++) {
5020 if (!adev->ip_blocks[i].status.valid)
5021 continue;
5022 if (adev->ip_blocks[i].status.hang &&
5023 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5024 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
5025 if (r)
5026 return r;
5027 }
5028 }
5029
5030 return 0;
5031}
5032
5033/**
5034 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5035 *
5036 * @adev: amdgpu_device pointer
5037 *
5038 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
5039 * reset is necessary to recover.
5040 * Returns true if a full asic reset is required, false if not.
5041 */
5042static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5043{
5044 int i;
5045
5046 if (amdgpu_asic_need_full_reset(adev))
5047 return true;
5048
5049 for (i = 0; i < adev->num_ip_blocks; i++) {
5050 if (!adev->ip_blocks[i].status.valid)
5051 continue;
5052 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5053 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5054 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5055 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5056 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5057 if (adev->ip_blocks[i].status.hang) {
5058 				dev_info(adev->dev, "Some blocks need a full reset!\n");
5059 return true;
5060 }
5061 }
5062 }
5063 return false;
5064}
5065
5066/**
5067 * amdgpu_device_ip_soft_reset - do a soft reset
5068 *
5069 * @adev: amdgpu_device pointer
5070 *
5071 * The list of all the hardware IPs that make up the asic is walked and the
5072 * soft_reset callbacks are run if the block is hung. soft_reset handles any
5073 * IP specific hardware or software state changes that are necessary to soft
5074 * reset the IP.
5075 * Returns 0 on success, negative error code on failure.
5076 */
5077static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5078{
5079 int i, r = 0;
5080
5081 for (i = 0; i < adev->num_ip_blocks; i++) {
5082 if (!adev->ip_blocks[i].status.valid)
5083 continue;
5084 if (adev->ip_blocks[i].status.hang &&
5085 adev->ip_blocks[i].version->funcs->soft_reset) {
5086 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5087 if (r)
5088 return r;
5089 }
5090 }
5091
5092 return 0;
5093}
5094
5095/**
5096 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5097 *
5098 * @adev: amdgpu_device pointer
5099 *
5100 * The list of all the hardware IPs that make up the asic is walked and the
5101 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5102 * handles any IP specific hardware or software state changes that are
5103 * necessary after the IP has been soft reset.
5104 * Returns 0 on success, negative error code on failure.
5105 */
5106static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5107{
5108 int i, r = 0;
5109
5110 for (i = 0; i < adev->num_ip_blocks; i++) {
5111 if (!adev->ip_blocks[i].status.valid)
5112 continue;
5113 if (adev->ip_blocks[i].status.hang &&
5114 adev->ip_blocks[i].version->funcs->post_soft_reset)
5115 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5116 if (r)
5117 return r;
5118 }
5119
5120 return 0;
5121}
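
/*
 * The four soft-reset helpers above are used as a sequence; a condensed
 * sketch mirroring their use in amdgpu_device_pre_asic_reset() below:
 *
 *	if (amdgpu_device_ip_check_soft_reset(adev)) {
 *		amdgpu_device_ip_pre_soft_reset(adev);
 *		r = amdgpu_device_ip_soft_reset(adev);
 *		amdgpu_device_ip_post_soft_reset(adev);
 *		if (r || amdgpu_device_ip_check_soft_reset(adev))
 *			...still hung, fall back to a full ASIC reset...
 *	}
 */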
5122
5123/**
5124 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5125 *
5126 * @adev: amdgpu_device pointer
5127 * @reset_context: amdgpu reset context pointer
5128 *
5129  * Do a VF FLR and reinitialize the ASIC.
5130  * Returns 0 on success, negative error code on failure.
5131 */
5132static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5133 struct amdgpu_reset_context *reset_context)
5134{
5135 int r;
5136 struct amdgpu_hive_info *hive = NULL;
5137
5138 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5139 if (!amdgpu_ras_get_fed_status(adev))
5140 amdgpu_virt_ready_to_reset(adev);
5141 amdgpu_virt_wait_reset(adev);
5142 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5143 r = amdgpu_virt_request_full_gpu(adev, true);
5144 } else {
5145 r = amdgpu_virt_reset_gpu(adev);
5146 }
5147 if (r)
5148 return r;
5149
5150 amdgpu_ras_set_fed(adev, false);
5151 amdgpu_irq_gpu_reset_resume_helper(adev);
5152
5153 	/* some SW cleanup the VF needs to do before recovery */
5154 amdgpu_virt_post_reset(adev);
5155
5156 /* Resume IP prior to SMC */
5157 r = amdgpu_device_ip_reinit_early_sriov(adev);
5158 if (r)
5159 return r;
5160
5161 amdgpu_virt_init_data_exchange(adev);
5162
5163 r = amdgpu_device_fw_loading(adev);
5164 if (r)
5165 return r;
5166
5167 /* now we are okay to resume SMC/CP/SDMA */
5168 r = amdgpu_device_ip_reinit_late_sriov(adev);
5169 if (r)
5170 return r;
5171
5172 hive = amdgpu_get_xgmi_hive(adev);
5173 /* Update PSP FW topology after reset */
5174 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5175 r = amdgpu_xgmi_update_topology(hive, adev);
5176 if (hive)
5177 amdgpu_put_xgmi_hive(hive);
5178 if (r)
5179 return r;
5180
5181 r = amdgpu_ib_ring_tests(adev);
5182 if (r)
5183 return r;
5184
5185 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5186 amdgpu_inc_vram_lost(adev);
5187
5188 	/* This needs to be called while we still have full GPU access, so we
5189 	 * can't defer it the way bare-metal does.
5190 */
5191 amdgpu_amdkfd_post_reset(adev);
5192 amdgpu_virt_release_full_gpu(adev, true);
5193
5194 	/* Aldebaran and gfx 9.4.3/9.4.4/11.0.3 support RAS in SR-IOV, so resume RAS during reset */
5195 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5196 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5197 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5198 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5199 amdgpu_ras_resume(adev);
5200
5201 amdgpu_virt_ras_telemetry_post_reset(adev);
5202
5203 return 0;
5204}
5205
5206/**
5207  * amdgpu_device_has_job_running - check if there is any job in the pending list
5208 *
5209 * @adev: amdgpu_device pointer
5210 *
5211  * Check if there is any job in the scheduler pending list.
5212 */
5213bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5214{
5215 int i;
5216 struct drm_sched_job *job;
5217
5218 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5219 struct amdgpu_ring *ring = adev->rings[i];
5220
5221 if (!amdgpu_ring_sched_ready(ring))
5222 continue;
5223
5224 spin_lock(&ring->sched.job_list_lock);
5225 job = list_first_entry_or_null(&ring->sched.pending_list,
5226 struct drm_sched_job, list);
5227 spin_unlock(&ring->sched.job_list_lock);
5228 if (job)
5229 return true;
5230 }
5231 return false;
5232}
5233
5234/**
5235 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5236 *
5237 * @adev: amdgpu_device pointer
5238 *
5239 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5240 * a hung GPU.
5241 */
5242bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5243{
5244
5245 if (amdgpu_gpu_recovery == 0)
5246 goto disabled;
5247
5248 /* Skip soft reset check in fatal error mode */
5249 if (!amdgpu_ras_is_poison_mode_supported(adev))
5250 return true;
5251
5252 if (amdgpu_sriov_vf(adev))
5253 return true;
5254
5255 if (amdgpu_gpu_recovery == -1) {
5256 switch (adev->asic_type) {
5257#ifdef CONFIG_DRM_AMDGPU_SI
5258 case CHIP_VERDE:
5259 case CHIP_TAHITI:
5260 case CHIP_PITCAIRN:
5261 case CHIP_OLAND:
5262 case CHIP_HAINAN:
5263#endif
5264#ifdef CONFIG_DRM_AMDGPU_CIK
5265 case CHIP_KAVERI:
5266 case CHIP_KABINI:
5267 case CHIP_MULLINS:
5268#endif
5269 case CHIP_CARRIZO:
5270 case CHIP_STONEY:
5271 case CHIP_CYAN_SKILLFISH:
5272 goto disabled;
5273 default:
5274 break;
5275 }
5276 }
5277
5278 return true;
5279
5280disabled:
5281 dev_info(adev->dev, "GPU recovery disabled.\n");
5282 return false;
5283}
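
/*
 * Summary (illustrative) of the amdgpu_gpu_recovery module parameter as
 * interpreted above:
 *
 *	amdgpu.gpu_recovery=0:	recovery disabled
 *	amdgpu.gpu_recovery=-1:	auto - enabled, except on the legacy ASICs
 *				listed in the switch statement
 *	any other value:	recovery enabled
 *
 * Unless recovery is explicitly disabled, SR-IOV VFs and fatal
 * (non-poison-mode) RAS errors always attempt recovery, bypassing the
 * per-ASIC auto check.
 */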
5284
5285int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5286{
5287 u32 i;
5288 int ret = 0;
5289
5290 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5291
5292 dev_info(adev->dev, "GPU mode1 reset\n");
5293
5294 /* Cache the state before bus master disable. The saved config space
5295 * values are used in other cases like restore after mode-2 reset.
5296 */
5297 amdgpu_device_cache_pci_state(adev->pdev);
5298
5299 /* disable BM */
5300 pci_clear_master(adev->pdev);
5301
5302 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5303 dev_info(adev->dev, "GPU smu mode1 reset\n");
5304 ret = amdgpu_dpm_mode1_reset(adev);
5305 } else {
5306 dev_info(adev->dev, "GPU psp mode1 reset\n");
5307 ret = psp_gpu_reset(adev);
5308 }
5309
5310 if (ret)
5311 goto mode1_reset_failed;
5312
5313 amdgpu_device_load_pci_state(adev->pdev);
5314 ret = amdgpu_psp_wait_for_bootloader(adev);
5315 if (ret)
5316 goto mode1_reset_failed;
5317
5318 /* wait for asic to come out of reset */
5319 for (i = 0; i < adev->usec_timeout; i++) {
5320 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5321
5322 if (memsize != 0xffffffff)
5323 break;
5324 udelay(1);
5325 }
5326
5327 if (i >= adev->usec_timeout) {
5328 ret = -ETIMEDOUT;
5329 goto mode1_reset_failed;
5330 }
5331
5332 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5333
5334 return 0;
5335
5336mode1_reset_failed:
5337 dev_err(adev->dev, "GPU mode1 reset failed\n");
5338 return ret;
5339}
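
/*
 * Condensed view of the mode1 reset sequence implemented above
 * (illustrative only):
 *
 *	amdgpu_device_cache_pci_state(adev->pdev);
 *	pci_clear_master(adev->pdev);		// disable bus mastering
 *	amdgpu_dpm_mode1_reset(adev) if the SMU supports it,
 *	otherwise psp_gpu_reset(adev);
 *	amdgpu_device_load_pci_state(adev->pdev);
 *	amdgpu_psp_wait_for_bootloader(adev);
 *	poll the NBIO memsize register until it reads something other
 *	than 0xffffffff (ASIC out of reset) or usec_timeout expires
 */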
5340
5341int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5342 struct amdgpu_reset_context *reset_context)
5343{
5344 int i, r = 0;
5345 struct amdgpu_job *job = NULL;
5346 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5347 bool need_full_reset =
5348 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5349
5350 if (reset_context->reset_req_dev == adev)
5351 job = reset_context->job;
5352
5353 if (amdgpu_sriov_vf(adev))
5354 amdgpu_virt_pre_reset(adev);
5355
5356 amdgpu_fence_driver_isr_toggle(adev, true);
5357
5358 /* block all schedulers and reset given job's ring */
5359 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5360 struct amdgpu_ring *ring = adev->rings[i];
5361
5362 if (!amdgpu_ring_sched_ready(ring))
5363 continue;
5364
5365 		/* Clear job fences from the fence driver to avoid force_completion
5366 		 * on them; only NULL and vm flush fences are left in the fence driver
5367 */
5368 amdgpu_fence_driver_clear_job_fences(ring);
5369
5370 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5371 amdgpu_fence_driver_force_completion(ring);
5372 }
5373
5374 amdgpu_fence_driver_isr_toggle(adev, false);
5375
5376 if (job && job->vm)
5377 drm_sched_increase_karma(&job->base);
5378
5379 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5380 /* If reset handler not implemented, continue; otherwise return */
5381 if (r == -EOPNOTSUPP)
5382 r = 0;
5383 else
5384 return r;
5385
5386 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5387 if (!amdgpu_sriov_vf(adev)) {
5388
5389 if (!need_full_reset)
5390 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5391
5392 if (!need_full_reset && amdgpu_gpu_recovery &&
5393 amdgpu_device_ip_check_soft_reset(adev)) {
5394 amdgpu_device_ip_pre_soft_reset(adev);
5395 r = amdgpu_device_ip_soft_reset(adev);
5396 amdgpu_device_ip_post_soft_reset(adev);
5397 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5398 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5399 need_full_reset = true;
5400 }
5401 }
5402
5403 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5404 dev_info(tmp_adev->dev, "Dumping IP State\n");
5405 /* Trigger ip dump before we reset the asic */
5406 for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5407 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5408 tmp_adev->ip_blocks[i].version->funcs
5409 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
5410 dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5411 }
5412
5413 if (need_full_reset)
5414 r = amdgpu_device_ip_suspend(adev);
5415 if (need_full_reset)
5416 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5417 else
5418 clear_bit(AMDGPU_NEED_FULL_RESET,
5419 &reset_context->flags);
5420 }
5421
5422 return r;
5423}
5424
5425int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
5426{
5427 struct list_head *device_list_handle;
5428 bool full_reset, vram_lost = false;
5429 struct amdgpu_device *tmp_adev;
5430 int r, init_level;
5431
5432 device_list_handle = reset_context->reset_device_list;
5433
5434 if (!device_list_handle)
5435 return -EINVAL;
5436
5437 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5438
5439 	/*
5440 	 * If it's a reset on init, use the default init level; otherwise keep
5441 	 * the level as the recovery level.
5442 */
5443 if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
5444 init_level = AMDGPU_INIT_LEVEL_DEFAULT;
5445 else
5446 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
5447
5448 r = 0;
5449 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5450 amdgpu_set_init_level(tmp_adev, init_level);
5451 if (full_reset) {
5452 /* post card */
5453 amdgpu_ras_set_fed(tmp_adev, false);
5454 r = amdgpu_device_asic_init(tmp_adev);
5455 if (r) {
5456 dev_warn(tmp_adev->dev, "asic atom init failed!");
5457 } else {
5458 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5459
5460 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5461 if (r)
5462 goto out;
5463
5464 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5465
5466 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5467 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
5468
5469 if (vram_lost) {
5470 DRM_INFO("VRAM is lost due to GPU reset!\n");
5471 amdgpu_inc_vram_lost(tmp_adev);
5472 }
5473
5474 r = amdgpu_device_fw_loading(tmp_adev);
5475 if (r)
5476 return r;
5477
5478 r = amdgpu_xcp_restore_partition_mode(
5479 tmp_adev->xcp_mgr);
5480 if (r)
5481 goto out;
5482
5483 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5484 if (r)
5485 goto out;
5486
5487 if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5488 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5489
5490 if (vram_lost)
5491 amdgpu_device_fill_reset_magic(tmp_adev);
5492
5493 /*
5494 				 * Add this ASIC as tracked, as the reset has
5495 				 * already completed successfully.
5496 */
5497 amdgpu_register_gpu_instance(tmp_adev);
5498
5499 if (!reset_context->hive &&
5500 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5501 amdgpu_xgmi_add_device(tmp_adev);
5502
5503 r = amdgpu_device_ip_late_init(tmp_adev);
5504 if (r)
5505 goto out;
5506
5507 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
5508
5509 /*
5510 				 * The GPU enters a bad state once the number of
5511 				 * faulty pages detected by ECC reaches the
5512 				 * threshold, and RAS recovery is scheduled next.
5513 				 * So add a check here to break recovery if it
5514 				 * indeed exceeds the bad page threshold, and
5515 				 * remind the user to retire this GPU or set a
5516 				 * bigger bad_page_threshold value to fix this
5517 				 * when probing the driver again.
5518 */
5519 if (!amdgpu_ras_is_rma(tmp_adev)) {
5520 /* must succeed. */
5521 amdgpu_ras_resume(tmp_adev);
5522 } else {
5523 r = -EINVAL;
5524 goto out;
5525 }
5526
5527 /* Update PSP FW topology after reset */
5528 if (reset_context->hive &&
5529 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5530 r = amdgpu_xgmi_update_topology(
5531 reset_context->hive, tmp_adev);
5532 }
5533 }
5534
5535out:
5536 if (!r) {
5537 /* IP init is complete now, set level as default */
5538 amdgpu_set_init_level(tmp_adev,
5539 AMDGPU_INIT_LEVEL_DEFAULT);
5540 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5541 r = amdgpu_ib_ring_tests(tmp_adev);
5542 if (r) {
5543 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5544 r = -EAGAIN;
5545 goto end;
5546 }
5547 }
5548
5549 if (r)
5550 tmp_adev->asic_reset_res = r;
5551 }
5552
5553end:
5554 return r;
5555}
5556
5557int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5558 struct amdgpu_reset_context *reset_context)
5559{
5560 struct amdgpu_device *tmp_adev = NULL;
5561 bool need_full_reset, skip_hw_reset;
5562 int r = 0;
5563
5564 /* Try reset handler method first */
5565 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5566 reset_list);
5567
5568 reset_context->reset_device_list = device_list_handle;
5569 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5570 /* If reset handler not implemented, continue; otherwise return */
5571 if (r == -EOPNOTSUPP)
5572 r = 0;
5573 else
5574 return r;
5575
5576 /* Reset handler not implemented, use the default method */
5577 need_full_reset =
5578 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5579 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5580
5581 /*
5582 * ASIC reset has to be done on all XGMI hive nodes ASAP
5583 	 * to allow proper link negotiation in FW (within 1 sec)
5584 */
5585 if (!skip_hw_reset && need_full_reset) {
5586 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5587 /* For XGMI run all resets in parallel to speed up the process */
5588 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5589 if (!queue_work(system_unbound_wq,
5590 &tmp_adev->xgmi_reset_work))
5591 r = -EALREADY;
5592 } else
5593 r = amdgpu_asic_reset(tmp_adev);
5594
5595 if (r) {
5596 dev_err(tmp_adev->dev,
5597 "ASIC reset failed with error, %d for drm dev, %s",
5598 r, adev_to_drm(tmp_adev)->unique);
5599 goto out;
5600 }
5601 }
5602
5603 /* For XGMI, wait for all resets to complete before proceeding */
5604 if (!r) {
5605 list_for_each_entry(tmp_adev, device_list_handle,
5606 reset_list) {
5607 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5608 flush_work(&tmp_adev->xgmi_reset_work);
5609 r = tmp_adev->asic_reset_res;
5610 if (r)
5611 break;
5612 }
5613 }
5614 }
5615 }
5616
5617 if (!r && amdgpu_ras_intr_triggered()) {
5618 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5619 amdgpu_ras_reset_error_count(tmp_adev,
5620 AMDGPU_RAS_BLOCK__MMHUB);
5621 }
5622
5623 amdgpu_ras_intr_cleared();
5624 }
5625
5626 r = amdgpu_device_reinit_after_reset(reset_context);
5627 if (r == -EAGAIN)
5628 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5629 else
5630 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5631
5632out:
5633 return r;
5634}
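
/*
 * Illustrative sketch (not part of the driver): how a caller prepares a
 * single-device list and reset context for amdgpu_do_asic_reset(),
 * mirroring what amdgpu_pci_slot_reset() below does. The flag choice is
 * an assumption for the example only.
 *
 *	struct amdgpu_reset_context reset_context;
 *	struct list_head device_list;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	INIT_LIST_HEAD(&device_list);
 *	list_add_tail(&adev->reset_list, &device_list);
 *
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_do_asic_reset(&device_list, &reset_context);
 */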
5635
5636static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5637{
5638
5639 switch (amdgpu_asic_reset_method(adev)) {
5640 case AMD_RESET_METHOD_MODE1:
5641 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5642 break;
5643 case AMD_RESET_METHOD_MODE2:
5644 adev->mp1_state = PP_MP1_STATE_RESET;
5645 break;
5646 default:
5647 adev->mp1_state = PP_MP1_STATE_NONE;
5648 break;
5649 }
5650}
5651
5652static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5653{
5654 amdgpu_vf_error_trans_all(adev);
5655 adev->mp1_state = PP_MP1_STATE_NONE;
5656}
5657
5658static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5659{
5660 struct pci_dev *p = NULL;
5661
5662 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5663 adev->pdev->bus->number, 1);
5664 if (p) {
5665 pm_runtime_enable(&(p->dev));
5666 pm_runtime_resume(&(p->dev));
5667 }
5668
5669 pci_dev_put(p);
5670}
5671
5672static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5673{
5674 enum amd_reset_method reset_method;
5675 struct pci_dev *p = NULL;
5676 u64 expires;
5677
5678 /*
5679 * For now, only BACO and mode1 reset are confirmed to suffer
5680 * from the audio issue when the audio device is not properly suspended.
5681 */
5682 reset_method = amdgpu_asic_reset_method(adev);
5683 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5684 (reset_method != AMD_RESET_METHOD_MODE1))
5685 return -EINVAL;
5686
5687 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5688 adev->pdev->bus->number, 1);
5689 if (!p)
5690 return -ENODEV;
5691
5692 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5693 if (!expires)
5694 /*
5695 * If we cannot get the audio device autosuspend delay,
5696 * fall back to a fixed 4s interval. Since 3s is the audio
5697 * controller's default autosuspend delay, the 4s used
5698 * here is guaranteed to cover it.
5699 */
5700 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5701
5702 while (!pm_runtime_status_suspended(&(p->dev))) {
5703 if (!pm_runtime_suspend(&(p->dev)))
5704 break;
5705
5706 if (expires < ktime_get_mono_fast_ns()) {
5707 dev_warn(adev->dev, "failed to suspend display audio\n");
5708 pci_dev_put(p);
5709 /* TODO: abort the succeeding gpu reset? */
5710 return -ETIMEDOUT;
5711 }
5712 }
5713
5714 pm_runtime_disable(&(p->dev));
5715
5716 pci_dev_put(p);
5717 return 0;
5718}
5719
5720static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5721{
5722 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5723
5724#if defined(CONFIG_DEBUG_FS)
5725 if (!amdgpu_sriov_vf(adev))
5726 cancel_work(&adev->reset_work);
5727#endif
5728
5729 if (adev->kfd.dev)
5730 cancel_work(&adev->kfd.reset_work);
5731
5732 if (amdgpu_sriov_vf(adev))
5733 cancel_work(&adev->virt.flr_work);
5734
5735 if (con && adev->ras_enabled)
5736 cancel_work(&con->recovery_work);
5737
5738}
5739
5740static int amdgpu_device_health_check(struct list_head *device_list_handle)
5741{
5742 struct amdgpu_device *tmp_adev;
5743 int ret = 0;
5744 u32 status;
5745
5746 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5747 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
5748 if (PCI_POSSIBLE_ERROR(status)) {
5749 dev_err(tmp_adev->dev, "device lost from bus!");
5750 ret = -ENODEV;
5751 }
5752 }
5753
5754 return ret;
5755}
5756
5757/**
5758 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5759 *
5760 * @adev: amdgpu_device pointer
5761 * @job: the job that triggered the hang
5762 * @reset_context: amdgpu reset context pointer
5763 *
5764 * Attempt to reset the GPU if it has hung (all asics).
5765 * Attempt to do a soft reset or full reset and reinitialize the ASIC.
5766 * Returns 0 for success or an error on failure.
5767 */
5768
5769int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5770 struct amdgpu_job *job,
5771 struct amdgpu_reset_context *reset_context)
5772{
5773 struct list_head device_list, *device_list_handle = NULL;
5774 bool job_signaled = false;
5775 struct amdgpu_hive_info *hive = NULL;
5776 struct amdgpu_device *tmp_adev = NULL;
5777 int i, r = 0;
5778 bool need_emergency_restart = false;
5779 bool audio_suspended = false;
5780 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5781
5782 /*
5783 * Special case: RAS triggered and full reset isn't supported
5784 */
5785 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5786
5787 /*
5788 * Flush RAM to disk so that after reboot
5789 * the user can read log and see why the system rebooted.
5790 */
5791 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5792 amdgpu_ras_get_context(adev)->reboot) {
5793 DRM_WARN("Emergency reboot.");
5794
5795 ksys_sync_helper();
5796 emergency_restart();
5797 }
5798
5799 dev_info(adev->dev, "GPU %s begin!\n",
5800 need_emergency_restart ? "jobs stop":"reset");
5801
5802 if (!amdgpu_sriov_vf(adev))
5803 hive = amdgpu_get_xgmi_hive(adev);
5804 if (hive)
5805 mutex_lock(&hive->hive_lock);
5806
5807 reset_context->job = job;
5808 reset_context->hive = hive;
5809 /*
5810 * Build list of devices to reset.
5811 * In case we are in XGMI hive mode, resort the device list
5812 * to put adev in the 1st position.
5813 */
5814 INIT_LIST_HEAD(&device_list);
5815 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5816 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5817 list_add_tail(&tmp_adev->reset_list, &device_list);
5818 if (adev->shutdown)
5819 tmp_adev->shutdown = true;
5820 }
5821 if (!list_is_first(&adev->reset_list, &device_list))
5822 list_rotate_to_front(&adev->reset_list, &device_list);
5823 device_list_handle = &device_list;
5824 } else {
5825 list_add_tail(&adev->reset_list, &device_list);
5826 device_list_handle = &device_list;
5827 }
5828
5829 if (!amdgpu_sriov_vf(adev)) {
5830 r = amdgpu_device_health_check(device_list_handle);
5831 if (r)
5832 goto end_reset;
5833 }
5834
5835 /* We need to lock reset domain only once both for XGMI and single device */
5836 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5837 reset_list);
5838 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5839
5840 /* block all schedulers and reset given job's ring */
5841 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5842
5843 amdgpu_device_set_mp1_state(tmp_adev);
5844
5845 /*
5846 * Try to put the audio codec into the suspend state
5847 * before the gpu reset starts.
5848 *
5849 * Because the power domain of the graphics device is
5850 * shared with the AZ power domain, skipping this step
5851 * could change the audio hardware behind the audio
5852 * driver's back, which would trigger audio codec
5853 * errors.
5854 */
5855 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5856 audio_suspended = true;
5857
5858 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5859
5860 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5861
5862 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
5863
5864 /*
5865 * Mark the ASICs to be reset as untracked first,
5866 * then add them back after the reset has completed.
5867 */
5868 amdgpu_unregister_gpu_instance(tmp_adev);
5869
5870 drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
5871
5872 /* disable ras on ALL IPs */
5873 if (!need_emergency_restart &&
5874 amdgpu_device_ip_need_full_reset(tmp_adev))
5875 amdgpu_ras_suspend(tmp_adev);
5876
5877 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5878 struct amdgpu_ring *ring = tmp_adev->rings[i];
5879
5880 if (!amdgpu_ring_sched_ready(ring))
5881 continue;
5882
5883 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5884
5885 if (need_emergency_restart)
5886 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5887 }
5888 atomic_inc(&tmp_adev->gpu_reset_counter);
5889 }
5890
5891 if (need_emergency_restart)
5892 goto skip_sched_resume;
5893
5894 /*
5895 * Must check guilty signal here since after this point all old
5896 * HW fences are force signaled.
5897 *
5898 * job->base holds a reference to parent fence
5899 */
5900 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5901 job_signaled = true;
5902 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5903 goto skip_hw_reset;
5904 }
5905
5906retry: /* Rest of adevs pre asic reset from XGMI hive. */
5907 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5908 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5909 /* TODO: should we stop? */
5910 if (r) {
5911 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5912 r, adev_to_drm(tmp_adev)->unique);
5913 tmp_adev->asic_reset_res = r;
5914 }
5915 }
5916
5917 /* Actual ASIC resets if needed. */
5918 /* Host driver will handle XGMI hive reset for SRIOV */
5919 if (amdgpu_sriov_vf(adev)) {
5920 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
5921 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
5922 amdgpu_ras_set_fed(adev, true);
5923 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5924 }
5925
5926 r = amdgpu_device_reset_sriov(adev, reset_context);
5927 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
5928 amdgpu_virt_release_full_gpu(adev, true);
5929 goto retry;
5930 }
5931 if (r)
5932 adev->asic_reset_res = r;
5933 } else {
5934 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5935 if (r && r == -EAGAIN)
5936 goto retry;
5937 }
5938
5939 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5940 /*
5941 * Drop any pending non-scheduler resets queued before the reset is done.
5942 * Any reset scheduled after this point would be valid. Scheduler resets
5943 * were already dropped during drm_sched_stop and no new ones can come
5944 * in before drm_sched_start.
5945 */
5946 amdgpu_device_stop_pending_resets(tmp_adev);
5947 }
5948
5949skip_hw_reset:
5950
5951 /* Post ASIC reset for all devs. */
5952 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5953
5954 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5955 struct amdgpu_ring *ring = tmp_adev->rings[i];
5956
5957 if (!amdgpu_ring_sched_ready(ring))
5958 continue;
5959
5960 drm_sched_start(&ring->sched, 0);
5961 }
5962
5963 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5964 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5965
5966 if (tmp_adev->asic_reset_res)
5967 r = tmp_adev->asic_reset_res;
5968
5969 tmp_adev->asic_reset_res = 0;
5970
5971 if (r) {
5972 /* Bad news, how do we tell it to userspace?
5973 * For a RAS error, report the GPU's bad status instead of
5974 * a reset failure.
5975 */
5976 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
5977 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
5978 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
5979 atomic_read(&tmp_adev->gpu_reset_counter));
5980 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5981 } else {
5982 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5983 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5984 DRM_WARN("smart shift update failed\n");
5985 }
5986 }
5987
5988skip_sched_resume:
5989 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5990 /* unlock kfd: SRIOV would do it separately */
5991 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5992 amdgpu_amdkfd_post_reset(tmp_adev);
5993
5994 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5995 * so bring up kfd here if it was not initialized before.
5996 */
5997 if (!adev->kfd.init_complete)
5998 amdgpu_amdkfd_device_init(adev);
5999
6000 if (audio_suspended)
6001 amdgpu_device_resume_display_audio(tmp_adev);
6002
6003 amdgpu_device_unset_mp1_state(tmp_adev);
6004
6005 amdgpu_ras_set_error_query_ready(tmp_adev, true);
6006 }
6007
6008 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6009 reset_list);
6010 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6011
6012end_reset:
6013 if (hive) {
6014 mutex_unlock(&hive->hive_lock);
6015 amdgpu_put_xgmi_hive(hive);
6016 }
6017
6018 if (r)
6019 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6020
6021 atomic_set(&adev->reset_domain->reset_res, r);
6022 return r;
6023}
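
/*
 * Illustrative sketch (not part of the driver): how a hang handler might
 * invoke amdgpu_device_gpu_recover(). The reset source value and the
 * cleared flag are assumptions for the example only.
 *
 *	struct amdgpu_reset_context reset_context;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	reset_context.src = AMDGPU_RESET_SRC_JOB;
 *	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);
 */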
6024
6025/**
6026 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
6027 *
6028 * @adev: amdgpu_device pointer
6029 * @speed: pointer to the speed of the link
6030 * @width: pointer to the width of the link
6031 *
6032 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6033 * first physical partner to an AMD dGPU.
6034 * This will exclude any virtual switches and links.
6035 */
6036static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6037 enum pci_bus_speed *speed,
6038 enum pcie_link_width *width)
6039{
6040 struct pci_dev *parent = adev->pdev;
6041
6042 if (!speed || !width)
6043 return;
6044
6045 *speed = PCI_SPEED_UNKNOWN;
6046 *width = PCIE_LNK_WIDTH_UNKNOWN;
6047
6048 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6049 while ((parent = pci_upstream_bridge(parent))) {
6050 /* skip upstream/downstream switches internal to the dGPU */
6051 if (parent->vendor == PCI_VENDOR_ID_ATI)
6052 continue;
6053 *speed = pcie_get_speed_cap(parent);
6054 *width = pcie_get_width_cap(parent);
6055 break;
6056 }
6057 } else {
6058 /* use the current speeds rather than max if switching is not supported */
6059 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6060 }
6061}
6062
6063/**
6064 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
6065 *
6066 * @adev: amdgpu_device pointer
6067 *
6068 * Fetches and stores in the driver the PCIe capabilities (gen speed
6069 * and lanes) of the slot the device is in. Handles APUs and
6070 * virtualized environments where PCIe config space may not be available.
6071 */
6072static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6073{
6074 struct pci_dev *pdev;
6075 enum pci_bus_speed speed_cap, platform_speed_cap;
6076 enum pcie_link_width platform_link_width;
6077
6078 if (amdgpu_pcie_gen_cap)
6079 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6080
6081 if (amdgpu_pcie_lane_cap)
6082 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6083
6084 /* covers APUs as well */
6085 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6086 if (adev->pm.pcie_gen_mask == 0)
6087 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6088 if (adev->pm.pcie_mlw_mask == 0)
6089 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6090 return;
6091 }
6092
6093 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6094 return;
6095
6096 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6097 &platform_link_width);
6098
6099 if (adev->pm.pcie_gen_mask == 0) {
6100 /* asic caps */
6101 pdev = adev->pdev;
6102 speed_cap = pcie_get_speed_cap(pdev);
6103 if (speed_cap == PCI_SPEED_UNKNOWN) {
6104 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6105 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6106 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6107 } else {
6108 if (speed_cap == PCIE_SPEED_32_0GT)
6109 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6110 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6111 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6112 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6113 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6114 else if (speed_cap == PCIE_SPEED_16_0GT)
6115 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6116 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6117 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6118 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6119 else if (speed_cap == PCIE_SPEED_8_0GT)
6120 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6121 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6122 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6123 else if (speed_cap == PCIE_SPEED_5_0GT)
6124 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6125 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6126 else
6127 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6128 }
6129 /* platform caps */
6130 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6131 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6132 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6133 } else {
6134 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6135 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6136 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6137 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6138 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6139 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6140 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6141 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6142 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6143 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6144 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6145 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6146 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6147 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6148 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6149 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6150 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6151 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6152 else
6153 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6154
6155 }
6156 }
6157 if (adev->pm.pcie_mlw_mask == 0) {
6158 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6159 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6160 } else {
6161 switch (platform_link_width) {
6162 case PCIE_LNK_X32:
6163 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6164 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6165 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6166 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6167 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6168 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6169 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6170 break;
6171 case PCIE_LNK_X16:
6172 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6173 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6174 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6175 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6176 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6177 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6178 break;
6179 case PCIE_LNK_X12:
6180 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6181 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6182 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6183 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6184 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6185 break;
6186 case PCIE_LNK_X8:
6187 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6188 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6189 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6190 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6191 break;
6192 case PCIE_LNK_X4:
6193 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6194 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6195 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6196 break;
6197 case PCIE_LNK_X2:
6198 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6199 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6200 break;
6201 case PCIE_LNK_X1:
6202 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6203 break;
6204 default:
6205 break;
6206 }
6207 }
6208 }
6209}
6210
6211/**
6212 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6213 *
6214 * @adev: amdgpu_device pointer
6215 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6216 *
6217 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6218 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6219 * @peer_adev.
6220 */
6221bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6222 struct amdgpu_device *peer_adev)
6223{
6224#ifdef CONFIG_HSA_AMD_P2P
6225 bool p2p_access =
6226 !adev->gmc.xgmi.connected_to_cpu &&
6227 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6228 if (!p2p_access)
6229 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
6230 pci_name(peer_adev->pdev));
6231
6232 bool is_large_bar = adev->gmc.visible_vram_size &&
6233 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6234 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6235
6236 if (!p2p_addressable) {
6237 uint64_t address_mask = peer_adev->dev->dma_mask ?
6238 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6239 resource_size_t aper_limit =
6240 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6241
6242 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6243 aper_limit & address_mask);
6244 }
6245 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6246#else
6247 return false;
6248#endif
6249}
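
/*
 * Illustrative sketch (not part of the driver): how a caller might act on
 * the check above when deciding how to reach a peer's VRAM; the branch
 * bodies are placeholders, not real driver code.
 *
 *	if (amdgpu_device_is_peer_accessible(adev, peer_adev)) {
 *		// map peer VRAM directly through the PCIe BAR
 *	} else {
 *		// fall back to staging copies through system memory
 *	}
 */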
6250
6251int amdgpu_device_baco_enter(struct drm_device *dev)
6252{
6253 struct amdgpu_device *adev = drm_to_adev(dev);
6254 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6255
6256 if (!amdgpu_device_supports_baco(dev))
6257 return -ENOTSUPP;
6258
6259 if (ras && adev->ras_enabled &&
6260 adev->nbio.funcs->enable_doorbell_interrupt)
6261 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6262
6263 return amdgpu_dpm_baco_enter(adev);
6264}
6265
6266int amdgpu_device_baco_exit(struct drm_device *dev)
6267{
6268 struct amdgpu_device *adev = drm_to_adev(dev);
6269 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6270 int ret = 0;
6271
6272 if (!amdgpu_device_supports_baco(dev))
6273 return -ENOTSUPP;
6274
6275 ret = amdgpu_dpm_baco_exit(adev);
6276 if (ret)
6277 return ret;
6278
6279 if (ras && adev->ras_enabled &&
6280 adev->nbio.funcs->enable_doorbell_interrupt)
6281 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6282
6283 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6284 adev->nbio.funcs->clear_doorbell_interrupt)
6285 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6286
6287 return 0;
6288}
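
/*
 * Illustrative sketch (not part of the driver): the expected pairing of
 * the BACO helpers around a runtime power transition; the surrounding
 * runtime-PM plumbing and error handling are omitted.
 *
 *	r = amdgpu_device_baco_enter(drm_dev);    (on runtime suspend)
 *	...
 *	r = amdgpu_device_baco_exit(drm_dev);     (on runtime resume)
 */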
6289
6290/**
6291 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6292 * @pdev: PCI device struct
6293 * @state: PCI channel state
6294 *
6295 * Description: Called when a PCI error is detected.
6296 *
6297 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6298 */
6299pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6300{
6301 struct drm_device *dev = pci_get_drvdata(pdev);
6302 struct amdgpu_device *adev = drm_to_adev(dev);
6303 int i;
6304
6305 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
6306
6307 if (adev->gmc.xgmi.num_physical_nodes > 1) {
6308 DRM_WARN("No support for XGMI hive yet...");
6309 return PCI_ERS_RESULT_DISCONNECT;
6310 }
6311
6312 adev->pci_channel_state = state;
6313
6314 switch (state) {
6315 case pci_channel_io_normal:
6316 return PCI_ERS_RESULT_CAN_RECOVER;
6317 /* Fatal error, prepare for slot reset */
6318 case pci_channel_io_frozen:
6319 /*
6320 * Locking adev->reset_domain->sem will prevent any external access
6321 * to GPU during PCI error recovery
6322 */
6323 amdgpu_device_lock_reset_domain(adev->reset_domain);
6324 amdgpu_device_set_mp1_state(adev);
6325
6326 /*
6327 * Block any work scheduling as we do for regular GPU reset
6328 * for the duration of the recovery
6329 */
6330 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6331 struct amdgpu_ring *ring = adev->rings[i];
6332
6333 if (!amdgpu_ring_sched_ready(ring))
6334 continue;
6335
6336 drm_sched_stop(&ring->sched, NULL);
6337 }
6338 atomic_inc(&adev->gpu_reset_counter);
6339 return PCI_ERS_RESULT_NEED_RESET;
6340 case pci_channel_io_perm_failure:
6341 /* Permanent error, prepare for device removal */
6342 return PCI_ERS_RESULT_DISCONNECT;
6343 }
6344
6345 return PCI_ERS_RESULT_NEED_RESET;
6346}
6347
6348/**
6349 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6350 * @pdev: pointer to PCI device
6351 */
6352pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6353{
6354
6355 DRM_INFO("PCI error: mmio enabled callback!!\n");
6356
6357 /* TODO - dump whatever for debugging purposes */
6358
6359 /* This is called only if amdgpu_pci_error_detected returns
6360 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
6361 * works, so there is no need to reset the slot.
6362 */
6363
6364 return PCI_ERS_RESULT_RECOVERED;
6365}
6366
6367/**
6368 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6369 * @pdev: PCI device struct
6370 *
6371 * Description: This routine is called by the pci error recovery
6372 * code after the PCI slot has been reset, just before we
6373 * should resume normal operations.
6374 */
6375pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6376{
6377 struct drm_device *dev = pci_get_drvdata(pdev);
6378 struct amdgpu_device *adev = drm_to_adev(dev);
6379 int r, i;
6380 struct amdgpu_reset_context reset_context;
6381 u32 memsize;
6382 struct list_head device_list;
6383
6384 /* PCI error slot reset should be skipped during RAS recovery */
6385 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6386 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6387 amdgpu_ras_in_recovery(adev))
6388 return PCI_ERS_RESULT_RECOVERED;
6389
6390 DRM_INFO("PCI error: slot reset callback!!\n");
6391
6392 memset(&reset_context, 0, sizeof(reset_context));
6393
6394 INIT_LIST_HEAD(&device_list);
6395 list_add_tail(&adev->reset_list, &device_list);
6396
6397 /* wait for asic to come out of reset */
6398 msleep(500);
6399
6400 /* Restore PCI config space */
6401 amdgpu_device_load_pci_state(pdev);
6402
6403 /* confirm ASIC came out of reset */
6404 for (i = 0; i < adev->usec_timeout; i++) {
6405 memsize = amdgpu_asic_get_config_memsize(adev);
6406
6407 if (memsize != 0xffffffff)
6408 break;
6409 udelay(1);
6410 }
6411 if (memsize == 0xffffffff) {
6412 r = -ETIME;
6413 goto out;
6414 }
6415
6416 reset_context.method = AMD_RESET_METHOD_NONE;
6417 reset_context.reset_req_dev = adev;
6418 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6419 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6420
6421 adev->no_hw_access = true;
6422 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6423 adev->no_hw_access = false;
6424 if (r)
6425 goto out;
6426
6427 r = amdgpu_do_asic_reset(&device_list, &reset_context);
6428
6429out:
6430 if (!r) {
6431 if (amdgpu_device_cache_pci_state(adev->pdev))
6432 pci_restore_state(adev->pdev);
6433
6434 DRM_INFO("PCIe error recovery succeeded\n");
6435 } else {
6436 DRM_ERROR("PCIe error recovery failed, err:%d", r);
6437 amdgpu_device_unset_mp1_state(adev);
6438 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6439 }
6440
6441 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6442}
6443
6444/**
6445 * amdgpu_pci_resume() - resume normal ops after PCI reset
6446 * @pdev: pointer to PCI device
6447 *
6448 * Called when the error recovery driver tells us that it's
6449 * OK to resume normal operation.
6450 */
6451void amdgpu_pci_resume(struct pci_dev *pdev)
6452{
6453 struct drm_device *dev = pci_get_drvdata(pdev);
6454 struct amdgpu_device *adev = drm_to_adev(dev);
6455 int i;
6456
6457
6458 DRM_INFO("PCI error: resume callback!!\n");
6459
6460 /* Only continue execution for the case of pci_channel_io_frozen */
6461 if (adev->pci_channel_state != pci_channel_io_frozen)
6462 return;
6463
6464 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6465 struct amdgpu_ring *ring = adev->rings[i];
6466
6467 if (!amdgpu_ring_sched_ready(ring))
6468 continue;
6469
6470 drm_sched_start(&ring->sched, 0);
6471 }
6472
6473 amdgpu_device_unset_mp1_state(adev);
6474 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6475}
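
/*
 * Illustrative sketch (not part of this file): how the four PCI error
 * callbacks above are wired into a struct pci_error_handlers table; the
 * actual table lives in the driver registration code.
 *
 *	static const struct pci_error_handlers example_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 */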
6476
6477bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6478{
6479 struct drm_device *dev = pci_get_drvdata(pdev);
6480 struct amdgpu_device *adev = drm_to_adev(dev);
6481 int r;
6482
6483 if (amdgpu_sriov_vf(adev))
6484 return false;
6485
6486 r = pci_save_state(pdev);
6487 if (!r) {
6488 kfree(adev->pci_state);
6489
6490 adev->pci_state = pci_store_saved_state(pdev);
6491
6492 if (!adev->pci_state) {
6493 DRM_ERROR("Failed to store PCI saved state");
6494 return false;
6495 }
6496 } else {
6497 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6498 return false;
6499 }
6500
6501 return true;
6502}
6503
6504bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6505{
6506 struct drm_device *dev = pci_get_drvdata(pdev);
6507 struct amdgpu_device *adev = drm_to_adev(dev);
6508 int r;
6509
6510 if (!adev->pci_state)
6511 return false;
6512
6513 r = pci_load_saved_state(pdev, adev->pci_state);
6514
6515 if (!r) {
6516 pci_restore_state(pdev);
6517 } else {
6518 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6519 return false;
6520 }
6521
6522 return true;
6523}
6524
6525void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6526 struct amdgpu_ring *ring)
6527{
6528#ifdef CONFIG_X86_64
6529 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6530 return;
6531#endif
6532 if (adev->gmc.xgmi.connected_to_cpu)
6533 return;
6534
6535 if (ring && ring->funcs->emit_hdp_flush)
6536 amdgpu_ring_emit_hdp_flush(ring);
6537 else
6538 amdgpu_asic_flush_hdp(adev, ring);
6539}
6540
6541void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6542 struct amdgpu_ring *ring)
6543{
6544#ifdef CONFIG_X86_64
6545 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6546 return;
6547#endif
6548 if (adev->gmc.xgmi.connected_to_cpu)
6549 return;
6550
6551 amdgpu_asic_invalidate_hdp(adev, ring);
6552}
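
/*
 * Illustrative sketch (not part of the driver): the usual ordering around
 * the HDP helpers above; treat the direction of each barrier as an
 * assumption drawn from how they are used in this driver.
 *
 *	// CPU wrote to VRAM through the BAR and the GPU is about to read it:
 *	amdgpu_device_flush_hdp(adev, ring);
 *
 *	// GPU wrote to VRAM and the CPU is about to read it through the BAR:
 *	amdgpu_device_invalidate_hdp(adev, ring);
 */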
6553
6554int amdgpu_in_reset(struct amdgpu_device *adev)
6555{
6556 return atomic_read(&adev->reset_domain->in_gpu_reset);
6557}
6558
6559/**
6560 * amdgpu_device_halt() - bring hardware to some kind of halt state
6561 *
6562 * @adev: amdgpu_device pointer
6563 *
6564 * Bring the hardware to some kind of halt state so that no one can touch
6565 * it any more. This helps to preserve the error context when an error
6566 * occurs. Compared to a simple hang, the system will remain stable, at
6567 * least for SSH access. It should then be trivial to inspect the hardware
6568 * state and see what's going on. Implemented as follows:
6569 *
6570 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
6571 * clears all CPU mappings to device, disallows remappings through page faults
6572 * 2. amdgpu_irq_disable_all() disables all interrupts
6573 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6574 * 4. set adev->no_hw_access to avoid potential crashes after step 5
6575 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6576 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6577 * flush any in flight DMA operations
6578 */
6579void amdgpu_device_halt(struct amdgpu_device *adev)
6580{
6581 struct pci_dev *pdev = adev->pdev;
6582 struct drm_device *ddev = adev_to_drm(adev);
6583
6584 amdgpu_xcp_dev_unplug(adev);
6585 drm_dev_unplug(ddev);
6586
6587 amdgpu_irq_disable_all(adev);
6588
6589 amdgpu_fence_driver_hw_fini(adev);
6590
6591 adev->no_hw_access = true;
6592
6593 amdgpu_device_unmap_mmio(adev);
6594
6595 pci_disable_device(pdev);
6596 pci_wait_for_pending_transaction(pdev);
6597}
6598
6599u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6600 u32 reg)
6601{
6602 unsigned long flags, address, data;
6603 u32 r;
6604
6605 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6606 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6607
6608 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6609 WREG32(address, reg * 4);
6610 (void)RREG32(address);
6611 r = RREG32(data);
6612 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6613 return r;
6614}
6615
6616void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6617 u32 reg, u32 v)
6618{
6619 unsigned long flags, address, data;
6620
6621 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6622 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6623
6624 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6625 WREG32(address, reg * 4);
6626 (void)RREG32(address);
6627 WREG32(data, v);
6628 (void)RREG32(data);
6629 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6630}
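
/*
 * Illustrative sketch (not part of the driver): a read-modify-write
 * sequence using the PCIe port accessors above; the register offset and
 * bit are hypothetical placeholders.
 *
 *	u32 tmp;
 *
 *	tmp = amdgpu_device_pcie_port_rreg(adev, example_pcie_port_reg);
 *	tmp |= EXAMPLE_ENABLE_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, example_pcie_port_reg, tmp);
 */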
6631
6632/**
6633 * amdgpu_device_get_gang - return a reference to the current gang
6634 * @adev: amdgpu_device pointer
6635 *
6636 * Returns: A new reference to the current gang leader.
6637 */
6638struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
6639{
6640 struct dma_fence *fence;
6641
6642 rcu_read_lock();
6643 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
6644 rcu_read_unlock();
6645 return fence;
6646}
6647
6648/**
6649 * amdgpu_device_switch_gang - switch to a new gang
6650 * @adev: amdgpu_device pointer
6651 * @gang: the gang to switch to
6652 *
6653 * Try to switch to a new gang.
6654 * Returns: NULL if we switched to the new gang or a reference to the current
6655 * gang leader.
6656 */
6657struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6658 struct dma_fence *gang)
6659{
6660 struct dma_fence *old = NULL;
6661
6662 do {
6663 dma_fence_put(old);
6664 old = amdgpu_device_get_gang(adev);
6665 if (old == gang)
6666 break;
6667
6668 if (!dma_fence_is_signaled(old))
6669 return old;
6670
6671 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6672 old, gang) != old);
6673
6674 dma_fence_put(old);
6675 return NULL;
6676}
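
/*
 * Illustrative sketch (not part of the driver): how a submitter might use
 * amdgpu_device_switch_gang() given the contract documented above, i.e.
 * retry until the previous gang leader has signaled.
 *
 *	struct dma_fence *old;
 *
 *	while ((old = amdgpu_device_switch_gang(adev, new_gang))) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}
 */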
6677
6678bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6679{
6680 switch (adev->asic_type) {
6681#ifdef CONFIG_DRM_AMDGPU_SI
6682 case CHIP_HAINAN:
6683#endif
6684 case CHIP_TOPAZ:
6685 /* chips with no display hardware */
6686 return false;
6687#ifdef CONFIG_DRM_AMDGPU_SI
6688 case CHIP_TAHITI:
6689 case CHIP_PITCAIRN:
6690 case CHIP_VERDE:
6691 case CHIP_OLAND:
6692#endif
6693#ifdef CONFIG_DRM_AMDGPU_CIK
6694 case CHIP_BONAIRE:
6695 case CHIP_HAWAII:
6696 case CHIP_KAVERI:
6697 case CHIP_KABINI:
6698 case CHIP_MULLINS:
6699#endif
6700 case CHIP_TONGA:
6701 case CHIP_FIJI:
6702 case CHIP_POLARIS10:
6703 case CHIP_POLARIS11:
6704 case CHIP_POLARIS12:
6705 case CHIP_VEGAM:
6706 case CHIP_CARRIZO:
6707 case CHIP_STONEY:
6708 /* chips with display hardware */
6709 return true;
6710 default:
6711 /* IP discovery */
6712 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
6713 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6714 return false;
6715 return true;
6716 }
6717}
6718
6719uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6720 uint32_t inst, uint32_t reg_addr, char reg_name[],
6721 uint32_t expected_value, uint32_t mask)
6722{
6723 uint32_t ret = 0;
6724 uint32_t old_ = 0;
6725 uint32_t tmp_ = RREG32(reg_addr);
6726 uint32_t loop = adev->usec_timeout;
6727
6728 while ((tmp_ & (mask)) != (expected_value)) {
6729 if (old_ != tmp_) {
6730 loop = adev->usec_timeout;
6731 old_ = tmp_;
6732 } else
6733 udelay(1);
6734 tmp_ = RREG32(reg_addr);
6735 loop--;
6736 if (!loop) {
6737 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
6738 inst, reg_name, (uint32_t)expected_value,
6739 (uint32_t)(tmp_ & (mask)));
6740 ret = -ETIMEDOUT;
6741 break;
6742 }
6743 }
6744 return ret;
6745}
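
/*
 * Illustrative sketch (not part of the driver): a call to the polling
 * helper above; the register offset, name, mask and expected value are
 * hypothetical placeholders.
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, example_status_reg,
 *				       "EXAMPLE_STATUS", EXAMPLE_READY_BIT,
 *				       EXAMPLE_READY_BIT);
 *	if (r)
 *		dev_err(adev->dev, "block did not become ready\n");
 */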
6746
6747ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
6748{
6749 ssize_t size = 0;
6750
6751 if (!ring || !ring->adev)
6752 return size;
6753
6754 if (amdgpu_device_should_recover_gpu(ring->adev))
6755 size |= AMDGPU_RESET_TYPE_FULL;
6756
6757 if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
6758 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
6759 size |= AMDGPU_RESET_TYPE_SOFT_RESET;
6760
6761 return size;
6762}
6763
6764ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
6765{
6766 ssize_t size = 0;
6767
6768 if (supported_reset == 0) {
6769 size += sysfs_emit_at(buf, size, "unsupported");
6770 size += sysfs_emit_at(buf, size, "\n");
6771 return size;
6772
6773 }
6774
6775 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
6776 size += sysfs_emit_at(buf, size, "soft ");
6777
6778 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
6779 size += sysfs_emit_at(buf, size, "queue ");
6780
6781 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
6782 size += sysfs_emit_at(buf, size, "pipe ");
6783
6784 if (supported_reset & AMDGPU_RESET_TYPE_FULL)
6785 size += sysfs_emit_at(buf, size, "full ");
6786
6787 size += sysfs_emit_at(buf, size, "\n");
6788 return size;
6789}
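
/*
 * Illustrative sketch (not part of the driver): a sysfs show callback
 * built on amdgpu_show_reset_mask(); the attribute and the
 * supported_reset field it reads are hypothetical placeholders.
 *
 *	static ssize_t example_reset_mask_show(struct device *dev,
 *					       struct device_attribute *attr,
 *					       char *buf)
 *	{
 *		struct drm_device *ddev = dev_get_drvdata(dev);
 *		struct amdgpu_device *adev = drm_to_adev(ddev);
 *
 *		return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
 *	}
 */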