1/*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28
29#include <linux/aperture.h>
30#include <linux/power_supply.h>
31#include <linux/kthread.h>
32#include <linux/module.h>
33#include <linux/console.h>
34#include <linux/slab.h>
35#include <linux/iommu.h>
36#include <linux/pci.h>
37#include <linux/pci-p2pdma.h>
38#include <linux/apple-gmux.h>
39
40#include <drm/drm_atomic_helper.h>
41#include <drm/drm_client_event.h>
42#include <drm/drm_crtc_helper.h>
43#include <drm/drm_probe_helper.h>
44#include <drm/amdgpu_drm.h>
45#include <linux/device.h>
46#include <linux/vgaarb.h>
47#include <linux/vga_switcheroo.h>
48#include <linux/efi.h>
49#include "amdgpu.h"
50#include "amdgpu_trace.h"
51#include "amdgpu_i2c.h"
52#include "atom.h"
53#include "amdgpu_atombios.h"
54#include "amdgpu_atomfirmware.h"
55#include "amd_pcie.h"
56#ifdef CONFIG_DRM_AMDGPU_SI
57#include "si.h"
58#endif
59#ifdef CONFIG_DRM_AMDGPU_CIK
60#include "cik.h"
61#endif
62#include "vi.h"
63#include "soc15.h"
64#include "nv.h"
65#include "bif/bif_4_1_d.h"
66#include <linux/firmware.h>
67#include "amdgpu_vf_error.h"
68
69#include "amdgpu_amdkfd.h"
70#include "amdgpu_pm.h"
71
72#include "amdgpu_xgmi.h"
73#include "amdgpu_ras.h"
74#include "amdgpu_pmu.h"
75#include "amdgpu_fru_eeprom.h"
76#include "amdgpu_reset.h"
77#include "amdgpu_virt.h"
78#include "amdgpu_dev_coredump.h"
79
80#include <linux/suspend.h>
81#include <drm/task_barrier.h>
82#include <linux/pm_runtime.h>
83
84#include <drm/drm_drv.h>
85
86#if IS_ENABLED(CONFIG_X86)
87#include <asm/intel-family.h>
88#endif
89
90MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
91MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
92MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
93MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
94MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
95MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
96MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
97
98#define AMDGPU_RESUME_MS 2000
99#define AMDGPU_MAX_RETRY_LIMIT 2
100#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
101#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
102#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
103#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
104
105static const struct drm_driver amdgpu_kms_driver;
106
107const char *amdgpu_asic_name[] = {
108 "TAHITI",
109 "PITCAIRN",
110 "VERDE",
111 "OLAND",
112 "HAINAN",
113 "BONAIRE",
114 "KAVERI",
115 "KABINI",
116 "HAWAII",
117 "MULLINS",
118 "TOPAZ",
119 "TONGA",
120 "FIJI",
121 "CARRIZO",
122 "STONEY",
123 "POLARIS10",
124 "POLARIS11",
125 "POLARIS12",
126 "VEGAM",
127 "VEGA10",
128 "VEGA12",
129 "VEGA20",
130 "RAVEN",
131 "ARCTURUS",
132 "RENOIR",
133 "ALDEBARAN",
134 "NAVI10",
135 "CYAN_SKILLFISH",
136 "NAVI14",
137 "NAVI12",
138 "SIENNA_CICHLID",
139 "NAVY_FLOUNDER",
140 "VANGOGH",
141 "DIMGREY_CAVEFISH",
142 "BEIGE_GOBY",
143 "YELLOW_CARP",
144 "IP DISCOVERY",
145 "LAST",
146};
147
148#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
149/*
150 * Default init level where all blocks are expected to be initialized. This is
151 * the level of initialization expected by default and also after a full reset
152 * of the device.
153 */
154struct amdgpu_init_level amdgpu_init_default = {
155 .level = AMDGPU_INIT_LEVEL_DEFAULT,
156 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
157};
158
159struct amdgpu_init_level amdgpu_init_recovery = {
160 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
161 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
162};
163
164/*
165 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
166 * is used for cases like reset on initialization where the entire hive needs to
167 * be reset before first use.
168 */
169struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
170 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
171 .hwini_ip_block_mask =
172 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
173 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
174 BIT(AMD_IP_BLOCK_TYPE_PSP)
175};
176
177static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
178 enum amd_ip_block_type block)
179{
180 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
181}
182
183void amdgpu_set_init_level(struct amdgpu_device *adev,
184 enum amdgpu_init_lvl_id lvl)
185{
186 switch (lvl) {
187 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
188 adev->init_lvl = &amdgpu_init_minimal_xgmi;
189 break;
190 case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
191 adev->init_lvl = &amdgpu_init_recovery;
192 break;
193 case AMDGPU_INIT_LEVEL_DEFAULT:
194 fallthrough;
195 default:
196 adev->init_lvl = &amdgpu_init_default;
197 break;
198 }
199}
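/*
 * Illustrative sketch (not a new code path): during reset-on-init of an XGMI
 * hive the driver switches to the minimal level before touching the hardware,
 * after which only the blocks in hwini_ip_block_mask report as members:
 *
 *	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
 *	amdgpu_ip_member_of_hwini(adev, AMD_IP_BLOCK_TYPE_PSP); // true
 *	amdgpu_ip_member_of_hwini(adev, AMD_IP_BLOCK_TYPE_GFX); // false
 */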
200
201static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
202static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
203 void *data);
204
205/**
206 * DOC: pcie_replay_count
207 *
208 * The amdgpu driver provides a sysfs API for reporting the total number
209 * of PCIe replays (NAKs).
210 * The file pcie_replay_count is used for this and returns the total
211 * number of replays as a sum of the NAKs generated and NAKs received.
212 */
213
214static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
215 struct device_attribute *attr, char *buf)
216{
217 struct drm_device *ddev = dev_get_drvdata(dev);
218 struct amdgpu_device *adev = drm_to_adev(ddev);
219 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
220
221 return sysfs_emit(buf, "%llu\n", cnt);
222}
223
224static DEVICE_ATTR(pcie_replay_count, 0444,
225 amdgpu_device_get_pcie_replay_count, NULL);
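/*
 * Illustrative read from userspace; the sysfs path below is an assumption and
 * depends on the PCI address of the GPU:
 *
 *	$ cat /sys/bus/pci/devices/0000:03:00.0/pcie_replay_count
 */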
226
227static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
228 struct bin_attribute *attr, char *buf,
229 loff_t ppos, size_t count)
230{
231 struct device *dev = kobj_to_dev(kobj);
232 struct drm_device *ddev = dev_get_drvdata(dev);
233 struct amdgpu_device *adev = drm_to_adev(ddev);
234 ssize_t bytes_read;
235
236 switch (ppos) {
237 case AMDGPU_SYS_REG_STATE_XGMI:
238 bytes_read = amdgpu_asic_get_reg_state(
239 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
240 break;
241 case AMDGPU_SYS_REG_STATE_WAFL:
242 bytes_read = amdgpu_asic_get_reg_state(
243 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
244 break;
245 case AMDGPU_SYS_REG_STATE_PCIE:
246 bytes_read = amdgpu_asic_get_reg_state(
247 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
248 break;
249 case AMDGPU_SYS_REG_STATE_USR:
250 bytes_read = amdgpu_asic_get_reg_state(
251 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
252 break;
253 case AMDGPU_SYS_REG_STATE_USR_1:
254 bytes_read = amdgpu_asic_get_reg_state(
255 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
256 break;
257 default:
258 return -EINVAL;
259 }
260
261 return bytes_read;
262}
263
264BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
265 AMDGPU_SYS_REG_STATE_END);
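/*
 * The binary attribute above exposes a single "reg_state" file; the read
 * offset (ppos) selects which dump is returned (XGMI, WAFL, PCIE, USR or
 * USR_1). Illustrative userspace read, with an assumed sysfs path and the
 * enum value filled in by hand:
 *
 *	$ dd if=/sys/bus/pci/devices/0000:03:00.0/reg_state \
 *	     bs=1 skip=<AMDGPU_SYS_REG_STATE_PCIE> count=1024 | xxd
 */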
266
267int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
268{
269 int ret;
270
271 if (!amdgpu_asic_get_reg_state_supported(adev))
272 return 0;
273
274 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
275
276 return ret;
277}
278
279void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
280{
281 if (!amdgpu_asic_get_reg_state_supported(adev))
282 return;
283 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
284}
285
286int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
287{
288 int r;
289
290 if (ip_block->version->funcs->suspend) {
291 r = ip_block->version->funcs->suspend(ip_block);
292 if (r) {
293 dev_err(ip_block->adev->dev,
294 "suspend of IP block <%s> failed %d\n",
295 ip_block->version->funcs->name, r);
296 return r;
297 }
298 }
299
300 ip_block->status.hw = false;
301 return 0;
302}
303
304int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
305{
306 int r;
307
308 if (ip_block->version->funcs->resume) {
309 r = ip_block->version->funcs->resume(ip_block);
310 if (r) {
311 dev_err(ip_block->adev->dev,
312 "resume of IP block <%s> failed %d\n",
313 ip_block->version->funcs->name, r);
314 return r;
315 }
316 }
317
318 ip_block->status.hw = true;
319 return 0;
320}
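/*
 * Typical call pattern from suspend/resume handling elsewhere in the driver
 * (illustrative sketch only):
 *
 *	r = amdgpu_ip_block_suspend(ip_block);
 *	if (r)
 *		return r;
 *	...
 *	r = amdgpu_ip_block_resume(ip_block);
 *	if (r)
 *		return r;
 */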
321
322/**
323 * DOC: board_info
324 *
325 * The amdgpu driver provides a sysfs API for giving board related information.
326 * It provides the form factor information in the format
327 *
328 * type : form factor
329 *
330 * Possible form factor values
331 *
332 * - "cem" - PCIE CEM card
333 * - "oam" - Open Compute Accelerator Module
334 * - "unknown" - Not known
335 *
336 */
337
338static ssize_t amdgpu_device_get_board_info(struct device *dev,
339 struct device_attribute *attr,
340 char *buf)
341{
342 struct drm_device *ddev = dev_get_drvdata(dev);
343 struct amdgpu_device *adev = drm_to_adev(ddev);
344 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
345 const char *pkg;
346
347 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
348 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
349
350 switch (pkg_type) {
351 case AMDGPU_PKG_TYPE_CEM:
352 pkg = "cem";
353 break;
354 case AMDGPU_PKG_TYPE_OAM:
355 pkg = "oam";
356 break;
357 default:
358 pkg = "unknown";
359 break;
360 }
361
362 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
363}
364
365static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
366
367static struct attribute *amdgpu_board_attrs[] = {
368 &dev_attr_board_info.attr,
369 NULL,
370};
371
372static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
373 struct attribute *attr, int n)
374{
375 struct device *dev = kobj_to_dev(kobj);
376 struct drm_device *ddev = dev_get_drvdata(dev);
377 struct amdgpu_device *adev = drm_to_adev(ddev);
378
379 if (adev->flags & AMD_IS_APU)
380 return 0;
381
382 return attr->mode;
383}
384
385static const struct attribute_group amdgpu_board_attrs_group = {
386 .attrs = amdgpu_board_attrs,
387 .is_visible = amdgpu_board_attrs_is_visible
388};
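/*
 * Illustrative read from userspace; the attribute is hidden on APUs by
 * amdgpu_board_attrs_is_visible() above (the sysfs path is an assumption):
 *
 *	$ cat /sys/bus/pci/devices/0000:03:00.0/board_info
 *	type : oam
 */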
389
390static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
391
392
393/**
394 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
395 *
396 * @dev: drm_device pointer
397 *
398 * Returns true if the device is a dGPU with ATPX power control,
399 * otherwise return false.
400 */
401bool amdgpu_device_supports_px(struct drm_device *dev)
402{
403 struct amdgpu_device *adev = drm_to_adev(dev);
404
405 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
406 return true;
407 return false;
408}
409
410/**
411 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
412 *
413 * @dev: drm_device pointer
414 *
415 * Returns true if the device is a dGPU with ACPI power control,
416 * otherwise return false.
417 */
418bool amdgpu_device_supports_boco(struct drm_device *dev)
419{
420 struct amdgpu_device *adev = drm_to_adev(dev);
421
422 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
423 return false;
424
425 if (adev->has_pr3 ||
426 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
427 return true;
428 return false;
429}
430
431/**
432 * amdgpu_device_supports_baco - Does the device support BACO
433 *
434 * @dev: drm_device pointer
435 *
436 * Return:
437 * 1 if the device supports BACO;
438 * 3 if the device supports MACO (only works if BACO is supported)
439 * otherwise return 0.
440 */
441int amdgpu_device_supports_baco(struct drm_device *dev)
442{
443 struct amdgpu_device *adev = drm_to_adev(dev);
444
445 return amdgpu_asic_supports_baco(adev);
446}
447
448void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
449{
450 struct drm_device *dev;
451 int bamaco_support;
452
453 dev = adev_to_drm(adev);
454
455 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
456 bamaco_support = amdgpu_device_supports_baco(dev);
457
458 switch (amdgpu_runtime_pm) {
459 case 2:
460 if (bamaco_support & MACO_SUPPORT) {
461 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
462 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
463 } else if (bamaco_support == BACO_SUPPORT) {
464 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
465 dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
466 }
467 break;
468 case 1:
469 if (bamaco_support & BACO_SUPPORT) {
470 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
471 dev_info(adev->dev, "Forcing BACO for runtime pm\n");
472 }
473 break;
474 case -1:
475 case -2:
476 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
477 adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
478 dev_info(adev->dev, "Using ATPX for runtime pm\n");
479 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
480 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
481 dev_info(adev->dev, "Using BOCO for runtime pm\n");
482 } else {
483 if (!bamaco_support)
484 goto no_runtime_pm;
485
486 switch (adev->asic_type) {
487 case CHIP_VEGA20:
488 case CHIP_ARCTURUS:
489 /* BACO is not supported on vega20 and arcturus */
490 break;
491 case CHIP_VEGA10:
492 /* enable BACO as runpm mode if noretry=0 */
493 if (!adev->gmc.noretry)
494 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
495 break;
496 default:
497 /* enable BACO as runpm mode on CI+ */
498 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
499 break;
500 }
501
502 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
503 if (bamaco_support & MACO_SUPPORT) {
504 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
505 dev_info(adev->dev, "Using BAMACO for runtime pm\n");
506 } else {
507 dev_info(adev->dev, "Using BACO for runtime pm\n");
508 }
509 }
510 }
511 break;
512 case 0:
513 dev_info(adev->dev, "runtime pm is manually disabled\n");
514 break;
515 default:
516 break;
517 }
518
519no_runtime_pm:
520 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
521 dev_info(adev->dev, "Runtime PM not available\n");
522}
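/*
 * Illustrative sketch: the decision above is keyed off the amdgpu_runtime_pm
 * module parameter (exposed as "runpm"), e.g. forcing BAMACO where it is
 * supported:
 *
 *	# modprobe amdgpu runpm=2
 */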
523/**
524 * amdgpu_device_supports_smart_shift - Is the device dGPU with
525 * smart shift support
526 *
527 * @dev: drm_device pointer
528 *
529 * Returns true if the device is a dGPU with Smart Shift support,
530 * otherwise returns false.
531 */
532bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
533{
534 return (amdgpu_device_supports_boco(dev) &&
535 amdgpu_acpi_is_power_shift_control_supported());
536}
537
538/*
539 * VRAM access helper functions
540 */
541
542/**
543 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
544 *
545 * @adev: amdgpu_device pointer
546 * @pos: offset of the buffer in vram
547 * @buf: virtual address of the buffer in system memory
548 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
549 * @write: true - write to vram, otherwise - read from vram
550 */
551void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
552 void *buf, size_t size, bool write)
553{
554 unsigned long flags;
555 uint32_t hi = ~0, tmp = 0;
556 uint32_t *data = buf;
557 uint64_t last;
558 int idx;
559
560 if (!drm_dev_enter(adev_to_drm(adev), &idx))
561 return;
562
563 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
564
565 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
566 for (last = pos + size; pos < last; pos += 4) {
567 tmp = pos >> 31;
568
569 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
570 if (tmp != hi) {
571 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
572 hi = tmp;
573 }
574 if (write)
575 WREG32_NO_KIQ(mmMM_DATA, *data++);
576 else
577 *data++ = RREG32_NO_KIQ(mmMM_DATA);
578 }
579
580 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
581 drm_dev_exit(idx);
582}
583
584/**
585 * amdgpu_device_aper_access - access vram by vram aperture
586 *
587 * @adev: amdgpu_device pointer
588 * @pos: offset of the buffer in vram
589 * @buf: virtual address of the buffer in system memory
590 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
591 * @write: true - write to vram, otherwise - read from vram
592 *
593 * Returns the number of bytes transferred.
594 */
595size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
596 void *buf, size_t size, bool write)
597{
598#ifdef CONFIG_64BIT
599 void __iomem *addr;
600 size_t count = 0;
601 uint64_t last;
602
603 if (!adev->mman.aper_base_kaddr)
604 return 0;
605
606 last = min(pos + size, adev->gmc.visible_vram_size);
607 if (last > pos) {
608 addr = adev->mman.aper_base_kaddr + pos;
609 count = last - pos;
610
611 if (write) {
612 memcpy_toio(addr, buf, count);
613 /* Make sure HDP write cache flush happens without any reordering
614 * after the system memory contents are sent over PCIe to the device
615 */
616 mb();
617 amdgpu_device_flush_hdp(adev, NULL);
618 } else {
619 amdgpu_device_invalidate_hdp(adev, NULL);
620 /* Make sure HDP read cache is invalidated before issuing a read
621 * to the PCIe device
622 */
623 mb();
624 memcpy_fromio(buf, addr, count);
625 }
626
627 }
628
629 return count;
630#else
631 return 0;
632#endif
633}
634
635/**
636 * amdgpu_device_vram_access - read/write a buffer in vram
637 *
638 * @adev: amdgpu_device pointer
639 * @pos: offset of the buffer in vram
640 * @buf: virtual address of the buffer in system memory
641 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
642 * @write: true - write to vram, otherwise - read from vram
643 */
644void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
645 void *buf, size_t size, bool write)
646{
647 size_t count;
648
649 /* try using the vram aperture to access vram first */
650 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
651 size -= count;
652 if (size) {
653 /* use MM to access the rest of vram */
654 pos += count;
655 buf += count;
656 amdgpu_device_mm_access(adev, pos, buf, size, write);
657 }
658}
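/*
 * Illustrative sketch of a caller: read the first 4 KiB of VRAM into a kernel
 * buffer; anything outside the CPU-visible aperture falls back to
 * MM_INDEX/MM_DATA automatically:
 *
 *	u32 *tmp = kzalloc(4096, GFP_KERNEL);
 *
 *	if (tmp) {
 *		amdgpu_device_vram_access(adev, 0, tmp, 4096, false);
 *		kfree(tmp);
 *	}
 */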
659
660/*
661 * register access helper functions.
662 */
663
664/* Check if hw access should be skipped because of hotplug or device error */
665bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
666{
667 if (adev->no_hw_access)
668 return true;
669
670#ifdef CONFIG_LOCKDEP
671 /*
672 * This is a bit complicated to understand, so worth a comment. What we assert
673 * here is that the GPU reset is not running on another thread in parallel.
674 *
675 * For this we trylock the read side of the reset semaphore, if that succeeds
676 * we know that the reset is not running in parallel.
677 *
678 * If the trylock fails we assert that we are either already holding the read
679 * side of the lock or are the reset thread itself and hold the write side of
680 * the lock.
681 */
682 if (in_task()) {
683 if (down_read_trylock(&adev->reset_domain->sem))
684 up_read(&adev->reset_domain->sem);
685 else
686 lockdep_assert_held(&adev->reset_domain->sem);
687 }
688#endif
689 return false;
690}
691
692/**
693 * amdgpu_device_rreg - read a memory mapped IO or indirect register
694 *
695 * @adev: amdgpu_device pointer
696 * @reg: dword aligned register offset
697 * @acc_flags: access flags which require special behavior
698 *
699 * Returns the 32 bit value from the offset specified.
700 */
701uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
702 uint32_t reg, uint32_t acc_flags)
703{
704 uint32_t ret;
705
706 if (amdgpu_device_skip_hw_access(adev))
707 return 0;
708
709 if ((reg * 4) < adev->rmmio_size) {
710 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
711 amdgpu_sriov_runtime(adev) &&
712 down_read_trylock(&adev->reset_domain->sem)) {
713 ret = amdgpu_kiq_rreg(adev, reg, 0);
714 up_read(&adev->reset_domain->sem);
715 } else {
716 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
717 }
718 } else {
719 ret = adev->pcie_rreg(adev, reg * 4);
720 }
721
722 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
723
724 return ret;
725}
726
727/*
728 * MMIO register read helper function using a byte offset
729 * @offset: byte offset from MMIO start
730 */
731
732/**
733 * amdgpu_mm_rreg8 - read a memory mapped IO register
734 *
735 * @adev: amdgpu_device pointer
736 * @offset: byte aligned register offset
737 *
738 * Returns the 8 bit value from the offset specified.
739 */
740uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
741{
742 if (amdgpu_device_skip_hw_access(adev))
743 return 0;
744
745 if (offset < adev->rmmio_size)
746 return (readb(adev->rmmio + offset));
747 BUG();
748}
749
750
751/**
752 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
753 *
754 * @adev: amdgpu_device pointer
755 * @reg: dword aligned register offset
756 * @acc_flags: access flags which require special behavior
757 * @xcc_id: xcc accelerated compute core id
758 *
759 * Returns the 32 bit value from the offset specified.
760 */
761uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
762 uint32_t reg, uint32_t acc_flags,
763 uint32_t xcc_id)
764{
765 uint32_t ret, rlcg_flag;
766
767 if (amdgpu_device_skip_hw_access(adev))
768 return 0;
769
770 if ((reg * 4) < adev->rmmio_size) {
771 if (amdgpu_sriov_vf(adev) &&
772 !amdgpu_sriov_runtime(adev) &&
773 adev->gfx.rlc.rlcg_reg_access_supported &&
774 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
775 GC_HWIP, false,
776 &rlcg_flag)) {
777 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
778 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
779 amdgpu_sriov_runtime(adev) &&
780 down_read_trylock(&adev->reset_domain->sem)) {
781 ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
782 up_read(&adev->reset_domain->sem);
783 } else {
784 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
785 }
786 } else {
787 ret = adev->pcie_rreg(adev, reg * 4);
788 }
789
790 return ret;
791}
792
793/*
794 * MMIO register write helper function using a byte offset
795 * @offset: byte offset from MMIO start
796 * @value: the value to be written to the register
797 */
798
799/**
800 * amdgpu_mm_wreg8 - write a memory mapped IO register
801 *
802 * @adev: amdgpu_device pointer
803 * @offset: byte aligned register offset
804 * @value: 8 bit value to write
805 *
806 * Writes the value specified to the offset specified.
807 */
808void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
809{
810 if (amdgpu_device_skip_hw_access(adev))
811 return;
812
813 if (offset < adev->rmmio_size)
814 writeb(value, adev->rmmio + offset);
815 else
816 BUG();
817}
818
819/**
820 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
821 *
822 * @adev: amdgpu_device pointer
823 * @reg: dword aligned register offset
824 * @v: 32 bit value to write to the register
825 * @acc_flags: access flags which require special behavior
826 *
827 * Writes the value specified to the offset specified.
828 */
829void amdgpu_device_wreg(struct amdgpu_device *adev,
830 uint32_t reg, uint32_t v,
831 uint32_t acc_flags)
832{
833 if (amdgpu_device_skip_hw_access(adev))
834 return;
835
836 if ((reg * 4) < adev->rmmio_size) {
837 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
838 amdgpu_sriov_runtime(adev) &&
839 down_read_trylock(&adev->reset_domain->sem)) {
840 amdgpu_kiq_wreg(adev, reg, v, 0);
841 up_read(&adev->reset_domain->sem);
842 } else {
843 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
844 }
845 } else {
846 adev->pcie_wreg(adev, reg * 4, v);
847 }
848
849 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
850}
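/*
 * Note: most callers do not use these helpers directly; the RREG32()/WREG32()
 * macro families in amdgpu.h typically expand to amdgpu_device_rreg() and
 * amdgpu_device_wreg(), e.g. (illustrative):
 *
 *	tmp = RREG32(reg);
 *	WREG32(reg, tmp | 0x1);
 */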
851
852/**
853 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
854 *
855 * @adev: amdgpu_device pointer
856 * @reg: mmio/rlc register
857 * @v: value to write
858 * @xcc_id: xcc accelerated compute core id
859 *
860 * this function is invoked only for the debugfs register access
861 */
862void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
863 uint32_t reg, uint32_t v,
864 uint32_t xcc_id)
865{
866 if (amdgpu_device_skip_hw_access(adev))
867 return;
868
869 if (amdgpu_sriov_fullaccess(adev) &&
870 adev->gfx.rlc.funcs &&
871 adev->gfx.rlc.funcs->is_rlcg_access_range) {
872 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
873 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
874 } else if ((reg * 4) >= adev->rmmio_size) {
875 adev->pcie_wreg(adev, reg * 4, v);
876 } else {
877 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
878 }
879}
880
881/**
882 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
883 *
884 * @adev: amdgpu_device pointer
885 * @reg: dword aligned register offset
886 * @v: 32 bit value to write to the register
887 * @acc_flags: access flags which require special behavior
888 * @xcc_id: xcc accelerated compute core id
889 *
890 * Writes the value specified to the offset specified.
891 */
892void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
893 uint32_t reg, uint32_t v,
894 uint32_t acc_flags, uint32_t xcc_id)
895{
896 uint32_t rlcg_flag;
897
898 if (amdgpu_device_skip_hw_access(adev))
899 return;
900
901 if ((reg * 4) < adev->rmmio_size) {
902 if (amdgpu_sriov_vf(adev) &&
903 !amdgpu_sriov_runtime(adev) &&
904 adev->gfx.rlc.rlcg_reg_access_supported &&
905 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
906 GC_HWIP, true,
907 &rlcg_flag)) {
908 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
909 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
910 amdgpu_sriov_runtime(adev) &&
911 down_read_trylock(&adev->reset_domain->sem)) {
912 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
913 up_read(&adev->reset_domain->sem);
914 } else {
915 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
916 }
917 } else {
918 adev->pcie_wreg(adev, reg * 4, v);
919 }
920}
921
922/**
923 * amdgpu_device_indirect_rreg - read an indirect register
924 *
925 * @adev: amdgpu_device pointer
926 * @reg_addr: indirect register address to read from
927 *
928 * Returns the value of indirect register @reg_addr
929 */
930u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
931 u32 reg_addr)
932{
933 unsigned long flags, pcie_index, pcie_data;
934 void __iomem *pcie_index_offset;
935 void __iomem *pcie_data_offset;
936 u32 r;
937
938 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
939 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
940
941 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
942 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
943 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
944
945 writel(reg_addr, pcie_index_offset);
946 readl(pcie_index_offset);
947 r = readl(pcie_data_offset);
948 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
949
950 return r;
951}
952
953u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
954 u64 reg_addr)
955{
956 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
957 u32 r;
958 void __iomem *pcie_index_offset;
959 void __iomem *pcie_index_hi_offset;
960 void __iomem *pcie_data_offset;
961
962 if (unlikely(!adev->nbio.funcs)) {
963 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
964 pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
965 } else {
966 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
967 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
968 }
969
970 if (reg_addr >> 32) {
971 if (unlikely(!adev->nbio.funcs))
972 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
973 else
974 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
975 } else {
976 pcie_index_hi = 0;
977 }
978
979 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
980 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
981 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
982 if (pcie_index_hi != 0)
983 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
984 pcie_index_hi * 4;
985
986 writel(reg_addr, pcie_index_offset);
987 readl(pcie_index_offset);
988 if (pcie_index_hi != 0) {
989 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
990 readl(pcie_index_hi_offset);
991 }
992 r = readl(pcie_data_offset);
993
994 /* clear the high bits */
995 if (pcie_index_hi != 0) {
996 writel(0, pcie_index_hi_offset);
997 readl(pcie_index_hi_offset);
998 }
999
1000 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1001
1002 return r;
1003}
1004
1005/**
1006 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
1007 *
1008 * @adev: amdgpu_device pointer
1009 * @reg_addr: indirect register address to read from
1010 *
1011 * Returns the value of indirect register @reg_addr
1012 */
1013u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
1014 u32 reg_addr)
1015{
1016 unsigned long flags, pcie_index, pcie_data;
1017 void __iomem *pcie_index_offset;
1018 void __iomem *pcie_data_offset;
1019 u64 r;
1020
1021 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1022 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1023
1024 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1025 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1026 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1027
1028 /* read low 32 bits */
1029 writel(reg_addr, pcie_index_offset);
1030 readl(pcie_index_offset);
1031 r = readl(pcie_data_offset);
1032 /* read high 32 bits */
1033 writel(reg_addr + 4, pcie_index_offset);
1034 readl(pcie_index_offset);
1035 r |= ((u64)readl(pcie_data_offset) << 32);
1036 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1037
1038 return r;
1039}
1040
1041u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
1042 u64 reg_addr)
1043{
1044 unsigned long flags, pcie_index, pcie_data;
1045 unsigned long pcie_index_hi = 0;
1046 void __iomem *pcie_index_offset;
1047 void __iomem *pcie_index_hi_offset;
1048 void __iomem *pcie_data_offset;
1049 u64 r;
1050
1051 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1052 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1053 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1054 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1055
1056 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1057 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1058 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1059 if (pcie_index_hi != 0)
1060 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1061 pcie_index_hi * 4;
1062
1063 /* read low 32 bits */
1064 writel(reg_addr, pcie_index_offset);
1065 readl(pcie_index_offset);
1066 if (pcie_index_hi != 0) {
1067 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1068 readl(pcie_index_hi_offset);
1069 }
1070 r = readl(pcie_data_offset);
1071 /* read high 32 bits */
1072 writel(reg_addr + 4, pcie_index_offset);
1073 readl(pcie_index_offset);
1074 if (pcie_index_hi != 0) {
1075 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1076 readl(pcie_index_hi_offset);
1077 }
1078 r |= ((u64)readl(pcie_data_offset) << 32);
1079
1080 /* clear the high bits */
1081 if (pcie_index_hi != 0) {
1082 writel(0, pcie_index_hi_offset);
1083 readl(pcie_index_hi_offset);
1084 }
1085
1086 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1087
1088 return r;
1089}
1090
1091/**
1092 * amdgpu_device_indirect_wreg - write an indirect register address
1093 *
1094 * @adev: amdgpu_device pointer
1095 * @reg_addr: indirect register offset
1096 * @reg_data: indirect register data
1097 *
1098 */
1099void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1100 u32 reg_addr, u32 reg_data)
1101{
1102 unsigned long flags, pcie_index, pcie_data;
1103 void __iomem *pcie_index_offset;
1104 void __iomem *pcie_data_offset;
1105
1106 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1107 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1108
1109 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1110 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1111 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1112
1113 writel(reg_addr, pcie_index_offset);
1114 readl(pcie_index_offset);
1115 writel(reg_data, pcie_data_offset);
1116 readl(pcie_data_offset);
1117 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1118}
1119
1120void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1121 u64 reg_addr, u32 reg_data)
1122{
1123 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1124 void __iomem *pcie_index_offset;
1125 void __iomem *pcie_index_hi_offset;
1126 void __iomem *pcie_data_offset;
1127
1128 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1129 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1130 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1131 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1132 else
1133 pcie_index_hi = 0;
1134
1135 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1136 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1137 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1138 if (pcie_index_hi != 0)
1139 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1140 pcie_index_hi * 4;
1141
1142 writel(reg_addr, pcie_index_offset);
1143 readl(pcie_index_offset);
1144 if (pcie_index_hi != 0) {
1145 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1146 readl(pcie_index_hi_offset);
1147 }
1148 writel(reg_data, pcie_data_offset);
1149 readl(pcie_data_offset);
1150
1151 /* clear the high bits */
1152 if (pcie_index_hi != 0) {
1153 writel(0, pcie_index_hi_offset);
1154 readl(pcie_index_hi_offset);
1155 }
1156
1157 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1158}
1159
1160/**
1161 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
1162 *
1163 * @adev: amdgpu_device pointer
1164 * @reg_addr: indirect register offset
1165 * @reg_data: indirect register data
1166 *
1167 */
1168void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1169 u32 reg_addr, u64 reg_data)
1170{
1171 unsigned long flags, pcie_index, pcie_data;
1172 void __iomem *pcie_index_offset;
1173 void __iomem *pcie_data_offset;
1174
1175 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1176 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1177
1178 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1179 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1180 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1181
1182 /* write low 32 bits */
1183 writel(reg_addr, pcie_index_offset);
1184 readl(pcie_index_offset);
1185 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1186 readl(pcie_data_offset);
1187 /* write high 32 bits */
1188 writel(reg_addr + 4, pcie_index_offset);
1189 readl(pcie_index_offset);
1190 writel((u32)(reg_data >> 32), pcie_data_offset);
1191 readl(pcie_data_offset);
1192 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1193}
1194
1195void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1196 u64 reg_addr, u64 reg_data)
1197{
1198 unsigned long flags, pcie_index, pcie_data;
1199 unsigned long pcie_index_hi = 0;
1200 void __iomem *pcie_index_offset;
1201 void __iomem *pcie_index_hi_offset;
1202 void __iomem *pcie_data_offset;
1203
1204 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1205 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1206 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1207 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1208
1209 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1210 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1211 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1212 if (pcie_index_hi != 0)
1213 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1214 pcie_index_hi * 4;
1215
1216 /* write low 32 bits */
1217 writel(reg_addr, pcie_index_offset);
1218 readl(pcie_index_offset);
1219 if (pcie_index_hi != 0) {
1220 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1221 readl(pcie_index_hi_offset);
1222 }
1223 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1224 readl(pcie_data_offset);
1225 /* write high 32 bits */
1226 writel(reg_addr + 4, pcie_index_offset);
1227 readl(pcie_index_offset);
1228 if (pcie_index_hi != 0) {
1229 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1230 readl(pcie_index_hi_offset);
1231 }
1232 writel((u32)(reg_data >> 32), pcie_data_offset);
1233 readl(pcie_data_offset);
1234
1235 /* clear the high bits */
1236 if (pcie_index_hi != 0) {
1237 writel(0, pcie_index_hi_offset);
1238 readl(pcie_index_hi_offset);
1239 }
1240
1241 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1242}
1243
1244/**
1245 * amdgpu_device_get_rev_id - query device rev_id
1246 *
1247 * @adev: amdgpu_device pointer
1248 *
1249 * Return device rev_id
1250 */
1251u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1252{
1253 return adev->nbio.funcs->get_rev_id(adev);
1254}
1255
1256/**
1257 * amdgpu_invalid_rreg - dummy reg read function
1258 *
1259 * @adev: amdgpu_device pointer
1260 * @reg: offset of register
1261 *
1262 * Dummy register read function. Used for register blocks
1263 * that certain asics don't have (all asics).
1264 * Returns the value in the register.
1265 */
1266static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1267{
1268 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1269 BUG();
1270 return 0;
1271}
1272
1273static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1274{
1275 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1276 BUG();
1277 return 0;
1278}
1279
1280/**
1281 * amdgpu_invalid_wreg - dummy reg write function
1282 *
1283 * @adev: amdgpu_device pointer
1284 * @reg: offset of register
1285 * @v: value to write to the register
1286 *
1287 * Dummy register write function. Used for register blocks
1288 * that certain asics don't have (all asics).
1289 */
1290static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1291{
1292 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1293 reg, v);
1294 BUG();
1295}
1296
1297static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1298{
1299 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
1300 reg, v);
1301 BUG();
1302}
1303
1304/**
1305 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1306 *
1307 * @adev: amdgpu_device pointer
1308 * @reg: offset of register
1309 *
1310 * Dummy register read function. Used for register blocks
1311 * that certain asics don't have (all asics).
1312 * Returns the value in the register.
1313 */
1314static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1315{
1316 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1317 BUG();
1318 return 0;
1319}
1320
1321static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1322{
1323 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1324 BUG();
1325 return 0;
1326}
1327
1328/**
1329 * amdgpu_invalid_wreg64 - dummy reg write function
1330 *
1331 * @adev: amdgpu_device pointer
1332 * @reg: offset of register
1333 * @v: value to write to the register
1334 *
1335 * Dummy register write function. Used for register blocks
1336 * that certain asics don't have (all asics).
1337 */
1338static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1339{
1340 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1341 reg, v);
1342 BUG();
1343}
1344
1345static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1346{
1347 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1348 reg, v);
1349 BUG();
1350}
1351
1352/**
1353 * amdgpu_block_invalid_rreg - dummy reg read function
1354 *
1355 * @adev: amdgpu_device pointer
1356 * @block: offset of instance
1357 * @reg: offset of register
1358 *
1359 * Dummy register read function. Used for register blocks
1360 * that certain asics don't have (all asics).
1361 * Returns the value in the register.
1362 */
1363static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1364 uint32_t block, uint32_t reg)
1365{
1366 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1367 reg, block);
1368 BUG();
1369 return 0;
1370}
1371
1372/**
1373 * amdgpu_block_invalid_wreg - dummy reg write function
1374 *
1375 * @adev: amdgpu_device pointer
1376 * @block: offset of instance
1377 * @reg: offset of register
1378 * @v: value to write to the register
1379 *
1380 * Dummy register write function. Used for register blocks
1381 * that certain asics don't have (all asics).
1382 */
1383static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1384 uint32_t block,
1385 uint32_t reg, uint32_t v)
1386{
1387 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1388 reg, block, v);
1389 BUG();
1390}
1391
1392/**
1393 * amdgpu_device_asic_init - Wrapper for atom asic_init
1394 *
1395 * @adev: amdgpu_device pointer
1396 *
1397 * Does any asic specific work and then calls atom asic init.
1398 */
1399static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1400{
1401 int ret;
1402
1403 amdgpu_asic_pre_asic_init(adev);
1404
1405 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1406 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1407 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
1408 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1409 amdgpu_psp_wait_for_bootloader(adev);
1410 ret = amdgpu_atomfirmware_asic_init(adev, true);
1411 return ret;
1412 } else {
1413 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1414 }
1415
1416 return 0;
1417}
1418
1419/**
1420 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1421 *
1422 * @adev: amdgpu_device pointer
1423 *
1424 * Allocates a scratch page of VRAM for use by various things in the
1425 * driver.
1426 */
1427static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1428{
1429 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1430 AMDGPU_GEM_DOMAIN_VRAM |
1431 AMDGPU_GEM_DOMAIN_GTT,
1432 &adev->mem_scratch.robj,
1433 &adev->mem_scratch.gpu_addr,
1434 (void **)&adev->mem_scratch.ptr);
1435}
1436
1437/**
1438 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1439 *
1440 * @adev: amdgpu_device pointer
1441 *
1442 * Frees the VRAM scratch page.
1443 */
1444static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1445{
1446 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1447}
1448
1449/**
1450 * amdgpu_device_program_register_sequence - program an array of registers.
1451 *
1452 * @adev: amdgpu_device pointer
1453 * @registers: pointer to the register array
1454 * @array_size: size of the register array
1455 *
1456 * Programs an array of registers with AND and OR masks.
1457 * This is a helper for setting golden registers.
1458 */
1459void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1460 const u32 *registers,
1461 const u32 array_size)
1462{
1463 u32 tmp, reg, and_mask, or_mask;
1464 int i;
1465
1466 if (array_size % 3)
1467 return;
1468
1469 for (i = 0; i < array_size; i += 3) {
1470 reg = registers[i + 0];
1471 and_mask = registers[i + 1];
1472 or_mask = registers[i + 2];
1473
1474 if (and_mask == 0xffffffff) {
1475 tmp = or_mask;
1476 } else {
1477 tmp = RREG32(reg);
1478 tmp &= ~and_mask;
1479 if (adev->family >= AMDGPU_FAMILY_AI)
1480 tmp |= (or_mask & and_mask);
1481 else
1482 tmp |= or_mask;
1483 }
1484 WREG32(reg, tmp);
1485 }
1486}
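/*
 * Illustrative sketch of a golden register list; entries come in
 * (reg, and_mask, or_mask) triples and the register name here is hypothetical:
 *
 *	static const u32 golden_settings_example[] = {
 *		mmEXAMPLE_REG, 0x0000000f, 0x00000002,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *						ARRAY_SIZE(golden_settings_example));
 */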
1487
1488/**
1489 * amdgpu_device_pci_config_reset - reset the GPU
1490 *
1491 * @adev: amdgpu_device pointer
1492 *
1493 * Resets the GPU using the pci config reset sequence.
1494 * Only applicable to asics prior to vega10.
1495 */
1496void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1497{
1498 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1499}
1500
1501/**
1502 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1503 *
1504 * @adev: amdgpu_device pointer
1505 *
1506 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1507 */
1508int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1509{
1510 return pci_reset_function(adev->pdev);
1511}
1512
1513/*
1514 * amdgpu_device_wb_*()
1515 * Writeback is the method by which the GPU updates special pages in memory
1516 * with the status of certain GPU events (fences, ring pointers, etc.).
1517 */
1518
1519/**
1520 * amdgpu_device_wb_fini - Disable Writeback and free memory
1521 *
1522 * @adev: amdgpu_device pointer
1523 *
1524 * Disables Writeback and frees the Writeback memory (all asics).
1525 * Used at driver shutdown.
1526 */
1527static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1528{
1529 if (adev->wb.wb_obj) {
1530 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1531 &adev->wb.gpu_addr,
1532 (void **)&adev->wb.wb);
1533 adev->wb.wb_obj = NULL;
1534 }
1535}
1536
1537/**
1538 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1539 *
1540 * @adev: amdgpu_device pointer
1541 *
1542 * Initializes writeback and allocates writeback memory (all asics).
1543 * Used at driver startup.
1544 * Returns 0 on success or an -error on failure.
1545 */
1546static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1547{
1548 int r;
1549
1550 if (adev->wb.wb_obj == NULL) {
1551 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1552 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1553 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1554 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1555 (void **)&adev->wb.wb);
1556 if (r) {
1557 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1558 return r;
1559 }
1560
1561 adev->wb.num_wb = AMDGPU_MAX_WB;
1562 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1563
1564 /* clear wb memory */
1565 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1566 }
1567
1568 return 0;
1569}
1570
1571/**
1572 * amdgpu_device_wb_get - Allocate a wb entry
1573 *
1574 * @adev: amdgpu_device pointer
1575 * @wb: wb index
1576 *
1577 * Allocate a wb slot for use by the driver (all asics).
1578 * Returns 0 on success or -EINVAL on failure.
1579 */
1580int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1581{
1582 unsigned long flags, offset;
1583
1584 spin_lock_irqsave(&adev->wb.lock, flags);
1585 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1586 if (offset < adev->wb.num_wb) {
1587 __set_bit(offset, adev->wb.used);
1588 spin_unlock_irqrestore(&adev->wb.lock, flags);
1589 *wb = offset << 3; /* convert to dw offset */
1590 return 0;
1591 } else {
1592 spin_unlock_irqrestore(&adev->wb.lock, flags);
1593 return -EINVAL;
1594 }
1595}
1596
1597/**
1598 * amdgpu_device_wb_free - Free a wb entry
1599 *
1600 * @adev: amdgpu_device pointer
1601 * @wb: wb index
1602 *
1603 * Free a wb slot allocated for use by the driver (all asics)
1604 */
1605void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1606{
1607 unsigned long flags;
1608
1609 wb >>= 3;
1610 spin_lock_irqsave(&adev->wb.lock, flags);
1611 if (wb < adev->wb.num_wb)
1612 __clear_bit(wb, adev->wb.used);
1613 spin_unlock_irqrestore(&adev->wb.lock, flags);
1614}
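/*
 * Illustrative sketch of a writeback user: allocate a slot, derive the CPU and
 * GPU addresses from the returned dword offset, and release it on teardown:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *		...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */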
1615
1616/**
1617 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1618 *
1619 * @adev: amdgpu_device pointer
1620 *
1621 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1622 * to fail, but if any of the BARs is not accessible after the resize we abort
1623 * driver loading by returning -ENODEV.
1624 */
1625int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1626{
1627 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1628 struct pci_bus *root;
1629 struct resource *res;
1630 unsigned int i;
1631 u16 cmd;
1632 int r;
1633
1634 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1635 return 0;
1636
1637 /* Bypass for VF */
1638 if (amdgpu_sriov_vf(adev))
1639 return 0;
1640
1641 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1642 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1643 DRM_WARN("System can't access extended configuration space, please check!!\n");
1644
1645 /* skip if the bios has already enabled large BAR */
1646 if (adev->gmc.real_vram_size &&
1647 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1648 return 0;
1649
1650 /* Check if the root BUS has 64bit memory resources */
1651 root = adev->pdev->bus;
1652 while (root->parent)
1653 root = root->parent;
1654
1655 pci_bus_for_each_resource(root, res, i) {
1656 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1657 res->start > 0x100000000ull)
1658 break;
1659 }
1660
1661 /* Trying to resize is pointless without a root hub window above 4GB */
1662 if (!res)
1663 return 0;
1664
1665 /* Limit the BAR size to what is available */
1666 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1667 rbar_size);
1668
1669 /* Disable memory decoding while we change the BAR addresses and size */
1670 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1671 pci_write_config_word(adev->pdev, PCI_COMMAND,
1672 cmd & ~PCI_COMMAND_MEMORY);
1673
1674 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1675 amdgpu_doorbell_fini(adev);
1676 if (adev->asic_type >= CHIP_BONAIRE)
1677 pci_release_resource(adev->pdev, 2);
1678
1679 pci_release_resource(adev->pdev, 0);
1680
1681 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1682 if (r == -ENOSPC)
1683 DRM_INFO("Not enough PCI address space for a large BAR.");
1684 else if (r && r != -ENOTSUPP)
1685 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1686
1687 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1688
1689 /* When the doorbell or fb BAR isn't available we have no chance of
1690 * using the device.
1691 */
1692 r = amdgpu_doorbell_init(adev);
1693 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1694 return -ENODEV;
1695
1696 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1697
1698 return 0;
1699}
1700
1701static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1702{
1703 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1704 return false;
1705
1706 return true;
1707}
1708
1709/*
1710 * GPU helpers function.
1711 */
1712/**
1713 * amdgpu_device_need_post - check if the hw needs post or not
1714 *
1715 * @adev: amdgpu_device pointer
1716 *
1717 * Check if the asic has been initialized (all asics) at driver startup,
1718 * or whether post is needed after a hw reset has been performed.
1719 * Returns true if post is needed or false if not.
1720 */
1721bool amdgpu_device_need_post(struct amdgpu_device *adev)
1722{
1723 uint32_t reg;
1724
1725 if (amdgpu_sriov_vf(adev))
1726 return false;
1727
1728 if (!amdgpu_device_read_bios(adev))
1729 return false;
1730
1731 if (amdgpu_passthrough(adev)) {
1732 /* for FIJI: In the whole GPU pass-through virtualization case, after a VM reboot
1733 * some old SMC firmware still needs the driver to do vPost, otherwise the GPU
1734 * hangs. SMC firmware versions above 22.15 don't have this flaw, so we force
1735 * vPost for SMC versions below 22.15.
1736 */
1737 if (adev->asic_type == CHIP_FIJI) {
1738 int err;
1739 uint32_t fw_ver;
1740
1741 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1742 /* force vPost if error occurred */
1743 if (err)
1744 return true;
1745
1746 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1747 release_firmware(adev->pm.fw);
1748 if (fw_ver < 0x00160e00)
1749 return true;
1750 }
1751 }
1752
1753 /* Don't post if we need to reset whole hive on init */
1754 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
1755 return false;
1756
1757 if (adev->has_hw_reset) {
1758 adev->has_hw_reset = false;
1759 return true;
1760 }
1761
1762 /* bios scratch used on CIK+ */
1763 if (adev->asic_type >= CHIP_BONAIRE)
1764 return amdgpu_atombios_scratch_need_asic_init(adev);
1765
1766 /* check MEM_SIZE for older asics */
1767 reg = amdgpu_asic_get_config_memsize(adev);
1768
1769 if ((reg != 0) && (reg != 0xffffffff))
1770 return false;
1771
1772 return true;
1773}
1774
1775/*
1776 * Check whether seamless boot is supported.
1777 *
1778 * So far we only support seamless boot on DCE 3.0 or later.
1779 * If users report that it works on older ASICs as well, we may
1780 * loosen this.
1781 */
1782bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1783{
1784 switch (amdgpu_seamless) {
1785 case -1:
1786 break;
1787 case 1:
1788 return true;
1789 case 0:
1790 return false;
1791 default:
1792 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1793 amdgpu_seamless);
1794 return false;
1795 }
1796
1797 if (!(adev->flags & AMD_IS_APU))
1798 return false;
1799
1800 if (adev->mman.keep_stolen_vga_memory)
1801 return false;
1802
1803 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1804}
1805
1806/*
1807 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1808 * don't support dynamic speed switching. Until we have confirmation from Intel
1809 * that a specific host supports it, it's safer that we keep it disabled for all.
1810 *
1811 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1812 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1813 */
1814static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1815{
1816#if IS_ENABLED(CONFIG_X86)
1817 struct cpuinfo_x86 *c = &cpu_data(0);
1818
1819 /* eGPU change speeds based on USB4 fabric conditions */
1820 if (dev_is_removable(adev->dev))
1821 return true;
1822
1823 if (c->x86_vendor == X86_VENDOR_INTEL)
1824 return false;
1825#endif
1826 return true;
1827}
1828
1829/**
1830 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1831 *
1832 * @adev: amdgpu_device pointer
1833 *
1834 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1835 * be set for this device.
1836 *
1837 * Returns true if it should be used or false if not.
1838 */
1839bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1840{
1841 switch (amdgpu_aspm) {
1842 case -1:
1843 break;
1844 case 0:
1845 return false;
1846 case 1:
1847 return true;
1848 default:
1849 return false;
1850 }
1851 if (adev->flags & AMD_IS_APU)
1852 return false;
1853 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
1854 return false;
1855 return pcie_aspm_enabled(adev->pdev);
1856}
1857
1858/* if we get transitioned to only one device, take VGA back */
1859/**
1860 * amdgpu_device_vga_set_decode - enable/disable vga decode
1861 *
1862 * @pdev: PCI device pointer
1863 * @state: enable/disable vga decode
1864 *
1865 * Enable/disable vga decode (all asics).
1866 * Returns VGA resource flags.
1867 */
1868static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1869 bool state)
1870{
1871 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1872
1873 amdgpu_asic_set_vga_state(adev, state);
1874 if (state)
1875 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1876 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1877 else
1878 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1879}
1880
1881/**
1882 * amdgpu_device_check_block_size - validate the vm block size
1883 *
1884 * @adev: amdgpu_device pointer
1885 *
1886 * Validates the vm block size specified via module parameter.
1887 * The vm block size defines number of bits in page table versus page directory,
1888 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1889 * page table and the remaining bits are in the page directory.
1890 */
1891static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1892{
1893 /* defines number of bits in page table versus page directory,
1894 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1895 * page table and the remaining bits are in the page directory
1896 */
1897 if (amdgpu_vm_block_size == -1)
1898 return;
1899
1900 if (amdgpu_vm_block_size < 9) {
1901 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1902 amdgpu_vm_block_size);
1903 amdgpu_vm_block_size = -1;
1904 }
1905}
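
/*
 * Worked example (illustrative, derived from the comment above): with 4KB
 * pages the low 12 bits address bytes within a page, so a vm_block_size of 9
 * makes each page-directory entry cover 2^(12 + 9) = 2MB of address space;
 * larger values move more bits into the page table and make each directory
 * entry cover correspondingly more.
 */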
1906
1907/**
1908 * amdgpu_device_check_vm_size - validate the vm size
1909 *
1910 * @adev: amdgpu_device pointer
1911 *
1912 * Validates the vm size in GB specified via module parameter.
1913 * The VM size is the size of the GPU virtual memory space in GB.
1914 */
1915static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1916{
1917 /* no need to check the default value */
1918 if (amdgpu_vm_size == -1)
1919 return;
1920
1921 if (amdgpu_vm_size < 1) {
1922 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1923 amdgpu_vm_size);
1924 amdgpu_vm_size = -1;
1925 }
1926}
1927
1928static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1929{
1930 struct sysinfo si;
1931 bool is_os_64 = (sizeof(void *) == 8);
1932 uint64_t total_memory;
1933 uint64_t dram_size_seven_GB = 0x1B8000000;
1934 uint64_t dram_size_three_GB = 0xB8000000;
1935
1936 if (amdgpu_smu_memory_pool_size == 0)
1937 return;
1938
1939 if (!is_os_64) {
1940 DRM_WARN("Not 64-bit OS, feature not supported\n");
1941 goto def_value;
1942 }
1943 si_meminfo(&si);
1944 total_memory = (uint64_t)si.totalram * si.mem_unit;
1945
1946 if ((amdgpu_smu_memory_pool_size == 1) ||
1947 (amdgpu_smu_memory_pool_size == 2)) {
1948 if (total_memory < dram_size_three_GB)
1949 goto def_value1;
1950 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1951 (amdgpu_smu_memory_pool_size == 8)) {
1952 if (total_memory < dram_size_seven_GB)
1953 goto def_value1;
1954 } else {
1955 DRM_WARN("Smu memory pool size not supported\n");
1956 goto def_value;
1957 }
1958 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1959
1960 return;
1961
1962def_value1:
1963	DRM_WARN("Not enough system memory\n");
1964def_value:
1965 adev->pm.smu_prv_buffer_size = 0;
1966}
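
/*
 * Worked example (illustrative): amdgpu_smu_memory_pool_size is expressed in
 * 256MB units (the << 28 above), so values of 1/2/4/8 request pools of
 * 256MB/512MB/1GB/2GB. The 0xB8000000 and 0x1B8000000 thresholds sit just
 * below 3GiB and 7GiB of system RAM, which appears to be what the
 * "three_GB"/"seven_GB" names refer to.
 */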
1967
1968static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1969{
1970 if (!(adev->flags & AMD_IS_APU) ||
1971 adev->asic_type < CHIP_RAVEN)
1972 return 0;
1973
1974 switch (adev->asic_type) {
1975 case CHIP_RAVEN:
1976 if (adev->pdev->device == 0x15dd)
1977 adev->apu_flags |= AMD_APU_IS_RAVEN;
1978 if (adev->pdev->device == 0x15d8)
1979 adev->apu_flags |= AMD_APU_IS_PICASSO;
1980 break;
1981 case CHIP_RENOIR:
1982 if ((adev->pdev->device == 0x1636) ||
1983 (adev->pdev->device == 0x164c))
1984 adev->apu_flags |= AMD_APU_IS_RENOIR;
1985 else
1986 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1987 break;
1988 case CHIP_VANGOGH:
1989 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1990 break;
1991 case CHIP_YELLOW_CARP:
1992 break;
1993 case CHIP_CYAN_SKILLFISH:
1994 if ((adev->pdev->device == 0x13FE) ||
1995 (adev->pdev->device == 0x143F))
1996 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1997 break;
1998 default:
1999 break;
2000 }
2001
2002 return 0;
2003}
2004
2005/**
2006 * amdgpu_device_check_arguments - validate module params
2007 *
2008 * @adev: amdgpu_device pointer
2009 *
2010 * Validates certain module parameters and updates
2011 * the associated values used by the driver (all asics).
2012 */
2013static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
2014{
2015 int i;
2016
2017 if (amdgpu_sched_jobs < 4) {
2018 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
2019 amdgpu_sched_jobs);
2020 amdgpu_sched_jobs = 4;
2021 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
2022 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
2023 amdgpu_sched_jobs);
2024 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
2025 }
2026
2027 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
2028		/* gart size must be greater than or equal to 32M */
2029 dev_warn(adev->dev, "gart size (%d) too small\n",
2030 amdgpu_gart_size);
2031 amdgpu_gart_size = -1;
2032 }
2033
2034 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
2035		/* gtt size must be greater than or equal to 32M */
2036 dev_warn(adev->dev, "gtt size (%d) too small\n",
2037 amdgpu_gtt_size);
2038 amdgpu_gtt_size = -1;
2039 }
2040
2041 /* valid range is between 4 and 9 inclusive */
2042 if (amdgpu_vm_fragment_size != -1 &&
2043 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
2044		dev_warn(adev->dev, "vm fragment size valid range is between 4 and 9\n");
2045 amdgpu_vm_fragment_size = -1;
2046 }
2047
2048 if (amdgpu_sched_hw_submission < 2) {
2049 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
2050 amdgpu_sched_hw_submission);
2051 amdgpu_sched_hw_submission = 2;
2052 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
2053 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
2054 amdgpu_sched_hw_submission);
2055 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
2056 }
2057
2058 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
2059 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
2060 amdgpu_reset_method = -1;
2061 }
2062
2063 amdgpu_device_check_smu_prv_buffer_size(adev);
2064
2065 amdgpu_device_check_vm_size(adev);
2066
2067 amdgpu_device_check_block_size(adev);
2068
2069 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2070
2071 for (i = 0; i < MAX_XCP; i++)
2072 adev->enforce_isolation[i] = !!enforce_isolation;
2073
2074 return 0;
2075}
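
/*
 * Worked example (illustrative): because of the power-of-two clamping above,
 * amdgpu.sched_jobs=6 is rounded up to 8 and amdgpu.sched_hw_submission=3 is
 * rounded up to 4, while values below the respective minimums (4 and 2) are
 * raised to those minimums instead.
 */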
2076
2077/**
2078 * amdgpu_switcheroo_set_state - set switcheroo state
2079 *
2080 * @pdev: pci dev pointer
2081 * @state: vga_switcheroo state
2082 *
2083 * Callback for the switcheroo driver. Suspends or resumes
2084 * the asics before or after it is powered up using ACPI methods.
2085 */
2086static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2087 enum vga_switcheroo_state state)
2088{
2089 struct drm_device *dev = pci_get_drvdata(pdev);
2090 int r;
2091
2092 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2093 return;
2094
2095 if (state == VGA_SWITCHEROO_ON) {
2096 pr_info("switched on\n");
2097 /* don't suspend or resume card normally */
2098 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2099
2100 pci_set_power_state(pdev, PCI_D0);
2101 amdgpu_device_load_pci_state(pdev);
2102 r = pci_enable_device(pdev);
2103 if (r)
2104 DRM_WARN("pci_enable_device failed (%d)\n", r);
2105 amdgpu_device_resume(dev, true);
2106
2107 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2108 } else {
2109 pr_info("switched off\n");
2110 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2111 amdgpu_device_prepare(dev);
2112 amdgpu_device_suspend(dev, true);
2113 amdgpu_device_cache_pci_state(pdev);
2114 /* Shut down the device */
2115 pci_disable_device(pdev);
2116 pci_set_power_state(pdev, PCI_D3cold);
2117 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2118 }
2119}
2120
2121/**
2122 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2123 *
2124 * @pdev: pci dev pointer
2125 *
2126 * Callback for the switcheroo driver. Checks whether the switcheroo
2127 * state can be changed.
2128 * Returns true if the state can be changed, false if not.
2129 */
2130static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2131{
2132 struct drm_device *dev = pci_get_drvdata(pdev);
2133
2134 /*
2135 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2136 * locking inversion with the driver load path. And the access here is
2137 * completely racy anyway. So don't bother with locking for now.
2138 */
2139 return atomic_read(&dev->open_count) == 0;
2140}
2141
2142static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2143 .set_gpu_state = amdgpu_switcheroo_set_state,
2144 .reprobe = NULL,
2145 .can_switch = amdgpu_switcheroo_can_switch,
2146};
2147
2148/**
2149 * amdgpu_device_ip_set_clockgating_state - set the CG state
2150 *
2151 * @dev: amdgpu_device pointer
2152 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2153 * @state: clockgating state (gate or ungate)
2154 *
2155 * Sets the requested clockgating state for all instances of
2156 * the hardware IP specified.
2157 * Returns the error code from the last instance.
2158 */
2159int amdgpu_device_ip_set_clockgating_state(void *dev,
2160 enum amd_ip_block_type block_type,
2161 enum amd_clockgating_state state)
2162{
2163 struct amdgpu_device *adev = dev;
2164 int i, r = 0;
2165
2166 for (i = 0; i < adev->num_ip_blocks; i++) {
2167 if (!adev->ip_blocks[i].status.valid)
2168 continue;
2169 if (adev->ip_blocks[i].version->type != block_type)
2170 continue;
2171 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2172 continue;
2173 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2174 &adev->ip_blocks[i], state);
2175 if (r)
2176 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2177 adev->ip_blocks[i].version->funcs->name, r);
2178 }
2179 return r;
2180}
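
/*
 * Usage sketch (illustrative; the IP type and state shown are only an
 * example):
 *
 *   amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *                                          AMD_CG_STATE_GATE);
 *
 * Any IP block of a different type, or without a set_clockgating_state
 * callback, is simply skipped by the loop above.
 */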
2181
2182/**
2183 * amdgpu_device_ip_set_powergating_state - set the PG state
2184 *
2185 * @dev: amdgpu_device pointer
2186 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2187 * @state: powergating state (gate or ungate)
2188 *
2189 * Sets the requested powergating state for all instances of
2190 * the hardware IP specified.
2191 * Returns the error code from the last instance.
2192 */
2193int amdgpu_device_ip_set_powergating_state(void *dev,
2194 enum amd_ip_block_type block_type,
2195 enum amd_powergating_state state)
2196{
2197 struct amdgpu_device *adev = dev;
2198 int i, r = 0;
2199
2200 for (i = 0; i < adev->num_ip_blocks; i++) {
2201 if (!adev->ip_blocks[i].status.valid)
2202 continue;
2203 if (adev->ip_blocks[i].version->type != block_type)
2204 continue;
2205 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2206 continue;
2207 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2208 &adev->ip_blocks[i], state);
2209 if (r)
2210 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2211 adev->ip_blocks[i].version->funcs->name, r);
2212 }
2213 return r;
2214}
2215
2216/**
2217 * amdgpu_device_ip_get_clockgating_state - get the CG state
2218 *
2219 * @adev: amdgpu_device pointer
2220 * @flags: clockgating feature flags
2221 *
2222 * Walks the list of IPs on the device and updates the clockgating
2223 * flags for each IP.
2224 * Updates @flags with the feature flags for each hardware IP where
2225 * clockgating is enabled.
2226 */
2227void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2228 u64 *flags)
2229{
2230 int i;
2231
2232 for (i = 0; i < adev->num_ip_blocks; i++) {
2233 if (!adev->ip_blocks[i].status.valid)
2234 continue;
2235 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2236 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
2237 }
2238}
2239
2240/**
2241 * amdgpu_device_ip_wait_for_idle - wait for idle
2242 *
2243 * @adev: amdgpu_device pointer
2244 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2245 *
2246 * Waits for the requested hardware IP to be idle.
2247 * Returns 0 for success or a negative error code on failure.
2248 */
2249int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2250 enum amd_ip_block_type block_type)
2251{
2252 int i, r;
2253
2254 for (i = 0; i < adev->num_ip_blocks; i++) {
2255 if (!adev->ip_blocks[i].status.valid)
2256 continue;
2257 if (adev->ip_blocks[i].version->type == block_type) {
2258 if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2259 r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2260 &adev->ip_blocks[i]);
2261 if (r)
2262 return r;
2263 }
2264 break;
2265 }
2266 }
2267 return 0;
2268
2269}
2270
2271/**
2272 * amdgpu_device_ip_is_valid - is the hardware IP enabled
2273 *
2274 * @adev: amdgpu_device pointer
2275 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2276 *
2277 * Check whether the hardware IP is enabled or not.
2278 * Returns true if the IP is enabled, false if not.
2279 */
2280bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2281 enum amd_ip_block_type block_type)
2282{
2283 int i;
2284
2285 for (i = 0; i < adev->num_ip_blocks; i++) {
2286 if (adev->ip_blocks[i].version->type == block_type)
2287 return adev->ip_blocks[i].status.valid;
2288 }
2289 return false;
2290
2291}
2292
2293/**
2294 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2295 *
2296 * @adev: amdgpu_device pointer
2297 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2298 *
2299 * Returns a pointer to the hardware IP block structure
2300 * if it exists for the asic, otherwise NULL.
2301 */
2302struct amdgpu_ip_block *
2303amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2304 enum amd_ip_block_type type)
2305{
2306 int i;
2307
2308 for (i = 0; i < adev->num_ip_blocks; i++)
2309 if (adev->ip_blocks[i].version->type == type)
2310 return &adev->ip_blocks[i];
2311
2312 return NULL;
2313}
2314
2315/**
2316 * amdgpu_device_ip_block_version_cmp - compare the version of an IP block
2317 *
2318 * @adev: amdgpu_device pointer
2319 * @type: enum amd_ip_block_type
2320 * @major: major version
2321 * @minor: minor version
2322 *
2323 * Returns 0 if the installed version is equal to or greater than @major.@minor,
2324 * or 1 if it is smaller or the ip_block doesn't exist.
2325 */
2326int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2327 enum amd_ip_block_type type,
2328 u32 major, u32 minor)
2329{
2330 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2331
2332 if (ip_block && ((ip_block->version->major > major) ||
2333 ((ip_block->version->major == major) &&
2334 (ip_block->version->minor >= minor))))
2335 return 0;
2336
2337 return 1;
2338}
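
/*
 * Usage sketch (illustrative; the block type and version numbers are only an
 * example):
 *
 *   if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GMC,
 *                                           9, 0))
 *           the asic has a GMC block at version 9.0 or newer
 *
 * A return of 1 means the block is either older than the requested version or
 * not present at all, so callers cannot tell those two cases apart here.
 */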
2339
2340/**
2341 * amdgpu_device_ip_block_add - add an IP block to the device
2342 *
2343 * @adev: amdgpu_device pointer
2344 * @ip_block_version: pointer to the IP to add
2345 *
2346 * Adds the IP block driver information to the collection of IPs
2347 * on the asic.
2348 */
2349int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2350 const struct amdgpu_ip_block_version *ip_block_version)
2351{
2352 if (!ip_block_version)
2353 return -EINVAL;
2354
2355 switch (ip_block_version->type) {
2356 case AMD_IP_BLOCK_TYPE_VCN:
2357 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2358 return 0;
2359 break;
2360 case AMD_IP_BLOCK_TYPE_JPEG:
2361 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2362 return 0;
2363 break;
2364 default:
2365 break;
2366 }
2367
2368 dev_info(adev->dev, "detected ip block number %d <%s>\n",
2369 adev->num_ip_blocks, ip_block_version->funcs->name);
2370
2371 adev->ip_blocks[adev->num_ip_blocks].adev = adev;
2372
2373 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2374
2375 return 0;
2376}
2377
2378/**
2379 * amdgpu_device_enable_virtual_display - enable virtual display feature
2380 *
2381 * @adev: amdgpu_device pointer
2382 *
2383 * Enables the virtual display feature if the user has enabled it via
2384 * the module parameter virtual_display. This feature provides virtual
2385 * display hardware on headless boards or in virtualized environments.
2386 * This function parses and validates the configuration string specified by
2387 * the user and applies the virtual display configuration (number of
2388 * virtual connectors, crtcs, etc.) it specifies.
2389 */
2390static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2391{
2392 adev->enable_virtual_display = false;
2393
2394 if (amdgpu_virtual_display) {
2395 const char *pci_address_name = pci_name(adev->pdev);
2396 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2397
2398 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2399 pciaddstr_tmp = pciaddstr;
2400 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2401 pciaddname = strsep(&pciaddname_tmp, ",");
2402 if (!strcmp("all", pciaddname)
2403 || !strcmp(pci_address_name, pciaddname)) {
2404 long num_crtc;
2405 int res = -1;
2406
2407 adev->enable_virtual_display = true;
2408
2409 if (pciaddname_tmp)
2410 res = kstrtol(pciaddname_tmp, 10,
2411 &num_crtc);
2412
2413 if (!res) {
2414 if (num_crtc < 1)
2415 num_crtc = 1;
2416 if (num_crtc > 6)
2417 num_crtc = 6;
2418 adev->mode_info.num_crtc = num_crtc;
2419 } else {
2420 adev->mode_info.num_crtc = 1;
2421 }
2422 break;
2423 }
2424 }
2425
2426 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2427 amdgpu_virtual_display, pci_address_name,
2428 adev->enable_virtual_display, adev->mode_info.num_crtc);
2429
2430 kfree(pciaddstr);
2431 }
2432}
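
/*
 * Parameter format sketch (illustrative, derived from the strsep() parsing
 * above; the PCI addresses are made up):
 *
 *   amdgpu.virtual_display=0000:01:00.0,2;0000:02:00.0,1
 *
 * i.e. a ';'-separated list of "<pci address>[,<num_crtc>]" entries, where
 * "all" matches every device and num_crtc is clamped to the range 1-6.
 */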
2433
2434void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2435{
2436 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2437 adev->mode_info.num_crtc = 1;
2438 adev->enable_virtual_display = true;
2439 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2440 adev->enable_virtual_display, adev->mode_info.num_crtc);
2441 }
2442}
2443
2444/**
2445 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2446 *
2447 * @adev: amdgpu_device pointer
2448 *
2449 * Parses the asic configuration parameters specified in the gpu info
2450 * firmware and makes them available to the driver for use in configuring
2451 * the asic.
2452 * Returns 0 on success, -EINVAL on failure.
2453 */
2454static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2455{
2456 const char *chip_name;
2457 int err;
2458 const struct gpu_info_firmware_header_v1_0 *hdr;
2459
2460 adev->firmware.gpu_info_fw = NULL;
2461
2462 if (adev->mman.discovery_bin)
2463 return 0;
2464
2465 switch (adev->asic_type) {
2466 default:
2467 return 0;
2468 case CHIP_VEGA10:
2469 chip_name = "vega10";
2470 break;
2471 case CHIP_VEGA12:
2472 chip_name = "vega12";
2473 break;
2474 case CHIP_RAVEN:
2475 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2476 chip_name = "raven2";
2477 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2478 chip_name = "picasso";
2479 else
2480 chip_name = "raven";
2481 break;
2482 case CHIP_ARCTURUS:
2483 chip_name = "arcturus";
2484 break;
2485 case CHIP_NAVI12:
2486 chip_name = "navi12";
2487 break;
2488 }
2489
2490 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2491 AMDGPU_UCODE_OPTIONAL,
2492 "amdgpu/%s_gpu_info.bin", chip_name);
2493 if (err) {
2494 dev_err(adev->dev,
2495 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2496 chip_name);
2497 goto out;
2498 }
2499
2500 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2501 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2502
2503 switch (hdr->version_major) {
2504 case 1:
2505 {
2506 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2507 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2508 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2509
2510 /*
2511 * Should be dropped when DAL no longer needs it.
2512 */
2513 if (adev->asic_type == CHIP_NAVI12)
2514 goto parse_soc_bounding_box;
2515
2516 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2517 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2518 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2519 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2520 adev->gfx.config.max_texture_channel_caches =
2521 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2522 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2523 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2524 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2525 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2526 adev->gfx.config.double_offchip_lds_buf =
2527 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2528 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2529 adev->gfx.cu_info.max_waves_per_simd =
2530 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2531 adev->gfx.cu_info.max_scratch_slots_per_cu =
2532 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2533 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2534 if (hdr->version_minor >= 1) {
2535 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2536 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2537 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2538 adev->gfx.config.num_sc_per_sh =
2539 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2540 adev->gfx.config.num_packer_per_sc =
2541 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2542 }
2543
2544parse_soc_bounding_box:
2545 /*
2546		 * soc bounding box info is not integrated into the discovery table,
2547		 * so we always need to parse it from the gpu info firmware when needed.
2548 */
2549 if (hdr->version_minor == 2) {
2550 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2551 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2552 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2553 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2554 }
2555 break;
2556 }
2557 default:
2558 dev_err(adev->dev,
2559 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2560 err = -EINVAL;
2561 goto out;
2562 }
2563out:
2564 return err;
2565}
2566
2567/**
2568 * amdgpu_device_ip_early_init - run early init for hardware IPs
2569 *
2570 * @adev: amdgpu_device pointer
2571 *
2572 * Early initialization pass for hardware IPs. The hardware IPs that make
2573 * up each asic are discovered and each IP's early_init callback is run. This
2574 * is the first stage in initializing the asic.
2575 * Returns 0 on success, negative error code on failure.
2576 */
2577static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2578{
2579 struct amdgpu_ip_block *ip_block;
2580 struct pci_dev *parent;
2581 int i, r;
2582 bool total;
2583
2584 amdgpu_device_enable_virtual_display(adev);
2585
2586 if (amdgpu_sriov_vf(adev)) {
2587 r = amdgpu_virt_request_full_gpu(adev, true);
2588 if (r)
2589 return r;
2590 }
2591
2592 switch (adev->asic_type) {
2593#ifdef CONFIG_DRM_AMDGPU_SI
2594 case CHIP_VERDE:
2595 case CHIP_TAHITI:
2596 case CHIP_PITCAIRN:
2597 case CHIP_OLAND:
2598 case CHIP_HAINAN:
2599 adev->family = AMDGPU_FAMILY_SI;
2600 r = si_set_ip_blocks(adev);
2601 if (r)
2602 return r;
2603 break;
2604#endif
2605#ifdef CONFIG_DRM_AMDGPU_CIK
2606 case CHIP_BONAIRE:
2607 case CHIP_HAWAII:
2608 case CHIP_KAVERI:
2609 case CHIP_KABINI:
2610 case CHIP_MULLINS:
2611 if (adev->flags & AMD_IS_APU)
2612 adev->family = AMDGPU_FAMILY_KV;
2613 else
2614 adev->family = AMDGPU_FAMILY_CI;
2615
2616 r = cik_set_ip_blocks(adev);
2617 if (r)
2618 return r;
2619 break;
2620#endif
2621 case CHIP_TOPAZ:
2622 case CHIP_TONGA:
2623 case CHIP_FIJI:
2624 case CHIP_POLARIS10:
2625 case CHIP_POLARIS11:
2626 case CHIP_POLARIS12:
2627 case CHIP_VEGAM:
2628 case CHIP_CARRIZO:
2629 case CHIP_STONEY:
2630 if (adev->flags & AMD_IS_APU)
2631 adev->family = AMDGPU_FAMILY_CZ;
2632 else
2633 adev->family = AMDGPU_FAMILY_VI;
2634
2635 r = vi_set_ip_blocks(adev);
2636 if (r)
2637 return r;
2638 break;
2639 default:
2640 r = amdgpu_discovery_set_ip_blocks(adev);
2641 if (r)
2642 return r;
2643 break;
2644 }
2645
2646 if (amdgpu_has_atpx() &&
2647 (amdgpu_is_atpx_hybrid() ||
2648 amdgpu_has_atpx_dgpu_power_cntl()) &&
2649 ((adev->flags & AMD_IS_APU) == 0) &&
2650 !dev_is_removable(&adev->pdev->dev))
2651 adev->flags |= AMD_IS_PX;
2652
2653 if (!(adev->flags & AMD_IS_APU)) {
2654 parent = pcie_find_root_port(adev->pdev);
2655 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2656 }
2657
2658
2659 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2660 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2661 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2662 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2663 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2664 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2665 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2666
2667 total = true;
2668 for (i = 0; i < adev->num_ip_blocks; i++) {
2669 ip_block = &adev->ip_blocks[i];
2670
2671 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2672 DRM_WARN("disabled ip block: %d <%s>\n",
2673 i, adev->ip_blocks[i].version->funcs->name);
2674 adev->ip_blocks[i].status.valid = false;
2675 } else if (ip_block->version->funcs->early_init) {
2676 r = ip_block->version->funcs->early_init(ip_block);
2677 if (r == -ENOENT) {
2678 adev->ip_blocks[i].status.valid = false;
2679 } else if (r) {
2680 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2681 adev->ip_blocks[i].version->funcs->name, r);
2682 total = false;
2683 } else {
2684 adev->ip_blocks[i].status.valid = true;
2685 }
2686 } else {
2687 adev->ip_blocks[i].status.valid = true;
2688 }
2689 /* get the vbios after the asic_funcs are set up */
2690 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2691 r = amdgpu_device_parse_gpu_info_fw(adev);
2692 if (r)
2693 return r;
2694
2695 /* Read BIOS */
2696 if (amdgpu_device_read_bios(adev)) {
2697 if (!amdgpu_get_bios(adev))
2698 return -EINVAL;
2699
2700 r = amdgpu_atombios_init(adev);
2701 if (r) {
2702 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2703 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2704 return r;
2705 }
2706 }
2707
2708			/* get pf2vf msg info at its earliest time */
2709 if (amdgpu_sriov_vf(adev))
2710 amdgpu_virt_init_data_exchange(adev);
2711
2712 }
2713 }
2714 if (!total)
2715 return -ENODEV;
2716
2717 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2718	if (ip_block->status.valid)
2719 amdgpu_amdkfd_device_probe(adev);
2720
2721 adev->cg_flags &= amdgpu_cg_mask;
2722 adev->pg_flags &= amdgpu_pg_mask;
2723
2724 return 0;
2725}
2726
2727static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2728{
2729 int i, r;
2730
2731 for (i = 0; i < adev->num_ip_blocks; i++) {
2732 if (!adev->ip_blocks[i].status.sw)
2733 continue;
2734 if (adev->ip_blocks[i].status.hw)
2735 continue;
2736 if (!amdgpu_ip_member_of_hwini(
2737 adev, adev->ip_blocks[i].version->type))
2738 continue;
2739 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2740 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2741 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2742 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2743 if (r) {
2744 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2745 adev->ip_blocks[i].version->funcs->name, r);
2746 return r;
2747 }
2748 adev->ip_blocks[i].status.hw = true;
2749 }
2750 }
2751
2752 return 0;
2753}
2754
2755static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2756{
2757 int i, r;
2758
2759 for (i = 0; i < adev->num_ip_blocks; i++) {
2760 if (!adev->ip_blocks[i].status.sw)
2761 continue;
2762 if (adev->ip_blocks[i].status.hw)
2763 continue;
2764 if (!amdgpu_ip_member_of_hwini(
2765 adev, adev->ip_blocks[i].version->type))
2766 continue;
2767 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2768 if (r) {
2769 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2770 adev->ip_blocks[i].version->funcs->name, r);
2771 return r;
2772 }
2773 adev->ip_blocks[i].status.hw = true;
2774 }
2775
2776 return 0;
2777}
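
/*
 * Note (illustrative summary): amdgpu_device_ip_init() below runs these two
 * phases around amdgpu_device_fw_loading(). Phase 1 brings up the COMMON,
 * IH (and, under SR-IOV, PSP) blocks that are not already up, firmware is
 * then loaded through PSP/SMU, and phase 2 initializes the remaining blocks
 * that were skipped in phase 1.
 */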
2778
2779static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2780{
2781 int r = 0;
2782 int i;
2783 uint32_t smu_version;
2784
2785 if (adev->asic_type >= CHIP_VEGA10) {
2786 for (i = 0; i < adev->num_ip_blocks; i++) {
2787 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2788 continue;
2789
2790 if (!amdgpu_ip_member_of_hwini(adev,
2791 AMD_IP_BLOCK_TYPE_PSP))
2792 break;
2793
2794 if (!adev->ip_blocks[i].status.sw)
2795 continue;
2796
2797			/* no need to do the fw loading again if already done */
2798			if (adev->ip_blocks[i].status.hw)
2799 break;
2800
2801 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2802 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
2803 if (r)
2804 return r;
2805 } else {
2806 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2807 if (r) {
2808 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2809 adev->ip_blocks[i].version->funcs->name, r);
2810 return r;
2811 }
2812 adev->ip_blocks[i].status.hw = true;
2813 }
2814 break;
2815 }
2816 }
2817
2818 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2819 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2820
2821 return r;
2822}
2823
2824static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2825{
2826 long timeout;
2827 int r, i;
2828
2829 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2830 struct amdgpu_ring *ring = adev->rings[i];
2831
2832		/* No need to set up the GPU scheduler for rings that don't need it */
2833 if (!ring || ring->no_scheduler)
2834 continue;
2835
2836 switch (ring->funcs->type) {
2837 case AMDGPU_RING_TYPE_GFX:
2838 timeout = adev->gfx_timeout;
2839 break;
2840 case AMDGPU_RING_TYPE_COMPUTE:
2841 timeout = adev->compute_timeout;
2842 break;
2843 case AMDGPU_RING_TYPE_SDMA:
2844 timeout = adev->sdma_timeout;
2845 break;
2846 default:
2847 timeout = adev->video_timeout;
2848 break;
2849 }
2850
2851 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
2852 DRM_SCHED_PRIORITY_COUNT,
2853 ring->num_hw_submission, 0,
2854 timeout, adev->reset_domain->wq,
2855 ring->sched_score, ring->name,
2856 adev->dev);
2857 if (r) {
2858 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2859 ring->name);
2860 return r;
2861 }
2862 r = amdgpu_uvd_entity_init(adev, ring);
2863 if (r) {
2864 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2865 ring->name);
2866 return r;
2867 }
2868 r = amdgpu_vce_entity_init(adev, ring);
2869 if (r) {
2870 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
2871 ring->name);
2872 return r;
2873 }
2874 }
2875
2876 amdgpu_xcp_update_partition_sched_list(adev);
2877
2878 return 0;
2879}
2880
2881
2882/**
2883 * amdgpu_device_ip_init - run init for hardware IPs
2884 *
2885 * @adev: amdgpu_device pointer
2886 *
2887 * Main initialization pass for hardware IPs. The list of all the hardware
2888 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2889 * are run. sw_init initializes the software state associated with each IP
2890 * and hw_init initializes the hardware associated with each IP.
2891 * Returns 0 on success, negative error code on failure.
2892 */
2893static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2894{
2895 bool init_badpage;
2896 int i, r;
2897
2898 r = amdgpu_ras_init(adev);
2899 if (r)
2900 return r;
2901
2902 for (i = 0; i < adev->num_ip_blocks; i++) {
2903 if (!adev->ip_blocks[i].status.valid)
2904 continue;
2905 if (adev->ip_blocks[i].version->funcs->sw_init) {
2906 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
2907 if (r) {
2908 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2909 adev->ip_blocks[i].version->funcs->name, r);
2910 goto init_failed;
2911 }
2912 }
2913 adev->ip_blocks[i].status.sw = true;
2914
2915 if (!amdgpu_ip_member_of_hwini(
2916 adev, adev->ip_blocks[i].version->type))
2917 continue;
2918
2919 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2920 /* need to do common hw init early so everything is set up for gmc */
2921 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2922 if (r) {
2923 DRM_ERROR("hw_init %d failed %d\n", i, r);
2924 goto init_failed;
2925 }
2926 adev->ip_blocks[i].status.hw = true;
2927 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2928 /* need to do gmc hw init early so we can allocate gpu mem */
2929 /* Try to reserve bad pages early */
2930 if (amdgpu_sriov_vf(adev))
2931 amdgpu_virt_exchange_data(adev);
2932
2933 r = amdgpu_device_mem_scratch_init(adev);
2934 if (r) {
2935 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2936 goto init_failed;
2937 }
2938 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2939 if (r) {
2940 DRM_ERROR("hw_init %d failed %d\n", i, r);
2941 goto init_failed;
2942 }
2943 r = amdgpu_device_wb_init(adev);
2944 if (r) {
2945 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2946 goto init_failed;
2947 }
2948 adev->ip_blocks[i].status.hw = true;
2949
2950 /* right after GMC hw init, we create CSA */
2951 if (adev->gfx.mcbp) {
2952 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2953 AMDGPU_GEM_DOMAIN_VRAM |
2954 AMDGPU_GEM_DOMAIN_GTT,
2955 AMDGPU_CSA_SIZE);
2956 if (r) {
2957 DRM_ERROR("allocate CSA failed %d\n", r);
2958 goto init_failed;
2959 }
2960 }
2961
2962 r = amdgpu_seq64_init(adev);
2963 if (r) {
2964 DRM_ERROR("allocate seq64 failed %d\n", r);
2965 goto init_failed;
2966 }
2967 }
2968 }
2969
2970 if (amdgpu_sriov_vf(adev))
2971 amdgpu_virt_init_data_exchange(adev);
2972
2973 r = amdgpu_ib_pool_init(adev);
2974 if (r) {
2975 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2976 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2977 goto init_failed;
2978 }
2979
2980 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2981 if (r)
2982 goto init_failed;
2983
2984 r = amdgpu_device_ip_hw_init_phase1(adev);
2985 if (r)
2986 goto init_failed;
2987
2988 r = amdgpu_device_fw_loading(adev);
2989 if (r)
2990 goto init_failed;
2991
2992 r = amdgpu_device_ip_hw_init_phase2(adev);
2993 if (r)
2994 goto init_failed;
2995
2996 /*
2997	 * Retired pages will be loaded from eeprom and reserved here.
2998	 * This should be called after amdgpu_device_ip_hw_init_phase2, since
2999	 * on some ASICs the RAS EEPROM code relies on the SMU being fully
3000	 * functional for I2C communication, which is only true at this point.
3001	 *
3002	 * amdgpu_ras_recovery_init may fail, but the upper level only cares
3003	 * about failures caused by a bad gpu situation and stops the amdgpu
3004	 * init process accordingly. For other failures, it still releases all
3005	 * the resources and prints an error message rather than returning a
3006	 * negative value to the upper level.
3007	 *
3008	 * Note: theoretically, this should be called before all vram allocations
3009	 * to protect retired pages from being abused.
3010 */
3011 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
3012 r = amdgpu_ras_recovery_init(adev, init_badpage);
3013 if (r)
3014 goto init_failed;
3015
3016	/*
3017	 * In case of XGMI, grab an extra reference on the reset domain for this device
3018 */
3019 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3020 if (amdgpu_xgmi_add_device(adev) == 0) {
3021 if (!amdgpu_sriov_vf(adev)) {
3022 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3023
3024 if (WARN_ON(!hive)) {
3025 r = -ENOENT;
3026 goto init_failed;
3027 }
3028
3029 if (!hive->reset_domain ||
3030 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
3031 r = -ENOENT;
3032 amdgpu_put_xgmi_hive(hive);
3033 goto init_failed;
3034 }
3035
3036 /* Drop the early temporary reset domain we created for device */
3037 amdgpu_reset_put_reset_domain(adev->reset_domain);
3038 adev->reset_domain = hive->reset_domain;
3039 amdgpu_put_xgmi_hive(hive);
3040 }
3041 }
3042 }
3043
3044 r = amdgpu_device_init_schedulers(adev);
3045 if (r)
3046 goto init_failed;
3047
3048 if (adev->mman.buffer_funcs_ring->sched.ready)
3049 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3050
3051	/* Don't init kfd if the whole hive needs to be reset during init */
3052 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
3053 kgd2kfd_init_zone_device(adev);
3054 amdgpu_amdkfd_device_init(adev);
3055 }
3056
3057 amdgpu_fru_get_product_info(adev);
3058
3059init_failed:
3060
3061 return r;
3062}
3063
3064/**
3065 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3066 *
3067 * @adev: amdgpu_device pointer
3068 *
3069 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
3070 * this function before a GPU reset. If the value is retained after a
3071 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3072 */
3073static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
3074{
3075 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
3076}
3077
3078/**
3079 * amdgpu_device_check_vram_lost - check if vram is valid
3080 *
3081 * @adev: amdgpu_device pointer
3082 *
3083 * Checks the reset magic value written to the gart pointer in VRAM.
3084 * The driver calls this after a GPU reset to see if the contents of
3085 * VRAM have been lost or not.
3086 * returns true if vram is lost, false if not.
3087 */
3088static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3089{
3090 if (memcmp(adev->gart.ptr, adev->reset_magic,
3091 AMDGPU_RESET_MAGIC_NUM))
3092 return true;
3093
3094 if (!amdgpu_in_reset(adev))
3095 return false;
3096
3097 /*
3098 * For all ASICs with baco/mode1 reset, the VRAM is
3099 * always assumed to be lost.
3100 */
3101 switch (amdgpu_asic_reset_method(adev)) {
3102 case AMD_RESET_METHOD_BACO:
3103 case AMD_RESET_METHOD_MODE1:
3104 return true;
3105 default:
3106 return false;
3107 }
3108}
3109
3110/**
3111 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3112 *
3113 * @adev: amdgpu_device pointer
3114 * @state: clockgating state (gate or ungate)
3115 *
3116 * The list of all the hardware IPs that make up the asic is walked and the
3117 * set_clockgating_state callbacks are run.
3118 * During the late init pass this enables clockgating for hardware IPs;
3119 * during fini or suspend it disables clockgating for them.
3120 * Returns 0 on success, negative error code on failure.
3121 */
3122
3123int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3124 enum amd_clockgating_state state)
3125{
3126 int i, j, r;
3127
3128 if (amdgpu_emu_mode == 1)
3129 return 0;
3130
3131 for (j = 0; j < adev->num_ip_blocks; j++) {
3132 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3133 if (!adev->ip_blocks[i].status.late_initialized)
3134 continue;
3135 /* skip CG for GFX, SDMA on S0ix */
3136 if (adev->in_s0ix &&
3137 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3138 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3139 continue;
3140 /* skip CG for VCE/UVD, it's handled specially */
3141 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3142 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3143 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3144 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3145 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3146 /* enable clockgating to save power */
3147 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
3148 state);
3149 if (r) {
3150 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3151 adev->ip_blocks[i].version->funcs->name, r);
3152 return r;
3153 }
3154 }
3155 }
3156
3157 return 0;
3158}
3159
3160int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3161 enum amd_powergating_state state)
3162{
3163 int i, j, r;
3164
3165 if (amdgpu_emu_mode == 1)
3166 return 0;
3167
3168 for (j = 0; j < adev->num_ip_blocks; j++) {
3169 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3170 if (!adev->ip_blocks[i].status.late_initialized)
3171 continue;
3172 /* skip PG for GFX, SDMA on S0ix */
3173 if (adev->in_s0ix &&
3174 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3175 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3176 continue;
3177		/* skip PG for VCE/UVD, it's handled specially */
3178 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3179 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3180 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3181 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3182 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3183 /* enable powergating to save power */
3184 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
3185 state);
3186 if (r) {
3187 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3188 adev->ip_blocks[i].version->funcs->name, r);
3189 return r;
3190 }
3191 }
3192 }
3193 return 0;
3194}
3195
3196static int amdgpu_device_enable_mgpu_fan_boost(void)
3197{
3198 struct amdgpu_gpu_instance *gpu_ins;
3199 struct amdgpu_device *adev;
3200 int i, ret = 0;
3201
3202 mutex_lock(&mgpu_info.mutex);
3203
3204 /*
3205 * MGPU fan boost feature should be enabled
3206 * only when there are two or more dGPUs in
3207 * the system
3208 */
3209 if (mgpu_info.num_dgpu < 2)
3210 goto out;
3211
3212 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3213 gpu_ins = &(mgpu_info.gpu_ins[i]);
3214 adev = gpu_ins->adev;
3215 if (!(adev->flags & AMD_IS_APU) &&
3216 !gpu_ins->mgpu_fan_enabled) {
3217 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3218 if (ret)
3219 break;
3220
3221 gpu_ins->mgpu_fan_enabled = 1;
3222 }
3223 }
3224
3225out:
3226 mutex_unlock(&mgpu_info.mutex);
3227
3228 return ret;
3229}
3230
3231/**
3232 * amdgpu_device_ip_late_init - run late init for hardware IPs
3233 *
3234 * @adev: amdgpu_device pointer
3235 *
3236 * Late initialization pass for hardware IPs. The list of all the hardware
3237 * IPs that make up the asic is walked and the late_init callbacks are run.
3238 * late_init covers any special initialization that an IP requires
3239 * after all of them have been initialized or something that needs to happen
3240 * late in the init process.
3241 * Returns 0 on success, negative error code on failure.
3242 */
3243static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3244{
3245 struct amdgpu_gpu_instance *gpu_instance;
3246 int i = 0, r;
3247
3248 for (i = 0; i < adev->num_ip_blocks; i++) {
3249 if (!adev->ip_blocks[i].status.hw)
3250 continue;
3251 if (adev->ip_blocks[i].version->funcs->late_init) {
3252 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3253 if (r) {
3254 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3255 adev->ip_blocks[i].version->funcs->name, r);
3256 return r;
3257 }
3258 }
3259 adev->ip_blocks[i].status.late_initialized = true;
3260 }
3261
3262 r = amdgpu_ras_late_init(adev);
3263 if (r) {
3264		DRM_ERROR("amdgpu_ras_late_init failed %d\n", r);
3265 return r;
3266 }
3267
3268 if (!amdgpu_reset_in_recovery(adev))
3269 amdgpu_ras_set_error_query_ready(adev, true);
3270
3271 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3272 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3273
3274 amdgpu_device_fill_reset_magic(adev);
3275
3276 r = amdgpu_device_enable_mgpu_fan_boost();
3277 if (r)
3278 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3279
3280	/* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */
3281 if (amdgpu_passthrough(adev) &&
3282 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3283 adev->asic_type == CHIP_ALDEBARAN))
3284 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3285
3286 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3287 mutex_lock(&mgpu_info.mutex);
3288
3289 /*
3290		 * Reset the device p-state to low, as it was booted with high.
3291		 *
3292		 * This should be performed only after all devices from the same
3293		 * hive have been initialized.
3294		 *
3295		 * However, the number of devices in the hive is not known in
3296		 * advance; it is counted one by one as the devices initialize.
3297		 *
3298		 * So we wait until all XGMI interlinked devices are initialized.
3299		 * This may introduce some delay, since those devices may come from
3300		 * different hives. But that should be OK.
3301 */
3302 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3303 for (i = 0; i < mgpu_info.num_gpu; i++) {
3304 gpu_instance = &(mgpu_info.gpu_ins[i]);
3305 if (gpu_instance->adev->flags & AMD_IS_APU)
3306 continue;
3307
3308 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3309 AMDGPU_XGMI_PSTATE_MIN);
3310 if (r) {
3311 DRM_ERROR("pstate setting failed (%d).\n", r);
3312 break;
3313 }
3314 }
3315 }
3316
3317 mutex_unlock(&mgpu_info.mutex);
3318 }
3319
3320 return 0;
3321}
3322
3323static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
3324{
3325 int r;
3326
3327 if (!ip_block->version->funcs->hw_fini) {
3328 DRM_ERROR("hw_fini of IP block <%s> not defined\n",
3329 ip_block->version->funcs->name);
3330 } else {
3331 r = ip_block->version->funcs->hw_fini(ip_block);
3332 /* XXX handle errors */
3333 if (r) {
3334 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3335 ip_block->version->funcs->name, r);
3336 }
3337 }
3338
3339 ip_block->status.hw = false;
3340}
3341
3342/**
3343 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3344 *
3345 * @adev: amdgpu_device pointer
3346 *
3347 * For ASICs that need to disable the SMC first
3348 */
3349static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3350{
3351 int i;
3352
3353 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3354 return;
3355
3356 for (i = 0; i < adev->num_ip_blocks; i++) {
3357 if (!adev->ip_blocks[i].status.hw)
3358 continue;
3359 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3360 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3361 break;
3362 }
3363 }
3364}
3365
3366static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3367{
3368 int i, r;
3369
3370 for (i = 0; i < adev->num_ip_blocks; i++) {
3371 if (!adev->ip_blocks[i].version->funcs->early_fini)
3372 continue;
3373
3374 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
3375 if (r) {
3376 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3377 adev->ip_blocks[i].version->funcs->name, r);
3378 }
3379 }
3380
3381 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3382 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3383
3384 amdgpu_amdkfd_suspend(adev, false);
3385
3386	/* Workaround for ASICs that need to disable the SMC first */
3387 amdgpu_device_smu_fini_early(adev);
3388
3389 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3390 if (!adev->ip_blocks[i].status.hw)
3391 continue;
3392
3393 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3394 }
3395
3396 if (amdgpu_sriov_vf(adev)) {
3397 if (amdgpu_virt_release_full_gpu(adev, false))
3398 DRM_ERROR("failed to release exclusive mode on fini\n");
3399 }
3400
3401 return 0;
3402}
3403
3404/**
3405 * amdgpu_device_ip_fini - run fini for hardware IPs
3406 *
3407 * @adev: amdgpu_device pointer
3408 *
3409 * Main teardown pass for hardware IPs. The list of all the hardware
3410 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3411 * are run. hw_fini tears down the hardware associated with each IP
3412 * and sw_fini tears down any software state associated with each IP.
3413 * Returns 0 on success, negative error code on failure.
3414 */
3415static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3416{
3417 int i, r;
3418
3419 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3420 amdgpu_virt_release_ras_err_handler_data(adev);
3421
3422 if (adev->gmc.xgmi.num_physical_nodes > 1)
3423 amdgpu_xgmi_remove_device(adev);
3424
3425 amdgpu_amdkfd_device_fini_sw(adev);
3426
3427 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3428 if (!adev->ip_blocks[i].status.sw)
3429 continue;
3430
3431 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3432 amdgpu_ucode_free_bo(adev);
3433 amdgpu_free_static_csa(&adev->virt.csa_obj);
3434 amdgpu_device_wb_fini(adev);
3435 amdgpu_device_mem_scratch_fini(adev);
3436 amdgpu_ib_pool_fini(adev);
3437 amdgpu_seq64_fini(adev);
3438 }
3439 if (adev->ip_blocks[i].version->funcs->sw_fini) {
3440 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
3441 /* XXX handle errors */
3442 if (r) {
3443 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3444 adev->ip_blocks[i].version->funcs->name, r);
3445 }
3446 }
3447 adev->ip_blocks[i].status.sw = false;
3448 adev->ip_blocks[i].status.valid = false;
3449 }
3450
3451 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3452 if (!adev->ip_blocks[i].status.late_initialized)
3453 continue;
3454 if (adev->ip_blocks[i].version->funcs->late_fini)
3455 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
3456 adev->ip_blocks[i].status.late_initialized = false;
3457 }
3458
3459 amdgpu_ras_fini(adev);
3460
3461 return 0;
3462}
3463
3464/**
3465 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3466 *
3467 * @work: work_struct.
3468 */
3469static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3470{
3471 struct amdgpu_device *adev =
3472 container_of(work, struct amdgpu_device, delayed_init_work.work);
3473 int r;
3474
3475 r = amdgpu_ib_ring_tests(adev);
3476 if (r)
3477 DRM_ERROR("ib ring test failed (%d).\n", r);
3478}
3479
3480static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3481{
3482 struct amdgpu_device *adev =
3483 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3484
3485 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3486 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3487
3488 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0))
3489 adev->gfx.gfx_off_state = true;
3490}
3491
3492/**
3493 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3494 *
3495 * @adev: amdgpu_device pointer
3496 *
3497 * Main suspend function for hardware IPs. The list of all the hardware
3498 * IPs that make up the asic is walked, clockgating is disabled and the
3499 * suspend callbacks are run. suspend puts the hardware and software state
3500 * in each IP into a state suitable for suspend.
3501 * Returns 0 on success, negative error code on failure.
3502 */
3503static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3504{
3505 int i, r;
3506
3507 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3508 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3509
3510 /*
3511	 * Per the PMFW team's suggestion, the driver needs to handle disabling
3512	 * the gfxoff and df cstate features for the gpu reset (e.g. Mode1Reset)
3513	 * scenario. Add the missing df cstate disablement here.
3514 */
3515 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3516 dev_warn(adev->dev, "Failed to disallow df cstate");
3517
3518 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3519 if (!adev->ip_blocks[i].status.valid)
3520 continue;
3521
3522 /* displays are handled separately */
3523 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3524 continue;
3525
3526 /* XXX handle errors */
3527 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3528 if (r)
3529 return r;
3530 }
3531
3532 return 0;
3533}
3534
3535/**
3536 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3537 *
3538 * @adev: amdgpu_device pointer
3539 *
3540 * Main suspend function for hardware IPs. The list of all the hardware
3541 * IPs that make up the asic is walked, clockgating is disabled and the
3542 * suspend callbacks are run. suspend puts the hardware and software state
3543 * in each IP into a state suitable for suspend.
3544 * Returns 0 on success, negative error code on failure.
3545 */
3546static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3547{
3548 int i, r;
3549
3550 if (adev->in_s0ix)
3551 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3552
3553 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3554 if (!adev->ip_blocks[i].status.valid)
3555 continue;
3556 /* displays are handled in phase1 */
3557 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3558 continue;
3559 /* PSP lost connection when err_event_athub occurs */
3560 if (amdgpu_ras_intr_triggered() &&
3561 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3562 adev->ip_blocks[i].status.hw = false;
3563 continue;
3564 }
3565
3566		/* skip unnecessary suspend if we have not initialized them yet */
3567 if (!amdgpu_ip_member_of_hwini(
3568 adev, adev->ip_blocks[i].version->type))
3569 continue;
3570
3571 /* skip suspend of gfx/mes and psp for S0ix
3572 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3573		 * like at runtime. PSP is also part of the always-on hardware
3574 * so no need to suspend it.
3575 */
3576 if (adev->in_s0ix &&
3577 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3578 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3579 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3580 continue;
3581
3582 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3583 if (adev->in_s0ix &&
3584 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3585 IP_VERSION(5, 0, 0)) &&
3586 (adev->ip_blocks[i].version->type ==
3587 AMD_IP_BLOCK_TYPE_SDMA))
3588 continue;
3589
3590		/* swPSP provides the IMU and RLC FW binaries to the TOS during cold boot.
3591		 * These live in the TMR and are expected to be reused by PSP-TOS to reload
3592		 * from that location, and RLC Autoload is also loaded automatically from
3593		 * there based on the PMFW -> PSP message during the re-init sequence.
3594		 * Therefore, the psp suspend & resume should be skipped to avoid destroying
3595		 * the TMR and reloading FWs again for IMU enabled APU ASICs.
3596 */
3597 if (amdgpu_in_reset(adev) &&
3598 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3599 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3600 continue;
3601
3602 /* XXX handle errors */
3603 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3604 adev->ip_blocks[i].status.hw = false;
3605
3606 /* handle putting the SMC in the appropriate state */
3607 if (!amdgpu_sriov_vf(adev)) {
3608 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3609 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3610 if (r) {
3611 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3612 adev->mp1_state, r);
3613 return r;
3614 }
3615 }
3616 }
3617 }
3618
3619 return 0;
3620}
3621
3622/**
3623 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3624 *
3625 * @adev: amdgpu_device pointer
3626 *
3627 * Main suspend function for hardware IPs. The list of all the hardware
3628 * IPs that make up the asic is walked, clockgating is disabled and the
3629 * suspend callbacks are run. suspend puts the hardware and software state
3630 * in each IP into a state suitable for suspend.
3631 * Returns 0 on success, negative error code on failure.
3632 */
3633int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3634{
3635 int r;
3636
3637 if (amdgpu_sriov_vf(adev)) {
3638 amdgpu_virt_fini_data_exchange(adev);
3639 amdgpu_virt_request_full_gpu(adev, false);
3640 }
3641
3642 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3643
3644 r = amdgpu_device_ip_suspend_phase1(adev);
3645 if (r)
3646 return r;
3647 r = amdgpu_device_ip_suspend_phase2(adev);
3648
3649 if (amdgpu_sriov_vf(adev))
3650 amdgpu_virt_release_full_gpu(adev, false);
3651
3652 return r;
3653}
3654
3655static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3656{
3657 int i, r;
3658
3659 static enum amd_ip_block_type ip_order[] = {
3660 AMD_IP_BLOCK_TYPE_COMMON,
3661 AMD_IP_BLOCK_TYPE_GMC,
3662 AMD_IP_BLOCK_TYPE_PSP,
3663 AMD_IP_BLOCK_TYPE_IH,
3664 };
3665
3666 for (i = 0; i < adev->num_ip_blocks; i++) {
3667 int j;
3668 struct amdgpu_ip_block *block;
3669
3670 block = &adev->ip_blocks[i];
3671 block->status.hw = false;
3672
3673 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3674
3675 if (block->version->type != ip_order[j] ||
3676 !block->status.valid)
3677 continue;
3678
3679 r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
3680 if (r) {
3681 dev_err(adev->dev, "RE-INIT-early: %s failed\n",
3682 block->version->funcs->name);
3683 return r;
3684 }
3685 block->status.hw = true;
3686 }
3687 }
3688
3689 return 0;
3690}
3691
3692static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3693{
3694 struct amdgpu_ip_block *block;
3695 int i, r = 0;
3696
3697 static enum amd_ip_block_type ip_order[] = {
3698 AMD_IP_BLOCK_TYPE_SMC,
3699 AMD_IP_BLOCK_TYPE_DCE,
3700 AMD_IP_BLOCK_TYPE_GFX,
3701 AMD_IP_BLOCK_TYPE_SDMA,
3702 AMD_IP_BLOCK_TYPE_MES,
3703 AMD_IP_BLOCK_TYPE_UVD,
3704 AMD_IP_BLOCK_TYPE_VCE,
3705 AMD_IP_BLOCK_TYPE_VCN,
3706 AMD_IP_BLOCK_TYPE_JPEG
3707 };
3708
3709 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3710 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);
3711
3712 if (!block)
3713 continue;
3714
3715 if (block->status.valid && !block->status.hw) {
3716 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
3717 r = amdgpu_ip_block_resume(block);
3718 } else {
3719 r = block->version->funcs->hw_init(block);
3720 }
3721
3722 if (r) {
3723 dev_err(adev->dev, "RE-INIT-late: %s failed\n",
3724 block->version->funcs->name);
3725 break;
3726 }
3727 block->status.hw = true;
3728 }
3729 }
3730
3731 return r;
3732}
3733
3734/**
3735 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3736 *
3737 * @adev: amdgpu_device pointer
3738 *
3739 * First resume function for hardware IPs. The list of all the hardware
3740 * IPs that make up the asic is walked and the resume callbacks are run for
3741 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3742 * after a suspend and updates the software state as necessary. This
3743 * function is also used for restoring the GPU after a GPU reset.
3744 * Returns 0 on success, negative error code on failure.
3745 */
3746static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3747{
3748 int i, r;
3749
3750 for (i = 0; i < adev->num_ip_blocks; i++) {
3751 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3752 continue;
3753 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3754 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3755 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3756 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3757
3758 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3759 if (r)
3760 return r;
3761 }
3762 }
3763
3764 return 0;
3765}
3766
3767/**
3768 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3769 *
3770 * @adev: amdgpu_device pointer
3771 *
3772 * Second resume function for hardware IPs. The list of all the hardware
3773 * IPs that make up the asic is walked and the resume callbacks are run for
3774 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3775 * functional state after a suspend and updates the software state as
3776 * necessary. This function is also used for restoring the GPU after a GPU
3777 * reset.
3778 * Returns 0 on success, negative error code on failure.
3779 */
3780static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3781{
3782 int i, r;
3783
3784 for (i = 0; i < adev->num_ip_blocks; i++) {
3785 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3786 continue;
3787 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3788 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3789 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3790 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3791 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3792 continue;
3793 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3794 if (r)
3795 return r;
3796 }
3797
3798 return 0;
3799}
3800
3801/**
3802 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3803 *
3804 * @adev: amdgpu_device pointer
3805 *
3806 * Third resume function for hardware IPs. The list of all the hardware
3807 * IPs that make up the asic is walked and the resume callbacks are run for
3808 * the DCE block only. resume puts the hardware into a functional state after a suspend
3809 * and updates the software state as necessary. This function is also used
3810 * for restoring the GPU after a GPU reset.
3811 *
3812 * Returns 0 on success, negative error code on failure.
3813 */
3814static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3815{
3816 int i, r;
3817
3818 for (i = 0; i < adev->num_ip_blocks; i++) {
3819 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3820 continue;
3821 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3822 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3823 if (r)
3824 return r;
3825 }
3826 }
3827
3828 return 0;
3829}
3830
3831/**
3832 * amdgpu_device_ip_resume - run resume for hardware IPs
3833 *
3834 * @adev: amdgpu_device pointer
3835 *
3836 * Main resume function for hardware IPs. The hardware IPs
3837 * are split into multiple resume functions because they are
3838 * also used in recovering from a GPU reset and some additional
3839 * steps need to be taken between them. In this case (S3/S4) they are
3840 * run sequentially.
3841 * Returns 0 on success, negative error code on failure.
3842 */
3843static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3844{
3845 int r;
3846
3847 r = amdgpu_device_ip_resume_phase1(adev);
3848 if (r)
3849 return r;
3850
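	/* Firmware is (re)loaded between phase1 (COMMON/GMC/IH) and phase2,
	 * which resumes the remaining blocks.
	 */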
3851 r = amdgpu_device_fw_loading(adev);
3852 if (r)
3853 return r;
3854
3855 r = amdgpu_device_ip_resume_phase2(adev);
3856
3857 if (adev->mman.buffer_funcs_ring->sched.ready)
3858 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3859
3860 if (r)
3861 return r;
3862
3863 amdgpu_fence_driver_hw_init(adev);
3864
3865 r = amdgpu_device_ip_resume_phase3(adev);
3866
3867 return r;
3868}
3869
3870/**
3871 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3872 *
3873 * @adev: amdgpu_device pointer
3874 *
3875 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3876 */
3877static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3878{
3879 if (amdgpu_sriov_vf(adev)) {
3880 if (adev->is_atom_fw) {
3881 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3882 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3883 } else {
3884 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3885 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3886 }
3887
3888 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3889 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3890 }
3891}
3892
3893/**
3894 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3895 *
3896 * @asic_type: AMD asic type
3897 *
3898 * Check if there is DC (new modesetting infrastructure) support for an asic.
3899 * returns true if DC has support, false if not.
3900 */
3901bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3902{
3903 switch (asic_type) {
3904#ifdef CONFIG_DRM_AMDGPU_SI
3905 case CHIP_HAINAN:
3906#endif
3907 case CHIP_TOPAZ:
3908 /* chips with no display hardware */
3909 return false;
3910#if defined(CONFIG_DRM_AMD_DC)
3911 case CHIP_TAHITI:
3912 case CHIP_PITCAIRN:
3913 case CHIP_VERDE:
3914 case CHIP_OLAND:
3915 /*
3916 * We have systems in the wild with these ASICs that require
3917 * LVDS and VGA support which is not supported with DC.
3918 *
3919 * Fall back to the non-DC driver here by default so as not to
3920 * cause regressions.
3921 */
3922#if defined(CONFIG_DRM_AMD_DC_SI)
3923 return amdgpu_dc > 0;
3924#else
3925 return false;
3926#endif
3927 case CHIP_BONAIRE:
3928 case CHIP_KAVERI:
3929 case CHIP_KABINI:
3930 case CHIP_MULLINS:
3931 /*
3932 * We have systems in the wild with these ASICs that require
3933 * VGA support which is not supported with DC.
3934 *
3935 * Fall back to the non-DC driver here by default so as not to
3936 * cause regressions.
3937 */
3938 return amdgpu_dc > 0;
3939 default:
3940 return amdgpu_dc != 0;
3941#else
3942 default:
3943 if (amdgpu_dc > 0)
3944 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3945 return false;
3946#endif
3947 }
3948}
3949
3950/**
3951 * amdgpu_device_has_dc_support - check if dc is supported
3952 *
3953 * @adev: amdgpu_device pointer
3954 *
3955 * Returns true for supported, false for not supported
3956 */
3957bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3958{
3959 if (adev->enable_virtual_display ||
3960 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3961 return false;
3962
3963 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3964}
3965
3966static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3967{
3968 struct amdgpu_device *adev =
3969 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3970 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3971
3972 /* It's a bug to not have a hive within this function */
3973 if (WARN_ON(!hive))
3974 return;
3975
3976 /*
3977 * Use task barrier to synchronize all xgmi reset works across the
3978 * hive. task_barrier_enter and task_barrier_exit will block
3979 * until all the threads running the xgmi reset works reach
3980 * those points. task_barrier_full will do both blocks.
3981 */
3982 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3983
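		/* Stagger BACO across the hive: every node must enter BACO
		 * before any node exits it.
		 */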
3984 task_barrier_enter(&hive->tb);
3985 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3986
3987 if (adev->asic_reset_res)
3988 goto fail;
3989
3990 task_barrier_exit(&hive->tb);
3991 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3992
3993 if (adev->asic_reset_res)
3994 goto fail;
3995
3996 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
3997 } else {
3998
3999 task_barrier_full(&hive->tb);
4000 adev->asic_reset_res = amdgpu_asic_reset(adev);
4001 }
4002
4003fail:
4004 if (adev->asic_reset_res)
4005 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4006 adev->asic_reset_res, adev_to_drm(adev)->unique);
4007 amdgpu_put_xgmi_hive(hive);
4008}
4009
4010static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
4011{
4012 char *input = amdgpu_lockup_timeout;
4013 char *timeout_setting = NULL;
4014 int index = 0;
4015 long timeout;
4016 int ret = 0;
4017
4018 /*
4019 	 * By default the timeout for non-compute jobs is 10000 ms
4020 	 * and 60000 ms for compute jobs.
4021 	 * Under SR-IOV the compute timeout defaults to 60000 ms only in
4022 	 * one-VF mode and to 10000 ms otherwise.
4023 */
4024 adev->gfx_timeout = msecs_to_jiffies(10000);
4025 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4026 if (amdgpu_sriov_vf(adev))
4027 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
4028 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
4029 else
4030 adev->compute_timeout = msecs_to_jiffies(60000);
4031
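	/*
	 * Parse the comma-separated lockup_timeout module parameter; the
	 * positional order is gfx,compute,sdma,video
	 * (e.g. amdgpu.lockup_timeout=10000,60000,10000,10000).
	 * A value of 0 keeps the default and a negative value disables the timeout.
	 */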
4032 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4033 while ((timeout_setting = strsep(&input, ",")) &&
4034 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4035 ret = kstrtol(timeout_setting, 0, &timeout);
4036 if (ret)
4037 return ret;
4038
4039 if (timeout == 0) {
4040 index++;
4041 continue;
4042 } else if (timeout < 0) {
4043 timeout = MAX_SCHEDULE_TIMEOUT;
4044 dev_warn(adev->dev, "lockup timeout disabled");
4045 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4046 } else {
4047 timeout = msecs_to_jiffies(timeout);
4048 }
4049
4050 switch (index++) {
4051 case 0:
4052 adev->gfx_timeout = timeout;
4053 break;
4054 case 1:
4055 adev->compute_timeout = timeout;
4056 break;
4057 case 2:
4058 adev->sdma_timeout = timeout;
4059 break;
4060 case 3:
4061 adev->video_timeout = timeout;
4062 break;
4063 default:
4064 break;
4065 }
4066 }
4067 /*
4068 * There is only one value specified and
4069 * it should apply to all non-compute jobs.
4070 */
4071 if (index == 1) {
4072 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4073 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4074 adev->compute_timeout = adev->gfx_timeout;
4075 }
4076 }
4077
4078 return ret;
4079}
4080
4081/**
4082 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4083 *
4084 * @adev: amdgpu_device pointer
4085 *
4086 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
4087 */
4088static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4089{
4090 struct iommu_domain *domain;
4091
4092 domain = iommu_get_domain_for_dev(adev->dev);
4093 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4094 adev->ram_is_direct_mapped = true;
4095}
4096
4097#if defined(CONFIG_HSA_AMD_P2P)
4098/**
4099 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4100 *
4101 * @adev: amdgpu_device pointer
4102 *
4103 * Returns true if the IOMMU is remapping the BAR address (DMA remapping enabled).
4104 */
4105static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4106{
4107 struct iommu_domain *domain;
4108
4109 domain = iommu_get_domain_for_dev(adev->dev);
4110 if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4111 domain->type == IOMMU_DOMAIN_DMA_FQ))
4112 return true;
4113
4114 return false;
4115}
4116#endif
4117
4118static const struct attribute *amdgpu_dev_attributes[] = {
4119 &dev_attr_pcie_replay_count.attr,
4120 NULL
4121};
4122
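/*
 * Mid Command Buffer Preemption (MCBP): honor the amdgpu_mcbp module
 * parameter when it is set explicitly, and always force MCBP on under SR-IOV.
 */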
4123static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4124{
4125 if (amdgpu_mcbp == 1)
4126 adev->gfx.mcbp = true;
4127 else if (amdgpu_mcbp == 0)
4128 adev->gfx.mcbp = false;
4129
4130 if (amdgpu_sriov_vf(adev))
4131 adev->gfx.mcbp = true;
4132
4133 if (adev->gfx.mcbp)
4134 DRM_INFO("MCBP is enabled\n");
4135}
4136
4137/**
4138 * amdgpu_device_init - initialize the driver
4139 *
4140 * @adev: amdgpu_device pointer
4141 * @flags: driver flags
4142 *
4143 * Initializes the driver info and hw (all asics).
4144 * Returns 0 for success or an error on failure.
4145 * Called at driver startup.
4146 */
4147int amdgpu_device_init(struct amdgpu_device *adev,
4148 uint32_t flags)
4149{
4150 struct drm_device *ddev = adev_to_drm(adev);
4151 struct pci_dev *pdev = adev->pdev;
4152 int r, i;
4153 bool px = false;
4154 u32 max_MBps;
4155 int tmp;
4156
4157 adev->shutdown = false;
4158 adev->flags = flags;
4159
4160 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4161 adev->asic_type = amdgpu_force_asic_type;
4162 else
4163 adev->asic_type = flags & AMD_ASIC_MASK;
4164
4165 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4166 if (amdgpu_emu_mode == 1)
4167 adev->usec_timeout *= 10;
4168 adev->gmc.gart_size = 512 * 1024 * 1024;
4169 adev->accel_working = false;
4170 adev->num_rings = 0;
4171 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4172 adev->mman.buffer_funcs = NULL;
4173 adev->mman.buffer_funcs_ring = NULL;
4174 adev->vm_manager.vm_pte_funcs = NULL;
4175 adev->vm_manager.vm_pte_num_scheds = 0;
4176 adev->gmc.gmc_funcs = NULL;
4177 adev->harvest_ip_mask = 0x0;
4178 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4179 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4180
4181 adev->smc_rreg = &amdgpu_invalid_rreg;
4182 adev->smc_wreg = &amdgpu_invalid_wreg;
4183 adev->pcie_rreg = &amdgpu_invalid_rreg;
4184 adev->pcie_wreg = &amdgpu_invalid_wreg;
4185 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4186 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4187 adev->pciep_rreg = &amdgpu_invalid_rreg;
4188 adev->pciep_wreg = &amdgpu_invalid_wreg;
4189 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4190 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4191 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4192 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4193 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4194 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4195 adev->didt_rreg = &amdgpu_invalid_rreg;
4196 adev->didt_wreg = &amdgpu_invalid_wreg;
4197 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4198 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4199 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4200 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4201
4202 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4203 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4204 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4205
4206 	/* mutex initialization is all done here so we
4207 	 * can call these functions again without locking issues
4208 */
4209 mutex_init(&adev->firmware.mutex);
4210 mutex_init(&adev->pm.mutex);
4211 mutex_init(&adev->gfx.gpu_clock_mutex);
4212 mutex_init(&adev->srbm_mutex);
4213 mutex_init(&adev->gfx.pipe_reserve_mutex);
4214 mutex_init(&adev->gfx.gfx_off_mutex);
4215 mutex_init(&adev->gfx.partition_mutex);
4216 mutex_init(&adev->grbm_idx_mutex);
4217 mutex_init(&adev->mn_lock);
4218 mutex_init(&adev->virt.vf_errors.lock);
4219 mutex_init(&adev->virt.rlcg_reg_lock);
4220 hash_init(adev->mn_hash);
4221 mutex_init(&adev->psp.mutex);
4222 mutex_init(&adev->notifier_lock);
4223 mutex_init(&adev->pm.stable_pstate_ctx_lock);
4224 mutex_init(&adev->benchmark_mutex);
4225 mutex_init(&adev->gfx.reset_sem_mutex);
4226 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4227 mutex_init(&adev->enforce_isolation_mutex);
4228 mutex_init(&adev->gfx.kfd_sch_mutex);
4229
4230 amdgpu_device_init_apu_flags(adev);
4231
4232 r = amdgpu_device_check_arguments(adev);
4233 if (r)
4234 return r;
4235
4236 spin_lock_init(&adev->mmio_idx_lock);
4237 spin_lock_init(&adev->smc_idx_lock);
4238 spin_lock_init(&adev->pcie_idx_lock);
4239 spin_lock_init(&adev->uvd_ctx_idx_lock);
4240 spin_lock_init(&adev->didt_idx_lock);
4241 spin_lock_init(&adev->gc_cac_idx_lock);
4242 spin_lock_init(&adev->se_cac_idx_lock);
4243 spin_lock_init(&adev->audio_endpt_idx_lock);
4244 spin_lock_init(&adev->mm_stats.lock);
4245 spin_lock_init(&adev->wb.lock);
4246
4247 INIT_LIST_HEAD(&adev->reset_list);
4248
4249 INIT_LIST_HEAD(&adev->ras_list);
4250
4251 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4252
4253 INIT_DELAYED_WORK(&adev->delayed_init_work,
4254 amdgpu_device_delayed_init_work_handler);
4255 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4256 amdgpu_device_delay_enable_gfx_off);
4257 /*
4258 * Initialize the enforce_isolation work structures for each XCP
4259 * partition. This work handler is responsible for enforcing shader
4260 * isolation on AMD GPUs. It counts the number of emitted fences for
4261 * each GFX and compute ring. If there are any fences, it schedules
4262 * the `enforce_isolation_work` to be run after a delay. If there are
4263 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4264 * runqueue.
4265 */
4266 for (i = 0; i < MAX_XCP; i++) {
4267 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4268 amdgpu_gfx_enforce_isolation_handler);
4269 adev->gfx.enforce_isolation[i].adev = adev;
4270 adev->gfx.enforce_isolation[i].xcp_id = i;
4271 }
4272
4273 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4274
4275 adev->gfx.gfx_off_req_count = 1;
4276 adev->gfx.gfx_off_residency = 0;
4277 adev->gfx.gfx_off_entrycount = 0;
4278 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4279
4280 atomic_set(&adev->throttling_logging_enabled, 1);
4281 /*
4282 * If throttling continues, logging will be performed every minute
4283 * to avoid log flooding. "-1" is subtracted since the thermal
4284 * throttling interrupt comes every second. Thus, the total logging
4285 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4286 * for throttling interrupt) = 60 seconds.
4287 */
4288 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4289 ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1);
4290
4291 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4292 ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE);
4293
4294 /* Registers mapping */
4295 /* TODO: block userspace mapping of io register */
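	/* BAR 5 holds the MMIO register aperture on CHIP_BONAIRE and newer;
	 * older ASICs expose it in BAR 2.
	 */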
4296 if (adev->asic_type >= CHIP_BONAIRE) {
4297 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4298 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4299 } else {
4300 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4301 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4302 }
4303
4304 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4305 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4306
4307 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4308 if (!adev->rmmio)
4309 return -ENOMEM;
4310
4311 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4312 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4313
4314 /*
4315 * Reset domain needs to be present early, before XGMI hive discovered
4316 * (if any) and initialized to use reset sem and in_gpu reset flag
4317 * early on during init and before calling to RREG32.
4318 */
4319 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4320 if (!adev->reset_domain)
4321 return -ENOMEM;
4322
4323 /* detect hw virtualization here */
4324 amdgpu_detect_virtualization(adev);
4325
4326 amdgpu_device_get_pcie_info(adev);
4327
4328 r = amdgpu_device_get_job_timeout_settings(adev);
4329 if (r) {
4330 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4331 return r;
4332 }
4333
4334 amdgpu_device_set_mcbp(adev);
4335
4336 /*
4337 	 * By default, use the default init level, where all blocks are expected
4338 	 * to be initialized. At present, the sw init ('swinit') of the blocks must
4339 	 * complete before the need for a different level can be detected.
4340 */
4341 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4342 /* early init functions */
4343 r = amdgpu_device_ip_early_init(adev);
4344 if (r)
4345 return r;
4346
4347 /* Get rid of things like offb */
4348 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4349 if (r)
4350 return r;
4351
4352 /* Enable TMZ based on IP_VERSION */
4353 amdgpu_gmc_tmz_set(adev);
4354
4355 if (amdgpu_sriov_vf(adev) &&
4356 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4357 /* VF MMIO access (except mailbox range) from CPU
4358 * will be blocked during sriov runtime
4359 */
4360 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4361
4362 amdgpu_gmc_noretry_set(adev);
4363 	/* Need to get xgmi info early to decide the reset behavior */
4364 if (adev->gmc.xgmi.supported) {
4365 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4366 if (r)
4367 return r;
4368 }
4369
4370 /* enable PCIE atomic ops */
4371 if (amdgpu_sriov_vf(adev)) {
4372 if (adev->virt.fw_reserve.p_pf2vf)
4373 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4374 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4375 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4376 		/* APUs with gfx9 and newer don't rely on PCIe atomics; their
4377 		 * internal path natively supports atomics, so set have_atomics_support to true.
4378 */
4379 } else if ((adev->flags & AMD_IS_APU) &&
4380 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4381 IP_VERSION(9, 0, 0))) {
4382 adev->have_atomics_support = true;
4383 } else {
4384 adev->have_atomics_support =
4385 !pci_enable_atomic_ops_to_root(adev->pdev,
4386 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4387 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4388 }
4389
4390 if (!adev->have_atomics_support)
4391 		dev_info(adev->dev, "PCIe atomic ops are not supported\n");
4392
4393 	/* doorbell bar mapping and doorbell index init */
4394 amdgpu_doorbell_init(adev);
4395
4396 if (amdgpu_emu_mode == 1) {
4397 /* post the asic on emulation mode */
4398 emu_soc_asic_init(adev);
4399 goto fence_driver_init;
4400 }
4401
4402 amdgpu_reset_init(adev);
4403
4404 /* detect if we are with an SRIOV vbios */
4405 if (adev->bios)
4406 amdgpu_device_detect_sriov_bios(adev);
4407
4408 /* check if we need to reset the asic
4409 * E.g., driver was not cleanly unloaded previously, etc.
4410 */
4411 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4412 if (adev->gmc.xgmi.num_physical_nodes) {
4413 dev_info(adev->dev, "Pending hive reset.\n");
4414 amdgpu_set_init_level(adev,
4415 AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
4416 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4417 !amdgpu_device_has_display_hardware(adev)) {
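			/* Headless MP1 13.0.10 parts are reset through the PSP. */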
4418 r = psp_gpu_reset(adev);
4419 } else {
4420 tmp = amdgpu_reset_method;
4421 /* It should do a default reset when loading or reloading the driver,
4422 * regardless of the module parameter reset_method.
4423 */
4424 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4425 r = amdgpu_asic_reset(adev);
4426 amdgpu_reset_method = tmp;
4427 }
4428
4429 if (r) {
4430 dev_err(adev->dev, "asic reset on init failed\n");
4431 goto failed;
4432 }
4433 }
4434
4435 /* Post card if necessary */
4436 if (amdgpu_device_need_post(adev)) {
4437 if (!adev->bios) {
4438 dev_err(adev->dev, "no vBIOS found\n");
4439 r = -EINVAL;
4440 goto failed;
4441 }
4442 DRM_INFO("GPU posting now...\n");
4443 r = amdgpu_device_asic_init(adev);
4444 if (r) {
4445 dev_err(adev->dev, "gpu post error!\n");
4446 goto failed;
4447 }
4448 }
4449
4450 if (adev->bios) {
4451 if (adev->is_atom_fw) {
4452 /* Initialize clocks */
4453 r = amdgpu_atomfirmware_get_clock_info(adev);
4454 if (r) {
4455 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4456 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4457 goto failed;
4458 }
4459 } else {
4460 /* Initialize clocks */
4461 r = amdgpu_atombios_get_clock_info(adev);
4462 if (r) {
4463 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4464 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4465 goto failed;
4466 }
4467 /* init i2c buses */
4468 if (!amdgpu_device_has_dc_support(adev))
4469 amdgpu_atombios_i2c_init(adev);
4470 }
4471 }
4472
4473fence_driver_init:
4474 /* Fence driver */
4475 r = amdgpu_fence_driver_sw_init(adev);
4476 if (r) {
4477 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4478 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4479 goto failed;
4480 }
4481
4482 /* init the mode config */
4483 drm_mode_config_init(adev_to_drm(adev));
4484
4485 r = amdgpu_device_ip_init(adev);
4486 if (r) {
4487 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4488 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4489 goto release_ras_con;
4490 }
4491
4492 amdgpu_fence_driver_hw_init(adev);
4493
4494 dev_info(adev->dev,
4495 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4496 adev->gfx.config.max_shader_engines,
4497 adev->gfx.config.max_sh_per_se,
4498 adev->gfx.config.max_cu_per_sh,
4499 adev->gfx.cu_info.number);
4500
4501 adev->accel_working = true;
4502
4503 amdgpu_vm_check_compute_bug(adev);
4504
4505 /* Initialize the buffer migration limit. */
4506 if (amdgpu_moverate >= 0)
4507 max_MBps = amdgpu_moverate;
4508 else
4509 max_MBps = 8; /* Allow 8 MB/s. */
4510 /* Get a log2 for easy divisions. */
4511 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4512
4513 /*
4514 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4515 	 * Otherwise the mgpu fan boost feature will be skipped because this
4516 	 * gpu instance has not been counted yet.
4517 */
4518 amdgpu_register_gpu_instance(adev);
4519
4520 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4521 * explicit gating rather than handling it automatically.
4522 */
4523 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4524 r = amdgpu_device_ip_late_init(adev);
4525 if (r) {
4526 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4527 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4528 goto release_ras_con;
4529 }
4530 /* must succeed. */
4531 amdgpu_ras_resume(adev);
4532 queue_delayed_work(system_wq, &adev->delayed_init_work,
4533 msecs_to_jiffies(AMDGPU_RESUME_MS));
4534 }
4535
4536 if (amdgpu_sriov_vf(adev)) {
4537 amdgpu_virt_release_full_gpu(adev, true);
4538 flush_delayed_work(&adev->delayed_init_work);
4539 }
4540
4541 /*
4542 	 * Register these sysfs interfaces after `late_init`, as some of the
4543 	 * operations performed in `late_init` might affect the creation of
4544 	 * the sysfs interfaces.
4545 */
4546 r = amdgpu_atombios_sysfs_init(adev);
4547 if (r)
4548 drm_err(&adev->ddev,
4549 "registering atombios sysfs failed (%d).\n", r);
4550
4551 r = amdgpu_pm_sysfs_init(adev);
4552 if (r)
4553 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4554
4555 r = amdgpu_ucode_sysfs_init(adev);
4556 if (r) {
4557 adev->ucode_sysfs_en = false;
4558 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4559 } else
4560 adev->ucode_sysfs_en = true;
4561
4562 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
4563 if (r)
4564 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4565
4566 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4567 if (r)
4568 dev_err(adev->dev,
4569 "Could not create amdgpu board attributes\n");
4570
4571 amdgpu_fru_sysfs_init(adev);
4572 amdgpu_reg_state_sysfs_init(adev);
4573 amdgpu_xcp_cfg_sysfs_init(adev);
4574
4575 	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
4576 		r = amdgpu_pmu_init(adev);
4577 		if (r)
4578 			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}
4579
4580 	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
4581 if (amdgpu_device_cache_pci_state(adev->pdev))
4582 pci_restore_state(pdev);
4583
4584 	/* if we have more than one VGA card, then disable the amdgpu VGA resources */
4585 /* this will fail for cards that aren't VGA class devices, just
4586 * ignore it
4587 */
4588 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4589 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4590
4591 px = amdgpu_device_supports_px(ddev);
4592
4593 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4594 apple_gmux_detect(NULL, NULL)))
4595 vga_switcheroo_register_client(adev->pdev,
4596 &amdgpu_switcheroo_ops, px);
4597
4598 if (px)
4599 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4600
4601 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4602 amdgpu_xgmi_reset_on_init(adev);
4603
4604 amdgpu_device_check_iommu_direct_map(adev);
4605
4606 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4607 r = register_pm_notifier(&adev->pm_nb);
4608 if (r)
4609 goto failed;
4610
4611 return 0;
4612
4613release_ras_con:
4614 if (amdgpu_sriov_vf(adev))
4615 amdgpu_virt_release_full_gpu(adev, true);
4616
4617 /* failed in exclusive mode due to timeout */
4618 if (amdgpu_sriov_vf(adev) &&
4619 !amdgpu_sriov_runtime(adev) &&
4620 amdgpu_virt_mmio_blocked(adev) &&
4621 !amdgpu_virt_wait_reset(adev)) {
4622 dev_err(adev->dev, "VF exclusive mode timeout\n");
4623 /* Don't send request since VF is inactive. */
4624 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4625 adev->virt.ops = NULL;
4626 r = -EAGAIN;
4627 }
4628 amdgpu_release_ras_context(adev);
4629
4630failed:
4631 amdgpu_vf_error_trans_all(adev);
4632
4633 return r;
4634}
4635
4636static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4637{
4638
4639 /* Clear all CPU mappings pointing to this device */
4640 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4641
4642 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4643 amdgpu_doorbell_fini(adev);
4644
4645 iounmap(adev->rmmio);
4646 adev->rmmio = NULL;
4647 if (adev->mman.aper_base_kaddr)
4648 iounmap(adev->mman.aper_base_kaddr);
4649 adev->mman.aper_base_kaddr = NULL;
4650
4651 /* Memory manager related */
4652 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4653 arch_phys_wc_del(adev->gmc.vram_mtrr);
4654 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4655 }
4656}
4657
4658/**
4659 * amdgpu_device_fini_hw - tear down the driver
4660 *
4661 * @adev: amdgpu_device pointer
4662 *
4663 * Tear down the driver info (all asics).
4664 * Called at driver shutdown.
4665 */
4666void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4667{
4668 dev_info(adev->dev, "amdgpu: finishing device.\n");
4669 flush_delayed_work(&adev->delayed_init_work);
4670
4671 if (adev->mman.initialized)
4672 drain_workqueue(adev->mman.bdev.wq);
4673 adev->shutdown = true;
4674
4675 unregister_pm_notifier(&adev->pm_nb);
4676
4677 /* make sure IB test finished before entering exclusive mode
4678 * to avoid preemption on IB test
4679 */
4680 if (amdgpu_sriov_vf(adev)) {
4681 amdgpu_virt_request_full_gpu(adev, false);
4682 amdgpu_virt_fini_data_exchange(adev);
4683 }
4684
4685 /* disable all interrupts */
4686 amdgpu_irq_disable_all(adev);
4687 if (adev->mode_info.mode_config_initialized) {
4688 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4689 drm_helper_force_disable_all(adev_to_drm(adev));
4690 else
4691 drm_atomic_helper_shutdown(adev_to_drm(adev));
4692 }
4693 amdgpu_fence_driver_hw_fini(adev);
4694
4695 if (adev->pm.sysfs_initialized)
4696 amdgpu_pm_sysfs_fini(adev);
4697 if (adev->ucode_sysfs_en)
4698 amdgpu_ucode_sysfs_fini(adev);
4699 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4700 amdgpu_fru_sysfs_fini(adev);
4701
4702 amdgpu_reg_state_sysfs_fini(adev);
4703 amdgpu_xcp_cfg_sysfs_fini(adev);
4704
4705 	/* ras features must be disabled before hw fini */
4706 amdgpu_ras_pre_fini(adev);
4707
4708 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4709
4710 amdgpu_device_ip_fini_early(adev);
4711
4712 amdgpu_irq_fini_hw(adev);
4713
4714 if (adev->mman.initialized)
4715 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4716
4717 amdgpu_gart_dummy_page_fini(adev);
4718
4719 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4720 amdgpu_device_unmap_mmio(adev);
4721
4722}
4723
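/**
 * amdgpu_device_fini_sw - tear down the driver software state
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the software state (all asics) after the hardware teardown
 * is done. Called at driver shutdown, after amdgpu_device_fini_hw().
 */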
4724void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4725{
4726 int idx;
4727 bool px;
4728
4729 amdgpu_device_ip_fini(adev);
4730 amdgpu_fence_driver_sw_fini(adev);
4731 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4732 adev->accel_working = false;
4733 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4734
4735 amdgpu_reset_fini(adev);
4736
4737 /* free i2c buses */
4738 if (!amdgpu_device_has_dc_support(adev))
4739 amdgpu_i2c_fini(adev);
4740
4741 if (amdgpu_emu_mode != 1)
4742 amdgpu_atombios_fini(adev);
4743
4744 kfree(adev->bios);
4745 adev->bios = NULL;
4746
4747 kfree(adev->fru_info);
4748 adev->fru_info = NULL;
4749
4750 px = amdgpu_device_supports_px(adev_to_drm(adev));
4751
4752 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4753 apple_gmux_detect(NULL, NULL)))
4754 vga_switcheroo_unregister_client(adev->pdev);
4755
4756 if (px)
4757 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4758
4759 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4760 vga_client_unregister(adev->pdev);
4761
4762 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4763
4764 iounmap(adev->rmmio);
4765 adev->rmmio = NULL;
4766 amdgpu_doorbell_fini(adev);
4767 drm_dev_exit(idx);
4768 }
4769
4770 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4771 amdgpu_pmu_fini(adev);
4772 if (adev->mman.discovery_bin)
4773 amdgpu_discovery_fini(adev);
4774
4775 amdgpu_reset_put_reset_domain(adev->reset_domain);
4776 adev->reset_domain = NULL;
4777
4778 kfree(adev->pci_state);
4779
4780}
4781
4782/**
4783 * amdgpu_device_evict_resources - evict device resources
4784 * @adev: amdgpu device object
4785 *
4786 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4787 * of the vram memory type. Mainly used for evicting device resources
4788 * at suspend time.
4789 *
4790 */
4791static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4792{
4793 int ret;
4794
4795 /* No need to evict vram on APUs unless going to S4 */
4796 if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
4797 return 0;
4798
4799 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4800 if (ret)
4801 DRM_WARN("evicting device resources failed\n");
4802 return ret;
4803}
4804
4805/*
4806 * Suspend & resume.
4807 */
4808/**
4809 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
4810 * @nb: notifier block
4811 * @mode: suspend mode
4812 * @data: data
4813 *
4814 * This function is called when the system is about to suspend or hibernate.
4815 * It is used to evict resources from the device before the system goes to
4816 * sleep while there is still access to swap.
4817 */
4818static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
4819 void *data)
4820{
4821 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
4822 int r;
4823
4824 switch (mode) {
4825 case PM_HIBERNATION_PREPARE:
4826 adev->in_s4 = true;
4827 fallthrough;
4828 case PM_SUSPEND_PREPARE:
4829 r = amdgpu_device_evict_resources(adev);
4830 /*
4831 		 * This is considered non-fatal at this time because
4832 		 * amdgpu_device_prepare() will also evict resources and treat a failure there as fatal.
4833 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781
4834 */
4835 if (r)
4836 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r);
4837 break;
4838 }
4839
4840 return NOTIFY_DONE;
4841}
4842
4843/**
4844 * amdgpu_device_prepare - prepare for device suspend
4845 *
4846 * @dev: drm dev pointer
4847 *
4848 * Prepare to put the hw in the suspend state (all asics).
4849 * Returns 0 for success or an error on failure.
4850 * Called at driver suspend.
4851 */
4852int amdgpu_device_prepare(struct drm_device *dev)
4853{
4854 struct amdgpu_device *adev = drm_to_adev(dev);
4855 int i, r;
4856
4857 amdgpu_choose_low_power_state(adev);
4858
4859 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4860 return 0;
4861
4862 /* Evict the majority of BOs before starting suspend sequence */
4863 r = amdgpu_device_evict_resources(adev);
4864 if (r)
4865 goto unprepare;
4866
4867 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4868
4869 for (i = 0; i < adev->num_ip_blocks; i++) {
4870 if (!adev->ip_blocks[i].status.valid)
4871 continue;
4872 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4873 continue;
4874 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
4875 if (r)
4876 goto unprepare;
4877 }
4878
4879 return 0;
4880
4881unprepare:
4882 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false;
4883
4884 return r;
4885}
4886
4887/**
4888 * amdgpu_device_suspend - initiate device suspend
4889 *
4890 * @dev: drm dev pointer
4891 * @notify_clients: notify in-kernel DRM clients
4892 *
4893 * Puts the hw in the suspend state (all asics).
4894 * Returns 0 for success or an error on failure.
4895 * Called at driver suspend.
4896 */
4897int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
4898{
4899 struct amdgpu_device *adev = drm_to_adev(dev);
4900 int r = 0;
4901
4902 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4903 return 0;
4904
4905 adev->in_suspend = true;
4906
4907 if (amdgpu_sriov_vf(adev)) {
4908 amdgpu_virt_fini_data_exchange(adev);
4909 r = amdgpu_virt_request_full_gpu(adev, false);
4910 if (r)
4911 return r;
4912 }
4913
4914 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4915 DRM_WARN("smart shift update failed\n");
4916
4917 if (notify_clients)
4918 drm_client_dev_suspend(adev_to_drm(adev), false);
4919
4920 cancel_delayed_work_sync(&adev->delayed_init_work);
4921
4922 amdgpu_ras_suspend(adev);
4923
4924 amdgpu_device_ip_suspend_phase1(adev);
4925
4926 if (!adev->in_s0ix)
4927 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4928
4929 r = amdgpu_device_evict_resources(adev);
4930 if (r)
4931 return r;
4932
4933 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4934
4935 amdgpu_fence_driver_hw_fini(adev);
4936
4937 amdgpu_device_ip_suspend_phase2(adev);
4938
4939 if (amdgpu_sriov_vf(adev))
4940 amdgpu_virt_release_full_gpu(adev, false);
4941
4942 r = amdgpu_dpm_notify_rlc_state(adev, false);
4943 if (r)
4944 return r;
4945
4946 return 0;
4947}
4948
4949/**
4950 * amdgpu_device_resume - initiate device resume
4951 *
4952 * @dev: drm dev pointer
4953 * @notify_clients: notify in-kernel DRM clients
4954 *
4955 * Bring the hw back to operating state (all asics).
4956 * Returns 0 for success or an error on failure.
4957 * Called at driver resume.
4958 */
4959int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
4960{
4961 struct amdgpu_device *adev = drm_to_adev(dev);
4962 int r = 0;
4963
4964 if (amdgpu_sriov_vf(adev)) {
4965 r = amdgpu_virt_request_full_gpu(adev, true);
4966 if (r)
4967 return r;
4968 }
4969
4970 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4971 return 0;
4972
4973 if (adev->in_s0ix)
4974 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4975
4976 /* post card */
4977 if (amdgpu_device_need_post(adev)) {
4978 r = amdgpu_device_asic_init(adev);
4979 if (r)
4980 dev_err(adev->dev, "amdgpu asic init failed\n");
4981 }
4982
4983 r = amdgpu_device_ip_resume(adev);
4984
4985 if (r) {
4986 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4987 goto exit;
4988 }
4989
4990 if (!adev->in_s0ix) {
4991 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4992 if (r)
4993 goto exit;
4994 }
4995
4996 r = amdgpu_device_ip_late_init(adev);
4997 if (r)
4998 goto exit;
4999
5000 queue_delayed_work(system_wq, &adev->delayed_init_work,
5001 msecs_to_jiffies(AMDGPU_RESUME_MS));
5002exit:
5003 if (amdgpu_sriov_vf(adev)) {
5004 amdgpu_virt_init_data_exchange(adev);
5005 amdgpu_virt_release_full_gpu(adev, true);
5006 }
5007
5008 if (r)
5009 return r;
5010
5011 /* Make sure IB tests flushed */
5012 flush_delayed_work(&adev->delayed_init_work);
5013
5014 if (notify_clients)
5015 drm_client_dev_resume(adev_to_drm(adev), false);
5016
5017 amdgpu_ras_resume(adev);
5018
5019 if (adev->mode_info.num_crtc) {
5020 /*
5021 * Most of the connector probing functions try to acquire runtime pm
5022 * refs to ensure that the GPU is powered on when connector polling is
5023 * performed. Since we're calling this from a runtime PM callback,
5024 * trying to acquire rpm refs will cause us to deadlock.
5025 *
5026 * Since we're guaranteed to be holding the rpm lock, it's safe to
5027 * temporarily disable the rpm helpers so this doesn't deadlock us.
5028 */
5029#ifdef CONFIG_PM
5030 dev->dev->power.disable_depth++;
5031#endif
5032 if (!adev->dc_enabled)
5033 drm_helper_hpd_irq_event(dev);
5034 else
5035 drm_kms_helper_hotplug_event(dev);
5036#ifdef CONFIG_PM
5037 dev->dev->power.disable_depth--;
5038#endif
5039 }
5040 adev->in_suspend = false;
5041
5042 if (adev->enable_mes)
5043 amdgpu_mes_self_test(adev);
5044
5045 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
5046 DRM_WARN("smart shift update failed\n");
5047
5048 return 0;
5049}
5050
5051/**
5052 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
5053 *
5054 * @adev: amdgpu_device pointer
5055 *
5056 * The list of all the hardware IPs that make up the asic is walked and
5057 * the check_soft_reset callbacks are run. check_soft_reset determines
5058 * if the asic is still hung or not.
5059 * Returns true if any of the IPs are still in a hung state, false if not.
5060 */
5061static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
5062{
5063 int i;
5064 bool asic_hang = false;
5065
5066 if (amdgpu_sriov_vf(adev))
5067 return true;
5068
5069 if (amdgpu_asic_need_full_reset(adev))
5070 return true;
5071
5072 for (i = 0; i < adev->num_ip_blocks; i++) {
5073 if (!adev->ip_blocks[i].status.valid)
5074 continue;
5075 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
5076 adev->ip_blocks[i].status.hang =
5077 adev->ip_blocks[i].version->funcs->check_soft_reset(
5078 &adev->ip_blocks[i]);
5079 if (adev->ip_blocks[i].status.hang) {
5080 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
5081 asic_hang = true;
5082 }
5083 }
5084 return asic_hang;
5085}
5086
5087/**
5088 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5089 *
5090 * @adev: amdgpu_device pointer
5091 *
5092 * The list of all the hardware IPs that make up the asic is walked and the
5093 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
5094 * handles any IP specific hardware or software state changes that are
5095 * necessary for a soft reset to succeed.
5096 * Returns 0 on success, negative error code on failure.
5097 */
5098static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5099{
5100 int i, r = 0;
5101
5102 for (i = 0; i < adev->num_ip_blocks; i++) {
5103 if (!adev->ip_blocks[i].status.valid)
5104 continue;
5105 if (adev->ip_blocks[i].status.hang &&
5106 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5107 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
5108 if (r)
5109 return r;
5110 }
5111 }
5112
5113 return 0;
5114}
5115
5116/**
5117 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5118 *
5119 * @adev: amdgpu_device pointer
5120 *
5121 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
5122 * reset is necessary to recover.
5123 * Returns true if a full asic reset is required, false if not.
5124 */
5125static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5126{
5127 int i;
5128
5129 if (amdgpu_asic_need_full_reset(adev))
5130 return true;
5131
5132 for (i = 0; i < adev->num_ip_blocks; i++) {
5133 if (!adev->ip_blocks[i].status.valid)
5134 continue;
5135 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5136 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5137 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5138 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5139 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5140 if (adev->ip_blocks[i].status.hang) {
5141 				dev_info(adev->dev, "Some blocks need a full reset!\n");
5142 return true;
5143 }
5144 }
5145 }
5146 return false;
5147}
5148
5149/**
5150 * amdgpu_device_ip_soft_reset - do a soft reset
5151 *
5152 * @adev: amdgpu_device pointer
5153 *
5154 * The list of all the hardware IPs that make up the asic is walked and the
5155 * soft_reset callbacks are run if the block is hung. soft_reset handles any
5156 * IP specific hardware or software state changes that are necessary to soft
5157 * reset the IP.
5158 * Returns 0 on success, negative error code on failure.
5159 */
5160static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5161{
5162 int i, r = 0;
5163
5164 for (i = 0; i < adev->num_ip_blocks; i++) {
5165 if (!adev->ip_blocks[i].status.valid)
5166 continue;
5167 if (adev->ip_blocks[i].status.hang &&
5168 adev->ip_blocks[i].version->funcs->soft_reset) {
5169 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5170 if (r)
5171 return r;
5172 }
5173 }
5174
5175 return 0;
5176}
5177
5178/**
5179 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5180 *
5181 * @adev: amdgpu_device pointer
5182 *
5183 * The list of all the hardware IPs that make up the asic is walked and the
5184 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5185 * handles any IP specific hardware or software state changes that are
5186 * necessary after the IP has been soft reset.
5187 * Returns 0 on success, negative error code on failure.
5188 */
5189static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5190{
5191 int i, r = 0;
5192
5193 for (i = 0; i < adev->num_ip_blocks; i++) {
5194 if (!adev->ip_blocks[i].status.valid)
5195 continue;
5196 if (adev->ip_blocks[i].status.hang &&
5197 adev->ip_blocks[i].version->funcs->post_soft_reset)
5198 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5199 if (r)
5200 return r;
5201 }
5202
5203 return 0;
5204}
5205
5206/**
5207 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5208 *
5209 * @adev: amdgpu_device pointer
5210 * @reset_context: amdgpu reset context pointer
5211 *
5212 * Do a VF FLR and reinitialize the ASIC.
5213 * Returns 0 on success, negative error code on failure.
5214 */
5215static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5216 struct amdgpu_reset_context *reset_context)
5217{
5218 int r;
5219 struct amdgpu_hive_info *hive = NULL;
5220
5221 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5222 if (!amdgpu_ras_get_fed_status(adev))
5223 amdgpu_virt_ready_to_reset(adev);
5224 amdgpu_virt_wait_reset(adev);
5225 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5226 r = amdgpu_virt_request_full_gpu(adev, true);
5227 } else {
5228 r = amdgpu_virt_reset_gpu(adev);
5229 }
5230 if (r)
5231 return r;
5232
5233 amdgpu_ras_clear_err_state(adev);
5234 amdgpu_irq_gpu_reset_resume_helper(adev);
5235
5236 	/* some SW cleanup the VF needs to do before recovery */
5237 amdgpu_virt_post_reset(adev);
5238
5239 /* Resume IP prior to SMC */
5240 r = amdgpu_device_ip_reinit_early_sriov(adev);
5241 if (r)
5242 return r;
5243
5244 amdgpu_virt_init_data_exchange(adev);
5245
5246 r = amdgpu_device_fw_loading(adev);
5247 if (r)
5248 return r;
5249
5250 /* now we are okay to resume SMC/CP/SDMA */
5251 r = amdgpu_device_ip_reinit_late_sriov(adev);
5252 if (r)
5253 return r;
5254
5255 hive = amdgpu_get_xgmi_hive(adev);
5256 /* Update PSP FW topology after reset */
5257 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5258 r = amdgpu_xgmi_update_topology(hive, adev);
5259 if (hive)
5260 amdgpu_put_xgmi_hive(hive);
5261 if (r)
5262 return r;
5263
5264 r = amdgpu_ib_ring_tests(adev);
5265 if (r)
5266 return r;
5267
5268 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5269 amdgpu_inc_vram_lost(adev);
5270
5271 /* need to be called during full access so we can't do it later like
5272 * bare-metal does.
5273 */
5274 amdgpu_amdkfd_post_reset(adev);
5275 amdgpu_virt_release_full_gpu(adev, true);
5276
5277 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
5278 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5279 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5280 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5281 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5282 amdgpu_ras_resume(adev);
5283
5284 amdgpu_virt_ras_telemetry_post_reset(adev);
5285
5286 return 0;
5287}
5288
5289/**
5290 * amdgpu_device_has_job_running - check if there is any unfinished job
5291 *
5292 * @adev: amdgpu_device pointer
5293 *
5294 * Check if there is any job running on the device when the guest driver receives
5295 * an FLR notification from the host driver. If there are still jobs running,
5296 * the guest driver will not respond to the FLR reset. Instead, the jobs are left to hit
5297 * the timeout and the guest driver then issues the reset request.
5298 */
5299bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5300{
5301 int i;
5302
5303 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5304 struct amdgpu_ring *ring = adev->rings[i];
5305
5306 if (!amdgpu_ring_sched_ready(ring))
5307 continue;
5308
5309 if (amdgpu_fence_count_emitted(ring))
5310 return true;
5311 }
5312 return false;
5313}
5314
5315/**
5316 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5317 *
5318 * @adev: amdgpu_device pointer
5319 *
5320 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5321 * a hung GPU.
5322 */
5323bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5324{
5325
5326 if (amdgpu_gpu_recovery == 0)
5327 goto disabled;
5328
5329 /* Skip soft reset check in fatal error mode */
5330 if (!amdgpu_ras_is_poison_mode_supported(adev))
5331 return true;
5332
5333 if (amdgpu_sriov_vf(adev))
5334 return true;
5335
5336 if (amdgpu_gpu_recovery == -1) {
5337 switch (adev->asic_type) {
5338#ifdef CONFIG_DRM_AMDGPU_SI
5339 case CHIP_VERDE:
5340 case CHIP_TAHITI:
5341 case CHIP_PITCAIRN:
5342 case CHIP_OLAND:
5343 case CHIP_HAINAN:
5344#endif
5345#ifdef CONFIG_DRM_AMDGPU_CIK
5346 case CHIP_KAVERI:
5347 case CHIP_KABINI:
5348 case CHIP_MULLINS:
5349#endif
5350 case CHIP_CARRIZO:
5351 case CHIP_STONEY:
5352 case CHIP_CYAN_SKILLFISH:
5353 goto disabled;
5354 default:
5355 break;
5356 }
5357 }
5358
5359 return true;
5360
5361disabled:
5362 dev_info(adev->dev, "GPU recovery disabled.\n");
5363 return false;
5364}
5365
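/**
 * amdgpu_device_mode1_reset - perform a full ASIC mode1 reset
 *
 * @adev: amdgpu_device pointer
 *
 * Caches the PCI config space, disables bus mastering, triggers a mode1
 * reset through the SMU or the PSP, and waits for the ASIC to come back
 * out of reset.
 * Returns 0 on success, negative error code on failure.
 */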
5366int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5367{
5368 u32 i;
5369 int ret = 0;
5370
5371 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5372
5373 dev_info(adev->dev, "GPU mode1 reset\n");
5374
5375 /* Cache the state before bus master disable. The saved config space
5376 * values are used in other cases like restore after mode-2 reset.
5377 */
5378 amdgpu_device_cache_pci_state(adev->pdev);
5379
5380 /* disable BM */
5381 pci_clear_master(adev->pdev);
5382
5383 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5384 dev_info(adev->dev, "GPU smu mode1 reset\n");
5385 ret = amdgpu_dpm_mode1_reset(adev);
5386 } else {
5387 dev_info(adev->dev, "GPU psp mode1 reset\n");
5388 ret = psp_gpu_reset(adev);
5389 }
5390
5391 if (ret)
5392 goto mode1_reset_failed;
5393
5394 amdgpu_device_load_pci_state(adev->pdev);
5395 ret = amdgpu_psp_wait_for_bootloader(adev);
5396 if (ret)
5397 goto mode1_reset_failed;
5398
5399 /* wait for asic to come out of reset */
5400 for (i = 0; i < adev->usec_timeout; i++) {
5401 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5402
5403 if (memsize != 0xffffffff)
5404 break;
5405 udelay(1);
5406 }
5407
5408 if (i >= adev->usec_timeout) {
5409 ret = -ETIMEDOUT;
5410 goto mode1_reset_failed;
5411 }
5412
5413 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5414
5415 return 0;
5416
5417mode1_reset_failed:
5418 dev_err(adev->dev, "GPU mode1 reset failed\n");
5419 return ret;
5420}
5421
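/**
 * amdgpu_device_pre_asic_reset - prepare for an ASIC reset
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * Clears pending job fences, forces fence completion on all rings, tries
 * a soft reset of the hung IP blocks where possible and decides whether a
 * full reset is still required.
 * Returns 0 on success, negative error code on failure.
 */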
5422int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5423 struct amdgpu_reset_context *reset_context)
5424{
5425 int i, r = 0;
5426 struct amdgpu_job *job = NULL;
5427 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5428 bool need_full_reset =
5429 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5430
5431 if (reset_context->reset_req_dev == adev)
5432 job = reset_context->job;
5433
5434 if (amdgpu_sriov_vf(adev))
5435 amdgpu_virt_pre_reset(adev);
5436
5437 amdgpu_fence_driver_isr_toggle(adev, true);
5438
5439 /* block all schedulers and reset given job's ring */
5440 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5441 struct amdgpu_ring *ring = adev->rings[i];
5442
5443 if (!amdgpu_ring_sched_ready(ring))
5444 continue;
5445
5446 		/* Clear job fences from the fence drv so that force_completion
5447 		 * doesn't leave NULL and vm flush fences in the fence drv
5448 */
5449 amdgpu_fence_driver_clear_job_fences(ring);
5450
5451 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5452 amdgpu_fence_driver_force_completion(ring);
5453 }
5454
5455 amdgpu_fence_driver_isr_toggle(adev, false);
5456
5457 if (job && job->vm)
5458 drm_sched_increase_karma(&job->base);
5459
5460 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5461 /* If reset handler not implemented, continue; otherwise return */
5462 if (r == -EOPNOTSUPP)
5463 r = 0;
5464 else
5465 return r;
5466
5467 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5468 if (!amdgpu_sriov_vf(adev)) {
5469
5470 if (!need_full_reset)
5471 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5472
5473 if (!need_full_reset && amdgpu_gpu_recovery &&
5474 amdgpu_device_ip_check_soft_reset(adev)) {
5475 amdgpu_device_ip_pre_soft_reset(adev);
5476 r = amdgpu_device_ip_soft_reset(adev);
5477 amdgpu_device_ip_post_soft_reset(adev);
5478 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5479 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5480 need_full_reset = true;
5481 }
5482 }
5483
5484 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5485 dev_info(tmp_adev->dev, "Dumping IP State\n");
5486 /* Trigger ip dump before we reset the asic */
5487 for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5488 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5489 tmp_adev->ip_blocks[i].version->funcs
5490 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
5491 dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5492 }
5493
5494 		if (need_full_reset) {
5495 			r = amdgpu_device_ip_suspend(adev);
5496 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5497 		} else {
5498 			clear_bit(AMDGPU_NEED_FULL_RESET,
5499 				  &reset_context->flags);
5500 		}
5501 }
5502
5503 return r;
5504}
5505
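/**
 * amdgpu_device_reinit_after_reset - reinitialize devices after an ASIC reset
 *
 * @reset_context: amdgpu reset context pointer
 *
 * Walks the list of devices affected by the reset; for a full reset each
 * ASIC is re-posted, the IP blocks are resumed in phases, firmware is
 * reloaded and RAS/XGMI state is restored, followed by IB ring tests.
 * Returns 0 on success, negative error code on failure.
 */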
5506int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
5507{
5508 struct list_head *device_list_handle;
5509 bool full_reset, vram_lost = false;
5510 struct amdgpu_device *tmp_adev;
5511 int r, init_level;
5512
5513 device_list_handle = reset_context->reset_device_list;
5514
5515 if (!device_list_handle)
5516 return -EINVAL;
5517
5518 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5519
5520 	/*
5521 	 * If it's a reset on init, use the default init level, otherwise keep
5522 	 * the level as the recovery level.
5523 */
5524 if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
5525 init_level = AMDGPU_INIT_LEVEL_DEFAULT;
5526 else
5527 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
5528
5529 r = 0;
5530 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5531 amdgpu_set_init_level(tmp_adev, init_level);
5532 if (full_reset) {
5533 /* post card */
5534 amdgpu_ras_clear_err_state(tmp_adev);
5535 r = amdgpu_device_asic_init(tmp_adev);
5536 if (r) {
5537 dev_warn(tmp_adev->dev, "asic atom init failed!");
5538 } else {
5539 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5540
5541 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5542 if (r)
5543 goto out;
5544
5545 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5546
5547 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5548 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
5549
5550 if (vram_lost) {
5551 DRM_INFO("VRAM is lost due to GPU reset!\n");
5552 amdgpu_inc_vram_lost(tmp_adev);
5553 }
5554
5555 r = amdgpu_device_fw_loading(tmp_adev);
5556 if (r)
5557 return r;
5558
5559 r = amdgpu_xcp_restore_partition_mode(
5560 tmp_adev->xcp_mgr);
5561 if (r)
5562 goto out;
5563
5564 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5565 if (r)
5566 goto out;
5567
5568 if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5569 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5570
5571 r = amdgpu_device_ip_resume_phase3(tmp_adev);
5572 if (r)
5573 goto out;
5574
5575 if (vram_lost)
5576 amdgpu_device_fill_reset_magic(tmp_adev);
5577
5578				/*
5579				 * Add this ASIC back as tracked since the reset
5580				 * has already completed successfully.
5581				 */
5582 amdgpu_register_gpu_instance(tmp_adev);
5583
5584 if (!reset_context->hive &&
5585 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5586 amdgpu_xgmi_add_device(tmp_adev);
5587
5588 r = amdgpu_device_ip_late_init(tmp_adev);
5589 if (r)
5590 goto out;
5591
5592 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
5593
5594				/*
5595				 * The GPU enters a bad state once the number of
5596				 * faulty pages retired due to ECC errors reaches
5597				 * the threshold, and RAS recovery is scheduled
5598				 * next. Add a check here to abort recovery if the
5599				 * bad page threshold has indeed been exceeded, and
5600				 * remind the user to either retire this GPU or set
5601				 * a bigger bad_page_threshold value to work around
5602				 * this the next time the driver is probed.
5603				 */
5604 if (!amdgpu_ras_is_rma(tmp_adev)) {
5605 /* must succeed. */
5606 amdgpu_ras_resume(tmp_adev);
5607 } else {
5608 r = -EINVAL;
5609 goto out;
5610 }
5611
5612 /* Update PSP FW topology after reset */
5613 if (reset_context->hive &&
5614 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5615 r = amdgpu_xgmi_update_topology(
5616 reset_context->hive, tmp_adev);
5617 }
5618 }
5619
5620out:
5621 if (!r) {
5622 /* IP init is complete now, set level as default */
5623 amdgpu_set_init_level(tmp_adev,
5624 AMDGPU_INIT_LEVEL_DEFAULT);
5625 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5626 r = amdgpu_ib_ring_tests(tmp_adev);
5627 if (r) {
5628 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5629 r = -EAGAIN;
5630 goto end;
5631 }
5632 }
5633
5634 if (r)
5635 tmp_adev->asic_reset_res = r;
5636 }
5637
5638end:
5639 return r;
5640}
5641
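/**
 * amdgpu_do_asic_reset - perform the actual ASIC reset for a list of devices
 * @device_list_handle: list of devices to reset
 * @reset_context: amdgpu reset context pointer
 *
 * Try the dedicated reset handler first; if none is implemented, fall back to
 * the default flow: reset all devices (in parallel for XGMI hives), clear the
 * RAS interrupt state and re-initialize everything through
 * amdgpu_device_reinit_after_reset().
 *
 * Returns 0 on success or a negative error code on failure.
 */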
5642int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5643 struct amdgpu_reset_context *reset_context)
5644{
5645 struct amdgpu_device *tmp_adev = NULL;
5646 bool need_full_reset, skip_hw_reset;
5647 int r = 0;
5648
5649 /* Try reset handler method first */
5650 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5651 reset_list);
5652
5653 reset_context->reset_device_list = device_list_handle;
5654 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5655 /* If reset handler not implemented, continue; otherwise return */
5656 if (r == -EOPNOTSUPP)
5657 r = 0;
5658 else
5659 return r;
5660
5661 /* Reset handler not implemented, use the default method */
5662 need_full_reset =
5663 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5664 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5665
5666	/*
5667	 * ASIC reset has to be done on all XGMI hive nodes ASAP
5668	 * to allow proper link negotiation in FW (within 1 sec)
5669	 */
5670 if (!skip_hw_reset && need_full_reset) {
5671 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5672 /* For XGMI run all resets in parallel to speed up the process */
5673 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5674 if (!queue_work(system_unbound_wq,
5675 &tmp_adev->xgmi_reset_work))
5676 r = -EALREADY;
5677 } else
5678 r = amdgpu_asic_reset(tmp_adev);
5679
5680 if (r) {
5681 dev_err(tmp_adev->dev,
5682 "ASIC reset failed with error, %d for drm dev, %s",
5683 r, adev_to_drm(tmp_adev)->unique);
5684 goto out;
5685 }
5686 }
5687
5688		/* For XGMI wait for all resets to complete before proceeding */
5689 if (!r) {
5690 list_for_each_entry(tmp_adev, device_list_handle,
5691 reset_list) {
5692 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5693 flush_work(&tmp_adev->xgmi_reset_work);
5694 r = tmp_adev->asic_reset_res;
5695 if (r)
5696 break;
5697 }
5698 }
5699 }
5700 }
5701
5702 if (!r && amdgpu_ras_intr_triggered()) {
5703 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5704 amdgpu_ras_reset_error_count(tmp_adev,
5705 AMDGPU_RAS_BLOCK__MMHUB);
5706 }
5707
5708 amdgpu_ras_intr_cleared();
5709 }
5710
5711 r = amdgpu_device_reinit_after_reset(reset_context);
5712 if (r == -EAGAIN)
5713 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5714 else
5715 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5716
5717out:
5718 return r;
5719}
5720
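/*
 * Set adev->mp1_state according to the ASIC reset method about to be used:
 * shutdown for mode1, reset for mode2, none otherwise. The matching
 * amdgpu_device_unset_mp1_state() below restores the state once recovery
 * has finished.
 */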
5721static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5722{
5723
5724 switch (amdgpu_asic_reset_method(adev)) {
5725 case AMD_RESET_METHOD_MODE1:
5726 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5727 break;
5728 case AMD_RESET_METHOD_MODE2:
5729 adev->mp1_state = PP_MP1_STATE_RESET;
5730 break;
5731 default:
5732 adev->mp1_state = PP_MP1_STATE_NONE;
5733 break;
5734 }
5735}
5736
5737static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5738{
5739 amdgpu_vf_error_trans_all(adev);
5740 adev->mp1_state = PP_MP1_STATE_NONE;
5741}
5742
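/*
 * Re-enable runtime PM for the display audio function (function 1 on the
 * GPU's bus) and resume it, undoing amdgpu_device_suspend_display_audio().
 */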
5743static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5744{
5745 struct pci_dev *p = NULL;
5746
5747 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5748 adev->pdev->bus->number, 1);
5749 if (p) {
5750 pm_runtime_enable(&(p->dev));
5751 pm_runtime_resume(&(p->dev));
5752 }
5753
5754 pci_dev_put(p);
5755}
5756
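/*
 * Runtime-suspend the display audio function before a BACO or mode1 reset so
 * that the reset does not touch the audio hardware behind the audio driver's
 * back. Returns 0 on success, or a negative error code if the reset method
 * does not require it, the audio device is absent, or the suspend times out.
 */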
5757static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5758{
5759 enum amd_reset_method reset_method;
5760 struct pci_dev *p = NULL;
5761 u64 expires;
5762
5763	/*
5764	 * For now, only BACO and mode1 reset are confirmed to suffer
5765	 * from the audio issue if the audio device is not properly suspended.
5766	 */
5767 reset_method = amdgpu_asic_reset_method(adev);
5768 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5769 (reset_method != AMD_RESET_METHOD_MODE1))
5770 return -EINVAL;
5771
5772 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5773 adev->pdev->bus->number, 1);
5774 if (!p)
5775 return -ENODEV;
5776
5777 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5778 if (!expires)
5779		/*
5780		 * If we cannot get the audio device autosuspend delay,
5781		 * a fixed 4s interval will be used. Since 3s is the audio
5782		 * controller's default autosuspend delay, the 4s used here
5783		 * is guaranteed to cover it.
5784		 */
5785 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5786
5787 while (!pm_runtime_status_suspended(&(p->dev))) {
5788 if (!pm_runtime_suspend(&(p->dev)))
5789 break;
5790
5791 if (expires < ktime_get_mono_fast_ns()) {
5792 dev_warn(adev->dev, "failed to suspend display audio\n");
5793 pci_dev_put(p);
5794 /* TODO: abort the succeeding gpu reset? */
5795 return -ETIMEDOUT;
5796 }
5797 }
5798
5799 pm_runtime_disable(&(p->dev));
5800
5801 pci_dev_put(p);
5802 return 0;
5803}
5804
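/*
 * Cancel any reset work queued before the current reset completed: the
 * debugfs-triggered reset work, KFD reset work, SR-IOV FLR work and RAS
 * recovery work.
 */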
5805static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5806{
5807 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5808
5809#if defined(CONFIG_DEBUG_FS)
5810 if (!amdgpu_sriov_vf(adev))
5811 cancel_work(&adev->reset_work);
5812#endif
5813
5814 if (adev->kfd.dev)
5815 cancel_work(&adev->kfd.reset_work);
5816
5817 if (amdgpu_sriov_vf(adev))
5818 cancel_work(&adev->virt.flr_work);
5819
5820 if (con && adev->ras_enabled)
5821 cancel_work(&con->recovery_work);
5822
5823}
5824
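/*
 * Check that every device in the list is still reachable on the PCI bus by
 * reading its PCI_COMMAND register; return -ENODEV if any device has dropped
 * off the bus.
 */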
5825static int amdgpu_device_health_check(struct list_head *device_list_handle)
5826{
5827 struct amdgpu_device *tmp_adev;
5828 int ret = 0;
5829 u32 status;
5830
5831 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5832 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
5833 if (PCI_POSSIBLE_ERROR(status)) {
5834 dev_err(tmp_adev->dev, "device lost from bus!");
5835 ret = -ENODEV;
5836 }
5837 }
5838
5839 return ret;
5840}
5841
5842/**
5843 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5844 *
5845 * @adev: amdgpu_device pointer
5846 * @job: the job which triggered the hang
5847 * @reset_context: amdgpu reset context pointer
5848 *
5849 * Attempt to reset the GPU if it has hung (all ASICs).
5850 * Attempt a soft reset or a full reset and reinitialize the ASIC.
5851 * Returns 0 for success or an error on failure.
5852 */
5853
5854int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5855 struct amdgpu_job *job,
5856 struct amdgpu_reset_context *reset_context)
5857{
5858 struct list_head device_list, *device_list_handle = NULL;
5859 bool job_signaled = false;
5860 struct amdgpu_hive_info *hive = NULL;
5861 struct amdgpu_device *tmp_adev = NULL;
5862 int i, r = 0;
5863 bool need_emergency_restart = false;
5864 bool audio_suspended = false;
5865 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5866
5867 /*
5868 * If it reaches here because of hang/timeout and a RAS error is
5869 * detected at the same time, let RAS recovery take care of it.
5870 */
5871 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
5872 !amdgpu_sriov_vf(adev) &&
5873 reset_context->src != AMDGPU_RESET_SRC_RAS) {
5874 dev_dbg(adev->dev,
5875 "Gpu recovery from source: %d yielding to RAS error recovery handling",
5876 reset_context->src);
5877 return 0;
5878 }
5879 /*
5880 * Special case: RAS triggered and full reset isn't supported
5881 */
5882 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5883
5884 /*
5885 * Flush RAM to disk so that after reboot
5886	 * the user can read the log and see why the system rebooted.
5887 */
5888 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5889 amdgpu_ras_get_context(adev)->reboot) {
5890 DRM_WARN("Emergency reboot.");
5891
5892 ksys_sync_helper();
5893 emergency_restart();
5894 }
5895
5896 dev_info(adev->dev, "GPU %s begin!\n",
5897 need_emergency_restart ? "jobs stop":"reset");
5898
5899 if (!amdgpu_sriov_vf(adev))
5900 hive = amdgpu_get_xgmi_hive(adev);
5901 if (hive)
5902 mutex_lock(&hive->hive_lock);
5903
5904 reset_context->job = job;
5905 reset_context->hive = hive;
5906 /*
5907 * Build list of devices to reset.
5908 * In case we are in XGMI hive mode, resort the device list
5909 * to put adev in the 1st position.
5910 */
5911 INIT_LIST_HEAD(&device_list);
5912 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5913 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5914 list_add_tail(&tmp_adev->reset_list, &device_list);
5915 if (adev->shutdown)
5916 tmp_adev->shutdown = true;
5917 }
5918 if (!list_is_first(&adev->reset_list, &device_list))
5919 list_rotate_to_front(&adev->reset_list, &device_list);
5920 device_list_handle = &device_list;
5921 } else {
5922 list_add_tail(&adev->reset_list, &device_list);
5923 device_list_handle = &device_list;
5924 }
5925
5926 if (!amdgpu_sriov_vf(adev)) {
5927 r = amdgpu_device_health_check(device_list_handle);
5928 if (r)
5929 goto end_reset;
5930 }
5931
5932 /* We need to lock reset domain only once both for XGMI and single device */
5933 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5934 reset_list);
5935 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5936
5937 /* block all schedulers and reset given job's ring */
5938 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5939
5940 amdgpu_device_set_mp1_state(tmp_adev);
5941
5942		/*
5943		 * Try to put the audio codec into suspend state
5944		 * before the gpu reset is started.
5945		 *
5946		 * The power domain of the graphics device is shared
5947		 * with the AZ power domain. Without suspending the
5948		 * codec first, we may change the audio hardware from
5949		 * behind the audio driver's back, which would trigger
5950		 * audio codec errors.
5951		 */
5952 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5953 audio_suspended = true;
5954
5955 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5956
5957 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5958
5959 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
5960
5961		/*
5962		 * Mark these ASICs to be reset as untracked first,
5963		 * and add them back after the reset has completed.
5964		 */
5965 amdgpu_unregister_gpu_instance(tmp_adev);
5966
5967 drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
5968
5969 /* disable ras on ALL IPs */
5970 if (!need_emergency_restart &&
5971 amdgpu_device_ip_need_full_reset(tmp_adev))
5972 amdgpu_ras_suspend(tmp_adev);
5973
5974 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5975 struct amdgpu_ring *ring = tmp_adev->rings[i];
5976
5977 if (!amdgpu_ring_sched_ready(ring))
5978 continue;
5979
5980 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5981
5982 if (need_emergency_restart)
5983 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5984 }
5985 atomic_inc(&tmp_adev->gpu_reset_counter);
5986 }
5987
5988 if (need_emergency_restart)
5989 goto skip_sched_resume;
5990
5991 /*
5992 * Must check guilty signal here since after this point all old
5993 * HW fences are force signaled.
5994 *
5995 * job->base holds a reference to parent fence
5996 */
5997 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5998 job_signaled = true;
5999 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
6000 goto skip_hw_reset;
6001 }
6002
6003retry: /* Rest of adevs pre asic reset from XGMI hive. */
6004 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6005 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
6006		/* TODO: should we stop? */
6007 if (r) {
6008 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
6009 r, adev_to_drm(tmp_adev)->unique);
6010 tmp_adev->asic_reset_res = r;
6011 }
6012 }
6013
6014	/* Actual ASIC resets if needed. */
6015 /* Host driver will handle XGMI hive reset for SRIOV */
6016 if (amdgpu_sriov_vf(adev)) {
6017 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
6018 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
6019 amdgpu_ras_set_fed(adev, true);
6020 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
6021 }
6022
6023 r = amdgpu_device_reset_sriov(adev, reset_context);
6024 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
6025 amdgpu_virt_release_full_gpu(adev, true);
6026 goto retry;
6027 }
6028 if (r)
6029 adev->asic_reset_res = r;
6030 } else {
6031 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
6032 if (r && r == -EAGAIN)
6033 goto retry;
6034 }
6035
6036 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6037 /*
6038		 * Drop any pending non-scheduler resets queued before the reset is done.
6039 * Any reset scheduled after this point would be valid. Scheduler resets
6040 * were already dropped during drm_sched_stop and no new ones can come
6041 * in before drm_sched_start.
6042 */
6043 amdgpu_device_stop_pending_resets(tmp_adev);
6044 }
6045
6046skip_hw_reset:
6047
6048	/* Post ASIC reset for all devs. */
6049 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6050
6051 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6052 struct amdgpu_ring *ring = tmp_adev->rings[i];
6053
6054 if (!amdgpu_ring_sched_ready(ring))
6055 continue;
6056
6057 drm_sched_start(&ring->sched, 0);
6058 }
6059
6060 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6061 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6062
6063 if (tmp_adev->asic_reset_res)
6064 r = tmp_adev->asic_reset_res;
6065
6066 tmp_adev->asic_reset_res = 0;
6067
6068 if (r) {
6069			/* Bad news: how do we tell userspace?
6070			 * For a RAS error, report the GPU bad status
6071			 * instead of a reset failure.
6072			 */
6073 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6074 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6075 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
6076 atomic_read(&tmp_adev->gpu_reset_counter));
6077 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
6078 } else {
6079 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
6080 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
6081 DRM_WARN("smart shift update failed\n");
6082 }
6083 }
6084
6085skip_sched_resume:
6086 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6087 /* unlock kfd: SRIOV would do it separately */
6088 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6089 amdgpu_amdkfd_post_reset(tmp_adev);
6090
6091		/* kfd_post_reset will do nothing if the kfd device is not initialized,
6092		 * so bring up kfd here if it was not initialized before.
6093		 */
6094 if (!adev->kfd.init_complete)
6095 amdgpu_amdkfd_device_init(adev);
6096
6097 if (audio_suspended)
6098 amdgpu_device_resume_display_audio(tmp_adev);
6099
6100 amdgpu_device_unset_mp1_state(tmp_adev);
6101
6102 amdgpu_ras_set_error_query_ready(tmp_adev, true);
6103 }
6104
6105 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6106 reset_list);
6107 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6108
6109end_reset:
6110 if (hive) {
6111 mutex_unlock(&hive->hive_lock);
6112 amdgpu_put_xgmi_hive(hive);
6113 }
6114
6115 if (r)
6116 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6117
6118 atomic_set(&adev->reset_domain->reset_res, r);
6119 return r;
6120}
6121
6122/**
6123 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
6124 *
6125 * @adev: amdgpu_device pointer
6126 * @speed: pointer to the speed of the link
6127 * @width: pointer to the width of the link
6128 *
6129 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6130 * first physical partner to an AMD dGPU.
6131 * This will exclude any virtual switches and links.
6132 */
6133static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6134 enum pci_bus_speed *speed,
6135 enum pcie_link_width *width)
6136{
6137 struct pci_dev *parent = adev->pdev;
6138
6139 if (!speed || !width)
6140 return;
6141
6142 *speed = PCI_SPEED_UNKNOWN;
6143 *width = PCIE_LNK_WIDTH_UNKNOWN;
6144
6145 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6146 while ((parent = pci_upstream_bridge(parent))) {
6147			/* skip upstream/downstream switches internal to dGPU */
6148 if (parent->vendor == PCI_VENDOR_ID_ATI)
6149 continue;
6150 *speed = pcie_get_speed_cap(parent);
6151 *width = pcie_get_width_cap(parent);
6152 break;
6153 }
6154 } else {
6155 /* use the current speeds rather than max if switching is not supported */
6156 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6157 }
6158}
6159
6160/**
6161 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
6162 *
6163 * @adev: amdgpu_device pointer
6164 * @speed: pointer to the speed of the link
6165 * @width: pointer to the width of the link
6166 *
6167 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6168 * AMD dGPU which may be a virtual upstream bridge.
6169 */
6170static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
6171 enum pci_bus_speed *speed,
6172 enum pcie_link_width *width)
6173{
6174 struct pci_dev *parent = adev->pdev;
6175
6176 if (!speed || !width)
6177 return;
6178
6179 parent = pci_upstream_bridge(parent);
6180 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
6181 /* use the upstream/downstream switches internal to dGPU */
6182 *speed = pcie_get_speed_cap(parent);
6183 *width = pcie_get_width_cap(parent);
6184 while ((parent = pci_upstream_bridge(parent))) {
6185 if (parent->vendor == PCI_VENDOR_ID_ATI) {
6186 /* use the upstream/downstream switches internal to dGPU */
6187 *speed = pcie_get_speed_cap(parent);
6188 *width = pcie_get_width_cap(parent);
6189 }
6190 }
6191 } else {
6192 /* use the device itself */
6193 *speed = pcie_get_speed_cap(adev->pdev);
6194 *width = pcie_get_width_cap(adev->pdev);
6195 }
6196}
6197
6198/**
6199 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
6200 *
6201 * @adev: amdgpu_device pointer
6202 *
6203 * Fetches and stores in the driver the PCIE capabilities (gen speed
6204 * and lanes) of the slot the device is in. Handles APUs and
6205 * virtualized environments where PCIE config space may not be available.
6206 */
6207static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6208{
6209 enum pci_bus_speed speed_cap, platform_speed_cap;
6210 enum pcie_link_width platform_link_width, link_width;
6211
6212 if (amdgpu_pcie_gen_cap)
6213 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6214
6215 if (amdgpu_pcie_lane_cap)
6216 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6217
6218 /* covers APUs as well */
6219 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6220 if (adev->pm.pcie_gen_mask == 0)
6221 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6222 if (adev->pm.pcie_mlw_mask == 0)
6223 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6224 return;
6225 }
6226
6227 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6228 return;
6229
6230 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6231 &platform_link_width);
6232 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);
6233
6234 if (adev->pm.pcie_gen_mask == 0) {
6235 /* asic caps */
6236 if (speed_cap == PCI_SPEED_UNKNOWN) {
6237 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6238 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6239 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6240 } else {
6241 if (speed_cap == PCIE_SPEED_32_0GT)
6242 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6243 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6244 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6245 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6246 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6247 else if (speed_cap == PCIE_SPEED_16_0GT)
6248 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6249 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6250 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6251 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6252 else if (speed_cap == PCIE_SPEED_8_0GT)
6253 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6254 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6255 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6256 else if (speed_cap == PCIE_SPEED_5_0GT)
6257 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6258 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6259 else
6260 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6261 }
6262 /* platform caps */
6263 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6264 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6265 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6266 } else {
6267 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6268 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6269 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6270 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6271 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6272 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6273 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6274 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6275 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6276 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6277 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6278 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6279 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6280 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6281 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6282 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6283 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6284 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6285 else
6286 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6287
6288 }
6289 }
6290 if (adev->pm.pcie_mlw_mask == 0) {
6291 /* asic caps */
6292 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6293 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK;
6294 } else {
6295 switch (link_width) {
6296 case PCIE_LNK_X32:
6297 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 |
6298 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6299 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6300 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6301 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6302 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6303 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6304 break;
6305 case PCIE_LNK_X16:
6306 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6307 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6308 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6309 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6310 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6311 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6312 break;
6313 case PCIE_LNK_X12:
6314 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6315 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6316 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6317 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6318 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6319 break;
6320 case PCIE_LNK_X8:
6321 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6322 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6323 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6324 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6325 break;
6326 case PCIE_LNK_X4:
6327 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6328 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6329 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6330 break;
6331 case PCIE_LNK_X2:
6332 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6333 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6334 break;
6335 case PCIE_LNK_X1:
6336 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1;
6337 break;
6338 default:
6339 break;
6340 }
6341 }
6342 /* platform caps */
6343 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6344 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6345 } else {
6346 switch (platform_link_width) {
6347 case PCIE_LNK_X32:
6348 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6349 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6350 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6351 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6352 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6353 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6354 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6355 break;
6356 case PCIE_LNK_X16:
6357 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6358 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6359 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6360 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6361 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6362 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6363 break;
6364 case PCIE_LNK_X12:
6365 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6366 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6367 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6368 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6369 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6370 break;
6371 case PCIE_LNK_X8:
6372 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6373 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6374 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6375 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6376 break;
6377 case PCIE_LNK_X4:
6378 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6379 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6380 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6381 break;
6382 case PCIE_LNK_X2:
6383 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6384 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6385 break;
6386 case PCIE_LNK_X1:
6387 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6388 break;
6389 default:
6390 break;
6391 }
6392 }
6393 }
6394}
6395
6396/**
6397 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6398 *
6399 * @adev: amdgpu_device pointer
6400 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6401 *
6402 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6403 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6404 * @peer_adev.
6405 */
6406bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6407 struct amdgpu_device *peer_adev)
6408{
6409#ifdef CONFIG_HSA_AMD_P2P
6410 bool p2p_access =
6411 !adev->gmc.xgmi.connected_to_cpu &&
6412 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6413 if (!p2p_access)
6414 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
6415 pci_name(peer_adev->pdev));
6416
6417 bool is_large_bar = adev->gmc.visible_vram_size &&
6418 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6419 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6420
6421 if (!p2p_addressable) {
6422 uint64_t address_mask = peer_adev->dev->dma_mask ?
6423 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6424 resource_size_t aper_limit =
6425 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6426
6427 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6428 aper_limit & address_mask);
6429 }
6430 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6431#else
6432 return false;
6433#endif
6434}
6435
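/**
 * amdgpu_device_baco_enter - enter the BACO (Bus Active, Chip Off) state
 * @dev: drm_device pointer
 *
 * Disable the doorbell interrupt when RAS is enabled, then ask the DPM code
 * to put the ASIC into BACO.
 *
 * Returns 0 on success or a negative error code on failure.
 */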
6436int amdgpu_device_baco_enter(struct drm_device *dev)
6437{
6438 struct amdgpu_device *adev = drm_to_adev(dev);
6439 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6440
6441 if (!amdgpu_device_supports_baco(dev))
6442 return -ENOTSUPP;
6443
6444 if (ras && adev->ras_enabled &&
6445 adev->nbio.funcs->enable_doorbell_interrupt)
6446 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6447
6448 return amdgpu_dpm_baco_enter(adev);
6449}
6450
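/**
 * amdgpu_device_baco_exit - leave the BACO (Bus Active, Chip Off) state
 * @dev: drm_device pointer
 *
 * Ask the DPM code to bring the ASIC out of BACO, re-enable the doorbell
 * interrupt and, in passthrough mode, clear any pending doorbell interrupt.
 *
 * Returns 0 on success or a negative error code on failure.
 */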
6451int amdgpu_device_baco_exit(struct drm_device *dev)
6452{
6453 struct amdgpu_device *adev = drm_to_adev(dev);
6454 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6455 int ret = 0;
6456
6457 if (!amdgpu_device_supports_baco(dev))
6458 return -ENOTSUPP;
6459
6460 ret = amdgpu_dpm_baco_exit(adev);
6461 if (ret)
6462 return ret;
6463
6464 if (ras && adev->ras_enabled &&
6465 adev->nbio.funcs->enable_doorbell_interrupt)
6466 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6467
6468 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6469 adev->nbio.funcs->clear_doorbell_interrupt)
6470 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6471
6472 return 0;
6473}
6474
6475/**
6476 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6477 * @pdev: PCI device struct
6478 * @state: PCI channel state
6479 *
6480 * Description: Called when a PCI error is detected.
6481 *
6482 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6483 */
6484pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6485{
6486 struct drm_device *dev = pci_get_drvdata(pdev);
6487 struct amdgpu_device *adev = drm_to_adev(dev);
6488 int i;
6489
6490 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
6491
6492 if (adev->gmc.xgmi.num_physical_nodes > 1) {
6493 DRM_WARN("No support for XGMI hive yet...");
6494 return PCI_ERS_RESULT_DISCONNECT;
6495 }
6496
6497 adev->pci_channel_state = state;
6498
6499 switch (state) {
6500 case pci_channel_io_normal:
6501 return PCI_ERS_RESULT_CAN_RECOVER;
6502 /* Fatal error, prepare for slot reset */
6503 case pci_channel_io_frozen:
6504 /*
6505 * Locking adev->reset_domain->sem will prevent any external access
6506 * to GPU during PCI error recovery
6507 */
6508 amdgpu_device_lock_reset_domain(adev->reset_domain);
6509 amdgpu_device_set_mp1_state(adev);
6510
6511 /*
6512 * Block any work scheduling as we do for regular GPU reset
6513 * for the duration of the recovery
6514 */
6515 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6516 struct amdgpu_ring *ring = adev->rings[i];
6517
6518 if (!amdgpu_ring_sched_ready(ring))
6519 continue;
6520
6521 drm_sched_stop(&ring->sched, NULL);
6522 }
6523 atomic_inc(&adev->gpu_reset_counter);
6524 return PCI_ERS_RESULT_NEED_RESET;
6525 case pci_channel_io_perm_failure:
6526 /* Permanent error, prepare for device removal */
6527 return PCI_ERS_RESULT_DISCONNECT;
6528 }
6529
6530 return PCI_ERS_RESULT_NEED_RESET;
6531}
6532
6533/**
6534 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6535 * @pdev: pointer to PCI device
6536 */
6537pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6538{
6539
6540 DRM_INFO("PCI error: mmio enabled callback!!\n");
6541
6542 /* TODO - dump whatever for debugging purposes */
6543
6544	/* This is called only if amdgpu_pci_error_detected returns
6545	 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
6546	 * work, so there is no need to reset the slot.
6547	 */
6548
6549 return PCI_ERS_RESULT_RECOVERED;
6550}
6551
6552/**
6553 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6554 * @pdev: PCI device struct
6555 *
6556 * Description: This routine is called by the pci error recovery
6557 * code after the PCI slot has been reset, just before we
6558 * should resume normal operations.
6559 */
6560pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6561{
6562 struct drm_device *dev = pci_get_drvdata(pdev);
6563 struct amdgpu_device *adev = drm_to_adev(dev);
6564 int r, i;
6565 struct amdgpu_reset_context reset_context;
6566 u32 memsize;
6567 struct list_head device_list;
6568
6569	/* PCI error slot reset should be skipped during RAS recovery */
6570 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6571 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6572 amdgpu_ras_in_recovery(adev))
6573 return PCI_ERS_RESULT_RECOVERED;
6574
6575 DRM_INFO("PCI error: slot reset callback!!\n");
6576
6577 memset(&reset_context, 0, sizeof(reset_context));
6578
6579 INIT_LIST_HEAD(&device_list);
6580 list_add_tail(&adev->reset_list, &device_list);
6581
6582 /* wait for asic to come out of reset */
6583 msleep(500);
6584
6585	/* Restore PCI config space */
6586 amdgpu_device_load_pci_state(pdev);
6587
6588 /* confirm ASIC came out of reset */
6589 for (i = 0; i < adev->usec_timeout; i++) {
6590 memsize = amdgpu_asic_get_config_memsize(adev);
6591
6592 if (memsize != 0xffffffff)
6593 break;
6594 udelay(1);
6595 }
6596 if (memsize == 0xffffffff) {
6597 r = -ETIME;
6598 goto out;
6599 }
6600
6601 reset_context.method = AMD_RESET_METHOD_NONE;
6602 reset_context.reset_req_dev = adev;
6603 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6604 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6605
6606 adev->no_hw_access = true;
6607 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6608 adev->no_hw_access = false;
6609 if (r)
6610 goto out;
6611
6612 r = amdgpu_do_asic_reset(&device_list, &reset_context);
6613
6614out:
6615 if (!r) {
6616 if (amdgpu_device_cache_pci_state(adev->pdev))
6617 pci_restore_state(adev->pdev);
6618
6619 DRM_INFO("PCIe error recovery succeeded\n");
6620 } else {
6621 DRM_ERROR("PCIe error recovery failed, err:%d", r);
6622 amdgpu_device_unset_mp1_state(adev);
6623 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6624 }
6625
6626 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6627}
6628
6629/**
6630 * amdgpu_pci_resume() - resume normal ops after PCI reset
6631 * @pdev: pointer to PCI device
6632 *
6633 * Called when the error recovery driver tells us that it's
6634 * OK to resume normal operation.
6635 */
6636void amdgpu_pci_resume(struct pci_dev *pdev)
6637{
6638 struct drm_device *dev = pci_get_drvdata(pdev);
6639 struct amdgpu_device *adev = drm_to_adev(dev);
6640 int i;
6641
6642
6643 DRM_INFO("PCI error: resume callback!!\n");
6644
6645 /* Only continue execution for the case of pci_channel_io_frozen */
6646 if (adev->pci_channel_state != pci_channel_io_frozen)
6647 return;
6648
6649 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6650 struct amdgpu_ring *ring = adev->rings[i];
6651
6652 if (!amdgpu_ring_sched_ready(ring))
6653 continue;
6654
6655 drm_sched_start(&ring->sched, 0);
6656 }
6657
6658 amdgpu_device_unset_mp1_state(adev);
6659 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6660}
6661
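/*
 * Save the device's PCI config space and cache it in adev->pci_state so it
 * can be restored after a reset; not used for SR-IOV VFs. Returns true on
 * success, false otherwise.
 */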
6662bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6663{
6664 struct drm_device *dev = pci_get_drvdata(pdev);
6665 struct amdgpu_device *adev = drm_to_adev(dev);
6666 int r;
6667
6668 if (amdgpu_sriov_vf(adev))
6669 return false;
6670
6671 r = pci_save_state(pdev);
6672 if (!r) {
6673 kfree(adev->pci_state);
6674
6675 adev->pci_state = pci_store_saved_state(pdev);
6676
6677 if (!adev->pci_state) {
6678 DRM_ERROR("Failed to store PCI saved state");
6679 return false;
6680 }
6681 } else {
6682 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6683 return false;
6684 }
6685
6686 return true;
6687}
6688
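/*
 * Restore the PCI config space previously cached by
 * amdgpu_device_cache_pci_state(), if any. Returns true on success.
 */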
6689bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6690{
6691 struct drm_device *dev = pci_get_drvdata(pdev);
6692 struct amdgpu_device *adev = drm_to_adev(dev);
6693 int r;
6694
6695 if (!adev->pci_state)
6696 return false;
6697
6698 r = pci_load_saved_state(pdev, adev->pci_state);
6699
6700 if (!r) {
6701 pci_restore_state(pdev);
6702 } else {
6703 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6704 return false;
6705 }
6706
6707 return true;
6708}
6709
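/*
 * Flush the HDP (host data path) cache. The flush is skipped for APUs
 * (unless running in passthrough) and for GPUs coherently connected to the
 * CPU (gmc.xgmi.connected_to_cpu), where it is not needed. When a ring is
 * given and supports it, the flush is emitted as a ring packet; otherwise
 * the ASIC-level flush is used.
 */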
6710void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6711 struct amdgpu_ring *ring)
6712{
6713#ifdef CONFIG_X86_64
6714 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6715 return;
6716#endif
6717 if (adev->gmc.xgmi.connected_to_cpu)
6718 return;
6719
6720 if (ring && ring->funcs->emit_hdp_flush)
6721 amdgpu_ring_emit_hdp_flush(ring);
6722 else
6723 amdgpu_asic_flush_hdp(adev, ring);
6724}
6725
6726void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6727 struct amdgpu_ring *ring)
6728{
6729#ifdef CONFIG_X86_64
6730 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6731 return;
6732#endif
6733 if (adev->gmc.xgmi.connected_to_cpu)
6734 return;
6735
6736 amdgpu_asic_invalidate_hdp(adev, ring);
6737}
6738
6739int amdgpu_in_reset(struct amdgpu_device *adev)
6740{
6741 return atomic_read(&adev->reset_domain->in_gpu_reset);
6742}
6743
6744/**
6745 * amdgpu_device_halt() - bring hardware to some kind of halt state
6746 *
6747 * @adev: amdgpu_device pointer
6748 *
6749 * Bring hardware to some kind of halt state so that no one can touch it
6750 * any more. This helps to preserve the error context when an error occurs.
6751 * Compared to a simple hang, the system will stay stable at least for SSH
6752 * access. It should then be trivial to inspect the hardware state and
6753 * see what is going on. Implemented as follows:
6754 *
6755 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
6756 * clears all CPU mappings to device, disallows remappings through page faults
6757 * 2. amdgpu_irq_disable_all() disables all interrupts
6758 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6759 * 4. set adev->no_hw_access to avoid potential crashes after step 5
6760 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6761 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6762 * flush any in flight DMA operations
6763 */
6764void amdgpu_device_halt(struct amdgpu_device *adev)
6765{
6766 struct pci_dev *pdev = adev->pdev;
6767 struct drm_device *ddev = adev_to_drm(adev);
6768
6769 amdgpu_xcp_dev_unplug(adev);
6770 drm_dev_unplug(ddev);
6771
6772 amdgpu_irq_disable_all(adev);
6773
6774 amdgpu_fence_driver_hw_fini(adev);
6775
6776 adev->no_hw_access = true;
6777
6778 amdgpu_device_unmap_mmio(adev);
6779
6780 pci_disable_device(pdev);
6781 pci_wait_for_pending_transaction(pdev);
6782}
6783
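/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 * @adev: amdgpu_device pointer
 * @reg: register dword index (byte offset / 4)
 *
 * Read an indirect PCIe port register through the NBIO index/data pair while
 * holding the pcie_idx_lock spinlock.
 *
 * Returns the 32-bit register value.
 */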
6784u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6785 u32 reg)
6786{
6787 unsigned long flags, address, data;
6788 u32 r;
6789
6790 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6791 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6792
6793 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6794 WREG32(address, reg * 4);
6795 (void)RREG32(address);
6796 r = RREG32(data);
6797 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6798 return r;
6799}
6800
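/**
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 * @adev: amdgpu_device pointer
 * @reg: register dword index (byte offset / 4)
 * @v: 32-bit value to write
 *
 * Write an indirect PCIe port register through the NBIO index/data pair while
 * holding the pcie_idx_lock spinlock; the trailing read of the data register
 * posts the write.
 */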
6801void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6802 u32 reg, u32 v)
6803{
6804 unsigned long flags, address, data;
6805
6806 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6807 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6808
6809 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6810 WREG32(address, reg * 4);
6811 (void)RREG32(address);
6812 WREG32(data, v);
6813 (void)RREG32(data);
6814 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6815}
6816
6817/**
6818 * amdgpu_device_get_gang - return a reference to the current gang
6819 * @adev: amdgpu_device pointer
6820 *
6821 * Returns: A new reference to the current gang leader.
6822 */
6823struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
6824{
6825 struct dma_fence *fence;
6826
6827 rcu_read_lock();
6828 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
6829 rcu_read_unlock();
6830 return fence;
6831}
6832
6833/**
6834 * amdgpu_device_switch_gang - switch to a new gang
6835 * @adev: amdgpu_device pointer
6836 * @gang: the gang to switch to
6837 *
6838 * Try to switch to a new gang.
6839 * Returns: NULL if we switched to the new gang or a reference to the current
6840 * gang leader.
6841 */
6842struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6843 struct dma_fence *gang)
6844{
6845 struct dma_fence *old = NULL;
6846
6847 do {
6848 dma_fence_put(old);
6849 old = amdgpu_device_get_gang(adev);
6850 if (old == gang)
6851 break;
6852
6853 if (!dma_fence_is_signaled(old))
6854 return old;
6855
6856 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6857 old, gang) != old);
6858
6859 dma_fence_put(old);
6860 return NULL;
6861}
6862
6863bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6864{
6865 switch (adev->asic_type) {
6866#ifdef CONFIG_DRM_AMDGPU_SI
6867 case CHIP_HAINAN:
6868#endif
6869 case CHIP_TOPAZ:
6870 /* chips with no display hardware */
6871 return false;
6872#ifdef CONFIG_DRM_AMDGPU_SI
6873 case CHIP_TAHITI:
6874 case CHIP_PITCAIRN:
6875 case CHIP_VERDE:
6876 case CHIP_OLAND:
6877#endif
6878#ifdef CONFIG_DRM_AMDGPU_CIK
6879 case CHIP_BONAIRE:
6880 case CHIP_HAWAII:
6881 case CHIP_KAVERI:
6882 case CHIP_KABINI:
6883 case CHIP_MULLINS:
6884#endif
6885 case CHIP_TONGA:
6886 case CHIP_FIJI:
6887 case CHIP_POLARIS10:
6888 case CHIP_POLARIS11:
6889 case CHIP_POLARIS12:
6890 case CHIP_VEGAM:
6891 case CHIP_CARRIZO:
6892 case CHIP_STONEY:
6893 /* chips with display hardware */
6894 return true;
6895 default:
6896 /* IP discovery */
6897 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
6898 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6899 return false;
6900 return true;
6901 }
6902}
6903
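/*
 * Poll @reg_addr until (value & @mask) == @expected_value, restarting the
 * usec_timeout budget whenever the value changes. Returns 0 on success or
 * -ETIMEDOUT (as a uint32_t) if the register never reaches the expected
 * value. Illustrative call (register name and values are hypothetical):
 *
 *	amdgpu_device_wait_on_rreg(adev, 0, reg_offset, "SOME_STATUS_REG",
 *				   0x1, 0x1);
 */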
6904uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6905 uint32_t inst, uint32_t reg_addr, char reg_name[],
6906 uint32_t expected_value, uint32_t mask)
6907{
6908 uint32_t ret = 0;
6909 uint32_t old_ = 0;
6910 uint32_t tmp_ = RREG32(reg_addr);
6911 uint32_t loop = adev->usec_timeout;
6912
6913 while ((tmp_ & (mask)) != (expected_value)) {
6914 if (old_ != tmp_) {
6915 loop = adev->usec_timeout;
6916 old_ = tmp_;
6917 } else
6918 udelay(1);
6919 tmp_ = RREG32(reg_addr);
6920 loop--;
6921 if (!loop) {
6922			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6923 inst, reg_name, (uint32_t)expected_value,
6924 (uint32_t)(tmp_ & (mask)));
6925 ret = -ETIMEDOUT;
6926 break;
6927 }
6928 }
6929 return ret;
6930}
6931
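/*
 * Build the mask of reset types supported for @ring: full GPU reset when
 * recovery is enabled for the device, plus soft recovery when the ring
 * implements it, it is not disabled via debugfs and we are not running as
 * an SR-IOV VF.
 */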
6932ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
6933{
6934 ssize_t size = 0;
6935
6936 if (!ring || !ring->adev)
6937 return size;
6938
6939 if (amdgpu_device_should_recover_gpu(ring->adev))
6940 size |= AMDGPU_RESET_TYPE_FULL;
6941
6942 if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
6943 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
6944 size |= AMDGPU_RESET_TYPE_SOFT_RESET;
6945
6946 return size;
6947}
6948
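/*
 * Emit a human-readable, space-separated list of the reset types in
 * @supported_reset into a sysfs buffer, e.g. "soft queue pipe full".
 *
 * Minimal usage sketch for a sysfs show callback (the attribute wiring and
 * the way the mask is obtained are hypothetical, shown for illustration
 * only):
 *
 *	static ssize_t reset_mask_show(struct device *dev,
 *				       struct device_attribute *attr, char *buf)
 *	{
 *		struct amdgpu_ring *ring = dev_get_drvdata(dev);
 *
 *		return amdgpu_show_reset_mask(buf,
 *				amdgpu_get_soft_full_reset_mask(ring));
 *	}
 */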
6949ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
6950{
6951 ssize_t size = 0;
6952
6953 if (supported_reset == 0) {
6954 size += sysfs_emit_at(buf, size, "unsupported");
6955 size += sysfs_emit_at(buf, size, "\n");
6956		return size;
6958	}
6959
6960 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
6961 size += sysfs_emit_at(buf, size, "soft ");
6962
6963 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
6964 size += sysfs_emit_at(buf, size, "queue ");
6965
6966 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
6967 size += sysfs_emit_at(buf, size, "pipe ");
6968
6969 if (supported_reset & AMDGPU_RESET_TYPE_FULL)
6970 size += sysfs_emit_at(buf, size, "full ");
6971
6972 size += sysfs_emit_at(buf, size, "\n");
6973 return size;
6974}