drivers/gpu/drm/amd/amdkfd/kfd_process.c at v6.19

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / gpu / drm / amd / amdkfd / kfd_process.c
at v6.19 2343 lines 61 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0 OR MIT
   2/*
   3 * Copyright 2014-2022 Advanced Micro Devices, Inc.
   4 *
   5 * Permission is hereby granted, free of charge, to any person obtaining a
   6 * copy of this software and associated documentation files (the "Software"),
   7 * to deal in the Software without restriction, including without limitation
   8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9 * and/or sell copies of the Software, and to permit persons to whom the
  10 * Software is furnished to do so, subject to the following conditions:
  11 *
  12 * The above copyright notice and this permission notice shall be included in
  13 * all copies or substantial portions of the Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  21 * OTHER DEALINGS IN THE SOFTWARE.
  22 */
  23
  24#include <linux/mutex.h>
  25#include <linux/log2.h>
  26#include <linux/sched.h>
  27#include <linux/sched/mm.h>
  28#include <linux/sched/task.h>
  29#include <linux/mmu_context.h>
  30#include <linux/slab.h>
  31#include <linux/notifier.h>
  32#include <linux/compat.h>
  33#include <linux/mman.h>
  34#include <linux/file.h>
  35#include <linux/pm_runtime.h>
  36#include "amdgpu_amdkfd.h"
  37#include "amdgpu.h"
  38#include "amdgpu_reset.h"
  39
  40struct mm_struct;
  41
  42#include "kfd_priv.h"
  43#include "kfd_device_queue_manager.h"
  44#include "kfd_svm.h"
  45#include "kfd_smi_events.h"
  46#include "kfd_debug.h"
  47
/*
 * Hash table of struct kfd_process, linked through the kfd_processes field.
 * Unique/indexed by mm_struct*
 */
DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE);
/* Taken by kfd_create_process() around process lookup and creation */
DEFINE_MUTEX(kfd_processes_mutex);

/* SRCU domain entered by find_process() around lookups in the table */
DEFINE_SRCU(kfd_processes_srcu);
  56
/*
 * For process termination handling.
 * Allocated by kfd_process_create_wq(), destroyed by
 * kfd_process_destroy_wq(); flushed by kfd_create_process().
 */
static struct workqueue_struct *kfd_process_wq;

/* Ordered, single-threaded workqueue for restoring evicted
 * processes. Restoring multiple processes concurrently under memory
 * pressure can lead to processes blocking each other from validating
 * their BOs and result in a live-lock situation where processes
 * remain evicted indefinitely.
 */
static struct workqueue_struct *kfd_restore_wq;
  67
/* Forward declarations of static helpers referenced before their definitions */
static struct kfd_process *find_process(const struct task_struct *thread,
					bool ref);
static void kfd_process_ref_release(struct kref *ref);
static struct kfd_process *create_process(const struct task_struct *thread);

static void evict_process_worker(struct work_struct *work);
static void restore_process_worker(struct work_struct *work);

static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd);
  77
/*
 * Root of the per-process sysfs tree.
 * @kobj: kobject registered as "proc" under kfd_device->kobj by
 *        kfd_procfs_init(); NULL when the tree does not exist.
 */
struct kfd_procfs_tree {
	struct kobject *kobj;
};

/* Single global instance, set up by kfd_procfs_init() */
static struct kfd_procfs_tree procfs;
  83
/*
 * Structure for SDMA activity tracking
 *
 * @sdma_activity_work: work item whose handler is kfd_sdma_activity_worker()
 * @pdd: input, the process device whose SDMA activity is queried
 * @sdma_activity_counter: output, written by the worker
 */
struct kfd_sdma_activity_handler_workarea {
	struct work_struct sdma_activity_work;
	struct kfd_process_device *pdd;
	uint64_t sdma_activity_counter;
};
  92
/*
 * Node of the temporary SDMA queue list built by kfd_sdma_activity_worker().
 *
 * @rptr: user-space pointer copied from the queue's properties.read_ptr
 * @sdma_val: counter value read through @rptr (stays 0 if the read failed)
 * @queue_id: copied from the queue's properties.queue_id
 * @list: linkage in the temporary list
 */
struct temp_sdma_queue_list {
	uint64_t __user *rptr;
	uint64_t sdma_val;
	unsigned int queue_id;
	struct list_head list;
};
  99
 100static void kfd_sdma_activity_worker(struct work_struct *work)
 101{
 102	struct kfd_sdma_activity_handler_workarea *workarea;
 103	struct kfd_process_device *pdd;
 104	uint64_t val;
 105	struct mm_struct *mm;
 106	struct queue *q;
 107	struct qcm_process_device *qpd;
 108	struct device_queue_manager *dqm;
 109	int ret = 0;
 110	struct temp_sdma_queue_list sdma_q_list;
 111	struct temp_sdma_queue_list *sdma_q, *next;
 112
 113	workarea = container_of(work, struct kfd_sdma_activity_handler_workarea,
 114				sdma_activity_work);
 115
 116	pdd = workarea->pdd;
 117	if (!pdd)
 118		return;
 119	dqm = pdd->dev->dqm;
 120	qpd = &pdd->qpd;
 121	if (!dqm || !qpd)
 122		return;
 123	/*
 124	 * Total SDMA activity is current SDMA activity + past SDMA activity
 125	 * Past SDMA count is stored in pdd.
 126	 * To get the current activity counters for all active SDMA queues,
 127	 * we loop over all SDMA queues and get their counts from user-space.
 128	 *
 129	 * We cannot call get_user() with dqm_lock held as it can cause
 130	 * a circular lock dependency situation. To read the SDMA stats,
 131	 * we need to do the following:
 132	 *
 133	 * 1. Create a temporary list of SDMA queue nodes from the qpd->queues_list,
 134	 *    with dqm_lock/dqm_unlock().
 135	 * 2. Call get_user() for each node in temporary list without dqm_lock.
 136	 *    Save the SDMA count for each node and also add the count to the total
 137	 *    SDMA count counter.
 138	 *    Its possible, during this step, a few SDMA queue nodes got deleted
 139	 *    from the qpd->queues_list.
 140	 * 3. Do a second pass over qpd->queues_list to check if any nodes got deleted.
 141	 *    If any node got deleted, its SDMA count would be captured in the sdma
 142	 *    past activity counter. So subtract the SDMA counter stored in step 2
 143	 *    for this node from the total SDMA count.
 144	 */
 145	INIT_LIST_HEAD(&sdma_q_list.list);
 146
 147	/*
 148	 * Create the temp list of all SDMA queues
 149	 */
 150	dqm_lock(dqm);
 151
 152	list_for_each_entry(q, &qpd->queues_list, list) {
 153		if ((q->properties.type != KFD_QUEUE_TYPE_SDMA) &&
 154		    (q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI))
 155			continue;
 156
 157		sdma_q = kzalloc(sizeof(struct temp_sdma_queue_list), GFP_KERNEL);
 158		if (!sdma_q) {
 159			dqm_unlock(dqm);
 160			goto cleanup;
 161		}
 162
 163		INIT_LIST_HEAD(&sdma_q->list);
 164		sdma_q->rptr = (uint64_t __user *)q->properties.read_ptr;
 165		sdma_q->queue_id = q->properties.queue_id;
 166		list_add_tail(&sdma_q->list, &sdma_q_list.list);
 167	}
 168
 169	/*
 170	 * If the temp list is empty, then no SDMA queues nodes were found in
 171	 * qpd->queues_list. Return the past activity count as the total sdma
 172	 * count
 173	 */
 174	if (list_empty(&sdma_q_list.list)) {
 175		workarea->sdma_activity_counter = pdd->sdma_past_activity_counter;
 176		dqm_unlock(dqm);
 177		return;
 178	}
 179
 180	dqm_unlock(dqm);
 181
 182	/*
 183	 * Get the usage count for each SDMA queue in temp_list.
 184	 */
 185	mm = get_task_mm(pdd->process->lead_thread);
 186	if (!mm)
 187		goto cleanup;
 188
 189	kthread_use_mm(mm);
 190
 191	list_for_each_entry(sdma_q, &sdma_q_list.list, list) {
 192		val = 0;
 193		ret = read_sdma_queue_counter(sdma_q->rptr, &val);
 194		if (ret) {
 195			pr_debug("Failed to read SDMA queue active counter for queue id: %d",
 196				 sdma_q->queue_id);
 197		} else {
 198			sdma_q->sdma_val = val;
 199			workarea->sdma_activity_counter += val;
 200		}
 201	}
 202
 203	kthread_unuse_mm(mm);
 204	mmput(mm);
 205
 206	/*
 207	 * Do a second iteration over qpd_queues_list to check if any SDMA
 208	 * nodes got deleted while fetching SDMA counter.
 209	 */
 210	dqm_lock(dqm);
 211
 212	workarea->sdma_activity_counter += pdd->sdma_past_activity_counter;
 213
 214	list_for_each_entry(q, &qpd->queues_list, list) {
 215		if (list_empty(&sdma_q_list.list))
 216			break;
 217
 218		if ((q->properties.type != KFD_QUEUE_TYPE_SDMA) &&
 219		    (q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI))
 220			continue;
 221
 222		list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) {
 223			if (((uint64_t __user *)q->properties.read_ptr == sdma_q->rptr) &&
 224			     (sdma_q->queue_id == q->properties.queue_id)) {
 225				list_del(&sdma_q->list);
 226				kfree(sdma_q);
 227				break;
 228			}
 229		}
 230	}
 231
 232	dqm_unlock(dqm);
 233
 234	/*
 235	 * If temp list is not empty, it implies some queues got deleted
 236	 * from qpd->queues_list during SDMA usage read. Subtract the SDMA
 237	 * count for each node from the total SDMA count.
 238	 */
 239	list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) {
 240		workarea->sdma_activity_counter -= sdma_q->sdma_val;
 241		list_del(&sdma_q->list);
 242		kfree(sdma_q);
 243	}
 244
 245	return;
 246
 247cleanup:
 248	list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) {
 249		list_del(&sdma_q->list);
 250		kfree(sdma_q);
 251	}
 252}
 253
 254/**
 255 * kfd_get_cu_occupancy - Collect number of waves in-flight on this device
 256 * by current process. Translates acquired wave count into number of compute units
 257 * that are occupied.
 258 *
 259 * @attr: Handle of attribute that allows reporting of wave count. The attribute
 260 * handle encapsulates GPU device it is associated with, thereby allowing collection
 261 * of waves in flight, etc
 262 * @buffer: Handle of user provided buffer updated with wave count
 263 *
 264 * Return: Number of bytes written to user buffer or an error value
 265 */
 266static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
 267{
 268	int cu_cnt;
 269	int wave_cnt;
 270	int max_waves_per_cu;
 271	struct kfd_node *dev = NULL;
 272	struct kfd_process *proc = NULL;
 273	struct kfd_process_device *pdd = NULL;
 274	int i;
 275	struct kfd_cu_occupancy *cu_occupancy;
 276	u32 queue_format;
 277
 278	pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy);
 279	dev = pdd->dev;
 280	if (dev->kfd2kgd->get_cu_occupancy == NULL)
 281		return -EINVAL;
 282
 283	cu_cnt = 0;
 284	proc = pdd->process;
 285	if (pdd->qpd.queue_count == 0) {
 286		pr_debug("Gpu-Id: %d has no active queues for process pid %d\n",
 287			 dev->id, (int)proc->lead_thread->pid);
 288		return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);
 289	}
 290
 291	/* Collect wave count from device if it supports */
 292	wave_cnt = 0;
 293	max_waves_per_cu = 0;
 294
 295	cu_occupancy = kcalloc(AMDGPU_MAX_QUEUES, sizeof(*cu_occupancy), GFP_KERNEL);
 296	if (!cu_occupancy)
 297		return -ENOMEM;
 298
 299	/*
 300	 * For GFX 9.4.3, fetch the CU occupancy from the first XCC in the partition.
 301	 * For AQL queues, because of cooperative dispatch we multiply the wave count
 302	 * by number of XCCs in the partition to get the total wave counts across all
 303	 * XCCs in the partition.
 304	 * For PM4 queues, there is no cooperative dispatch so wave_cnt stay as it is.
 305	 */
 306	dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy,
 307			&max_waves_per_cu, ffs(dev->xcc_mask) - 1);
 308
 309	for (i = 0; i < AMDGPU_MAX_QUEUES; i++) {
 310		if (cu_occupancy[i].wave_cnt != 0 &&
 311		    kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd,
 312						cu_occupancy[i].doorbell_off,
 313						&queue_format)) {
 314			if (unlikely(queue_format == KFD_QUEUE_FORMAT_PM4))
 315				wave_cnt += cu_occupancy[i].wave_cnt;
 316			else
 317				wave_cnt += (NUM_XCC(dev->xcc_mask) *
 318						cu_occupancy[i].wave_cnt);
 319		}
 320	}
 321
 322	/* Translate wave count to number of compute units */
 323	cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;
 324	kfree(cu_occupancy);
 325	return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);
 326}
 327
 328static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr,
 329			       char *buffer)
 330{
 331	if (strcmp(attr->name, "pasid") == 0)
 332		return snprintf(buffer, PAGE_SIZE, "%d\n", 0);
 333	else if (strncmp(attr->name, "vram_", 5) == 0) {
 334		struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device,
 335							      attr_vram);
 336		return snprintf(buffer, PAGE_SIZE, "%llu\n", atomic64_read(&pdd->vram_usage));
 337	} else if (strncmp(attr->name, "sdma_", 5) == 0) {
 338		struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device,
 339							      attr_sdma);
 340		struct kfd_sdma_activity_handler_workarea sdma_activity_work_handler;
 341
 342		INIT_WORK_ONSTACK(&sdma_activity_work_handler.sdma_activity_work,
 343				  kfd_sdma_activity_worker);
 344
 345		sdma_activity_work_handler.pdd = pdd;
 346		sdma_activity_work_handler.sdma_activity_counter = 0;
 347
 348		schedule_work(&sdma_activity_work_handler.sdma_activity_work);
 349
 350		flush_work(&sdma_activity_work_handler.sdma_activity_work);
 351		destroy_work_on_stack(&sdma_activity_work_handler.sdma_activity_work);
 352
 353		return snprintf(buffer, PAGE_SIZE, "%llu\n",
 354				(sdma_activity_work_handler.sdma_activity_counter)/
 355				 SDMA_ACTIVITY_DIVISOR);
 356	} else {
 357		pr_err("Invalid attribute");
 358		return -EINVAL;
 359	}
 360
 361	return 0;
 362}
 363
/* kobject release callback: frees the kobject itself with kfree() */
static void kfd_procfs_kobj_release(struct kobject *kobj)
{
	kfree(kobj);
}
 368
/* Read-only sysfs ops: reads are served by kfd_procfs_show() */
static const struct sysfs_ops kfd_procfs_ops = {
	.show = kfd_procfs_show,
};

/*
 * kobject type used for the "proc" root (kfd_procfs_init) and for the
 * per-process directories (kfd_create_process).
 */
static const struct kobj_type procfs_type = {
	.release = kfd_procfs_kobj_release,
	.sysfs_ops = &kfd_procfs_ops,
};
 377
 378void kfd_procfs_init(void)
 379{
 380	int ret = 0;
 381
 382	procfs.kobj = kfd_alloc_struct(procfs.kobj);
 383	if (!procfs.kobj)
 384		return;
 385
 386	ret = kobject_init_and_add(procfs.kobj, &procfs_type,
 387				   &kfd_device->kobj, "proc");
 388	if (ret) {
 389		pr_warn("Could not create procfs proc folder");
 390		/* If we fail to create the procfs, clean up */
 391		kfd_procfs_shutdown();
 392	}
 393}
 394
 395void kfd_procfs_shutdown(void)
 396{
 397	if (procfs.kobj) {
 398		kobject_del(procfs.kobj);
 399		kobject_put(procfs.kobj);
 400		procfs.kobj = NULL;
 401	}
 402}
 403
 404static ssize_t kfd_procfs_queue_show(struct kobject *kobj,
 405				     struct attribute *attr, char *buffer)
 406{
 407	struct queue *q = container_of(kobj, struct queue, kobj);
 408
 409	if (!strcmp(attr->name, "size"))
 410		return snprintf(buffer, PAGE_SIZE, "%llu",
 411				q->properties.queue_size);
 412	else if (!strcmp(attr->name, "type"))
 413		return snprintf(buffer, PAGE_SIZE, "%d", q->properties.type);
 414	else if (!strcmp(attr->name, "gpuid"))
 415		return snprintf(buffer, PAGE_SIZE, "%u", q->device->id);
 416	else
 417		pr_err("Invalid attribute");
 418
 419	return 0;
 420}
 421
 422static ssize_t kfd_procfs_stats_show(struct kobject *kobj,
 423				     struct attribute *attr, char *buffer)
 424{
 425	if (strcmp(attr->name, "evicted_ms") == 0) {
 426		struct kfd_process_device *pdd = container_of(attr,
 427				struct kfd_process_device,
 428				attr_evict);
 429		uint64_t evict_jiffies;
 430
 431		evict_jiffies = atomic64_read(&pdd->evict_duration_counter);
 432
 433		return snprintf(buffer,
 434				PAGE_SIZE,
 435				"%llu\n",
 436				jiffies64_to_msecs(evict_jiffies));
 437
 438	/* Sysfs handle that gets CU occupancy is per device */
 439	} else if (strcmp(attr->name, "cu_occupancy") == 0) {
 440		return kfd_get_cu_occupancy(attr, buffer);
 441	} else {
 442		pr_err("Invalid attribute");
 443	}
 444
 445	return 0;
 446}
 447
 448static ssize_t kfd_sysfs_counters_show(struct kobject *kobj,
 449				       struct attribute *attr, char *buf)
 450{
 451	struct kfd_process_device *pdd;
 452
 453	if (!strcmp(attr->name, "faults")) {
 454		pdd = container_of(attr, struct kfd_process_device,
 455				   attr_faults);
 456		return sysfs_emit(buf, "%llu\n", READ_ONCE(pdd->faults));
 457	}
 458	if (!strcmp(attr->name, "page_in")) {
 459		pdd = container_of(attr, struct kfd_process_device,
 460				   attr_page_in);
 461		return sysfs_emit(buf, "%llu\n", READ_ONCE(pdd->page_in));
 462	}
 463	if (!strcmp(attr->name, "page_out")) {
 464		pdd = container_of(attr, struct kfd_process_device,
 465				   attr_page_out);
 466		return sysfs_emit(buf, "%llu\n", READ_ONCE(pdd->page_out));
 467	}
 468	return 0;
 469}
 470
/*
 * Attributes of a per-queue directory. Their values are produced by
 * kfd_procfs_queue_show(), which dispatches on the attribute name.
 */
static struct attribute attr_queue_size = {
	.name = "size",
	.mode = KFD_SYSFS_FILE_MODE
};

static struct attribute attr_queue_type = {
	.name = "type",
	.mode = KFD_SYSFS_FILE_MODE
};

static struct attribute attr_queue_gpuid = {
	.name = "gpuid",
	.mode = KFD_SYSFS_FILE_MODE
};

/* NULL-terminated attribute list behind procfs_queue_groups */
static struct attribute *procfs_queue_attrs[] = {
	&attr_queue_size,
	&attr_queue_type,
	&attr_queue_gpuid,
	NULL
};
/* Provides procfs_queue_groups, referenced by procfs_queue_type */
ATTRIBUTE_GROUPS(procfs_queue);
 493
/* Read-only sysfs ops for per-queue directories */
static const struct sysfs_ops procfs_queue_ops = {
	.show = kfd_procfs_queue_show,
};

/*
 * kobject type of a queue directory (see kfd_procfs_add_queue()).
 * It has no .release callback; the kobject is embedded in struct queue
 * (kfd_procfs_queue_show() recovers the queue with container_of()).
 */
static const struct kobj_type procfs_queue_type = {
	.sysfs_ops = &procfs_queue_ops,
	.default_groups = procfs_queue_groups,
};

/* Read-only sysfs ops for stats_<gpuid> directories */
static const struct sysfs_ops procfs_stats_ops = {
	.show = kfd_procfs_stats_show,
};

/* kobject type of a stats_<gpuid> directory; the kobject is kfree'd on release */
static const struct kobj_type procfs_stats_type = {
	.sysfs_ops = &procfs_stats_ops,
	.release = kfd_procfs_kobj_release,
};

/* Read-only sysfs ops for counters_<gpuid> directories */
static const struct sysfs_ops sysfs_counters_ops = {
	.show = kfd_sysfs_counters_show,
};

/* kobject type of a counters_<gpuid> directory; the kobject is kfree'd on release */
static const struct kobj_type sysfs_counters_type = {
	.sysfs_ops = &sysfs_counters_ops,
	.release = kfd_procfs_kobj_release,
};
 520
 521int kfd_procfs_add_queue(struct queue *q)
 522{
 523	struct kfd_process *proc;
 524	int ret;
 525
 526	if (!q || !q->process)
 527		return -EINVAL;
 528	proc = q->process;
 529
 530	/* Create proc/<pid>/queues/<queue id> folder */
 531	if (!proc->kobj_queues)
 532		return -EFAULT;
 533	ret = kobject_init_and_add(&q->kobj, &procfs_queue_type,
 534			proc->kobj_queues, "%u", q->properties.queue_id);
 535	if (ret < 0) {
 536		pr_warn("Creating proc/<pid>/queues/%u failed",
 537			q->properties.queue_id);
 538		kobject_put(&q->kobj);
 539		return ret;
 540	}
 541
 542	return 0;
 543}
 544
 545static void kfd_sysfs_create_file(struct kobject *kobj, struct attribute *attr,
 546				 char *name)
 547{
 548	int ret;
 549
 550	if (!kobj || !attr || !name)
 551		return;
 552
 553	attr->name = name;
 554	attr->mode = KFD_SYSFS_FILE_MODE;
 555	sysfs_attr_init(attr);
 556
 557	ret = sysfs_create_file(kobj, attr);
 558	if (ret)
 559		pr_warn("Create sysfs %s/%s failed %d", kobj->name, name, ret);
 560}
 561
 562static void kfd_procfs_add_sysfs_stats(struct kfd_process *p)
 563{
 564	int ret;
 565	int i;
 566	char stats_dir_filename[MAX_SYSFS_FILENAME_LEN];
 567
 568	if (!p || !p->kobj)
 569		return;
 570
 571	/*
 572	 * Create sysfs files for each GPU:
 573	 * - proc/<pid>/stats_<gpuid>/
 574	 * - proc/<pid>/stats_<gpuid>/evicted_ms
 575	 * - proc/<pid>/stats_<gpuid>/cu_occupancy
 576	 */
 577	for (i = 0; i < p->n_pdds; i++) {
 578		struct kfd_process_device *pdd = p->pdds[i];
 579
 580		snprintf(stats_dir_filename, MAX_SYSFS_FILENAME_LEN,
 581				"stats_%u", pdd->dev->id);
 582		pdd->kobj_stats = kfd_alloc_struct(pdd->kobj_stats);
 583		if (!pdd->kobj_stats)
 584			return;
 585
 586		ret = kobject_init_and_add(pdd->kobj_stats,
 587					   &procfs_stats_type,
 588					   p->kobj,
 589					   stats_dir_filename);
 590
 591		if (ret) {
 592			pr_warn("Creating KFD proc/stats_%s folder failed",
 593				stats_dir_filename);
 594			kobject_put(pdd->kobj_stats);
 595			pdd->kobj_stats = NULL;
 596			return;
 597		}
 598
 599		kfd_sysfs_create_file(pdd->kobj_stats, &pdd->attr_evict,
 600				      "evicted_ms");
 601		/* Add sysfs file to report compute unit occupancy */
 602		if (pdd->dev->kfd2kgd->get_cu_occupancy)
 603			kfd_sysfs_create_file(pdd->kobj_stats,
 604					      &pdd->attr_cu_occupancy,
 605					      "cu_occupancy");
 606	}
 607}
 608
 609static void kfd_procfs_add_sysfs_counters(struct kfd_process *p)
 610{
 611	int ret = 0;
 612	int i;
 613	char counters_dir_filename[MAX_SYSFS_FILENAME_LEN];
 614
 615	if (!p || !p->kobj)
 616		return;
 617
 618	/*
 619	 * Create sysfs files for each GPU which supports SVM
 620	 * - proc/<pid>/counters_<gpuid>/
 621	 * - proc/<pid>/counters_<gpuid>/faults
 622	 * - proc/<pid>/counters_<gpuid>/page_in
 623	 * - proc/<pid>/counters_<gpuid>/page_out
 624	 */
 625	for_each_set_bit(i, p->svms.bitmap_supported, p->n_pdds) {
 626		struct kfd_process_device *pdd = p->pdds[i];
 627		struct kobject *kobj_counters;
 628
 629		snprintf(counters_dir_filename, MAX_SYSFS_FILENAME_LEN,
 630			"counters_%u", pdd->dev->id);
 631		kobj_counters = kfd_alloc_struct(kobj_counters);
 632		if (!kobj_counters)
 633			return;
 634
 635		ret = kobject_init_and_add(kobj_counters, &sysfs_counters_type,
 636					   p->kobj, counters_dir_filename);
 637		if (ret) {
 638			pr_warn("Creating KFD proc/%s folder failed",
 639				counters_dir_filename);
 640			kobject_put(kobj_counters);
 641			return;
 642		}
 643
 644		pdd->kobj_counters = kobj_counters;
 645		kfd_sysfs_create_file(kobj_counters, &pdd->attr_faults,
 646				      "faults");
 647		kfd_sysfs_create_file(kobj_counters, &pdd->attr_page_in,
 648				      "page_in");
 649		kfd_sysfs_create_file(kobj_counters, &pdd->attr_page_out,
 650				      "page_out");
 651	}
 652}
 653
 654static void kfd_procfs_add_sysfs_files(struct kfd_process *p)
 655{
 656	int i;
 657
 658	if (!p || !p->kobj)
 659		return;
 660
 661	/*
 662	 * Create sysfs files for each GPU:
 663	 * - proc/<pid>/vram_<gpuid>
 664	 * - proc/<pid>/sdma_<gpuid>
 665	 */
 666	for (i = 0; i < p->n_pdds; i++) {
 667		struct kfd_process_device *pdd = p->pdds[i];
 668
 669		snprintf(pdd->vram_filename, MAX_SYSFS_FILENAME_LEN, "vram_%u",
 670			 pdd->dev->id);
 671		kfd_sysfs_create_file(p->kobj, &pdd->attr_vram,
 672				      pdd->vram_filename);
 673
 674		snprintf(pdd->sdma_filename, MAX_SYSFS_FILENAME_LEN, "sdma_%u",
 675			 pdd->dev->id);
 676		kfd_sysfs_create_file(p->kobj, &pdd->attr_sdma,
 677					    pdd->sdma_filename);
 678	}
 679}
 680
 681void kfd_procfs_del_queue(struct queue *q)
 682{
 683	if (!q)
 684		return;
 685
 686	kobject_del(&q->kobj);
 687	kobject_put(&q->kobj);
 688}
 689
 690int kfd_process_create_wq(void)
 691{
 692	if (!kfd_process_wq)
 693		kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
 694	if (!kfd_restore_wq)
 695		kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq",
 696							 WQ_FREEZABLE);
 697
 698	if (!kfd_process_wq || !kfd_restore_wq) {
 699		kfd_process_destroy_wq();
 700		return -ENOMEM;
 701	}
 702
 703	return 0;
 704}
 705
 706void kfd_process_destroy_wq(void)
 707{
 708	if (kfd_process_wq) {
 709		destroy_workqueue(kfd_process_wq);
 710		kfd_process_wq = NULL;
 711	}
 712	if (kfd_restore_wq) {
 713		destroy_workqueue(kfd_restore_wq);
 714		kfd_restore_wq = NULL;
 715	}
 716}
 717
 718static void kfd_process_free_gpuvm(struct kgd_mem *mem,
 719			struct kfd_process_device *pdd, void **kptr)
 720{
 721	struct kfd_node *dev = pdd->dev;
 722
 723	if (kptr && *kptr) {
 724		amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(mem);
 725		*kptr = NULL;
 726	}
 727
 728	amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
 729	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, pdd->drm_priv,
 730					       NULL);
 731}
 732
 733/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
 734 *	This function should be only called right after the process
 735 *	is created and when kfd_processes_mutex is still being held
 736 *	to avoid concurrency. Because of that exclusiveness, we do
 737 *	not need to take p->mutex.
 738 */
 739static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd,
 740				   uint64_t gpu_va, uint32_t size,
 741				   uint32_t flags, struct kgd_mem **mem, void **kptr)
 742{
 743	struct kfd_node *kdev = pdd->dev;
 744	int err;
 745
 746	err = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(kdev->adev, gpu_va, size,
 747						 pdd->drm_priv, mem, NULL,
 748						 flags, false);
 749	if (err)
 750		goto err_alloc_mem;
 751
 752	err = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(kdev->adev, *mem,
 753			pdd->drm_priv);
 754	if (err)
 755		goto err_map_mem;
 756
 757	err = amdgpu_amdkfd_gpuvm_sync_memory(kdev->adev, *mem, true);
 758	if (err) {
 759		pr_debug("Sync memory failed, wait interrupted by user signal\n");
 760		goto sync_memory_failed;
 761	}
 762
 763	if (kptr) {
 764		err = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(
 765				(struct kgd_mem *)*mem, kptr, NULL);
 766		if (err) {
 767			pr_debug("Map GTT BO to kernel failed\n");
 768			goto sync_memory_failed;
 769		}
 770	}
 771
 772	return err;
 773
 774sync_memory_failed:
 775	amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(kdev->adev, *mem, pdd->drm_priv);
 776
 777err_map_mem:
 778	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(kdev->adev, *mem, pdd->drm_priv,
 779					       NULL);
 780err_alloc_mem:
 781	*mem = NULL;
 782	*kptr = NULL;
 783	return err;
 784}
 785
 786/* kfd_process_device_reserve_ib_mem - Reserve memory inside the
 787 *	process for IB usage The memory reserved is for KFD to submit
 788 *	IB to AMDGPU from kernel.  If the memory is reserved
 789 *	successfully, ib_kaddr will have the CPU/kernel
 790 *	address. Check ib_kaddr before accessing the memory.
 791 */
 792static int kfd_process_device_reserve_ib_mem(struct kfd_process_device *pdd)
 793{
 794	struct qcm_process_device *qpd = &pdd->qpd;
 795	uint32_t flags = KFD_IOC_ALLOC_MEM_FLAGS_GTT |
 796			KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE |
 797			KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
 798			KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE;
 799	struct kgd_mem *mem;
 800	void *kaddr;
 801	int ret;
 802
 803	if (qpd->ib_kaddr || !qpd->ib_base)
 804		return 0;
 805
 806	/* ib_base is only set for dGPU */
 807	ret = kfd_process_alloc_gpuvm(pdd, qpd->ib_base, PAGE_SIZE, flags,
 808				      &mem, &kaddr);
 809	if (ret)
 810		return ret;
 811
 812	qpd->ib_mem = mem;
 813	qpd->ib_kaddr = kaddr;
 814
 815	return 0;
 816}
 817
 818static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
 819{
 820	struct qcm_process_device *qpd = &pdd->qpd;
 821
 822	if (!qpd->ib_kaddr || !qpd->ib_base)
 823		return;
 824
 825	kfd_process_free_gpuvm(qpd->ib_mem, pdd, &qpd->ib_kaddr);
 826}
 827
 828struct kfd_process *kfd_create_process(struct task_struct *thread)
 829{
 830	struct kfd_process *process;
 831	int ret;
 832
 833	if (!(thread->mm && mmget_not_zero(thread->mm)))
 834		return ERR_PTR(-EINVAL);
 835
 836	/* Only the pthreads threading model is supported. */
 837	if (thread->group_leader->mm != thread->mm) {
 838		mmput(thread->mm);
 839		return ERR_PTR(-EINVAL);
 840	}
 841
 842	/* If the process just called exec(3), it is possible that the
 843	 * cleanup of the kfd_process (following the release of the mm
 844	 * of the old process image) is still in the cleanup work queue.
 845	 * Make sure to drain any job before trying to recreate any
 846	 * resource for this process.
 847	 */
 848	flush_workqueue(kfd_process_wq);
 849
 850	/*
 851	 * take kfd processes mutex before starting of process creation
 852	 * so there won't be a case where two threads of the same process
 853	 * create two kfd_process structures
 854	 */
 855	mutex_lock(&kfd_processes_mutex);
 856
 857	if (kfd_is_locked(NULL)) {
 858		pr_debug("KFD is locked! Cannot create process");
 859		process = ERR_PTR(-EINVAL);
 860		goto out;
 861	}
 862
 863	/* A prior open of /dev/kfd could have already created the process.
 864	 * find_process will increase process kref in this case
 865	 */
 866	process = find_process(thread, true);
 867	if (process) {
 868		pr_debug("Process already found\n");
 869	} else {
 870		process = create_process(thread);
 871		if (IS_ERR(process))
 872			goto out;
 873
 874		if (!procfs.kobj)
 875			goto out;
 876
 877		process->kobj = kfd_alloc_struct(process->kobj);
 878		if (!process->kobj) {
 879			pr_warn("Creating procfs kobject failed");
 880			goto out;
 881		}
 882		ret = kobject_init_and_add(process->kobj, &procfs_type,
 883					   procfs.kobj, "%d",
 884					   (int)process->lead_thread->pid);
 885		if (ret) {
 886			pr_warn("Creating procfs pid directory failed");
 887			kobject_put(process->kobj);
 888			goto out;
 889		}
 890
 891		kfd_sysfs_create_file(process->kobj, &process->attr_pasid,
 892				      "pasid");
 893
 894		process->kobj_queues = kobject_create_and_add("queues",
 895							process->kobj);
 896		if (!process->kobj_queues)
 897			pr_warn("Creating KFD proc/queues folder failed");
 898
 899		kfd_procfs_add_sysfs_stats(process);
 900		kfd_procfs_add_sysfs_files(process);
 901		kfd_procfs_add_sysfs_counters(process);
 902
 903		kfd_debugfs_add_process(process);
 904
 905		init_waitqueue_head(&process->wait_irq_drain);
 906	}
 907out:
 908	mutex_unlock(&kfd_processes_mutex);
 909	mmput(thread->mm);
 910
 911	return process;
 912}
 913
 914struct kfd_process *kfd_get_process(const struct task_struct *thread)
 915{
 916	struct kfd_process *process;
 917
 918	if (!thread->mm)
 919		return ERR_PTR(-EINVAL);
 920
 921	/* Only the pthreads threading model is supported. */
 922	if (thread->group_leader->mm != thread->mm)
 923		return ERR_PTR(-EINVAL);
 924
 925	process = find_process(thread, false);
 926	if (!process)
 927		return ERR_PTR(-EINVAL);
 928
 929	return process;
 930}
 931
 932static struct kfd_process *find_process_by_mm(const struct mm_struct *mm)
 933{
 934	struct kfd_process *process;
 935
 936	hash_for_each_possible_rcu(kfd_processes_table, process,
 937					kfd_processes, (uintptr_t)mm)
 938		if (process->mm == mm)
 939			return process;
 940
 941	return NULL;
 942}
 943
 944static struct kfd_process *find_process(const struct task_struct *thread,
 945					bool ref)
 946{
 947	struct kfd_process *p;
 948	int idx;
 949
 950	idx = srcu_read_lock(&kfd_processes_srcu);
 951	p = find_process_by_mm(thread->mm);
 952	if (p && ref)
 953		kref_get(&p->ref);
 954	srcu_read_unlock(&kfd_processes_srcu, idx);
 955
 956	return p;
 957}
 958
 959void kfd_unref_process(struct kfd_process *p)
 960{
 961	kref_put(&p->ref, kfd_process_ref_release);
 962}
 963
 964/* This increments the process->ref counter. */
 965struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid)
 966{
 967	struct task_struct *task = NULL;
 968	struct kfd_process *p    = NULL;
 969
 970	if (!pid) {
 971		task = current;
 972		get_task_struct(task);
 973	} else {
 974		task = get_pid_task(pid, PIDTYPE_PID);
 975	}
 976
 977	if (task) {
 978		p = find_process(task, true);
 979		put_task_struct(task);
 980	}
 981
 982	return p;
 983}
 984
 985static void kfd_process_device_free_bos(struct kfd_process_device *pdd)
 986{
 987	struct kfd_process *p = pdd->process;
 988	void *mem;
 989	int id;
 990	int i;
 991
 992	/*
 993	 * Remove all handles from idr and release appropriate
 994	 * local memory object
 995	 */
 996	idr_for_each_entry(&pdd->alloc_idr, mem, id) {
 997
 998		for (i = 0; i < p->n_pdds; i++) {
 999			struct kfd_process_device *peer_pdd = p->pdds[i];
1000
1001			if (!peer_pdd->drm_priv)
1002				continue;
1003			amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
1004				peer_pdd->dev->adev, mem, peer_pdd->drm_priv);
1005		}
1006
1007		amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, mem,
1008						       pdd->drm_priv, NULL);
1009		kfd_process_device_remove_obj_handle(pdd, id);
1010	}
1011}
1012
1013/*
1014 * Just kunmap and unpin signal BO here. It will be freed in
1015 * kfd_process_free_outstanding_kfd_bos()
1016 */
1017static void kfd_process_kunmap_signal_bo(struct kfd_process *p)
1018{
1019	struct kfd_process_device *pdd;
1020	struct kfd_node *kdev;
1021	void *mem;
1022
1023	kdev = kfd_device_by_id(GET_GPU_ID(p->signal_handle));
1024	if (!kdev)
1025		return;
1026
1027	mutex_lock(&p->mutex);
1028
1029	pdd = kfd_get_process_device_data(kdev, p);
1030	if (!pdd)
1031		goto out;
1032
1033	mem = kfd_process_device_translate_handle(
1034		pdd, GET_IDR_HANDLE(p->signal_handle));
1035	if (!mem)
1036		goto out;
1037
1038	amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(mem);
1039
1040out:
1041	mutex_unlock(&p->mutex);
1042}
1043
1044static void kfd_process_free_outstanding_kfd_bos(struct kfd_process *p)
1045{
1046	int i;
1047
1048	for (i = 0; i < p->n_pdds; i++)
1049		kfd_process_device_free_bos(p->pdds[i]);
1050}
1051
1052static void kfd_process_destroy_pdds(struct kfd_process *p)
1053{
1054	int i;
1055
1056	for (i = 0; i < p->n_pdds; i++) {
1057		struct kfd_process_device *pdd = p->pdds[i];
1058
1059		kfd_smi_event_process(pdd, false);
1060
1061		pr_debug("Releasing pdd (topology id %d, for pid %d)\n",
1062			pdd->dev->id, p->lead_thread->pid);
1063		kfd_process_device_destroy_cwsr_dgpu(pdd);
1064		kfd_process_device_destroy_ib_mem(pdd);
1065
1066		if (pdd->drm_file)
1067			fput(pdd->drm_file);
1068
1069		if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
1070			free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
1071				get_order(KFD_CWSR_TBA_TMA_SIZE));
1072
1073		idr_destroy(&pdd->alloc_idr);
1074
1075		kfd_free_process_doorbells(pdd->dev->kfd, pdd);
1076
1077		if (pdd->dev->kfd->shared_resources.enable_mes &&
1078			pdd->proc_ctx_cpu_ptr)
1079			amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev,
1080						   &pdd->proc_ctx_bo);
1081		/*
1082		 * before destroying pdd, make sure to report availability
1083		 * for auto suspend
1084		 */
1085		if (pdd->runtime_inuse) {
1086			pm_runtime_put_autosuspend(adev_to_drm(pdd->dev->adev)->dev);
1087			pdd->runtime_inuse = false;
1088		}
1089
1090		atomic_dec(&pdd->dev->kfd->kfd_processes_count);
1091
1092		kfree(pdd);
1093		p->pdds[i] = NULL;
1094	}
1095	p->n_pdds = 0;
1096}
1097
1098static void kfd_process_remove_sysfs(struct kfd_process *p)
1099{
1100	struct kfd_process_device *pdd;
1101	int i;
1102
1103	if (!p->kobj)
1104		return;
1105
1106	sysfs_remove_file(p->kobj, &p->attr_pasid);
1107	kobject_del(p->kobj_queues);
1108	kobject_put(p->kobj_queues);
1109	p->kobj_queues = NULL;
1110
1111	for (i = 0; i < p->n_pdds; i++) {
1112		pdd = p->pdds[i];
1113
1114		sysfs_remove_file(p->kobj, &pdd->attr_vram);
1115		sysfs_remove_file(p->kobj, &pdd->attr_sdma);
1116
1117		sysfs_remove_file(pdd->kobj_stats, &pdd->attr_evict);
1118		if (pdd->dev->kfd2kgd->get_cu_occupancy)
1119			sysfs_remove_file(pdd->kobj_stats,
1120					  &pdd->attr_cu_occupancy);
1121		kobject_del(pdd->kobj_stats);
1122		kobject_put(pdd->kobj_stats);
1123		pdd->kobj_stats = NULL;
1124	}
1125
1126	for_each_set_bit(i, p->svms.bitmap_supported, p->n_pdds) {
1127		pdd = p->pdds[i];
1128
1129		sysfs_remove_file(pdd->kobj_counters, &pdd->attr_faults);
1130		sysfs_remove_file(pdd->kobj_counters, &pdd->attr_page_in);
1131		sysfs_remove_file(pdd->kobj_counters, &pdd->attr_page_out);
1132		kobject_del(pdd->kobj_counters);
1133		kobject_put(pdd->kobj_counters);
1134		pdd->kobj_counters = NULL;
1135	}
1136
1137	kobject_del(p->kobj);
1138	kobject_put(p->kobj);
1139	p->kobj = NULL;
1140}
1141
1142/*
1143 * If any GPU is ongoing reset, wait for reset complete.
1144 */
1145static void kfd_process_wait_gpu_reset_complete(struct kfd_process *p)
1146{
1147	int i;
1148
1149	for (i = 0; i < p->n_pdds; i++)
1150		flush_workqueue(p->pdds[i]->dev->adev->reset_domain->wq);
1151}
1152
1153/* No process locking is needed in this function, because the process
1154 * is not findable any more. We must assume that no other thread is
1155 * using it any more, otherwise we couldn't safely free the process
1156 * structure in the end.
1157 */
1158static void kfd_process_wq_release(struct work_struct *work)
1159{
1160	struct kfd_process *p = container_of(work, struct kfd_process,
1161					     release_work);
1162	struct dma_fence *ef;
1163
1164	/*
1165	 * If GPU in reset, user queues may still running, wait for reset complete.
1166	 */
1167	kfd_process_wait_gpu_reset_complete(p);
1168
1169	/* Signal the eviction fence after user mode queues are
1170	 * destroyed. This allows any BOs to be freed without
1171	 * triggering pointless evictions or waiting for fences.
1172	 */
1173	synchronize_rcu();
1174	ef = rcu_access_pointer(p->ef);
1175	if (ef)
1176		dma_fence_signal(ef);
1177
1178	kfd_process_remove_sysfs(p);
1179	kfd_debugfs_remove_process(p);
1180
1181	kfd_process_kunmap_signal_bo(p);
1182	kfd_process_free_outstanding_kfd_bos(p);
1183	svm_range_list_fini(p);
1184
1185	kfd_process_destroy_pdds(p);
1186	dma_fence_put(ef);
1187
1188	kfd_event_free_process(p);
1189
1190	mutex_destroy(&p->mutex);
1191
1192	put_task_struct(p->lead_thread);
1193
1194	kfree(p);
1195}
1196
1197static void kfd_process_ref_release(struct kref *ref)
1198{
1199	struct kfd_process *p = container_of(ref, struct kfd_process, ref);
1200
1201	INIT_WORK(&p->release_work, kfd_process_wq_release);
1202	queue_work(kfd_process_wq, &p->release_work);
1203}
1204
1205static struct mmu_notifier *kfd_process_alloc_notifier(struct mm_struct *mm)
1206{
1207	/* This increments p->ref counter if kfd process p exists */
1208	struct kfd_process *p = kfd_lookup_process_by_mm(mm);
1209
1210	return p ? &p->mmu_notifier : ERR_PTR(-ESRCH);
1211}
1212
1213static void kfd_process_free_notifier(struct mmu_notifier *mn)
1214{
1215	kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier));
1216}
1217
1218static void kfd_process_notifier_release_internal(struct kfd_process *p)
1219{
1220	int i;
1221
1222	cancel_delayed_work_sync(&p->eviction_work);
1223	cancel_delayed_work_sync(&p->restore_work);
1224
1225	/*
1226	 * Dequeue and destroy user queues, it is not safe for GPU to access
1227	 * system memory after mmu release notifier callback returns because
1228	 * exit_mmap free process memory afterwards.
1229	 */
1230	kfd_process_dequeue_from_all_devices(p);
1231	pqm_uninit(&p->pqm);
1232
1233	for (i = 0; i < p->n_pdds; i++) {
1234		struct kfd_process_device *pdd = p->pdds[i];
1235
1236		/* re-enable GFX OFF since runtime enable with ttmp setup disabled it. */
1237		if (!kfd_dbg_is_rlc_restore_supported(pdd->dev) && p->runtime_info.ttmp_setup)
1238			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
1239	}
1240
1241	/* Indicate to other users that MM is no longer valid */
1242	p->mm = NULL;
1243	kfd_dbg_trap_disable(p);
1244
1245	if (atomic_read(&p->debugged_process_count) > 0) {
1246		struct kfd_process *target;
1247		unsigned int temp;
1248		int idx = srcu_read_lock(&kfd_processes_srcu);
1249
1250		hash_for_each_rcu(kfd_processes_table, temp, target, kfd_processes) {
1251			if (target->debugger_process && target->debugger_process == p) {
1252				mutex_lock_nested(&target->mutex, 1);
1253				kfd_dbg_trap_disable(target);
1254				mutex_unlock(&target->mutex);
1255				if (atomic_read(&p->debugged_process_count) == 0)
1256					break;
1257			}
1258		}
1259
1260		srcu_read_unlock(&kfd_processes_srcu, idx);
1261	}
1262
1263	mmu_notifier_put(&p->mmu_notifier);
1264}
1265
1266static void kfd_process_notifier_release(struct mmu_notifier *mn,
1267					struct mm_struct *mm)
1268{
1269	struct kfd_process *p;
1270
1271	/*
1272	 * The kfd_process structure can not be free because the
1273	 * mmu_notifier srcu is read locked
1274	 */
1275	p = container_of(mn, struct kfd_process, mmu_notifier);
1276	if (WARN_ON(p->mm != mm))
1277		return;
1278
1279	mutex_lock(&kfd_processes_mutex);
1280	/*
1281	 * Do early return if table is empty.
1282	 *
1283	 * This could potentially happen if this function is called concurrently
1284	 * by mmu_notifier and by kfd_cleanup_pocesses.
1285	 *
1286	 */
1287	if (hash_empty(kfd_processes_table)) {
1288		mutex_unlock(&kfd_processes_mutex);
1289		return;
1290	}
1291	hash_del_rcu(&p->kfd_processes);
1292	mutex_unlock(&kfd_processes_mutex);
1293	synchronize_srcu(&kfd_processes_srcu);
1294
1295	kfd_process_notifier_release_internal(p);
1296}
1297
1298static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
1299	.release = kfd_process_notifier_release,
1300	.alloc_notifier = kfd_process_alloc_notifier,
1301	.free_notifier = kfd_process_free_notifier,
1302};
1303
1304/*
1305 * This code handles the case when driver is being unloaded before all
1306 * mm_struct are released.  We need to safely free the kfd_process and
1307 * avoid race conditions with mmu_notifier that might try to free them.
1308 *
1309 */
1310void kfd_cleanup_processes(void)
1311{
1312	struct kfd_process *p;
1313	struct hlist_node *p_temp;
1314	unsigned int temp;
1315	HLIST_HEAD(cleanup_list);
1316
1317	/*
1318	 * Move all remaining kfd_process from the process table to a
1319	 * temp list for processing.   Once done, callback from mmu_notifier
1320	 * release will not see the kfd_process in the table and do early return,
1321	 * avoiding double free issues.
1322	 */
1323	mutex_lock(&kfd_processes_mutex);
1324	hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) {
1325		hash_del_rcu(&p->kfd_processes);
1326		synchronize_srcu(&kfd_processes_srcu);
1327		hlist_add_head(&p->kfd_processes, &cleanup_list);
1328	}
1329	mutex_unlock(&kfd_processes_mutex);
1330
1331	hlist_for_each_entry_safe(p, p_temp, &cleanup_list, kfd_processes)
1332		kfd_process_notifier_release_internal(p);
1333
1334	/*
1335	 * Ensures that all outstanding free_notifier get called, triggering
1336	 * the release of the kfd_process struct.
1337	 */
1338	mmu_notifier_synchronize();
1339}
1340
1341int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
1342{
1343	unsigned long  offset;
1344	int i;
1345
1346	if (p->has_cwsr)
1347		return 0;
1348
1349	for (i = 0; i < p->n_pdds; i++) {
1350		struct kfd_node *dev = p->pdds[i]->dev;
1351		struct qcm_process_device *qpd = &p->pdds[i]->qpd;
1352
1353		if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || qpd->cwsr_base)
1354			continue;
1355
1356		offset = KFD_MMAP_TYPE_RESERVED_MEM | KFD_MMAP_GPU_ID(dev->id);
1357		qpd->tba_addr = (int64_t)vm_mmap(filep, 0,
1358			KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC,
1359			MAP_SHARED, offset);
1360
1361		if (IS_ERR_VALUE(qpd->tba_addr)) {
1362			int err = qpd->tba_addr;
1363
1364			dev_err(dev->adev->dev,
1365				"Failure to set tba address. error %d.\n", err);
1366			qpd->tba_addr = 0;
1367			qpd->cwsr_kaddr = NULL;
1368			return err;
1369		}
1370
1371		memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
1372
1373		kfd_process_set_trap_debug_flag(qpd, p->debug_trap_enabled);
1374
1375		qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
1376		pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
1377			qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
1378	}
1379
1380	p->has_cwsr = true;
1381
1382	return 0;
1383}
1384
1385static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
1386{
1387	struct kfd_node *dev = pdd->dev;
1388	struct qcm_process_device *qpd = &pdd->qpd;
1389	uint32_t flags = KFD_IOC_ALLOC_MEM_FLAGS_GTT
1390			| KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
1391			| KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE;
1392	struct kgd_mem *mem;
1393	void *kaddr;
1394	int ret;
1395
1396	if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base)
1397		return 0;
1398
1399	/* cwsr_base is only set for dGPU */
1400	ret = kfd_process_alloc_gpuvm(pdd, qpd->cwsr_base,
1401				      KFD_CWSR_TBA_TMA_SIZE, flags, &mem, &kaddr);
1402	if (ret)
1403		return ret;
1404
1405	qpd->cwsr_mem = mem;
1406	qpd->cwsr_kaddr = kaddr;
1407	qpd->tba_addr = qpd->cwsr_base;
1408
1409	memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
1410
1411	kfd_process_set_trap_debug_flag(&pdd->qpd,
1412					pdd->process->debug_trap_enabled);
1413
1414	qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
1415	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
1416		 qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
1417
1418	return 0;
1419}
1420
1421static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
1422{
1423	struct kfd_node *dev = pdd->dev;
1424	struct qcm_process_device *qpd = &pdd->qpd;
1425
1426	if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
1427		return;
1428
1429	kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
1430}
1431
1432void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
1433				  uint64_t tba_addr,
1434				  uint64_t tma_addr)
1435{
1436	if (qpd->cwsr_kaddr) {
1437		/* KFD trap handler is bound, record as second-level TBA/TMA
1438		 * in first-level TMA. First-level trap will jump to second.
1439		 */
1440		uint64_t *tma =
1441			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
1442		tma[0] = tba_addr;
1443		tma[1] = tma_addr;
1444	} else {
1445		/* No trap handler bound, bind as first-level TBA/TMA. */
1446		qpd->tba_addr = tba_addr;
1447		qpd->tma_addr = tma_addr;
1448	}
1449}
1450
1451bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
1452{
1453	int i;
1454
1455	/* On most GFXv9 GPUs, the retry mode in the SQ must match the
1456	 * boot time retry setting. Mixing processes with different
1457	 * XNACK/retry settings can hang the GPU.
1458	 *
1459	 * Different GPUs can have different noretry settings depending
1460	 * on HW bugs or limitations. We need to find at least one
1461	 * XNACK mode for this process that's compatible with all GPUs.
1462	 * Fortunately GPUs with retry enabled (noretry=0) can run code
1463	 * built for XNACK-off. On GFXv9 it may perform slower.
1464	 *
1465	 * Therefore applications built for XNACK-off can always be
1466	 * supported and will be our fallback if any GPU does not
1467	 * support retry.
1468	 */
1469	for (i = 0; i < p->n_pdds; i++) {
1470		struct kfd_node *dev = p->pdds[i]->dev;
1471
1472		/* Only consider GFXv9 and higher GPUs. Older GPUs don't
1473		 * support the SVM APIs and don't need to be considered
1474		 * for the XNACK mode selection.
1475		 */
1476		if (!KFD_IS_SOC15(dev))
1477			continue;
1478		/* Aldebaran can always support XNACK because it can support
1479		 * per-process XNACK mode selection. But let the dev->noretry
1480		 * setting still influence the default XNACK mode.
1481		 */
1482		if (supported && KFD_SUPPORT_XNACK_PER_PROCESS(dev)) {
1483			if (!amdgpu_sriov_xnack_support(dev->kfd->adev)) {
1484				pr_debug("SRIOV platform xnack not supported\n");
1485				return false;
1486			}
1487			continue;
1488		}
1489
1490		/* GFXv10 and later GPUs do not support shader preemption
1491		 * during page faults. This can lead to poor QoS for queue
1492		 * management and memory-manager-related preemptions or
1493		 * even deadlocks.
1494		 */
1495		if (KFD_GC_VERSION(dev) >= IP_VERSION(10, 1, 1))
1496			return false;
1497
1498		if (dev->kfd->noretry)
1499			return false;
1500	}
1501
1502	return true;
1503}
1504
1505void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
1506				     bool enabled)
1507{
1508	if (qpd->cwsr_kaddr) {
1509		uint64_t *tma =
1510			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
1511		tma[2] = enabled;
1512	}
1513}
1514
1515/*
1516 * On return the kfd_process is fully operational and will be freed when the
1517 * mm is released
1518 */
1519static struct kfd_process *create_process(const struct task_struct *thread)
1520{
1521	struct kfd_process *process;
1522	struct mmu_notifier *mn;
1523	int err = -ENOMEM;
1524
1525	process = kzalloc(sizeof(*process), GFP_KERNEL);
1526	if (!process)
1527		goto err_alloc_process;
1528
1529	kref_init(&process->ref);
1530	mutex_init(&process->mutex);
1531	process->mm = thread->mm;
1532	process->lead_thread = thread->group_leader;
1533	process->n_pdds = 0;
1534	process->queues_paused = false;
1535	INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
1536	INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
1537	process->last_restore_timestamp = get_jiffies_64();
1538	err = kfd_event_init_process(process);
1539	if (err)
1540		goto err_event_init;
1541	process->is_32bit_user_mode = in_compat_syscall();
1542	process->debug_trap_enabled = false;
1543	process->debugger_process = NULL;
1544	process->exception_enable_mask = 0;
1545	atomic_set(&process->debugged_process_count, 0);
1546	sema_init(&process->runtime_enable_sema, 0);
1547
1548	err = pqm_init(&process->pqm, process);
1549	if (err != 0)
1550		goto err_process_pqm_init;
1551
1552	/* init process apertures*/
1553	err = kfd_init_apertures(process);
1554	if (err != 0)
1555		goto err_init_apertures;
1556
1557	/* Check XNACK support after PDDs are created in kfd_init_apertures */
1558	process->xnack_enabled = kfd_process_xnack_mode(process, false);
1559
1560	err = svm_range_list_init(process);
1561	if (err)
1562		goto err_init_svm_range_list;
1563
1564	/* alloc_notifier needs to find the process in the hash table */
1565	hash_add_rcu(kfd_processes_table, &process->kfd_processes,
1566			(uintptr_t)process->mm);
1567
1568	/* Avoid free_notifier to start kfd_process_wq_release if
1569	 * mmu_notifier_get failed because of pending signal.
1570	 */
1571	kref_get(&process->ref);
1572
1573	/* MMU notifier registration must be the last call that can fail
1574	 * because after this point we cannot unwind the process creation.
1575	 * After this point, mmu_notifier_put will trigger the cleanup by
1576	 * dropping the last process reference in the free_notifier.
1577	 */
1578	mn = mmu_notifier_get(&kfd_process_mmu_notifier_ops, process->mm);
1579	if (IS_ERR(mn)) {
1580		err = PTR_ERR(mn);
1581		goto err_register_notifier;
1582	}
1583	BUG_ON(mn != &process->mmu_notifier);
1584
1585	kfd_unref_process(process);
1586	get_task_struct(process->lead_thread);
1587
1588	INIT_WORK(&process->debug_event_workarea, debug_event_write_work_handler);
1589
1590	return process;
1591
1592err_register_notifier:
1593	hash_del_rcu(&process->kfd_processes);
1594	svm_range_list_fini(process);
1595err_init_svm_range_list:
1596	kfd_process_free_outstanding_kfd_bos(process);
1597	kfd_process_destroy_pdds(process);
1598err_init_apertures:
1599	pqm_uninit(&process->pqm);
1600err_process_pqm_init:
1601	kfd_event_free_process(process);
1602err_event_init:
1603	mutex_destroy(&process->mutex);
1604	kfree(process);
1605err_alloc_process:
1606	return ERR_PTR(err);
1607}
1608
1609struct kfd_process_device *kfd_get_process_device_data(struct kfd_node *dev,
1610							struct kfd_process *p)
1611{
1612	int i;
1613
1614	for (i = 0; i < p->n_pdds; i++)
1615		if (p->pdds[i]->dev == dev)
1616			return p->pdds[i];
1617
1618	return NULL;
1619}
1620
1621struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
1622							struct kfd_process *p)
1623{
1624	struct kfd_process_device *pdd = NULL;
1625
1626	if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
1627		return NULL;
1628	pdd = kzalloc(sizeof(*pdd), GFP_KERNEL);
1629	if (!pdd)
1630		return NULL;
1631
1632	pdd->dev = dev;
1633	INIT_LIST_HEAD(&pdd->qpd.queues_list);
1634	INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
1635	pdd->qpd.dqm = dev->dqm;
1636	pdd->qpd.pqm = &p->pqm;
1637	pdd->qpd.evicted = 0;
1638	pdd->qpd.mapped_gws_queue = false;
1639	pdd->process = p;
1640	pdd->bound = PDD_UNBOUND;
1641	pdd->already_dequeued = false;
1642	pdd->runtime_inuse = false;
1643	atomic64_set(&pdd->vram_usage, 0);
1644	pdd->sdma_past_activity_counter = 0;
1645	pdd->user_gpu_id = dev->id;
1646	atomic64_set(&pdd->evict_duration_counter, 0);
1647
1648	p->pdds[p->n_pdds++] = pdd;
1649	if (kfd_dbg_is_per_vmid_supported(pdd->dev))
1650		pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap(
1651							pdd->dev->adev,
1652							false,
1653							0);
1654
1655	/* Init idr used for memory handle translation */
1656	idr_init(&pdd->alloc_idr);
1657
1658	atomic_inc(&dev->kfd->kfd_processes_count);
1659
1660	return pdd;
1661}
1662
1663/**
1664 * kfd_process_device_init_vm - Initialize a VM for a process-device
1665 *
1666 * @pdd: The process-device
1667 * @drm_file: Optional pointer to a DRM file descriptor
1668 *
1669 * If @drm_file is specified, it will be used to acquire the VM from
1670 * that file descriptor. If successful, the @pdd takes ownership of
1671 * the file descriptor.
1672 *
1673 * If @drm_file is NULL, a new VM is created.
1674 *
1675 * Returns 0 on success, -errno on failure.
1676 */
1677int kfd_process_device_init_vm(struct kfd_process_device *pdd,
1678			       struct file *drm_file)
1679{
1680	struct amdgpu_fpriv *drv_priv;
1681	struct amdgpu_vm *avm;
1682	struct kfd_process *p;
1683	struct dma_fence *ef;
1684	struct kfd_node *dev;
1685	int ret;
1686
1687	if (!drm_file)
1688		return -EINVAL;
1689
1690	if (pdd->drm_priv)
1691		return -EBUSY;
1692
1693	ret = amdgpu_file_to_fpriv(drm_file, &drv_priv);
1694	if (ret)
1695		return ret;
1696	avm = &drv_priv->vm;
1697
1698	p = pdd->process;
1699	dev = pdd->dev;
1700
1701	ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, avm,
1702						     &p->kgd_process_info,
1703						     p->ef ? NULL : &ef);
1704	if (ret) {
1705		dev_err(dev->adev->dev, "Failed to create process VM object\n");
1706		return ret;
1707	}
1708
1709	if (!p->ef)
1710		RCU_INIT_POINTER(p->ef, ef);
1711
1712	pdd->drm_priv = drm_file->private_data;
1713
1714	ret = kfd_process_device_reserve_ib_mem(pdd);
1715	if (ret)
1716		goto err_reserve_ib_mem;
1717	ret = kfd_process_device_init_cwsr_dgpu(pdd);
1718	if (ret)
1719		goto err_init_cwsr;
1720
1721	if (unlikely(!avm->pasid)) {
1722		dev_warn(pdd->dev->adev->dev, "WARN: vm %p has no pasid associated",
1723				 avm);
1724		ret = -EINVAL;
1725		goto err_get_pasid;
1726	}
1727
1728	pdd->pasid = avm->pasid;
1729	pdd->drm_file = drm_file;
1730
1731	kfd_smi_event_process(pdd, true);
1732
1733	return 0;
1734
1735err_get_pasid:
1736	kfd_process_device_destroy_cwsr_dgpu(pdd);
1737err_init_cwsr:
1738	kfd_process_device_destroy_ib_mem(pdd);
1739err_reserve_ib_mem:
1740	pdd->drm_priv = NULL;
1741	amdgpu_amdkfd_gpuvm_destroy_cb(dev->adev, avm);
1742
1743	return ret;
1744}
1745
1746/*
1747 * Direct the IOMMU to bind the process (specifically the pasid->mm)
1748 * to the device.
1749 * Unbinding occurs when the process dies or the device is removed.
1750 *
1751 * Assumes that the process lock is held.
1752 */
1753struct kfd_process_device *kfd_bind_process_to_device(struct kfd_node *dev,
1754							struct kfd_process *p)
1755{
1756	struct kfd_process_device *pdd;
1757	int err;
1758
1759	pdd = kfd_get_process_device_data(dev, p);
1760	if (!pdd) {
1761		dev_err(dev->adev->dev, "Process device data doesn't exist\n");
1762		return ERR_PTR(-ENOMEM);
1763	}
1764
1765	if (!pdd->drm_priv)
1766		return ERR_PTR(-ENODEV);
1767
1768	/*
1769	 * signal runtime-pm system to auto resume and prevent
1770	 * further runtime suspend once device pdd is created until
1771	 * pdd is destroyed.
1772	 */
1773	if (!pdd->runtime_inuse) {
1774		err = pm_runtime_get_sync(adev_to_drm(dev->adev)->dev);
1775		if (err < 0) {
1776			pm_runtime_put_autosuspend(adev_to_drm(dev->adev)->dev);
1777			return ERR_PTR(err);
1778		}
1779	}
1780
1781	/*
1782	 * make sure that runtime_usage counter is incremented just once
1783	 * per pdd
1784	 */
1785	pdd->runtime_inuse = true;
1786
1787	return pdd;
1788}
1789
1790/* Create specific handle mapped to mem from process local memory idr
1791 * Assumes that the process lock is held.
1792 */
1793int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd,
1794					void *mem)
1795{
1796	return idr_alloc(&pdd->alloc_idr, mem, 0, 0, GFP_KERNEL);
1797}
1798
1799/* Translate specific handle from process local memory idr
1800 * Assumes that the process lock is held.
1801 */
1802void *kfd_process_device_translate_handle(struct kfd_process_device *pdd,
1803					int handle)
1804{
1805	if (handle < 0)
1806		return NULL;
1807
1808	return idr_find(&pdd->alloc_idr, handle);
1809}
1810
1811/* Remove specific handle from process local memory idr
1812 * Assumes that the process lock is held.
1813 */
1814void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
1815					int handle)
1816{
1817	if (handle >= 0)
1818		idr_remove(&pdd->alloc_idr, handle);
1819}
1820
1821static struct kfd_process_device *kfd_lookup_process_device_by_pasid(u32 pasid)
1822{
1823	struct kfd_process_device *ret_p = NULL;
1824	struct kfd_process *p;
1825	unsigned int temp;
1826	int i;
1827
1828	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
1829		for (i = 0; i < p->n_pdds; i++) {
1830			if (p->pdds[i]->pasid == pasid) {
1831				ret_p = p->pdds[i];
1832				break;
1833			}
1834		}
1835		if (ret_p)
1836			break;
1837	}
1838	return ret_p;
1839}
1840
1841/* This increments the process->ref counter. */
1842struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid,
1843						struct kfd_process_device **pdd)
1844{
1845	struct kfd_process_device *ret_p;
1846
1847	int idx = srcu_read_lock(&kfd_processes_srcu);
1848
1849	ret_p = kfd_lookup_process_device_by_pasid(pasid);
1850	if (ret_p) {
1851		if (pdd)
1852			*pdd = ret_p;
1853		kref_get(&ret_p->process->ref);
1854
1855		srcu_read_unlock(&kfd_processes_srcu, idx);
1856		return ret_p->process;
1857	}
1858
1859	srcu_read_unlock(&kfd_processes_srcu, idx);
1860
1861	if (pdd)
1862		*pdd = NULL;
1863
1864	return NULL;
1865}
1866
1867/* This increments the process->ref counter. */
1868struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
1869{
1870	struct kfd_process *p;
1871
1872	int idx = srcu_read_lock(&kfd_processes_srcu);
1873
1874	p = find_process_by_mm(mm);
1875	if (p)
1876		kref_get(&p->ref);
1877
1878	srcu_read_unlock(&kfd_processes_srcu, idx);
1879
1880	return p;
1881}
1882
1883/* kfd_process_evict_queues - Evict all user queues of a process
1884 *
1885 * Eviction is reference-counted per process-device. This means multiple
1886 * evictions from different sources can be nested safely.
1887 */
1888int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
1889{
1890	int r = 0;
1891	int i;
1892	unsigned int n_evicted = 0;
1893
1894	for (i = 0; i < p->n_pdds; i++) {
1895		struct kfd_process_device *pdd = p->pdds[i];
1896		struct device *dev = pdd->dev->adev->dev;
1897
1898		kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid,
1899					     trigger);
1900
1901		r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
1902							    &pdd->qpd);
1903		/* evict return -EIO if HWS is hang or asic is resetting, in this case
1904		 * we would like to set all the queues to be in evicted state to prevent
1905		 * them been add back since they actually not be saved right now.
1906		 */
1907		if (r && r != -EIO) {
1908			dev_err(dev, "Failed to evict process queues\n");
1909			goto fail;
1910		}
1911		n_evicted++;
1912
1913		pdd->dev->dqm->is_hws_hang = false;
1914	}
1915
1916	return r;
1917
1918fail:
1919	/* To keep state consistent, roll back partial eviction by
1920	 * restoring queues
1921	 */
1922	for (i = 0; i < p->n_pdds; i++) {
1923		struct kfd_process_device *pdd = p->pdds[i];
1924
1925		if (n_evicted == 0)
1926			break;
1927
1928		kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
1929
1930		if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
1931							      &pdd->qpd))
1932			dev_err(pdd->dev->adev->dev,
1933				"Failed to restore queues\n");
1934
1935		n_evicted--;
1936	}
1937
1938	return r;
1939}
1940
1941/* kfd_process_restore_queues - Restore all user queues of a process */
1942int kfd_process_restore_queues(struct kfd_process *p)
1943{
1944	int r, ret = 0;
1945	int i;
1946
1947	for (i = 0; i < p->n_pdds; i++) {
1948		struct kfd_process_device *pdd = p->pdds[i];
1949		struct device *dev = pdd->dev->adev->dev;
1950
1951		kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
1952
1953		r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
1954							      &pdd->qpd);
1955		if (r) {
1956			dev_err(dev, "Failed to restore process queues\n");
1957			if (!ret)
1958				ret = r;
1959		}
1960	}
1961
1962	return ret;
1963}
1964
1965int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, uint32_t gpu_id)
1966{
1967	int i;
1968
1969	for (i = 0; i < p->n_pdds; i++)
1970		if (p->pdds[i] && gpu_id == p->pdds[i]->user_gpu_id)
1971			return i;
1972	return -EINVAL;
1973}
1974
1975int
1976kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node,
1977			    uint32_t *gpuid, uint32_t *gpuidx)
1978{
1979	int i;
1980
1981	for (i = 0; i < p->n_pdds; i++)
1982		if (p->pdds[i] && p->pdds[i]->dev == node) {
1983			*gpuid = p->pdds[i]->user_gpu_id;
1984			*gpuidx = i;
1985			return 0;
1986		}
1987	return -EINVAL;
1988}
1989
1990static int signal_eviction_fence(struct kfd_process *p)
1991{
1992	struct dma_fence *ef;
1993	int ret;
1994
1995	rcu_read_lock();
1996	ef = dma_fence_get_rcu_safe(&p->ef);
1997	rcu_read_unlock();
1998	if (!ef)
1999		return -EINVAL;
2000
2001	ret = dma_fence_signal(ef);
2002	dma_fence_put(ef);
2003
2004	return ret;
2005}
2006
2007static void evict_process_worker(struct work_struct *work)
2008{
2009	int ret;
2010	struct kfd_process *p;
2011	struct delayed_work *dwork;
2012
2013	dwork = to_delayed_work(work);
2014
2015	/* Process termination destroys this worker thread. So during the
2016	 * lifetime of this thread, kfd_process p will be valid
2017	 */
2018	p = container_of(dwork, struct kfd_process, eviction_work);
2019
2020	pr_debug("Started evicting process pid %d\n", p->lead_thread->pid);
2021	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
2022	if (!ret) {
2023		/* If another thread already signaled the eviction fence,
2024		 * they are responsible stopping the queues and scheduling
2025		 * the restore work.
2026		 */
2027		if (signal_eviction_fence(p) ||
2028		    mod_delayed_work(kfd_restore_wq, &p->restore_work,
2029				     msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
2030			kfd_process_restore_queues(p);
2031
2032		pr_debug("Finished evicting process pid %d\n", p->lead_thread->pid);
2033	} else
2034		pr_err("Failed to evict queues of process pid %d\n", p->lead_thread->pid);
2035}
2036
2037static int restore_process_helper(struct kfd_process *p)
2038{
2039	int ret = 0;
2040
2041	/* VMs may not have been acquired yet during debugging. */
2042	if (p->kgd_process_info) {
2043		ret = amdgpu_amdkfd_gpuvm_restore_process_bos(
2044			p->kgd_process_info, &p->ef);
2045		if (ret)
2046			return ret;
2047	}
2048
2049	ret = kfd_process_restore_queues(p);
2050	if (!ret)
2051		pr_debug("Finished restoring process pid %d\n",
2052			p->lead_thread->pid);
2053	else
2054		pr_err("Failed to restore queues of process pid %d\n",
2055		      p->lead_thread->pid);
2056
2057	return ret;
2058}
2059
2060static void restore_process_worker(struct work_struct *work)
2061{
2062	struct delayed_work *dwork;
2063	struct kfd_process *p;
2064	int ret = 0;
2065
2066	dwork = to_delayed_work(work);
2067
2068	/* Process termination destroys this worker thread. So during the
2069	 * lifetime of this thread, kfd_process p will be valid
2070	 */
2071	p = container_of(dwork, struct kfd_process, restore_work);
2072	pr_debug("Started restoring process pasid %d\n", (int)p->lead_thread->pid);
2073
2074	/* Setting last_restore_timestamp before successful restoration.
2075	 * Otherwise this would have to be set by KGD (restore_process_bos)
2076	 * before KFD BOs are unreserved. If not, the process can be evicted
2077	 * again before the timestamp is set.
2078	 * If restore fails, the timestamp will be set again in the next
2079	 * attempt. This would mean that the minimum GPU quanta would be
2080	 * PROCESS_ACTIVE_TIME_MS - (time to execute the following two
2081	 * functions)
2082	 */
2083
2084	p->last_restore_timestamp = get_jiffies_64();
2085
2086	ret = restore_process_helper(p);
2087	if (ret) {
2088		pr_debug("Failed to restore BOs of process pid %d, retry after %d ms\n",
2089			 p->lead_thread->pid, PROCESS_BACK_OFF_TIME_MS);
2090		if (mod_delayed_work(kfd_restore_wq, &p->restore_work,
2091				     msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
2092			kfd_process_restore_queues(p);
2093	}
2094}
2095
2096void kfd_suspend_all_processes(void)
2097{
2098	struct kfd_process *p;
2099	unsigned int temp;
2100	int idx = srcu_read_lock(&kfd_processes_srcu);
2101
2102	WARN(debug_evictions, "Evicting all processes");
2103	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
2104		if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
2105			pr_err("Failed to suspend process pid %d\n", p->lead_thread->pid);
2106		signal_eviction_fence(p);
2107	}
2108	srcu_read_unlock(&kfd_processes_srcu, idx);
2109}
2110
2111int kfd_resume_all_processes(void)
2112{
2113	struct kfd_process *p;
2114	unsigned int temp;
2115	int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
2116
2117	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
2118		if (restore_process_helper(p)) {
2119			pr_err("Restore process pid %d failed during resume\n",
2120			      p->lead_thread->pid);
2121			ret = -EFAULT;
2122		}
2123	}
2124	srcu_read_unlock(&kfd_processes_srcu, idx);
2125	return ret;
2126}
2127
2128int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process,
2129			  struct vm_area_struct *vma)
2130{
2131	struct kfd_process_device *pdd;
2132	struct qcm_process_device *qpd;
2133
2134	if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) {
2135		dev_err(dev->adev->dev, "Incorrect CWSR mapping size.\n");
2136		return -EINVAL;
2137	}
2138
2139	pdd = kfd_get_process_device_data(dev, process);
2140	if (!pdd)
2141		return -EINVAL;
2142	qpd = &pdd->qpd;
2143
2144	qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
2145					get_order(KFD_CWSR_TBA_TMA_SIZE));
2146	if (!qpd->cwsr_kaddr) {
2147		dev_err(dev->adev->dev,
2148			"Error allocating per process CWSR buffer.\n");
2149		return -ENOMEM;
2150	}
2151
2152	vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND
2153		| VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP);
2154	/* Mapping pages to user process */
2155	return remap_pfn_range(vma, vma->vm_start,
2156			       PFN_DOWN(__pa(qpd->cwsr_kaddr)),
2157			       KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
2158}
2159
2160/* assumes caller holds process lock. */
2161int kfd_process_drain_interrupts(struct kfd_process_device *pdd)
2162{
2163	uint32_t irq_drain_fence[8];
2164	uint8_t node_id = 0;
2165	int r = 0;
2166
2167	if (!KFD_IS_SOC15(pdd->dev))
2168		return 0;
2169
2170	pdd->process->irq_drain_is_open = true;
2171
2172	memset(irq_drain_fence, 0, sizeof(irq_drain_fence));
2173	irq_drain_fence[0] = (KFD_IRQ_FENCE_SOURCEID << 8) |
2174							KFD_IRQ_FENCE_CLIENTID;
2175	irq_drain_fence[3] = pdd->pasid;
2176
2177	/*
2178	 * For GFX 9.4.3/9.5.0, send the NodeId also in IH cookie DW[3]
2179	 */
2180	if (KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 3) ||
2181	    KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 4) ||
2182	    KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 5, 0)) {
2183		node_id = ffs(pdd->dev->interrupt_bitmap) - 1;
2184		irq_drain_fence[3] |= node_id << 16;
2185	}
2186
2187	/* ensure stale irqs scheduled KFD interrupts and send drain fence. */
2188	if (amdgpu_amdkfd_send_close_event_drain_irq(pdd->dev->adev,
2189						     irq_drain_fence)) {
2190		pdd->process->irq_drain_is_open = false;
2191		return 0;
2192	}
2193
2194	r = wait_event_interruptible(pdd->process->wait_irq_drain,
2195				     !READ_ONCE(pdd->process->irq_drain_is_open));
2196	if (r)
2197		pdd->process->irq_drain_is_open = false;
2198
2199	return r;
2200}
2201
2202void kfd_process_close_interrupt_drain(unsigned int pasid)
2203{
2204	struct kfd_process *p;
2205
2206	p = kfd_lookup_process_by_pasid(pasid, NULL);
2207
2208	if (!p)
2209		return;
2210
2211	WRITE_ONCE(p->irq_drain_is_open, false);
2212	wake_up_all(&p->wait_irq_drain);
2213	kfd_unref_process(p);
2214}
2215
2216struct send_exception_work_handler_workarea {
2217	struct work_struct work;
2218	struct kfd_process *p;
2219	unsigned int queue_id;
2220	uint64_t error_reason;
2221};
2222
2223static void send_exception_work_handler(struct work_struct *work)
2224{
2225	struct send_exception_work_handler_workarea *workarea;
2226	struct kfd_process *p;
2227	struct queue *q;
2228	struct mm_struct *mm;
2229	struct kfd_context_save_area_header __user *csa_header;
2230	uint64_t __user *err_payload_ptr;
2231	uint64_t cur_err;
2232	uint32_t ev_id;
2233
2234	workarea = container_of(work,
2235				struct send_exception_work_handler_workarea,
2236				work);
2237	p = workarea->p;
2238
2239	mm = get_task_mm(p->lead_thread);
2240
2241	if (!mm)
2242		return;
2243
2244	kthread_use_mm(mm);
2245
2246	q = pqm_get_user_queue(&p->pqm, workarea->queue_id);
2247
2248	if (!q)
2249		goto out;
2250
2251	csa_header = (void __user *)q->properties.ctx_save_restore_area_address;
2252
2253	get_user(err_payload_ptr, (uint64_t __user **)&csa_header->err_payload_addr);
2254	get_user(cur_err, err_payload_ptr);
2255	cur_err |= workarea->error_reason;
2256	put_user(cur_err, err_payload_ptr);
2257	get_user(ev_id, &csa_header->err_event_id);
2258
2259	kfd_set_event(p, ev_id);
2260
2261out:
2262	kthread_unuse_mm(mm);
2263	mmput(mm);
2264}
2265
2266int kfd_send_exception_to_runtime(struct kfd_process *p,
2267			unsigned int queue_id,
2268			uint64_t error_reason)
2269{
2270	struct send_exception_work_handler_workarea worker;
2271
2272	INIT_WORK_ONSTACK(&worker.work, send_exception_work_handler);
2273
2274	worker.p = p;
2275	worker.queue_id = queue_id;
2276	worker.error_reason = error_reason;
2277
2278	schedule_work(&worker.work);
2279	flush_work(&worker.work);
2280	destroy_work_on_stack(&worker.work);
2281
2282	return 0;
2283}
2284
2285struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, uint32_t gpu_id)
2286{
2287	int i;
2288
2289	if (gpu_id) {
2290		for (i = 0; i < p->n_pdds; i++) {
2291			struct kfd_process_device *pdd = p->pdds[i];
2292
2293			if (pdd->user_gpu_id == gpu_id)
2294				return pdd;
2295		}
2296	}
2297	return NULL;
2298}
2299
2300int kfd_process_get_user_gpu_id(struct kfd_process *p, uint32_t actual_gpu_id)
2301{
2302	int i;
2303
2304	if (!actual_gpu_id)
2305		return 0;
2306
2307	for (i = 0; i < p->n_pdds; i++) {
2308		struct kfd_process_device *pdd = p->pdds[i];
2309
2310		if (pdd->dev->id == actual_gpu_id)
2311			return pdd->user_gpu_id;
2312	}
2313	return -EINVAL;
2314}
2315
2316#if defined(CONFIG_DEBUG_FS)
2317
2318int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
2319{
2320	struct kfd_process *p;
2321	unsigned int temp;
2322	int r = 0;
2323
2324	int idx = srcu_read_lock(&kfd_processes_srcu);
2325
2326	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
2327		seq_printf(m, "Process %d PASID %d:\n",
2328			   p->lead_thread->tgid, p->lead_thread->pid);
2329
2330		mutex_lock(&p->mutex);
2331		r = pqm_debugfs_mqds(m, &p->pqm);
2332		mutex_unlock(&p->mutex);
2333
2334		if (r)
2335			break;
2336	}
2337
2338	srcu_read_unlock(&kfd_processes_srcu, idx);
2339
2340	return r;
2341}
2342
2343#endif