/*
 * Copyright © 2008-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include <drm/drmP.h>
#include <drm/drm_vma_manager.h>
#include <drm/i915_drm.h>
#include "i915_drv.h"
#include "i915_gem_clflush.h"
#include "i915_vgpu.h"
#include "i915_trace.h"
#include "intel_drv.h"
#include "intel_frontbuffer.h"
#include "intel_mocs.h"
#include "intel_workarounds.h"
#include "i915_gemfs.h"
#include <linux/dma-fence-array.h>
#include <linux/kthread.h>
#include <linux/reservation.h>
#include <linux/shmem_fs.h>
#include <linux/slab.h>
#include <linux/stop_machine.h>
#include <linux/swap.h>
#include <linux/pci.h>
#include <linux/dma-buf.h>

static void i915_gem_flush_free_objects(struct drm_i915_private *i915);

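/*
 * Writes via a CPU mapping need a clflush if the object is not coherent
 * for CPU writes (unless a flush is already pending, i.e. cache_dirty),
 * or if it is currently pinned globally for the hardware, in which case
 * we keep the cachelines flushed for the display engine.
 */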
static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
{
	if (obj->cache_dirty)
		return false;

	if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
		return true;

	return obj->pin_global; /* currently in use by HW, keep flushed */
}

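/*
 * Reserve a scratch slot in the mappable portion of the GGTT, used by the
 * pread/pwrite aperture fallback paths when the object itself cannot be
 * pinned into the mappable region.
 */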
static int
insert_mappable_node(struct i915_ggtt *ggtt,
		     struct drm_mm_node *node, u32 size)
{
	memset(node, 0, sizeof(*node));
	return drm_mm_insert_node_in_range(&ggtt->base.mm, node,
					   size, 0, I915_COLOR_UNEVICTABLE,
					   0, ggtt->mappable_end,
					   DRM_MM_INSERT_LOW);
}

static void
remove_mappable_node(struct drm_mm_node *node)
{
	drm_mm_remove_node(node);
}

/* some bookkeeping */
static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
				  u64 size)
{
	spin_lock(&dev_priv->mm.object_stat_lock);
	dev_priv->mm.object_count++;
	dev_priv->mm.object_memory += size;
	spin_unlock(&dev_priv->mm.object_stat_lock);
}

static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
				     u64 size)
{
	spin_lock(&dev_priv->mm.object_stat_lock);
	dev_priv->mm.object_count--;
	dev_priv->mm.object_memory -= size;
	spin_unlock(&dev_priv->mm.object_stat_lock);
}

static int
i915_gem_wait_for_error(struct i915_gpu_error *error)
{
	int ret;

	might_sleep();

	/*
	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
	 * userspace. If it takes that long something really bad is going on and
	 * we should simply try to bail out and fail as gracefully as possible.
	 */
	ret = wait_event_interruptible_timeout(error->reset_queue,
					       !i915_reset_backoff(error),
					       I915_RESET_TIMEOUT);
	if (ret == 0) {
		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
		return -EIO;
	} else if (ret < 0) {
		return ret;
	} else {
		return 0;
	}
}

int i915_mutex_lock_interruptible(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = to_i915(dev);
	int ret;

	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
	if (ret)
		return ret;

	ret = mutex_lock_interruptible(&dev->struct_mutex);
	if (ret)
		return ret;

	return 0;
}

static u32 __i915_gem_park(struct drm_i915_private *i915)
{
	lockdep_assert_held(&i915->drm.struct_mutex);
	GEM_BUG_ON(i915->gt.active_requests);
	GEM_BUG_ON(!list_empty(&i915->gt.active_rings));

	if (!i915->gt.awake)
		return I915_EPOCH_INVALID;

	GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);

	/*
	 * Be paranoid and flush a concurrent interrupt to make sure
	 * we don't reactivate any irq tasklets after parking.
	 *
	 * FIXME: Note that even though we have waited for execlists to be idle,
	 * there may still be an in-flight interrupt even though the CSB
	 * is now empty. synchronize_irq() makes sure that a residual interrupt
	 * is completed before we continue, but it doesn't prevent the HW from
	 * raising a spurious interrupt later. To complete the shield we should
	 * coordinate disabling the CS irq with flushing the interrupts.
	 */
	synchronize_irq(i915->drm.irq);

	intel_engines_park(i915);
	i915_timelines_park(i915);

	i915_pmu_gt_parked(i915);
	i915_vma_parked(i915);

	i915->gt.awake = false;

	if (INTEL_GEN(i915) >= 6)
		gen6_rps_idle(i915);

	intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);

	intel_runtime_pm_put(i915);

	return i915->gt.epoch;
}

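/*
 * i915_gem_park()/i915_gem_unpark() bracket GT activity: unpark takes a
 * runtime-pm wakeref when the first request arrives, and park releases
 * it again, deferring the actual work to the idle worker so that we do
 * not bounce the device state on short idle periods.
 */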
void i915_gem_park(struct drm_i915_private *i915)
{
	lockdep_assert_held(&i915->drm.struct_mutex);
	GEM_BUG_ON(i915->gt.active_requests);

	if (!i915->gt.awake)
		return;

	/* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
	mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
}

void i915_gem_unpark(struct drm_i915_private *i915)
{
	lockdep_assert_held(&i915->drm.struct_mutex);
	GEM_BUG_ON(!i915->gt.active_requests);

	if (i915->gt.awake)
		return;

	intel_runtime_pm_get_noresume(i915);

	/*
	 * It seems that the DMC likes to transition between the DC states a lot
	 * when there are no connected displays (no active power domains) during
	 * command submission.
	 *
	 * This activity has negative impact on the performance of the chip with
	 * huge latencies observed in the interrupt handler and elsewhere.
	 *
	 * Work around it by grabbing a GT IRQ power domain whilst there is any
	 * GT activity, preventing any DC state transitions.
	 */
	intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);

	i915->gt.awake = true;
	if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
		i915->gt.epoch = 1;

	intel_enable_gt_powersave(i915);
	i915_update_gfx_val(i915);
	if (INTEL_GEN(i915) >= 6)
		gen6_rps_busy(i915);
	i915_pmu_gt_unparked(i915);

	intel_engines_unpark(i915);

	i915_queue_hangcheck(i915);

	queue_delayed_work(i915->wq,
			   &i915->gt.retire_work,
			   round_jiffies_up_relative(HZ));
}

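/*
 * Report the total and available size of the global GTT to userspace;
 * any pinned vma (plus the reserved portion of the GGTT) is subtracted
 * from the available space.
 */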
int
i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
			    struct drm_file *file)
{
	struct drm_i915_private *dev_priv = to_i915(dev);
	struct i915_ggtt *ggtt = &dev_priv->ggtt;
	struct drm_i915_gem_get_aperture *args = data;
	struct i915_vma *vma;
	u64 pinned;

	pinned = ggtt->base.reserved;
	mutex_lock(&dev->struct_mutex);
	list_for_each_entry(vma, &ggtt->base.active_list, vm_link)
		if (i915_vma_is_pinned(vma))
			pinned += vma->node.size;
	list_for_each_entry(vma, &ggtt->base.inactive_list, vm_link)
		if (i915_vma_is_pinned(vma))
			pinned += vma->node.size;
	mutex_unlock(&dev->struct_mutex);

	args->aper_size = ggtt->base.total;
	args->aper_available_size = args->aper_size - pinned;

	return 0;
}

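/*
 * "phys" objects back a GEM object with a single contiguous DMA
 * allocation instead of individual shmemfs pages; the shmem contents are
 * copied in here, and copied back out (if dirtied) when the pages are
 * released in i915_gem_object_put_pages_phys().
 */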
static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
{
	struct address_space *mapping = obj->base.filp->f_mapping;
	drm_dma_handle_t *phys;
	struct sg_table *st;
	struct scatterlist *sg;
	char *vaddr;
	int i;
	int err;

	if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
		return -EINVAL;

	/* Always aligning to the object size, allows a single allocation
	 * to handle all possible callers, and given typical object sizes,
	 * the alignment of the buddy allocation will naturally match.
	 */
	phys = drm_pci_alloc(obj->base.dev,
			     roundup_pow_of_two(obj->base.size),
			     roundup_pow_of_two(obj->base.size));
	if (!phys)
		return -ENOMEM;

	vaddr = phys->vaddr;
	for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
		struct page *page;
		char *src;

		page = shmem_read_mapping_page(mapping, i);
		if (IS_ERR(page)) {
			err = PTR_ERR(page);
			goto err_phys;
		}

		src = kmap_atomic(page);
		memcpy(vaddr, src, PAGE_SIZE);
		drm_clflush_virt_range(vaddr, PAGE_SIZE);
		kunmap_atomic(src);

		put_page(page);
		vaddr += PAGE_SIZE;
	}

	i915_gem_chipset_flush(to_i915(obj->base.dev));

	st = kmalloc(sizeof(*st), GFP_KERNEL);
	if (!st) {
		err = -ENOMEM;
		goto err_phys;
	}

	if (sg_alloc_table(st, 1, GFP_KERNEL)) {
		kfree(st);
		err = -ENOMEM;
		goto err_phys;
	}

	sg = st->sgl;
	sg->offset = 0;
	sg->length = obj->base.size;

	sg_dma_address(sg) = phys->busaddr;
	sg_dma_len(sg) = obj->base.size;

	obj->phys_handle = phys;

	__i915_gem_object_set_pages(obj, st, sg->length);

	return 0;

err_phys:
	drm_pci_free(obj->base.dev, phys);

	return err;
}

static void __start_cpu_write(struct drm_i915_gem_object *obj)
{
	obj->read_domains = I915_GEM_DOMAIN_CPU;
	obj->write_domain = I915_GEM_DOMAIN_CPU;
	if (cpu_write_needs_clflush(obj))
		obj->cache_dirty = true;
}

static void
__i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
				struct sg_table *pages,
				bool needs_clflush)
{
	GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);

	if (obj->mm.madv == I915_MADV_DONTNEED)
		obj->mm.dirty = false;

	if (needs_clflush &&
	    (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
	    !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
		drm_clflush_sg(pages);

	__start_cpu_write(obj);
}

static void
i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
			       struct sg_table *pages)
{
	__i915_gem_object_release_shmem(obj, pages, false);

	if (obj->mm.dirty) {
		struct address_space *mapping = obj->base.filp->f_mapping;
		char *vaddr = obj->phys_handle->vaddr;
		int i;

		for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
			struct page *page;
			char *dst;

			page = shmem_read_mapping_page(mapping, i);
			if (IS_ERR(page))
				continue;

			dst = kmap_atomic(page);
			drm_clflush_virt_range(vaddr, PAGE_SIZE);
			memcpy(dst, vaddr, PAGE_SIZE);
			kunmap_atomic(dst);

			set_page_dirty(page);
			if (obj->mm.madv == I915_MADV_WILLNEED)
				mark_page_accessed(page);
			put_page(page);
			vaddr += PAGE_SIZE;
		}
		obj->mm.dirty = false;
	}

	sg_free_table(pages);
	kfree(pages);

	drm_pci_free(obj->base.dev, obj->phys_handle);
}

static void
i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
{
	i915_gem_object_unpin_pages(obj);
}

static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
	.get_pages = i915_gem_object_get_pages_phys,
	.put_pages = i915_gem_object_put_pages_phys,
	.release = i915_gem_object_release_phys,
};

static const struct drm_i915_gem_object_ops i915_gem_object_ops;

int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
{
	struct i915_vma *vma;
	LIST_HEAD(still_in_list);
	int ret;

	lockdep_assert_held(&obj->base.dev->struct_mutex);

	/* Closed vma are removed from the obj->vma_list - but they may
	 * still have an active binding on the object. To remove those we
	 * must wait for all rendering to complete to the object (as unbinding
	 * must anyway), and retire the requests.
	 */
	ret = i915_gem_object_set_to_cpu_domain(obj, false);
	if (ret)
		return ret;

	while ((vma = list_first_entry_or_null(&obj->vma_list,
					       struct i915_vma,
					       obj_link))) {
		list_move_tail(&vma->obj_link, &still_in_list);
		ret = i915_vma_unbind(vma);
		if (ret)
			break;
	}
	list_splice(&still_in_list, &obj->vma_list);

	return ret;
}

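/*
 * Wait upon a single fence, unwrapping it to the underlying i915 request
 * where possible so that we can apply a waitboost before blocking.
 */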
static long
i915_gem_object_wait_fence(struct dma_fence *fence,
			   unsigned int flags,
			   long timeout,
			   struct intel_rps_client *rps_client)
{
	struct i915_request *rq;

	BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);

	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
		return timeout;

	if (!dma_fence_is_i915(fence))
		return dma_fence_wait_timeout(fence,
					      flags & I915_WAIT_INTERRUPTIBLE,
					      timeout);

	rq = to_request(fence);
	if (i915_request_completed(rq))
		goto out;

	/*
	 * This client is about to stall waiting for the GPU. In many cases
	 * this is undesirable and limits the throughput of the system, as
	 * many clients cannot continue processing user input/output whilst
	 * blocked. RPS autotuning may take tens of milliseconds to respond
	 * to the GPU load and thus incurs additional latency for the client.
	 * We can circumvent that by promoting the GPU frequency to maximum
	 * before we wait. This makes the GPU throttle up much more quickly
	 * (good for benchmarks and user experience, e.g. window animations),
	 * but at a cost of spending more power processing the workload
	 * (bad for battery). Not all clients even want their results
	 * immediately and for them we should just let the GPU select its own
	 * frequency to maximise efficiency. To prevent a single client from
	 * forcing the clocks too high for the whole system, we only allow
	 * each client to waitboost once in a busy period.
	 */
	if (rps_client && !i915_request_started(rq)) {
		if (INTEL_GEN(rq->i915) >= 6)
			gen6_rps_boost(rq, rps_client);
	}

	timeout = i915_request_wait(rq, flags, timeout);

out:
	if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
		i915_request_retire_upto(rq);

	return timeout;
}

static long
i915_gem_object_wait_reservation(struct reservation_object *resv,
				 unsigned int flags,
				 long timeout,
				 struct intel_rps_client *rps_client)
{
	unsigned int seq = __read_seqcount_begin(&resv->seq);
	struct dma_fence *excl;
	bool prune_fences = false;

	if (flags & I915_WAIT_ALL) {
		struct dma_fence **shared;
		unsigned int count, i;
		int ret;

		ret = reservation_object_get_fences_rcu(resv,
							&excl, &count, &shared);
		if (ret)
			return ret;

		for (i = 0; i < count; i++) {
			timeout = i915_gem_object_wait_fence(shared[i],
							     flags, timeout,
							     rps_client);
			if (timeout < 0)
				break;

			dma_fence_put(shared[i]);
		}

		for (; i < count; i++)
			dma_fence_put(shared[i]);
		kfree(shared);

		/*
		 * If both shared fences and an exclusive fence exist,
		 * then by construction the shared fences must be later
		 * than the exclusive fence. If we successfully wait for
		 * all the shared fences, we know that the exclusive fence
		 * must all be signaled. If all the shared fences are
		 * signaled, we can prune the array and recover the
		 * floating references on the fences/requests.
		 */
		prune_fences = count && timeout >= 0;
	} else {
		excl = reservation_object_get_excl_rcu(resv);
	}

	if (excl && timeout >= 0)
		timeout = i915_gem_object_wait_fence(excl, flags, timeout,
						     rps_client);

	dma_fence_put(excl);

	/*
	 * Opportunistically prune the fences iff we know they have *all* been
	 * signaled and that the reservation object has not been changed (i.e.
	 * no new fences have been added).
	 */
	if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
		if (reservation_object_trylock(resv)) {
			if (!__read_seqcount_retry(&resv->seq, seq))
				reservation_object_add_excl_fence(resv, NULL);
			reservation_object_unlock(resv);
		}
	}

	return timeout;
}

static void __fence_set_priority(struct dma_fence *fence,
				 const struct i915_sched_attr *attr)
{
	struct i915_request *rq;
	struct intel_engine_cs *engine;

	if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
		return;

	rq = to_request(fence);
	engine = rq->engine;

	local_bh_disable();
	rcu_read_lock(); /* RCU serialisation for set-wedged protection */
	if (engine->schedule)
		engine->schedule(rq, attr);
	rcu_read_unlock();
	local_bh_enable(); /* kick the tasklets if queues were reprioritised */
}

static void fence_set_priority(struct dma_fence *fence,
			       const struct i915_sched_attr *attr)
{
	/* Recurse once into a fence-array */
	if (dma_fence_is_array(fence)) {
		struct dma_fence_array *array = to_dma_fence_array(fence);
		int i;

		for (i = 0; i < array->num_fences; i++)
			__fence_set_priority(array->fences[i], attr);
	} else {
		__fence_set_priority(fence, attr);
	}
}

int
i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
			      unsigned int flags,
			      const struct i915_sched_attr *attr)
{
	struct dma_fence *excl;

	if (flags & I915_WAIT_ALL) {
		struct dma_fence **shared;
		unsigned int count, i;
		int ret;

		ret = reservation_object_get_fences_rcu(obj->resv,
							&excl, &count, &shared);
		if (ret)
			return ret;

		for (i = 0; i < count; i++) {
			fence_set_priority(shared[i], attr);
			dma_fence_put(shared[i]);
		}

		kfree(shared);
	} else {
		excl = reservation_object_get_excl_rcu(obj->resv);
	}

	if (excl) {
		fence_set_priority(excl, attr);
		dma_fence_put(excl);
	}
	return 0;
}

/**
 * Waits for rendering to the object to be completed
 * @obj: i915 gem object
 * @flags: how to wait (under a lock, for all rendering or just for writes etc)
 * @timeout: how long to wait
 * @rps_client: client (user process) to charge for any waitboosting
 */
int
i915_gem_object_wait(struct drm_i915_gem_object *obj,
		     unsigned int flags,
		     long timeout,
		     struct intel_rps_client *rps_client)
{
	might_sleep();
#if IS_ENABLED(CONFIG_LOCKDEP)
	GEM_BUG_ON(debug_locks &&
		   !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
		   !!(flags & I915_WAIT_LOCKED));
#endif
	GEM_BUG_ON(timeout < 0);

	timeout = i915_gem_object_wait_reservation(obj->resv,
						   flags, timeout,
						   rps_client);
	return timeout < 0 ? timeout : 0;
}

static struct intel_rps_client *to_rps_client(struct drm_file *file)
{
	struct drm_i915_file_private *fpriv = file->driver_priv;

	return &fpriv->rps_client;
}

static int
i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
		     struct drm_i915_gem_pwrite *args,
		     struct drm_file *file)
{
	void *vaddr = obj->phys_handle->vaddr + args->offset;
	char __user *user_data = u64_to_user_ptr(args->data_ptr);

	/* We manually control the domain here and pretend that it
	 * remains coherent i.e. in the GTT domain, like shmem_pwrite.
	 */
	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
	if (copy_from_user(vaddr, user_data, args->size))
		return -EFAULT;

	drm_clflush_virt_range(vaddr, args->size);
	i915_gem_chipset_flush(to_i915(obj->base.dev));

	intel_fb_obj_flush(obj, ORIGIN_CPU);
	return 0;
}

void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
{
	return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
}

void i915_gem_object_free(struct drm_i915_gem_object *obj)
{
	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
	kmem_cache_free(dev_priv->objects, obj);
}

static int
i915_gem_create(struct drm_file *file,
		struct drm_i915_private *dev_priv,
		uint64_t size,
		uint32_t *handle_p)
{
	struct drm_i915_gem_object *obj;
	int ret;
	u32 handle;

	size = roundup(size, PAGE_SIZE);
	if (size == 0)
		return -EINVAL;

	/* Allocate the new object */
	obj = i915_gem_object_create(dev_priv, size);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	ret = drm_gem_handle_create(file, &obj->base, &handle);
	/* drop reference from allocate - handle holds it now */
	i915_gem_object_put(obj);
	if (ret)
		return ret;

	*handle_p = handle;
	return 0;
}

int
i915_gem_dumb_create(struct drm_file *file,
		     struct drm_device *dev,
		     struct drm_mode_create_dumb *args)
{
	/* have to work out size/pitch and return them */
	args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
	args->size = args->pitch * args->height;
	return i915_gem_create(file, to_i915(dev),
			       args->size, &args->handle);
}

static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
{
	return !(obj->cache_level == I915_CACHE_NONE ||
		 obj->cache_level == I915_CACHE_WT);
}

/**
 * Creates a new mm object and returns a handle to it.
 * @dev: drm device pointer
 * @data: ioctl data blob
 * @file: drm file pointer
 */
int
i915_gem_create_ioctl(struct drm_device *dev, void *data,
		      struct drm_file *file)
{
	struct drm_i915_private *dev_priv = to_i915(dev);
	struct drm_i915_gem_create *args = data;

	i915_gem_flush_free_objects(dev_priv);

	return i915_gem_create(file, dev_priv,
			       args->size, &args->handle);
}

static inline enum fb_op_origin
fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
{
	return (domain == I915_GEM_DOMAIN_GTT ?
		obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
}

void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
{
	/*
	 * No actual flushing is required for the GTT write domain for reads
	 * from the GTT domain. Writes to it "immediately" go to main memory
	 * as far as we know, so there's no chipset flush. It also doesn't
	 * land in the GPU render cache.
	 *
	 * However, we do have to enforce the order so that all writes through
	 * the GTT land before any writes to the device, such as updates to
	 * the GATT itself.
	 *
	 * We also have to wait a bit for the writes to land from the GTT.
	 * An uncached read (i.e. mmio) seems to be ideal for the round-trip
	 * timing. This issue has only been observed when switching quickly
	 * between GTT writes and CPU reads from inside the kernel on recent hw,
	 * and it appears to only affect discrete GTT blocks (i.e. on LLC
	 * system agents we cannot reproduce this behaviour, until Cannonlake
	 * that was!).
	 */

	wmb();

	intel_runtime_pm_get(dev_priv);
	spin_lock_irq(&dev_priv->uncore.lock);

	POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));

	spin_unlock_irq(&dev_priv->uncore.lock);
	intel_runtime_pm_put(dev_priv);
}

static void
flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
{
	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
	struct i915_vma *vma;

	if (!(obj->write_domain & flush_domains))
		return;

	switch (obj->write_domain) {
	case I915_GEM_DOMAIN_GTT:
		i915_gem_flush_ggtt_writes(dev_priv);

		intel_fb_obj_flush(obj,
				   fb_write_origin(obj, I915_GEM_DOMAIN_GTT));

		for_each_ggtt_vma(vma, obj) {
			if (vma->iomap)
				continue;

			i915_vma_unset_ggtt_write(vma);
		}
		break;

	case I915_GEM_DOMAIN_CPU:
		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
		break;

	case I915_GEM_DOMAIN_RENDER:
		if (gpu_write_needs_clflush(obj))
			obj->cache_dirty = true;
		break;
	}

	obj->write_domain = 0;
}

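/*
 * Bit-17 swizzling exchanges the two 64-byte cachelines within each
 * 128-byte span, so XORing the offset with 64 maps between the linear
 * and swizzled locations (e.g. byte 0 <-> byte 64); the copy below walks
 * one cacheline at a time so that each chunk stays within one half.
 */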
static inline int
__copy_to_user_swizzled(char __user *cpu_vaddr,
			const char *gpu_vaddr, int gpu_offset,
			int length)
{
	int ret, cpu_offset = 0;

	while (length > 0) {
		int cacheline_end = ALIGN(gpu_offset + 1, 64);
		int this_length = min(cacheline_end - gpu_offset, length);
		int swizzled_gpu_offset = gpu_offset ^ 64;

		ret = __copy_to_user(cpu_vaddr + cpu_offset,
				     gpu_vaddr + swizzled_gpu_offset,
				     this_length);
		if (ret)
			return ret + length;

		cpu_offset += this_length;
		gpu_offset += this_length;
		length -= this_length;
	}

	return 0;
}

static inline int
__copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
			  const char __user *cpu_vaddr,
			  int length)
{
	int ret, cpu_offset = 0;

	while (length > 0) {
		int cacheline_end = ALIGN(gpu_offset + 1, 64);
		int this_length = min(cacheline_end - gpu_offset, length);
		int swizzled_gpu_offset = gpu_offset ^ 64;

		ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
				       cpu_vaddr + cpu_offset,
				       this_length);
		if (ret)
			return ret + length;

		cpu_offset += this_length;
		gpu_offset += this_length;
		length -= this_length;
	}

	return 0;
}

/*
 * Pins the specified object's pages and synchronizes the object with
 * GPU accesses. Sets needs_clflush to non-zero if the caller should
 * flush the object from the CPU cache.
 */
int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
				    unsigned int *needs_clflush)
{
	int ret;

	lockdep_assert_held(&obj->base.dev->struct_mutex);

	*needs_clflush = 0;
	if (!i915_gem_object_has_struct_page(obj))
		return -ENODEV;

	ret = i915_gem_object_wait(obj,
				   I915_WAIT_INTERRUPTIBLE |
				   I915_WAIT_LOCKED,
				   MAX_SCHEDULE_TIMEOUT,
				   NULL);
	if (ret)
		return ret;

	ret = i915_gem_object_pin_pages(obj);
	if (ret)
		return ret;

	if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
	    !static_cpu_has(X86_FEATURE_CLFLUSH)) {
		ret = i915_gem_object_set_to_cpu_domain(obj, false);
		if (ret)
			goto err_unpin;
		else
			goto out;
	}

	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

	/* If we're not in the cpu read domain, set ourself into the gtt
	 * read domain and manually flush cachelines (if required). This
	 * optimizes for the case when the gpu will dirty the data
	 * anyway again before the next pread happens.
	 */
	if (!obj->cache_dirty &&
	    !(obj->read_domains & I915_GEM_DOMAIN_CPU))
		*needs_clflush = CLFLUSH_BEFORE;

out:
	/* return with the pages pinned */
	return 0;

err_unpin:
	i915_gem_object_unpin_pages(obj);
	return ret;
}

int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
				     unsigned int *needs_clflush)
{
	int ret;

	lockdep_assert_held(&obj->base.dev->struct_mutex);

	*needs_clflush = 0;
	if (!i915_gem_object_has_struct_page(obj))
		return -ENODEV;

	ret = i915_gem_object_wait(obj,
				   I915_WAIT_INTERRUPTIBLE |
				   I915_WAIT_LOCKED |
				   I915_WAIT_ALL,
				   MAX_SCHEDULE_TIMEOUT,
				   NULL);
	if (ret)
		return ret;

	ret = i915_gem_object_pin_pages(obj);
	if (ret)
		return ret;

	if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
	    !static_cpu_has(X86_FEATURE_CLFLUSH)) {
		ret = i915_gem_object_set_to_cpu_domain(obj, true);
		if (ret)
			goto err_unpin;
		else
			goto out;
	}

	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

	/* If we're not in the cpu write domain, set ourself into the
	 * gtt write domain and manually flush cachelines (as required).
	 * This optimizes for the case when the gpu will use the data
	 * right away and we therefore have to clflush anyway.
	 */
	if (!obj->cache_dirty) {
		*needs_clflush |= CLFLUSH_AFTER;

		/*
		 * Same trick applies to invalidate partially written
		 * cachelines read before writing.
		 */
		if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
			*needs_clflush |= CLFLUSH_BEFORE;
	}

out:
	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
	obj->mm.dirty = true;
	/* return with the pages pinned */
	return 0;

err_unpin:
	i915_gem_object_unpin_pages(obj);
	return ret;
}

static void
shmem_clflush_swizzled_range(char *addr, unsigned long length,
			     bool swizzled)
{
	if (unlikely(swizzled)) {
		unsigned long start = (unsigned long) addr;
		unsigned long end = (unsigned long) addr + length;

		/* For swizzling simply ensure that we always flush both
		 * channels. Lame, but simple and it works. Swizzled
		 * pwrite/pread is far from a hotpath - current userspace
		 * doesn't use it at all. */
		start = round_down(start, 128);
		end = round_up(end, 128);

		drm_clflush_virt_range((void *)start, end - start);
	} else {
		drm_clflush_virt_range(addr, length);
	}
}

/* Only difference to the fast-path function is that this can handle bit17
 * and uses non-atomic copy and kmap functions. */
static int
shmem_pread_slow(struct page *page, int offset, int length,
		 char __user *user_data,
		 bool page_do_bit17_swizzling, bool needs_clflush)
{
	char *vaddr;
	int ret;

	vaddr = kmap(page);
	if (needs_clflush)
		shmem_clflush_swizzled_range(vaddr + offset, length,
					     page_do_bit17_swizzling);

	if (page_do_bit17_swizzling)
		ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
	else
		ret = __copy_to_user(user_data, vaddr + offset, length);
	kunmap(page);

	return ret ? -EFAULT : 0;
}

static int
shmem_pread(struct page *page, int offset, int length, char __user *user_data,
	    bool page_do_bit17_swizzling, bool needs_clflush)
{
	int ret;

	ret = -ENODEV;
	if (!page_do_bit17_swizzling) {
		char *vaddr = kmap_atomic(page);

		if (needs_clflush)
			drm_clflush_virt_range(vaddr + offset, length);
		ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
		kunmap_atomic(vaddr);
	}
	if (ret == 0)
		return 0;

	return shmem_pread_slow(page, offset, length, user_data,
				page_do_bit17_swizzling, needs_clflush);
}

static int
i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
		     struct drm_i915_gem_pread *args)
{
	char __user *user_data;
	u64 remain;
	unsigned int obj_do_bit17_swizzling;
	unsigned int needs_clflush;
	unsigned int idx, offset;
	int ret;

	obj_do_bit17_swizzling = 0;
	if (i915_gem_object_needs_bit17_swizzle(obj))
		obj_do_bit17_swizzling = BIT(17);

	ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
	if (ret)
		return ret;

	ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
	mutex_unlock(&obj->base.dev->struct_mutex);
	if (ret)
		return ret;

	remain = args->size;
	user_data = u64_to_user_ptr(args->data_ptr);
	offset = offset_in_page(args->offset);
	for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
		struct page *page = i915_gem_object_get_page(obj, idx);
		int length;

		length = remain;
		if (offset + length > PAGE_SIZE)
			length = PAGE_SIZE - offset;

		ret = shmem_pread(page, offset, length, user_data,
				  page_to_phys(page) & obj_do_bit17_swizzling,
				  needs_clflush);
		if (ret)
			break;

		remain -= length;
		user_data += length;
		offset = 0;
	}

	i915_gem_obj_finish_shmem_access(obj);
	return ret;
}

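/*
 * Copy out of the GGTT aperture using an atomic WC mapping first; if the
 * user page is not resident, fall back to a sleeping mapping and a
 * faulting copy_to_user().
 */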
static inline bool
gtt_user_read(struct io_mapping *mapping,
	      loff_t base, int offset,
	      char __user *user_data, int length)
{
	void __iomem *vaddr;
	unsigned long unwritten;

	/* We can use the cpu mem copy function because this is X86. */
	vaddr = io_mapping_map_atomic_wc(mapping, base);
	unwritten = __copy_to_user_inatomic(user_data,
					    (void __force *)vaddr + offset,
					    length);
	io_mapping_unmap_atomic(vaddr);
	if (unwritten) {
		vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
		unwritten = copy_to_user(user_data,
					 (void __force *)vaddr + offset,
					 length);
		io_mapping_unmap(vaddr);
	}
	return unwritten;
}

static int
i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
		   const struct drm_i915_gem_pread *args)
{
	struct drm_i915_private *i915 = to_i915(obj->base.dev);
	struct i915_ggtt *ggtt = &i915->ggtt;
	struct drm_mm_node node;
	struct i915_vma *vma;
	void __user *user_data;
	u64 remain, offset;
	int ret;

	ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
	if (ret)
		return ret;

	intel_runtime_pm_get(i915);
	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
				       PIN_MAPPABLE |
				       PIN_NONFAULT |
				       PIN_NONBLOCK);
	if (!IS_ERR(vma)) {
		node.start = i915_ggtt_offset(vma);
		node.allocated = false;
		ret = i915_vma_put_fence(vma);
		if (ret) {
			i915_vma_unpin(vma);
			vma = ERR_PTR(ret);
		}
	}
	if (IS_ERR(vma)) {
		ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
		if (ret)
			goto out_unlock;
		GEM_BUG_ON(!node.allocated);
	}

	ret = i915_gem_object_set_to_gtt_domain(obj, false);
	if (ret)
		goto out_unpin;

	mutex_unlock(&i915->drm.struct_mutex);

	user_data = u64_to_user_ptr(args->data_ptr);
	remain = args->size;
	offset = args->offset;

	while (remain > 0) {
		/* Operation in this page
		 *
		 * page_base = page offset within aperture
		 * page_offset = offset within page
		 * page_length = bytes to copy for this page
		 */
		u32 page_base = node.start;
		unsigned page_offset = offset_in_page(offset);
		unsigned page_length = PAGE_SIZE - page_offset;
		page_length = remain < page_length ? remain : page_length;
		if (node.allocated) {
			wmb();
			ggtt->base.insert_page(&ggtt->base,
					       i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
					       node.start, I915_CACHE_NONE, 0);
			wmb();
		} else {
			page_base += offset & PAGE_MASK;
		}

		if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
				  user_data, page_length)) {
			ret = -EFAULT;
			break;
		}

		remain -= page_length;
		user_data += page_length;
		offset += page_length;
	}

	mutex_lock(&i915->drm.struct_mutex);
out_unpin:
	if (node.allocated) {
		wmb();
		ggtt->base.clear_range(&ggtt->base,
				       node.start, node.size);
		remove_mappable_node(&node);
	} else {
		i915_vma_unpin(vma);
	}
out_unlock:
	intel_runtime_pm_put(i915);
	mutex_unlock(&i915->drm.struct_mutex);

	return ret;
}

/**
 * Reads data from the object referenced by handle.
 * @dev: drm device pointer
 * @data: ioctl data blob
 * @file: drm file pointer
 *
 * On error, the contents of *data are undefined.
 */
int
i915_gem_pread_ioctl(struct drm_device *dev, void *data,
		     struct drm_file *file)
{
	struct drm_i915_gem_pread *args = data;
	struct drm_i915_gem_object *obj;
	int ret;

	if (args->size == 0)
		return 0;

	if (!access_ok(VERIFY_WRITE,
		       u64_to_user_ptr(args->data_ptr),
		       args->size))
		return -EFAULT;

	obj = i915_gem_object_lookup(file, args->handle);
	if (!obj)
		return -ENOENT;

	/* Bounds check source. */
	if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
		ret = -EINVAL;
		goto out;
	}

	trace_i915_gem_object_pread(obj, args->offset, args->size);

	ret = i915_gem_object_wait(obj,
				   I915_WAIT_INTERRUPTIBLE,
				   MAX_SCHEDULE_TIMEOUT,
				   to_rps_client(file));
	if (ret)
		goto out;

	ret = i915_gem_object_pin_pages(obj);
	if (ret)
		goto out;

	ret = i915_gem_shmem_pread(obj, args);
	if (ret == -EFAULT || ret == -ENODEV)
		ret = i915_gem_gtt_pread(obj, args);

	i915_gem_object_unpin_pages(obj);
out:
	i915_gem_object_put(obj);
	return ret;
}

/* This is the fast write path which cannot handle
 * page faults in the source data
 */

static inline bool
ggtt_write(struct io_mapping *mapping,
	   loff_t base, int offset,
	   char __user *user_data, int length)
{
	void __iomem *vaddr;
	unsigned long unwritten;

	/* We can use the cpu mem copy function because this is X86. */
	vaddr = io_mapping_map_atomic_wc(mapping, base);
	unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
						      user_data, length);
	io_mapping_unmap_atomic(vaddr);
	if (unwritten) {
		vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
		unwritten = copy_from_user((void __force *)vaddr + offset,
					   user_data, length);
		io_mapping_unmap(vaddr);
	}

	return unwritten;
}

/**
 * This is the fast pwrite path, where we copy the data directly from the
 * user into the GTT, uncached.
 * @obj: i915 GEM object
 * @args: pwrite arguments structure
 */
static int
i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
			 const struct drm_i915_gem_pwrite *args)
{
	struct drm_i915_private *i915 = to_i915(obj->base.dev);
	struct i915_ggtt *ggtt = &i915->ggtt;
	struct drm_mm_node node;
	struct i915_vma *vma;
	u64 remain, offset;
	void __user *user_data;
	int ret;

	ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
	if (ret)
		return ret;

	if (i915_gem_object_has_struct_page(obj)) {
		/*
		 * Avoid waking the device up if we can fallback, as
		 * waking/resuming is very slow (worst-case 10-100 ms
		 * depending on PCI sleeps and our own resume time).
		 * This easily dwarfs any performance advantage from
		 * using the cache bypass of indirect GGTT access.
		 */
		if (!intel_runtime_pm_get_if_in_use(i915)) {
			ret = -EFAULT;
			goto out_unlock;
		}
	} else {
		/* No backing pages, no fallback, we must force GGTT access */
		intel_runtime_pm_get(i915);
	}

	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
				       PIN_MAPPABLE |
				       PIN_NONFAULT |
				       PIN_NONBLOCK);
	if (!IS_ERR(vma)) {
		node.start = i915_ggtt_offset(vma);
		node.allocated = false;
		ret = i915_vma_put_fence(vma);
		if (ret) {
			i915_vma_unpin(vma);
			vma = ERR_PTR(ret);
		}
	}
	if (IS_ERR(vma)) {
		ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
		if (ret)
			goto out_rpm;
		GEM_BUG_ON(!node.allocated);
	}

	ret = i915_gem_object_set_to_gtt_domain(obj, true);
	if (ret)
		goto out_unpin;

	mutex_unlock(&i915->drm.struct_mutex);

	intel_fb_obj_invalidate(obj, ORIGIN_CPU);

	user_data = u64_to_user_ptr(args->data_ptr);
	offset = args->offset;
	remain = args->size;
	while (remain) {
		/* Operation in this page
		 *
		 * page_base = page offset within aperture
		 * page_offset = offset within page
		 * page_length = bytes to copy for this page
		 */
		u32 page_base = node.start;
		unsigned int page_offset = offset_in_page(offset);
		unsigned int page_length = PAGE_SIZE - page_offset;
		page_length = remain < page_length ? remain : page_length;
		if (node.allocated) {
			wmb(); /* flush the write before we modify the GGTT */
			ggtt->base.insert_page(&ggtt->base,
					       i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
					       node.start, I915_CACHE_NONE, 0);
			wmb(); /* flush modifications to the GGTT (insert_page) */
		} else {
			page_base += offset & PAGE_MASK;
		}
		/* If we get a fault while copying data, then (presumably) our
		 * source page isn't available. Return the error and we'll
		 * retry in the slow path.
		 * If the object is non-shmem backed, we retry again with the
		 * path that handles page fault.
		 */
		if (ggtt_write(&ggtt->iomap, page_base, page_offset,
			       user_data, page_length)) {
			ret = -EFAULT;
			break;
		}

		remain -= page_length;
		user_data += page_length;
		offset += page_length;
	}
	intel_fb_obj_flush(obj, ORIGIN_CPU);

	mutex_lock(&i915->drm.struct_mutex);
out_unpin:
	if (node.allocated) {
		wmb();
		ggtt->base.clear_range(&ggtt->base,
				       node.start, node.size);
		remove_mappable_node(&node);
	} else {
		i915_vma_unpin(vma);
	}
out_rpm:
	intel_runtime_pm_put(i915);
out_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return ret;
}

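/* Slow-path counterpart to shmem_pwrite(): handles bit17 swizzling and
 * uses the sleeping kmap()/copy_from_user() variants. */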
static int
shmem_pwrite_slow(struct page *page, int offset, int length,
		  char __user *user_data,
		  bool page_do_bit17_swizzling,
		  bool needs_clflush_before,
		  bool needs_clflush_after)
{
	char *vaddr;
	int ret;

	vaddr = kmap(page);
	if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
		shmem_clflush_swizzled_range(vaddr + offset, length,
					     page_do_bit17_swizzling);
	if (page_do_bit17_swizzling)
		ret = __copy_from_user_swizzled(vaddr, offset, user_data,
						length);
	else
		ret = __copy_from_user(vaddr + offset, user_data, length);
	if (needs_clflush_after)
		shmem_clflush_swizzled_range(vaddr + offset, length,
					     page_do_bit17_swizzling);
	kunmap(page);

	return ret ? -EFAULT : 0;
}

/* Per-page copy function for the shmem pwrite fastpath.
 * Flushes invalid cachelines before writing to the target if
 * needs_clflush_before is set and flushes out any written cachelines after
 * writing if needs_clflush is set.
 */
static int
shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
	     bool page_do_bit17_swizzling,
	     bool needs_clflush_before,
	     bool needs_clflush_after)
{
	int ret;

	ret = -ENODEV;
	if (!page_do_bit17_swizzling) {
		char *vaddr = kmap_atomic(page);

		if (needs_clflush_before)
			drm_clflush_virt_range(vaddr + offset, len);
		ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
		if (needs_clflush_after)
			drm_clflush_virt_range(vaddr + offset, len);

		kunmap_atomic(vaddr);
	}
	if (ret == 0)
		return ret;

	return shmem_pwrite_slow(page, offset, len, user_data,
				 page_do_bit17_swizzling,
				 needs_clflush_before,
				 needs_clflush_after);
}

static int
i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
		      const struct drm_i915_gem_pwrite *args)
{
	struct drm_i915_private *i915 = to_i915(obj->base.dev);
	void __user *user_data;
	u64 remain;
	unsigned int obj_do_bit17_swizzling;
	unsigned int partial_cacheline_write;
	unsigned int needs_clflush;
	unsigned int offset, idx;
	int ret;

	ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
	if (ret)
		return ret;

	ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
	mutex_unlock(&i915->drm.struct_mutex);
	if (ret)
		return ret;

	obj_do_bit17_swizzling = 0;
	if (i915_gem_object_needs_bit17_swizzle(obj))
		obj_do_bit17_swizzling = BIT(17);

	/* If we don't overwrite a cacheline completely we need to be
	 * careful to have up-to-date data by first clflushing. Don't
	 * overcomplicate things and flush the entire affected range.
	 */
	partial_cacheline_write = 0;
	if (needs_clflush & CLFLUSH_BEFORE)
		partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;

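	/*
	 * With the mask set to clflush_size - 1, a copy only needs the
	 * clflush-before pass when either its start offset or its length
	 * is not cacheline aligned, i.e. (offset | length) & mask != 0,
	 * which is the test applied per page in the loop below.
	 */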
	user_data = u64_to_user_ptr(args->data_ptr);
	remain = args->size;
	offset = offset_in_page(args->offset);
	for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
		struct page *page = i915_gem_object_get_page(obj, idx);
		int length;

		length = remain;
		if (offset + length > PAGE_SIZE)
			length = PAGE_SIZE - offset;

		ret = shmem_pwrite(page, offset, length, user_data,
				   page_to_phys(page) & obj_do_bit17_swizzling,
				   (offset | length) & partial_cacheline_write,
				   needs_clflush & CLFLUSH_AFTER);
		if (ret)
			break;

		remain -= length;
		user_data += length;
		offset = 0;
	}

	intel_fb_obj_flush(obj, ORIGIN_CPU);
	i915_gem_obj_finish_shmem_access(obj);
	return ret;
}

/**
 * Writes data to the object referenced by handle.
 * @dev: drm device
 * @data: ioctl data blob
 * @file: drm file
 *
 * On error, the contents of the buffer that were to be modified are undefined.
 */
int
i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
		      struct drm_file *file)
{
	struct drm_i915_gem_pwrite *args = data;
	struct drm_i915_gem_object *obj;
	int ret;

	if (args->size == 0)
		return 0;

	if (!access_ok(VERIFY_READ,
		       u64_to_user_ptr(args->data_ptr),
		       args->size))
		return -EFAULT;

	obj = i915_gem_object_lookup(file, args->handle);
	if (!obj)
		return -ENOENT;

	/* Bounds check destination. */
	if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
		ret = -EINVAL;
		goto err;
	}

	trace_i915_gem_object_pwrite(obj, args->offset, args->size);

	ret = -ENODEV;
	if (obj->ops->pwrite)
		ret = obj->ops->pwrite(obj, args);
	if (ret != -ENODEV)
		goto err;

	ret = i915_gem_object_wait(obj,
				   I915_WAIT_INTERRUPTIBLE |
				   I915_WAIT_ALL,
				   MAX_SCHEDULE_TIMEOUT,
				   to_rps_client(file));
	if (ret)
		goto err;

	ret = i915_gem_object_pin_pages(obj);
	if (ret)
		goto err;

	ret = -EFAULT;
	/* We can only do the GTT pwrite on untiled buffers, as otherwise
	 * it would end up going through the fenced access, and we'll get
	 * different detiling behavior between reading and writing.
	 * pread/pwrite currently are reading and writing from the CPU
	 * perspective, requiring manual detiling by the client.
	 */
	if (!i915_gem_object_has_struct_page(obj) ||
	    cpu_write_needs_clflush(obj))
		/* Note that the gtt paths might fail with non-page-backed user
		 * pointers (e.g. gtt mappings when moving data between
		 * textures). Fallback to the shmem path in that case.
		 */
		ret = i915_gem_gtt_pwrite_fast(obj, args);

	if (ret == -EFAULT || ret == -ENOSPC) {
		if (obj->phys_handle)
			ret = i915_gem_phys_pwrite(obj, args, file);
		else
			ret = i915_gem_shmem_pwrite(obj, args);
	}

	i915_gem_object_unpin_pages(obj);
err:
	i915_gem_object_put(obj);
	return ret;
}

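/*
 * Move any inactive GGTT vma of the object to the tail of the inactive
 * list, and the object itself onto the appropriate bound/unbound list,
 * so that a recently accessed object becomes the last candidate for
 * eviction.
 */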
static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
{
	struct drm_i915_private *i915;
	struct list_head *list;
	struct i915_vma *vma;

	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));

	for_each_ggtt_vma(vma, obj) {
		if (i915_vma_is_active(vma))
			continue;

		if (!drm_mm_node_allocated(&vma->node))
			continue;

		list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
	}

	i915 = to_i915(obj->base.dev);
	spin_lock(&i915->mm.obj_lock);
	list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
	list_move_tail(&obj->mm.link, list);
	spin_unlock(&i915->mm.obj_lock);
}

/**
 * Called when user space prepares to use an object with the CPU, either
 * through the mmap ioctl's mapping or a GTT mapping.
 * @dev: drm device
 * @data: ioctl data blob
 * @file: drm file
 */
int
i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
			  struct drm_file *file)
{
	struct drm_i915_gem_set_domain *args = data;
	struct drm_i915_gem_object *obj;
	uint32_t read_domains = args->read_domains;
	uint32_t write_domain = args->write_domain;
	int err;

	/* Only handle setting domains to types used by the CPU. */
	if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
		return -EINVAL;

	/* Having something in the write domain implies it's in the read
	 * domain, and only that read domain. Enforce that in the request.
	 */
	if (write_domain != 0 && read_domains != write_domain)
		return -EINVAL;

	obj = i915_gem_object_lookup(file, args->handle);
	if (!obj)
		return -ENOENT;

	/* Try to flush the object off the GPU without holding the lock.
	 * We will repeat the flush holding the lock in the normal manner
	 * to catch cases where we are gazumped.
	 */
	err = i915_gem_object_wait(obj,
				   I915_WAIT_INTERRUPTIBLE |
				   (write_domain ? I915_WAIT_ALL : 0),
				   MAX_SCHEDULE_TIMEOUT,
				   to_rps_client(file));
	if (err)
		goto out;

	/*
	 * Proxy objects do not control access to the backing storage, ergo
	 * they cannot be used as a means to manipulate the cache domain
	 * tracking for that backing storage. The proxy object is always
	 * considered to be outside of any cache domain.
	 */
	if (i915_gem_object_is_proxy(obj)) {
		err = -ENXIO;
		goto out;
	}

	/*
	 * Flush and acquire obj->pages so that we are coherent through
	 * direct access in memory with previous cached writes through
	 * shmemfs and that our cache domain tracking remains valid.
	 * For example, if the obj->filp was moved to swap without us
	 * being notified and releasing the pages, we would mistakenly
	 * continue to assume that the obj remained out of the CPU cached
	 * domain.
	 */
	err = i915_gem_object_pin_pages(obj);
	if (err)
		goto out;

	err = i915_mutex_lock_interruptible(dev);
	if (err)
		goto out_unpin;

	if (read_domains & I915_GEM_DOMAIN_WC)
		err = i915_gem_object_set_to_wc_domain(obj, write_domain);
	else if (read_domains & I915_GEM_DOMAIN_GTT)
		err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
	else
		err = i915_gem_object_set_to_cpu_domain(obj, write_domain);

	/* And bump the LRU for this access */
	i915_gem_object_bump_inactive_ggtt(obj);

	mutex_unlock(&dev->struct_mutex);

	if (write_domain != 0)
		intel_fb_obj_invalidate(obj,
					fb_write_origin(obj, write_domain));

out_unpin:
	i915_gem_object_unpin_pages(obj);
out:
	i915_gem_object_put(obj);
	return err;
}

/**
 * Called when user space has done writes to this buffer
 * @dev: drm device
 * @data: ioctl data blob
 * @file: drm file
 */
int
i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
			 struct drm_file *file)
{
	struct drm_i915_gem_sw_finish *args = data;
	struct drm_i915_gem_object *obj;

	obj = i915_gem_object_lookup(file, args->handle);
	if (!obj)
		return -ENOENT;

	/*
	 * Proxy objects are barred from CPU access, so there is no
	 * need to ban sw_finish as it is a nop.
	 */

	/* Pinned buffers may be scanout, so flush the cache */
	i915_gem_object_flush_if_display(obj);
	i915_gem_object_put(obj);

	return 0;
}

/**
 * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
 * it is mapped to.
 * @dev: drm device
 * @data: ioctl data blob
 * @file: drm file
 *
 * While the mapping holds a reference on the contents of the object, it doesn't
 * imply a ref on the object itself.
 *
 * IMPORTANT:
 *
 * DRM driver writers who look at this function as an example for how to do GEM
 * mmap support, please don't implement mmap support like here. The modern way
 * to implement DRM mmap support is with an mmap offset ioctl (like
 * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
 * That way debug tooling like valgrind will understand what's going on, hiding
 * the mmap call in a driver private ioctl will break that. The i915 driver only
 * does cpu mmaps this way because we didn't know better.
 */
int
i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
		    struct drm_file *file)
{
	struct drm_i915_gem_mmap *args = data;
	struct drm_i915_gem_object *obj;
	unsigned long addr;

	if (args->flags & ~(I915_MMAP_WC))
		return -EINVAL;

	if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
		return -ENODEV;

	obj = i915_gem_object_lookup(file, args->handle);
	if (!obj)
		return -ENOENT;

	/* prime objects have no backing filp to GEM mmap
	 * pages from.
	 */
	if (!obj->base.filp) {
		i915_gem_object_put(obj);
		return -ENXIO;
	}

	addr = vm_mmap(obj->base.filp, 0, args->size,
		       PROT_READ | PROT_WRITE, MAP_SHARED,
		       args->offset);
	if (args->flags & I915_MMAP_WC) {
		struct mm_struct *mm = current->mm;
		struct vm_area_struct *vma;

		if (down_write_killable(&mm->mmap_sem)) {
			i915_gem_object_put(obj);
			return -EINTR;
		}
		vma = find_vma(mm, addr);
		if (vma)
			vma->vm_page_prot =
				pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
		else
			addr = -ENOMEM;
		up_write(&mm->mmap_sem);

		/* This may race, but that's ok, it only gets set */
		WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
	}
	i915_gem_object_put(obj);
	if (IS_ERR((void *)addr))
		return addr;

	args->addr_ptr = (uint64_t) addr;

	return 0;
}

static unsigned int tile_row_pages(struct drm_i915_gem_object *obj)
{
	return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
}

/**
 * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
 *
 * A history of the GTT mmap interface:
 *
 * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to be
 *     aligned and suitable for fencing, and still fit into the available
 *     mappable space left by the pinned display objects. A classic problem
 *     we called the page-fault-of-doom where we would ping-pong between
 *     two objects that could not fit inside the GTT and so the memcpy
 *     would page one object in at the expense of the other between every
 *     single byte.
 *
 * 1 - Objects can be any size, and have any compatible fencing (X, Y, or none
 *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
 *     object is too large for the available space (or simply too large
 *     for the mappable aperture!), a view is created instead and faulted
 *     into userspace. (This view is aligned and sized appropriately for
 *     fenced access.)
 *
 * 2 - Recognise WC as a separate cache domain so that we can flush the
 *     delayed writes via GTT before performing direct access via WC.
 *
 * Restrictions:
 *
 *  * snoopable objects cannot be accessed via the GTT. It can cause machine
 *    hangs on some architectures, corruption on others. An attempt to service
 *    a GTT page fault from a snoopable object will generate a SIGBUS.
 *
 *  * the object must be able to fit into RAM (physical memory, though not
 *    limited to the mappable aperture).
 *
 * Caveats:
 *
 *  * a new GTT page fault will synchronize rendering from the GPU and flush
 *    all data to system memory. Subsequent access will not be synchronized.
 *
 *  * all mappings are revoked on runtime device suspend.
 *
 *  * there are only 8, 16 or 32 fence registers to share between all users
 *    (older machines require fence register for display and blitter access
 *    as well). Contention of the fence registers will cause the previous users
 *    to be unmapped and any new access will generate new page faults.
 *
 *  * running out of memory while servicing a fault may generate a SIGBUS,
 *    rather than the expected SIGSEGV.
 */
int i915_gem_mmap_gtt_version(void)
{
	return 2;
}

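/*
 * Compute a GGTT view covering the faulting chunk of a large object, so
 * that a partial binding can be used when the whole object does not fit
 * into the mappable aperture. The chunk is rounded to tile-row boundaries
 * for tiled objects so the view remains suitable for fenced access.
 */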
1953static inline struct i915_ggtt_view
1954compute_partial_view(struct drm_i915_gem_object *obj,
1955 pgoff_t page_offset,
1956 unsigned int chunk)
1957{
1958 struct i915_ggtt_view view;
1959
1960 if (i915_gem_object_is_tiled(obj))
1961 chunk = roundup(chunk, tile_row_pages(obj));
1962
1963 view.type = I915_GGTT_VIEW_PARTIAL;
1964 view.partial.offset = rounddown(page_offset, chunk);
1965 view.partial.size =
1966 min_t(unsigned int, chunk,
1967 (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1968
1969 /* If the partial covers the entire object, just create a normal VMA. */
1970 if (chunk >= obj->base.size >> PAGE_SHIFT)
1971 view.type = I915_GGTT_VIEW_NORMAL;
1972
1973 return view;
1974}
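
/*
 * Worked example, assuming 4KiB pages and the MIN_CHUNK_PAGES chunk of
 * 256 pages used by i915_gem_fault() below: an untiled 1024-page object
 * faulting at page_offset 700 gets partial.offset = rounddown(700, 256)
 * = 512 and partial.size = min(256, 1024 - 512) = 256, i.e. a 1MiB
 * window guaranteed to contain the faulting page. Only when the chunk
 * covers the entire object does the view degenerate to
 * I915_GGTT_VIEW_NORMAL.
 */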
1975
1976/**
1977 * i915_gem_fault - fault a page into the GTT
1978 * @vmf: fault info
1979 *
1980 * The fault handler is set up by drm_gem_mmap() when an object is GTT mapped
1981 * from userspace. The fault handler takes care of binding the object to
1982 * the GTT (if needed), allocating and programming a fence register (again,
1983 * only if needed based on whether the old reg is still valid or the object
1984 * is tiled) and inserting a new PTE into the faulting process.
1985 *
1986 * Note that the faulting process may involve evicting existing objects
1987 * from the GTT and/or fence registers to make room. So performance may
1988 * suffer if the GTT working set is large or there are few fence registers
1989 * left.
1990 *
1991 * The current feature set supported by i915_gem_fault() and thus GTT mmaps
1992 * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
1993 */
1994int i915_gem_fault(struct vm_fault *vmf)
1995{
1996#define MIN_CHUNK_PAGES ((1 << 20) >> PAGE_SHIFT) /* 1 MiB */
1997 struct vm_area_struct *area = vmf->vma;
1998 struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
1999 struct drm_device *dev = obj->base.dev;
2000 struct drm_i915_private *dev_priv = to_i915(dev);
2001 struct i915_ggtt *ggtt = &dev_priv->ggtt;
2002 bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
2003 struct i915_vma *vma;
2004 pgoff_t page_offset;
2005 int ret;
2006
2007 /* We don't use vmf->pgoff since that has the fake offset */
2008 page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
2009
2010 trace_i915_gem_object_fault(obj, page_offset, true, write);
2011
2012 /* Try to flush the object off the GPU first without holding the lock.
2013 * Upon acquiring the lock, we will perform our sanity checks and then
2014 * repeat the flush holding the lock in the normal manner to catch cases
2015 * where we are gazumped.
2016 */
2017 ret = i915_gem_object_wait(obj,
2018 I915_WAIT_INTERRUPTIBLE,
2019 MAX_SCHEDULE_TIMEOUT,
2020 NULL);
2021 if (ret)
2022 goto err;
2023
2024 ret = i915_gem_object_pin_pages(obj);
2025 if (ret)
2026 goto err;
2027
2028 intel_runtime_pm_get(dev_priv);
2029
2030 ret = i915_mutex_lock_interruptible(dev);
2031 if (ret)
2032 goto err_rpm;
2033
2034 /* Access to snoopable pages through the GTT is incoherent. */
2035 if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
2036 ret = -EFAULT;
2037 goto err_unlock;
2038 }
2039
2040
2041 /* Now pin it into the GTT as needed */
2042 vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
2043 PIN_MAPPABLE |
2044 PIN_NONBLOCK |
2045 PIN_NONFAULT);
2046 if (IS_ERR(vma)) {
2047 /* Use a partial view if it is bigger than available space */
2048 struct i915_ggtt_view view =
2049 compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
2050 unsigned int flags;
2051
2052 flags = PIN_MAPPABLE;
2053 if (view.type == I915_GGTT_VIEW_NORMAL)
2054 flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
2055
2056 /*
2057 * Userspace is now writing through an untracked VMA, abandon
2058 * all hope that the hardware is able to track future writes.
2059 */
2060 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
2061
2062 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2063 if (IS_ERR(vma) && !view.type) {
2064 flags = PIN_MAPPABLE;
2065 view.type = I915_GGTT_VIEW_PARTIAL;
2066 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2067 }
2068 }
2069 if (IS_ERR(vma)) {
2070 ret = PTR_ERR(vma);
2071 goto err_unlock;
2072 }
2073
2074 ret = i915_gem_object_set_to_gtt_domain(obj, write);
2075 if (ret)
2076 goto err_unpin;
2077
2078 ret = i915_vma_pin_fence(vma);
2079 if (ret)
2080 goto err_unpin;
2081
2082 /* Finally, remap it using the new GTT offset */
2083 ret = remap_io_mapping(area,
2084 area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
2085 (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
2086 min_t(u64, vma->size, area->vm_end - area->vm_start),
2087 &ggtt->iomap);
2088 if (ret)
2089 goto err_fence;
2090
2091 /* Mark as being mmapped into userspace for later revocation */
2092 assert_rpm_wakelock_held(dev_priv);
2093 if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
2094 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
2095 GEM_BUG_ON(!obj->userfault_count);
2096
2097 i915_vma_set_ggtt_write(vma);
2098
2099err_fence:
2100 i915_vma_unpin_fence(vma);
2101err_unpin:
2102 __i915_vma_unpin(vma);
2103err_unlock:
2104 mutex_unlock(&dev->struct_mutex);
2105err_rpm:
2106 intel_runtime_pm_put(dev_priv);
2107 i915_gem_object_unpin_pages(obj);
2108err:
2109 switch (ret) {
2110 case -EIO:
2111 /*
2112 * We eat errors when the gpu is terminally wedged to avoid
2113 * userspace unduly crashing (gl has no provisions for mmaps to
2114 * fail). But any other -EIO isn't ours (e.g. swap in failure)
2115 * and so needs to be reported.
2116 */
2117 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
2118 ret = VM_FAULT_SIGBUS;
2119 break;
2120 }
2121 case -EAGAIN:
2122 /*
2123 * EAGAIN means the gpu is hung and we'll wait for the error
2124 * handler to reset everything when re-faulting in
2125 * i915_mutex_lock_interruptible.
2126 */
2127 case 0:
2128 case -ERESTARTSYS:
2129 case -EINTR:
2130 case -EBUSY:
2131 /*
2132 * EBUSY is ok: this just means that another thread
2133 * already did the job.
2134 */
2135 ret = VM_FAULT_NOPAGE;
2136 break;
2137 case -ENOMEM:
2138 ret = VM_FAULT_OOM;
2139 break;
2140 case -ENOSPC:
2141 case -EFAULT:
2142 ret = VM_FAULT_SIGBUS;
2143 break;
2144 default:
2145 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2146 ret = VM_FAULT_SIGBUS;
2147 break;
2148 }
2149 return ret;
2150}
2151
2152static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2153{
2154 struct i915_vma *vma;
2155
2156 GEM_BUG_ON(!obj->userfault_count);
2157
2158 obj->userfault_count = 0;
2159 list_del(&obj->userfault_link);
2160 drm_vma_node_unmap(&obj->base.vma_node,
2161 obj->base.dev->anon_inode->i_mapping);
2162
2163 for_each_ggtt_vma(vma, obj)
2164 i915_vma_unset_userfault(vma);
2165}
2166
2167/**
2168 * i915_gem_release_mmap - remove physical page mappings
2169 * @obj: obj in question
2170 *
2171 * Preserve the reservation of the mmapping with the DRM core code, but
2172 * relinquish ownership of the pages back to the system.
2173 *
2174 * It is vital that we remove the page mapping if we have mapped a tiled
2175 * object through the GTT and then lose the fence register due to
2176 * resource pressure. Similarly if the object has been moved out of the
2177 * aperture, then pages mapped into userspace must be revoked. Removing the
2178 * mapping will then trigger a page fault on the next user access, allowing
2179 * fixup by i915_gem_fault().
2180 */
2181void
2182i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2183{
2184 struct drm_i915_private *i915 = to_i915(obj->base.dev);
2185
2186 /* Serialisation between user GTT access and our code depends upon
2187 * revoking the CPU's PTE whilst the mutex is held. The next user
2188 * pagefault then has to wait until we release the mutex.
2189 *
2190 * Note that RPM complicates somewhat by adding an additional
2191 * requirement that operations to the GGTT be made holding the RPM
2192 * wakeref.
2193 */
2194 lockdep_assert_held(&i915->drm.struct_mutex);
2195 intel_runtime_pm_get(i915);
2196
2197 if (!obj->userfault_count)
2198 goto out;
2199
2200 __i915_gem_object_release_mmap(obj);
2201
2202	 /* Ensure that the CPU's PTEs are revoked and there are no outstanding
2203	 * memory transactions from userspace before we return. The TLB
2204	 * flushing implied by changing the PTEs above *should* be
2205 * sufficient, an extra barrier here just provides us with a bit
2206 * of paranoid documentation about our requirement to serialise
2207 * memory writes before touching registers / GSM.
2208 */
2209 wmb();
2210
2211out:
2212 intel_runtime_pm_put(i915);
2213}
2214
2215void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2216{
2217 struct drm_i915_gem_object *obj, *on;
2218 int i;
2219
2220 /*
2221 * Only called during RPM suspend. All users of the userfault_list
2222 * must be holding an RPM wakeref to ensure that this can not
2223 * run concurrently with themselves (and use the struct_mutex for
2224 * protection between themselves).
2225 */
2226
2227 list_for_each_entry_safe(obj, on,
2228 &dev_priv->mm.userfault_list, userfault_link)
2229 __i915_gem_object_release_mmap(obj);
2230
2231 /* The fence will be lost when the device powers down. If any were
2232 * in use by hardware (i.e. they are pinned), we should not be powering
2233 * down! All other fences will be reacquired by the user upon waking.
2234 */
2235 for (i = 0; i < dev_priv->num_fence_regs; i++) {
2236 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2237
2238 /* Ideally we want to assert that the fence register is not
2239 * live at this point (i.e. that no piece of code will be
2240 * trying to write through fence + GTT, as that both violates
2241 * our tracking of activity and associated locking/barriers,
2242 * but also is illegal given that the hw is powered down).
2243 *
2244 * Previously we used reg->pin_count as a "liveness" indicator.
2245 * That is not sufficient, and we need a more fine-grained
2246 * tool if we want to have a sanity check here.
2247 */
2248
2249 if (!reg->vma)
2250 continue;
2251
2252 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2253 reg->dirty = true;
2254 }
2255}
2256
2257static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2258{
2259 struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2260 int err;
2261
2262 err = drm_gem_create_mmap_offset(&obj->base);
2263 if (likely(!err))
2264 return 0;
2265
2266 /* Attempt to reap some mmap space from dead objects */
2267 do {
2268 err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
2269 if (err)
2270 break;
2271
2272 i915_gem_drain_freed_objects(dev_priv);
2273 err = drm_gem_create_mmap_offset(&obj->base);
2274 if (!err)
2275 break;
2276
2277 } while (flush_delayed_work(&dev_priv->gt.retire_work));
2278
2279 return err;
2280}
2281
2282static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2283{
2284 drm_gem_free_mmap_offset(&obj->base);
2285}
2286
2287int
2288i915_gem_mmap_gtt(struct drm_file *file,
2289 struct drm_device *dev,
2290 uint32_t handle,
2291 uint64_t *offset)
2292{
2293 struct drm_i915_gem_object *obj;
2294 int ret;
2295
2296 obj = i915_gem_object_lookup(file, handle);
2297 if (!obj)
2298 return -ENOENT;
2299
2300 ret = i915_gem_object_create_mmap_offset(obj);
2301 if (ret == 0)
2302 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2303
2304 i915_gem_object_put(obj);
2305 return ret;
2306}
2307
2308/**
2309 * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2310 * @dev: DRM device
2311 * @data: GTT mapping ioctl data
2312 * @file: GEM object info
2313 *
2314 * Simply returns the fake offset to userspace so it can mmap it.
2315 * The mmap call will end up in drm_gem_mmap(), which will set things
2316 * up so we can get faults in the handler above.
2317 *
2318 * The fault handler will take care of binding the object into the GTT
2319 * (since it may have been evicted to make room for something), allocating
2320 * a fence register, and mapping the appropriate aperture address into
2321 * userspace.
2322 */
2323int
2324i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2325 struct drm_file *file)
2326{
2327 struct drm_i915_gem_mmap_gtt *args = data;
2328
2329 return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2330}
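
/*
 * The full userspace flow, sketched (illustrative only; error handling
 * elided, needs <sys/mman.h> and the uapi headers). The fake offset
 * returned by this ioctl is only meaningful as an mmap() offset on the
 * same fd:
 *
 *	struct drm_i915_gem_mmap_gtt arg = { .handle = handle };
 *	void *ptr;
 *
 *	ioctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &arg);
 *	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		   fd, arg.offset);
 *
 * after which reads and writes through ptr take the fault path above.
 */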
2331
2332/* Immediately discard the backing storage */
2333static void
2334i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2335{
2336 i915_gem_object_free_mmap_offset(obj);
2337
2338 if (obj->base.filp == NULL)
2339 return;
2340
2341 /* Our goal here is to return as much of the memory as
2342	 * possible back to the system, as we are called from OOM.
2343 * To do this we must instruct the shmfs to drop all of its
2344 * backing pages, *now*.
2345 */
2346 shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2347 obj->mm.madv = __I915_MADV_PURGED;
2348 obj->mm.pages = ERR_PTR(-EFAULT);
2349}
2350
2351/* Try to discard unwanted pages */
2352void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2353{
2354 struct address_space *mapping;
2355
2356 lockdep_assert_held(&obj->mm.lock);
2357 GEM_BUG_ON(i915_gem_object_has_pages(obj));
2358
2359 switch (obj->mm.madv) {
2360 case I915_MADV_DONTNEED:
2361 i915_gem_object_truncate(obj);
2362 case __I915_MADV_PURGED:
2363 return;
2364 }
2365
2366 if (obj->base.filp == NULL)
2367 return;
2368
2369	mapping = obj->base.filp->f_mapping;
2370 invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2371}
2372
2373static void
2374i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2375 struct sg_table *pages)
2376{
2377 struct sgt_iter sgt_iter;
2378 struct page *page;
2379
2380 __i915_gem_object_release_shmem(obj, pages, true);
2381
2382 i915_gem_gtt_finish_pages(obj, pages);
2383
2384 if (i915_gem_object_needs_bit17_swizzle(obj))
2385 i915_gem_object_save_bit_17_swizzle(obj, pages);
2386
2387 for_each_sgt_page(page, sgt_iter, pages) {
2388 if (obj->mm.dirty)
2389 set_page_dirty(page);
2390
2391 if (obj->mm.madv == I915_MADV_WILLNEED)
2392 mark_page_accessed(page);
2393
2394 put_page(page);
2395 }
2396 obj->mm.dirty = false;
2397
2398 sg_free_table(pages);
2399 kfree(pages);
2400}
2401
2402static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2403{
2404 struct radix_tree_iter iter;
2405 void __rcu **slot;
2406
2407 rcu_read_lock();
2408 radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2409 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2410 rcu_read_unlock();
2411}
2412
2413void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2414 enum i915_mm_subclass subclass)
2415{
2416 struct drm_i915_private *i915 = to_i915(obj->base.dev);
2417 struct sg_table *pages;
2418
2419 if (i915_gem_object_has_pinned_pages(obj))
2420 return;
2421
2422 GEM_BUG_ON(obj->bind_count);
2423 if (!i915_gem_object_has_pages(obj))
2424 return;
2425
2426 /* May be called by shrinker from within get_pages() (on another bo) */
2427 mutex_lock_nested(&obj->mm.lock, subclass);
2428 if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2429 goto unlock;
2430
2431 /* ->put_pages might need to allocate memory for the bit17 swizzle
2432 * array, hence protect them from being reaped by removing them from gtt
2433 * lists early. */
2434 pages = fetch_and_zero(&obj->mm.pages);
2435 GEM_BUG_ON(!pages);
2436
2437 spin_lock(&i915->mm.obj_lock);
2438 list_del(&obj->mm.link);
2439 spin_unlock(&i915->mm.obj_lock);
2440
2441 if (obj->mm.mapping) {
2442 void *ptr;
2443
2444 ptr = page_mask_bits(obj->mm.mapping);
2445 if (is_vmalloc_addr(ptr))
2446 vunmap(ptr);
2447 else
2448 kunmap(kmap_to_page(ptr));
2449
2450 obj->mm.mapping = NULL;
2451 }
2452
2453 __i915_gem_object_reset_page_iter(obj);
2454
2455 if (!IS_ERR(pages))
2456 obj->ops->put_pages(obj, pages);
2457
2458 obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2459
2460unlock:
2461 mutex_unlock(&obj->mm.lock);
2462}
2463
2464static bool i915_sg_trim(struct sg_table *orig_st)
2465{
2466 struct sg_table new_st;
2467 struct scatterlist *sg, *new_sg;
2468 unsigned int i;
2469
2470 if (orig_st->nents == orig_st->orig_nents)
2471 return false;
2472
2473 if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2474 return false;
2475
2476 new_sg = new_st.sgl;
2477 for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2478 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2479 /* called before being DMA mapped, no need to copy sg->dma_* */
2480 new_sg = sg_next(new_sg);
2481 }
2482 GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2483
2484 sg_free_table(orig_st);
2485
2486 *orig_st = new_st;
2487 return true;
2488}
2489
2490static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2491{
2492 struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2493 const unsigned long page_count = obj->base.size / PAGE_SIZE;
2494 unsigned long i;
2495 struct address_space *mapping;
2496 struct sg_table *st;
2497 struct scatterlist *sg;
2498 struct sgt_iter sgt_iter;
2499 struct page *page;
2500 unsigned long last_pfn = 0; /* suppress gcc warning */
2501 unsigned int max_segment = i915_sg_segment_size();
2502 unsigned int sg_page_sizes;
2503 gfp_t noreclaim;
2504 int ret;
2505
2506 /* Assert that the object is not currently in any GPU domain. As it
2507 * wasn't in the GTT, there shouldn't be any way it could have been in
2508 * a GPU cache
2509 */
2510 GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2511 GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2512
2513 st = kmalloc(sizeof(*st), GFP_KERNEL);
2514 if (st == NULL)
2515 return -ENOMEM;
2516
2517rebuild_st:
2518 if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2519 kfree(st);
2520 return -ENOMEM;
2521 }
2522
2523 /* Get the list of pages out of our struct file. They'll be pinned
2524 * at this point until we release them.
2525 *
2526 * Fail silently without starting the shrinker
2527 */
2528 mapping = obj->base.filp->f_mapping;
2529 noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2530 noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2531
2532 sg = st->sgl;
2533 st->nents = 0;
2534 sg_page_sizes = 0;
2535 for (i = 0; i < page_count; i++) {
2536 const unsigned int shrink[] = {
2537 I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2538 0,
2539 }, *s = shrink;
2540 gfp_t gfp = noreclaim;
2541
2542 do {
2543 page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2544 if (likely(!IS_ERR(page)))
2545 break;
2546
2547 if (!*s) {
2548 ret = PTR_ERR(page);
2549 goto err_sg;
2550 }
2551
2552 i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2553 cond_resched();
2554
2555 /* We've tried hard to allocate the memory by reaping
2556 * our own buffer, now let the real VM do its job and
2557 * go down in flames if truly OOM.
2558 *
2559 * However, since graphics tend to be disposable,
2560 * defer the oom here by reporting the ENOMEM back
2561 * to userspace.
2562 */
2563 if (!*s) {
2564 /* reclaim and warn, but no oom */
2565 gfp = mapping_gfp_mask(mapping);
2566
2567 /* Our bo are always dirty and so we require
2568 * kswapd to reclaim our pages (direct reclaim
2569 * does not effectively begin pageout of our
2570 * buffers on its own). However, direct reclaim
2571 * only waits for kswapd when under allocation
2572 * congestion. So as a result __GFP_RECLAIM is
2573 * unreliable and fails to actually reclaim our
2574 * dirty pages -- unless you try over and over
2575 * again with !__GFP_NORETRY. However, we still
2576 * want to fail this allocation rather than
2577 * trigger the out-of-memory killer and for
2578 * this we want __GFP_RETRY_MAYFAIL.
2579 */
2580 gfp |= __GFP_RETRY_MAYFAIL;
2581 }
2582 } while (1);
2583
2584 if (!i ||
2585 sg->length >= max_segment ||
2586 page_to_pfn(page) != last_pfn + 1) {
2587 if (i) {
2588 sg_page_sizes |= sg->length;
2589 sg = sg_next(sg);
2590 }
2591 st->nents++;
2592 sg_set_page(sg, page, PAGE_SIZE, 0);
2593 } else {
2594 sg->length += PAGE_SIZE;
2595 }
2596 last_pfn = page_to_pfn(page);
2597
2598 /* Check that the i965g/gm workaround works. */
2599 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2600 }
2601 if (sg) { /* loop terminated early; short sg table */
2602 sg_page_sizes |= sg->length;
2603 sg_mark_end(sg);
2604 }
2605
2606 /* Trim unused sg entries to avoid wasting memory. */
2607 i915_sg_trim(st);
2608
2609 ret = i915_gem_gtt_prepare_pages(obj, st);
2610 if (ret) {
2611 /* DMA remapping failed? One possible cause is that
2612		 * it could not reserve enough large entries; asking
2613 * for PAGE_SIZE chunks instead may be helpful.
2614 */
2615 if (max_segment > PAGE_SIZE) {
2616 for_each_sgt_page(page, sgt_iter, st)
2617 put_page(page);
2618 sg_free_table(st);
2619
2620 max_segment = PAGE_SIZE;
2621 goto rebuild_st;
2622 } else {
2623 dev_warn(&dev_priv->drm.pdev->dev,
2624 "Failed to DMA remap %lu pages\n",
2625 page_count);
2626 goto err_pages;
2627 }
2628 }
2629
2630 if (i915_gem_object_needs_bit17_swizzle(obj))
2631 i915_gem_object_do_bit_17_swizzle(obj, st);
2632
2633 __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2634
2635 return 0;
2636
2637err_sg:
2638 sg_mark_end(sg);
2639err_pages:
2640 for_each_sgt_page(page, sgt_iter, st)
2641 put_page(page);
2642 sg_free_table(st);
2643 kfree(st);
2644
2645 /* shmemfs first checks if there is enough memory to allocate the page
2646 * and reports ENOSPC should there be insufficient, along with the usual
2647 * ENOMEM for a genuine allocation failure.
2648 *
2649 * We use ENOSPC in our driver to mean that we have run out of aperture
2650 * space and so want to translate the error from shmemfs back to our
2651 * usual understanding of ENOMEM.
2652 */
2653 if (ret == -ENOSPC)
2654 ret = -ENOMEM;
2655
2656 return ret;
2657}
2658
2659void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2660 struct sg_table *pages,
2661 unsigned int sg_page_sizes)
2662{
2663 struct drm_i915_private *i915 = to_i915(obj->base.dev);
2664 unsigned long supported = INTEL_INFO(i915)->page_sizes;
2665 int i;
2666
2667 lockdep_assert_held(&obj->mm.lock);
2668
2669 obj->mm.get_page.sg_pos = pages->sgl;
2670 obj->mm.get_page.sg_idx = 0;
2671
2672 obj->mm.pages = pages;
2673
2674 if (i915_gem_object_is_tiled(obj) &&
2675 i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2676 GEM_BUG_ON(obj->mm.quirked);
2677 __i915_gem_object_pin_pages(obj);
2678 obj->mm.quirked = true;
2679 }
2680
2681 GEM_BUG_ON(!sg_page_sizes);
2682 obj->mm.page_sizes.phys = sg_page_sizes;
2683
2684 /*
2685 * Calculate the supported page-sizes which fit into the given
2686 * sg_page_sizes. This will give us the page-sizes which we may be able
2687 * to use opportunistically when later inserting into the GTT. For
2688 * example if phys=2G, then in theory we should be able to use 1G, 2M,
2689 * 64K or 4K pages, although in practice this will depend on a number of
2690 * other factors.
2691 */
2692 obj->mm.page_sizes.sg = 0;
2693 for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2694 if (obj->mm.page_sizes.phys & ~0u << i)
2695 obj->mm.page_sizes.sg |= BIT(i);
2696 }
2697 GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2698
2699 spin_lock(&i915->mm.obj_lock);
2700 list_add(&obj->mm.link, &i915->mm.unbound_list);
2701 spin_unlock(&i915->mm.obj_lock);
2702}
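
/*
 * Worked example for the loop above: with supported = 4K | 64K | 2M and
 * a scatterlist whose chunk sizes OR together to phys = 2M | 4K (a mix
 * of 2M and 4K chunks), every supported bit at or below 2M sees an
 * equal or larger chunk in phys, so sg = 4K | 64K | 2M. Note that 64K
 * is included even though no chunk is exactly 64K: a 2M chunk can
 * always be inserted as a run of 64K pages.
 */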
2703
2704static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2705{
2706 int err;
2707
2708 if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2709 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2710 return -EFAULT;
2711 }
2712
2713 err = obj->ops->get_pages(obj);
2714 GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2715
2716 return err;
2717}
2718
2719/* Ensure that the associated pages are gathered from the backing storage
2720 * and pinned into our object. i915_gem_object_pin_pages() may be called
2721 * multiple times before they are released by a single call to
2722 * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2723 * either as a result of memory pressure (reaping pages under the shrinker)
2724 * or as the object is itself released.
2725 */
2726int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2727{
2728 int err;
2729
2730 err = mutex_lock_interruptible(&obj->mm.lock);
2731 if (err)
2732 return err;
2733
2734 if (unlikely(!i915_gem_object_has_pages(obj))) {
2735 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2736
2737 err = ____i915_gem_object_get_pages(obj);
2738 if (err)
2739 goto unlock;
2740
2741 smp_mb__before_atomic();
2742 }
2743 atomic_inc(&obj->mm.pages_pin_count);
2744
2745unlock:
2746 mutex_unlock(&obj->mm.lock);
2747 return err;
2748}
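
/*
 * A typical caller pattern, sketched below; i915_gem_object_pin_pages()
 * is the inline fast path that falls back to this function when the
 * pages are not already resident:
 *
 *	err = i915_gem_object_pin_pages(obj);
 *	if (err)
 *		return err;
 *
 *	... use the backing store, e.g. obj->mm.pages ...
 *
 *	i915_gem_object_unpin_pages(obj);
 *
 * Pin counts nest, so every successful pin must be balanced by exactly
 * one unpin before the shrinker may reap the pages again.
 */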
2749
2750/* The 'mapping' part of i915_gem_object_pin_map() below */
2751static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2752 enum i915_map_type type)
2753{
2754 unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2755 struct sg_table *sgt = obj->mm.pages;
2756 struct sgt_iter sgt_iter;
2757 struct page *page;
2758 struct page *stack_pages[32];
2759 struct page **pages = stack_pages;
2760 unsigned long i = 0;
2761 pgprot_t pgprot;
2762 void *addr;
2763
2764 /* A single page can always be kmapped */
2765 if (n_pages == 1 && type == I915_MAP_WB)
2766 return kmap(sg_page(sgt->sgl));
2767
2768 if (n_pages > ARRAY_SIZE(stack_pages)) {
2769 /* Too big for stack -- allocate temporary array instead */
2770 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2771 if (!pages)
2772 return NULL;
2773 }
2774
2775 for_each_sgt_page(page, sgt_iter, sgt)
2776 pages[i++] = page;
2777
2778 /* Check that we have the expected number of pages */
2779 GEM_BUG_ON(i != n_pages);
2780
2781 switch (type) {
2782 default:
2783 MISSING_CASE(type);
2784 /* fallthrough to use PAGE_KERNEL anyway */
2785 case I915_MAP_WB:
2786 pgprot = PAGE_KERNEL;
2787 break;
2788 case I915_MAP_WC:
2789 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2790 break;
2791 }
2792 addr = vmap(pages, n_pages, 0, pgprot);
2793
2794 if (pages != stack_pages)
2795 kvfree(pages);
2796
2797 return addr;
2798}
2799
2800/* get, pin, and map the pages of the object into kernel space */
2801void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2802 enum i915_map_type type)
2803{
2804 enum i915_map_type has_type;
2805 bool pinned;
2806 void *ptr;
2807 int ret;
2808
2809 if (unlikely(!i915_gem_object_has_struct_page(obj)))
2810 return ERR_PTR(-ENXIO);
2811
2812 ret = mutex_lock_interruptible(&obj->mm.lock);
2813 if (ret)
2814 return ERR_PTR(ret);
2815
2816 pinned = !(type & I915_MAP_OVERRIDE);
2817 type &= ~I915_MAP_OVERRIDE;
2818
2819 if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2820 if (unlikely(!i915_gem_object_has_pages(obj))) {
2821 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2822
2823 ret = ____i915_gem_object_get_pages(obj);
2824 if (ret)
2825 goto err_unlock;
2826
2827 smp_mb__before_atomic();
2828 }
2829 atomic_inc(&obj->mm.pages_pin_count);
2830 pinned = false;
2831 }
2832 GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2833
2834 ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2835 if (ptr && has_type != type) {
2836 if (pinned) {
2837 ret = -EBUSY;
2838 goto err_unpin;
2839 }
2840
2841 if (is_vmalloc_addr(ptr))
2842 vunmap(ptr);
2843 else
2844 kunmap(kmap_to_page(ptr));
2845
2846 ptr = obj->mm.mapping = NULL;
2847 }
2848
2849 if (!ptr) {
2850 ptr = i915_gem_object_map(obj, type);
2851 if (!ptr) {
2852 ret = -ENOMEM;
2853 goto err_unpin;
2854 }
2855
2856 obj->mm.mapping = page_pack_bits(ptr, type);
2857 }
2858
2859out_unlock:
2860 mutex_unlock(&obj->mm.lock);
2861 return ptr;
2862
2863err_unpin:
2864 atomic_dec(&obj->mm.pages_pin_count);
2865err_unlock:
2866 ptr = ERR_PTR(ret);
2867 goto out_unlock;
2868}
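
/*
 * Sketch of the intended (kernel-internal) usage; the WB mapping here
 * is just an example, callers wanting write-combining pass I915_MAP_WC:
 *
 *	void *vaddr;
 *
 *	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
 *	if (IS_ERR(vaddr))
 *		return PTR_ERR(vaddr);
 *
 *	memcpy(vaddr, data, len);
 *
 *	i915_gem_object_unpin_map(obj);
 *
 * The mapping is cached on the object, so repeated calls with the same
 * type are cheap; asking for a different type fails with -EBUSY while
 * other users still hold the pages pinned.
 */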
2869
2870static int
2871i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2872 const struct drm_i915_gem_pwrite *arg)
2873{
2874 struct address_space *mapping = obj->base.filp->f_mapping;
2875 char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2876 u64 remain, offset;
2877 unsigned int pg;
2878
2879 /* Before we instantiate/pin the backing store for our use, we
2880 * can prepopulate the shmemfs filp efficiently using a write into
2881 * the pagecache. We avoid the penalty of instantiating all the
2882 * pages, important if the user is just writing to a few and never
2883 * uses the object on the GPU, and using a direct write into shmemfs
2884 * allows it to avoid the cost of retrieving a page (either swapin
2885 * or clearing-before-use) before it is overwritten.
2886 */
2887 if (i915_gem_object_has_pages(obj))
2888 return -ENODEV;
2889
2890 if (obj->mm.madv != I915_MADV_WILLNEED)
2891 return -EFAULT;
2892
2893 /* Before the pages are instantiated the object is treated as being
2894 * in the CPU domain. The pages will be clflushed as required before
2895 * use, and we can freely write into the pages directly. If userspace
2896	 * races pwrite with any other operation, corruption will ensue -
2897 * that is userspace's prerogative!
2898 */
2899
2900 remain = arg->size;
2901 offset = arg->offset;
2902 pg = offset_in_page(offset);
2903
2904 do {
2905 unsigned int len, unwritten;
2906 struct page *page;
2907 void *data, *vaddr;
2908 int err;
2909
2910 len = PAGE_SIZE - pg;
2911 if (len > remain)
2912 len = remain;
2913
2914 err = pagecache_write_begin(obj->base.filp, mapping,
2915 offset, len, 0,
2916 &page, &data);
2917 if (err < 0)
2918 return err;
2919
2920 vaddr = kmap(page);
2921 unwritten = copy_from_user(vaddr + pg, user_data, len);
2922 kunmap(page);
2923
2924 err = pagecache_write_end(obj->base.filp, mapping,
2925 offset, len, len - unwritten,
2926 page, data);
2927 if (err < 0)
2928 return err;
2929
2930 if (unwritten)
2931 return -EFAULT;
2932
2933 remain -= len;
2934 user_data += len;
2935 offset += len;
2936 pg = 0;
2937 } while (remain);
2938
2939 return 0;
2940}
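
/*
 * Sketch of the triggering userspace call (illustrative only): a pwrite
 * into a freshly created object that has never been bound or mapped can
 * be serviced entirely by the pagecache path above.
 *
 *	struct drm_i915_gem_pwrite pw = {
 *		.handle = handle,
 *		.offset = 0,
 *		.size = len,
 *		.data_ptr = (uintptr_t)data,
 *	};
 *
 *	ioctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &pw);
 */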
2941
2942static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
2943 const struct i915_gem_context *ctx)
2944{
2945 unsigned int score;
2946 unsigned long prev_hang;
2947
2948 if (i915_gem_context_is_banned(ctx))
2949 score = I915_CLIENT_SCORE_CONTEXT_BAN;
2950 else
2951 score = 0;
2952
2953 prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
2954 if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
2955 score += I915_CLIENT_SCORE_HANG_FAST;
2956
2957 if (score) {
2958 atomic_add(score, &file_priv->ban_score);
2959
2960 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
2961 ctx->name, score,
2962 atomic_read(&file_priv->ban_score));
2963 }
2964}
2965
2966static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
2967{
2968 unsigned int score;
2969 bool banned, bannable;
2970
2971 atomic_inc(&ctx->guilty_count);
2972
2973 bannable = i915_gem_context_is_bannable(ctx);
2974 score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
2975 banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
2976
2977 DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, ban %s\n",
2978 ctx->name, atomic_read(&ctx->guilty_count),
2979 score, yesno(banned && bannable));
2980
2981 /* Cool contexts don't accumulate client ban score */
2982 if (!bannable)
2983 return;
2984
2985 if (banned)
2986 i915_gem_context_set_banned(ctx);
2987
2988 if (!IS_ERR_OR_NULL(ctx->file_priv))
2989 i915_gem_client_mark_guilty(ctx->file_priv, ctx);
2990}
2991
2992static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
2993{
2994 atomic_inc(&ctx->active_count);
2995}
2996
2997struct i915_request *
2998i915_gem_find_active_request(struct intel_engine_cs *engine)
2999{
3000 struct i915_request *request, *active = NULL;
3001 unsigned long flags;
3002
3003 /*
3004 * We are called by the error capture, reset and to dump engine
3005	 * state at random points in time. In particular, note that none of these is
3006 * crucially ordered with an interrupt. After a hang, the GPU is dead
3007 * and we assume that no more writes can happen (we waited long enough
3008 * for all writes that were in transaction to be flushed) - adding an
3009 * extra delay for a recent interrupt is pointless. Hence, we do
3010 * not need an engine->irq_seqno_barrier() before the seqno reads.
3011 * At all other times, we must assume the GPU is still running, but
3012 * we only care about the snapshot of this moment.
3013 */
3014 spin_lock_irqsave(&engine->timeline.lock, flags);
3015 list_for_each_entry(request, &engine->timeline.requests, link) {
3016 if (__i915_request_completed(request, request->global_seqno))
3017 continue;
3018
3019 active = request;
3020 break;
3021 }
3022 spin_unlock_irqrestore(&engine->timeline.lock, flags);
3023
3024 return active;
3025}
3026
3027/*
3028 * Ensure irq handler finishes, and not run again.
3029 * Also return the active request so that we only search for it once.
3030 */
3031struct i915_request *
3032i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
3033{
3034 struct i915_request *request = NULL;
3035
3036 /*
3037 * During the reset sequence, we must prevent the engine from
3038 * entering RC6. As the context state is undefined until we restart
3039 * the engine, if it does enter RC6 during the reset, the state
3040 * written to the powercontext is undefined and so we may lose
3041 * GPU state upon resume, i.e. fail to restart after a reset.
3042 */
3043 intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
3044
3045 /*
3046 * Prevent the signaler thread from updating the request
3047 * state (by calling dma_fence_signal) as we are processing
3048 * the reset. The write from the GPU of the seqno is
3049 * asynchronous and the signaler thread may see a different
3050 * value to us and declare the request complete, even though
3051	 * the reset routine has picked that request as the active
3052 * (incomplete) request. This conflict is not handled
3053 * gracefully!
3054 */
3055 kthread_park(engine->breadcrumbs.signaler);
3056
3057 /*
3058 * Prevent request submission to the hardware until we have
3059 * completed the reset in i915_gem_reset_finish(). If a request
3060 * is completed by one engine, it may then queue a request
3061 * to a second via its execlists->tasklet *just* as we are
3062 * calling engine->init_hw() and also writing the ELSP.
3063 * Turning off the execlists->tasklet until the reset is over
3064 * prevents the race.
3065 *
3066 * Note that this needs to be a single atomic operation on the
3067 * tasklet (flush existing tasks, prevent new tasks) to prevent
3068 * a race between reset and set-wedged. It is not, so we do the best
3069 * we can atm and make sure we don't lock the machine up in the more
3070 * common case of recursively being called from set-wedged from inside
3071 * i915_reset.
3072 */
3073 if (!atomic_read(&engine->execlists.tasklet.count))
3074 tasklet_kill(&engine->execlists.tasklet);
3075 tasklet_disable(&engine->execlists.tasklet);
3076
3077 /*
3078	 * We're using a worker to queue preemption requests from the tasklet in
3079 * GuC submission mode.
3080 * Even though tasklet was disabled, we may still have a worker queued.
3081 * Let's make sure that all workers scheduled before disabling the
3082 * tasklet are completed before continuing with the reset.
3083 */
3084 if (engine->i915->guc.preempt_wq)
3085 flush_workqueue(engine->i915->guc.preempt_wq);
3086
3087 if (engine->irq_seqno_barrier)
3088 engine->irq_seqno_barrier(engine);
3089
3090 request = i915_gem_find_active_request(engine);
3091 if (request && request->fence.error == -EIO)
3092 request = ERR_PTR(-EIO); /* Previous reset failed! */
3093
3094 return request;
3095}
3096
3097int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
3098{
3099 struct intel_engine_cs *engine;
3100 struct i915_request *request;
3101 enum intel_engine_id id;
3102 int err = 0;
3103
3104 for_each_engine(engine, dev_priv, id) {
3105 request = i915_gem_reset_prepare_engine(engine);
3106 if (IS_ERR(request)) {
3107 err = PTR_ERR(request);
3108 continue;
3109 }
3110
3111 engine->hangcheck.active_request = request;
3112 }
3113
3114 i915_gem_revoke_fences(dev_priv);
3115 intel_uc_sanitize(dev_priv);
3116
3117 return err;
3118}
3119
3120static void skip_request(struct i915_request *request)
3121{
3122 void *vaddr = request->ring->vaddr;
3123 u32 head;
3124
3125 /* As this request likely depends on state from the lost
3126 * context, clear out all the user operations leaving the
3127 * breadcrumb at the end (so we get the fence notifications).
3128 */
3129 head = request->head;
3130 if (request->postfix < head) {
3131 memset(vaddr + head, 0, request->ring->size - head);
3132 head = 0;
3133 }
3134 memset(vaddr + head, 0, request->postfix - head);
3135
3136 dma_fence_set_error(&request->fence, -EIO);
3137}
3138
3139static void engine_skip_context(struct i915_request *request)
3140{
3141 struct intel_engine_cs *engine = request->engine;
3142 struct i915_gem_context *hung_ctx = request->ctx;
3143 struct i915_timeline *timeline = request->timeline;
3144 unsigned long flags;
3145
3146 GEM_BUG_ON(timeline == &engine->timeline);
3147
3148 spin_lock_irqsave(&engine->timeline.lock, flags);
3149 spin_lock_nested(&timeline->lock, SINGLE_DEPTH_NESTING);
3150
3151 list_for_each_entry_continue(request, &engine->timeline.requests, link)
3152 if (request->ctx == hung_ctx)
3153 skip_request(request);
3154
3155 list_for_each_entry(request, &timeline->requests, link)
3156 skip_request(request);
3157
3158 spin_unlock(&timeline->lock);
3159 spin_unlock_irqrestore(&engine->timeline.lock, flags);
3160}
3161
3162/* Returns the request if it was guilty of the hang */
3163static struct i915_request *
3164i915_gem_reset_request(struct intel_engine_cs *engine,
3165 struct i915_request *request,
3166 bool stalled)
3167{
3168 /* The guilty request will get skipped on a hung engine.
3169 *
3170 * Users of client default contexts do not rely on logical
3171 * state preserved between batches so it is safe to execute
3172 * queued requests following the hang. Non default contexts
3173 * rely on preserved state, so skipping a batch loses the
3174 * evolution of the state and it needs to be considered corrupted.
3175 * Executing more queued batches on top of corrupted state is
3176 * risky. But we take the risk by trying to advance through
3177 * the queued requests in order to make the client behaviour
3178 * more predictable around resets, by not throwing away random
3179	 * amounts of batches it has prepared for execution. Sophisticated
3180 * clients can use gem_reset_stats_ioctl and dma fence status
3181 * (exported via sync_file info ioctl on explicit fences) to observe
3182 * when it loses the context state and should rebuild accordingly.
3183 *
3184 * The context ban, and ultimately the client ban, mechanism are safety
3185 * valves if client submission ends up resulting in nothing more than
3186 * subsequent hangs.
3187 */
3188
3189 if (i915_request_completed(request)) {
3190 GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
3191 engine->name, request->global_seqno,
3192 request->fence.context, request->fence.seqno,
3193 intel_engine_get_seqno(engine));
3194 stalled = false;
3195 }
3196
3197 if (stalled) {
3198 i915_gem_context_mark_guilty(request->ctx);
3199 skip_request(request);
3200
3201 /* If this context is now banned, skip all pending requests. */
3202 if (i915_gem_context_is_banned(request->ctx))
3203 engine_skip_context(request);
3204 } else {
3205 /*
3206 * Since this is not the hung engine, it may have advanced
3207 * since the hang declaration. Double check by refinding
3208 * the active request at the time of the reset.
3209 */
3210 request = i915_gem_find_active_request(engine);
3211 if (request) {
3212 i915_gem_context_mark_innocent(request->ctx);
3213 dma_fence_set_error(&request->fence, -EAGAIN);
3214
3215 /* Rewind the engine to replay the incomplete rq */
3216 spin_lock_irq(&engine->timeline.lock);
3217 request = list_prev_entry(request, link);
3218 if (&request->link == &engine->timeline.requests)
3219 request = NULL;
3220 spin_unlock_irq(&engine->timeline.lock);
3221 }
3222 }
3223
3224 return request;
3225}
3226
3227void i915_gem_reset_engine(struct intel_engine_cs *engine,
3228 struct i915_request *request,
3229 bool stalled)
3230{
3231 /*
3232 * Make sure this write is visible before we re-enable the interrupt
3233 * handlers on another CPU, as tasklet_enable() resolves to just
3234 * a compiler barrier which is insufficient for our purpose here.
3235 */
3236 smp_store_mb(engine->irq_posted, 0);
3237
3238 if (request)
3239 request = i915_gem_reset_request(engine, request, stalled);
3240
3241 if (request) {
3242 DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
3243 engine->name, request->global_seqno);
3244 }
3245
3246 /* Setup the CS to resume from the breadcrumb of the hung request */
3247 engine->reset_hw(engine, request);
3248}
3249
3250void i915_gem_reset(struct drm_i915_private *dev_priv,
3251 unsigned int stalled_mask)
3252{
3253 struct intel_engine_cs *engine;
3254 enum intel_engine_id id;
3255
3256 lockdep_assert_held(&dev_priv->drm.struct_mutex);
3257
3258 i915_retire_requests(dev_priv);
3259
3260 for_each_engine(engine, dev_priv, id) {
3261 struct i915_gem_context *ctx;
3262
3263 i915_gem_reset_engine(engine,
3264 engine->hangcheck.active_request,
3265 stalled_mask & ENGINE_MASK(id));
3266 ctx = fetch_and_zero(&engine->last_retired_context);
3267 if (ctx)
3268 intel_context_unpin(ctx, engine);
3269
3270 /*
3271		 * Ostensibly, we always want a context loaded for powersaving,
3272 * so if the engine is idle after the reset, send a request
3273 * to load our scratch kernel_context.
3274 *
3275 * More mysteriously, if we leave the engine idle after a reset,
3276 * the next userspace batch may hang, with what appears to be
3277 * an incoherent read by the CS (presumably stale TLB). An
3278 * empty request appears sufficient to paper over the glitch.
3279 */
3280 if (intel_engine_is_idle(engine)) {
3281 struct i915_request *rq;
3282
3283 rq = i915_request_alloc(engine,
3284 dev_priv->kernel_context);
3285 if (!IS_ERR(rq))
3286 __i915_request_add(rq, false);
3287 }
3288 }
3289
3290 i915_gem_restore_fences(dev_priv);
3291}
3292
3293void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3294{
3295 tasklet_enable(&engine->execlists.tasklet);
3296 kthread_unpark(engine->breadcrumbs.signaler);
3297
3298 intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3299}
3300
3301void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3302{
3303 struct intel_engine_cs *engine;
3304 enum intel_engine_id id;
3305
3306 lockdep_assert_held(&dev_priv->drm.struct_mutex);
3307
3308 for_each_engine(engine, dev_priv, id) {
3309 engine->hangcheck.active_request = NULL;
3310 i915_gem_reset_finish_engine(engine);
3311 }
3312}
3313
3314static void nop_submit_request(struct i915_request *request)
3315{
3316 GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3317 request->engine->name,
3318 request->fence.context, request->fence.seqno);
3319 dma_fence_set_error(&request->fence, -EIO);
3320
3321 i915_request_submit(request);
3322}
3323
3324static void nop_complete_submit_request(struct i915_request *request)
3325{
3326 unsigned long flags;
3327
3328 GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3329 request->engine->name,
3330 request->fence.context, request->fence.seqno);
3331 dma_fence_set_error(&request->fence, -EIO);
3332
3333 spin_lock_irqsave(&request->engine->timeline.lock, flags);
3334 __i915_request_submit(request);
3335 intel_engine_init_global_seqno(request->engine, request->global_seqno);
3336 spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
3337}
3338
3339void i915_gem_set_wedged(struct drm_i915_private *i915)
3340{
3341 struct intel_engine_cs *engine;
3342 enum intel_engine_id id;
3343
3344 GEM_TRACE("start\n");
3345
3346 if (GEM_SHOW_DEBUG()) {
3347 struct drm_printer p = drm_debug_printer(__func__);
3348
3349 for_each_engine(engine, i915, id)
3350 intel_engine_dump(engine, &p, "%s\n", engine->name);
3351 }
3352
3353 set_bit(I915_WEDGED, &i915->gpu_error.flags);
3354 smp_mb__after_atomic();
3355
3356 /*
3357 * First, stop submission to hw, but do not yet complete requests by
3358 * rolling the global seqno forward (since this would complete requests
3359 * for which we haven't set the fence error to EIO yet).
3360 */
3361 for_each_engine(engine, i915, id) {
3362 i915_gem_reset_prepare_engine(engine);
3363
3364 engine->submit_request = nop_submit_request;
3365 engine->schedule = NULL;
3366 }
3367 i915->caps.scheduler = 0;
3368
3369 /* Even if the GPU reset fails, it should still stop the engines */
3370 intel_gpu_reset(i915, ALL_ENGINES);
3371
3372 /*
3373 * Make sure no one is running the old callback before we proceed with
3374 * cancelling requests and resetting the completion tracking. Otherwise
3375 * we might submit a request to the hardware which never completes.
3376 */
3377 synchronize_rcu();
3378
3379 for_each_engine(engine, i915, id) {
3380 /* Mark all executing requests as skipped */
3381 engine->cancel_requests(engine);
3382
3383 /*
3384 * Only once we've force-cancelled all in-flight requests can we
3385 * start to complete all requests.
3386 */
3387 engine->submit_request = nop_complete_submit_request;
3388 }
3389
3390 /*
3391 * Make sure no request can slip through without getting completed by
3392 * either this call here to intel_engine_init_global_seqno, or the one
3393 * in nop_complete_submit_request.
3394 */
3395 synchronize_rcu();
3396
3397 for_each_engine(engine, i915, id) {
3398 unsigned long flags;
3399
3400 /*
3401 * Mark all pending requests as complete so that any concurrent
3402 * (lockless) lookup doesn't try and wait upon the request as we
3403 * reset it.
3404 */
3405 spin_lock_irqsave(&engine->timeline.lock, flags);
3406 intel_engine_init_global_seqno(engine,
3407 intel_engine_last_submit(engine));
3408 spin_unlock_irqrestore(&engine->timeline.lock, flags);
3409
3410 i915_gem_reset_finish_engine(engine);
3411 }
3412
3413 GEM_TRACE("end\n");
3414
3415 wake_up_all(&i915->gpu_error.reset_queue);
3416}
3417
3418bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3419{
3420 struct i915_timeline *tl;
3421
3422 lockdep_assert_held(&i915->drm.struct_mutex);
3423 if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3424 return true;
3425
3426 GEM_TRACE("start\n");
3427
3428 /*
3429 * Before unwedging, make sure that all pending operations
3430 * are flushed and errored out - we may have requests waiting upon
3431 * third party fences. We marked all inflight requests as EIO, and
3432	 * every execbuf since returned EIO; for consistency we want all
3433 * the currently pending requests to also be marked as EIO, which
3434 * is done inside our nop_submit_request - and so we must wait.
3435 *
3436 * No more can be submitted until we reset the wedged bit.
3437 */
3438 list_for_each_entry(tl, &i915->gt.timelines, link) {
3439 struct i915_request *rq;
3440
3441 rq = i915_gem_active_peek(&tl->last_request,
3442 &i915->drm.struct_mutex);
3443 if (!rq)
3444 continue;
3445
3446 /*
3447 * We can't use our normal waiter as we want to
3448 * avoid recursively trying to handle the current
3449 * reset. The basic dma_fence_default_wait() installs
3450 * a callback for dma_fence_signal(), which is
3451 * triggered by our nop handler (indirectly, the
3452 * callback enables the signaler thread which is
3453 * woken by the nop_submit_request() advancing the seqno
3454 * and when the seqno passes the fence, the signaler
3455 * then signals the fence waking us up).
3456 */
3457 if (dma_fence_default_wait(&rq->fence, true,
3458 MAX_SCHEDULE_TIMEOUT) < 0)
3459 return false;
3460 }
3461 i915_retire_requests(i915);
3462 GEM_BUG_ON(i915->gt.active_requests);
3463
3464 /*
3465 * Undo nop_submit_request. We prevent all new i915 requests from
3466 * being queued (by disallowing execbuf whilst wedged) so having
3467 * waited for all active requests above, we know the system is idle
3468 * and do not have to worry about a thread being inside
3469 * engine->submit_request() as we swap over. So unlike installing
3470 * the nop_submit_request on reset, we can do this from normal
3471 * context and do not require stop_machine().
3472 */
3473 intel_engines_reset_default_submission(i915);
3474 i915_gem_contexts_lost(i915);
3475
3476 GEM_TRACE("end\n");
3477
3478 smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3479 clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3480
3481 return true;
3482}
3483
3484static void
3485i915_gem_retire_work_handler(struct work_struct *work)
3486{
3487 struct drm_i915_private *dev_priv =
3488 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3489 struct drm_device *dev = &dev_priv->drm;
3490
3491 /* Come back later if the device is busy... */
3492 if (mutex_trylock(&dev->struct_mutex)) {
3493 i915_retire_requests(dev_priv);
3494 mutex_unlock(&dev->struct_mutex);
3495 }
3496
3497 /*
3498 * Keep the retire handler running until we are finally idle.
3499 * We do not need to do this test under locking as in the worst-case
3500 * we queue the retire worker once too often.
3501 */
3502 if (READ_ONCE(dev_priv->gt.awake))
3503 queue_delayed_work(dev_priv->wq,
3504 &dev_priv->gt.retire_work,
3505 round_jiffies_up_relative(HZ));
3506}
3507
3508static void shrink_caches(struct drm_i915_private *i915)
3509{
3510 /*
3511 * kmem_cache_shrink() discards empty slabs and reorders partially
3512 * filled slabs to prioritise allocating from the mostly full slabs,
3513 * with the aim of reducing fragmentation.
3514 */
3515 kmem_cache_shrink(i915->priorities);
3516 kmem_cache_shrink(i915->dependencies);
3517 kmem_cache_shrink(i915->requests);
3518 kmem_cache_shrink(i915->luts);
3519 kmem_cache_shrink(i915->vmas);
3520 kmem_cache_shrink(i915->objects);
3521}
3522
3523struct sleep_rcu_work {
3524 union {
3525 struct rcu_head rcu;
3526 struct work_struct work;
3527 };
3528 struct drm_i915_private *i915;
3529 unsigned int epoch;
3530};
3531
3532static inline bool
3533same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3534{
3535 /*
3536 * There is a small chance that the epoch wrapped since we started
3537 * sleeping. If we assume that epoch is at least a u32, then it will
3538	 * take at least 2^32 * 100ms (~4.3e8 seconds) for it to wrap, or about 13.6 years.
3539 */
3540 return epoch == READ_ONCE(i915->gt.epoch);
3541}
3542
3543static void __sleep_work(struct work_struct *work)
3544{
3545 struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3546 struct drm_i915_private *i915 = s->i915;
3547 unsigned int epoch = s->epoch;
3548
3549 kfree(s);
3550 if (same_epoch(i915, epoch))
3551 shrink_caches(i915);
3552}
3553
3554static void __sleep_rcu(struct rcu_head *rcu)
3555{
3556 struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3557 struct drm_i915_private *i915 = s->i915;
3558
3559 if (same_epoch(i915, s->epoch)) {
3560 INIT_WORK(&s->work, __sleep_work);
3561 queue_work(i915->wq, &s->work);
3562 } else {
3563 kfree(s);
3564 }
3565}
3566
3567static inline bool
3568new_requests_since_last_retire(const struct drm_i915_private *i915)
3569{
3570 return (READ_ONCE(i915->gt.active_requests) ||
3571 work_pending(&i915->gt.idle_work.work));
3572}
3573
3574static void
3575i915_gem_idle_work_handler(struct work_struct *work)
3576{
3577 struct drm_i915_private *dev_priv =
3578 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3579 unsigned int epoch = I915_EPOCH_INVALID;
3580 bool rearm_hangcheck;
3581
3582 if (!READ_ONCE(dev_priv->gt.awake))
3583 return;
3584
3585 /*
3586 * Wait for last execlists context complete, but bail out in case a
3587 * new request is submitted. As we don't trust the hardware, we
3588 * continue on if the wait times out. This is necessary to allow
3589 * the machine to suspend even if the hardware dies, and we will
3590 * try to recover in resume (after depriving the hardware of power,
3591	 * it may be in a better mood).
3592 */
3593 __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3594 intel_engines_are_idle(dev_priv),
3595 I915_IDLE_ENGINES_TIMEOUT * 1000,
3596 10, 500);
3597
3598 rearm_hangcheck =
3599 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3600
3601 if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3602 /* Currently busy, come back later */
3603 mod_delayed_work(dev_priv->wq,
3604 &dev_priv->gt.idle_work,
3605 msecs_to_jiffies(50));
3606 goto out_rearm;
3607 }
3608
3609 /*
3610 * New request retired after this work handler started, extend active
3611 * period until next instance of the work.
3612 */
3613 if (new_requests_since_last_retire(dev_priv))
3614 goto out_unlock;
3615
3616 epoch = __i915_gem_park(dev_priv);
3617
3618 rearm_hangcheck = false;
3619out_unlock:
3620 mutex_unlock(&dev_priv->drm.struct_mutex);
3621
3622out_rearm:
3623 if (rearm_hangcheck) {
3624 GEM_BUG_ON(!dev_priv->gt.awake);
3625 i915_queue_hangcheck(dev_priv);
3626 }
3627
3628 /*
3629 * When we are idle, it is an opportune time to reap our caches.
3630 * However, we have many objects that utilise RCU and the ordered
3631 * i915->wq that this work is executing on. To try and flush any
3632 * pending frees now we are idle, we first wait for an RCU grace
3633 * period, and then queue a task (that will run last on the wq) to
3634 * shrink and re-optimize the caches.
3635 */
3636 if (same_epoch(dev_priv, epoch)) {
3637 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3638 if (s) {
3639 s->i915 = dev_priv;
3640 s->epoch = epoch;
3641 call_rcu(&s->rcu, __sleep_rcu);
3642 }
3643 }
3644}
3645
3646void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3647{
3648 struct drm_i915_private *i915 = to_i915(gem->dev);
3649 struct drm_i915_gem_object *obj = to_intel_bo(gem);
3650 struct drm_i915_file_private *fpriv = file->driver_priv;
3651 struct i915_lut_handle *lut, *ln;
3652
3653 mutex_lock(&i915->drm.struct_mutex);
3654
3655 list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3656 struct i915_gem_context *ctx = lut->ctx;
3657 struct i915_vma *vma;
3658
3659 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3660 if (ctx->file_priv != fpriv)
3661 continue;
3662
3663 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3664 GEM_BUG_ON(vma->obj != obj);
3665
3666 /* We allow the process to have multiple handles to the same
3667 * vma, in the same fd namespace, by virtue of flink/open.
3668 */
3669 GEM_BUG_ON(!vma->open_count);
3670 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3671 i915_vma_close(vma);
3672
3673 list_del(&lut->obj_link);
3674 list_del(&lut->ctx_link);
3675
3676 kmem_cache_free(i915->luts, lut);
3677 __i915_gem_object_release_unless_active(obj);
3678 }
3679
3680 mutex_unlock(&i915->drm.struct_mutex);
3681}
3682
3683static unsigned long to_wait_timeout(s64 timeout_ns)
3684{
3685 if (timeout_ns < 0)
3686 return MAX_SCHEDULE_TIMEOUT;
3687
3688 if (timeout_ns == 0)
3689 return 0;
3690
3691 return nsecs_to_jiffies_timeout(timeout_ns);
3692}
3693
3694/**
3695 * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3696 * @dev: drm device pointer
3697 * @data: ioctl data blob
3698 * @file: drm file pointer
3699 *
3700 * Returns 0 if successful, else an error is returned with the remaining time in
3701 * the timeout parameter.
3702 * -ETIME: object is still busy after timeout
3703 * -ERESTARTSYS: signal interrupted the wait
3704 *  -ENOENT: object doesn't exist
3705 * Also possible, but rare:
3706 * -EAGAIN: incomplete, restart syscall
3707 * -ENOMEM: damn
3708 * -ENODEV: Internal IRQ fail
3709 * -E?: The add request failed
3710 *
3711 * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3712 * non-zero timeout parameter the wait ioctl will wait for the given number of
3713 * nanoseconds on an object becoming unbusy. Since the wait itself does so
3714 * without holding struct_mutex the object may become re-busied before this
3715 * function completes. A similar but shorter * race condition exists in the busy
3716 * ioctl
3717 */
3718int
3719i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3720{
3721 struct drm_i915_gem_wait *args = data;
3722 struct drm_i915_gem_object *obj;
3723 ktime_t start;
3724 long ret;
3725
3726 if (args->flags != 0)
3727 return -EINVAL;
3728
3729 obj = i915_gem_object_lookup(file, args->bo_handle);
3730 if (!obj)
3731 return -ENOENT;
3732
3733 start = ktime_get();
3734
3735 ret = i915_gem_object_wait(obj,
3736 I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
3737 to_wait_timeout(args->timeout_ns),
3738 to_rps_client(file));
3739
3740 if (args->timeout_ns > 0) {
3741 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3742 if (args->timeout_ns < 0)
3743 args->timeout_ns = 0;
3744
3745 /*
3746 * Apparently ktime isn't accurate enough and occasionally has a
3747 * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3748 * things up to make the test happy. We allow up to 1 jiffy.
3749 *
3750 * This is a regression from the timespec->ktime conversion.
3751 */
3752 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3753 args->timeout_ns = 0;
3754
3755 /* Asked to wait beyond the jiffie/scheduler precision? */
3756 if (ret == -ETIME && args->timeout_ns)
3757 ret = -EAGAIN;
3758 }
3759
3760 i915_gem_object_put(obj);
3761 return ret;
3762}
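
/*
 * Sketch of the userspace side (illustrative only): timeout_ns < 0
 * waits indefinitely, 0 turns the call into a non-blocking busyness
 * check, and a positive value is updated in place with the time
 * remaining when the ioctl returns.
 *
 *	struct drm_i915_gem_wait wait = {
 *		.bo_handle = handle,
 *		.timeout_ns = 500000000,
 *	};
 *	int ret = ioctl(fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
 *
 * Here ret is 0 once the object is idle, or -1 with errno set to ETIME
 * once the full half-second timeout has elapsed.
 */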
3763
3764static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
3765{
3766 return i915_gem_active_wait(&tl->last_request, flags);
3767}
3768
3769static int wait_for_engines(struct drm_i915_private *i915)
3770{
3771 if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3772 dev_err(i915->drm.dev,
3773 "Failed to idle engines, declaring wedged!\n");
3774 GEM_TRACE_DUMP();
3775 i915_gem_set_wedged(i915);
3776 return -EIO;
3777 }
3778
3779 return 0;
3780}
3781
3782int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
3783{
3784 /* If the device is asleep, we have no requests outstanding */
3785 if (!READ_ONCE(i915->gt.awake))
3786 return 0;
3787
3788 if (flags & I915_WAIT_LOCKED) {
3789 struct i915_timeline *tl;
3790 int err;
3791
3792 lockdep_assert_held(&i915->drm.struct_mutex);
3793
3794 list_for_each_entry(tl, &i915->gt.timelines, link) {
3795 err = wait_for_timeline(tl, flags);
3796 if (err)
3797 return err;
3798 }
3799 i915_retire_requests(i915);
3800
3801 return wait_for_engines(i915);
3802 } else {
3803 struct intel_engine_cs *engine;
3804 enum intel_engine_id id;
3805 int err;
3806
3807 for_each_engine(engine, i915, id) {
3808 err = wait_for_timeline(&engine->timeline, flags);
3809 if (err)
3810 return err;
3811 }
3812
3813 return 0;
3814 }
3815}
3816
3817static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3818{
3819 /*
3820 * We manually flush the CPU domain so that we can override and
3821 * force the flush for the display, and perform it asynchronously.
3822 */
3823 flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3824 if (obj->cache_dirty)
3825 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3826 obj->write_domain = 0;
3827}
3828
3829void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3830{
3831 if (!READ_ONCE(obj->pin_global))
3832 return;
3833
3834 mutex_lock(&obj->base.dev->struct_mutex);
3835 __i915_gem_object_flush_for_display(obj);
3836 mutex_unlock(&obj->base.dev->struct_mutex);
3837}
3838
3839/**
3840 * Moves a single object to the WC read, and possibly write domain.
3841 * @obj: object to act on
3842 * @write: ask for write access or read only
3843 *
3844 * This function returns when the move is complete, including waiting on
3845 * flushes to occur.
3846 */
3847int
3848i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3849{
3850 int ret;
3851
3852 lockdep_assert_held(&obj->base.dev->struct_mutex);
3853
3854 ret = i915_gem_object_wait(obj,
3855 I915_WAIT_INTERRUPTIBLE |
3856 I915_WAIT_LOCKED |
3857 (write ? I915_WAIT_ALL : 0),
3858 MAX_SCHEDULE_TIMEOUT,
3859 NULL);
3860 if (ret)
3861 return ret;
3862
3863 if (obj->write_domain == I915_GEM_DOMAIN_WC)
3864 return 0;
3865
3866 /* Flush and acquire obj->pages so that we are coherent through
3867 * direct access in memory with previous cached writes through
3868 * shmemfs and that our cache domain tracking remains valid.
3869 * For example, if the obj->filp was moved to swap without us
3870 * being notified and releasing the pages, we would mistakenly
3871 * continue to assume that the obj remained out of the CPU cached
3872 * domain.
3873 */
3874 ret = i915_gem_object_pin_pages(obj);
3875 if (ret)
3876 return ret;
3877
3878 flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3879
3880 /* Serialise direct access to this object with the barriers for
3881 * coherent writes from the GPU, by effectively invalidating the
3882 * WC domain upon first access.
3883 */
3884 if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3885 mb();
3886
3887 /* It should now be out of any other write domains, and we can update
3888 * the domain values for our changes.
3889 */
3890 GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3891 obj->read_domains |= I915_GEM_DOMAIN_WC;
3892 if (write) {
3893 obj->read_domains = I915_GEM_DOMAIN_WC;
3894 obj->write_domain = I915_GEM_DOMAIN_WC;
3895 obj->mm.dirty = true;
3896 }
3897
3898 i915_gem_object_unpin_pages(obj);
3899 return 0;
3900}
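
/*
 * Domain bookkeeping example for the function above: an object with
 * read_domains = CPU | GTT and write_domain = 0, moved to WC with
 * write=true, ends up with read_domains = WC and write_domain = WC (the
 * other read domains are invalidated, as a WC write would leave them
 * stale); with write=false it keeps CPU | GTT and merely gains the WC
 * read domain.
 */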
3901
3902/**
3903 * Moves a single object to the GTT read, and possibly write domain.
3904 * @obj: object to act on
3905 * @write: ask for write access or read only
3906 *
3907 * This function returns when the move is complete, including waiting on
3908 * flushes to occur.
3909 */
3910int
3911i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3912{
3913 int ret;
3914
3915 lockdep_assert_held(&obj->base.dev->struct_mutex);
3916
3917 ret = i915_gem_object_wait(obj,
3918 I915_WAIT_INTERRUPTIBLE |
3919 I915_WAIT_LOCKED |
3920 (write ? I915_WAIT_ALL : 0),
3921 MAX_SCHEDULE_TIMEOUT,
3922 NULL);
3923 if (ret)
3924 return ret;
3925
3926 if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3927 return 0;
3928
3929 /* Flush and acquire obj->pages so that we are coherent through
3930 * direct access in memory with previous cached writes through
3931 * shmemfs and that our cache domain tracking remains valid.
3932 * For example, if the obj->filp was moved to swap without us
3933 * being notified and releasing the pages, we would mistakenly
3934 * continue to assume that the obj remained out of the CPU cached
3935 * domain.
3936 */
3937 ret = i915_gem_object_pin_pages(obj);
3938 if (ret)
3939 return ret;
3940
3941 flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3942
3943 /* Serialise direct access to this object with the barriers for
3944 * coherent writes from the GPU, by effectively invalidating the
3945 * GTT domain upon first access.
3946 */
3947 if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3948 mb();
3949
3950 /* It should now be out of any other write domains, and we can update
3951 * the domain values for our changes.
3952 */
3953 GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3954 obj->read_domains |= I915_GEM_DOMAIN_GTT;
3955 if (write) {
3956 obj->read_domains = I915_GEM_DOMAIN_GTT;
3957 obj->write_domain = I915_GEM_DOMAIN_GTT;
3958 obj->mm.dirty = true;
3959 }
3960
3961 i915_gem_object_unpin_pages(obj);
3962 return 0;
3963}
3964
3965/**
3966 * Changes the cache-level of an object across all VMA.
3967 * @obj: object to act on
3968 * @cache_level: new cache level to set for the object
3969 *
3970 * After this function returns, the object will be in the new cache-level
3971 * across all GTT and the contents of the backing storage will be coherent,
3972 * with respect to the new cache-level. In order to keep the backing storage
3973 * coherent for all users, we only allow a single cache level to be set
3974 * globally on the object and prevent it from being changed whilst the
3975 * hardware is reading from the object. That is if the object is currently
3976 * on the scanout it will be set to uncached (or equivalent display
3977 * cache coherency) and all non-MOCS GPU access will also be uncached so
3978 * that all direct access to the scanout remains coherent.
3979 */
3980int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
3981 enum i915_cache_level cache_level)
3982{
3983 struct i915_vma *vma;
3984 int ret;
3985
3986 lockdep_assert_held(&obj->base.dev->struct_mutex);
3987
3988 if (obj->cache_level == cache_level)
3989 return 0;
3990
3991 /* Inspect the list of currently bound VMA and unbind any that would
3992 * be invalid given the new cache-level. This is principally to
3993 * catch the issue of the CS prefetch crossing page boundaries and
3994 * reading an invalid PTE on older architectures.
3995 */
3996restart:
3997 list_for_each_entry(vma, &obj->vma_list, obj_link) {
3998 if (!drm_mm_node_allocated(&vma->node))
3999 continue;
4000
4001 if (i915_vma_is_pinned(vma)) {
4002 DRM_DEBUG("can not change the cache level of pinned objects\n");
4003 return -EBUSY;
4004 }
4005
4006 if (!i915_vma_is_closed(vma) &&
4007 i915_gem_valid_gtt_space(vma, cache_level))
4008 continue;
4009
4010 ret = i915_vma_unbind(vma);
4011 if (ret)
4012 return ret;
4013
4014 /* As unbinding may affect other elements in the
4015 * obj->vma_list (due to side-effects from retiring
4016 * an active vma), play safe and restart the iterator.
4017 */
4018 goto restart;
4019 }
4020
4021 /* We can reuse the existing drm_mm nodes but need to change the
4022 * cache-level on the PTE. We could simply unbind them all and
4023 * rebind with the correct cache-level on next use. However since
4024 * we already have a valid slot, dma mapping, pages etc, we may as well
4025 * rewrite the PTE in the belief that doing so tramples upon less
4026 * state and so involves less work.
4027 */
4028 if (obj->bind_count) {
4029 /* Before we change the PTE, the GPU must not be accessing it.
4030 * If we wait upon the object, we know that all the bound
4031 * VMA are no longer active.
4032 */
4033 ret = i915_gem_object_wait(obj,
4034 I915_WAIT_INTERRUPTIBLE |
4035 I915_WAIT_LOCKED |
4036 I915_WAIT_ALL,
4037 MAX_SCHEDULE_TIMEOUT,
4038 NULL);
4039 if (ret)
4040 return ret;
4041
4042 if (!HAS_LLC(to_i915(obj->base.dev)) &&
4043 cache_level != I915_CACHE_NONE) {
4044 /* Access to snoopable pages through the GTT is
4045 * incoherent and on some machines causes a hard
4046 * lockup. Relinquish the CPU mmapping to force
4047 * userspace to refault in the pages and we can
4048 * then double check if the GTT mapping is still
4049 * valid for that pointer access.
4050 */
4051 i915_gem_release_mmap(obj);
4052
4053 /* As we no longer need a fence for GTT access,
4054 * we can relinquish it now (and so prevent having
4055 * to steal a fence from someone else on the next
4056 * fence request). Note GPU activity would have
4057 * dropped the fence as all snoopable access is
4058 * supposed to be linear.
4059 */
4060 for_each_ggtt_vma(vma, obj) {
4061 ret = i915_vma_put_fence(vma);
4062 if (ret)
4063 return ret;
4064 }
4065 } else {
4066 /* We either have incoherent backing store and
4067 * so no GTT access or the architecture is fully
4068 * coherent. In such cases, existing GTT mmaps
4069 * ignore the cache bit in the PTE and we can
4070 * rewrite it without confusing the GPU or having
4071 * to force userspace to fault back in its mmaps.
4072 */
4073 }
4074
4075 list_for_each_entry(vma, &obj->vma_list, obj_link) {
4076 if (!drm_mm_node_allocated(&vma->node))
4077 continue;
4078
4079 ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
4080 if (ret)
4081 return ret;
4082 }
4083 }
4084
4085 list_for_each_entry(vma, &obj->vma_list, obj_link)
4086 vma->node.color = cache_level;
4087 i915_gem_object_set_cache_coherency(obj, cache_level);
4088 obj->cache_dirty = true; /* Always invalidate stale cachelines */
4089
4090 return 0;
4091}
4092
4093int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
4094 struct drm_file *file)
4095{
4096 struct drm_i915_gem_caching *args = data;
4097 struct drm_i915_gem_object *obj;
4098 int err = 0;
4099
4100 rcu_read_lock();
4101 obj = i915_gem_object_lookup_rcu(file, args->handle);
4102 if (!obj) {
4103 err = -ENOENT;
4104 goto out;
4105 }
4106
4107 switch (obj->cache_level) {
4108 case I915_CACHE_LLC:
4109 case I915_CACHE_L3_LLC:
4110 args->caching = I915_CACHING_CACHED;
4111 break;
4112
4113 case I915_CACHE_WT:
4114 args->caching = I915_CACHING_DISPLAY;
4115 break;
4116
4117 default:
4118 args->caching = I915_CACHING_NONE;
4119 break;
4120 }
4121out:
4122 rcu_read_unlock();
4123 return err;
4124}
4125
4126int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4127 struct drm_file *file)
4128{
4129 struct drm_i915_private *i915 = to_i915(dev);
4130 struct drm_i915_gem_caching *args = data;
4131 struct drm_i915_gem_object *obj;
4132 enum i915_cache_level level;
4133 int ret = 0;
4134
4135 switch (args->caching) {
4136 case I915_CACHING_NONE:
4137 level = I915_CACHE_NONE;
4138 break;
4139 case I915_CACHING_CACHED:
4140 /*
4141 * Due to a HW issue on BXT A stepping, GPU stores via a
4142 * snooped mapping may leave stale data in a corresponding CPU
4143 * cacheline, whereas normally such cachelines would get
4144 * invalidated.
4145 */
4146 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4147 return -ENODEV;
4148
4149 level = I915_CACHE_LLC;
4150 break;
4151 case I915_CACHING_DISPLAY:
4152 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4153 break;
4154 default:
4155 return -EINVAL;
4156 }
4157
4158 obj = i915_gem_object_lookup(file, args->handle);
4159 if (!obj)
4160 return -ENOENT;
4161
4162 /*
4163 * The caching mode of proxy object is handled by its generator, and
4164 * not allowed to be changed by userspace.
4165 */
4166 if (i915_gem_object_is_proxy(obj)) {
4167 ret = -ENXIO;
4168 goto out;
4169 }
4170
4171 if (obj->cache_level == level)
4172 goto out;
4173
4174 ret = i915_gem_object_wait(obj,
4175 I915_WAIT_INTERRUPTIBLE,
4176 MAX_SCHEDULE_TIMEOUT,
4177 to_rps_client(file));
4178 if (ret)
4179 goto out;
4180
4181 ret = i915_mutex_lock_interruptible(dev);
4182 if (ret)
4183 goto out;
4184
4185 ret = i915_gem_object_set_cache_level(obj, level);
4186 mutex_unlock(&dev->struct_mutex);
4187
4188out:
4189 i915_gem_object_put(obj);
4190 return ret;
4191}
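
/*
 * Illustrative userspace sketch (not driver code) for the ioctl above,
 * assuming a libdrm-style drmIoctl() wrapper:
 *
 *	struct drm_i915_gem_caching arg = {
 *		.handle = bo,
 *		.caching = I915_CACHING_CACHED, // request LLC snooping
 *	};
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg))
 *		; // ENODEV: can neither snoop nor use the LLC
 *		  // ENXIO: the object is a proxy
 *
 * The matching DRM_IOCTL_I915_GEM_GET_CACHING fills arg.caching from
 * obj->cache_level using the same I915_CACHING_* values.
 */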
4192
4193/*
4194 * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
4195 * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
4196 * (for pageflips). We only flush the caches while preparing the buffer for
4197 * display, the callers are responsible for frontbuffer flush.
4198 */
4199struct i915_vma *
4200i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4201 u32 alignment,
4202 const struct i915_ggtt_view *view,
4203 unsigned int flags)
4204{
4205 struct i915_vma *vma;
4206 int ret;
4207
4208 lockdep_assert_held(&obj->base.dev->struct_mutex);
4209
4210 /* Mark the global pin early so that we account for the
4211 * display coherency whilst setting up the cache domains.
4212 */
4213 obj->pin_global++;
4214
4215 /* The display engine is not coherent with the LLC cache on gen6. As
4216 * a result, we make sure that the pinning that is about to occur is
4217 * done with uncached PTEs. This is lowest common denominator for all
4218 * chipsets.
4219 *
4220 * However for gen6+, we could do better by using the GFDT bit instead
4221 * of uncaching, which would allow us to flush all the LLC-cached data
4222 * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4223 */
4224 ret = i915_gem_object_set_cache_level(obj,
4225 HAS_WT(to_i915(obj->base.dev)) ?
4226 I915_CACHE_WT : I915_CACHE_NONE);
4227 if (ret) {
4228 vma = ERR_PTR(ret);
4229 goto err_unpin_global;
4230 }
4231
4232 /* As the user may map the buffer once pinned in the display plane
4233 * (e.g. libkms for the bootup splash), we have to ensure that we
4234 * always use map_and_fenceable for all scanout buffers. However,
4235 * it may simply be too big to fit into mappable, in which case
4236 * put it anyway and hope that userspace can cope (but always first
4237 * try to preserve the existing ABI).
4238 */
4239 vma = ERR_PTR(-ENOSPC);
4240 if ((flags & PIN_MAPPABLE) == 0 &&
4241 (!view || view->type == I915_GGTT_VIEW_NORMAL))
4242 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4243 flags |
4244 PIN_MAPPABLE |
4245 PIN_NONBLOCK);
4246 if (IS_ERR(vma))
4247 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4248 if (IS_ERR(vma))
4249 goto err_unpin_global;
4250
4251 vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4252
4253 __i915_gem_object_flush_for_display(obj);
4254
4255 /* It should now be out of any other write domains, and we can update
4256 * the domain values for our changes.
4257 */
4258 obj->read_domains |= I915_GEM_DOMAIN_GTT;
4259
4260 return vma;
4261
4262err_unpin_global:
4263 obj->pin_global--;
4264 return vma;
4265}
4266
4267void
4268i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4269{
4270 lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4271
4272 if (WARN_ON(vma->obj->pin_global == 0))
4273 return;
4274
4275 if (--vma->obj->pin_global == 0)
4276 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4277
4278 /* Bump the LRU to try and avoid premature eviction whilst flipping */
4279 i915_gem_object_bump_inactive_ggtt(vma->obj);
4280
4281 i915_vma_unpin(vma);
4282}
4283
4284/**
4285 * Moves a single object to the CPU read, and possibly write domain.
4286 * @obj: object to act on
4287 * @write: requesting write or read-only access
4288 *
4289 * This function returns when the move is complete, including waiting on
4290 * flushes to occur.
4291 */
4292int
4293i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4294{
4295 int ret;
4296
4297 lockdep_assert_held(&obj->base.dev->struct_mutex);
4298
4299 ret = i915_gem_object_wait(obj,
4300 I915_WAIT_INTERRUPTIBLE |
4301 I915_WAIT_LOCKED |
4302 (write ? I915_WAIT_ALL : 0),
4303 MAX_SCHEDULE_TIMEOUT,
4304 NULL);
4305 if (ret)
4306 return ret;
4307
4308 flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4309
4310 /* Flush the CPU cache if it's still invalid. */
4311 if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4312 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4313 obj->read_domains |= I915_GEM_DOMAIN_CPU;
4314 }
4315
4316 /* It should now be out of any other write domains, and we can update
4317 * the domain values for our changes.
4318 */
4319 GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4320
4321 /* If we're writing through the CPU, then the GPU read domains will
4322 * need to be invalidated at next use.
4323 */
4324 if (write)
4325 __start_cpu_write(obj);
4326
4327 return 0;
4328}
4329
4330/* Throttle our rendering by waiting until the ring has completed our requests
4331 * emitted over 20 msec ago.
4332 *
4333 * Note that if we were to use the current jiffies each time around the loop,
4334 * we wouldn't escape the function with any frames outstanding if the time to
4335 * render a frame was over 20ms.
4336 *
4337 * This should get us reasonable parallelism between CPU and GPU but also
4338 * relatively low latency when blocking on a particular request to finish.
4339 */
4340static int
4341i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4342{
4343 struct drm_i915_private *dev_priv = to_i915(dev);
4344 struct drm_i915_file_private *file_priv = file->driver_priv;
4345 unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4346 struct i915_request *request, *target = NULL;
4347 long ret;
4348
4349 /* ABI: return -EIO if already wedged */
4350 if (i915_terminally_wedged(&dev_priv->gpu_error))
4351 return -EIO;
4352
4353 spin_lock(&file_priv->mm.lock);
4354 list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4355 if (time_after_eq(request->emitted_jiffies, recent_enough))
4356 break;
4357
4358 if (target) {
4359 list_del(&target->client_link);
4360 target->file_priv = NULL;
4361 }
4362
4363 target = request;
4364 }
4365 if (target)
4366 i915_request_get(target);
4367 spin_unlock(&file_priv->mm.lock);
4368
4369 if (target == NULL)
4370 return 0;
4371
4372 ret = i915_request_wait(target,
4373 I915_WAIT_INTERRUPTIBLE,
4374 MAX_SCHEDULE_TIMEOUT);
4375 i915_request_put(target);
4376
4377 return ret < 0 ? ret : 0;
4378}
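
/*
 * Example of the target selection above: with client requests emitted at
 * t-50ms, t-30ms and t-10ms, the loop walks oldest to newest, records
 * t-50ms and then t-30ms as the provisional target, and breaks at t-10ms
 * (inside the 20ms window). We then block on the t-30ms request, the
 * newest one outside the window; once it completes, everything older has
 * retired and the client may submit again.
 */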
4379
4380struct i915_vma *
4381i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4382 const struct i915_ggtt_view *view,
4383 u64 size,
4384 u64 alignment,
4385 u64 flags)
4386{
4387 struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4388 struct i915_address_space *vm = &dev_priv->ggtt.base;
4389 struct i915_vma *vma;
4390 int ret;
4391
4392 lockdep_assert_held(&obj->base.dev->struct_mutex);
4393
4394 if (flags & PIN_MAPPABLE &&
4395 (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4396 /* If the required space is larger than the available
4397 * aperture, we will not be able to find a slot for the
4398 * object and unbinding the object now will be in
4399 * vain. Worse, doing so may cause us to ping-pong
4400 * the object in and out of the Global GTT and
4401 * waste a lot of cycles under the mutex.
4402 */
4403 if (obj->base.size > dev_priv->ggtt.mappable_end)
4404 return ERR_PTR(-E2BIG);
4405
4406 /* If NONBLOCK is set the caller is optimistically
4407 * trying to cache the full object within the mappable
4408 * aperture, and *must* have a fallback in place for
4409 * situations where we cannot bind the object. We
4410 * can be a little more lax here and use the fallback
4411 * more often to avoid costly migrations of ourselves
4412 * and other objects within the aperture.
4413 *
4414 * Half-the-aperture is used as a simple heuristic.
4415 * More interesting would be to search for a free
4416 * block prior to making the commitment to unbind.
4417 * That caters for the self-harm case, and with a
4418 * little more heuristics (e.g. NOFAULT, NOEVICT)
4419 * we could try to minimise harm to others.
4420 */
4421 if (flags & PIN_NONBLOCK &&
4422 obj->base.size > dev_priv->ggtt.mappable_end / 2)
4423 return ERR_PTR(-ENOSPC);
4424 }
4425
4426 vma = i915_vma_instance(obj, vm, view);
4427 if (unlikely(IS_ERR(vma)))
4428 return vma;
4429
4430 if (i915_vma_misplaced(vma, size, alignment, flags)) {
4431 if (flags & PIN_NONBLOCK) {
4432 if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4433 return ERR_PTR(-ENOSPC);
4434
4435 if (flags & PIN_MAPPABLE &&
4436 vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4437 return ERR_PTR(-ENOSPC);
4438 }
4439
4440 WARN(i915_vma_is_pinned(vma),
4441 "bo is already pinned in ggtt with incorrect alignment:"
4442 " offset=%08x, req.alignment=%llx,"
4443 " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4444 i915_ggtt_offset(vma), alignment,
4445 !!(flags & PIN_MAPPABLE),
4446 i915_vma_is_map_and_fenceable(vma));
4447 ret = i915_vma_unbind(vma);
4448 if (ret)
4449 return ERR_PTR(ret);
4450 }
4451
4452 ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
4453 if (ret)
4454 return ERR_PTR(ret);
4455
4456 return vma;
4457}
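
/*
 * Example of the half-aperture heuristic above: with a 256MiB mappable
 * aperture, a PIN_MAPPABLE | PIN_NONBLOCK request for an object larger
 * than 128MiB fails fast with -ENOSPC, on the expectation that the
 * caller has an unmapped fallback, rather than evicting half of the
 * aperture; an object larger than the whole 256MiB can never fit and is
 * rejected outright with -E2BIG.
 */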
4458
4459static __always_inline unsigned int __busy_read_flag(unsigned int id)
4460{
4461 /* Note that we could alias engines in the execbuf API, but
4462 * that would be very unwise as it prevents userspace from
4463 * exercising fine control over engine selection. Ahem.
4464 *
4465 * This should be something like EXEC_MAX_ENGINE instead of
4466 * I915_NUM_ENGINES.
4467 */
4468 BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4469 return 0x10000 << id;
4470}
4471
4472static __always_inline unsigned int __busy_write_id(unsigned int id)
4473{
4474 /* The uABI guarantees an active writer is also amongst the read
4475 * engines. This would be true if we accessed the activity tracking
4476 * under the lock, but as we perform the lookup of the object and
4477 * its activity locklessly we can not guarantee that the last_write
4478 * being active implies that we have set the same engine flag from
4479 * last_read - hence we always set both read and write busy for
4480 * last_write.
4481 */
4482 return id | __busy_read_flag(id);
4483}
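
/*
 * Worked encoding example: a write on uabi engine id 1 plus extra readers
 * on ids 0 and 2 reports
 *
 *	busy = __busy_write_id(1) | __busy_read_flag(0) | __busy_read_flag(2)
 *	     = (1 | 0x10000 << 1) | (0x10000 << 0) | (0x10000 << 2)
 *	     = 0x70001
 *
 * i.e. the writer's id in the low 16 bits and the mask of reading engines
 * in the upper 16 bits, with the writer counted amongst the readers.
 */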
4484
4485static __always_inline unsigned int
4486__busy_set_if_active(const struct dma_fence *fence,
4487 unsigned int (*flag)(unsigned int id))
4488{
4489 struct i915_request *rq;
4490
4491 /* We have to check the current hw status of the fence as the uABI
4492 * guarantees forward progress. We could rely on the idle worker
4493 * to eventually flush us, but to minimise latency just ask the
4494 * hardware.
4495 *
4496 * Note we only report on the status of native fences.
4497 */
4498 if (!dma_fence_is_i915(fence))
4499 return 0;
4500
4501 /* opencode to_request() in order to avoid const warnings */
4502 rq = container_of(fence, struct i915_request, fence);
4503 if (i915_request_completed(rq))
4504 return 0;
4505
4506 return flag(rq->engine->uabi_id);
4507}
4508
4509static __always_inline unsigned int
4510busy_check_reader(const struct dma_fence *fence)
4511{
4512 return __busy_set_if_active(fence, __busy_read_flag);
4513}
4514
4515static __always_inline unsigned int
4516busy_check_writer(const struct dma_fence *fence)
4517{
4518 if (!fence)
4519 return 0;
4520
4521 return __busy_set_if_active(fence, __busy_write_id);
4522}
4523
4524int
4525i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4526 struct drm_file *file)
4527{
4528 struct drm_i915_gem_busy *args = data;
4529 struct drm_i915_gem_object *obj;
4530 struct reservation_object_list *list;
4531 unsigned int seq;
4532 int err;
4533
4534 err = -ENOENT;
4535 rcu_read_lock();
4536 obj = i915_gem_object_lookup_rcu(file, args->handle);
4537 if (!obj)
4538 goto out;
4539
4540 /* A discrepancy here is that we do not report the status of
4541 * non-i915 fences, i.e. even though we may report the object as idle,
4542 * a call to set-domain may still stall waiting for foreign rendering.
4543 * This also means that wait-ioctl may report an object as busy,
4544 * where busy-ioctl considers it idle.
4545 *
4546 * We trade the ability to warn of foreign fences to report on which
4547 * i915 engines are active for the object.
4548 *
4549 * Alternatively, we can trade that extra information on read/write
4550 * activity with
4551 * args->busy =
4552 * !reservation_object_test_signaled_rcu(obj->resv, true);
4553 * to report the overall busyness. This is what the wait-ioctl does.
4554 *
4555 */
4556retry:
4557 seq = raw_read_seqcount(&obj->resv->seq);
4558
4559 /* Translate the exclusive fence to the READ *and* WRITE engine */
4560 args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4561
4562 /* Translate shared fences to READ set of engines */
4563 list = rcu_dereference(obj->resv->fence);
4564 if (list) {
4565 unsigned int shared_count = list->shared_count, i;
4566
4567 for (i = 0; i < shared_count; ++i) {
4568 struct dma_fence *fence =
4569 rcu_dereference(list->shared[i]);
4570
4571 args->busy |= busy_check_reader(fence);
4572 }
4573 }
4574
4575 if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4576 goto retry;
4577
4578 err = 0;
4579out:
4580 rcu_read_unlock();
4581 return err;
4582}
4583
4584int
4585i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4586 struct drm_file *file_priv)
4587{
4588 return i915_gem_ring_throttle(dev, file_priv);
4589}
4590
4591int
4592i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4593 struct drm_file *file_priv)
4594{
4595 struct drm_i915_private *dev_priv = to_i915(dev);
4596 struct drm_i915_gem_madvise *args = data;
4597 struct drm_i915_gem_object *obj;
4598 int err;
4599
4600 switch (args->madv) {
4601 case I915_MADV_DONTNEED:
4602 case I915_MADV_WILLNEED:
4603 break;
4604 default:
4605 return -EINVAL;
4606 }
4607
4608 obj = i915_gem_object_lookup(file_priv, args->handle);
4609 if (!obj)
4610 return -ENOENT;
4611
4612 err = mutex_lock_interruptible(&obj->mm.lock);
4613 if (err)
4614 goto out;
4615
4616 if (i915_gem_object_has_pages(obj) &&
4617 i915_gem_object_is_tiled(obj) &&
4618 dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4619 if (obj->mm.madv == I915_MADV_WILLNEED) {
4620 GEM_BUG_ON(!obj->mm.quirked);
4621 __i915_gem_object_unpin_pages(obj);
4622 obj->mm.quirked = false;
4623 }
4624 if (args->madv == I915_MADV_WILLNEED) {
4625 GEM_BUG_ON(obj->mm.quirked);
4626 __i915_gem_object_pin_pages(obj);
4627 obj->mm.quirked = true;
4628 }
4629 }
4630
4631 if (obj->mm.madv != __I915_MADV_PURGED)
4632 obj->mm.madv = args->madv;
4633
4634 /* if the object is no longer attached, discard its backing storage */
4635 if (obj->mm.madv == I915_MADV_DONTNEED &&
4636 !i915_gem_object_has_pages(obj))
4637 i915_gem_object_truncate(obj);
4638
4639 args->retained = obj->mm.madv != __I915_MADV_PURGED;
4640 mutex_unlock(&obj->mm.lock);
4641
4642out:
4643 i915_gem_object_put(obj);
4644 return err;
4645}
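
/*
 * Illustrative userspace sketch (not driver code): a buffer cache can mark
 * idle buffers as expendable and later try to reclaim them, assuming a
 * libdrm-style drmIoctl() wrapper:
 *
 *	struct drm_i915_gem_madvise madv = {
 *		.handle = bo,
 *		.madv = I915_MADV_DONTNEED, // shrinker may purge the pages
 *	};
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_MADVISE, &madv);
 *	...
 *	madv.madv = I915_MADV_WILLNEED; // take it back out of the cache
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_MADVISE, &madv);
 *	if (!madv.retained)
 *		; // backing store was purged; contents lost, reallocate
 */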
4646
4647static void
4648frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4649{
4650 struct drm_i915_gem_object *obj =
4651 container_of(active, typeof(*obj), frontbuffer_write);
4652
4653 intel_fb_obj_flush(obj, ORIGIN_CS);
4654}
4655
4656void i915_gem_object_init(struct drm_i915_gem_object *obj,
4657 const struct drm_i915_gem_object_ops *ops)
4658{
4659 mutex_init(&obj->mm.lock);
4660
4661 INIT_LIST_HEAD(&obj->vma_list);
4662 INIT_LIST_HEAD(&obj->lut_list);
4663 INIT_LIST_HEAD(&obj->batch_pool_link);
4664
4665 obj->ops = ops;
4666
4667 reservation_object_init(&obj->__builtin_resv);
4668 obj->resv = &obj->__builtin_resv;
4669
4670 obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4671 init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4672
4673 obj->mm.madv = I915_MADV_WILLNEED;
4674 INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4675 mutex_init(&obj->mm.get_page.lock);
4676
4677 i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4678}
4679
4680static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4681 .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4682 I915_GEM_OBJECT_IS_SHRINKABLE,
4683
4684 .get_pages = i915_gem_object_get_pages_gtt,
4685 .put_pages = i915_gem_object_put_pages_gtt,
4686
4687 .pwrite = i915_gem_object_pwrite_gtt,
4688};
4689
4690static int i915_gem_object_create_shmem(struct drm_device *dev,
4691 struct drm_gem_object *obj,
4692 size_t size)
4693{
4694 struct drm_i915_private *i915 = to_i915(dev);
4695 unsigned long flags = VM_NORESERVE;
4696 struct file *filp;
4697
4698 drm_gem_private_object_init(dev, obj, size);
4699
4700 if (i915->mm.gemfs)
4701 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4702 flags);
4703 else
4704 filp = shmem_file_setup("i915", size, flags);
4705
4706 if (IS_ERR(filp))
4707 return PTR_ERR(filp);
4708
4709 obj->filp = filp;
4710
4711 return 0;
4712}
4713
4714struct drm_i915_gem_object *
4715i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4716{
4717 struct drm_i915_gem_object *obj;
4718 struct address_space *mapping;
4719 unsigned int cache_level;
4720 gfp_t mask;
4721 int ret;
4722
4723 /* There is a prevalence of the assumption that we fit the object's
4724 * page count inside a 32bit _signed_ variable. Let's document this and
4725 * catch if we ever need to fix it. In the meantime, if you do spot
4726 * such a local variable, please consider fixing!
4727 */
4728 if (size >> PAGE_SHIFT > INT_MAX)
4729 return ERR_PTR(-E2BIG);
4730
4731 if (overflows_type(size, obj->base.size))
4732 return ERR_PTR(-E2BIG);
4733
4734 obj = i915_gem_object_alloc(dev_priv);
4735 if (obj == NULL)
4736 return ERR_PTR(-ENOMEM);
4737
4738 ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4739 if (ret)
4740 goto fail;
4741
4742 mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4743 if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4744 /* 965gm cannot relocate objects above 4GiB. */
4745 mask &= ~__GFP_HIGHMEM;
4746 mask |= __GFP_DMA32;
4747 }
4748
4749 mapping = obj->base.filp->f_mapping;
4750 mapping_set_gfp_mask(mapping, mask);
4751 GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4752
4753 i915_gem_object_init(obj, &i915_gem_object_ops);
4754
4755 obj->write_domain = I915_GEM_DOMAIN_CPU;
4756 obj->read_domains = I915_GEM_DOMAIN_CPU;
4757
4758 if (HAS_LLC(dev_priv))
4759 /* On some devices, we can have the GPU use the LLC (the CPU
4760 * cache) for about a 10% performance improvement
4761 * compared to uncached. Graphics requests other than
4762 * display scanout are coherent with the CPU in
4763 * accessing this cache. This means in this mode we
4764 * don't need to clflush on the CPU side, and on the
4765 * GPU side we only need to flush internal caches to
4766 * get data visible to the CPU.
4767 *
4768 * However, we maintain the display planes as UC, and so
4769 * need to rebind when first used as such.
4770 */
4771 cache_level = I915_CACHE_LLC;
4772 else
4773 cache_level = I915_CACHE_NONE;
4774
4775 i915_gem_object_set_cache_coherency(obj, cache_level);
4776
4777 trace_i915_gem_object_create(obj);
4778
4779 return obj;
4780
4781fail:
4782 i915_gem_object_free(obj);
4783 return ERR_PTR(ret);
4784}
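
/*
 * Minimal kernel-internal usage sketch for the constructor above (error
 * handling trimmed to the essentials; the rounding is the caller's choice):
 *
 *	obj = i915_gem_object_create(i915, round_up(size, PAGE_SIZE));
 *	if (IS_ERR(obj))
 *		return PTR_ERR(obj);
 *	...
 *	i915_gem_object_put(obj); // drop the initial reference when done
 */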
4785
4786static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4787{
4788 /* If we are the last user of the backing storage (be it shmemfs
4789 * pages or stolen etc), we know that the pages are going to be
4790 * immediately released. In this case, we can then skip copying
4791 * back the contents from the GPU.
4792 */
4793
4794 if (obj->mm.madv != I915_MADV_WILLNEED)
4795 return false;
4796
4797 if (obj->base.filp == NULL)
4798 return true;
4799
4800 /* At first glance, this looks racy, but then again so would be
4801 * userspace racing mmap against close. However, the first external
4802 * reference to the filp can only be obtained through the
4803 * i915_gem_mmap_ioctl() which safeguards us against the user
4804 * acquiring such a reference whilst we are in the middle of
4805 * freeing the object.
4806 */
4807 return atomic_long_read(&obj->base.filp->f_count) == 1;
4808}
4809
4810static void __i915_gem_free_objects(struct drm_i915_private *i915,
4811 struct llist_node *freed)
4812{
4813 struct drm_i915_gem_object *obj, *on;
4814
4815 intel_runtime_pm_get(i915);
4816 llist_for_each_entry_safe(obj, on, freed, freed) {
4817 struct i915_vma *vma, *vn;
4818
4819 trace_i915_gem_object_destroy(obj);
4820
4821 mutex_lock(&i915->drm.struct_mutex);
4822
4823 GEM_BUG_ON(i915_gem_object_is_active(obj));
4824 list_for_each_entry_safe(vma, vn,
4825 &obj->vma_list, obj_link) {
4826 GEM_BUG_ON(i915_vma_is_active(vma));
4827 vma->flags &= ~I915_VMA_PIN_MASK;
4828 i915_vma_destroy(vma);
4829 }
4830 GEM_BUG_ON(!list_empty(&obj->vma_list));
4831 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4832
4833 /* This serializes freeing with the shrinker. Since the free
4834 * is delayed, first by RCU then by the workqueue, we want the
4835 * shrinker to be able to free pages of unreferenced objects,
4836 * or else we may oom whilst there are plenty of deferred
4837 * freed objects.
4838 */
4839 if (i915_gem_object_has_pages(obj)) {
4840 spin_lock(&i915->mm.obj_lock);
4841 list_del_init(&obj->mm.link);
4842 spin_unlock(&i915->mm.obj_lock);
4843 }
4844
4845 mutex_unlock(&i915->drm.struct_mutex);
4846
4847 GEM_BUG_ON(obj->bind_count);
4848 GEM_BUG_ON(obj->userfault_count);
4849 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4850 GEM_BUG_ON(!list_empty(&obj->lut_list));
4851
4852 if (obj->ops->release)
4853 obj->ops->release(obj);
4854
4855 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4856 atomic_set(&obj->mm.pages_pin_count, 0);
4857 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4858 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4859
4860 if (obj->base.import_attach)
4861 drm_prime_gem_destroy(&obj->base, NULL);
4862
4863 reservation_object_fini(&obj->__builtin_resv);
4864 drm_gem_object_release(&obj->base);
4865 i915_gem_info_remove_obj(i915, obj->base.size);
4866
4867 kfree(obj->bit_17);
4868 i915_gem_object_free(obj);
4869
4870 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4871 atomic_dec(&i915->mm.free_count);
4872
4873 if (on)
4874 cond_resched();
4875 }
4876 intel_runtime_pm_put(i915);
4877}
4878
4879static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4880{
4881 struct llist_node *freed;
4882
4883 /* Free the oldest, most stale object to keep the free_list short */
4884 freed = NULL;
4885 if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4886 /* Only one consumer of llist_del_first() allowed */
4887 spin_lock(&i915->mm.free_lock);
4888 freed = llist_del_first(&i915->mm.free_list);
4889 spin_unlock(&i915->mm.free_lock);
4890 }
4891 if (unlikely(freed)) {
4892 freed->next = NULL;
4893 __i915_gem_free_objects(i915, freed);
4894 }
4895}
4896
4897static void __i915_gem_free_work(struct work_struct *work)
4898{
4899 struct drm_i915_private *i915 =
4900 container_of(work, struct drm_i915_private, mm.free_work);
4901 struct llist_node *freed;
4902
4903 /*
4904 * All file-owned VMA should have been released by this point through
4905 * i915_gem_close_object(), or earlier by i915_gem_context_close().
4906 * However, the object may also be bound into the global GTT (e.g.
4907 * older GPUs without per-process support, or for direct access through
4908 * the GTT either for the user or for scanout). Those VMA still need to
4909 * be unbound now.
4910 */
4911
4912 spin_lock(&i915->mm.free_lock);
4913 while ((freed = llist_del_all(&i915->mm.free_list))) {
4914 spin_unlock(&i915->mm.free_lock);
4915
4916 __i915_gem_free_objects(i915, freed);
4917 if (need_resched())
4918 return;
4919
4920 spin_lock(&i915->mm.free_lock);
4921 }
4922 spin_unlock(&i915->mm.free_lock);
4923}
4924
4925static void __i915_gem_free_object_rcu(struct rcu_head *head)
4926{
4927 struct drm_i915_gem_object *obj =
4928 container_of(head, typeof(*obj), rcu);
4929 struct drm_i915_private *i915 = to_i915(obj->base.dev);
4930
4931 /*
4932 * Since we require blocking on struct_mutex to unbind the freed
4933 * object from the GPU before releasing resources back to the
4934 * system, we can not do that directly from the RCU callback (which may
4935 * be a softirq context), but must instead defer that work onto a
4936 * kthread. We use the RCU callback rather than move the freed object
4937 * directly onto the work queue so that we can mix between using the
4938 * worker and performing frees directly from subsequent allocations for
4939 * crude but effective memory throttling.
4940 */
4941 if (llist_add(&obj->freed, &i915->mm.free_list))
4942 queue_work(i915->wq, &i915->mm.free_work);
4943}
4944
4945void i915_gem_free_object(struct drm_gem_object *gem_obj)
4946{
4947 struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4948
4949 if (obj->mm.quirked)
4950 __i915_gem_object_unpin_pages(obj);
4951
4952 if (discard_backing_storage(obj))
4953 obj->mm.madv = I915_MADV_DONTNEED;
4954
4955 /*
4956 * Before we free the object, make sure any pure RCU-only
4957 * read-side critical sections are complete, e.g.
4958 * i915_gem_busy_ioctl(). For the corresponding synchronized
4959 * lookup see i915_gem_object_lookup_rcu().
4960 */
4961 atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4962 call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4963}
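
/*
 * Sketch of the deferred-free pipeline implemented above:
 *
 *	i915_gem_object_put()                 // final reference dropped
 *	  -> i915_gem_free_object()           // mark storage, bump free_count
 *	    -> call_rcu()                     // let lockless readers drain
 *	      -> __i915_gem_free_object_rcu() // softirq: may not block
 *	        -> llist_add() + queue_work() // hand off to process context
 *	          -> __i915_gem_free_work()   // takes struct_mutex, unbinds
 *	            -> __i915_gem_free_objects()
 */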
4964
4965void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4966{
4967 lockdep_assert_held(&obj->base.dev->struct_mutex);
4968
4969 if (!i915_gem_object_has_active_reference(obj) &&
4970 i915_gem_object_is_active(obj))
4971 i915_gem_object_set_active_reference(obj);
4972 else
4973 i915_gem_object_put(obj);
4974}
4975
4976static void assert_kernel_context_is_current(struct drm_i915_private *i915)
4977{
4978 struct i915_gem_context *kernel_context = i915->kernel_context;
4979 struct intel_engine_cs *engine;
4980 enum intel_engine_id id;
4981
4982 for_each_engine(engine, i915, id) {
4983 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
4984 GEM_BUG_ON(engine->last_retired_context != kernel_context);
4985 }
4986}
4987
4988void i915_gem_sanitize(struct drm_i915_private *i915)
4989{
4990 if (i915_terminally_wedged(&i915->gpu_error)) {
4991 mutex_lock(&i915->drm.struct_mutex);
4992 i915_gem_unset_wedged(i915);
4993 mutex_unlock(&i915->drm.struct_mutex);
4994 }
4995
4996 /*
4997 * If we inherit context state from the BIOS or earlier occupants
4998 * of the GPU, the GPU may be in an inconsistent state when we
4999 * try to take over. The only way to remove the earlier state
5000 * is by resetting. However, resetting on earlier gen is tricky as
5001 * it may impact the display and we are uncertain about the stability
5002 * of the reset, so we only attempt it on gen5+ where we trust it to work.
5003 */
5004 if (INTEL_GEN(i915) >= 5 && intel_has_gpu_reset(i915))
5005 WARN_ON(intel_gpu_reset(i915, ALL_ENGINES));
5006}
5007
5008int i915_gem_suspend(struct drm_i915_private *dev_priv)
5009{
5010 struct drm_device *dev = &dev_priv->drm;
5011 int ret;
5012
5013 intel_runtime_pm_get(dev_priv);
5014 intel_suspend_gt_powersave(dev_priv);
5015
5016 mutex_lock(&dev->struct_mutex);
5017
5018 /* We have to flush all the executing contexts to main memory so
5019 * that they can be saved in the hibernation image. To ensure the last
5020 * context image is coherent, we have to switch away from it. That
5021 * leaves the dev_priv->kernel_context still active when
5022 * we actually suspend, and its image in memory may not match the GPU
5023 * state. Fortunately, the kernel_context is disposable and we do
5024 * not rely on its state.
5025 */
5026 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5027 ret = i915_gem_switch_to_kernel_context(dev_priv);
5028 if (ret)
5029 goto err_unlock;
5030
5031 ret = i915_gem_wait_for_idle(dev_priv,
5032 I915_WAIT_INTERRUPTIBLE |
5033 I915_WAIT_LOCKED);
5034 if (ret && ret != -EIO)
5035 goto err_unlock;
5036
5037 assert_kernel_context_is_current(dev_priv);
5038 }
5039 i915_gem_contexts_lost(dev_priv);
5040 mutex_unlock(&dev->struct_mutex);
5041
5042 intel_uc_suspend(dev_priv);
5043
5044 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
5045 cancel_delayed_work_sync(&dev_priv->gt.retire_work);
5046
5047 /* As the idle_work rearms itself if it detects a race, play safe and
5048 * repeat the flush until it is definitely idle.
5049 */
5050 drain_delayed_work(&dev_priv->gt.idle_work);
5051
5052 /* Assert that we successfully flushed all the work and
5053 * reset the GPU back to its idle, low power state.
5054 */
5055 WARN_ON(dev_priv->gt.awake);
5056 if (WARN_ON(!intel_engines_are_idle(dev_priv)))
5057 i915_gem_set_wedged(dev_priv); /* no hope, discard everything */
5058
5059 /*
5060 * Neither the BIOS, ourselves or any other kernel
5061 * expects the system to be in execlists mode on startup,
5062 * so we need to reset the GPU back to legacy mode. And the only
5063 * known way to disable logical contexts is through a GPU reset.
5064 *
5065 * So in order to leave the system in a known default configuration,
5066 * always reset the GPU upon unload and suspend. Afterwards we then
5067 * clean up the GEM state tracking, flushing off the requests and
5068 * leaving the system in a known idle state.
5069 *
5070 * Note that it is of the utmost importance that the GPU is idle and
5071 * all stray writes are flushed *before* we dismantle the backing
5072 * storage for the pinned objects.
5073 *
5074 * However, since we are uncertain that resetting the GPU on older
5075 * machines is a good idea, we don't - just in case it leaves the
5076 * machine in an unusable condition.
5077 */
5078 intel_uc_sanitize(dev_priv);
5079 i915_gem_sanitize(dev_priv);
5080
5081 intel_runtime_pm_put(dev_priv);
5082 return 0;
5083
5084err_unlock:
5085 mutex_unlock(&dev->struct_mutex);
5086 intel_runtime_pm_put(dev_priv);
5087 return ret;
5088}
5089
5090void i915_gem_resume(struct drm_i915_private *i915)
5091{
5092 WARN_ON(i915->gt.awake);
5093
5094 mutex_lock(&i915->drm.struct_mutex);
5095 intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5096
5097 i915_gem_restore_gtt_mappings(i915);
5098 i915_gem_restore_fences(i915);
5099
5100 /*
5101 * As we didn't flush the kernel context before suspend, we cannot
5102 * guarantee that the context image is complete. So let's just reset
5103 * it and start again.
5104 */
5105 i915->gt.resume(i915);
5106
5107 if (i915_gem_init_hw(i915))
5108 goto err_wedged;
5109
5110 intel_uc_resume(i915);
5111
5112 /* Always reload a context for powersaving. */
5113 if (i915_gem_switch_to_kernel_context(i915))
5114 goto err_wedged;
5115
5116out_unlock:
5117 intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5118 mutex_unlock(&i915->drm.struct_mutex);
5119 return;
5120
5121err_wedged:
5122 if (!i915_terminally_wedged(&i915->gpu_error)) {
5123 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5124 i915_gem_set_wedged(i915);
5125 }
5126 goto out_unlock;
5127}
5128
5129void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5130{
5131 if (INTEL_GEN(dev_priv) < 5 ||
5132 dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5133 return;
5134
5135 I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5136 DISP_TILE_SURFACE_SWIZZLING);
5137
5138 if (IS_GEN5(dev_priv))
5139 return;
5140
5141 I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5142 if (IS_GEN6(dev_priv))
5143 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5144 else if (IS_GEN7(dev_priv))
5145 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5146 else if (IS_GEN8(dev_priv))
5147 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5148 else
5149 BUG();
5150}
5151
5152static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5153{
5154 I915_WRITE(RING_CTL(base), 0);
5155 I915_WRITE(RING_HEAD(base), 0);
5156 I915_WRITE(RING_TAIL(base), 0);
5157 I915_WRITE(RING_START(base), 0);
5158}
5159
5160static void init_unused_rings(struct drm_i915_private *dev_priv)
5161{
5162 if (IS_I830(dev_priv)) {
5163 init_unused_ring(dev_priv, PRB1_BASE);
5164 init_unused_ring(dev_priv, SRB0_BASE);
5165 init_unused_ring(dev_priv, SRB1_BASE);
5166 init_unused_ring(dev_priv, SRB2_BASE);
5167 init_unused_ring(dev_priv, SRB3_BASE);
5168 } else if (IS_GEN2(dev_priv)) {
5169 init_unused_ring(dev_priv, SRB0_BASE);
5170 init_unused_ring(dev_priv, SRB1_BASE);
5171 } else if (IS_GEN3(dev_priv)) {
5172 init_unused_ring(dev_priv, PRB1_BASE);
5173 init_unused_ring(dev_priv, PRB2_BASE);
5174 }
5175}
5176
5177static int __i915_gem_restart_engines(void *data)
5178{
5179 struct drm_i915_private *i915 = data;
5180 struct intel_engine_cs *engine;
5181 enum intel_engine_id id;
5182 int err;
5183
5184 for_each_engine(engine, i915, id) {
5185 err = engine->init_hw(engine);
5186 if (err) {
5187 DRM_ERROR("Failed to restart %s (%d)\n",
5188 engine->name, err);
5189 return err;
5190 }
5191 }
5192
5193 return 0;
5194}
5195
5196int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5197{
5198 int ret;
5199
5200 dev_priv->gt.last_init_time = ktime_get();
5201
5202 /* Double layer security blanket, see i915_gem_init() */
5203 intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5204
5205 if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5206 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5207
5208 if (IS_HASWELL(dev_priv))
5209 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5210 LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5211
5212 if (HAS_PCH_NOP(dev_priv)) {
5213 if (IS_IVYBRIDGE(dev_priv)) {
5214 u32 temp = I915_READ(GEN7_MSG_CTL);
5215 temp &= ~(WAIT_FOR_PCH_FLR_ACK | WAIT_FOR_PCH_RESET_ACK);
5216 I915_WRITE(GEN7_MSG_CTL, temp);
5217 } else if (INTEL_GEN(dev_priv) >= 7) {
5218 u32 temp = I915_READ(HSW_NDE_RSTWRN_OPT);
5219 temp &= ~RESET_PCH_HANDSHAKE_ENABLE;
5220 I915_WRITE(HSW_NDE_RSTWRN_OPT, temp);
5221 }
5222 }
5223
5224 intel_gt_workarounds_apply(dev_priv);
5225
5226 i915_gem_init_swizzling(dev_priv);
5227
5228 /*
5229 * At least 830 can leave some of the unused rings
5230 * "active" (ie. head != tail) after resume which
5231 * will prevent c3 entry. Make sure all unused rings
5232 * are totally idle.
5233 */
5234 init_unused_rings(dev_priv);
5235
5236 BUG_ON(!dev_priv->kernel_context);
5237 if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5238 ret = -EIO;
5239 goto out;
5240 }
5241
5242 ret = i915_ppgtt_init_hw(dev_priv);
5243 if (ret) {
5244 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5245 goto out;
5246 }
5247
5248 ret = intel_wopcm_init_hw(&dev_priv->wopcm);
5249 if (ret) {
5250 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
5251 goto out;
5252 }
5253
5254 /* We can't enable contexts until all firmware is loaded */
5255 ret = intel_uc_init_hw(dev_priv);
5256 if (ret) {
5257 DRM_ERROR("Enabling uc failed (%d)\n", ret);
5258 goto out;
5259 }
5260
5261 intel_mocs_init_l3cc_table(dev_priv);
5262
5263 /* Only when the HW is re-initialised, can we replay the requests */
5264 ret = __i915_gem_restart_engines(dev_priv);
5265out:
5266 intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5267 return ret;
5268}
5269
5270static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5271{
5272 struct i915_gem_context *ctx;
5273 struct intel_engine_cs *engine;
5274 enum intel_engine_id id;
5275 int err;
5276
5277 /*
5278 * As we reset the gpu during very early sanitisation, the current
5279 * register state on the GPU should reflect its default values.
5280 * We load a context onto the hw (with restore-inhibit), then switch
5281 * over to a second context to save that default register state. We
5282 * can then prime every new context with that state so they all start
5283 * from the same default HW values.
5284 */
5285
5286 ctx = i915_gem_context_create_kernel(i915, 0);
5287 if (IS_ERR(ctx))
5288 return PTR_ERR(ctx);
5289
5290 for_each_engine(engine, i915, id) {
5291 struct i915_request *rq;
5292
5293 rq = i915_request_alloc(engine, ctx);
5294 if (IS_ERR(rq)) {
5295 err = PTR_ERR(rq);
5296 goto out_ctx;
5297 }
5298
5299 err = 0;
5300 if (engine->init_context)
5301 err = engine->init_context(rq);
5302
5303 __i915_request_add(rq, true);
5304 if (err)
5305 goto err_active;
5306 }
5307
5308 err = i915_gem_switch_to_kernel_context(i915);
5309 if (err)
5310 goto err_active;
5311
5312 err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
5313 if (err)
5314 goto err_active;
5315
5316 assert_kernel_context_is_current(i915);
5317
5318 for_each_engine(engine, i915, id) {
5319 struct i915_vma *state;
5320
5321 state = to_intel_context(ctx, engine)->state;
5322 if (!state)
5323 continue;
5324
5325 /*
5326 * As we will hold a reference to the logical state, it will
5327 * not be torn down with the context, and importantly the
5328 * object will hold onto its vma (making it possible for a
5329 * stray GTT write to corrupt our defaults). Unmap the vma
5330 * from the GTT to prevent such accidents and reclaim the
5331 * space.
5332 */
5333 err = i915_vma_unbind(state);
5334 if (err)
5335 goto err_active;
5336
5337 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5338 if (err)
5339 goto err_active;
5340
5341 engine->default_state = i915_gem_object_get(state->obj);
5342 }
5343
5344 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5345 unsigned int found = intel_engines_has_context_isolation(i915);
5346
5347 /*
5348 * Make sure that classes with multiple engine instances all
5349 * share the same basic configuration.
5350 */
5351 for_each_engine(engine, i915, id) {
5352 unsigned int bit = BIT(engine->uabi_class);
5353 unsigned int expected = engine->default_state ? bit : 0;
5354
5355 if ((found & bit) != expected) {
5356 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5357 engine->uabi_class, engine->name);
5358 }
5359 }
5360 }
5361
5362out_ctx:
5363 i915_gem_context_set_closed(ctx);
5364 i915_gem_context_put(ctx);
5365 return err;
5366
5367err_active:
5368 /*
5369 * If we have to abandon now, we expect the engines to be idle
5370 * and ready to be torn-down. First try to flush any remaining
5371 * request, ensure we are pointing at the kernel context and
5372 * then remove it.
5373 */
5374 if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5375 goto out_ctx;
5376
5377 if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
5378 goto out_ctx;
5379
5380 i915_gem_contexts_lost(i915);
5381 goto out_ctx;
5382}
5383
5384int i915_gem_init(struct drm_i915_private *dev_priv)
5385{
5386 int ret;
5387
5388 /*
5389 * We need to fall back to 4K pages since GVT GTT handling doesn't
5390 * support huge page entries - we will need to check whether the
5391 * hypervisor mm can support huge guest pages, or else emulate them in GVT.
5392 */
5393 if (intel_vgpu_active(dev_priv))
5394 mkwrite_device_info(dev_priv)->page_sizes =
5395 I915_GTT_PAGE_SIZE_4K;
5396
5397 dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5398
5399 if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5400 dev_priv->gt.resume = intel_lr_context_resume;
5401 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5402 } else {
5403 dev_priv->gt.resume = intel_legacy_submission_resume;
5404 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5405 }
5406
5407 ret = i915_gem_init_userptr(dev_priv);
5408 if (ret)
5409 return ret;
5410
5411 ret = intel_wopcm_init(&dev_priv->wopcm);
5412 if (ret)
5413 return ret;
5414
5415 ret = intel_uc_init_misc(dev_priv);
5416 if (ret)
5417 return ret;
5418
5419 /* This is just a security blanket to placate dragons.
5420 * On some systems, we very sporadically observe that the first TLBs
5421 * used by the CS may be stale, despite us poking the TLB reset. If
5422 * we hold the forcewake during initialisation these problems
5423 * just magically go away.
5424 */
5425 mutex_lock(&dev_priv->drm.struct_mutex);
5426 intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5427
5428 ret = i915_gem_init_ggtt(dev_priv);
5429 if (ret) {
5430 GEM_BUG_ON(ret == -EIO);
5431 goto err_unlock;
5432 }
5433
5434 ret = i915_gem_contexts_init(dev_priv);
5435 if (ret) {
5436 GEM_BUG_ON(ret == -EIO);
5437 goto err_ggtt;
5438 }
5439
5440 ret = intel_engines_init(dev_priv);
5441 if (ret) {
5442 GEM_BUG_ON(ret == -EIO);
5443 goto err_context;
5444 }
5445
5446 intel_init_gt_powersave(dev_priv);
5447
5448 ret = intel_uc_init(dev_priv);
5449 if (ret)
5450 goto err_pm;
5451
5452 ret = i915_gem_init_hw(dev_priv);
5453 if (ret)
5454 goto err_uc_init;
5455
5456 /*
5457 * Despite its name, intel_init_clock_gating applies display
5458 * clock gating workarounds, GT mmio workarounds and the occasional
5459 * GT power context workaround. Worse, sometimes it includes a context
5460 * register workaround which we need to apply before we record the
5461 * default HW state for all contexts.
5462 *
5463 * FIXME: break up the workarounds and apply them at the right time!
5464 */
5465 intel_init_clock_gating(dev_priv);
5466
5467 ret = __intel_engines_record_defaults(dev_priv);
5468 if (ret)
5469 goto err_init_hw;
5470
5471 if (i915_inject_load_failure()) {
5472 ret = -ENODEV;
5473 goto err_init_hw;
5474 }
5475
5476 if (i915_inject_load_failure()) {
5477 ret = -EIO;
5478 goto err_init_hw;
5479 }
5480
5481 intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5482 mutex_unlock(&dev_priv->drm.struct_mutex);
5483
5484 return 0;
5485
5486 /*
5487 * Unwinding is complicated by that we want to handle -EIO to mean
5488 * disable GPU submission but keep KMS alive. We want to mark the
5489 * HW as irreversibly wedged, but keep enough state around that the
5490 * driver doesn't explode during runtime.
5491 */
5492err_init_hw:
5493 i915_gem_wait_for_idle(dev_priv, I915_WAIT_LOCKED);
5494 i915_gem_contexts_lost(dev_priv);
5495 intel_uc_fini_hw(dev_priv);
5496err_uc_init:
5497 intel_uc_fini(dev_priv);
5498err_pm:
5499 if (ret != -EIO) {
5500 intel_cleanup_gt_powersave(dev_priv);
5501 i915_gem_cleanup_engines(dev_priv);
5502 }
5503err_context:
5504 if (ret != -EIO)
5505 i915_gem_contexts_fini(dev_priv);
5506err_ggtt:
5507err_unlock:
5508 intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5509 mutex_unlock(&dev_priv->drm.struct_mutex);
5510
5511 intel_uc_fini_misc(dev_priv);
5512
5513 if (ret != -EIO)
5514 i915_gem_cleanup_userptr(dev_priv);
5515
5516 if (ret == -EIO) {
5517 /*
5518 * Allow engine initialisation to fail by marking the GPU as
5519 * wedged. But we only want to do this where the GPU is angry;
5520 * for any other failure, such as an allocation failure, we bail.
5521 */
5522 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5523 DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
5524 i915_gem_set_wedged(dev_priv);
5525 }
5526 ret = 0;
5527 }
5528
5529 i915_gem_drain_freed_objects(dev_priv);
5530 return ret;
5531}

void i915_gem_init_mmio(struct drm_i915_private *i915)
{
	i915_gem_sanitize(i915);
}

void
i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, dev_priv, id)
		dev_priv->gt.cleanup_engine(engine);
}

void
i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
{
	int i;

	if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
	    !IS_CHERRYVIEW(dev_priv))
		dev_priv->num_fence_regs = 32;
	else if (INTEL_GEN(dev_priv) >= 4 ||
		 IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
		 IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
		dev_priv->num_fence_regs = 16;
	else
		dev_priv->num_fence_regs = 8;

	if (intel_vgpu_active(dev_priv))
		dev_priv->num_fence_regs =
				I915_READ(vgtif_reg(avail_rs.fence_num));

	/* Initialize fence registers to zero */
	for (i = 0; i < dev_priv->num_fence_regs; i++) {
		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];

		fence->i915 = dev_priv;
		fence->id = i;
		list_add_tail(&fence->link, &dev_priv->mm.fence_list);
	}
	i915_gem_restore_fences(dev_priv);

	i915_gem_detect_bit_6_swizzle(dev_priv);
}

static void i915_gem_init__mm(struct drm_i915_private *i915)
{
	spin_lock_init(&i915->mm.object_stat_lock);
	spin_lock_init(&i915->mm.obj_lock);
	spin_lock_init(&i915->mm.free_lock);

	init_llist_head(&i915->mm.free_list);

	INIT_LIST_HEAD(&i915->mm.unbound_list);
	INIT_LIST_HEAD(&i915->mm.bound_list);
	INIT_LIST_HEAD(&i915->mm.fence_list);
	INIT_LIST_HEAD(&i915->mm.userfault_list);

	INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
}

int i915_gem_init_early(struct drm_i915_private *dev_priv)
{
	int err = -ENOMEM;

	dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
	if (!dev_priv->objects)
		goto err_out;

	dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
	if (!dev_priv->vmas)
		goto err_objects;

	dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
	if (!dev_priv->luts)
		goto err_vmas;

	dev_priv->requests = KMEM_CACHE(i915_request,
					SLAB_HWCACHE_ALIGN |
					SLAB_RECLAIM_ACCOUNT |
					SLAB_TYPESAFE_BY_RCU);
	if (!dev_priv->requests)
		goto err_luts;

	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
					    SLAB_HWCACHE_ALIGN |
					    SLAB_RECLAIM_ACCOUNT);
	if (!dev_priv->dependencies)
		goto err_requests;

	dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
	if (!dev_priv->priorities)
		goto err_dependencies;

	INIT_LIST_HEAD(&dev_priv->gt.timelines);
	INIT_LIST_HEAD(&dev_priv->gt.active_rings);
	INIT_LIST_HEAD(&dev_priv->gt.closed_vma);

	i915_gem_init__mm(dev_priv);

	INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
			  i915_gem_retire_work_handler);
	INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
			  i915_gem_idle_work_handler);
	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);

	atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);

	spin_lock_init(&dev_priv->fb_tracking.lock);

	err = i915_gemfs_init(dev_priv);
	if (err)
		DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled (%d).\n", err);

	return 0;

err_dependencies:
	kmem_cache_destroy(dev_priv->dependencies);
err_requests:
	kmem_cache_destroy(dev_priv->requests);
err_luts:
	kmem_cache_destroy(dev_priv->luts);
err_vmas:
	kmem_cache_destroy(dev_priv->vmas);
err_objects:
	kmem_cache_destroy(dev_priv->objects);
err_out:
	return err;
}
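
/*
 * Illustrative sketch, not part of the driver: SLAB_TYPESAFE_BY_RCU on the
 * request cache above means freed memory may be recycled for a *new* request
 * while an RCU reader still holds a stale pointer; the memory stays
 * type-safe, but the reader must revalidate what it found. The helper name
 * and the seqno-based check are hypothetical stand-ins for the real
 * revalidation done by the request code.
 */
#if 0
static bool sketch_request_still_ours(struct i915_request *rq, u32 seqno)
{
	bool ours;

	rcu_read_lock();
	/* The memory cannot be freed under us, but it may be reused. */
	ours = READ_ONCE(rq->fence.seqno) == seqno;
	rcu_read_unlock();

	return ours;
}
#endif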

void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
{
	i915_gem_drain_freed_objects(dev_priv);
	GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
	GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
	WARN_ON(dev_priv->mm.object_count);
	WARN_ON(!list_empty(&dev_priv->gt.timelines));

	kmem_cache_destroy(dev_priv->priorities);
	kmem_cache_destroy(dev_priv->dependencies);
	kmem_cache_destroy(dev_priv->requests);
	kmem_cache_destroy(dev_priv->luts);
	kmem_cache_destroy(dev_priv->vmas);
	kmem_cache_destroy(dev_priv->objects);

	/* And ensure that our SLAB_TYPESAFE_BY_RCU slabs are truly destroyed */
	rcu_barrier();

	i915_gemfs_fini(dev_priv);
}

int i915_gem_freeze(struct drm_i915_private *dev_priv)
{
	/* Discard all purgeable objects, let userspace recover those as
	 * required after resuming.
	 */
	i915_gem_shrink_all(dev_priv);

	return 0;
}

int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
{
	struct drm_i915_gem_object *obj;
	struct list_head *phases[] = {
		&dev_priv->mm.unbound_list,
		&dev_priv->mm.bound_list,
		NULL
	}, **p;

	/* Called just before we write the hibernation image.
	 *
	 * We need to update the domain tracking to reflect that the CPU
	 * will be accessing all the pages to create and restore from the
	 * hibernation, and so upon restoration those pages will be in the
	 * CPU domain.
	 *
	 * To make sure the hibernation image contains the latest state,
	 * we update that state just before writing out the image.
	 *
	 * To try and reduce the hibernation image, we manually shrink
	 * the objects as well, see i915_gem_freeze()
	 */

	i915_gem_shrink(dev_priv, -1UL, NULL, I915_SHRINK_UNBOUND);
	i915_gem_drain_freed_objects(dev_priv);

	spin_lock(&dev_priv->mm.obj_lock);
	for (p = phases; *p; p++) {
		list_for_each_entry(obj, *p, mm.link)
			__start_cpu_write(obj);
	}
	spin_unlock(&dev_priv->mm.obj_lock);

	return 0;
}

void i915_gem_release(struct drm_device *dev, struct drm_file *file)
{
	struct drm_i915_file_private *file_priv = file->driver_priv;
	struct i915_request *request;

	/* Clean up our request list when the client is going away, so that
	 * later retire_requests won't dereference our soon-to-be-gone
	 * file_priv.
	 */
	spin_lock(&file_priv->mm.lock);
	list_for_each_entry(request, &file_priv->mm.request_list, client_link)
		request->file_priv = NULL;
	spin_unlock(&file_priv->mm.lock);
}

int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
{
	struct drm_i915_file_private *file_priv;
	int ret;

	DRM_DEBUG("\n");

	file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
	if (!file_priv)
		return -ENOMEM;

	file->driver_priv = file_priv;
	file_priv->dev_priv = i915;
	file_priv->file = file;

	spin_lock_init(&file_priv->mm.lock);
	INIT_LIST_HEAD(&file_priv->mm.request_list);

	file_priv->bsd_engine = -1;
	file_priv->hang_timestamp = jiffies;

	ret = i915_gem_context_open(i915, file);
	if (ret)
		kfree(file_priv);

	return ret;
}

/**
 * i915_gem_track_fb - update frontbuffer tracking
 * @old: current GEM buffer for the frontbuffer slots
 * @new: new GEM buffer for the frontbuffer slots
 * @frontbuffer_bits: bitmask of frontbuffer slots
 *
 * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
 * from @old and setting them in @new. Both @old and @new can be NULL.
 */
void i915_gem_track_fb(struct drm_i915_gem_object *old,
		       struct drm_i915_gem_object *new,
		       unsigned frontbuffer_bits)
{
	/* Control of individual bits within the mask is guarded by
	 * the owning plane->mutex, i.e. we can never see concurrent
	 * manipulation of individual bits. But since the bitfield as a whole
	 * is updated using RMW, we need to use atomics in order to update
	 * the bits.
	 */
	BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
		     sizeof(atomic_t) * BITS_PER_BYTE);

	if (old) {
		WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
		atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
	}

	if (new) {
		WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
		atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
	}
}
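
/*
 * Illustrative usage sketch, not a real call site: on a plane update the
 * frontbuffer bits migrate from the old backing object to the new one, and
 * on plane disable they are simply cleared (@new == NULL). The helper name
 * and @plane_bits parameter are hypothetical.
 */
#if 0
static void sketch_plane_update_fb(struct drm_i915_gem_object *obj_old,
				   struct drm_i915_gem_object *obj_new,
				   unsigned int plane_bits)
{
	/* Clears @plane_bits on @obj_old, sets them on @obj_new. */
	i915_gem_track_fb(obj_old, obj_new, plane_bits);

	/* Disabling the plane later would drop the bits entirely. */
	i915_gem_track_fb(obj_new, NULL, plane_bits);
}
#endif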

/* Allocate a new GEM object and fill it with the supplied data */
struct drm_i915_gem_object *
i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
				 const void *data, size_t size)
{
	struct drm_i915_gem_object *obj;
	struct file *file;
	size_t offset;
	int err;

	obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
	if (IS_ERR(obj))
		return obj;

	GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);

	file = obj->base.filp;
	offset = 0;
	do {
		unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
		struct page *page;
		void *pgdata, *vaddr;

		err = pagecache_write_begin(file, file->f_mapping,
					    offset, len, 0,
					    &page, &pgdata);
		if (err < 0)
			goto fail;

		vaddr = kmap(page);
		memcpy(vaddr, data, len);
		kunmap(page);

		err = pagecache_write_end(file, file->f_mapping,
					  offset, len, len,
					  page, pgdata);
		if (err < 0)
			goto fail;

		size -= len;
		data += len;
		offset += len;
	} while (size);

	return obj;

fail:
	i915_gem_object_put(obj);
	return ERR_PTR(err);
}
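
/*
 * Illustrative usage sketch, not a real call site: a typical consumer copies
 * a firmware blob into a fresh object before handing it to the GPU. The
 * helper name is hypothetical; struct firmware comes from <linux/firmware.h>.
 */
#if 0
static struct drm_i915_gem_object *
sketch_object_from_firmware(struct drm_i915_private *i915,
			    const struct firmware *fw)
{
	/* Returns a new object with the blob copied in, or an ERR_PTR(). */
	return i915_gem_object_create_from_data(i915, fw->data, fw->size);
}
#endif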

struct scatterlist *
i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
		       unsigned int n,
		       unsigned int *offset)
{
	struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
	struct scatterlist *sg;
	unsigned int idx, count;

	might_sleep();
	GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));

	/* As we iterate forward through the sg, we record each entry in a
	 * radixtree for quick repeated (backwards) lookups. If we have seen
	 * this index previously, we will have an entry for it.
	 *
	 * Initial lookup is O(N), but this is amortized to O(1) for
	 * sequential page access (where each new request is consecutive
	 * to the previous one). Repeated lookups are O(lg(obj->base.size)),
	 * i.e. O(1) with a large constant!
	 */
	if (n < READ_ONCE(iter->sg_idx))
		goto lookup;

	mutex_lock(&iter->lock);

	/* We prefer to reuse the last sg so that repeated lookups of this
	 * (or the subsequent) sg are fast - comparing against the last
	 * sg is faster than going through the radixtree.
	 */

	sg = iter->sg_pos;
	idx = iter->sg_idx;
	count = __sg_page_count(sg);

	while (idx + count <= n) {
		unsigned long exception, i;
		int ret;

		/* If we cannot allocate and insert this entry, or the
		 * individual pages from this range, cancel updating the
		 * sg_idx so that on this lookup we are forced to linearly
		 * scan onwards, but on future lookups we will try the
		 * insertion again (in which case we need to be careful of
		 * the error return reporting that we have already inserted
		 * this index).
		 */
		ret = radix_tree_insert(&iter->radix, idx, sg);
		if (ret && ret != -EEXIST)
			goto scan;

		exception =
			RADIX_TREE_EXCEPTIONAL_ENTRY |
			idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
		for (i = 1; i < count; i++) {
			ret = radix_tree_insert(&iter->radix, idx + i,
						(void *)exception);
			if (ret && ret != -EEXIST)
				goto scan;
		}

		idx += count;
		sg = ____sg_next(sg);
		count = __sg_page_count(sg);
	}

scan:
	iter->sg_pos = sg;
	iter->sg_idx = idx;

	mutex_unlock(&iter->lock);

	if (unlikely(n < idx)) /* insertion completed by another thread */
		goto lookup;

	/* In case we failed to insert the entry into the radixtree, we need
	 * to look beyond the current sg.
	 */
	while (idx + count <= n) {
		idx += count;
		sg = ____sg_next(sg);
		count = __sg_page_count(sg);
	}

	*offset = n - idx;
	return sg;

lookup:
	rcu_read_lock();

	sg = radix_tree_lookup(&iter->radix, n);
	GEM_BUG_ON(!sg);

	/* If this index is in the middle of a multi-page sg entry,
	 * the radixtree will contain an exceptional entry that points
	 * to the start of that range. We will return the pointer to
	 * the base page and the offset of this page within the
	 * sg entry's range.
	 */
	*offset = 0;
	if (unlikely(radix_tree_exception(sg))) {
		unsigned long base =
			(unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;

		sg = radix_tree_lookup(&iter->radix, base);
		GEM_BUG_ON(!sg);

		*offset = n - base;
	}

	rcu_read_unlock();

	return sg;
}
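
/*
 * Illustrative sketch, not part of the driver: a forward walk over every
 * page of a pinned object is the access pattern the amortized-O(1) comment
 * above describes - each successive lookup advances the cached sg position
 * by exactly one page. The helper name is hypothetical.
 */
#if 0
static void sketch_touch_all_pages(struct drm_i915_gem_object *obj)
{
	unsigned int n, npages = obj->base.size >> PAGE_SHIFT;

	for (n = 0; n < npages; n++) {
		struct page *page = i915_gem_object_get_page(obj, n);
		void *vaddr = kmap(page);

		/* ... inspect or fill the page contents here ... */

		kunmap(page);
	}
}
#endif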

struct page *
i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
{
	struct scatterlist *sg;
	unsigned int offset;

	GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));

	sg = i915_gem_object_get_sg(obj, n, &offset);
	return nth_page(sg_page(sg), offset);
}

/* Like i915_gem_object_get_page(), but mark the returned page dirty */
struct page *
i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
			       unsigned int n)
{
	struct page *page;

	page = i915_gem_object_get_page(obj, n);
	if (!obj->mm.dirty)
		set_page_dirty(page);

	return page;
}

dma_addr_t
i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
				unsigned long n)
{
	struct scatterlist *sg;
	unsigned int offset;

	sg = i915_gem_object_get_sg(obj, n, &offset);
	return sg_dma_address(sg) + (offset << PAGE_SHIFT);
}

int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
{
	struct sg_table *pages;
	int err;

	if (align > obj->base.size)
		return -EINVAL;

	if (obj->ops == &i915_gem_phys_ops)
		return 0;

	if (obj->ops != &i915_gem_object_ops)
		return -EINVAL;

	err = i915_gem_object_unbind(obj);
	if (err)
		return err;

	mutex_lock(&obj->mm.lock);

	if (obj->mm.madv != I915_MADV_WILLNEED) {
		err = -EFAULT;
		goto err_unlock;
	}

	if (obj->mm.quirked) {
		err = -EFAULT;
		goto err_unlock;
	}

	if (obj->mm.mapping) {
		err = -EBUSY;
		goto err_unlock;
	}

	pages = fetch_and_zero(&obj->mm.pages);
	if (pages) {
		struct drm_i915_private *i915 = to_i915(obj->base.dev);

		__i915_gem_object_reset_page_iter(obj);

		spin_lock(&i915->mm.obj_lock);
		list_del(&obj->mm.link);
		spin_unlock(&i915->mm.obj_lock);
	}

	obj->ops = &i915_gem_phys_ops;

	err = ____i915_gem_object_get_pages(obj);
	if (err)
		goto err_xfer;

	/* Perma-pin (until release) the physical set of pages */
	__i915_gem_object_pin_pages(obj);

	if (!IS_ERR_OR_NULL(pages))
		i915_gem_object_ops.put_pages(obj, pages);
	mutex_unlock(&obj->mm.lock);
	return 0;

err_xfer:
	obj->ops = &i915_gem_object_ops;
	obj->mm.pages = pages;
err_unlock:
	mutex_unlock(&obj->mm.lock);
	return err;
}
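
/*
 * Illustrative usage sketch, not a real call site: attaching a physically
 * contiguous backing store is the sort of thing legacy cursor planes need
 * on old hardware. The helper name and the PAGE_SIZE alignment are
 * illustrative values, not taken from a real caller.
 */
#if 0
static int sketch_make_cursor_phys(struct drm_i915_gem_object *obj)
{
	/* Swap the object's shmem pages for contiguous "phys" pages. */
	return i915_gem_object_attach_phys(obj, PAGE_SIZE);
}
#endif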

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/scatterlist.c"
#include "selftests/mock_gem_device.c"
#include "selftests/huge_gem_object.c"
#include "selftests/huge_pages.c"
#include "selftests/i915_gem_object.c"
#include "selftests/i915_gem_coherency.c"
#endif