Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/memory.h>
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24#include <linux/tboot.h>
25#include <uapi/linux/iommufd.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "../iommu-sva.h"
31#include "pasid.h"
32#include "cap_audit.h"
33#include "perfmon.h"
34
/* Root and context tables each occupy one VT-d page. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/* PCI class/ID tests used by the device quirks below. */
#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

/* x86 interrupt-message address range (0xFEEx_xxxx). */
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

/* Default guest address width, in bits. */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH	57

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
62
static void __init check_tylersburg_isoch(void);
/* Set by the Tylersburg quirk: force write-buffer flushing even when
 * the capability register says it is not required (see
 * iommu_flush_write_buffer()). */
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/* Set by intel_iommu=tboot_noforce. */
static int intel_iommu_tboot_noforce;
/* Set when the user explicitly booted with intel_iommu=off. */
static int no_platform_optin;

/* Number of root entries in one root-table page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
75
76/*
77 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
78 * if marked present.
79 */
80static phys_addr_t root_entry_lctp(struct root_entry *re)
81{
82 if (!(re->lo & 1))
83 return 0;
84
85 return re->lo & VTD_PAGE_MASK;
86}
87
88/*
89 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
90 * if marked present.
91 */
92static phys_addr_t root_entry_uctp(struct root_entry *re)
93{
94 if (!(re->hi & 1))
95 return 0;
96
97 return re->hi & VTD_PAGE_MASK;
98}
99
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
/* Use hardware pass-through for identity mappings when available. */
static int hw_pass_through = 1;
108
/* One Reserved Memory Region Reporting (RMRR) structure from the ACPI
 * DMAR table: a memory range that must stay mapped for its devices. */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
117
/* One Root Port ATS Capability Reporting (ATSR) structure from the
 * ACPI DMAR table. */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
125
/* One SoC Integrated Address Translation Cache (SATC) structure from
 * the ACPI DMAR table. */
struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};
134
/* Global lists of the ACPI DMAR sub-structures parsed above. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate over every parsed RMRR unit. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

static void intel_iommu_domain_free(struct iommu_domain *domain);

/* DMA remapping on/off, and scalable-mode on/off; both default from
 * Kconfig and may be overridden on the command line (see
 * intel_iommu_setup()). */
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

/* Command-line tunables: see intel_iommu_setup(). */
static int dmar_map_gfx = 1;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

const struct iommu_ops intel_iommu_ops;
static const struct iommu_dirty_ops intel_dirty_ops;
161static bool translation_pre_enabled(struct intel_iommu *iommu)
162{
163 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
164}
165
/* Forget the "translation was pre-enabled" state for @iommu. */
static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}
170
171static void init_translation_status(struct intel_iommu *iommu)
172{
173 u32 gsts;
174
175 gsts = readl(iommu->reg + DMAR_GSTS_REG);
176 if (gsts & DMA_GSTS_TES)
177 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
178}
179
/*
 * Parse the "intel_iommu=" kernel command-line option.  Options are
 * comma-separated and matched by prefix; unknown options are reported
 * but otherwise ignored.  Always returns 1 so the parameter is
 * consumed.
 */
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;

	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			/* Remember the user explicitly opted out. */
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
			iommu_dma_forcedac = true;
		} else if (!strncmp(str, "strict", 6)) {
			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
			iommu_set_dma_strict();
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Enable scalable mode if hardware supports\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "sm_off", 6)) {
			pr_info("Scalable mode is disallowed\n");
			intel_iommu_sm = 0;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		} else {
			pr_notice("Unknown option - '%s'\n", str);
		}

		/* Advance past the current option and any commas. */
		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}

	return 1;
}
__setup("intel_iommu=", intel_iommu_setup);
226
227void *alloc_pgtable_page(int node, gfp_t gfp)
228{
229 struct page *page;
230 void *vaddr = NULL;
231
232 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
233 if (page)
234 vaddr = page_address(page);
235 return vaddr;
236}
237
/* Free a page obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
242
/* Is @domain the static identity-mapping domain? */
static int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
}
247
248static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
249{
250 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
251
252 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
253}
254
255/*
256 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
257 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
258 * the returned SAGAW.
259 */
260static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
261{
262 unsigned long fl_sagaw, sl_sagaw;
263
264 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
265 sl_sagaw = cap_sagaw(iommu->cap);
266
267 /* Second level only. */
268 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
269 return sl_sagaw;
270
271 /* First level only. */
272 if (!ecap_slts(iommu->ecap))
273 return fl_sagaw;
274
275 return fl_sagaw & sl_sagaw;
276}
277
/*
 * Find the highest supported AGAW that is no wider than @max_gaw.
 * Returns -1 when none of the supported widths fits.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &sagaw))
		agaw--;

	return agaw;
}
291
/*
 * Calculate max SAGAW for each iommu.  Returns the widest AGAW the
 * hardware supports, independent of the default domain width.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}
299
/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, use a default agaw, and
 * get a supported less agaw for iommus that don't support the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
309
310static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
311{
312 return sm_supported(iommu) ?
313 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
314}
315
/*
 * Recompute domain->iommu_coherency: true only if every IOMMU attached
 * to the domain keeps its page-table walks coherent with the CPU cache.
 * If no IOMMU is attached yet, fall back to the lowest common
 * denominator over all active IOMMUs in the system.
 */
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct iommu_domain_info *info;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	unsigned long i;

	domain->iommu_coherency = true;
	/* Check the IOMMUs already attached to this domain. */
	xa_for_each(&domain->iommu_array, i, info) {
		found = true;
		if (!iommu_paging_structure_coherency(info->iommu)) {
			domain->iommu_coherency = false;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!iommu_paging_structure_coherency(iommu)) {
			domain->iommu_coherency = false;
			break;
		}
	}
	rcu_read_unlock();
}
345
346static int domain_update_iommu_superpage(struct dmar_domain *domain,
347 struct intel_iommu *skip)
348{
349 struct dmar_drhd_unit *drhd;
350 struct intel_iommu *iommu;
351 int mask = 0x3;
352
353 if (!intel_iommu_superpage)
354 return 0;
355
356 /* set iommu_superpage to the smallest common denominator */
357 rcu_read_lock();
358 for_each_active_iommu(iommu, drhd) {
359 if (iommu != skip) {
360 if (domain && domain->use_first_level) {
361 if (!cap_fl1gp_support(iommu->cap))
362 mask = 0x1;
363 } else {
364 mask &= cap_super_page_val(iommu->cap);
365 }
366
367 if (!mask)
368 break;
369 }
370 }
371 rcu_read_unlock();
372
373 return fls(mask);
374}
375
/*
 * Pick a NUMA node for @domain from its attached devices: the first
 * device with a known node wins.  Returns NUMA_NO_NODE when no device
 * reports one.
 */
static int domain_update_device_node(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	int nid = NUMA_NO_NODE;
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		/*
		 * There could possibly be multiple device numa nodes as devices
		 * within the same domain may sit behind different IOMMUs. There
		 * isn't perfect answer in such situation, so we select first
		 * come first served policy.
		 */
		nid = dev_to_node(info->dev);
		if (nid != NUMA_NO_NODE)
			break;
	}
	spin_unlock_irqrestore(&domain->lock, flags);

	return nid;
}
398
399static void domain_update_iotlb(struct dmar_domain *domain);
400
401/* Return the super pagesize bitmap if supported. */
402static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
403{
404 unsigned long bitmap = 0;
405
406 /*
407 * 1-level super page supports page size of 2MiB, 2-level super page
408 * supports page size of both 2MiB and 1GiB.
409 */
410 if (domain->iommu_superpage == 1)
411 bitmap |= SZ_2M;
412 else if (domain->iommu_superpage == 2)
413 bitmap |= SZ_2M | SZ_1G;
414
415 return bitmap;
416}
417
/*
 * Some capabilities may be different across iommus: recompute the
 * domain's coherency, super-page support, NUMA node and geometry after
 * the set of attached IOMMUs changes.
 */
void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);

	/*
	 * If RHSA is missing, we should default to the device numa domain
	 * as fall back.
	 */
	if (domain->nid == NUMA_NO_NODE)
		domain->nid = domain_update_device_node(domain);

	/*
	 * First-level translation restricts the input-address to a
	 * canonical address (i.e., address bits 63:N have the same
	 * value as address bit [N-1], where N is 48-bits with 4-level
	 * paging and 57-bits with 5-level paging). Hence, skip bit
	 * [N-1].
	 */
	if (domain->use_first_level)
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
	else
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);

	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
	domain_update_iotlb(domain);
}
446
/*
 * Return a pointer to the context entry for @bus/@devfn, walking (and,
 * when @alloc is set, populating) the root table.  In scalable mode a
 * root entry covers two context tables (lo: devfn 0x00-0x7f, hi:
 * 0x80-0xff) and each context entry is twice the legacy size — hence
 * the devfn adjustment below.  Returns NULL if the entry is absent and
 * @alloc is unset, or if the entry was copied from a previous kernel.
 */
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	/*
	 * Except that the caller requested to allocate a new entry,
	 * returning a copied context entry makes no sense.
	 */
	if (!alloc && context_copied(iommu, bus, devfn))
		return NULL;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;	/* scalable-mode entries are double width */
	}
	if (*entry & 1)
		/* Present: the rest of the entry is the table's phys addr. */
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;
		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;	/* install with the present bit set */
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}
487
488/**
489 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
490 * sub-hierarchy of a candidate PCI-PCI bridge
491 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
492 * @bridge: the candidate PCI-PCI bridge
493 *
494 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
495 */
496static bool
497is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
498{
499 struct pci_dev *pdev, *pbridge;
500
501 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
502 return false;
503
504 pdev = to_pci_dev(dev);
505 pbridge = to_pci_dev(bridge);
506
507 if (pbridge->subordinate &&
508 pbridge->subordinate->number <= pdev->bus->number &&
509 pbridge->subordinate->busn_res.end >= pdev->bus->number)
510 return true;
511
512 return false;
513}
514
/*
 * The IOAT QuickData device on this chipset has its own IOMMU located
 * at VTBAR + 0xa000.  Return true (and taint the kernel) when the DRHD
 * the BIOS reported for this device is some other unit, i.e. the DMAR
 * table is wrong and the device should be treated as untranslated.
 */
static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that the this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}

	return false;
}
544
545static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
546{
547 if (!iommu || iommu->drhd->ignored)
548 return true;
549
550 if (dev_is_pci(dev)) {
551 struct pci_dev *pdev = to_pci_dev(dev);
552
553 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
554 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
555 quirk_ioat_snb_local_iommu(pdev))
556 return true;
557 }
558
559 return false;
560}
561
/*
 * Find the IOMMU (DRHD unit) whose scope covers @dev, optionally
 * returning the bus/devfn to use for it.  PCI VFs are looked up via
 * their PF (VFs are not listed in scope tables) but report their own
 * BDF.  Returns NULL when no unit matches or the match is a
 * dummy/ignored unit.
 */
static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		/* PCI devices only match units on their own segment. */
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			/* Scope entries may name a bridge above @dev. */
			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		/* An INCLUDE_ALL unit catches any PCI device on its segment. */
		if (pdev && drhd->include_all) {
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}
631
632static void domain_flush_cache(struct dmar_domain *domain,
633 void *addr, int size)
634{
635 if (!domain->iommu_coherency)
636 clflush_cache_range(addr, size);
637}
638
/*
 * Free the whole context table of @iommu: every lower (and, in
 * scalable mode, upper) context-table page referenced from the root
 * table, then the root table page itself.
 */
static void free_context_table(struct intel_iommu *iommu)
{
	struct context_entry *context;
	int i;

	if (!iommu->root_entry)
		return;

	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		/* Scalable mode: devfn 0x80+ lives in a second table. */
		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}

	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
}
663
664#ifdef CONFIG_DMAR_DEBUG
/*
 * Print each PTE on the walk to @pfn, starting from @parent at @level
 * and stopping at a leaf, a superpage or a non-present entry.
 * NOTE(review): @iommu, @bus and @devfn are not referenced in this
 * body; presumably kept for symmetry with the caller's signature.
 */
static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
{
	struct dma_pte *pte;
	int offset;

	while (1) {
		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
			pr_info("PTE not present at level %d\n", level);
			break;
		}

		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);

		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}
}
688
689void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
690 unsigned long long addr, u32 pasid)
691{
692 struct pasid_dir_entry *dir, *pde;
693 struct pasid_entry *entries, *pte;
694 struct context_entry *ctx_entry;
695 struct root_entry *rt_entry;
696 int i, dir_index, index, level;
697 u8 devfn = source_id & 0xff;
698 u8 bus = source_id >> 8;
699 struct dma_pte *pgtable;
700
701 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
702
703 /* root entry dump */
704 rt_entry = &iommu->root_entry[bus];
705 if (!rt_entry) {
706 pr_info("root table entry is not present\n");
707 return;
708 }
709
710 if (sm_supported(iommu))
711 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
712 rt_entry->hi, rt_entry->lo);
713 else
714 pr_info("root entry: 0x%016llx", rt_entry->lo);
715
716 /* context entry dump */
717 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
718 if (!ctx_entry) {
719 pr_info("context table entry is not present\n");
720 return;
721 }
722
723 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
724 ctx_entry->hi, ctx_entry->lo);
725
726 /* legacy mode does not require PASID entries */
727 if (!sm_supported(iommu)) {
728 level = agaw_to_level(ctx_entry->hi & 7);
729 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
730 goto pgtable_walk;
731 }
732
733 /* get the pointer to pasid directory entry */
734 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
735 if (!dir) {
736 pr_info("pasid directory entry is not present\n");
737 return;
738 }
739 /* For request-without-pasid, get the pasid from context entry */
740 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
741 pasid = IOMMU_NO_PASID;
742
743 dir_index = pasid >> PASID_PDE_SHIFT;
744 pde = &dir[dir_index];
745 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
746
747 /* get the pointer to the pasid table entry */
748 entries = get_pasid_table_from_pde(pde);
749 if (!entries) {
750 pr_info("pasid table entry is not present\n");
751 return;
752 }
753 index = pasid & PASID_PTE_MASK;
754 pte = &entries[index];
755 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
756 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
757
758 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
759 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
760 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
761 } else {
762 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
763 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
764 }
765
766pgtable_walk:
767 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
768}
769#endif
770
/*
 * Walk (and build, allocating missing levels with @gfp) the page table
 * of @domain down to *target_level for @pfn, returning the PTE there.
 * With *target_level == 0, stop at the first superpage or non-present
 * entry and report the level actually reached through *target_level.
 * Returns NULL when @pfn exceeds the domain's address width or an
 * allocation fails.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level,
				      gfp_t gfp)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid, gfp);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain->use_first_level)
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;

			/* Install atomically; a concurrent walker may beat us. */
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
826
/*
 * Return the PTE for @pfn at exactly @level.  If the walk hits a
 * non-present entry first, return NULL and report that level through
 * *large_page; if it hits a superpage entry above the requested level,
 * return that entry directly, again reporting its level.
 */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
858
/*
 * Clear all leaf (last level) PTEs in [start_pfn, last_pfn]; the caller
 * must follow up with an IOTLB flush.  Page-table pages themselves are
 * not freed here.
 */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
	    WARN_ON(start_pfn > last_pfn))
		return;

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			/* Nothing mapped here; skip the whole missing subtree. */
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		/* Flush the contiguous run of cleared PTEs in one go. */
		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}
890
/*
 * Recursively free page-table pages below @retain_level that are fully
 * covered by [start_pfn, last_pfn], clearing the parent PTEs that
 * pointed to them.  Leaf PTEs are cleared beforehand by the caller
 * (see dma_pte_free_pagetable()).
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
929
/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	/* Clear the leaves first, then prune empty interior pages. */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
951
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
				    int level, struct dma_pte *pte,
				    struct list_head *freelist)
{
	struct page *pg;

	/* Queue the page this PTE points to; actual freeing is deferred. */
	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	list_add_tail(&pg->lru, freelist);

	if (level == 1)
		return;

	/* Recurse into any child tables referenced from this page. */
	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));
}
977
/*
 * Clear all PTEs at @level covering [start_pfn, last_pfn], queueing any
 * fully-covered child page-table pages on @freelist for deferred
 * freeing (after the IOTLB flush), and recursing into partially
 * covered subtrees.
 */
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	/* Flush the contiguous run of PTEs we cleared at this level. */
	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}
1021
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
			 unsigned long last_pfn, struct list_head *freelist)
{
	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
	    WARN_ON(start_pfn > last_pfn))
		return;

	/* we don't need lock here; nobody else touches the iova range */
	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
			    domain->pgd, 0, start_pfn, last_pfn, freelist);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		/* The whole address space was unmapped: retire the pgd too. */
		struct page *pgd_page = virt_to_page(domain->pgd);
		list_add_tail(&pgd_page->lru, freelist);
		domain->pgd = NULL;
	}
}
1043
1044/* iommu handling */
1045static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1046{
1047 struct root_entry *root;
1048
1049 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1050 if (!root) {
1051 pr_err("Allocating root entry for %s failed\n",
1052 iommu->name);
1053 return -ENOMEM;
1054 }
1055
1056 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1057 iommu->root_entry = root;
1058
1059 return 0;
1060}
1061
/*
 * Program the root-table address into hardware and issue the Set Root
 * Table Pointer command, then — unless the ESRTPS capability says
 * hardware already flushed its caches as part of SRTP — globally
 * invalidate the context cache, PASID cache (scalable mode) and IOTLB.
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;	/* scalable-mode table format */

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/*
	 * Hardware invalidates all DMA remapping hardware translation
	 * caches as part of SRTP flow.
	 */
	if (cap_esrtps(iommu->cap))
		return;

	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
1095
/*
 * Drain the hardware write buffer (DMA_GCMD_WBF) and wait for
 * completion.  No-op unless the capability register requires it or the
 * Tylersburg rwbf_quirk is active.
 */
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1113
/*
 * Issue a register-based context-cache invalidation of the requested
 * granularity (global / domain / device) and spin until hardware
 * completes it. (Historical note: this no longer returns a value.)
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	/* Build the CCMD value for the requested invalidation granularity. */
	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}
	/* ICC kicks off the invalidation; hardware clears it when done. */
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1149
/*
 * Issue a register-based IOTLB invalidation of the requested granularity
 * (global / domain-selective / page-selective) and wait for completion.
 * (Historical note: this no longer returns a value.)
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}

	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		      dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
1200
1201static struct device_domain_info *
1202domain_lookup_dev_info(struct dmar_domain *domain,
1203 struct intel_iommu *iommu, u8 bus, u8 devfn)
1204{
1205 struct device_domain_info *info;
1206 unsigned long flags;
1207
1208 spin_lock_irqsave(&domain->lock, flags);
1209 list_for_each_entry(info, &domain->devices, link) {
1210 if (info->iommu == iommu && info->bus == bus &&
1211 info->devfn == devfn) {
1212 spin_unlock_irqrestore(&domain->lock, flags);
1213 return info;
1214 }
1215 }
1216 spin_unlock_irqrestore(&domain->lock, flags);
1217
1218 return NULL;
1219}
1220
1221static void domain_update_iotlb(struct dmar_domain *domain)
1222{
1223 struct dev_pasid_info *dev_pasid;
1224 struct device_domain_info *info;
1225 bool has_iotlb_device = false;
1226 unsigned long flags;
1227
1228 spin_lock_irqsave(&domain->lock, flags);
1229 list_for_each_entry(info, &domain->devices, link) {
1230 if (info->ats_enabled) {
1231 has_iotlb_device = true;
1232 break;
1233 }
1234 }
1235
1236 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1237 info = dev_iommu_priv_get(dev_pasid->dev);
1238 if (info->ats_enabled) {
1239 has_iotlb_device = true;
1240 break;
1241 }
1242 }
1243 domain->has_iotlb_device = has_iotlb_device;
1244 spin_unlock_irqrestore(&domain->lock, flags);
1245}
1246
1247/*
1248 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1249 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1250 * check because it applies only to the built-in QAT devices and it doesn't
1251 * grant additional privileges.
1252 */
1253#define BUGGY_QAT_DEVID_MASK 0x4940
1254static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1255{
1256 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1257 return false;
1258
1259 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1260 return false;
1261
1262 return true;
1263}
1264
1265static void iommu_enable_pci_caps(struct device_domain_info *info)
1266{
1267 struct pci_dev *pdev;
1268
1269 if (!dev_is_pci(info->dev))
1270 return;
1271
1272 pdev = to_pci_dev(info->dev);
1273
1274 /* The PCIe spec, in its wisdom, declares that the behaviour of
1275 the device if you enable PASID support after ATS support is
1276 undefined. So always enable PASID support on devices which
1277 have it, even if we can't yet know if we're ever going to
1278 use it. */
1279 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1280 info->pasid_enabled = 1;
1281
1282 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1283 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1284 info->ats_enabled = 1;
1285 domain_update_iotlb(info->domain);
1286 }
1287}
1288
1289static void iommu_disable_pci_caps(struct device_domain_info *info)
1290{
1291 struct pci_dev *pdev;
1292
1293 if (!dev_is_pci(info->dev))
1294 return;
1295
1296 pdev = to_pci_dev(info->dev);
1297
1298 if (info->ats_enabled) {
1299 pci_disable_ats(pdev);
1300 info->ats_enabled = 0;
1301 domain_update_iotlb(info->domain);
1302 }
1303
1304 if (info->pasid_enabled) {
1305 pci_disable_pasid(pdev);
1306 info->pasid_enabled = 0;
1307 }
1308}
1309
1310static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1311 u64 addr, unsigned int mask)
1312{
1313 u16 sid, qdep;
1314
1315 if (!info || !info->ats_enabled)
1316 return;
1317
1318 sid = info->bus << 8 | info->devfn;
1319 qdep = info->ats_qdep;
1320 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1321 qdep, addr, mask);
1322 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1323}
1324
/* Flush the device TLBs of every ATS-enabled device attached to @domain. */
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	struct dev_pasid_info *dev_pasid;
	struct device_domain_info *info;
	unsigned long flags;

	/* Cheap exit when no attached device has ATS enabled. */
	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&domain->lock, flags);
	/* Devices attached through their RID. */
	list_for_each_entry(info, &domain->devices, link)
		__iommu_flush_dev_iotlb(info, addr, mask);

	/* Devices attached through an individual PASID. */
	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
		info = dev_iommu_priv_get(dev_pasid->dev);

		if (!info->ats_enabled)
			continue;

		qi_flush_dev_iotlb_pasid(info->iommu,
					 PCI_DEVID(info->bus, info->devfn),
					 info->pfsid, dev_pasid->pasid,
					 info->ats_qdep, addr,
					 mask);
	}
	spin_unlock_irqrestore(&domain->lock, flags);
}
1353
/*
 * PASID-based IOTLB invalidation for first-level translation: one flush
 * per attached PASID, plus one with the reserved NO_PASID value when any
 * device is attached through its RID.
 */
static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
				     struct dmar_domain *domain, u64 addr,
				     unsigned long npages, bool ih)
{
	u16 did = domain_id_iommu(domain, iommu);
	struct dev_pasid_info *dev_pasid;
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);
	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);

	if (!list_empty(&domain->devices))
		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
	spin_unlock_irqrestore(&domain->lock, flags);
}
1370
/*
 * Page-selective-within-domain IOTLB invalidation for the range
 * [@pfn, @pfn + @pages). Falls back to a domain-selective flush when
 * page-selective invalidation cannot describe the range.
 * @ih: invalidation hint — only leaf entries were changed.
 * @map: non-zero when flushing for a non-present to present update.
 */
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int aligned_pages = __roundup_pow_of_two(pages);
	unsigned int mask = ilog2(aligned_pages);
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain_id_iommu(domain, iommu);

	if (WARN_ON(!pages))
		return;

	/* The IH hint is carried in bit 6 of the invalidation address. */
	if (ih)
		ih = 1 << 6;

	if (domain->use_first_level) {
		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
	} else {
		unsigned long bitmask = aligned_pages - 1;

		/*
		 * PSI masks the low order bits of the base address. If the
		 * address isn't aligned to the mask, then compute a mask value
		 * needed to ensure the target range is flushed.
		 */
		if (unlikely(bitmask & pfn)) {
			unsigned long end_pfn = pfn + pages - 1, shared_bits;

			/*
			 * Since end_pfn <= pfn + bitmask, the only way bits
			 * higher than bitmask can differ in pfn and end_pfn is
			 * by carrying. This means after masking out bitmask,
			 * high bits starting with the first set bit in
			 * shared_bits are all equal in both pfn and end_pfn.
			 */
			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
		}

		/*
		 * Fallback to domain selective flush if no PSI support or
		 * the size is too big.
		 */
		if (!cap_pgsel_inv(iommu->cap) ||
		    mask > cap_max_amask_val(iommu->cap))
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		else
			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						 DMA_TLB_PSI_FLUSH);
	}

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}
1431
1432/* Notification for newly created mappings */
1433static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1434 unsigned long pfn, unsigned int pages)
1435{
1436 /*
1437 * It's a non-present to present mapping. Only flush if caching mode
1438 * and second level.
1439 */
1440 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1441 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1442 else
1443 iommu_flush_write_buffer(iommu);
1444}
1445
/* Flush all cached translations for @domain on every IOMMU it spans. */
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct iommu_domain_info *info;
	unsigned long idx;

	xa_for_each(&dmar_domain->iommu_array, idx, info) {
		struct intel_iommu *iommu = info->iommu;
		u16 did = domain_id_iommu(dmar_domain, iommu);

		/* First-level flushes are PASID-based; second-level by domain. */
		if (dmar_domain->use_first_level)
			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
		else
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);

		/*
		 * NOTE(review): the device TLB flush is skipped in caching
		 * mode (virtualized IOMMU) — presumably handled elsewhere
		 * in that configuration; confirm before relying on it.
		 */
		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
	}
}
1466
/* Disable the protected low/high memory regions, if implemented. */
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	/* Neither protected region capability present — nothing to disable. */
	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1486
/* Turn on DMA remapping by setting the translation-enable bit. */
static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	/* Update the cached global command value, then write it out. */
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1502
/* Turn off DMA remapping, unless a graphics quirk requires leaving it on. */
static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	/*
	 * Keep translation enabled on IOMMUs dedicated to graphics when
	 * the quirk is set and read/write drain capability is present.
	 */
	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1522
/*
 * Allocate and initialise @iommu's domain-ID bitmap, reserving the IDs
 * that must never be handed out to a real domain.
 */
static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
	if (!iommu->domain_ids)
		return -ENOMEM;

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}
1557
/* Quiesce @iommu (disable translation) ahead of teardown. */
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	/* Never initialised — nothing to disable. */
	if (!iommu->domain_ids)
		return;

	/*
	 * All iommu domains must have been detached from the devices,
	 * hence there should be no domain IDs in use.
	 */
	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
		    > NUM_RESERVED_DID))
		return;

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}
1574
/* Release the per-IOMMU allocations made during initialisation. */
static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if (iommu->domain_ids) {
		bitmap_free(iommu->domain_ids);
		iommu->domain_ids = NULL;
	}

	/* Bitmap of context entries inherited from a kdump kernel. */
	if (iommu->copied_tables) {
		bitmap_free(iommu->copied_tables);
		iommu->copied_tables = NULL;
	}

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* Tear down the page-request queue used for SVM page faults. */
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);
	}
#endif
}
1597
1598/*
1599 * Check and return whether first level is used by default for
1600 * DMA translation.
1601 */
1602static bool first_level_by_default(unsigned int type)
1603{
1604 /* Only SL is available in legacy mode */
1605 if (!scalable_mode_support())
1606 return false;
1607
1608 /* Only level (either FL or SL) is available, just use it */
1609 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1610 return intel_cap_flts_sanity();
1611
1612 /* Both levels are available, decide it based on domain type */
1613 return type != IOMMU_DOMAIN_UNMANAGED;
1614}
1615
1616static struct dmar_domain *alloc_domain(unsigned int type)
1617{
1618 struct dmar_domain *domain;
1619
1620 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1621 if (!domain)
1622 return NULL;
1623
1624 domain->nid = NUMA_NO_NODE;
1625 if (first_level_by_default(type))
1626 domain->use_first_level = true;
1627 domain->has_iotlb_device = false;
1628 INIT_LIST_HEAD(&domain->devices);
1629 INIT_LIST_HEAD(&domain->dev_pasids);
1630 spin_lock_init(&domain->lock);
1631 xa_init(&domain->iommu_array);
1632
1633 return domain;
1634}
1635
/*
 * Attach @domain to @iommu: take a reference if already attached,
 * otherwise allocate a domain ID on this IOMMU and record the binding
 * in the domain's iommu_array. Returns 0 or a negative errno.
 */
int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
	struct iommu_domain_info *info, *curr;
	unsigned long ndomains;
	int num, ret = -ENOSPC;

	/* Allocated up front; may not be needed if already attached. */
	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	spin_lock(&iommu->lock);
	curr = xa_load(&domain->iommu_array, iommu->seq_id);
	if (curr) {
		/* Already attached to this IOMMU; just take a reference. */
		curr->refcnt++;
		spin_unlock(&iommu->lock);
		kfree(info);
		return 0;
	}

	/* Claim a free domain ID on this IOMMU. */
	ndomains = cap_ndoms(iommu->cap);
	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		pr_err("%s: No free domain ids\n", iommu->name);
		goto err_unlock;
	}

	set_bit(num, iommu->domain_ids);
	info->refcnt = 1;
	info->did = num;
	info->iommu = iommu;
	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
			  NULL, info, GFP_ATOMIC);
	if (curr) {
		/* Slot unexpectedly occupied (or xarray error): roll back. */
		ret = xa_err(curr) ? : -EBUSY;
		goto err_clear;
	}
	domain_update_iommu_cap(domain);

	spin_unlock(&iommu->lock);
	return 0;

err_clear:
	clear_bit(info->did, iommu->domain_ids);
err_unlock:
	spin_unlock(&iommu->lock);
	kfree(info);
	return ret;
}
1684
/* Drop @domain's reference on @iommu; free the domain ID on last put. */
void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
	struct iommu_domain_info *info;

	spin_lock(&iommu->lock);
	/* NOTE(review): assumes a prior successful attach — info is not NULL-checked. */
	info = xa_load(&domain->iommu_array, iommu->seq_id);
	if (--info->refcnt == 0) {
		clear_bit(info->did, iommu->domain_ids);
		xa_erase(&domain->iommu_array, iommu->seq_id);
		domain->nid = NUMA_NO_NODE;
		domain_update_iommu_cap(domain);
		kfree(info);
	}
	spin_unlock(&iommu->lock);
}
1700
/*
 * Round a guest address width up to the nearest adjusted width the
 * page-table layout supports: (agaw - 12) must be a multiple of 9,
 * capped at 64 bits.
 */
static int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1714
/* Free @domain's page tables and, if no devices remain, the domain itself. */
static void domain_exit(struct dmar_domain *domain)
{
	if (domain->pgd) {
		LIST_HEAD(freelist);

		/* Unmap the entire address space and collect the page tables. */
		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
		put_pages_list(&freelist);
	}

	/* Leak rather than free a domain that still has devices attached. */
	if (WARN_ON(!list_empty(&domain->devices)))
		return;

	kfree(domain);
}
1729
1730/*
1731 * Get the PASID directory size for scalable mode context entry.
1732 * Value of X in the PDTS field of a scalable mode context entry
1733 * indicates PASID directory with 2^(X + 7) entries.
1734 */
1735static unsigned long context_get_sm_pds(struct pasid_table *table)
1736{
1737 unsigned long pds, max_pde;
1738
1739 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1740 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1741 if (pds < 7)
1742 return 0;
1743
1744 return pds - 7;
1745}
1746
/*
 * Program the context entry for (@bus, @devfn) on @iommu to point at
 * @domain's translation structures — the PASID table in scalable mode,
 * or the second-level page table (or pass-through) in legacy mode.
 * Returns 0 on success or a negative errno.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	struct device_domain_info *info =
			domain_lookup_dev_info(domain, iommu, bus, devfn);
	u16 did = domain_id_iommu(domain, iommu);
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct context_entry *context;
	int ret;

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	spin_lock(&iommu->lock);
	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	/* A present entry not inherited from a kdump kernel is left alone. */
	ret = 0;
	if (context_present(context) && !context_copied(iommu, bus, devfn))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(iommu, bus, devfn)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}

		clear_context_copied(iommu, bus, devfn);
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, IOMMU_NO_PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
		if (info && info->pasid_supported)
			context_set_pasid(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware. And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}

		context_set_translation_type(context, translation);
	}

	context_set_fault_enable(context);
	context_set_present(context);
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If the
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);

	return ret;
}
1887
/* Carries the mapping target through pci_for_each_dma_alias(). */
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct pasid_table *table;
};
1893
1894static int domain_context_mapping_cb(struct pci_dev *pdev,
1895 u16 alias, void *opaque)
1896{
1897 struct domain_context_mapping_data *data = opaque;
1898
1899 return domain_context_mapping_one(data->domain, data->iommu,
1900 data->table, PCI_BUS_NUM(alias),
1901 alias & 0xff);
1902}
1903
1904static int
1905domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1906{
1907 struct device_domain_info *info = dev_iommu_priv_get(dev);
1908 struct domain_context_mapping_data data;
1909 struct intel_iommu *iommu = info->iommu;
1910 u8 bus = info->bus, devfn = info->devfn;
1911 struct pasid_table *table;
1912
1913 table = intel_pasid_get_table(dev);
1914
1915 if (!dev_is_pci(dev))
1916 return domain_context_mapping_one(domain, iommu, table,
1917 bus, devfn);
1918
1919 data.domain = domain;
1920 data.iommu = iommu;
1921 data.table = table;
1922
1923 return pci_for_each_dma_alias(to_pci_dev(dev),
1924 &domain_context_mapping_cb, &data);
1925}
1926
1927/* Returns a number of VTD pages, but aligned to MM page size */
1928static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1929{
1930 host_addr &= ~PAGE_MASK;
1931 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1932}
1933
/* Return largest possible superpage level for a given mapping */
static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
				   unsigned long phy_pfn, unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	/* Number of superpage levels available to this domain. */
	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	/* Climb one level per 9-bit stride while alignment and size allow. */
	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
1959
1960/*
1961 * Ensure that old small page tables are removed to make room for superpage(s).
1962 * We're going to add new large pages, so make sure we don't remove their parent
1963 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1964 */
1965static void switch_to_super_page(struct dmar_domain *domain,
1966 unsigned long start_pfn,
1967 unsigned long end_pfn, int level)
1968{
1969 unsigned long lvl_pages = lvl_to_nr_pages(level);
1970 struct iommu_domain_info *info;
1971 struct dma_pte *pte = NULL;
1972 unsigned long i;
1973
1974 while (start_pfn <= end_pfn) {
1975 if (!pte)
1976 pte = pfn_to_dma_pte(domain, start_pfn, &level,
1977 GFP_ATOMIC);
1978
1979 if (dma_pte_present(pte)) {
1980 dma_pte_free_pagetable(domain, start_pfn,
1981 start_pfn + lvl_pages - 1,
1982 level + 1);
1983
1984 xa_for_each(&domain->iommu_array, i, info)
1985 iommu_flush_iotlb_psi(info->iommu, domain,
1986 start_pfn, lvl_pages,
1987 0, 0);
1988 }
1989
1990 pte++;
1991 start_pfn += lvl_pages;
1992 if (first_pte_in_page(pte))
1993 pte = NULL;
1994 }
1995}
1996
/*
 * Install PTEs mapping @nr_pages pages starting at @phys_pfn into
 * @domain's IOVA space at @iov_pfn with permission bits @prot, using
 * superpages where the alignment, remaining length, and hardware
 * capability allow. Returns 0, -EINVAL, or -ENOMEM.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
		 gfp_t gfp)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	phys_addr_t pteval;
	u64 attr;

	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
		return -EINVAL;

	/* A mapping with neither read nor write permission is meaningless. */
	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
		return -EINVAL;
	}

	/* Translate @prot into PTE attribute bits for the table type in use. */
	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	attr |= DMA_FL_PTE_PRESENT;
	if (domain->use_first_level) {
		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
		if (prot & DMA_PTE_WRITE)
			attr |= DMA_FL_PTE_DIRTY;
	}

	domain->has_mappings = true;

	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!pte) {
			/* Pick the largest page size usable at this position. */
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
					phys_pfn, nr_pages);

			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
					     gfp);
			if (!pte)
				return -ENOMEM;
			first_pte = pte;

			lvl_pages = lvl_to_nr_pages(largepage_lvl);

			/* It is large page*/
			if (largepage_lvl > 1) {
				unsigned long end_pfn;
				unsigned long pages_to_remove;

				pteval |= DMA_PTE_LARGE_PAGE;
				pages_to_remove = min_t(unsigned long, nr_pages,
							nr_pte_to_next_page(pte) * lvl_pages);
				end_pfn = iov_pfn + pages_to_remove - 1;
				/* Clear out small tables that the superpage replaces. */
				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* PTE was already set — a caller bug; dump a few times. */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;

		/* If the next PTE would be the first in a new page, then we
		 * need to flush the cache on the entries we've just written.
		 * And then we'll need to recalculate 'pte', so clear it and
		 * let it get set again in the if (!pte) block above.
		 *
		 * If we're done (!nr_pages) we need to flush the cache too.
		 *
		 * Also if we've been setting superpages, we may need to
		 * recalculate 'pte' and switch back to smaller pages for the
		 * end of the mapping, if the trailing size is not enough to
		 * use another superpage (i.e. nr_pages < lvl_pages).
		 */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}
	}

	return 0;
}
2104
/*
 * Tear down the context entry for (@bus, @devfn) and invalidate every
 * cache (context, PASID, IOTLB, device TLB) that may still reference it.
 */
static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
{
	struct intel_iommu *iommu = info->iommu;
	struct context_entry *context;
	u16 did_old;

	if (!iommu)
		return;

	spin_lock(&iommu->lock);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (!context) {
		spin_unlock(&iommu->lock);
		return;
	}

	/* Work out which domain ID the flushes below must target. */
	if (sm_supported(iommu)) {
		if (hw_pass_through && domain_type_is_si(info->domain))
			did_old = FLPT_DEFAULT_DID;
		else
			did_old = domain_id_iommu(info->domain, iommu);
	} else {
		did_old = context_domain_id(context);
	}

	context_clear_entry(context);
	__iommu_flush_cache(iommu, context, sizeof(*context));
	spin_unlock(&iommu->lock);
	iommu->flush.flush_context(iommu,
				   did_old,
				   (((u16)bus) << 8) | devfn,
				   DMA_CCMD_MASK_NOBIT,
				   DMA_CCMD_DEVICE_INVL);

	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);

	iommu->flush.flush_iotlb(iommu,
				 did_old,
				 0,
				 0,
				 DMA_TLB_DSI_FLUSH);

	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
}
2150
/*
 * Program a first-level translation PASID entry for @dev on @iommu,
 * using @domain's page table. Returns 0 or a negative errno.
 */
static int domain_setup_first_level(struct intel_iommu *iommu,
				    struct dmar_domain *domain,
				    struct device *dev,
				    u32 pasid)
{
	struct dma_pte *pgd = domain->pgd;
	int agaw, level;
	int flags = 0;

	/*
	 * Skip top levels of page tables for iommu which has
	 * less agaw than default. Unnecessary for PT mode.
	 */
	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
		pgd = phys_to_virt(dma_pte_addr(pgd));
		if (!dma_pte_present(pgd))
			return -ENOMEM;
	}

	/* First-level translation only supports 4- and 5-level tables. */
	level = agaw_to_level(agaw);
	if (level != 4 && level != 5)
		return -EINVAL;

	if (level == 5)
		flags |= PASID_FLAG_FL5LP;

	if (domain->force_snooping)
		flags |= PASID_FLAG_PAGE_SNOOP;

	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
					     domain_id_iommu(domain, iommu),
					     flags);
}
2184
2185static bool dev_is_real_dma_subdevice(struct device *dev)
2186{
2187 return dev && dev_is_pci(dev) &&
2188 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2189}
2190
/*
 * Map [first_vpfn, last_vpfn] 1:1 (IOVA == physical) with read/write
 * permission in @domain.  Returns 0 or the error from __domain_mapping().
 */
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long first_vpfn,
				     unsigned long last_vpfn)
{
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return __domain_mapping(domain, first_vpfn,
				first_vpfn, last_vpfn - first_vpfn + 1,
				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
}
2205
2206static int md_domain_init(struct dmar_domain *domain, int guest_width);
2207
/*
 * Allocate and initialize the static identity (si) domain.  When @hw is
 * set, hardware passthrough is used and no page tables are populated;
 * otherwise every online memory range and every RMRR is identity mapped.
 *
 * Returns 0 on success or a negative errno; on failure si_domain is
 * left NULL.
 */
static int __init si_domain_init(int hw)
{
	struct dmar_rmrr_unit *rmrr;
	struct device *dev;
	int i, nid, ret;

	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
	if (!si_domain)
		return -EFAULT;

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		si_domain = NULL;
		return -EFAULT;
	}

	/* Hardware passthrough needs no software identity mappings. */
	if (hw)
		return 0;

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		int i;

		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			ret = iommu_domain_identity_map(si_domain,
					mm_to_dma_pfn_start(start_pfn),
					mm_to_dma_pfn_end(end_pfn));
			if (ret)
				return ret;
		}
	}

	/*
	 * Identity map the RMRRs so that devices with RMRRs could also use
	 * the si_domain.
	 */
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, dev) {
			unsigned long long start = rmrr->base_address;
			unsigned long long end = rmrr->end_address;

			/* Reject inverted or out-of-address-width ranges. */
			if (WARN_ON(end < start ||
				    end >> agaw_to_width(si_domain->agaw)))
				continue;

			ret = iommu_domain_identity_map(si_domain,
					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
			if (ret)
				return ret;
		}
	}

	return 0;
}
2264
/*
 * Attach @dev to @domain: bind the domain to the device's IOMMU, set up
 * the PASID entry (scalable mode) and the context entry, then enable
 * PCI capabilities (ATS etc.) where appropriate.
 *
 * On any failure after the domain/IOMMU binding, translation for the
 * device is blocked before returning the error.
 */
static int dmar_domain_attach_device(struct dmar_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;
	unsigned long flags;
	int ret;

	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		return ret;
	info->domain = domain;
	spin_lock_irqsave(&domain->lock, flags);
	list_add(&info->link, &domain->devices);
	spin_unlock_irqrestore(&domain->lock, flags);

	/* PASID table is mandatory for a PCI device in scalable mode. */
	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
		/* Setup the PASID entry for requests without PASID: */
		if (hw_pass_through && domain_type_is_si(domain))
			ret = intel_pasid_setup_pass_through(iommu,
					dev, IOMMU_NO_PASID);
		else if (domain->use_first_level)
			ret = domain_setup_first_level(iommu, domain, dev,
					IOMMU_NO_PASID);
		else
			ret = intel_pasid_setup_second_level(iommu, domain,
					dev, IOMMU_NO_PASID);
		if (ret) {
			dev_err(dev, "Setup RID2PASID failed\n");
			device_block_translation(dev);
			return ret;
		}
	}

	ret = domain_context_mapping(domain, dev);
	if (ret) {
		dev_err(dev, "Domain context map failed\n");
		device_block_translation(dev);
		return ret;
	}

	/* Legacy-mode identity domains don't use ATS and friends. */
	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
		iommu_enable_pci_caps(info);

	return 0;
}
2312
2313/**
2314 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2315 * is relaxable (ie. is allowed to be not enforced under some conditions)
2316 * @dev: device handle
2317 *
2318 * We assume that PCI USB devices with RMRRs have them largely
2319 * for historical reasons and that the RMRR space is not actively used post
2320 * boot. This exclusion may change if vendors begin to abuse it.
2321 *
2322 * The same exception is made for graphics devices, with the requirement that
2323 * any use of the RMRR regions will be torn down before assigning the device
2324 * to a guest.
2325 *
2326 * Return: true if the RMRR is relaxable, false otherwise
2327 */
2328static bool device_rmrr_is_relaxable(struct device *dev)
2329{
2330 struct pci_dev *pdev;
2331
2332 if (!dev_is_pci(dev))
2333 return false;
2334
2335 pdev = to_pci_dev(dev);
2336 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2337 return true;
2338 else
2339 return false;
2340}
2341
2342/*
2343 * Return the required default domain type for a specific device.
2344 *
2345 * @dev: the device in query
2346 * @startup: true if this is during early boot
2347 *
2348 * Returns:
2349 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2350 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2351 * - 0: both identity and dynamic domains work for this device
2352 */
2353static int device_def_domain_type(struct device *dev)
2354{
2355 if (dev_is_pci(dev)) {
2356 struct pci_dev *pdev = to_pci_dev(dev);
2357
2358 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2359 return IOMMU_DOMAIN_IDENTITY;
2360
2361 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2362 return IOMMU_DOMAIN_IDENTITY;
2363 }
2364
2365 return 0;
2366}
2367
2368static void intel_iommu_init_qi(struct intel_iommu *iommu)
2369{
2370 /*
2371 * Start from the sane iommu hardware state.
2372 * If the queued invalidation is already initialized by us
2373 * (for example, while enabling interrupt-remapping) then
2374 * we got the things already rolling from a sane state.
2375 */
2376 if (!iommu->qi) {
2377 /*
2378 * Clear any previous faults.
2379 */
2380 dmar_fault(-1, iommu);
2381 /*
2382 * Disable queued invalidation if supported and already enabled
2383 * before OS handover.
2384 */
2385 dmar_disable_qi(iommu);
2386 }
2387
2388 if (dmar_enable_qi(iommu)) {
2389 /*
2390 * Queued Invalidate not enabled, use Register Based Invalidate
2391 */
2392 iommu->flush.flush_context = __iommu_flush_context;
2393 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2394 pr_info("%s: Using Register based invalidation\n",
2395 iommu->name);
2396 } else {
2397 iommu->flush.flush_context = qi_flush_context;
2398 iommu->flush.flush_iotlb = qi_flush_iotlb;
2399 pr_info("%s: Using Queued invalidation\n", iommu->name);
2400 }
2401}
2402
/*
 * Copy one bus worth of context entries from the previous (kdump'd)
 * kernel's tables into freshly allocated pages.
 *
 * @old_re: old kernel's root entry for this bus
 * @tbl:    array receiving pointers to the new context-table page(s)
 * @bus:    bus number being copied
 * @ext:    true for extended/scalable format, which splits each bus
 *          into a lower (LCTP) and upper (UCTP) context table
 *
 * Returns 0 on success, -ENOMEM if remapping or allocation fails.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	/* Extended format occupies two slots per bus in @tbl. */
	tbl_idx = ext ? bus * 2 : bus;
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			/* Lower half uses LCTP, upper half uses UCTP. */
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!context_present(&ce))
			continue;

		/*
		 * Reserve the old kernel's domain ID so new allocations
		 * in this kernel won't collide with it.
		 */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		set_context_copied(iommu, bus, devfn);
		new_ce[idx] = ce;
	}

	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
2486
/*
 * In a kdump kernel: copy the previous kernel's context tables and hook
 * them into our freshly allocated root table, so in-flight DMA keeps
 * working while the dump is taken.
 *
 * Returns 0 on success, -EINVAL if the table format differs or the old
 * root table address is missing, -ENOMEM on allocation failure.
 */
static int copy_translation_tables(struct intel_iommu *iommu)
{
	struct context_entry **ctxt_tbls;
	struct root_entry *old_rt;
	phys_addr_t old_rt_phys;
	int ctxt_table_entries;
	u64 rtaddr_reg;
	int bus, ret;
	bool new_ext, ext;

	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
	new_ext    = !!sm_supported(iommu);

	/*
	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means to open a window for data
	 * corruption. So bail out and don't copy anything if we would
	 * have to change the bit.
	 */
	if (new_ext != ext)
		return -EINVAL;

	/* Tracks which (bus, devfn) context entries were inherited. */
	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
	if (!iommu->copied_tables)
		return -ENOMEM;

	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
	if (!old_rt_phys)
		return -EINVAL;

	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
	if (!old_rt)
		return -ENOMEM;

	/* This is too big for the stack - allocate it from slab */
	ctxt_table_entries = ext ? 512 : 256;
	ret = -ENOMEM;
	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
	if (!ctxt_tbls)
		goto out_unmap;

	/* A per-bus copy failure is tolerated; remaining buses continue. */
	for (bus = 0; bus < 256; bus++) {
		ret = copy_context_table(iommu, &old_rt[bus],
					 ctxt_tbls, bus, ext);
		if (ret) {
			pr_err("%s: Failed to copy context table for bus %d\n",
				iommu->name, bus);
			continue;
		}
	}

	spin_lock(&iommu->lock);

	/* Context tables are copied, now write them to the root_entry table */
	for (bus = 0; bus < 256; bus++) {
		int idx = ext ? bus * 2 : bus;
		u64 val;

		if (ctxt_tbls[idx]) {
			/* Bit 0 marks the root entry present. */
			val = virt_to_phys(ctxt_tbls[idx]) | 1;
			iommu->root_entry[bus].lo = val;
		}

		if (!ext || !ctxt_tbls[idx + 1])
			continue;

		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
		iommu->root_entry[bus].hi = val;
	}

	spin_unlock(&iommu->lock);

	kfree(ctxt_tbls);

	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);

	ret = 0;

out_unmap:
	memunmap(old_rt);

	return ret;
}
2571
/*
 * One-time boot initialization of all DMAR units: audit capabilities,
 * set up invalidation, domains and root entries (copying tables from a
 * previous kernel in the kdump case), build the static identity domain,
 * and finally enable fault reporting on every active IOMMU.
 *
 * On failure all active IOMMUs are torn down and the si_domain freed.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
	if (ret)
		goto free_iommu;

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/* Pre-enabled translation is only honored in kdump mode. */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		/* Passthrough works only if every IOMMU supports it. */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
	}

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	dmar_map_gfx = 0;
#endif

	if (!dmar_map_gfx)
		iommu_identity_mapping |= IDENTMAP_GFX;

	check_tylersburg_isoch();

	ret = si_domain_init(hw_pass_through);
	if (ret)
		goto free_iommu;

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			up_write(&dmar_global_lock);
			ret = intel_svm_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}
	if (si_domain) {
		domain_exit(si_domain);
		si_domain = NULL;
	}

	return ret;
}
2728
/*
 * Mark DRHD units that should be ignored: units with an empty device
 * scope, and (when gfx mapping is disabled) units that serve only
 * graphics devices.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		/* A non-gfx device in scope means this unit stays active. */
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		drhd->gfx_dedicated = 1;
		if (!dmar_map_gfx)
			drhd->ignored = 1;
	}
}
2764
2765#ifdef CONFIG_SUSPEND
/*
 * Re-program all IOMMU hardware after a suspend: re-enable queued
 * invalidation where it was in use, then restore root entries and turn
 * translation back on for every non-ignored unit.
 *
 * Returns 0 on success or the error from dmar_reenable_qi().
 */
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	int ret;

	for_each_active_iommu(iommu, drhd) {
		if (iommu->qi) {
			ret = dmar_reenable_qi(iommu);
			if (ret)
				return ret;
		}
	}

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
		iommu_enable_translation(iommu);
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}
2799
/*
 * Globally invalidate the context cache and IOTLB on every active
 * IOMMU (used before suspending).
 */
static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}
2812
/*
 * Syscore suspend hook: flush all caches, disable translation, and save
 * the fault-event registers of every active IOMMU so they can be
 * restored on resume.  Always returns 0.
 */
static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		/* register_lock serializes MMIO register access. */
		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;
}
2839
/*
 * Syscore resume hook: re-initialize the IOMMU hardware and restore the
 * fault-event registers saved by iommu_suspend().  Panics (if force_on)
 * or warns when the hardware cannot be brought back.
 */
static void iommu_resume(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		if (force_on)
			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
		else
			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return;
	}

	for_each_active_iommu(iommu, drhd) {

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
			iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
			iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
			iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
			iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
}
2870
/* System-core PM callbacks for suspend/resume of the IOMMUs. */
static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};
2875
/* Register the suspend/resume callbacks with the syscore framework. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
2880
2881#else
2882static inline void init_iommu_pm_ops(void) {}
#endif /* CONFIG_SUSPEND */
2884
2885static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2886{
2887 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2888 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2889 rmrr->end_address <= rmrr->base_address ||
2890 arch_rmrr_sanity_check(rmrr))
2891 return -EINVAL;
2892
2893 return 0;
2894}
2895
/*
 * ACPI DMAR parse callback for an RMRR structure: warn (and taint) on a
 * broken BIOS range, then record the unit and its device scope on
 * dmar_rmrr_units.  Returns 0 on success, -ENOMEM on allocation failure.
 */
int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
{
	struct acpi_dmar_reserved_memory *rmrr;
	struct dmar_rmrr_unit *rmrru;

	rmrr = (struct acpi_dmar_reserved_memory *)header;
	if (rmrr_sanity_check(rmrr)) {
		pr_warn(FW_BUG
			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
			   rmrr->base_address, rmrr->end_address,
			   dmi_get_system_info(DMI_BIOS_VENDOR),
			   dmi_get_system_info(DMI_BIOS_VERSION),
			   dmi_get_system_info(DMI_PRODUCT_VERSION));
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
	}

	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
	if (!rmrru)
		goto out;

	rmrru->hdr = header;

	rmrru->base_address = rmrr->base_address;
	rmrru->end_address = rmrr->end_address;

	/* Device scope entries follow the fixed-size RMRR header. */
	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				&rmrru->devices_cnt);
	if (rmrru->devices_cnt && rmrru->devices == NULL)
		goto free_rmrru;

	list_add(&rmrru->list, &dmar_rmrr_units);

	return 0;
free_rmrru:
	kfree(rmrru);
out:
	return -ENOMEM;
}
2936
2937static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2938{
2939 struct dmar_atsr_unit *atsru;
2940 struct acpi_dmar_atsr *tmp;
2941
2942 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2943 dmar_rcu_check()) {
2944 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2945 if (atsr->segment != tmp->segment)
2946 continue;
2947 if (atsr->header.length != tmp->header.length)
2948 continue;
2949 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2950 return atsru;
2951 }
2952
2953 return NULL;
2954}
2955
/*
 * ACPI DMAR parse callback for an ATSR structure: register the unit and
 * its device scope on dmar_atsr_units unless an identical one already
 * exists.  Returns 0 on success or -ENOMEM.
 */
int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	/* Ignore hot-added tables when the IOMMU is not in use. */
	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
		return 0;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru)
		return 0;

	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
	if (!atsru)
		return -ENOMEM;

	/*
	 * If memory is allocated from slab by ACPI _DSM method, we need to
	 * copy the memory content because the memory buffer will be freed
	 * on return.
	 */
	atsru->hdr = (void *)(atsru + 1);
	memcpy(atsru->hdr, hdr, hdr->length);
	atsru->include_all = atsr->flags & 0x1;
	if (!atsru->include_all) {
		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
				(void *)atsr + atsr->header.length,
				&atsru->devices_cnt);
		if (atsru->devices_cnt && atsru->devices == NULL) {
			kfree(atsru);
			return -ENOMEM;
		}
	}

	list_add_rcu(&atsru->list, &dmar_atsr_units);

	return 0;
}
2995
/* Release an ATSR unit's device scope and the unit itself. */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
3001
/*
 * ACPI DMAR release callback: unlink and free the registered ATSR unit
 * matching @hdr, if any.  Always returns 0.
 */
int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru) {
		list_del_rcu(&atsru->list);
		/* Wait for concurrent RCU readers before freeing. */
		synchronize_rcu();
		intel_iommu_free_atsr(atsru);
	}

	return 0;
}
3017
/*
 * ACPI DMAR check callback: return -EBUSY if the ATSR matching @hdr
 * still has active devices in its scope, 0 otherwise.
 */
int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	int i;
	struct device *dev;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (!atsru)
		return 0;

	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
		/* Any active device in scope makes the unit busy. */
		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
					  i, dev)
			return -EBUSY;
	}

	return 0;
}
3038
3039static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3040{
3041 struct dmar_satc_unit *satcu;
3042 struct acpi_dmar_satc *tmp;
3043
3044 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3045 dmar_rcu_check()) {
3046 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3047 if (satc->segment != tmp->segment)
3048 continue;
3049 if (satc->header.length != tmp->header.length)
3050 continue;
3051 if (memcmp(satc, tmp, satc->header.length) == 0)
3052 return satcu;
3053 }
3054
3055 return NULL;
3056}
3057
/*
 * ACPI DMAR parse callback for a SATC structure: register the unit and
 * its device scope on dmar_satc_units unless an identical one already
 * exists.  Returns 0 on success or -ENOMEM.
 */
int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_satc *satc;
	struct dmar_satc_unit *satcu;

	/* Ignore hot-added tables when the IOMMU is not in use. */
	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
		return 0;

	satc = container_of(hdr, struct acpi_dmar_satc, header);
	satcu = dmar_find_satc(satc);
	if (satcu)
		return 0;

	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
	if (!satcu)
		return -ENOMEM;

	/* Copy the ACPI data; the source buffer may be freed on return. */
	satcu->hdr = (void *)(satcu + 1);
	memcpy(satcu->hdr, hdr, hdr->length);
	satcu->atc_required = satc->flags & 0x1;
	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
					      (void *)satc + satc->header.length,
					      &satcu->devices_cnt);
	if (satcu->devices_cnt && !satcu->devices) {
		kfree(satcu);
		return -ENOMEM;
	}
	list_add_rcu(&satcu->list, &dmar_satc_units);

	return 0;
}
3089
/*
 * Bring up a hot-added DMAR unit: audit capabilities, verify it matches
 * system-wide assumptions (pass-through, superpage), allocate domains
 * and a root entry, then enable invalidation, interrupts and
 * translation.  Returns 0 on success or a negative errno; on failure
 * the IOMMU is torn back down.
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
	if (ret)
		goto out;

	/* A unit lacking pass-through breaks an established hw_pass_through. */
	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}

	/* Likewise for the superpage size already in system-wide use. */
	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

	intel_svm_check(iommu);

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	iommu_set_root_entry(iommu);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}
3161
3162int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3163{
3164 int ret = 0;
3165 struct intel_iommu *iommu = dmaru->iommu;
3166
3167 if (!intel_iommu_enabled)
3168 return 0;
3169 if (iommu == NULL)
3170 return -EINVAL;
3171
3172 if (insert) {
3173 ret = intel_iommu_add(dmaru);
3174 } else {
3175 disable_dmar_iommu(iommu);
3176 free_dmar_iommu(iommu);
3177 }
3178
3179 return ret;
3180}
3181
/* Free all registered RMRR, ATSR and SATC units and their scopes. */
static void intel_iommu_free_dmars(void)
{
	struct dmar_rmrr_unit *rmrru, *rmrr_n;
	struct dmar_atsr_unit *atsru, *atsr_n;
	struct dmar_satc_unit *satcu, *satc_n;

	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
		list_del(&rmrru->list);
		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
		kfree(rmrru);
	}

	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
		list_del(&atsru->list);
		intel_iommu_free_atsr(atsru);
	}
	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
		list_del(&satcu->list);
		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
		kfree(satcu);
	}
}
3204
/*
 * Find the SATC unit whose device scope contains @dev (after resolving
 * VFs to their PF), or NULL if none does.
 */
static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
{
	struct dmar_satc_unit *satcu;
	struct acpi_dmar_satc *satc;
	struct device *tmp;
	int i;

	/* SATC scopes list physical functions only. */
	dev = pci_physfn(dev);
	rcu_read_lock();

	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
		if (satc->segment != pci_domain_nr(dev->bus))
			continue;
		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
			if (to_pci_dev(tmp) == dev)
				goto out;
	}
	satcu = NULL;
out:
	rcu_read_unlock();
	return satcu;
}
3228
/*
 * Decide whether the OS may enable ATS for @dev behind @iommu, based on
 * the firmware SATC/ATSR tables and the device's path to its root port.
 * Returns 1 if ATS may be enabled, 0 otherwise.
 */
static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
{
	int i, ret = 1;
	struct pci_bus *bus;
	struct pci_dev *bridge = NULL;
	struct device *tmp;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;
	struct dmar_satc_unit *satcu;

	dev = pci_physfn(dev);
	satcu = dmar_find_matched_satc_unit(dev);
	if (satcu)
		/*
		 * This device supports ATS as it is in SATC table.
		 * When IOMMU is in legacy mode, enabling ATS is done
		 * automatically by HW for the device that requires
		 * ATS, hence OS should not enable this device ATS
		 * to avoid duplicated TLB invalidation.
		 */
		return !(satcu->atc_required && !sm_supported(iommu));

	/* Walk up to the root port, rejecting non-PCIe hops. */
	for (bus = dev->bus; bus; bus = bus->parent) {
		bridge = bus->self;
		/* If it's an integrated device, allow ATS */
		if (!bridge)
			return 1;
		/* Connected via non-PCIe: no ATS */
		if (!pci_is_pcie(bridge) ||
		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
		/* If we found the root port, look it up in the ATSR */
		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
			break;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (atsr->segment != pci_domain_nr(dev->bus))
			continue;

		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
			if (tmp == &bridge->dev)
				goto out;

		if (atsru->include_all)
			goto out;
	}
	ret = 0;
out:
	rcu_read_unlock();

	return ret;
}
3284
/*
 * PCI bus-notifier hook: keep RMRR, ATSR and SATC device-scope arrays
 * in sync as devices are added to or removed from the bus.  Returns 0
 * or a negative errno from dmar_insert_dev_scope().
 */
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct dmar_satc_unit *satcu;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;
	struct acpi_dmar_satc *satc;

	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
		return 0;

	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		/* include_all units have no explicit scope to maintain. */
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			/* > 0 means the device matched this unit: done. */
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}
	list_for_each_entry(satcu, &dmar_satc_units, list) {
		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
					(void *)satc + satc->header.length,
					satc->segment, satcu->devices,
					satcu->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, satc->segment,
					satcu->devices, satcu->devices_cnt))
				break;
		}
	}

	return 0;
}
3354
/*
 * Memory-hotplug notifier: keep the si_domain's identity mappings in
 * sync as memory is onlined/offlined.  Returns NOTIFY_OK or NOTIFY_BAD
 * if the identity mapping for a new range cannot be built.
 */
static int intel_iommu_memory_notifier(struct notifier_block *nb,
				       unsigned long val, void *v)
{
	struct memory_notify *mhp = v;
	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
			mhp->nr_pages - 1);

	switch (val) {
	case MEM_GOING_ONLINE:
		/* Identity-map the incoming range before it is used. */
		if (iommu_domain_identity_map(si_domain,
					      start_vpfn, last_vpfn)) {
			pr_warn("Failed to build identity map for [%lx-%lx]\n",
				start_vpfn, last_vpfn);
			return NOTIFY_BAD;
		}
		break;

	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
		{
			struct dmar_drhd_unit *drhd;
			struct intel_iommu *iommu;
			LIST_HEAD(freelist);

			/* Unmap, flush every IOMMU's IOTLB, then free pages. */
			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);

			rcu_read_lock();
			for_each_active_iommu(iommu, drhd)
				iommu_flush_iotlb_psi(iommu, si_domain,
					start_vpfn, mhp->nr_pages,
					list_empty(&freelist), 0);
			rcu_read_unlock();
			put_pages_list(&freelist);
		}
		break;
	}

	return NOTIFY_OK;
}
3395
/* Registered in intel_iommu_init() when the si domain needs maintenance. */
static struct notifier_block intel_iommu_memory_nb = {
	.notifier_call = intel_iommu_memory_notifier,
	.priority = 0
};
3400
3401static void intel_disable_iommus(void)
3402{
3403 struct intel_iommu *iommu = NULL;
3404 struct dmar_drhd_unit *drhd;
3405
3406 for_each_iommu(iommu, drhd)
3407 iommu_disable_translation(iommu);
3408}
3409
3410void intel_iommu_shutdown(void)
3411{
3412 struct dmar_drhd_unit *drhd;
3413 struct intel_iommu *iommu = NULL;
3414
3415 if (no_iommu || dmar_disabled)
3416 return;
3417
3418 down_write(&dmar_global_lock);
3419
3420 /* Disable PMRs explicitly here. */
3421 for_each_iommu(iommu, drhd)
3422 iommu_disable_protect_mem_regions(iommu);
3423
3424 /* Make sure the IOMMUs are switched off */
3425 intel_disable_iommus();
3426
3427 up_write(&dmar_global_lock);
3428}
3429
3430static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3431{
3432 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3433
3434 return container_of(iommu_dev, struct intel_iommu, iommu);
3435}
3436
3437static ssize_t version_show(struct device *dev,
3438 struct device_attribute *attr, char *buf)
3439{
3440 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3441 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3442 return sysfs_emit(buf, "%d:%d\n",
3443 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3444}
3445static DEVICE_ATTR_RO(version);
3446
3447static ssize_t address_show(struct device *dev,
3448 struct device_attribute *attr, char *buf)
3449{
3450 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3451 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3452}
3453static DEVICE_ATTR_RO(address);
3454
3455static ssize_t cap_show(struct device *dev,
3456 struct device_attribute *attr, char *buf)
3457{
3458 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3459 return sysfs_emit(buf, "%llx\n", iommu->cap);
3460}
3461static DEVICE_ATTR_RO(cap);
3462
3463static ssize_t ecap_show(struct device *dev,
3464 struct device_attribute *attr, char *buf)
3465{
3466 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3467 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3468}
3469static DEVICE_ATTR_RO(ecap);
3470
3471static ssize_t domains_supported_show(struct device *dev,
3472 struct device_attribute *attr, char *buf)
3473{
3474 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3475 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3476}
3477static DEVICE_ATTR_RO(domains_supported);
3478
3479static ssize_t domains_used_show(struct device *dev,
3480 struct device_attribute *attr, char *buf)
3481{
3482 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3483 return sysfs_emit(buf, "%d\n",
3484 bitmap_weight(iommu->domain_ids,
3485 cap_ndoms(iommu->cap)));
3486}
3487static DEVICE_ATTR_RO(domains_used);
3488
/* Per-IOMMU sysfs attributes exposed under the "intel-iommu" group. */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};
3498
/* Group the attributes under an "intel-iommu" subdirectory in sysfs. */
static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};
3503
/* NULL-terminated group list passed to iommu_device_sysfs_add(). */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
3508
3509static bool has_external_pci(void)
3510{
3511 struct pci_dev *pdev = NULL;
3512
3513 for_each_pci_dev(pdev)
3514 if (pdev->external_facing) {
3515 pci_dev_put(pdev);
3516 return true;
3517 }
3518
3519 return false;
3520}
3521
/*
 * Honor the firmware's DMAR platform opt-in: when DMA protection was
 * requested and an external-facing PCI device exists, force the IOMMU on
 * (unless the user explicitly opted out via no_platform_optin).
 * Returns 1 when the IOMMU was forced on, 0 otherwise.
 */
static int __init platform_optin_force_iommu(void)
{
	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
		return 0;

	if (no_iommu || dmar_disabled)
		pr_info("Intel-IOMMU force enabled due to platform opt in\n");

	/*
	 * If Intel-IOMMU is disabled by default, we will apply identity
	 * map for all devices except those marked as being untrusted.
	 */
	if (dmar_disabled)
		iommu_set_default_passthrough(false);

	/* Override any earlier "off" decision from the command line. */
	dmar_disabled = 0;
	no_iommu = 0;

	return 1;
}
3542
/*
 * Probe devices enumerated through the ACPI namespace (rather than PCI)
 * that appear in DRHD device scopes, so their physical companions get
 * attached to the IOMMU as well. Returns 0 or the first probe error.
 */
static int __init probe_acpi_namespace_devices(void)
{
	struct dmar_drhd_unit *drhd;
	/* To avoid a -Wunused-but-set-variable warning. */
	struct intel_iommu *iommu __maybe_unused;
	struct device *dev;
	int i, ret = 0;

	for_each_active_iommu(iommu, drhd) {
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev) {
			struct acpi_device_physical_node *pn;
			struct acpi_device *adev;

			/* Only ACPI-enumerated scope entries matter here. */
			if (dev->bus != &acpi_bus_type)
				continue;

			adev = to_acpi_device(dev);
			/* Lock stabilizes the physical-node list walk. */
			mutex_lock(&adev->physical_node_lock);
			list_for_each_entry(pn,
					    &adev->physical_node_list, node) {
				ret = iommu_probe_device(pn->dev);
				if (ret)
					break;
			}
			mutex_unlock(&adev->physical_node_lock);

			/* Propagate the failure only after dropping the lock. */
			if (ret)
				return ret;
		}
	}

	return 0;
}
3577
/*
 * A TXT/tboot measured launch requires DMA remapping; override any
 * "iommu off" request from the command line. Returns 1 when forcing,
 * 0 when tboot is not active.
 */
static __init int tboot_force_iommu(void)
{
	if (!tboot_enabled())
		return 0;

	if (no_iommu || dmar_disabled)
		pr_warn("Forcing Intel-IOMMU to enabled\n");

	dmar_disabled = 0;
	no_iommu = 0;

	return 1;
}
3591
/*
 * Main VT-d initialization entry point: parse the DMAR tables, set up
 * the remapping structures (init_dmars()), register each IOMMU with the
 * core and sysfs, and finally enable translation. All table walking is
 * done under dmar_global_lock; note the deliberate lock drop around
 * dmar_register_bus_notifier() to keep lockdep happy.
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		    platform_optin_force_iommu();

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (list_empty(&dmar_satc_units))
		pr_info("No SATC found\n");

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_dmar;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	/* Registration only reads the DRHD list, so a read lock suffices. */
	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		/*
		 * The flush queue implementation does not perform
		 * page-selective invalidations that are required for efficient
		 * TLB flushes in virtual environments. The benefit of batching
		 * is likely to be much lower than the overhead of synchronizing
		 * the virtual and physical IOMMU page-tables.
		 */
		if (cap_caching_mode(iommu->cap) &&
		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
			pr_info_once("IOMMU batching disallowed due to virtualization\n");
			iommu_set_dma_strict();
		}
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);

		iommu_pmu_register(iommu);
	}
	up_read(&dmar_global_lock);

	/* Track memory hotplug only when the si domain maintains real maps. */
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);

	down_read(&dmar_global_lock);
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	return ret;
}
3726
3727static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3728{
3729 struct device_domain_info *info = opaque;
3730
3731 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3732 return 0;
3733}
3734
3735/*
3736 * NB - intel-iommu lacks any sort of reference counting for the users of
3737 * dependent devices. If multiple endpoints have intersecting dependent
3738 * devices, unbinding the driver from any one of them will possibly leave
3739 * the others unable to operate.
3740 */
3741static void domain_context_clear(struct device_domain_info *info)
3742{
3743 if (!dev_is_pci(info->dev))
3744 domain_context_clear_one(info, info->bus, info->devfn);
3745
3746 pci_for_each_dma_alias(to_pci_dev(info->dev),
3747 &domain_context_clear_one_cb, info);
3748}
3749
/*
 * Tear down IOMMU state for @dev on release: invalidate its PASID and
 * context entries, unlink it from its domain under the domain lock, and
 * drop the domain's reference on the IOMMU.
 */
static void dmar_remove_one_dev_info(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dmar_domain *domain = info->domain;
	struct intel_iommu *iommu = info->iommu;
	unsigned long flags;

	/* Real DMA sub-devices have no context entries of their own. */
	if (!dev_is_real_dma_subdevice(info->dev)) {
		if (dev_is_pci(info->dev) && sm_supported(iommu))
			intel_pasid_tear_down_entry(iommu, info->dev,
					IOMMU_NO_PASID, false);

		iommu_disable_pci_caps(info);
		domain_context_clear(info);
	}

	spin_lock_irqsave(&domain->lock, flags);
	list_del(&info->link);
	spin_unlock_irqrestore(&domain->lock, flags);

	domain_detach_iommu(domain, iommu);
	info->domain = NULL;
}
3773
3774/*
3775 * Clear the page table pointer in context or pasid table entries so that
3776 * all DMA requests without PASID from the device are blocked. If the page
3777 * table has been set, clean up the data structures.
3778 */
3779void device_block_translation(struct device *dev)
3780{
3781 struct device_domain_info *info = dev_iommu_priv_get(dev);
3782 struct intel_iommu *iommu = info->iommu;
3783 unsigned long flags;
3784
3785 iommu_disable_pci_caps(info);
3786 if (!dev_is_real_dma_subdevice(dev)) {
3787 if (sm_supported(iommu))
3788 intel_pasid_tear_down_entry(iommu, dev,
3789 IOMMU_NO_PASID, false);
3790 else
3791 domain_context_clear(info);
3792 }
3793
3794 if (!info->domain)
3795 return;
3796
3797 spin_lock_irqsave(&info->domain->lock, flags);
3798 list_del(&info->link);
3799 spin_unlock_irqrestore(&info->domain->lock, flags);
3800
3801 domain_detach_iommu(info->domain, iommu);
3802 info->domain = NULL;
3803}
3804
3805static int md_domain_init(struct dmar_domain *domain, int guest_width)
3806{
3807 int adjust_width;
3808
3809 /* calculate AGAW */
3810 domain->gaw = guest_width;
3811 adjust_width = guestwidth_to_adjustwidth(guest_width);
3812 domain->agaw = width_to_agaw(adjust_width);
3813
3814 domain->iommu_coherency = false;
3815 domain->iommu_superpage = 0;
3816 domain->max_addr = 0;
3817
3818 /* always allocate the top pgd */
3819 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3820 if (!domain->pgd)
3821 return -ENOMEM;
3822 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3823 return 0;
3824}
3825
/* attach_dev op of the blocking domain: abort all DMA from the device. */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);
	return 0;
}
3832
/* Singleton domain used by the core to block all DMA from a device. */
static struct iommu_domain blocking_domain = {
	.type = IOMMU_DOMAIN_BLOCKED,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = blocking_domain_attach_dev,
	}
};
3839
3840static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3841{
3842 struct dmar_domain *dmar_domain;
3843 struct iommu_domain *domain;
3844
3845 switch (type) {
3846 case IOMMU_DOMAIN_DMA:
3847 case IOMMU_DOMAIN_UNMANAGED:
3848 dmar_domain = alloc_domain(type);
3849 if (!dmar_domain) {
3850 pr_err("Can't allocate dmar_domain\n");
3851 return NULL;
3852 }
3853 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3854 pr_err("Domain initialization failed\n");
3855 domain_exit(dmar_domain);
3856 return NULL;
3857 }
3858
3859 domain = &dmar_domain->domain;
3860 domain->geometry.aperture_start = 0;
3861 domain->geometry.aperture_end =
3862 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3863 domain->geometry.force_aperture = true;
3864
3865 return domain;
3866 case IOMMU_DOMAIN_IDENTITY:
3867 return &si_domain->domain;
3868 case IOMMU_DOMAIN_SVA:
3869 return intel_svm_domain_alloc();
3870 default:
3871 return NULL;
3872 }
3873
3874 return NULL;
3875}
3876
/*
 * domain_alloc_user op (iommufd path): validate the requested flags
 * against hardware capability, then allocate either a nested domain
 * (when @parent is given) or a fully initialized paging domain with
 * optional nest-parent / dirty-tracking properties.
 */
static struct iommu_domain *
intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
			      struct iommu_domain *parent,
			      const struct iommu_user_data *user_data)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
	struct intel_iommu *iommu = info->iommu;
	struct iommu_domain *domain;

	/* Must be NESTING domain */
	if (parent) {
		/* No allocation flags are accepted for a nested domain. */
		if (!nested_supported(iommu) || flags)
			return ERR_PTR(-EOPNOTSUPP);
		return intel_nested_domain_alloc(parent, user_data);
	}

	/* Reject any flag beyond the two this driver understands. */
	if (flags &
	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
		return ERR_PTR(-EOPNOTSUPP);
	if (nested_parent && !nested_supported(iommu))
		return ERR_PTR(-EOPNOTSUPP);
	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
		return ERR_PTR(-EOPNOTSUPP);

	/*
	 * domain_alloc_user op needs to fully initialize a domain before
	 * return, so uses iommu_domain_alloc() here for simple.
	 */
	domain = iommu_domain_alloc(dev->bus);
	if (!domain)
		return ERR_PTR(-ENOMEM);

	if (nested_parent)
		to_dmar_domain(domain)->nested_parent = true;

	if (dirty_tracking) {
		/* SSADS dirty tracking only works on second-level tables. */
		if (to_dmar_domain(domain)->use_first_level) {
			iommu_domain_free(domain);
			return ERR_PTR(-EOPNOTSUPP);
		}
		domain->dirty_ops = &intel_dirty_ops;
	}

	return domain;
}
3924
3925static void intel_iommu_domain_free(struct iommu_domain *domain)
3926{
3927 if (domain != &si_domain->domain)
3928 domain_exit(to_dmar_domain(domain));
3929}
3930
/*
 * Validate that @dev's IOMMU can serve @domain and adapt the domain to
 * the hardware: clamp the guest address width to what the IOMMU supports
 * and shrink the page-table depth to the IOMMU's AGAW if needed.
 * Returns 0 on success, -EINVAL when capabilities don't match.
 */
int prepare_domain_attach_device(struct iommu_domain *domain,
				 struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu = info->iommu;
	int addr_width;

	/* Force-snooping needs extended snoop control on every IOMMU. */
	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
		return -EINVAL;

	/* Dirty tracking requires SSADS support. */
	if (domain->dirty_ops && !ssads_supported(iommu))
		return -EINVAL;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	/* Existing mappings must fit into the (possibly smaller) width. */
	if (dmar_domain->max_addr > (1LL << addr_width))
		return -EINVAL;
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		/* Descend into the (single) next-level table and free the top. */
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return 0;
}
3970
/*
 * attach_dev op: detach @dev from any current domain (blocking its DMA),
 * verify/adjust the target domain for this IOMMU, then attach.
 */
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	int ret;

	/* Leave the old domain first; attach below builds fresh state. */
	if (info->domain)
		device_block_translation(dev);

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
}
3986
3987static int intel_iommu_map(struct iommu_domain *domain,
3988 unsigned long iova, phys_addr_t hpa,
3989 size_t size, int iommu_prot, gfp_t gfp)
3990{
3991 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3992 u64 max_addr;
3993 int prot = 0;
3994
3995 if (iommu_prot & IOMMU_READ)
3996 prot |= DMA_PTE_READ;
3997 if (iommu_prot & IOMMU_WRITE)
3998 prot |= DMA_PTE_WRITE;
3999 if (dmar_domain->set_pte_snp)
4000 prot |= DMA_PTE_SNP;
4001
4002 max_addr = iova + size;
4003 if (dmar_domain->max_addr < max_addr) {
4004 u64 end;
4005
4006 /* check if minimum agaw is sufficient for mapped address */
4007 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4008 if (end < max_addr) {
4009 pr_err("%s: iommu width (%d) is not "
4010 "sufficient for the mapped address (%llx)\n",
4011 __func__, dmar_domain->gaw, max_addr);
4012 return -EFAULT;
4013 }
4014 dmar_domain->max_addr = max_addr;
4015 }
4016 /* Round up size to next multiple of PAGE_SIZE, if it and
4017 the low bits of hpa would take us onto the next page */
4018 size = aligned_nrpages(hpa, size);
4019 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4020 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4021}
4022
4023static int intel_iommu_map_pages(struct iommu_domain *domain,
4024 unsigned long iova, phys_addr_t paddr,
4025 size_t pgsize, size_t pgcount,
4026 int prot, gfp_t gfp, size_t *mapped)
4027{
4028 unsigned long pgshift = __ffs(pgsize);
4029 size_t size = pgcount << pgshift;
4030 int ret;
4031
4032 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4033 return -EINVAL;
4034
4035 if (!IS_ALIGNED(iova | paddr, pgsize))
4036 return -EINVAL;
4037
4038 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4039 if (!ret && mapped)
4040 *mapped = size;
4041
4042 return ret;
4043}
4044
/*
 * Unmap [iova, iova + size) from @domain. If the range starts inside a
 * large-page mapping, the whole large page is unmapped and the enlarged
 * size is returned. Freed page-table pages are parked on the gather
 * freelist until the IOTLB has been synced.
 */
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size,
				struct iommu_iotlb_gather *gather)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long start_pfn, last_pfn;
	int level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
				     &level, GFP_ATOMIC)))
		return 0;

	/* Widen the request to the full large page if needed. */
	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);

	/* Shrink the high-water mark when the topmost mapping goes away. */
	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	/*
	 * We do not use page-selective IOTLB invalidation in flush queue,
	 * so there is no need to track page and sync iotlb.
	 */
	if (!iommu_iotlb_gather_queued(gather))
		iommu_iotlb_gather_add_page(domain, gather, iova, size);

	return size;
}
4079
4080static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4081 unsigned long iova,
4082 size_t pgsize, size_t pgcount,
4083 struct iommu_iotlb_gather *gather)
4084{
4085 unsigned long pgshift = __ffs(pgsize);
4086 size_t size = pgcount << pgshift;
4087
4088 return intel_iommu_unmap(domain, iova, size, gather);
4089}
4090
/*
 * iotlb_sync op: flush the gathered range on every IOMMU serving the
 * domain, then release the page-table pages collected during unmap.
 */
static void intel_iommu_tlb_sync(struct iommu_domain *domain,
				 struct iommu_iotlb_gather *gather)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long iova_pfn = IOVA_PFN(gather->start);
	size_t size = gather->end - gather->start;
	struct iommu_domain_info *info;
	unsigned long start_pfn;
	unsigned long nrpages;
	unsigned long i;

	nrpages = aligned_nrpages(gather->start, size);
	start_pfn = mm_to_dma_pfn_start(iova_pfn);

	/* One flush per IOMMU this domain is attached to. */
	xa_for_each(&dmar_domain->iommu_array, i, info)
		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
				      start_pfn, nrpages,
				      list_empty(&gather->freelist), 0);

	/* Safe to free only after all IOTLB references are gone. */
	put_pages_list(&gather->freelist);
}
4112
/*
 * iova_to_phys op: walk the page table for @iova and return the backing
 * physical address (0 when unmapped). Works for large pages by masking
 * in the offset bits appropriate to the level the walk stopped at.
 */
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
			     GFP_ATOMIC);
	if (pte && dma_pte_present(pte))
		/* Add the in-page offset for the (possibly large) page size. */
		phys = dma_pte_addr(pte) +
			(iova & (BIT_MASK(level_to_offset_bits(level) +
						VTD_PAGE_SHIFT) - 1));

	return phys;
}
4130
4131static bool domain_support_force_snooping(struct dmar_domain *domain)
4132{
4133 struct device_domain_info *info;
4134 bool support = true;
4135
4136 assert_spin_locked(&domain->lock);
4137 list_for_each_entry(info, &domain->devices, link) {
4138 if (!ecap_sc_support(info->iommu->ecap)) {
4139 support = false;
4140 break;
4141 }
4142 }
4143
4144 return support;
4145}
4146
/*
 * Enable snooping enforcement for the domain. Caller holds domain->lock
 * and has already verified domain_support_force_snooping().
 */
static void domain_set_force_snooping(struct dmar_domain *domain)
{
	struct device_domain_info *info;

	assert_spin_locked(&domain->lock);
	/*
	 * Second level page table supports per-PTE snoop control. The
	 * iommu_map() interface will handle this by setting SNP bit.
	 */
	if (!domain->use_first_level) {
		domain->set_pte_snp = true;
		return;
	}

	/* First-level tables: flip the PGSNP control in each PASID entry. */
	list_for_each_entry(info, &domain->devices, link)
		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
						     IOMMU_NO_PASID);
}
4165
/*
 * enforce_cache_coherency op: make all current and future mappings of
 * the domain snoop the CPU caches. Fails when some attached IOMMU lacks
 * snoop control, or when a second-level domain already has mappings
 * (existing PTEs would miss the SNP bit).
 */
static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long flags;

	/* Already enforced; nothing to do. */
	if (dmar_domain->force_snooping)
		return true;

	spin_lock_irqsave(&dmar_domain->lock, flags);
	if (!domain_support_force_snooping(dmar_domain) ||
	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
		spin_unlock_irqrestore(&dmar_domain->lock, flags);
		return false;
	}

	domain_set_force_snooping(dmar_domain);
	dmar_domain->force_snooping = true;
	spin_unlock_irqrestore(&dmar_domain->lock, flags);

	return true;
}
4187
4188static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4189{
4190 struct device_domain_info *info = dev_iommu_priv_get(dev);
4191
4192 switch (cap) {
4193 case IOMMU_CAP_CACHE_COHERENCY:
4194 case IOMMU_CAP_DEFERRED_FLUSH:
4195 return true;
4196 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4197 return dmar_platform_optin();
4198 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4199 return ecap_sc_support(info->iommu->ecap);
4200 case IOMMU_CAP_DIRTY_TRACKING:
4201 return ssads_supported(info->iommu);
4202 default:
4203 return false;
4204 }
4205}
4206
/*
 * probe_device op: look up the IOMMU serving @dev, allocate and populate
 * its device_domain_info (bus/devfn/segment, ATS/PASID/PRI capability
 * bits), allocate a PASID table in scalable mode, and hand the
 * iommu_device back to the core. Returns ERR_PTR on failure.
 */
static struct iommu_device *intel_iommu_probe_device(struct device *dev)
{
	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	u8 bus, devfn;
	int ret;

	iommu = device_lookup_iommu(dev, &bus, &devfn);
	if (!iommu || !iommu->iommu.ops)
		return ERR_PTR(-ENODEV);

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);

	/* Real DMA sub-devices use the requester ID of their parent PCI dev. */
	if (dev_is_real_dma_subdevice(dev)) {
		info->bus = pdev->bus->number;
		info->devfn = pdev->devfn;
		info->segment = pci_domain_nr(pdev->bus);
	} else {
		info->bus = bus;
		info->devfn = devfn;
		info->segment = iommu->segment;
	}

	info->dev = dev;
	info->iommu = iommu;
	if (dev_is_pci(dev)) {
		if (ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_ats_supported(pdev) &&
		    dmar_ats_supported(pdev, iommu)) {
			info->ats_supported = 1;
			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);

			/*
			 * For IOMMU that supports device IOTLB throttling
			 * (DIT), we assign PFSID to the invalidation desc
			 * of a VF such that IOMMU HW can gauge queue depth
			 * at PF level. If DIT is not set, PFSID will be
			 * treated as reserved, which should be set to 0.
			 */
			if (ecap_dit(iommu->ecap))
				info->pfsid = pci_dev_id(pci_physfn(pdev));
			info->ats_qdep = pci_ats_queue_depth(pdev);
		}
		if (sm_supported(iommu)) {
			if (pasid_supported(iommu)) {
				int features = pci_pasid_features(pdev);

				/* Bit 0 marks "supported"; rest are feature bits. */
				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			/* PRI is only usable together with ATS. */
			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_pri_supported(pdev))
				info->pri_supported = 1;
		}
	}

	dev_iommu_priv_set(dev, info);

	/* Scalable mode needs a per-device PASID table. */
	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
		ret = intel_pasid_alloc_table(dev);
		if (ret) {
			dev_err(dev, "PASID table allocation failed\n");
			kfree(info);
			return ERR_PTR(ret);
		}
	}

	intel_iommu_debugfs_create_dev(info);

	return &iommu->iommu;
}
4282
/*
 * release_device op: undo probe_device — detach from the domain, free
 * the PASID table and debugfs entries, and reset the DMA ops.
 */
static void intel_iommu_release_device(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);

	/* Detach and clear translation structures before freeing info. */
	dmar_remove_one_dev_info(dev);
	intel_pasid_free_table(dev);
	intel_iommu_debugfs_remove_dev(info);
	kfree(info);
	set_dma_ops(dev, NULL);
}
4293
/* probe_finalize op: reset, then (re)install the IOMMU DMA API ops. */
static void intel_iommu_probe_finalize(struct device *dev)
{
	set_dma_ops(dev, NULL);
	iommu_setup_dma_ops(dev, 0, U64_MAX);
}
4299
4300static void intel_iommu_get_resv_regions(struct device *device,
4301 struct list_head *head)
4302{
4303 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4304 struct iommu_resv_region *reg;
4305 struct dmar_rmrr_unit *rmrr;
4306 struct device *i_dev;
4307 int i;
4308
4309 rcu_read_lock();
4310 for_each_rmrr_units(rmrr) {
4311 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4312 i, i_dev) {
4313 struct iommu_resv_region *resv;
4314 enum iommu_resv_type type;
4315 size_t length;
4316
4317 if (i_dev != device &&
4318 !is_downstream_to_pci_bridge(device, i_dev))
4319 continue;
4320
4321 length = rmrr->end_address - rmrr->base_address + 1;
4322
4323 type = device_rmrr_is_relaxable(device) ?
4324 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4325
4326 resv = iommu_alloc_resv_region(rmrr->base_address,
4327 length, prot, type,
4328 GFP_ATOMIC);
4329 if (!resv)
4330 break;
4331
4332 list_add_tail(&resv->list, head);
4333 }
4334 }
4335 rcu_read_unlock();
4336
4337#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4338 if (dev_is_pci(device)) {
4339 struct pci_dev *pdev = to_pci_dev(device);
4340
4341 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4342 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4343 IOMMU_RESV_DIRECT_RELAXABLE,
4344 GFP_KERNEL);
4345 if (reg)
4346 list_add_tail(®->list, head);
4347 }
4348 }
4349#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4350
4351 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4352 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4353 0, IOMMU_RESV_MSI, GFP_KERNEL);
4354 if (!reg)
4355 return;
4356 list_add_tail(®->list, head);
4357}
4358
/* device_group op: PCI devices group by topology, others stand alone. */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
4365
4366static int intel_iommu_enable_sva(struct device *dev)
4367{
4368 struct device_domain_info *info = dev_iommu_priv_get(dev);
4369 struct intel_iommu *iommu;
4370
4371 if (!info || dmar_disabled)
4372 return -EINVAL;
4373
4374 iommu = info->iommu;
4375 if (!iommu)
4376 return -EINVAL;
4377
4378 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4379 return -ENODEV;
4380
4381 if (!info->pasid_enabled || !info->ats_enabled)
4382 return -EINVAL;
4383
4384 /*
4385 * Devices having device-specific I/O fault handling should not
4386 * support PCI/PRI. The IOMMU side has no means to check the
4387 * capability of device-specific IOPF. Therefore, IOMMU can only
4388 * default that if the device driver enables SVA on a non-PRI
4389 * device, it will handle IOPF in its own way.
4390 */
4391 if (!info->pri_supported)
4392 return 0;
4393
4394 /* Devices supporting PRI should have it enabled. */
4395 if (!info->pri_enabled)
4396 return -EINVAL;
4397
4398 return 0;
4399}
4400
/*
 * Enable I/O page fault handling for @dev: reset and enable PCI PRI,
 * add the device to the IOMMU's IOPF queue, and register the fault
 * handler. Resources acquired earlier are released in reverse order on
 * failure (goto cleanup). Returns 0 or a negative errno.
 */
static int intel_iommu_enable_iopf(struct device *dev)
{
	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu;
	int ret;

	/* IOPF needs a PCI device with ATS enabled and PRI present. */
	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
		return -ENODEV;

	if (info->pri_enabled)
		return -EBUSY;

	iommu = info->iommu;
	if (!iommu)
		return -EINVAL;

	/* PASID is required in PRG Response Message. */
	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
		return -EINVAL;

	/* Clear any stale page requests before enabling. */
	ret = pci_reset_pri(pdev);
	if (ret)
		return ret;

	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
	if (ret)
		return ret;

	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
	if (ret)
		goto iopf_remove_device;

	ret = pci_enable_pri(pdev, PRQ_DEPTH);
	if (ret)
		goto iopf_unregister_handler;
	info->pri_enabled = 1;

	return 0;

iopf_unregister_handler:
	iommu_unregister_device_fault_handler(dev);
iopf_remove_device:
	iopf_queue_remove_device(iommu->iopf_queue, dev);

	return ret;
}
4448
/*
 * Disable I/O page fault handling for @dev: turn PRI off, then detach
 * the fault handler and remove the device from the IOPF queue. Assumes
 * the driver has quiesced DMA and drained outstanding page requests.
 */
static int intel_iommu_disable_iopf(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;

	if (!info->pri_enabled)
		return -EINVAL;

	/*
	 * PCIe spec states that by clearing PRI enable bit, the Page
	 * Request Interface will not issue new page requests, but has
	 * outstanding page requests that have been transmitted or are
	 * queued for transmission. This is supposed to be called after
	 * the device driver has stopped DMA, all PASIDs have been
	 * unbound and the outstanding PRQs have been drained.
	 */
	pci_disable_pri(to_pci_dev(dev));
	info->pri_enabled = 0;

	/*
	 * With PRI disabled and outstanding PRQs drained, unregistering
	 * fault handler and removing device from iopf queue should never
	 * fail.
	 */
	WARN_ON(iommu_unregister_device_fault_handler(dev));
	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));

	return 0;
}
4478
4479static int
4480intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4481{
4482 switch (feat) {
4483 case IOMMU_DEV_FEAT_IOPF:
4484 return intel_iommu_enable_iopf(dev);
4485
4486 case IOMMU_DEV_FEAT_SVA:
4487 return intel_iommu_enable_sva(dev);
4488
4489 default:
4490 return -ENODEV;
4491 }
4492}
4493
4494static int
4495intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4496{
4497 switch (feat) {
4498 case IOMMU_DEV_FEAT_IOPF:
4499 return intel_iommu_disable_iopf(dev);
4500
4501 case IOMMU_DEV_FEAT_SVA:
4502 return 0;
4503
4504 default:
4505 return -ENODEV;
4506 }
4507}
4508
4509static bool intel_iommu_is_attach_deferred(struct device *dev)
4510{
4511 struct device_domain_info *info = dev_iommu_priv_get(dev);
4512
4513 return translation_pre_enabled(info->iommu) && !info->domain;
4514}
4515
4516/*
4517 * Check that the device does not live on an external facing PCI port that is
4518 * marked as untrusted. Such devices should not be able to apply quirks and
4519 * thus not be able to bypass the IOMMU restrictions.
4520 */
4521static bool risky_device(struct pci_dev *pdev)
4522{
4523 if (pdev->untrusted) {
4524 pci_info(pdev,
4525 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4526 pdev->vendor, pdev->device);
4527 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4528 return true;
4529 }
4530 return false;
4531}
4532
/*
 * Called by the IOMMU core after a range has been mapped. Notifies every
 * IOMMU the domain is attached to via __mapping_notify_one() — presumably
 * needed where hardware caches non-present entries; see that helper.
 */
static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
				      unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long pages = aligned_nrpages(iova, size);
	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
	struct iommu_domain_info *info;
	unsigned long i;

	/* Walk all IOMMUs this domain is attached to. */
	xa_for_each(&dmar_domain->iommu_array, i, info)
		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
	return 0;
}
4546
/*
 * Detach whatever domain is bound to @pasid of @dev: unlink the
 * bookkeeping entry, then unconditionally tear down the PASID table
 * entry and drain in-flight page requests. SVA-type domains carry
 * their own state and are handed off to the SVM code instead.
 */
static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dev_pasid_info *curr, *dev_pasid = NULL;
	struct intel_iommu *iommu = info->iommu;
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;
	unsigned long flags;

	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
	if (WARN_ON_ONCE(!domain))
		goto out_tear_down;

	/*
	 * The SVA implementation needs to handle its own stuffs like the mm
	 * notification. Before consolidating that code into iommu core, let
	 * the intel sva code handle it.
	 */
	if (domain->type == IOMMU_DOMAIN_SVA) {
		intel_svm_remove_dev_pasid(dev, pasid);
		goto out_tear_down;
	}

	dmar_domain = to_dmar_domain(domain);
	spin_lock_irqsave(&dmar_domain->lock, flags);
	/* Find and unlink the entry recorded by intel_iommu_set_dev_pasid(). */
	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
		if (curr->dev == dev && curr->pasid == pasid) {
			list_del(&curr->link_domain);
			dev_pasid = curr;
			break;
		}
	}
	WARN_ON_ONCE(!dev_pasid);
	spin_unlock_irqrestore(&dmar_domain->lock, flags);

	domain_detach_iommu(dmar_domain, iommu);
	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
	kfree(dev_pasid);
out_tear_down:
	/* Always clear the PASID entry and drain outstanding PRQs. */
	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
	intel_drain_pasid_prq(dev, pasid);
}
4589
/*
 * Attach @domain to @pasid of @dev by programming a PASID table entry
 * of the matching translation type (pass-through, first-level or
 * second-level). Undone by intel_iommu_remove_dev_pasid().
 *
 * Returns 0 on success or a negative errno; on failure all intermediate
 * state is rolled back via the goto cleanup chain.
 */
static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
				     struct device *dev, ioasid_t pasid)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu = info->iommu;
	struct dev_pasid_info *dev_pasid;
	unsigned long flags;
	int ret;

	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
		return -EOPNOTSUPP;

	/* Dirty tracking is not supported on this attach path. */
	if (domain->dirty_ops)
		return -EINVAL;

	/* Refuse while the context entry is still a copied one (e.g. kdump). */
	if (context_copied(iommu, info->bus, info->devfn))
		return -EBUSY;

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
	if (!dev_pasid)
		return -ENOMEM;

	ret = domain_attach_iommu(dmar_domain, iommu);
	if (ret)
		goto out_free;

	/* Program the PASID entry according to the domain's translation type. */
	if (domain_type_is_si(dmar_domain))
		ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
	else if (dmar_domain->use_first_level)
		ret = domain_setup_first_level(iommu, dmar_domain,
					       dev, pasid);
	else
		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
						     dev, pasid);
	if (ret)
		goto out_detach_iommu;

	/* Record the binding so remove_dev_pasid() can find it later. */
	dev_pasid->dev = dev;
	dev_pasid->pasid = pasid;
	spin_lock_irqsave(&dmar_domain->lock, flags);
	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
	spin_unlock_irqrestore(&dmar_domain->lock, flags);

	if (domain->type & __IOMMU_DOMAIN_PAGING)
		intel_iommu_debugfs_create_dev_pasid(dev_pasid);

	return 0;
out_detach_iommu:
	domain_detach_iommu(dmar_domain, iommu);
out_free:
	kfree(dev_pasid);
	return ret;
}
4648
4649static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4650{
4651 struct device_domain_info *info = dev_iommu_priv_get(dev);
4652 struct intel_iommu *iommu = info->iommu;
4653 struct iommu_hw_info_vtd *vtd;
4654
4655 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4656 if (!vtd)
4657 return ERR_PTR(-ENOMEM);
4658
4659 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4660 vtd->cap_reg = iommu->cap;
4661 vtd->ecap_reg = iommu->ecap;
4662 *length = sizeof(*vtd);
4663 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4664 return vtd;
4665}
4666
/*
 * Toggle dirty-page tracking for every device attached to @domain.
 * On partial failure, devices already switched are reverted to the
 * domain's still-unchanged previous setting.
 */
static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
					  bool enable)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct device_domain_info *info;
	int ret;

	spin_lock(&dmar_domain->lock);
	/* Already in the requested state; nothing to do. */
	if (dmar_domain->dirty_tracking == enable)
		goto out_unlock;

	list_for_each_entry(info, &dmar_domain->devices, link) {
		ret = intel_pasid_setup_dirty_tracking(info->iommu,
						       info->domain, info->dev,
						       IOMMU_NO_PASID, enable);
		if (ret)
			goto err_unwind;
	}

	/* Commit the new state only after every device succeeded. */
	dmar_domain->dirty_tracking = enable;
out_unlock:
	spin_unlock(&dmar_domain->lock);

	return 0;

err_unwind:
	/* Roll all devices back to dmar_domain->dirty_tracking (unchanged). */
	list_for_each_entry(info, &dmar_domain->devices, link)
		intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
						 info->dev, IOMMU_NO_PASID,
						 dmar_domain->dirty_tracking);
	spin_unlock(&dmar_domain->lock);
	return ret;
}
4700
/*
 * Walk [@iova, @iova + @size) and record into @dirty every page whose
 * second-level PTE has its dirty bit set, clearing the bit as we go
 * (subject to @flags; see dma_sl_pte_test_and_clear_dirty()).
 */
static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
					    unsigned long iova, size_t size,
					    unsigned long flags,
					    struct iommu_dirty_bitmap *dirty)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long end = iova + size - 1;	/* inclusive end of range */
	unsigned long pgsize;

	/*
	 * IOMMUFD core calls into a dirty tracking disabled domain without an
	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
	 * have occurred when we stopped dirty tracking. This ensures that we
	 * never inherit dirtied bits from a previous cycle.
	 */
	if (!dmar_domain->dirty_tracking && dirty->bitmap)
		return -EINVAL;

	do {
		struct dma_pte *pte;
		int lvl = 0;

		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
				     GFP_ATOMIC);
		/* Advance by the mapping size at the level the walk stopped. */
		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
		if (!pte || !dma_pte_present(pte)) {
			iova += pgsize;
			continue;
		}

		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}
4738
/* Dirty-page tracking callbacks installed on domains with dirty_ops. */
static const struct iommu_dirty_ops intel_dirty_ops = {
	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
};
4743
/* Driver-wide iommu_ops table published to the IOMMU core. */
const struct iommu_ops intel_iommu_ops = {
	.blocked_domain		= &blocking_domain,
	.capable		= intel_iommu_capable,
	.hw_info		= intel_iommu_hw_info,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_alloc_user	= intel_iommu_domain_alloc_user,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
	.pgsize_bitmap		= SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.page_response		= intel_svm_page_response,
#endif
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev		= intel_iommu_attach_device,
		.set_dev_pasid		= intel_iommu_set_dev_pasid,
		.map_pages		= intel_iommu_map_pages,
		.unmap_pages		= intel_iommu_unmap_pages,
		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
		.flush_iotlb_all        = intel_flush_iotlb_all,
		.iotlb_sync		= intel_iommu_tlb_sync,
		.iova_to_phys		= intel_iommu_iova_to_phys,
		.free			= intel_iommu_domain_free,
		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
	}
};
4777
4778static void quirk_iommu_igfx(struct pci_dev *dev)
4779{
4780 if (risky_device(dev))
4781 return;
4782
4783 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4784 dmar_map_gfx = 0;
4785}
4786
/* G4x/GM45 integrated gfx dmar support is totally busted. */
/* (The same chipset IDs also receive the RWBF quirk further below.) */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4821
4822static void quirk_iommu_rwbf(struct pci_dev *dev)
4823{
4824 if (risky_device(dev))
4825 return;
4826
4827 /*
4828 * Mobile 4 Series Chipset neglects to set RWBF capability,
4829 * but needs it. Same seems to hold for the desktop versions.
4830 */
4831 pci_info(dev, "Forcing write-buffer flush capability\n");
4832 rwbf_quirk = 1;
4833}
4834
/* Same G4x/GM45 chipset IDs as the graphics quirk list earlier in this file. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4842
/*
 * GGC: PCI config-space register (offset 0x52) on the host bridges quirked
 * below. The fields of interest hold the BIOS-assigned graphics memory
 * size and the VT enable bit; decoded by quirk_calpella_no_shadow_gtt().
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4852
4853static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4854{
4855 unsigned short ggc;
4856
4857 if (risky_device(dev))
4858 return;
4859
4860 if (pci_read_config_word(dev, GGC, &ggc))
4861 return;
4862
4863 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4864 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4865 dmar_map_gfx = 0;
4866 } else if (dmar_map_gfx) {
4867 /* we have to ensure the gfx device is idle before we flush */
4868 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4869 iommu_set_dma_strict();
4870 }
4871}
/* Host-bridge IDs of the Ironlake-era platforms handled above. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4876
/*
 * For certain integrated graphics devices, set iommu_skip_te_disable so
 * translation is left enabled when the driver would otherwise disable it.
 * The version bytes below appear to select specific iGPU device families —
 * NOTE(review): confirm against the relevant device-ID lists before
 * extending this set.
 */
static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	unsigned short ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	/* High byte of the PCI device ID selects the device family. */
	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
/* Match any Intel device; the handler itself filters by class and device ID. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4897
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	/* Read the isoch control register (config offset 0x188). */
	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		/* Work around the deadlock by identity-mapping Azalia. */
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}
4966
4967/*
4968 * Here we deal with a device TLB defect where device may inadvertently issue ATS
4969 * invalidation completion before posted writes initiated with translated address
4970 * that utilized translations matching the invalidation address range, violating
4971 * the invalidation completion ordering.
4972 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4973 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4974 * under the control of the trusted/privileged host device driver must use this
4975 * quirk.
4976 * Device TLBs are invalidated under the following six conditions:
4977 * 1. Device driver does DMA API unmap IOVA
4978 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4979 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4980 * exit_mmap() due to crash
4981 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4982 * VM has to free pages that were unmapped
4983 * 5. Userspace driver unmaps a DMA buffer
4984 * 6. Cache invalidation in vSVA usage (upcoming)
4985 *
4986 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4987 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4988 * invalidate TLB the same way as normal user unmap which will use this quirk.
4989 * The dTLB invalidation after PASID cache flush does not need this quirk.
4990 *
4991 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4992 */
void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
			       unsigned long address, unsigned long mask,
			       u32 pasid, u16 qdep)
{
	u16 sid;

	/* Fast path: device does not need the extra invalidation. */
	if (likely(!info->dtlb_extra_inval))
		return;

	sid = PCI_DEVID(info->bus, info->devfn);
	/* Issue the extra dTLB invalidation, with or without a PASID. */
	if (pasid == IOMMU_NO_PASID) {
		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				   qdep, address, mask);
	} else {
		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
					 pasid, qdep, address, mask);
	}
}
5011
/* Extract the status code field from an ECRSP register value. */
#define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)

/*
 * Function to submit a command to the enhanced command interface. The
 * valid enhanced command descriptions are defined in Table 47 of the
 * VT-d spec. The VT-d hardware implementation may support some but not
 * all commands, which can be determined by checking the Enhanced
 * Command Capability Register.
 *
 * Return values:
 *  - 0: Command successful without any error;
 *  - Negative: software error value;
 *  - Nonzero positive: failure status code defined in Table 48.
 */
int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
{
	unsigned long flags;
	u64 res;
	int ret;

	if (!cap_ecmds(iommu->cap))
		return -ENODEV;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);

	/* A previous ecmd is still in progress; don't pile a new one on. */
	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
	if (res & DMA_ECMD_ECRSP_IP) {
		ret = -EBUSY;
		goto err;
	}

	/*
	 * Unconditionally write the operand B, because
	 * - There is no side effect if an ecmd doesn't require an
	 *   operand B, but we set the register to some value.
	 * - It's not invoked in any critical path. The extra MMIO
	 *   write doesn't bring any performance concerns.
	 */
	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));

	/* Spin until hardware clears the in-progress bit (or we time out). */
	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
		      !(res & DMA_ECMD_ECRSP_IP), res);

	if (res & DMA_ECMD_ECRSP_IP) {
		ret = -ETIMEDOUT;
		goto err;
	}

	ret = ecmd_get_status_code(res);
err:
	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);

	return ret;
}