Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/memory.h>
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24#include <linux/tboot.h>
25#include <uapi/linux/iommufd.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "../iommu-sva.h"
31#include "pasid.h"
32#include "cap_audit.h"
33#include "perfmon.h"
34
/* Root and context tables each occupy one VT-d page. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/* PCI class/ID tests used by the device quirks below. */
#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

/* x86 interrupt-message address range (0xFEEx_xxxx). */
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

/* Default guest address width, in bits. */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH	57

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
62
static void __init check_tylersburg_isoch(void);
/* Set by the Tylersburg quirk: force write-buffer flushing even when
 * the capability register says it is not required (see
 * iommu_flush_write_buffer()). */
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/* Set by intel_iommu=tboot_noforce. */
static int intel_iommu_tboot_noforce;
/* Set when the user explicitly booted with intel_iommu=off. */
static int no_platform_optin;

/* Number of root entries in one root-table page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
75
76/*
77 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
78 * if marked present.
79 */
80static phys_addr_t root_entry_lctp(struct root_entry *re)
81{
82 if (!(re->lo & 1))
83 return 0;
84
85 return re->lo & VTD_PAGE_MASK;
86}
87
88/*
89 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
90 * if marked present.
91 */
92static phys_addr_t root_entry_uctp(struct root_entry *re)
93{
94 if (!(re->hi & 1))
95 return 0;
96
97 return re->hi & VTD_PAGE_MASK;
98}
99
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
/* Use hardware pass-through for identity mappings when available. */
static int hw_pass_through = 1;
108
/* One Reserved Memory Region Reporting (RMRR) structure from the ACPI
 * DMAR table: a memory range that must stay mapped for its devices. */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
117
/* One Root Port ATS Capability Reporting (ATSR) structure from the
 * ACPI DMAR table. */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
125
/* One SoC Integrated Address Translation Cache (SATC) structure from
 * the ACPI DMAR table. */
struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};
134
/* Global lists of the ACPI DMAR sub-structures parsed above. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate over every parsed RMRR unit. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

static void intel_iommu_domain_free(struct iommu_domain *domain);

/* DMA remapping on/off, and scalable-mode on/off; both default from
 * Kconfig and may be overridden on the command line (see
 * intel_iommu_setup()). */
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

/* Command-line tunables: see intel_iommu_setup(). */
static int dmar_map_gfx = 1;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

const struct iommu_ops intel_iommu_ops;
static const struct iommu_dirty_ops intel_dirty_ops;
161static bool translation_pre_enabled(struct intel_iommu *iommu)
162{
163 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
164}
165
/* Forget the "translation was pre-enabled" state for @iommu. */
static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}
170
171static void init_translation_status(struct intel_iommu *iommu)
172{
173 u32 gsts;
174
175 gsts = readl(iommu->reg + DMAR_GSTS_REG);
176 if (gsts & DMA_GSTS_TES)
177 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
178}
179
/*
 * Parse the "intel_iommu=" kernel command-line option.  Options are
 * comma-separated and matched by prefix; unknown options are reported
 * but otherwise ignored.  Always returns 1 so the parameter is
 * consumed.
 */
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;

	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			/* Remember the user explicitly opted out. */
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
			iommu_dma_forcedac = true;
		} else if (!strncmp(str, "strict", 6)) {
			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
			iommu_set_dma_strict();
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Enable scalable mode if hardware supports\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "sm_off", 6)) {
			pr_info("Scalable mode is disallowed\n");
			intel_iommu_sm = 0;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		} else {
			pr_notice("Unknown option - '%s'\n", str);
		}

		/* Advance past the current option and any commas. */
		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}

	return 1;
}
__setup("intel_iommu=", intel_iommu_setup);
226
227void *alloc_pgtable_page(int node, gfp_t gfp)
228{
229 struct page *page;
230 void *vaddr = NULL;
231
232 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
233 if (page)
234 vaddr = page_address(page);
235 return vaddr;
236}
237
/* Free a page obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
242
/* Is @domain the static identity-mapping domain? */
static int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
}
247
248static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
249{
250 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
251
252 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
253}
254
255/*
256 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
257 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
258 * the returned SAGAW.
259 */
260static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
261{
262 unsigned long fl_sagaw, sl_sagaw;
263
264 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
265 sl_sagaw = cap_sagaw(iommu->cap);
266
267 /* Second level only. */
268 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
269 return sl_sagaw;
270
271 /* First level only. */
272 if (!ecap_slts(iommu->ecap))
273 return fl_sagaw;
274
275 return fl_sagaw & sl_sagaw;
276}
277
/*
 * Find the highest supported AGAW that is no wider than @max_gaw.
 * Returns -1 when none of the supported widths fits.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &sagaw))
		agaw--;

	return agaw;
}
291
/*
 * Calculate max SAGAW for each iommu.  Returns the widest AGAW the
 * hardware supports, independent of the default domain width.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}
299
/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, use a default agaw, and
 * get a supported less agaw for iommus that don't support the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
309
310static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
311{
312 return sm_supported(iommu) ?
313 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
314}
315
/*
 * Recompute domain->iommu_coherency: true only if every IOMMU attached
 * to the domain keeps its page-table walks coherent with the CPU cache.
 * If no IOMMU is attached yet, fall back to the lowest common
 * denominator over all active IOMMUs in the system.
 */
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct iommu_domain_info *info;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	unsigned long i;

	domain->iommu_coherency = true;
	/* Check the IOMMUs already attached to this domain. */
	xa_for_each(&domain->iommu_array, i, info) {
		found = true;
		if (!iommu_paging_structure_coherency(info->iommu)) {
			domain->iommu_coherency = false;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!iommu_paging_structure_coherency(iommu)) {
			domain->iommu_coherency = false;
			break;
		}
	}
	rcu_read_unlock();
}
345
346static int domain_update_iommu_superpage(struct dmar_domain *domain,
347 struct intel_iommu *skip)
348{
349 struct dmar_drhd_unit *drhd;
350 struct intel_iommu *iommu;
351 int mask = 0x3;
352
353 if (!intel_iommu_superpage)
354 return 0;
355
356 /* set iommu_superpage to the smallest common denominator */
357 rcu_read_lock();
358 for_each_active_iommu(iommu, drhd) {
359 if (iommu != skip) {
360 if (domain && domain->use_first_level) {
361 if (!cap_fl1gp_support(iommu->cap))
362 mask = 0x1;
363 } else {
364 mask &= cap_super_page_val(iommu->cap);
365 }
366
367 if (!mask)
368 break;
369 }
370 }
371 rcu_read_unlock();
372
373 return fls(mask);
374}
375
/*
 * Pick a NUMA node for @domain from its attached devices: the first
 * device with a known node wins.  Returns NUMA_NO_NODE when no device
 * reports one.
 */
static int domain_update_device_node(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	int nid = NUMA_NO_NODE;
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		/*
		 * There could possibly be multiple device numa nodes as devices
		 * within the same domain may sit behind different IOMMUs. There
		 * isn't perfect answer in such situation, so we select first
		 * come first served policy.
		 */
		nid = dev_to_node(info->dev);
		if (nid != NUMA_NO_NODE)
			break;
	}
	spin_unlock_irqrestore(&domain->lock, flags);

	return nid;
}
398
399static void domain_update_iotlb(struct dmar_domain *domain);
400
401/* Return the super pagesize bitmap if supported. */
402static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
403{
404 unsigned long bitmap = 0;
405
406 /*
407 * 1-level super page supports page size of 2MiB, 2-level super page
408 * supports page size of both 2MiB and 1GiB.
409 */
410 if (domain->iommu_superpage == 1)
411 bitmap |= SZ_2M;
412 else if (domain->iommu_superpage == 2)
413 bitmap |= SZ_2M | SZ_1G;
414
415 return bitmap;
416}
417
/*
 * Some capabilities may be different across iommus: recompute the
 * domain's coherency, super-page support, NUMA node and geometry after
 * the set of attached IOMMUs changes.
 */
void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);

	/*
	 * If RHSA is missing, we should default to the device numa domain
	 * as fall back.
	 */
	if (domain->nid == NUMA_NO_NODE)
		domain->nid = domain_update_device_node(domain);

	/*
	 * First-level translation restricts the input-address to a
	 * canonical address (i.e., address bits 63:N have the same
	 * value as address bit [N-1], where N is 48-bits with 4-level
	 * paging and 57-bits with 5-level paging). Hence, skip bit
	 * [N-1].
	 */
	if (domain->use_first_level)
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
	else
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);

	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
	domain_update_iotlb(domain);
}
446
/*
 * Return a pointer to the context entry for @bus/@devfn, walking (and,
 * when @alloc is set, populating) the root table.  In scalable mode a
 * root entry covers two context tables (lo: devfn 0x00-0x7f, hi:
 * 0x80-0xff) and each context entry is twice the legacy size — hence
 * the devfn adjustment below.  Returns NULL if the entry is absent and
 * @alloc is unset, or if the entry was copied from a previous kernel.
 */
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	/*
	 * Except that the caller requested to allocate a new entry,
	 * returning a copied context entry makes no sense.
	 */
	if (!alloc && context_copied(iommu, bus, devfn))
		return NULL;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;	/* scalable-mode entries are double width */
	}
	if (*entry & 1)
		/* Present: the rest of the entry is the table's phys addr. */
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;
		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;	/* install with the present bit set */
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}
487
488/**
489 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
490 * sub-hierarchy of a candidate PCI-PCI bridge
491 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
492 * @bridge: the candidate PCI-PCI bridge
493 *
494 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
495 */
496static bool
497is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
498{
499 struct pci_dev *pdev, *pbridge;
500
501 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
502 return false;
503
504 pdev = to_pci_dev(dev);
505 pbridge = to_pci_dev(bridge);
506
507 if (pbridge->subordinate &&
508 pbridge->subordinate->number <= pdev->bus->number &&
509 pbridge->subordinate->busn_res.end >= pdev->bus->number)
510 return true;
511
512 return false;
513}
514
/*
 * The IOAT QuickData device on this chipset has its own IOMMU located
 * at VTBAR + 0xa000.  Return true (and taint the kernel) when the DRHD
 * the BIOS reported for this device is some other unit, i.e. the DMAR
 * table is wrong and the device should be treated as untranslated.
 */
static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that the this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}

	return false;
}
544
545static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
546{
547 if (!iommu || iommu->drhd->ignored)
548 return true;
549
550 if (dev_is_pci(dev)) {
551 struct pci_dev *pdev = to_pci_dev(dev);
552
553 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
554 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
555 quirk_ioat_snb_local_iommu(pdev))
556 return true;
557 }
558
559 return false;
560}
561
/*
 * Find the IOMMU (DRHD unit) whose scope covers @dev, optionally
 * returning the bus/devfn to use for it.  PCI VFs are looked up via
 * their PF (VFs are not listed in scope tables) but report their own
 * BDF.  Returns NULL when no unit matches or the match is a
 * dummy/ignored unit.
 */
static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		/* PCI devices only match units on their own segment. */
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			/* Scope entries may name a bridge above @dev. */
			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		/* An INCLUDE_ALL unit catches any PCI device on its segment. */
		if (pdev && drhd->include_all) {
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}
631
632static void domain_flush_cache(struct dmar_domain *domain,
633 void *addr, int size)
634{
635 if (!domain->iommu_coherency)
636 clflush_cache_range(addr, size);
637}
638
/*
 * Free the whole context table of @iommu: every lower (and, in
 * scalable mode, upper) context-table page referenced from the root
 * table, then the root table page itself.
 */
static void free_context_table(struct intel_iommu *iommu)
{
	struct context_entry *context;
	int i;

	if (!iommu->root_entry)
		return;

	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		/* Scalable mode: devfn 0x80+ lives in a second table. */
		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}

	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
}
663
664#ifdef CONFIG_DMAR_DEBUG
/*
 * Print each PTE on the walk to @pfn, starting from @parent at @level
 * and stopping at a leaf, a superpage or a non-present entry.
 * NOTE(review): @iommu, @bus and @devfn are not referenced in this
 * body; presumably kept for symmetry with the caller's signature.
 */
static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
{
	struct dma_pte *pte;
	int offset;

	while (1) {
		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
			pr_info("PTE not present at level %d\n", level);
			break;
		}

		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);

		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}
}
688
689void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
690 unsigned long long addr, u32 pasid)
691{
692 struct pasid_dir_entry *dir, *pde;
693 struct pasid_entry *entries, *pte;
694 struct context_entry *ctx_entry;
695 struct root_entry *rt_entry;
696 int i, dir_index, index, level;
697 u8 devfn = source_id & 0xff;
698 u8 bus = source_id >> 8;
699 struct dma_pte *pgtable;
700
701 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
702
703 /* root entry dump */
704 rt_entry = &iommu->root_entry[bus];
705 if (!rt_entry) {
706 pr_info("root table entry is not present\n");
707 return;
708 }
709
710 if (sm_supported(iommu))
711 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
712 rt_entry->hi, rt_entry->lo);
713 else
714 pr_info("root entry: 0x%016llx", rt_entry->lo);
715
716 /* context entry dump */
717 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
718 if (!ctx_entry) {
719 pr_info("context table entry is not present\n");
720 return;
721 }
722
723 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
724 ctx_entry->hi, ctx_entry->lo);
725
726 /* legacy mode does not require PASID entries */
727 if (!sm_supported(iommu)) {
728 level = agaw_to_level(ctx_entry->hi & 7);
729 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
730 goto pgtable_walk;
731 }
732
733 /* get the pointer to pasid directory entry */
734 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
735 if (!dir) {
736 pr_info("pasid directory entry is not present\n");
737 return;
738 }
739 /* For request-without-pasid, get the pasid from context entry */
740 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
741 pasid = IOMMU_NO_PASID;
742
743 dir_index = pasid >> PASID_PDE_SHIFT;
744 pde = &dir[dir_index];
745 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
746
747 /* get the pointer to the pasid table entry */
748 entries = get_pasid_table_from_pde(pde);
749 if (!entries) {
750 pr_info("pasid table entry is not present\n");
751 return;
752 }
753 index = pasid & PASID_PTE_MASK;
754 pte = &entries[index];
755 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
756 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
757
758 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
759 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
760 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
761 } else {
762 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
763 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
764 }
765
766pgtable_walk:
767 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
768}
769#endif
770
/*
 * Walk (and build, allocating missing levels with @gfp) the page table
 * of @domain down to *target_level for @pfn, returning the PTE there.
 * With *target_level == 0, stop at the first superpage or non-present
 * entry and report the level actually reached through *target_level.
 * Returns NULL when @pfn exceeds the domain's address width or an
 * allocation fails.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level,
				      gfp_t gfp)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid, gfp);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain->use_first_level)
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;

			/* Install atomically; a concurrent walker may beat us. */
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
826
/*
 * Return the PTE for @pfn at exactly @level.  If the walk hits a
 * non-present entry first, return NULL and report that level through
 * *large_page; if it hits a superpage entry above the requested level,
 * return that entry directly, again reporting its level.
 */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
858
/*
 * Clear all leaf (last level) PTEs in [start_pfn, last_pfn]; the caller
 * must follow up with an IOTLB flush.  Page-table pages themselves are
 * not freed here.
 */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
	    WARN_ON(start_pfn > last_pfn))
		return;

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			/* Nothing mapped here; skip the whole missing subtree. */
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		/* Flush the contiguous run of cleared PTEs in one go. */
		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}
890
/*
 * Recursively free page-table pages below @retain_level that are fully
 * covered by [start_pfn, last_pfn], clearing the parent PTEs that
 * pointed to them.  Leaf PTEs are cleared beforehand by the caller
 * (see dma_pte_free_pagetable()).
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
929
/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	/* Clear the leaves first, then prune empty interior pages. */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
951
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
				    int level, struct dma_pte *pte,
				    struct list_head *freelist)
{
	struct page *pg;

	/* Queue the page this PTE points to; actual freeing is deferred. */
	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	list_add_tail(&pg->lru, freelist);

	if (level == 1)
		return;

	/* Recurse into any child tables referenced from this page. */
	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));
}
977
/*
 * Clear all PTEs at @level covering [start_pfn, last_pfn], queueing any
 * fully-covered child page-table pages on @freelist for deferred
 * freeing (after the IOTLB flush), and recursing into partially
 * covered subtrees.
 */
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	/* Flush the contiguous run of PTEs we cleared at this level. */
	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}
1021
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
			 unsigned long last_pfn, struct list_head *freelist)
{
	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
	    WARN_ON(start_pfn > last_pfn))
		return;

	/* we don't need lock here; nobody else touches the iova range */
	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
			    domain->pgd, 0, start_pfn, last_pfn, freelist);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		/* The whole address space was unmapped: retire the pgd too. */
		struct page *pgd_page = virt_to_page(domain->pgd);
		list_add_tail(&pgd_page->lru, freelist);
		domain->pgd = NULL;
	}
}
1043
1044/* iommu handling */
1045static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1046{
1047 struct root_entry *root;
1048
1049 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1050 if (!root) {
1051 pr_err("Allocating root entry for %s failed\n",
1052 iommu->name);
1053 return -ENOMEM;
1054 }
1055
1056 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1057 iommu->root_entry = root;
1058
1059 return 0;
1060}
1061
/*
 * Program the root-table address into hardware and issue the Set Root
 * Table Pointer command, then — unless the ESRTPS capability says
 * hardware already flushed its caches as part of SRTP — globally
 * invalidate the context cache, PASID cache (scalable mode) and IOTLB.
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;	/* scalable-mode table format */

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/*
	 * Hardware invalidates all DMA remapping hardware translation
	 * caches as part of SRTP flow.
	 */
	if (cap_esrtps(iommu->cap))
		return;

	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
1095
/*
 * Drain the hardware write buffer (DMA_GCMD_WBF) and wait for
 * completion.  No-op unless the capability register requires it or the
 * Tylersburg rwbf_quirk is active.
 */
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1113
/*
 * Issue a register-based context-cache invalidation of the requested
 * granularity (global / domain / device) and spin until hardware
 * completes it. (Historical note: this no longer returns a value.)
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	/* Build the CCMD value for the requested invalidation granularity. */
	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}
	/* ICC kicks off the invalidation; hardware clears it when done. */
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1149
/*
 * Issue a register-based IOTLB invalidation of the requested granularity
 * (global / domain-selective / page-selective) and wait for completion.
 * (Historical note: this no longer returns a value.)
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}

	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		      dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
1200
1201static struct device_domain_info *
1202domain_lookup_dev_info(struct dmar_domain *domain,
1203 struct intel_iommu *iommu, u8 bus, u8 devfn)
1204{
1205 struct device_domain_info *info;
1206 unsigned long flags;
1207
1208 spin_lock_irqsave(&domain->lock, flags);
1209 list_for_each_entry(info, &domain->devices, link) {
1210 if (info->iommu == iommu && info->bus == bus &&
1211 info->devfn == devfn) {
1212 spin_unlock_irqrestore(&domain->lock, flags);
1213 return info;
1214 }
1215 }
1216 spin_unlock_irqrestore(&domain->lock, flags);
1217
1218 return NULL;
1219}
1220
1221static void domain_update_iotlb(struct dmar_domain *domain)
1222{
1223 struct dev_pasid_info *dev_pasid;
1224 struct device_domain_info *info;
1225 bool has_iotlb_device = false;
1226 unsigned long flags;
1227
1228 spin_lock_irqsave(&domain->lock, flags);
1229 list_for_each_entry(info, &domain->devices, link) {
1230 if (info->ats_enabled) {
1231 has_iotlb_device = true;
1232 break;
1233 }
1234 }
1235
1236 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1237 info = dev_iommu_priv_get(dev_pasid->dev);
1238 if (info->ats_enabled) {
1239 has_iotlb_device = true;
1240 break;
1241 }
1242 }
1243 domain->has_iotlb_device = has_iotlb_device;
1244 spin_unlock_irqrestore(&domain->lock, flags);
1245}
1246
1247/*
1248 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1249 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1250 * check because it applies only to the built-in QAT devices and it doesn't
1251 * grant additional privileges.
1252 */
1253#define BUGGY_QAT_DEVID_MASK 0x4940
1254static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1255{
1256 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1257 return false;
1258
1259 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1260 return false;
1261
1262 return true;
1263}
1264
1265static void iommu_enable_pci_caps(struct device_domain_info *info)
1266{
1267 struct pci_dev *pdev;
1268
1269 if (!dev_is_pci(info->dev))
1270 return;
1271
1272 pdev = to_pci_dev(info->dev);
1273
1274 /* The PCIe spec, in its wisdom, declares that the behaviour of
1275 the device if you enable PASID support after ATS support is
1276 undefined. So always enable PASID support on devices which
1277 have it, even if we can't yet know if we're ever going to
1278 use it. */
1279 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1280 info->pasid_enabled = 1;
1281
1282 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1283 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1284 info->ats_enabled = 1;
1285 domain_update_iotlb(info->domain);
1286 }
1287}
1288
1289static void iommu_disable_pci_caps(struct device_domain_info *info)
1290{
1291 struct pci_dev *pdev;
1292
1293 if (!dev_is_pci(info->dev))
1294 return;
1295
1296 pdev = to_pci_dev(info->dev);
1297
1298 if (info->ats_enabled) {
1299 pci_disable_ats(pdev);
1300 info->ats_enabled = 0;
1301 domain_update_iotlb(info->domain);
1302 }
1303
1304 if (info->pasid_enabled) {
1305 pci_disable_pasid(pdev);
1306 info->pasid_enabled = 0;
1307 }
1308}
1309
1310static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1311 u64 addr, unsigned int mask)
1312{
1313 u16 sid, qdep;
1314
1315 if (!info || !info->ats_enabled)
1316 return;
1317
1318 sid = info->bus << 8 | info->devfn;
1319 qdep = info->ats_qdep;
1320 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1321 qdep, addr, mask);
1322 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1323}
1324
/* Flush the device TLBs of every ATS-enabled device attached to @domain. */
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	struct dev_pasid_info *dev_pasid;
	struct device_domain_info *info;
	unsigned long flags;

	/* Cheap exit when no attached device has ATS enabled. */
	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&domain->lock, flags);
	/* Devices attached through their RID. */
	list_for_each_entry(info, &domain->devices, link)
		__iommu_flush_dev_iotlb(info, addr, mask);

	/* Devices attached through an individual PASID. */
	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
		info = dev_iommu_priv_get(dev_pasid->dev);

		if (!info->ats_enabled)
			continue;

		qi_flush_dev_iotlb_pasid(info->iommu,
					 PCI_DEVID(info->bus, info->devfn),
					 info->pfsid, dev_pasid->pasid,
					 info->ats_qdep, addr,
					 mask);
	}
	spin_unlock_irqrestore(&domain->lock, flags);
}
1353
/*
 * PASID-based IOTLB invalidation for first-level translation: one flush
 * per attached PASID, plus one with the reserved NO_PASID value when any
 * device is attached through its RID.
 */
static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
				     struct dmar_domain *domain, u64 addr,
				     unsigned long npages, bool ih)
{
	u16 did = domain_id_iommu(domain, iommu);
	struct dev_pasid_info *dev_pasid;
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);
	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);

	if (!list_empty(&domain->devices))
		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
	spin_unlock_irqrestore(&domain->lock, flags);
}
1370
/*
 * Page-selective-within-domain IOTLB invalidation for the range
 * [@pfn, @pfn + @pages). Falls back to a domain-selective flush when
 * page-selective invalidation cannot describe the range.
 * @ih: invalidation hint — only leaf entries were changed.
 * @map: non-zero when flushing for a non-present to present update.
 */
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int aligned_pages = __roundup_pow_of_two(pages);
	unsigned int mask = ilog2(aligned_pages);
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain_id_iommu(domain, iommu);

	if (WARN_ON(!pages))
		return;

	/* The IH hint is carried in bit 6 of the invalidation address. */
	if (ih)
		ih = 1 << 6;

	if (domain->use_first_level) {
		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
	} else {
		unsigned long bitmask = aligned_pages - 1;

		/*
		 * PSI masks the low order bits of the base address. If the
		 * address isn't aligned to the mask, then compute a mask value
		 * needed to ensure the target range is flushed.
		 */
		if (unlikely(bitmask & pfn)) {
			unsigned long end_pfn = pfn + pages - 1, shared_bits;

			/*
			 * Since end_pfn <= pfn + bitmask, the only way bits
			 * higher than bitmask can differ in pfn and end_pfn is
			 * by carrying. This means after masking out bitmask,
			 * high bits starting with the first set bit in
			 * shared_bits are all equal in both pfn and end_pfn.
			 */
			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
		}

		/*
		 * Fallback to domain selective flush if no PSI support or
		 * the size is too big.
		 */
		if (!cap_pgsel_inv(iommu->cap) ||
		    mask > cap_max_amask_val(iommu->cap))
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		else
			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						 DMA_TLB_PSI_FLUSH);
	}

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}
1431
1432/* Notification for newly created mappings */
1433static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1434 unsigned long pfn, unsigned int pages)
1435{
1436 /*
1437 * It's a non-present to present mapping. Only flush if caching mode
1438 * and second level.
1439 */
1440 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1441 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1442 else
1443 iommu_flush_write_buffer(iommu);
1444}
1445
/* Flush all cached translations for @domain on every IOMMU it spans. */
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct iommu_domain_info *info;
	unsigned long idx;

	xa_for_each(&dmar_domain->iommu_array, idx, info) {
		struct intel_iommu *iommu = info->iommu;
		u16 did = domain_id_iommu(dmar_domain, iommu);

		/* First-level flushes are PASID-based; second-level by domain. */
		if (dmar_domain->use_first_level)
			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
		else
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);

		/*
		 * NOTE(review): the device TLB flush is skipped in caching
		 * mode (virtualized IOMMU) — presumably handled elsewhere
		 * in that configuration; confirm before relying on it.
		 */
		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
	}
}
1466
/* Disable the protected low/high memory regions, if implemented. */
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	/* Neither protected region capability present — nothing to disable. */
	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1486
/* Turn on DMA remapping by setting the translation-enable bit. */
static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	/* Update the cached global command value, then write it out. */
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1502
/* Turn off DMA remapping, unless a graphics quirk requires leaving it on. */
static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	/*
	 * Keep translation enabled on IOMMUs dedicated to graphics when
	 * the quirk is set and read/write drain capability is present.
	 */
	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1522
/*
 * Allocate and initialise @iommu's domain-ID bitmap, reserving the IDs
 * that must never be handed out to a real domain.
 */
static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
	if (!iommu->domain_ids)
		return -ENOMEM;

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}
1557
/* Quiesce @iommu (disable translation) ahead of teardown. */
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	/* Never initialised — nothing to disable. */
	if (!iommu->domain_ids)
		return;

	/*
	 * All iommu domains must have been detached from the devices,
	 * hence there should be no domain IDs in use.
	 */
	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
		    > NUM_RESERVED_DID))
		return;

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}
1574
/* Release the per-IOMMU allocations made during initialisation. */
static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if (iommu->domain_ids) {
		bitmap_free(iommu->domain_ids);
		iommu->domain_ids = NULL;
	}

	/* Bitmap of context entries inherited from a kdump kernel. */
	if (iommu->copied_tables) {
		bitmap_free(iommu->copied_tables);
		iommu->copied_tables = NULL;
	}

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* Tear down the page-request queue used for SVM page faults. */
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);
	}
#endif
}
1597
1598/*
1599 * Check and return whether first level is used by default for
1600 * DMA translation.
1601 */
1602static bool first_level_by_default(unsigned int type)
1603{
1604 /* Only SL is available in legacy mode */
1605 if (!scalable_mode_support())
1606 return false;
1607
1608 /* Only level (either FL or SL) is available, just use it */
1609 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1610 return intel_cap_flts_sanity();
1611
1612 /* Both levels are available, decide it based on domain type */
1613 return type != IOMMU_DOMAIN_UNMANAGED;
1614}
1615
1616static struct dmar_domain *alloc_domain(unsigned int type)
1617{
1618 struct dmar_domain *domain;
1619
1620 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1621 if (!domain)
1622 return NULL;
1623
1624 domain->nid = NUMA_NO_NODE;
1625 if (first_level_by_default(type))
1626 domain->use_first_level = true;
1627 domain->has_iotlb_device = false;
1628 INIT_LIST_HEAD(&domain->devices);
1629 INIT_LIST_HEAD(&domain->dev_pasids);
1630 spin_lock_init(&domain->lock);
1631 xa_init(&domain->iommu_array);
1632
1633 return domain;
1634}
1635
/*
 * Attach @domain to @iommu: take a reference if already attached,
 * otherwise allocate a domain ID on this IOMMU and record the binding
 * in the domain's iommu_array. Returns 0 or a negative errno.
 */
int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
	struct iommu_domain_info *info, *curr;
	unsigned long ndomains;
	int num, ret = -ENOSPC;

	/* Allocated up front; may not be needed if already attached. */
	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	spin_lock(&iommu->lock);
	curr = xa_load(&domain->iommu_array, iommu->seq_id);
	if (curr) {
		/* Already attached to this IOMMU; just take a reference. */
		curr->refcnt++;
		spin_unlock(&iommu->lock);
		kfree(info);
		return 0;
	}

	/* Claim a free domain ID on this IOMMU. */
	ndomains = cap_ndoms(iommu->cap);
	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		pr_err("%s: No free domain ids\n", iommu->name);
		goto err_unlock;
	}

	set_bit(num, iommu->domain_ids);
	info->refcnt = 1;
	info->did = num;
	info->iommu = iommu;
	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
			  NULL, info, GFP_ATOMIC);
	if (curr) {
		/* Slot unexpectedly occupied (or xarray error): roll back. */
		ret = xa_err(curr) ? : -EBUSY;
		goto err_clear;
	}
	domain_update_iommu_cap(domain);

	spin_unlock(&iommu->lock);
	return 0;

err_clear:
	clear_bit(info->did, iommu->domain_ids);
err_unlock:
	spin_unlock(&iommu->lock);
	kfree(info);
	return ret;
}
1684
/* Drop @domain's reference on @iommu; free the domain ID on last put. */
void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
	struct iommu_domain_info *info;

	spin_lock(&iommu->lock);
	/* NOTE(review): assumes a prior successful attach — info is not NULL-checked. */
	info = xa_load(&domain->iommu_array, iommu->seq_id);
	if (--info->refcnt == 0) {
		clear_bit(info->did, iommu->domain_ids);
		xa_erase(&domain->iommu_array, iommu->seq_id);
		domain->nid = NUMA_NO_NODE;
		domain_update_iommu_cap(domain);
		kfree(info);
	}
	spin_unlock(&iommu->lock);
}
1700
/*
 * Round a guest address width up to the nearest adjusted width the
 * page-table layout supports: (agaw - 12) must be a multiple of 9,
 * capped at 64 bits.
 */
static int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1714
/* Free @domain's page tables and, if no devices remain, the domain itself. */
static void domain_exit(struct dmar_domain *domain)
{
	if (domain->pgd) {
		LIST_HEAD(freelist);

		/* Unmap the entire address space and collect the page tables. */
		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
		put_pages_list(&freelist);
	}

	/* Leak rather than free a domain that still has devices attached. */
	if (WARN_ON(!list_empty(&domain->devices)))
		return;

	kfree(domain);
}
1729
1730/*
1731 * Get the PASID directory size for scalable mode context entry.
1732 * Value of X in the PDTS field of a scalable mode context entry
1733 * indicates PASID directory with 2^(X + 7) entries.
1734 */
1735static unsigned long context_get_sm_pds(struct pasid_table *table)
1736{
1737 unsigned long pds, max_pde;
1738
1739 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1740 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1741 if (pds < 7)
1742 return 0;
1743
1744 return pds - 7;
1745}
1746
/*
 * Program the context entry for (@bus, @devfn) on @iommu to point at
 * @domain's translation structures — the PASID table in scalable mode,
 * or the second-level page table (or pass-through) in legacy mode.
 * Returns 0 on success or a negative errno.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	struct device_domain_info *info =
			domain_lookup_dev_info(domain, iommu, bus, devfn);
	u16 did = domain_id_iommu(domain, iommu);
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct context_entry *context;
	int ret;

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	spin_lock(&iommu->lock);
	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	/* A present entry not inherited from a kdump kernel is left alone. */
	ret = 0;
	if (context_present(context) && !context_copied(iommu, bus, devfn))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(iommu, bus, devfn)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}

		clear_context_copied(iommu, bus, devfn);
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, IOMMU_NO_PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
		if (info && info->pasid_supported)
			context_set_pasid(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware. And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}

		context_set_translation_type(context, translation);
	}

	context_set_fault_enable(context);
	context_set_present(context);
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If the
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);

	return ret;
}
1887
/* Carries the mapping target through pci_for_each_dma_alias(). */
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct pasid_table *table;
};
1893
1894static int domain_context_mapping_cb(struct pci_dev *pdev,
1895 u16 alias, void *opaque)
1896{
1897 struct domain_context_mapping_data *data = opaque;
1898
1899 return domain_context_mapping_one(data->domain, data->iommu,
1900 data->table, PCI_BUS_NUM(alias),
1901 alias & 0xff);
1902}
1903
1904static int
1905domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1906{
1907 struct device_domain_info *info = dev_iommu_priv_get(dev);
1908 struct domain_context_mapping_data data;
1909 struct intel_iommu *iommu = info->iommu;
1910 u8 bus = info->bus, devfn = info->devfn;
1911 struct pasid_table *table;
1912
1913 table = intel_pasid_get_table(dev);
1914
1915 if (!dev_is_pci(dev))
1916 return domain_context_mapping_one(domain, iommu, table,
1917 bus, devfn);
1918
1919 data.domain = domain;
1920 data.iommu = iommu;
1921 data.table = table;
1922
1923 return pci_for_each_dma_alias(to_pci_dev(dev),
1924 &domain_context_mapping_cb, &data);
1925}
1926
1927/* Returns a number of VTD pages, but aligned to MM page size */
1928static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1929{
1930 host_addr &= ~PAGE_MASK;
1931 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1932}
1933
/* Return largest possible superpage level for a given mapping */
static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
				   unsigned long phy_pfn, unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	/* Number of superpage levels available to this domain. */
	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	/* Climb one level per 9-bit stride while alignment and size allow. */
	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
1959
1960/*
1961 * Ensure that old small page tables are removed to make room for superpage(s).
1962 * We're going to add new large pages, so make sure we don't remove their parent
1963 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1964 */
1965static void switch_to_super_page(struct dmar_domain *domain,
1966 unsigned long start_pfn,
1967 unsigned long end_pfn, int level)
1968{
1969 unsigned long lvl_pages = lvl_to_nr_pages(level);
1970 struct iommu_domain_info *info;
1971 struct dma_pte *pte = NULL;
1972 unsigned long i;
1973
1974 while (start_pfn <= end_pfn) {
1975 if (!pte)
1976 pte = pfn_to_dma_pte(domain, start_pfn, &level,
1977 GFP_ATOMIC);
1978
1979 if (dma_pte_present(pte)) {
1980 dma_pte_free_pagetable(domain, start_pfn,
1981 start_pfn + lvl_pages - 1,
1982 level + 1);
1983
1984 xa_for_each(&domain->iommu_array, i, info)
1985 iommu_flush_iotlb_psi(info->iommu, domain,
1986 start_pfn, lvl_pages,
1987 0, 0);
1988 }
1989
1990 pte++;
1991 start_pfn += lvl_pages;
1992 if (first_pte_in_page(pte))
1993 pte = NULL;
1994 }
1995}
1996
/*
 * Install PTEs mapping @nr_pages pages starting at @phys_pfn into
 * @domain's IOVA space at @iov_pfn with permission bits @prot, using
 * superpages where the alignment, remaining length, and hardware
 * capability allow. Returns 0, -EINVAL, or -ENOMEM.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
		 gfp_t gfp)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	phys_addr_t pteval;
	u64 attr;

	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
		return -EINVAL;

	/* A mapping with neither read nor write permission is meaningless. */
	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
		return -EINVAL;
	}

	/* Translate @prot into PTE attribute bits for the table type in use. */
	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	attr |= DMA_FL_PTE_PRESENT;
	if (domain->use_first_level) {
		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
		if (prot & DMA_PTE_WRITE)
			attr |= DMA_FL_PTE_DIRTY;
	}

	domain->has_mappings = true;

	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!pte) {
			/* Pick the largest page size usable at this position. */
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
					phys_pfn, nr_pages);

			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
					     gfp);
			if (!pte)
				return -ENOMEM;
			first_pte = pte;

			lvl_pages = lvl_to_nr_pages(largepage_lvl);

			/* It is large page*/
			if (largepage_lvl > 1) {
				unsigned long end_pfn;
				unsigned long pages_to_remove;

				pteval |= DMA_PTE_LARGE_PAGE;
				pages_to_remove = min_t(unsigned long, nr_pages,
							nr_pte_to_next_page(pte) * lvl_pages);
				end_pfn = iov_pfn + pages_to_remove - 1;
				/* Clear out small tables that the superpage replaces. */
				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* PTE was already set — a caller bug; dump a few times. */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;

		/* If the next PTE would be the first in a new page, then we
		 * need to flush the cache on the entries we've just written.
		 * And then we'll need to recalculate 'pte', so clear it and
		 * let it get set again in the if (!pte) block above.
		 *
		 * If we're done (!nr_pages) we need to flush the cache too.
		 *
		 * Also if we've been setting superpages, we may need to
		 * recalculate 'pte' and switch back to smaller pages for the
		 * end of the mapping, if the trailing size is not enough to
		 * use another superpage (i.e. nr_pages < lvl_pages).
		 */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}
	}

	return 0;
}
2104
/*
 * Tear down the context entry for (@bus, @devfn) and invalidate every
 * cache (context, PASID, IOTLB, device TLB) that may still reference it.
 */
static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
{
	struct intel_iommu *iommu = info->iommu;
	struct context_entry *context;
	u16 did_old;

	if (!iommu)
		return;

	spin_lock(&iommu->lock);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (!context) {
		spin_unlock(&iommu->lock);
		return;
	}

	/* Work out which domain ID the flushes below must target. */
	if (sm_supported(iommu)) {
		if (hw_pass_through && domain_type_is_si(info->domain))
			did_old = FLPT_DEFAULT_DID;
		else
			did_old = domain_id_iommu(info->domain, iommu);
	} else {
		did_old = context_domain_id(context);
	}

	context_clear_entry(context);
	__iommu_flush_cache(iommu, context, sizeof(*context));
	spin_unlock(&iommu->lock);
	iommu->flush.flush_context(iommu,
				   did_old,
				   (((u16)bus) << 8) | devfn,
				   DMA_CCMD_MASK_NOBIT,
				   DMA_CCMD_DEVICE_INVL);

	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);

	iommu->flush.flush_iotlb(iommu,
				 did_old,
				 0,
				 0,
				 DMA_TLB_DSI_FLUSH);

	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
}
2150
/*
 * Program a first-level translation PASID entry for @dev on @iommu,
 * using @domain's page table. Returns 0 or a negative errno.
 */
static int domain_setup_first_level(struct intel_iommu *iommu,
				    struct dmar_domain *domain,
				    struct device *dev,
				    u32 pasid)
{
	struct dma_pte *pgd = domain->pgd;
	int agaw, level;
	int flags = 0;

	/*
	 * Skip top levels of page tables for iommu which has
	 * less agaw than default. Unnecessary for PT mode.
	 */
	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
		pgd = phys_to_virt(dma_pte_addr(pgd));
		if (!dma_pte_present(pgd))
			return -ENOMEM;
	}

	/* First-level translation only supports 4- and 5-level tables. */
	level = agaw_to_level(agaw);
	if (level != 4 && level != 5)
		return -EINVAL;

	if (level == 5)
		flags |= PASID_FLAG_FL5LP;

	if (domain->force_snooping)
		flags |= PASID_FLAG_PAGE_SNOOP;

	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
					     domain_id_iommu(domain, iommu),
					     flags);
}
2184
2185static bool dev_is_real_dma_subdevice(struct device *dev)
2186{
2187 return dev && dev_is_pci(dev) &&
2188 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2189}
2190
/*
 * Map [first_vpfn, last_vpfn] 1:1 (IOVA == physical) with read/write
 * permission in @domain.  Returns 0 or the error from __domain_mapping().
 */
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long first_vpfn,
				     unsigned long last_vpfn)
{
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return __domain_mapping(domain, first_vpfn,
				first_vpfn, last_vpfn - first_vpfn + 1,
				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
}
2205
2206static int md_domain_init(struct dmar_domain *domain, int guest_width);
2207
/*
 * Allocate and initialize the static identity (si) domain.  When @hw is
 * set, hardware passthrough is used and no page tables are populated;
 * otherwise every online memory range and every RMRR is identity mapped.
 *
 * Returns 0 on success or a negative errno; on failure si_domain is
 * left NULL.
 */
static int __init si_domain_init(int hw)
{
	struct dmar_rmrr_unit *rmrr;
	struct device *dev;
	int i, nid, ret;

	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
	if (!si_domain)
		return -EFAULT;

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		si_domain = NULL;
		return -EFAULT;
	}

	/* Hardware passthrough needs no software identity mappings. */
	if (hw)
		return 0;

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		int i;

		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			ret = iommu_domain_identity_map(si_domain,
					mm_to_dma_pfn_start(start_pfn),
					mm_to_dma_pfn_end(end_pfn));
			if (ret)
				return ret;
		}
	}

	/*
	 * Identity map the RMRRs so that devices with RMRRs could also use
	 * the si_domain.
	 */
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, dev) {
			unsigned long long start = rmrr->base_address;
			unsigned long long end = rmrr->end_address;

			/* Reject inverted or out-of-address-width ranges. */
			if (WARN_ON(end < start ||
				    end >> agaw_to_width(si_domain->agaw)))
				continue;

			ret = iommu_domain_identity_map(si_domain,
					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
			if (ret)
				return ret;
		}
	}

	return 0;
}
2264
/*
 * Attach @dev to @domain: bind the domain to the device's IOMMU, set up
 * the PASID entry (scalable mode) and the context entry, then enable
 * PCI capabilities (ATS etc.) where appropriate.
 *
 * On any failure after the domain/IOMMU binding, translation for the
 * device is blocked before returning the error.
 */
static int dmar_domain_attach_device(struct dmar_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;
	unsigned long flags;
	int ret;

	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		return ret;
	info->domain = domain;
	spin_lock_irqsave(&domain->lock, flags);
	list_add(&info->link, &domain->devices);
	spin_unlock_irqrestore(&domain->lock, flags);

	/* PASID table is mandatory for a PCI device in scalable mode. */
	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
		/* Setup the PASID entry for requests without PASID: */
		if (hw_pass_through && domain_type_is_si(domain))
			ret = intel_pasid_setup_pass_through(iommu,
					dev, IOMMU_NO_PASID);
		else if (domain->use_first_level)
			ret = domain_setup_first_level(iommu, domain, dev,
					IOMMU_NO_PASID);
		else
			ret = intel_pasid_setup_second_level(iommu, domain,
					dev, IOMMU_NO_PASID);
		if (ret) {
			dev_err(dev, "Setup RID2PASID failed\n");
			device_block_translation(dev);
			return ret;
		}
	}

	ret = domain_context_mapping(domain, dev);
	if (ret) {
		dev_err(dev, "Domain context map failed\n");
		device_block_translation(dev);
		return ret;
	}

	/* Legacy-mode identity domains don't use ATS and friends. */
	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
		iommu_enable_pci_caps(info);

	return 0;
}
2312
2313/**
2314 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2315 * is relaxable (ie. is allowed to be not enforced under some conditions)
2316 * @dev: device handle
2317 *
2318 * We assume that PCI USB devices with RMRRs have them largely
2319 * for historical reasons and that the RMRR space is not actively used post
2320 * boot. This exclusion may change if vendors begin to abuse it.
2321 *
2322 * The same exception is made for graphics devices, with the requirement that
2323 * any use of the RMRR regions will be torn down before assigning the device
2324 * to a guest.
2325 *
2326 * Return: true if the RMRR is relaxable, false otherwise
2327 */
2328static bool device_rmrr_is_relaxable(struct device *dev)
2329{
2330 struct pci_dev *pdev;
2331
2332 if (!dev_is_pci(dev))
2333 return false;
2334
2335 pdev = to_pci_dev(dev);
2336 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2337 return true;
2338 else
2339 return false;
2340}
2341
2342/*
2343 * Return the required default domain type for a specific device.
2344 *
2345 * @dev: the device in query
2346 * @startup: true if this is during early boot
2347 *
2348 * Returns:
2349 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2350 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2351 * - 0: both identity and dynamic domains work for this device
2352 */
2353static int device_def_domain_type(struct device *dev)
2354{
2355 if (dev_is_pci(dev)) {
2356 struct pci_dev *pdev = to_pci_dev(dev);
2357
2358 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2359 return IOMMU_DOMAIN_IDENTITY;
2360
2361 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2362 return IOMMU_DOMAIN_IDENTITY;
2363 }
2364
2365 return 0;
2366}
2367
2368static void intel_iommu_init_qi(struct intel_iommu *iommu)
2369{
2370 /*
2371 * Start from the sane iommu hardware state.
2372 * If the queued invalidation is already initialized by us
2373 * (for example, while enabling interrupt-remapping) then
2374 * we got the things already rolling from a sane state.
2375 */
2376 if (!iommu->qi) {
2377 /*
2378 * Clear any previous faults.
2379 */
2380 dmar_fault(-1, iommu);
2381 /*
2382 * Disable queued invalidation if supported and already enabled
2383 * before OS handover.
2384 */
2385 dmar_disable_qi(iommu);
2386 }
2387
2388 if (dmar_enable_qi(iommu)) {
2389 /*
2390 * Queued Invalidate not enabled, use Register Based Invalidate
2391 */
2392 iommu->flush.flush_context = __iommu_flush_context;
2393 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2394 pr_info("%s: Using Register based invalidation\n",
2395 iommu->name);
2396 } else {
2397 iommu->flush.flush_context = qi_flush_context;
2398 iommu->flush.flush_iotlb = qi_flush_iotlb;
2399 pr_info("%s: Using Queued invalidation\n", iommu->name);
2400 }
2401}
2402
/*
 * Copy one bus worth of context entries from the previous (kdump'd)
 * kernel's tables into freshly allocated pages.
 *
 * @old_re: old kernel's root entry for this bus
 * @tbl:    array receiving pointers to the new context-table page(s)
 * @bus:    bus number being copied
 * @ext:    true for extended/scalable format, which splits each bus
 *          into a lower (LCTP) and upper (UCTP) context table
 *
 * Returns 0 on success, -ENOMEM if remapping or allocation fails.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	/* Extended format occupies two slots per bus in @tbl. */
	tbl_idx = ext ? bus * 2 : bus;
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			/* Lower half uses LCTP, upper half uses UCTP. */
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!context_present(&ce))
			continue;

		/*
		 * Reserve the old kernel's domain ID so new allocations
		 * in this kernel won't collide with it.
		 */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		set_context_copied(iommu, bus, devfn);
		new_ce[idx] = ce;
	}

	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
2486
/*
 * In a kdump kernel: copy the previous kernel's context tables and hook
 * them into our freshly allocated root table, so in-flight DMA keeps
 * working while the dump is taken.
 *
 * Returns 0 on success, -EINVAL if the table format differs or the old
 * root table address is missing, -ENOMEM on allocation failure.
 */
static int copy_translation_tables(struct intel_iommu *iommu)
{
	struct context_entry **ctxt_tbls;
	struct root_entry *old_rt;
	phys_addr_t old_rt_phys;
	int ctxt_table_entries;
	u64 rtaddr_reg;
	int bus, ret;
	bool new_ext, ext;

	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
	new_ext    = !!sm_supported(iommu);

	/*
	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means to open a window for data
	 * corruption. So bail out and don't copy anything if we would
	 * have to change the bit.
	 */
	if (new_ext != ext)
		return -EINVAL;

	/* Tracks which (bus, devfn) context entries were inherited. */
	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
	if (!iommu->copied_tables)
		return -ENOMEM;

	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
	if (!old_rt_phys)
		return -EINVAL;

	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
	if (!old_rt)
		return -ENOMEM;

	/* This is too big for the stack - allocate it from slab */
	ctxt_table_entries = ext ? 512 : 256;
	ret = -ENOMEM;
	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
	if (!ctxt_tbls)
		goto out_unmap;

	/* A per-bus copy failure is tolerated; remaining buses continue. */
	for (bus = 0; bus < 256; bus++) {
		ret = copy_context_table(iommu, &old_rt[bus],
					 ctxt_tbls, bus, ext);
		if (ret) {
			pr_err("%s: Failed to copy context table for bus %d\n",
				iommu->name, bus);
			continue;
		}
	}

	spin_lock(&iommu->lock);

	/* Context tables are copied, now write them to the root_entry table */
	for (bus = 0; bus < 256; bus++) {
		int idx = ext ? bus * 2 : bus;
		u64 val;

		if (ctxt_tbls[idx]) {
			/* Bit 0 marks the root entry present. */
			val = virt_to_phys(ctxt_tbls[idx]) | 1;
			iommu->root_entry[bus].lo = val;
		}

		if (!ext || !ctxt_tbls[idx + 1])
			continue;

		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
		iommu->root_entry[bus].hi = val;
	}

	spin_unlock(&iommu->lock);

	kfree(ctxt_tbls);

	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);

	ret = 0;

out_unmap:
	memunmap(old_rt);

	return ret;
}
2571
/*
 * One-time boot initialization of all DMAR units: audit capabilities,
 * set up invalidation, domains and root entries (copying tables from a
 * previous kernel in the kdump case), build the static identity domain,
 * and finally enable fault reporting on every active IOMMU.
 *
 * On failure all active IOMMUs are torn down and the si_domain freed.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
	if (ret)
		goto free_iommu;

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/* Pre-enabled translation is only honored in kdump mode. */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		/* Passthrough works only if every IOMMU supports it. */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
	}

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	dmar_map_gfx = 0;
#endif

	if (!dmar_map_gfx)
		iommu_identity_mapping |= IDENTMAP_GFX;

	check_tylersburg_isoch();

	ret = si_domain_init(hw_pass_through);
	if (ret)
		goto free_iommu;

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			up_write(&dmar_global_lock);
			ret = intel_svm_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}
	if (si_domain) {
		domain_exit(si_domain);
		si_domain = NULL;
	}

	return ret;
}
2728
/*
 * Mark DRHD units that should be ignored: units with an empty device
 * scope, and (when gfx mapping is disabled) units that serve only
 * graphics devices.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		/* A non-gfx device in scope means this unit stays active. */
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		drhd->gfx_dedicated = 1;
		if (!dmar_map_gfx)
			drhd->ignored = 1;
	}
}
2764
2765#ifdef CONFIG_SUSPEND
/*
 * Re-program all IOMMU hardware after a suspend: re-enable queued
 * invalidation where it was in use, then restore root entries and turn
 * translation back on for every non-ignored unit.
 *
 * Returns 0 on success or the error from dmar_reenable_qi().
 */
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	int ret;

	for_each_active_iommu(iommu, drhd) {
		if (iommu->qi) {
			ret = dmar_reenable_qi(iommu);
			if (ret)
				return ret;
		}
	}

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
		iommu_enable_translation(iommu);
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}
2799
/*
 * Globally invalidate the context cache and IOTLB on every active
 * IOMMU (used before suspending).
 */
static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}
2812
/*
 * Syscore suspend hook: flush all caches, disable translation, and save
 * the fault-event registers of every active IOMMU so they can be
 * restored on resume.  Always returns 0.
 */
static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		/* register_lock serializes MMIO register access. */
		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;
}
2839
/*
 * Syscore resume hook: re-initialize the IOMMU hardware and restore the
 * fault-event registers saved by iommu_suspend().  Panics (if force_on)
 * or warns when the hardware cannot be brought back.
 */
static void iommu_resume(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		if (force_on)
			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
		else
			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return;
	}

	for_each_active_iommu(iommu, drhd) {

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
			iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
			iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
			iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
			iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
}
2870
/* System-core PM callbacks for suspend/resume of the IOMMUs. */
static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};
2875
/* Register the suspend/resume callbacks with the syscore framework. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
2880
2881#else
2882static inline void init_iommu_pm_ops(void) {}
#endif /* CONFIG_SUSPEND */
2884
2885static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2886{
2887 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2888 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2889 rmrr->end_address <= rmrr->base_address ||
2890 arch_rmrr_sanity_check(rmrr))
2891 return -EINVAL;
2892
2893 return 0;
2894}
2895
/*
 * ACPI DMAR parse callback for an RMRR structure: warn (and taint) on a
 * broken BIOS range, then record the unit and its device scope on
 * dmar_rmrr_units.  Returns 0 on success, -ENOMEM on allocation failure.
 */
int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
{
	struct acpi_dmar_reserved_memory *rmrr;
	struct dmar_rmrr_unit *rmrru;

	rmrr = (struct acpi_dmar_reserved_memory *)header;
	if (rmrr_sanity_check(rmrr)) {
		pr_warn(FW_BUG
			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
			   rmrr->base_address, rmrr->end_address,
			   dmi_get_system_info(DMI_BIOS_VENDOR),
			   dmi_get_system_info(DMI_BIOS_VERSION),
			   dmi_get_system_info(DMI_PRODUCT_VERSION));
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
	}

	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
	if (!rmrru)
		goto out;

	rmrru->hdr = header;

	rmrru->base_address = rmrr->base_address;
	rmrru->end_address = rmrr->end_address;

	/* Device scope entries follow the fixed-size RMRR header. */
	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				&rmrru->devices_cnt);
	if (rmrru->devices_cnt && rmrru->devices == NULL)
		goto free_rmrru;

	list_add(&rmrru->list, &dmar_rmrr_units);

	return 0;
free_rmrru:
	kfree(rmrru);
out:
	return -ENOMEM;
}
2936
2937static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2938{
2939 struct dmar_atsr_unit *atsru;
2940 struct acpi_dmar_atsr *tmp;
2941
2942 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2943 dmar_rcu_check()) {
2944 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2945 if (atsr->segment != tmp->segment)
2946 continue;
2947 if (atsr->header.length != tmp->header.length)
2948 continue;
2949 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2950 return atsru;
2951 }
2952
2953 return NULL;
2954}
2955
/*
 * ACPI DMAR parse callback for an ATSR structure: register the unit and
 * its device scope on dmar_atsr_units unless an identical one already
 * exists.  Returns 0 on success or -ENOMEM.
 */
int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	/* Ignore hot-added tables when the IOMMU is not in use. */
	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
		return 0;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru)
		return 0;

	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
	if (!atsru)
		return -ENOMEM;

	/*
	 * If memory is allocated from slab by ACPI _DSM method, we need to
	 * copy the memory content because the memory buffer will be freed
	 * on return.
	 */
	atsru->hdr = (void *)(atsru + 1);
	memcpy(atsru->hdr, hdr, hdr->length);
	atsru->include_all = atsr->flags & 0x1;
	if (!atsru->include_all) {
		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
				(void *)atsr + atsr->header.length,
				&atsru->devices_cnt);
		if (atsru->devices_cnt && atsru->devices == NULL) {
			kfree(atsru);
			return -ENOMEM;
		}
	}

	list_add_rcu(&atsru->list, &dmar_atsr_units);

	return 0;
}
2995
/* Release an ATSR unit's device scope and the unit itself. */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
3001
/*
 * ACPI DMAR release callback: unlink and free the registered ATSR unit
 * matching @hdr, if any.  Always returns 0.
 */
int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru) {
		list_del_rcu(&atsru->list);
		/* Wait for concurrent RCU readers before freeing. */
		synchronize_rcu();
		intel_iommu_free_atsr(atsru);
	}

	return 0;
}
3017
/*
 * ACPI DMAR check callback: return -EBUSY if the ATSR matching @hdr
 * still has active devices in its scope, 0 otherwise.
 */
int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	int i;
	struct device *dev;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (!atsru)
		return 0;

	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
		/* Any active device in scope makes the unit busy. */
		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
					  i, dev)
			return -EBUSY;
	}

	return 0;
}
3038
3039static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3040{
3041 struct dmar_satc_unit *satcu;
3042 struct acpi_dmar_satc *tmp;
3043
3044 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3045 dmar_rcu_check()) {
3046 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3047 if (satc->segment != tmp->segment)
3048 continue;
3049 if (satc->header.length != tmp->header.length)
3050 continue;
3051 if (memcmp(satc, tmp, satc->header.length) == 0)
3052 return satcu;
3053 }
3054
3055 return NULL;
3056}
3057
/*
 * ACPI DMAR parse callback for a SATC structure: register the unit and
 * its device scope on dmar_satc_units unless an identical one already
 * exists.  Returns 0 on success or -ENOMEM.
 */
int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_satc *satc;
	struct dmar_satc_unit *satcu;

	/* Ignore hot-added tables when the IOMMU is not in use. */
	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
		return 0;

	satc = container_of(hdr, struct acpi_dmar_satc, header);
	satcu = dmar_find_satc(satc);
	if (satcu)
		return 0;

	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
	if (!satcu)
		return -ENOMEM;

	/* Copy the ACPI data; the source buffer may be freed on return. */
	satcu->hdr = (void *)(satcu + 1);
	memcpy(satcu->hdr, hdr, hdr->length);
	satcu->atc_required = satc->flags & 0x1;
	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
					      (void *)satc + satc->header.length,
					      &satcu->devices_cnt);
	if (satcu->devices_cnt && !satcu->devices) {
		kfree(satcu);
		return -ENOMEM;
	}
	list_add_rcu(&satcu->list, &dmar_satc_units);

	return 0;
}
3089
/*
 * Bring up a hot-added DMAR unit: audit capabilities, verify it matches
 * system-wide assumptions (pass-through, superpage), allocate domains
 * and a root entry, then enable invalidation, interrupts and
 * translation.  Returns 0 on success or a negative errno; on failure
 * the IOMMU is torn back down.
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
	if (ret)
		goto out;

	/* A unit lacking pass-through breaks an established hw_pass_through. */
	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}

	/* Likewise for the superpage size already in system-wide use. */
	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

	intel_svm_check(iommu);

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	iommu_set_root_entry(iommu);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}
3161
3162int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3163{
3164 int ret = 0;
3165 struct intel_iommu *iommu = dmaru->iommu;
3166
3167 if (!intel_iommu_enabled)
3168 return 0;
3169 if (iommu == NULL)
3170 return -EINVAL;
3171
3172 if (insert) {
3173 ret = intel_iommu_add(dmaru);
3174 } else {
3175 disable_dmar_iommu(iommu);
3176 free_dmar_iommu(iommu);
3177 }
3178
3179 return ret;
3180}
3181
/* Free all registered RMRR, ATSR and SATC units and their scopes. */
static void intel_iommu_free_dmars(void)
{
	struct dmar_rmrr_unit *rmrru, *rmrr_n;
	struct dmar_atsr_unit *atsru, *atsr_n;
	struct dmar_satc_unit *satcu, *satc_n;

	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
		list_del(&rmrru->list);
		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
		kfree(rmrru);
	}

	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
		list_del(&atsru->list);
		intel_iommu_free_atsr(atsru);
	}
	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
		list_del(&satcu->list);
		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
		kfree(satcu);
	}
}
3204
/*
 * Find the SATC unit whose device scope contains @dev (after resolving
 * VFs to their PF), or NULL if none does.
 */
static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
{
	struct dmar_satc_unit *satcu;
	struct acpi_dmar_satc *satc;
	struct device *tmp;
	int i;

	/* SATC scopes list physical functions only. */
	dev = pci_physfn(dev);
	rcu_read_lock();

	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
		if (satc->segment != pci_domain_nr(dev->bus))
			continue;
		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
			if (to_pci_dev(tmp) == dev)
				goto out;
	}
	satcu = NULL;
out:
	rcu_read_unlock();
	return satcu;
}
3228
/*
 * Decide whether the OS may enable ATS for @dev behind @iommu, based on
 * the firmware SATC/ATSR tables and the device's path to its root port.
 * Returns 1 if ATS may be enabled, 0 otherwise.
 */
static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
{
	int i, ret = 1;
	struct pci_bus *bus;
	struct pci_dev *bridge = NULL;
	struct device *tmp;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;
	struct dmar_satc_unit *satcu;

	dev = pci_physfn(dev);
	satcu = dmar_find_matched_satc_unit(dev);
	if (satcu)
		/*
		 * This device supports ATS as it is in SATC table.
		 * When IOMMU is in legacy mode, enabling ATS is done
		 * automatically by HW for the device that requires
		 * ATS, hence OS should not enable this device ATS
		 * to avoid duplicated TLB invalidation.
		 */
		return !(satcu->atc_required && !sm_supported(iommu));

	/* Walk up to the root port, rejecting non-PCIe hops. */
	for (bus = dev->bus; bus; bus = bus->parent) {
		bridge = bus->self;
		/* If it's an integrated device, allow ATS */
		if (!bridge)
			return 1;
		/* Connected via non-PCIe: no ATS */
		if (!pci_is_pcie(bridge) ||
		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
		/* If we found the root port, look it up in the ATSR */
		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
			break;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (atsr->segment != pci_domain_nr(dev->bus))
			continue;

		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
			if (tmp == &bridge->dev)
				goto out;

		if (atsru->include_all)
			goto out;
	}
	ret = 0;
out:
	rcu_read_unlock();

	return ret;
}
3284
/*
 * PCI bus-notifier hook: keep RMRR, ATSR and SATC device-scope arrays
 * in sync as devices are added to or removed from the bus.  Returns 0
 * or a negative errno from dmar_insert_dev_scope().
 */
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct dmar_satc_unit *satcu;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;
	struct acpi_dmar_satc *satc;

	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
		return 0;

	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		/* include_all units have no explicit scope to maintain. */
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			/* > 0 means the device matched this unit: done. */
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}
	list_for_each_entry(satcu, &dmar_satc_units, list) {
		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
					(void *)satc + satc->header.length,
					satc->segment, satcu->devices,
					satcu->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, satc->segment,
					satcu->devices, satcu->devices_cnt))
				break;
		}
	}

	return 0;
}
3354
/*
 * Memory-hotplug notifier: keep the si_domain's identity mappings in
 * sync as memory is onlined/offlined.  Returns NOTIFY_OK or NOTIFY_BAD
 * if the identity mapping for a new range cannot be built.
 */
static int intel_iommu_memory_notifier(struct notifier_block *nb,
				       unsigned long val, void *v)
{
	struct memory_notify *mhp = v;
	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
			mhp->nr_pages - 1);

	switch (val) {
	case MEM_GOING_ONLINE:
		/* Identity-map the incoming range before it is used. */
		if (iommu_domain_identity_map(si_domain,
					      start_vpfn, last_vpfn)) {
			pr_warn("Failed to build identity map for [%lx-%lx]\n",
				start_vpfn, last_vpfn);
			return NOTIFY_BAD;
		}
		break;

	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
		{
			struct dmar_drhd_unit *drhd;
			struct intel_iommu *iommu;
			LIST_HEAD(freelist);

			/* Unmap, flush every IOMMU's IOTLB, then free pages. */
			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);

			rcu_read_lock();
			for_each_active_iommu(iommu, drhd)
				iommu_flush_iotlb_psi(iommu, si_domain,
					start_vpfn, mhp->nr_pages,
					list_empty(&freelist), 0);
			rcu_read_unlock();
			put_pages_list(&freelist);
		}
		break;
	}

	return NOTIFY_OK;
}
3395
/* Registered in intel_iommu_init() when the si domain needs maintenance. */
static struct notifier_block intel_iommu_memory_nb = {
	.notifier_call = intel_iommu_memory_notifier,
	.priority = 0
};
3400
3401static void intel_disable_iommus(void)
3402{
3403 struct intel_iommu *iommu = NULL;
3404 struct dmar_drhd_unit *drhd;
3405
3406 for_each_iommu(iommu, drhd)
3407 iommu_disable_translation(iommu);
3408}
3409
3410void intel_iommu_shutdown(void)
3411{
3412 struct dmar_drhd_unit *drhd;
3413 struct intel_iommu *iommu = NULL;
3414
3415 if (no_iommu || dmar_disabled)
3416 return;
3417
3418 down_write(&dmar_global_lock);
3419
3420 /* Disable PMRs explicitly here. */
3421 for_each_iommu(iommu, drhd)
3422 iommu_disable_protect_mem_regions(iommu);
3423
3424 /* Make sure the IOMMUs are switched off */
3425 intel_disable_iommus();
3426
3427 up_write(&dmar_global_lock);
3428}
3429
3430static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3431{
3432 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3433
3434 return container_of(iommu_dev, struct intel_iommu, iommu);
3435}
3436
3437static ssize_t version_show(struct device *dev,
3438 struct device_attribute *attr, char *buf)
3439{
3440 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3441 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3442 return sysfs_emit(buf, "%d:%d\n",
3443 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3444}
3445static DEVICE_ATTR_RO(version);
3446
3447static ssize_t address_show(struct device *dev,
3448 struct device_attribute *attr, char *buf)
3449{
3450 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3451 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3452}
3453static DEVICE_ATTR_RO(address);
3454
3455static ssize_t cap_show(struct device *dev,
3456 struct device_attribute *attr, char *buf)
3457{
3458 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3459 return sysfs_emit(buf, "%llx\n", iommu->cap);
3460}
3461static DEVICE_ATTR_RO(cap);
3462
3463static ssize_t ecap_show(struct device *dev,
3464 struct device_attribute *attr, char *buf)
3465{
3466 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3467 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3468}
3469static DEVICE_ATTR_RO(ecap);
3470
3471static ssize_t domains_supported_show(struct device *dev,
3472 struct device_attribute *attr, char *buf)
3473{
3474 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3475 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3476}
3477static DEVICE_ATTR_RO(domains_supported);
3478
3479static ssize_t domains_used_show(struct device *dev,
3480 struct device_attribute *attr, char *buf)
3481{
3482 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3483 return sysfs_emit(buf, "%d\n",
3484 bitmap_weight(iommu->domain_ids,
3485 cap_ndoms(iommu->cap)));
3486}
3487static DEVICE_ATTR_RO(domains_used);
3488
/* Per-IOMMU sysfs attributes exposed under the "intel-iommu" group. */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};
3498
/* Group the attributes under an "intel-iommu" subdirectory in sysfs. */
static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};
3503
/* NULL-terminated group list passed to iommu_device_sysfs_add(). */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
3508
3509static bool has_external_pci(void)
3510{
3511 struct pci_dev *pdev = NULL;
3512
3513 for_each_pci_dev(pdev)
3514 if (pdev->external_facing) {
3515 pci_dev_put(pdev);
3516 return true;
3517 }
3518
3519 return false;
3520}
3521
/*
 * Honor the firmware's DMAR platform opt-in: when DMA protection was
 * requested and an external-facing PCI device exists, force the IOMMU on
 * (unless the user explicitly opted out via no_platform_optin).
 * Returns 1 when the IOMMU was forced on, 0 otherwise.
 */
static int __init platform_optin_force_iommu(void)
{
	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
		return 0;

	if (no_iommu || dmar_disabled)
		pr_info("Intel-IOMMU force enabled due to platform opt in\n");

	/*
	 * If Intel-IOMMU is disabled by default, we will apply identity
	 * map for all devices except those marked as being untrusted.
	 */
	if (dmar_disabled)
		iommu_set_default_passthrough(false);

	/* Override any earlier "off" decision from the command line. */
	dmar_disabled = 0;
	no_iommu = 0;

	return 1;
}
3542
/*
 * Probe devices enumerated through the ACPI namespace (rather than PCI)
 * that appear in DRHD device scopes, so their physical companions get
 * attached to the IOMMU as well. Returns 0 or the first probe error.
 */
static int __init probe_acpi_namespace_devices(void)
{
	struct dmar_drhd_unit *drhd;
	/* To avoid a -Wunused-but-set-variable warning. */
	struct intel_iommu *iommu __maybe_unused;
	struct device *dev;
	int i, ret = 0;

	for_each_active_iommu(iommu, drhd) {
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev) {
			struct acpi_device_physical_node *pn;
			struct acpi_device *adev;

			/* Only ACPI-enumerated scope entries matter here. */
			if (dev->bus != &acpi_bus_type)
				continue;

			adev = to_acpi_device(dev);
			/* Lock stabilizes the physical-node list walk. */
			mutex_lock(&adev->physical_node_lock);
			list_for_each_entry(pn,
					    &adev->physical_node_list, node) {
				ret = iommu_probe_device(pn->dev);
				if (ret)
					break;
			}
			mutex_unlock(&adev->physical_node_lock);

			/* Propagate the failure only after dropping the lock. */
			if (ret)
				return ret;
		}
	}

	return 0;
}
3577
/*
 * A TXT/tboot measured launch requires DMA remapping; override any
 * "iommu off" request from the command line. Returns 1 when forcing,
 * 0 when tboot is not active.
 */
static __init int tboot_force_iommu(void)
{
	if (!tboot_enabled())
		return 0;

	if (no_iommu || dmar_disabled)
		pr_warn("Forcing Intel-IOMMU to enabled\n");

	dmar_disabled = 0;
	no_iommu = 0;

	return 1;
}
3591
/*
 * Main VT-d initialization entry point: parse the DMAR tables, set up
 * the remapping structures (init_dmars()), register each IOMMU with the
 * core and sysfs, and finally enable translation. All table walking is
 * done under dmar_global_lock; note the deliberate lock drop around
 * dmar_register_bus_notifier() to keep lockdep happy.
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		    platform_optin_force_iommu();

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (list_empty(&dmar_satc_units))
		pr_info("No SATC found\n");

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_dmar;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	/* Registration only reads the DRHD list, so a read lock suffices. */
	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		/*
		 * The flush queue implementation does not perform
		 * page-selective invalidations that are required for efficient
		 * TLB flushes in virtual environments. The benefit of batching
		 * is likely to be much lower than the overhead of synchronizing
		 * the virtual and physical IOMMU page-tables.
		 */
		if (cap_caching_mode(iommu->cap) &&
		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
			pr_info_once("IOMMU batching disallowed due to virtualization\n");
			iommu_set_dma_strict();
		}
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);

		iommu_pmu_register(iommu);
	}
	up_read(&dmar_global_lock);

	/* Track memory hotplug only when the si domain maintains real maps. */
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);

	down_read(&dmar_global_lock);
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	return ret;
}
3726
3727static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3728{
3729 struct device_domain_info *info = opaque;
3730
3731 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3732 return 0;
3733}
3734
3735/*
3736 * NB - intel-iommu lacks any sort of reference counting for the users of
3737 * dependent devices. If multiple endpoints have intersecting dependent
3738 * devices, unbinding the driver from any one of them will possibly leave
3739 * the others unable to operate.
3740 */
3741static void domain_context_clear(struct device_domain_info *info)
3742{
3743 if (!dev_is_pci(info->dev))
3744 domain_context_clear_one(info, info->bus, info->devfn);
3745
3746 pci_for_each_dma_alias(to_pci_dev(info->dev),
3747 &domain_context_clear_one_cb, info);
3748}
3749
/*
 * Tear down IOMMU state for @dev on release: invalidate its PASID and
 * context entries, unlink it from its domain under the domain lock, and
 * drop the domain's reference on the IOMMU.
 */
static void dmar_remove_one_dev_info(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dmar_domain *domain = info->domain;
	struct intel_iommu *iommu = info->iommu;
	unsigned long flags;

	/* Real DMA sub-devices have no context entries of their own. */
	if (!dev_is_real_dma_subdevice(info->dev)) {
		if (dev_is_pci(info->dev) && sm_supported(iommu))
			intel_pasid_tear_down_entry(iommu, info->dev,
					IOMMU_NO_PASID, false);

		iommu_disable_pci_caps(info);
		domain_context_clear(info);
	}

	spin_lock_irqsave(&domain->lock, flags);
	list_del(&info->link);
	spin_unlock_irqrestore(&domain->lock, flags);

	domain_detach_iommu(domain, iommu);
	info->domain = NULL;
}
3773
3774/*
3775 * Clear the page table pointer in context or pasid table entries so that
3776 * all DMA requests without PASID from the device are blocked. If the page
3777 * table has been set, clean up the data structures.
3778 */
3779void device_block_translation(struct device *dev)
3780{
3781 struct device_domain_info *info = dev_iommu_priv_get(dev);
3782 struct intel_iommu *iommu = info->iommu;
3783 unsigned long flags;
3784
3785 iommu_disable_pci_caps(info);
3786 if (!dev_is_real_dma_subdevice(dev)) {
3787 if (sm_supported(iommu))
3788 intel_pasid_tear_down_entry(iommu, dev,
3789 IOMMU_NO_PASID, false);
3790 else
3791 domain_context_clear(info);
3792 }
3793
3794 if (!info->domain)
3795 return;
3796
3797 spin_lock_irqsave(&info->domain->lock, flags);
3798 list_del(&info->link);
3799 spin_unlock_irqrestore(&info->domain->lock, flags);
3800
3801 domain_detach_iommu(info->domain, iommu);
3802 info->domain = NULL;
3803}
3804
3805static int md_domain_init(struct dmar_domain *domain, int guest_width)
3806{
3807 int adjust_width;
3808
3809 /* calculate AGAW */
3810 domain->gaw = guest_width;
3811 adjust_width = guestwidth_to_adjustwidth(guest_width);
3812 domain->agaw = width_to_agaw(adjust_width);
3813
3814 domain->iommu_coherency = false;
3815 domain->iommu_superpage = 0;
3816 domain->max_addr = 0;
3817
3818 /* always allocate the top pgd */
3819 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3820 if (!domain->pgd)
3821 return -ENOMEM;
3822 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3823 return 0;
3824}
3825
/* attach_dev op of the blocking domain: abort all DMA from the device. */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);
	return 0;
}
3832
/* Singleton domain used by the core to block all DMA from a device. */
static struct iommu_domain blocking_domain = {
	.type = IOMMU_DOMAIN_BLOCKED,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = blocking_domain_attach_dev,
	}
};
3839
3840static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3841{
3842 struct dmar_domain *dmar_domain;
3843 struct iommu_domain *domain;
3844
3845 switch (type) {
3846 case IOMMU_DOMAIN_DMA:
3847 case IOMMU_DOMAIN_UNMANAGED:
3848 dmar_domain = alloc_domain(type);
3849 if (!dmar_domain) {
3850 pr_err("Can't allocate dmar_domain\n");
3851 return NULL;
3852 }
3853 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3854 pr_err("Domain initialization failed\n");
3855 domain_exit(dmar_domain);
3856 return NULL;
3857 }
3858
3859 domain = &dmar_domain->domain;
3860 domain->geometry.aperture_start = 0;
3861 domain->geometry.aperture_end =
3862 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3863 domain->geometry.force_aperture = true;
3864
3865 return domain;
3866 case IOMMU_DOMAIN_IDENTITY:
3867 return &si_domain->domain;
3868 case IOMMU_DOMAIN_SVA:
3869 return intel_svm_domain_alloc();
3870 default:
3871 return NULL;
3872 }
3873
3874 return NULL;
3875}
3876
/*
 * domain_alloc_user op (iommufd path): validate the requested flags
 * against hardware capability, then allocate either a nested domain
 * (when @parent is given) or a fully initialized paging domain with
 * optional nest-parent / dirty-tracking properties.
 */
static struct iommu_domain *
intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
			      struct iommu_domain *parent,
			      const struct iommu_user_data *user_data)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
	struct intel_iommu *iommu = info->iommu;
	struct iommu_domain *domain;

	/* Must be NESTING domain */
	if (parent) {
		/* No allocation flags are accepted for a nested domain. */
		if (!nested_supported(iommu) || flags)
			return ERR_PTR(-EOPNOTSUPP);
		return intel_nested_domain_alloc(parent, user_data);
	}

	/* Reject any flag beyond the two this driver understands. */
	if (flags &
	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
		return ERR_PTR(-EOPNOTSUPP);
	if (nested_parent && !nested_supported(iommu))
		return ERR_PTR(-EOPNOTSUPP);
	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
		return ERR_PTR(-EOPNOTSUPP);

	/*
	 * domain_alloc_user op needs to fully initialize a domain before
	 * return, so uses iommu_domain_alloc() here for simple.
	 */
	domain = iommu_domain_alloc(dev->bus);
	if (!domain)
		return ERR_PTR(-ENOMEM);

	if (nested_parent)
		to_dmar_domain(domain)->nested_parent = true;

	if (dirty_tracking) {
		/* SSADS dirty tracking only works on second-level tables. */
		if (to_dmar_domain(domain)->use_first_level) {
			iommu_domain_free(domain);
			return ERR_PTR(-EOPNOTSUPP);
		}
		domain->dirty_ops = &intel_dirty_ops;
	}

	return domain;
}
3924
3925static void intel_iommu_domain_free(struct iommu_domain *domain)
3926{
3927 if (domain != &si_domain->domain)
3928 domain_exit(to_dmar_domain(domain));
3929}
3930
/*
 * Validate that @dev's IOMMU can serve @domain and adapt the domain to
 * the hardware: clamp the guest address width to what the IOMMU supports
 * and shrink the page-table depth to the IOMMU's AGAW if needed.
 * Returns 0 on success, -EINVAL when capabilities don't match.
 */
int prepare_domain_attach_device(struct iommu_domain *domain,
				 struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu = info->iommu;
	int addr_width;

	/* Force-snooping needs extended snoop control on every IOMMU. */
	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
		return -EINVAL;

	/* Dirty tracking requires SSADS support. */
	if (domain->dirty_ops && !ssads_supported(iommu))
		return -EINVAL;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	/* Existing mappings must fit into the (possibly smaller) width. */
	if (dmar_domain->max_addr > (1LL << addr_width))
		return -EINVAL;
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		/* Descend into the (single) next-level table and free the top. */
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return 0;
}
3970
/*
 * attach_dev op: detach @dev from any current domain (blocking its DMA),
 * verify/adjust the target domain for this IOMMU, then attach.
 */
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	int ret;

	/* Leave the old domain first; attach below builds fresh state. */
	if (info->domain)
		device_block_translation(dev);

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
}
3986
3987static int intel_iommu_map(struct iommu_domain *domain,
3988 unsigned long iova, phys_addr_t hpa,
3989 size_t size, int iommu_prot, gfp_t gfp)
3990{
3991 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3992 u64 max_addr;
3993 int prot = 0;
3994
3995 if (iommu_prot & IOMMU_READ)
3996 prot |= DMA_PTE_READ;
3997 if (iommu_prot & IOMMU_WRITE)
3998 prot |= DMA_PTE_WRITE;
3999 if (dmar_domain->set_pte_snp)
4000 prot |= DMA_PTE_SNP;
4001
4002 max_addr = iova + size;
4003 if (dmar_domain->max_addr < max_addr) {
4004 u64 end;
4005
4006 /* check if minimum agaw is sufficient for mapped address */
4007 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4008 if (end < max_addr) {
4009 pr_err("%s: iommu width (%d) is not "
4010 "sufficient for the mapped address (%llx)\n",
4011 __func__, dmar_domain->gaw, max_addr);
4012 return -EFAULT;
4013 }
4014 dmar_domain->max_addr = max_addr;
4015 }
4016 /* Round up size to next multiple of PAGE_SIZE, if it and
4017 the low bits of hpa would take us onto the next page */
4018 size = aligned_nrpages(hpa, size);
4019 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4020 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4021}
4022
4023static int intel_iommu_map_pages(struct iommu_domain *domain,
4024 unsigned long iova, phys_addr_t paddr,
4025 size_t pgsize, size_t pgcount,
4026 int prot, gfp_t gfp, size_t *mapped)
4027{
4028 unsigned long pgshift = __ffs(pgsize);
4029 size_t size = pgcount << pgshift;
4030 int ret;
4031
4032 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4033 return -EINVAL;
4034
4035 if (!IS_ALIGNED(iova | paddr, pgsize))
4036 return -EINVAL;
4037
4038 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4039 if (!ret && mapped)
4040 *mapped = size;
4041
4042 return ret;
4043}
4044
/*
 * Unmap [iova, iova + size) from @domain. If the range starts inside a
 * large-page mapping, the whole large page is unmapped and the enlarged
 * size is returned. Freed page-table pages are parked on the gather
 * freelist until the IOTLB has been synced.
 */
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size,
				struct iommu_iotlb_gather *gather)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long start_pfn, last_pfn;
	int level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
				     &level, GFP_ATOMIC)))
		return 0;

	/* Widen the request to the full large page if needed. */
	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);

	/* Shrink the high-water mark when the topmost mapping goes away. */
	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	/*
	 * We do not use page-selective IOTLB invalidation in flush queue,
	 * so there is no need to track page and sync iotlb.
	 */
	if (!iommu_iotlb_gather_queued(gather))
		iommu_iotlb_gather_add_page(domain, gather, iova, size);

	return size;
}
4079
4080static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4081 unsigned long iova,
4082 size_t pgsize, size_t pgcount,
4083 struct iommu_iotlb_gather *gather)
4084{
4085 unsigned long pgshift = __ffs(pgsize);
4086 size_t size = pgcount << pgshift;
4087
4088 return intel_iommu_unmap(domain, iova, size, gather);
4089}
4090
/*
 * iotlb_sync op: flush the gathered range on every IOMMU serving the
 * domain, then release the page-table pages collected during unmap.
 */
static void intel_iommu_tlb_sync(struct iommu_domain *domain,
				 struct iommu_iotlb_gather *gather)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long iova_pfn = IOVA_PFN(gather->start);
	size_t size = gather->end - gather->start;
	struct iommu_domain_info *info;
	unsigned long start_pfn;
	unsigned long nrpages;
	unsigned long i;

	nrpages = aligned_nrpages(gather->start, size);
	start_pfn = mm_to_dma_pfn_start(iova_pfn);

	/* One flush per IOMMU this domain is attached to. */
	xa_for_each(&dmar_domain->iommu_array, i, info)
		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
				      start_pfn, nrpages,
				      list_empty(&gather->freelist), 0);

	/* Safe to free only after all IOTLB references are gone. */
	put_pages_list(&gather->freelist);
}
4112
/*
 * iova_to_phys op: walk the page table for @iova and return the backing
 * physical address (0 when unmapped). Works for large pages by masking
 * in the offset bits appropriate to the level the walk stopped at.
 */
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
			     GFP_ATOMIC);
	if (pte && dma_pte_present(pte))
		/* Add the in-page offset for the (possibly large) page size. */
		phys = dma_pte_addr(pte) +
			(iova & (BIT_MASK(level_to_offset_bits(level) +
						VTD_PAGE_SHIFT) - 1));

	return phys;
}
4130
4131static bool domain_support_force_snooping(struct dmar_domain *domain)
4132{
4133 struct device_domain_info *info;
4134 bool support = true;
4135
4136 assert_spin_locked(&domain->lock);
4137 list_for_each_entry(info, &domain->devices, link) {
4138 if (!ecap_sc_support(info->iommu->ecap)) {
4139 support = false;
4140 break;
4141 }
4142 }
4143
4144 return support;
4145}
4146
/*
 * Enable snooping enforcement for the domain. Caller holds domain->lock
 * and has already verified domain_support_force_snooping().
 */
static void domain_set_force_snooping(struct dmar_domain *domain)
{
	struct device_domain_info *info;

	assert_spin_locked(&domain->lock);
	/*
	 * Second level page table supports per-PTE snoop control. The
	 * iommu_map() interface will handle this by setting SNP bit.
	 */
	if (!domain->use_first_level) {
		domain->set_pte_snp = true;
		return;
	}

	/* First-level tables: flip the PGSNP control in each PASID entry. */
	list_for_each_entry(info, &domain->devices, link)
		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
						     IOMMU_NO_PASID);
}
4165
/*
 * enforce_cache_coherency op: make all current and future mappings of
 * the domain snoop the CPU caches. Fails when some attached IOMMU lacks
 * snoop control, or when a second-level domain already has mappings
 * (existing PTEs would miss the SNP bit).
 */
static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long flags;

	/* Already enforced; nothing to do. */
	if (dmar_domain->force_snooping)
		return true;

	spin_lock_irqsave(&dmar_domain->lock, flags);
	if (!domain_support_force_snooping(dmar_domain) ||
	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
		spin_unlock_irqrestore(&dmar_domain->lock, flags);
		return false;
	}

	domain_set_force_snooping(dmar_domain);
	dmar_domain->force_snooping = true;
	spin_unlock_irqrestore(&dmar_domain->lock, flags);

	return true;
}
4187
4188static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4189{
4190 struct device_domain_info *info = dev_iommu_priv_get(dev);
4191
4192 switch (cap) {
4193 case IOMMU_CAP_CACHE_COHERENCY:
4194 case IOMMU_CAP_DEFERRED_FLUSH:
4195 return true;
4196 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4197 return dmar_platform_optin();
4198 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4199 return ecap_sc_support(info->iommu->ecap);
4200 case IOMMU_CAP_DIRTY_TRACKING:
4201 return ssads_supported(info->iommu);
4202 default:
4203 return false;
4204 }
4205}
4206
/*
 * probe_device op: look up the IOMMU serving @dev, allocate and populate
 * its device_domain_info (bus/devfn/segment, ATS/PASID/PRI capability
 * bits), allocate a PASID table in scalable mode, and hand the
 * iommu_device back to the core. Returns ERR_PTR on failure.
 */
static struct iommu_device *intel_iommu_probe_device(struct device *dev)
{
	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	u8 bus, devfn;
	int ret;

	iommu = device_lookup_iommu(dev, &bus, &devfn);
	if (!iommu || !iommu->iommu.ops)
		return ERR_PTR(-ENODEV);

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);

	/* Real DMA sub-devices use the requester ID of their parent PCI dev. */
	if (dev_is_real_dma_subdevice(dev)) {
		info->bus = pdev->bus->number;
		info->devfn = pdev->devfn;
		info->segment = pci_domain_nr(pdev->bus);
	} else {
		info->bus = bus;
		info->devfn = devfn;
		info->segment = iommu->segment;
	}

	info->dev = dev;
	info->iommu = iommu;
	if (dev_is_pci(dev)) {
		if (ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_ats_supported(pdev) &&
		    dmar_ats_supported(pdev, iommu)) {
			info->ats_supported = 1;
			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);

			/*
			 * For IOMMU that supports device IOTLB throttling
			 * (DIT), we assign PFSID to the invalidation desc
			 * of a VF such that IOMMU HW can gauge queue depth
			 * at PF level. If DIT is not set, PFSID will be
			 * treated as reserved, which should be set to 0.
			 */
			if (ecap_dit(iommu->ecap))
				info->pfsid = pci_dev_id(pci_physfn(pdev));
			info->ats_qdep = pci_ats_queue_depth(pdev);
		}
		if (sm_supported(iommu)) {
			if (pasid_supported(iommu)) {
				int features = pci_pasid_features(pdev);

				/* Bit 0 marks "supported"; rest are feature bits. */
				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			/* PRI is only usable together with ATS. */
			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_pri_supported(pdev))
				info->pri_supported = 1;
		}
	}

	dev_iommu_priv_set(dev, info);

	/* Scalable mode needs a per-device PASID table. */
	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
		ret = intel_pasid_alloc_table(dev);
		if (ret) {
			dev_err(dev, "PASID table allocation failed\n");
			kfree(info);
			return ERR_PTR(ret);
		}
	}

	intel_iommu_debugfs_create_dev(info);

	return &iommu->iommu;
}
4282
/*
 * release_device op: undo probe_device — detach from the domain, free
 * the PASID table and debugfs entries, and reset the DMA ops.
 */
static void intel_iommu_release_device(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);

	/* Detach and clear translation structures before freeing info. */
	dmar_remove_one_dev_info(dev);
	intel_pasid_free_table(dev);
	intel_iommu_debugfs_remove_dev(info);
	kfree(info);
	set_dma_ops(dev, NULL);
}
4293
/* probe_finalize op: reset, then (re)install the IOMMU DMA API ops. */
static void intel_iommu_probe_finalize(struct device *dev)
{
	set_dma_ops(dev, NULL);
	iommu_setup_dma_ops(dev, 0, U64_MAX);
}
4299
4300static void intel_iommu_get_resv_regions(struct device *device,
4301 struct list_head *head)
4302{
4303 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4304 struct iommu_resv_region *reg;
4305 struct dmar_rmrr_unit *rmrr;
4306 struct device *i_dev;
4307 int i;
4308
4309 rcu_read_lock();
4310 for_each_rmrr_units(rmrr) {
4311 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4312 i, i_dev) {
4313 struct iommu_resv_region *resv;
4314 enum iommu_resv_type type;
4315 size_t length;
4316
4317 if (i_dev != device &&
4318 !is_downstream_to_pci_bridge(device, i_dev))
4319 continue;
4320
4321 length = rmrr->end_address - rmrr->base_address + 1;
4322
4323 type = device_rmrr_is_relaxable(device) ?
4324 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4325
4326 resv = iommu_alloc_resv_region(rmrr->base_address,
4327 length, prot, type,
4328 GFP_ATOMIC);
4329 if (!resv)
4330 break;
4331
4332 list_add_tail(&resv->list, head);
4333 }
4334 }
4335 rcu_read_unlock();
4336
4337#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4338 if (dev_is_pci(device)) {
4339 struct pci_dev *pdev = to_pci_dev(device);
4340
4341 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4342 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4343 IOMMU_RESV_DIRECT_RELAXABLE,
4344 GFP_KERNEL);
4345 if (reg)
4346 list_add_tail(®->list, head);
4347 }
4348 }
4349#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4350
4351 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4352 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4353 0, IOMMU_RESV_MSI, GFP_KERNEL);
4354 if (!reg)
4355 return;
4356 list_add_tail(®->list, head);
4357}
4358
/* device_group op: PCI devices group by topology, others stand alone. */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
4365
4366static int intel_iommu_enable_sva(struct device *dev)
4367{
4368 struct device_domain_info *info = dev_iommu_priv_get(dev);
4369 struct intel_iommu *iommu;
4370
4371 if (!info || dmar_disabled)
4372 return -EINVAL;
4373
4374 iommu = info->iommu;
4375 if (!iommu)
4376 return -EINVAL;
4377
4378 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4379 return -ENODEV;
4380
4381 if (!info->pasid_enabled || !info->ats_enabled)
4382 return -EINVAL;
4383
4384 /*
4385 * Devices having device-specific I/O fault handling should not
4386 * support PCI/PRI. The IOMMU side has no means to check the
4387 * capability of device-specific IOPF. Therefore, IOMMU can only
4388 * default that if the device driver enables SVA on a non-PRI
4389 * device, it will handle IOPF in its own way.
4390 */
4391 if (!info->pri_supported)
4392 return 0;
4393
4394 /* Devices supporting PRI should have it enabled. */
4395 if (!info->pri_enabled)
4396 return -EINVAL;
4397
4398 return 0;
4399}
4400
/*
 * Enable I/O page fault handling for @dev: reset and enable PCI PRI,
 * add the device to the IOMMU's IOPF queue, and register the fault
 * handler. Resources acquired earlier are released in reverse order on
 * failure (goto cleanup). Returns 0 or a negative errno.
 */
static int intel_iommu_enable_iopf(struct device *dev)
{
	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu;
	int ret;

	/* IOPF needs a PCI device with ATS enabled and PRI present. */
	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
		return -ENODEV;

	if (info->pri_enabled)
		return -EBUSY;

	iommu = info->iommu;
	if (!iommu)
		return -EINVAL;

	/* PASID is required in PRG Response Message. */
	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
		return -EINVAL;

	/* Clear any stale page requests before enabling. */
	ret = pci_reset_pri(pdev);
	if (ret)
		return ret;

	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
	if (ret)
		return ret;

	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
	if (ret)
		goto iopf_remove_device;

	ret = pci_enable_pri(pdev, PRQ_DEPTH);
	if (ret)
		goto iopf_unregister_handler;
	info->pri_enabled = 1;

	return 0;

iopf_unregister_handler:
	iommu_unregister_device_fault_handler(dev);
iopf_remove_device:
	iopf_queue_remove_device(iommu->iopf_queue, dev);

	return ret;
}
4448
/*
 * Disable I/O page fault handling for @dev: turn PRI off, then detach
 * the fault handler and remove the device from the IOPF queue. Assumes
 * the driver has quiesced DMA and drained outstanding page requests.
 */
static int intel_iommu_disable_iopf(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;

	if (!info->pri_enabled)
		return -EINVAL;

	/*
	 * PCIe spec states that by clearing PRI enable bit, the Page
	 * Request Interface will not issue new page requests, but has
	 * outstanding page requests that have been transmitted or are
	 * queued for transmission. This is supposed to be called after
	 * the device driver has stopped DMA, all PASIDs have been
	 * unbound and the outstanding PRQs have been drained.
	 */
	pci_disable_pri(to_pci_dev(dev));
	info->pri_enabled = 0;

	/*
	 * With PRI disabled and outstanding PRQs drained, unregistering
	 * fault handler and removing device from iopf queue should never
	 * fail.
	 */
	WARN_ON(iommu_unregister_device_fault_handler(dev));
	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));

	return 0;
}
4478
4479static int
4480intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4481{
4482 switch (feat) {
4483 case IOMMU_DEV_FEAT_IOPF:
4484 return intel_iommu_enable_iopf(dev);
4485
4486 case IOMMU_DEV_FEAT_SVA:
4487 return intel_iommu_enable_sva(dev);
4488
4489 default:
4490 return -ENODEV;
4491 }
4492}
4493
4494static int
4495intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4496{
4497 switch (feat) {
4498 case IOMMU_DEV_FEAT_IOPF:
4499 return intel_iommu_disable_iopf(dev);
4500
4501 case IOMMU_DEV_FEAT_SVA:
4502 return 0;
4503
4504 default:
4505 return -ENODEV;
4506 }
4507}
4508
4509static bool intel_iommu_is_attach_deferred(struct device *dev)
4510{
4511 struct device_domain_info *info = dev_iommu_priv_get(dev);
4512
4513 return translation_pre_enabled(info->iommu) && !info->domain;
4514}
4515
4516/*
4517 * Check that the device does not live on an external facing PCI port that is
4518 * marked as untrusted. Such devices should not be able to apply quirks and
4519 * thus not be able to bypass the IOMMU restrictions.
4520 */
4521static bool risky_device(struct pci_dev *pdev)
4522{
4523 if (pdev->untrusted) {
4524 pci_info(pdev,
4525 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4526 pdev->vendor, pdev->device);
4527 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4528 return true;
4529 }
4530 return false;
4531}
4532
/*
 * Called by the IOMMU core after a range has been mapped. Notifies every
 * IOMMU the domain is attached to via __mapping_notify_one() — presumably
 * needed where hardware caches non-present entries; see that helper.
 */
static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
				      unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long pages = aligned_nrpages(iova, size);
	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
	struct iommu_domain_info *info;
	unsigned long i;

	/* Walk all IOMMUs this domain is attached to. */
	xa_for_each(&dmar_domain->iommu_array, i, info)
		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
	return 0;
}
4546
/*
 * Detach whatever domain is bound to @pasid of @dev: unlink the
 * bookkeeping entry, then unconditionally tear down the PASID table
 * entry and drain in-flight page requests. SVA-type domains carry
 * their own state and are handed off to the SVM code instead.
 */
static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dev_pasid_info *curr, *dev_pasid = NULL;
	struct intel_iommu *iommu = info->iommu;
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;
	unsigned long flags;

	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
	if (WARN_ON_ONCE(!domain))
		goto out_tear_down;

	/*
	 * The SVA implementation needs to handle its own stuffs like the mm
	 * notification. Before consolidating that code into iommu core, let
	 * the intel sva code handle it.
	 */
	if (domain->type == IOMMU_DOMAIN_SVA) {
		intel_svm_remove_dev_pasid(dev, pasid);
		goto out_tear_down;
	}

	dmar_domain = to_dmar_domain(domain);
	spin_lock_irqsave(&dmar_domain->lock, flags);
	/* Find and unlink the entry recorded by intel_iommu_set_dev_pasid(). */
	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
		if (curr->dev == dev && curr->pasid == pasid) {
			list_del(&curr->link_domain);
			dev_pasid = curr;
			break;
		}
	}
	WARN_ON_ONCE(!dev_pasid);
	spin_unlock_irqrestore(&dmar_domain->lock, flags);

	domain_detach_iommu(dmar_domain, iommu);
	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
	kfree(dev_pasid);
out_tear_down:
	/* Always clear the PASID entry and drain outstanding PRQs. */
	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
	intel_drain_pasid_prq(dev, pasid);
}
4589
/*
 * Attach @domain to @pasid of @dev by programming a PASID table entry
 * of the matching translation type (pass-through, first-level or
 * second-level). Undone by intel_iommu_remove_dev_pasid().
 *
 * Returns 0 on success or a negative errno; on failure all intermediate
 * state is rolled back via the goto cleanup chain.
 */
static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
				     struct device *dev, ioasid_t pasid)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu = info->iommu;
	struct dev_pasid_info *dev_pasid;
	unsigned long flags;
	int ret;

	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
		return -EOPNOTSUPP;

	/* Dirty tracking is not supported on this attach path. */
	if (domain->dirty_ops)
		return -EINVAL;

	/* Refuse while the context entry is still a copied one (e.g. kdump). */
	if (context_copied(iommu, info->bus, info->devfn))
		return -EBUSY;

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
	if (!dev_pasid)
		return -ENOMEM;

	ret = domain_attach_iommu(dmar_domain, iommu);
	if (ret)
		goto out_free;

	/* Program the PASID entry according to the domain's translation type. */
	if (domain_type_is_si(dmar_domain))
		ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
	else if (dmar_domain->use_first_level)
		ret = domain_setup_first_level(iommu, dmar_domain,
					       dev, pasid);
	else
		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
						     dev, pasid);
	if (ret)
		goto out_detach_iommu;

	/* Record the binding so remove_dev_pasid() can find it later. */
	dev_pasid->dev = dev;
	dev_pasid->pasid = pasid;
	spin_lock_irqsave(&dmar_domain->lock, flags);
	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
	spin_unlock_irqrestore(&dmar_domain->lock, flags);

	if (domain->type & __IOMMU_DOMAIN_PAGING)
		intel_iommu_debugfs_create_dev_pasid(dev_pasid);

	return 0;
out_detach_iommu:
	domain_detach_iommu(dmar_domain, iommu);
out_free:
	kfree(dev_pasid);
	return ret;
}
4648
4649static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4650{
4651 struct device_domain_info *info = dev_iommu_priv_get(dev);
4652 struct intel_iommu *iommu = info->iommu;
4653 struct iommu_hw_info_vtd *vtd;
4654
4655 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4656 if (!vtd)
4657 return ERR_PTR(-ENOMEM);
4658
4659 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4660 vtd->cap_reg = iommu->cap;
4661 vtd->ecap_reg = iommu->ecap;
4662 *length = sizeof(*vtd);
4663 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4664 return vtd;
4665}
4666
/*
 * Toggle dirty-page tracking for every device attached to @domain.
 * On partial failure, devices already switched are reverted to the
 * domain's still-unchanged previous setting.
 */
static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
					  bool enable)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct device_domain_info *info;
	int ret;

	spin_lock(&dmar_domain->lock);
	/* Already in the requested state; nothing to do. */
	if (dmar_domain->dirty_tracking == enable)
		goto out_unlock;

	list_for_each_entry(info, &dmar_domain->devices, link) {
		ret = intel_pasid_setup_dirty_tracking(info->iommu,
						       info->domain, info->dev,
						       IOMMU_NO_PASID, enable);
		if (ret)
			goto err_unwind;
	}

	/* Commit the new state only after every device succeeded. */
	dmar_domain->dirty_tracking = enable;
out_unlock:
	spin_unlock(&dmar_domain->lock);

	return 0;

err_unwind:
	/* Roll all devices back to dmar_domain->dirty_tracking (unchanged). */
	list_for_each_entry(info, &dmar_domain->devices, link)
		intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
						 info->dev, IOMMU_NO_PASID,
						 dmar_domain->dirty_tracking);
	spin_unlock(&dmar_domain->lock);
	return ret;
}
4700
/*
 * Walk [@iova, @iova + @size) and record into @dirty every page whose
 * second-level PTE has its dirty bit set, clearing the bit as we go
 * (subject to @flags; see dma_sl_pte_test_and_clear_dirty()).
 */
static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
					    unsigned long iova, size_t size,
					    unsigned long flags,
					    struct iommu_dirty_bitmap *dirty)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long end = iova + size - 1;	/* inclusive end of range */
	unsigned long pgsize;

	/*
	 * IOMMUFD core calls into a dirty tracking disabled domain without an
	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
	 * have occurred when we stopped dirty tracking. This ensures that we
	 * never inherit dirtied bits from a previous cycle.
	 */
	if (!dmar_domain->dirty_tracking && dirty->bitmap)
		return -EINVAL;

	do {
		struct dma_pte *pte;
		int lvl = 0;

		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
				     GFP_ATOMIC);
		/* Advance by the mapping size at the level the walk stopped. */
		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
		if (!pte || !dma_pte_present(pte)) {
			iova += pgsize;
			continue;
		}

		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}
4738
/* Dirty-page tracking callbacks installed on domains with dirty_ops. */
static const struct iommu_dirty_ops intel_dirty_ops = {
	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
};
4743
/* Driver-wide iommu_ops table published to the IOMMU core. */
const struct iommu_ops intel_iommu_ops = {
	.blocked_domain		= &blocking_domain,
	.capable		= intel_iommu_capable,
	.hw_info		= intel_iommu_hw_info,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_alloc_user	= intel_iommu_domain_alloc_user,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
	.pgsize_bitmap		= SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.page_response		= intel_svm_page_response,
#endif
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev		= intel_iommu_attach_device,
		.set_dev_pasid		= intel_iommu_set_dev_pasid,
		.map_pages		= intel_iommu_map_pages,
		.unmap_pages		= intel_iommu_unmap_pages,
		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
		.flush_iotlb_all        = intel_flush_iotlb_all,
		.iotlb_sync		= intel_iommu_tlb_sync,
		.iova_to_phys		= intel_iommu_iova_to_phys,
		.free			= intel_iommu_domain_free,
		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
	}
};
4777
4778static void quirk_iommu_igfx(struct pci_dev *dev)
4779{
4780 if (risky_device(dev))
4781 return;
4782
4783 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4784 dmar_map_gfx = 0;
4785}
4786
/* G4x/GM45 integrated gfx dmar support is totally busted. */
/* (The same chipset IDs also receive the RWBF quirk further below.) */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4821
4822static void quirk_iommu_rwbf(struct pci_dev *dev)
4823{
4824 if (risky_device(dev))
4825 return;
4826
4827 /*
4828 * Mobile 4 Series Chipset neglects to set RWBF capability,
4829 * but needs it. Same seems to hold for the desktop versions.
4830 */
4831 pci_info(dev, "Forcing write-buffer flush capability\n");
4832 rwbf_quirk = 1;
4833}
4834
/* Same G4x/GM45 chipset IDs as the graphics quirk list earlier in this file. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4842
/*
 * GGC: PCI config-space register (offset 0x52) on the host bridges quirked
 * below. The fields of interest hold the BIOS-assigned graphics memory
 * size and the VT enable bit; decoded by quirk_calpella_no_shadow_gtt().
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4852
4853static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4854{
4855 unsigned short ggc;
4856
4857 if (risky_device(dev))
4858 return;
4859
4860 if (pci_read_config_word(dev, GGC, &ggc))
4861 return;
4862
4863 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4864 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4865 dmar_map_gfx = 0;
4866 } else if (dmar_map_gfx) {
4867 /* we have to ensure the gfx device is idle before we flush */
4868 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4869 iommu_set_dma_strict();
4870 }
4871}
/* Host-bridge IDs of the Ironlake-era platforms handled above. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4876
/*
 * For certain integrated graphics devices, set iommu_skip_te_disable so
 * translation is left enabled when the driver would otherwise disable it.
 * The version bytes below appear to select specific iGPU device families —
 * NOTE(review): confirm against the relevant device-ID lists before
 * extending this set.
 */
static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	unsigned short ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	/* High byte of the PCI device ID selects the device family. */
	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
/* Match any Intel device; the handler itself filters by class and device ID. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4897
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	/* Read the isoch control register (config offset 0x188). */
	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		/* Work around the deadlock by identity-mapping Azalia. */
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}
4966
4967/*
4968 * Here we deal with a device TLB defect where device may inadvertently issue ATS
4969 * invalidation completion before posted writes initiated with translated address
4970 * that utilized translations matching the invalidation address range, violating
4971 * the invalidation completion ordering.
4972 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4973 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4974 * under the control of the trusted/privileged host device driver must use this
4975 * quirk.
4976 * Device TLBs are invalidated under the following six conditions:
4977 * 1. Device driver does DMA API unmap IOVA
4978 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4979 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4980 * exit_mmap() due to crash
4981 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4982 * VM has to free pages that were unmapped
4983 * 5. Userspace driver unmaps a DMA buffer
4984 * 6. Cache invalidation in vSVA usage (upcoming)
4985 *
4986 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4987 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4988 * invalidate TLB the same way as normal user unmap which will use this quirk.
4989 * The dTLB invalidation after PASID cache flush does not need this quirk.
4990 *
4991 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4992 */
void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
			       unsigned long address, unsigned long mask,
			       u32 pasid, u16 qdep)
{
	u16 sid;

	/* Fast path: device does not need the extra invalidation. */
	if (likely(!info->dtlb_extra_inval))
		return;

	sid = PCI_DEVID(info->bus, info->devfn);
	/* Issue the extra dTLB invalidation, with or without a PASID. */
	if (pasid == IOMMU_NO_PASID) {
		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				   qdep, address, mask);
	} else {
		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
					 pasid, qdep, address, mask);
	}
}
5011
/* Extract the status code field from an ECRSP register value. */
#define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)

/*
 * Function to submit a command to the enhanced command interface. The
 * valid enhanced command descriptions are defined in Table 47 of the
 * VT-d spec. The VT-d hardware implementation may support some but not
 * all commands, which can be determined by checking the Enhanced
 * Command Capability Register.
 *
 * Return values:
 *  - 0: Command successful without any error;
 *  - Negative: software error value;
 *  - Nonzero positive: failure status code defined in Table 48.
 */
int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
{
	unsigned long flags;
	u64 res;
	int ret;

	if (!cap_ecmds(iommu->cap))
		return -ENODEV;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);

	/* A previous ecmd is still in progress; don't pile a new one on. */
	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
	if (res & DMA_ECMD_ECRSP_IP) {
		ret = -EBUSY;
		goto err;
	}

	/*
	 * Unconditionally write the operand B, because
	 * - There is no side effect if an ecmd doesn't require an
	 *   operand B, but we set the register to some value.
	 * - It's not invoked in any critical path. The extra MMIO
	 *   write doesn't bring any performance concerns.
	 */
	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));

	/* Spin until hardware clears the in-progress bit (or we time out). */
	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
		      !(res & DMA_ECMD_ECRSP_IP), res);

	if (res & DMA_ECMD_ECRSP_IP) {
		ret = -ETIMEDOUT;
		goto err;
	}

	ret = ecmd_get_status_code(res);
err:
	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);

	return ret;
}