/*
 * Source: Linux kernel mirror (for testing),
 * git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git, at tag v4.4.
 * AMD IOMMU driver (excerpt of a 4014-line, 94 kB file).
 */
/*
 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
 * Author: Joerg Roedel <jroedel@suse.de>
 *         Leo Duran <leo.duran@amd.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <linux/ratelimit.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
#include <linux/iommu-helper.h>
#include <linux/iommu.h>
#include <linux/delay.h>
#include <linux/amd-iommu.h>
#include <linux/notifier.h>
#include <linux/export.h>
#include <linux/irq.h>
#include <linux/msi.h>
#include <linux/dma-contiguous.h>
#include <linux/irqdomain.h>
#include <asm/irq_remapping.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
#include <asm/hw_irq.h>
#include <asm/msidef.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/dma.h>

#include "amd_iommu_proto.h"
#include "amd_iommu_types.h"
#include "irq_remapping.h"

#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))

#define LOOP_TIMEOUT	100000

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * 512GB pages are not supported due to a hardware bug.
 */
#define AMD_IOMMU_PGSIZES	((~0xFFFUL) & ~(2ULL << 38))
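
/*
 * Annotation (not part of the original file): a worked example of the
 * bitmap above. Bit n set means a page size of 2^n bytes is advertised.
 * ~0xFFFUL sets every bit from 12 upwards (4kB, 8kB, ... all larger
 * powers of two), and ~(2ULL << 38) then clears bit 39, the broken
 * 512GB page size. The IOMMU core will therefore never ask this driver
 * to map a region as a single 512GB page.
 */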
static DEFINE_RWLOCK(amd_iommu_devtable_lock);

/* List of all available dev_data structures */
static LIST_HEAD(dev_data_list);
static DEFINE_SPINLOCK(dev_data_list_lock);

LIST_HEAD(ioapic_map);
LIST_HEAD(hpet_map);

/*
 * Domain for untranslated devices - only allocated
 * if iommu=pt passed on kernel cmd line.
 */
static const struct iommu_ops amd_iommu_ops;

static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
int amd_iommu_max_glx_val = -1;

static struct dma_map_ops amd_iommu_dma_ops;

/*
 * This struct contains device specific data for the IOMMU
 */
struct iommu_dev_data {
	struct list_head list;		  /* For domain->dev_list */
	struct list_head dev_data_list;	  /* For global dev_data_list */
	struct protection_domain *domain; /* Domain the device is bound to */
	u16 devid;			  /* PCI Device ID */
	bool iommu_v2;			  /* Device can make use of IOMMUv2 */
	bool passthrough;		  /* Device is identity mapped */
	struct {
		bool enabled;
		int qdep;
	} ats;				  /* ATS state */
	bool pri_tlp;			  /* PASID TLB required for
					     PPR completions */
	u32 errata;			  /* Bitmap for errata to apply */
};

/*
 * general struct to manage commands sent to an IOMMU
 */
struct iommu_cmd {
	u32 data[4];
};

struct kmem_cache *amd_iommu_irq_cache;

static void update_domain(struct protection_domain *domain);
static int protection_domain_init(struct protection_domain *domain);

/****************************************************************************
 *
 * Helper functions
 *
 ****************************************************************************/

static struct protection_domain *to_pdomain(struct iommu_domain *dom)
{
	return container_of(dom, struct protection_domain, domain);
}

static struct iommu_dev_data *alloc_dev_data(u16 devid)
{
	struct iommu_dev_data *dev_data;
	unsigned long flags;

	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
	if (!dev_data)
		return NULL;

	dev_data->devid = devid;

	spin_lock_irqsave(&dev_data_list_lock, flags);
	list_add_tail(&dev_data->dev_data_list, &dev_data_list);
	spin_unlock_irqrestore(&dev_data_list_lock, flags);

	return dev_data;
}

static struct iommu_dev_data *search_dev_data(u16 devid)
{
	struct iommu_dev_data *dev_data;
	unsigned long flags;

	spin_lock_irqsave(&dev_data_list_lock, flags);
	list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
		if (dev_data->devid == devid)
			goto out_unlock;
	}

	dev_data = NULL;

out_unlock:
	spin_unlock_irqrestore(&dev_data_list_lock, flags);

	return dev_data;
}

static struct iommu_dev_data *find_dev_data(u16 devid)
{
	struct iommu_dev_data *dev_data;

	dev_data = search_dev_data(devid);

	if (dev_data == NULL)
		dev_data = alloc_dev_data(devid);

	return dev_data;
}

static inline u16 get_device_id(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);

	return PCI_DEVID(pdev->bus->number, pdev->devfn);
}
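
/*
 * Annotation (not part of the original file): PCI_DEVID() packs bus and
 * devfn into the 16-bit id used to index the device table. Example: a
 * device at 01:02.3 has devfn = PCI_DEVFN(2, 3) = (2 << 3) | 3 = 0x13,
 * so devid = (0x01 << 8) | 0x13 = 0x0113.
 */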
static struct iommu_dev_data *get_dev_data(struct device *dev)
{
	return dev->archdata.iommu;
}

static bool pci_iommuv2_capable(struct pci_dev *pdev)
{
	static const int caps[] = {
		PCI_EXT_CAP_ID_ATS,
		PCI_EXT_CAP_ID_PRI,
		PCI_EXT_CAP_ID_PASID,
	};
	int i, pos;

	for (i = 0; i < 3; ++i) {
		pos = pci_find_ext_capability(pdev, caps[i]);
		if (pos == 0)
			return false;
	}

	return true;
}

static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum)
{
	struct iommu_dev_data *dev_data;

	dev_data = get_dev_data(&pdev->dev);

	return dev_data->errata & (1 << erratum) ? true : false;
}

/*
 * This function actually applies the mapping to the page table of the
 * dma_ops domain.
 */
static void alloc_unity_mapping(struct dma_ops_domain *dma_dom,
				struct unity_map_entry *e)
{
	u64 addr;

	for (addr = e->address_start; addr < e->address_end;
	     addr += PAGE_SIZE) {
		if (addr < dma_dom->aperture_size)
			__set_bit(addr >> PAGE_SHIFT,
				  dma_dom->aperture[0]->bitmap);
	}
}

/*
 * Inits the unity mappings required for a specific device
 */
static void init_unity_mappings_for_device(struct device *dev,
					   struct dma_ops_domain *dma_dom)
{
	struct unity_map_entry *e;
	u16 devid;

	devid = get_device_id(dev);

	list_for_each_entry(e, &amd_iommu_unity_map, list) {
		if (!(devid >= e->devid_start && devid <= e->devid_end))
			continue;
		alloc_unity_mapping(dma_dom, e);
	}
}

/*
 * This function checks if the driver got a valid device from the caller to
 * avoid dereferencing invalid pointers.
 */
static bool check_device(struct device *dev)
{
	u16 devid;

	if (!dev || !dev->dma_mask)
		return false;

	/* No PCI device */
	if (!dev_is_pci(dev))
		return false;

	devid = get_device_id(dev);

	/* Out of our scope? */
	if (devid > amd_iommu_last_bdf)
		return false;

	if (amd_iommu_rlookup_table[devid] == NULL)
		return false;

	return true;
}

static void init_iommu_group(struct device *dev)
{
	struct dma_ops_domain *dma_domain;
	struct iommu_domain *domain;
	struct iommu_group *group;

	group = iommu_group_get_for_dev(dev);
	if (IS_ERR(group))
		return;

	domain = iommu_group_default_domain(group);
	if (!domain)
		goto out;

	dma_domain = to_pdomain(domain)->priv;

	init_unity_mappings_for_device(dev, dma_domain);
out:
	iommu_group_put(group);
}

static int iommu_init_device(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iommu_dev_data *dev_data;

	if (dev->archdata.iommu)
		return 0;

	dev_data = find_dev_data(get_device_id(dev));
	if (!dev_data)
		return -ENOMEM;

	if (pci_iommuv2_capable(pdev)) {
		struct amd_iommu *iommu;

		iommu = amd_iommu_rlookup_table[dev_data->devid];
		dev_data->iommu_v2 = iommu->is_iommu_v2;
	}

	dev->archdata.iommu = dev_data;

	iommu_device_link(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev,
			  dev);

	return 0;
}

static void iommu_ignore_device(struct device *dev)
{
	u16 devid, alias;

	devid = get_device_id(dev);
	alias = amd_iommu_alias_table[devid];

	memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
	memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));

	amd_iommu_rlookup_table[devid] = NULL;
	amd_iommu_rlookup_table[alias] = NULL;
}

static void iommu_uninit_device(struct device *dev)
{
	struct iommu_dev_data *dev_data = search_dev_data(get_device_id(dev));

	if (!dev_data)
		return;

	iommu_device_unlink(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev,
			    dev);

	iommu_group_remove_device(dev);

	/* Remove dma-ops */
	dev->archdata.dma_ops = NULL;

	/*
	 * We keep dev_data around for unplugged devices and reuse it when the
	 * device is re-plugged - not doing so would introduce a ton of races.
	 */
}
#ifdef CONFIG_AMD_IOMMU_STATS

/*
 * Initialization code for statistics collection
 */

DECLARE_STATS_COUNTER(compl_wait);
DECLARE_STATS_COUNTER(cnt_map_single);
DECLARE_STATS_COUNTER(cnt_unmap_single);
DECLARE_STATS_COUNTER(cnt_map_sg);
DECLARE_STATS_COUNTER(cnt_unmap_sg);
DECLARE_STATS_COUNTER(cnt_alloc_coherent);
DECLARE_STATS_COUNTER(cnt_free_coherent);
DECLARE_STATS_COUNTER(cross_page);
DECLARE_STATS_COUNTER(domain_flush_single);
DECLARE_STATS_COUNTER(domain_flush_all);
DECLARE_STATS_COUNTER(alloced_io_mem);
DECLARE_STATS_COUNTER(total_map_requests);
DECLARE_STATS_COUNTER(complete_ppr);
DECLARE_STATS_COUNTER(invalidate_iotlb);
DECLARE_STATS_COUNTER(invalidate_iotlb_all);
DECLARE_STATS_COUNTER(pri_requests);

static struct dentry *stats_dir;
static struct dentry *de_fflush;

static void amd_iommu_stats_add(struct __iommu_counter *cnt)
{
	if (stats_dir == NULL)
		return;

	cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
				       &cnt->value);
}

static void amd_iommu_stats_init(void)
{
	stats_dir = debugfs_create_dir("amd-iommu", NULL);
	if (stats_dir == NULL)
		return;

	de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
					&amd_iommu_unmap_flush);

	amd_iommu_stats_add(&compl_wait);
	amd_iommu_stats_add(&cnt_map_single);
	amd_iommu_stats_add(&cnt_unmap_single);
	amd_iommu_stats_add(&cnt_map_sg);
	amd_iommu_stats_add(&cnt_unmap_sg);
	amd_iommu_stats_add(&cnt_alloc_coherent);
	amd_iommu_stats_add(&cnt_free_coherent);
	amd_iommu_stats_add(&cross_page);
	amd_iommu_stats_add(&domain_flush_single);
	amd_iommu_stats_add(&domain_flush_all);
	amd_iommu_stats_add(&alloced_io_mem);
	amd_iommu_stats_add(&total_map_requests);
	amd_iommu_stats_add(&complete_ppr);
	amd_iommu_stats_add(&invalidate_iotlb);
	amd_iommu_stats_add(&invalidate_iotlb_all);
	amd_iommu_stats_add(&pri_requests);
}

#endif

/****************************************************************************
 *
 * Interrupt handling functions
 *
 ****************************************************************************/

static void dump_dte_entry(u16 devid)
{
	int i;

	for (i = 0; i < 4; ++i)
		pr_err("AMD-Vi: DTE[%d]: %016llx\n", i,
		       amd_iommu_dev_table[devid].data[i]);
}

static void dump_command(unsigned long phys_addr)
{
	struct iommu_cmd *cmd = phys_to_virt(phys_addr);
	int i;

	for (i = 0; i < 4; ++i)
		pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
}
static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
{
	int type, devid, domid, flags;
	volatile u32 *event = __evt;
	int count = 0;
	u64 address;

retry:
	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
	domid   = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
	address = (u64)(((u64)event[3]) << 32) | event[2];

	if (type == 0) {
		/* Did we hit the erratum? */
		if (++count == LOOP_TIMEOUT) {
			pr_err("AMD-Vi: No event written to event log\n");
			return;
		}
		udelay(1);
		goto retry;
	}

	printk(KERN_ERR "AMD-Vi: Event logged [");

	switch (type) {
	case EVENT_TYPE_ILL_DEV:
		printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
		       "address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address, flags);
		dump_dte_entry(devid);
		break;
	case EVENT_TYPE_IO_FAULT:
		printk("IO_PAGE_FAULT device=%02x:%02x.%x "
		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       domid, address, flags);
		break;
	case EVENT_TYPE_DEV_TAB_ERR:
		printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
		       "address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address, flags);
		break;
	case EVENT_TYPE_PAGE_TAB_ERR:
		printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       domid, address, flags);
		break;
	case EVENT_TYPE_ILL_CMD:
		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
		dump_command(address);
		break;
	case EVENT_TYPE_CMD_HARD_ERR:
		printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
		       "flags=0x%04x]\n", address, flags);
		break;
	case EVENT_TYPE_IOTLB_INV_TO:
		printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
		       "address=0x%016llx]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address);
		break;
	case EVENT_TYPE_INV_DEV_REQ:
		printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
		       "address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address, flags);
		break;
	default:
		printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
	}

	memset(__evt, 0, 4 * sizeof(u32));
}

static void iommu_poll_events(struct amd_iommu *iommu)
{
	u32 head, tail;

	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);

	while (head != tail) {
		iommu_print_event(iommu, iommu->evt_buf + head);
		head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
	}

	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
}
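
/*
 * Annotation (not part of the original file): the event log is a ring
 * buffer of 16-byte entries (four u32s, as parsed above). Hardware
 * advances the tail as it logs events; the loop above consumes entries,
 * stepping the head by EVENT_ENTRY_SIZE modulo EVT_BUFFER_SIZE so it
 * wraps back to 0 at the end of the buffer, and finally writes the new
 * head back so hardware may reuse the space.
 */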
static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
{
	struct amd_iommu_fault fault;

	INC_STATS_COUNTER(pri_requests);

	if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
		pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n");
		return;
	}

	fault.address   = raw[1];
	fault.pasid     = PPR_PASID(raw[0]);
	fault.device_id = PPR_DEVID(raw[0]);
	fault.tag       = PPR_TAG(raw[0]);
	fault.flags     = PPR_FLAGS(raw[0]);

	atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
}

static void iommu_poll_ppr_log(struct amd_iommu *iommu)
{
	u32 head, tail;

	if (iommu->ppr_log == NULL)
		return;

	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);

	while (head != tail) {
		volatile u64 *raw;
		u64 entry[2];
		int i;

		raw = (u64 *)(iommu->ppr_log + head);

		/*
		 * Hardware bug: Interrupt may arrive before the entry is
		 * written to memory. If this happens we need to wait for the
		 * entry to arrive.
		 */
		for (i = 0; i < LOOP_TIMEOUT; ++i) {
			if (PPR_REQ_TYPE(raw[0]) != 0)
				break;
			udelay(1);
		}

		/* Avoid memcpy function-call overhead */
		entry[0] = raw[0];
		entry[1] = raw[1];

		/*
		 * To detect the hardware bug we need to clear the entry
		 * back to zero.
		 */
		raw[0] = raw[1] = 0UL;

		/* Update head pointer of hardware ring-buffer */
		head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
		writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);

		/* Handle PPR entry */
		iommu_handle_ppr_entry(iommu, entry);

		/* Refresh ring-buffer information */
		head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
		tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
	}
}
irqreturn_t amd_iommu_int_thread(int irq, void *data)
{
	struct amd_iommu *iommu = (struct amd_iommu *) data;
	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);

	while (status & (MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK)) {
		/* Enable EVT and PPR interrupts again */
		writel((MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK),
			iommu->mmio_base + MMIO_STATUS_OFFSET);

		if (status & MMIO_STATUS_EVT_INT_MASK) {
			pr_devel("AMD-Vi: Processing IOMMU Event Log\n");
			iommu_poll_events(iommu);
		}

		if (status & MMIO_STATUS_PPR_INT_MASK) {
			pr_devel("AMD-Vi: Processing IOMMU PPR Log\n");
			iommu_poll_ppr_log(iommu);
		}

		/*
		 * Hardware bug: ERBT1312
		 * When re-enabling the interrupt (by writing 1
		 * to clear the bit), the hardware might also try to set
		 * the interrupt bit in the event status register.
		 * In this scenario, the bit will be set, and disable
		 * subsequent interrupts.
		 *
		 * Workaround: The IOMMU driver should read back the
		 * status register and check if the interrupt bits are cleared.
		 * If not, the driver will need to go through the interrupt
		 * handler again and re-clear the bits.
		 */
		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
	}
	return IRQ_HANDLED;
}

irqreturn_t amd_iommu_int_handler(int irq, void *data)
{
	return IRQ_WAKE_THREAD;
}

/****************************************************************************
 *
 * IOMMU command queuing functions
 *
 ****************************************************************************/

static int wait_on_sem(volatile u64 *sem)
{
	int i = 0;

	while (*sem == 0 && i < LOOP_TIMEOUT) {
		udelay(1);
		i += 1;
	}

	if (i == LOOP_TIMEOUT) {
		pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
		return -EIO;
	}

	return 0;
}

static void copy_cmd_to_buffer(struct amd_iommu *iommu,
			       struct iommu_cmd *cmd,
			       u32 tail)
{
	u8 *target;

	target = iommu->cmd_buf + tail;
	tail   = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;

	/* Copy command to buffer */
	memcpy(target, cmd, sizeof(*cmd));

	/* Tell the IOMMU about it */
	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
}

static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
{
	WARN_ON(address & 0x7ULL);

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
	cmd->data[1] = upper_32_bits(__pa(address));
	cmd->data[2] = 1;
	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
}

static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = devid;
	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
}

static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
				  size_t size, u16 domid, int pde)
{
	u64 pages;
	bool s;

	pages = iommu_num_pages(address, size, PAGE_SIZE);
	s     = false;

	if (pages > 1) {
		/*
		 * If we have to flush more than one page, flush all
		 * TLB entries for this domain
		 */
		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
		s = true;
	}

	address &= PAGE_MASK;

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[1] |= domid;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[3]  = upper_32_bits(address);
	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
	if (s) /* size bit - we flush more than one 4kb page */
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
}
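
/*
 * Annotation (not part of the original file): example of the encoding
 * above. Flushing one 4kB page at 0x12345000 leaves s == false and the
 * command invalidates exactly that page. Flushing 8kB makes pages > 1,
 * so the address is replaced by CMD_INV_IOMMU_ALL_PAGES_ADDRESS and the
 * size bit is set: the whole TLB for the domain is flushed instead of
 * trying to encode an arbitrary range.
 */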
static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
				  u64 address, size_t size)
{
	u64 pages;
	bool s;

	pages = iommu_num_pages(address, size, PAGE_SIZE);
	s     = false;

	if (pages > 1) {
		/*
		 * If we have to flush more than one page, flush all
		 * TLB entries for this domain
		 */
		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
		s = true;
	}

	address &= PAGE_MASK;

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0]  = devid;
	cmd->data[0] |= (qdep & 0xff) << 24;
	cmd->data[1]  = devid;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[3]  = upper_32_bits(address);
	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
	if (s)
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
}

static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid,
				  u64 address, bool size)
{
	memset(cmd, 0, sizeof(*cmd));

	address &= ~(0xfffULL);

	cmd->data[0]  = pasid;
	cmd->data[1]  = domid;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[3]  = upper_32_bits(address);
	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
	if (size)
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
}

static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid,
				  int qdep, u64 address, bool size)
{
	memset(cmd, 0, sizeof(*cmd));

	address &= ~(0xfffULL);

	cmd->data[0]  = devid;
	cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
	cmd->data[0] |= (qdep  & 0xff) << 24;
	cmd->data[1]  = devid;
	cmd->data[1] |= (pasid & 0xff) << 16;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
	cmd->data[3]  = upper_32_bits(address);
	if (size)
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
}

static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid,
			       int status, int tag, bool gn)
{
	memset(cmd, 0, sizeof(*cmd));

	cmd->data[0]  = devid;
	if (gn) {
		cmd->data[1]  = pasid;
		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
	}
	cmd->data[3]  = tag & 0x1ff;
	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;

	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
}

static void build_inv_all(struct iommu_cmd *cmd)
{
	memset(cmd, 0, sizeof(*cmd));
	CMD_SET_TYPE(cmd, CMD_INV_ALL);
}

static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = devid;
	CMD_SET_TYPE(cmd, CMD_INV_IRT);
}
/*
 * Writes the command to the IOMMU's command buffer and informs the
 * hardware about the new command.
 */
static int iommu_queue_command_sync(struct amd_iommu *iommu,
				    struct iommu_cmd *cmd,
				    bool sync)
{
	u32 left, tail, head, next_tail;
	unsigned long flags;

again:
	spin_lock_irqsave(&iommu->lock, flags);

	head      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
	tail      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
	next_tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
	left      = (head - next_tail) % CMD_BUFFER_SIZE;

	if (left <= 2) {
		struct iommu_cmd sync_cmd;
		volatile u64 sem = 0;
		int ret;

		build_completion_wait(&sync_cmd, (u64)&sem);
		copy_cmd_to_buffer(iommu, &sync_cmd, tail);

		spin_unlock_irqrestore(&iommu->lock, flags);

		if ((ret = wait_on_sem(&sem)) != 0)
			return ret;

		goto again;
	}

	copy_cmd_to_buffer(iommu, cmd, tail);

	/* We need to sync now to make sure all commands are processed */
	iommu->need_sync = sync;

	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
{
	return iommu_queue_command_sync(iommu, cmd, true);
}
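
/*
 * Annotation (not part of the original file): free space in the command
 * ring is (head - next_tail) mod CMD_BUFFER_SIZE, relying on unsigned
 * wrap-around. Hypothetical numbers for illustration, assuming a
 * 4096-byte buffer and 16-byte commands: head = 0x00, tail = 0x10 gives
 * next_tail = 0x20 and left = (0x00 - 0x20) % 0x1000 = 0xfe0 bytes.
 * When left drops to 2 or less, a COMPLETION_WAIT is queued instead,
 * the caller spins until the IOMMU drains the buffer, and then retries.
 */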
/*
 * This function queues a completion wait command into the command
 * buffer of an IOMMU
 */
static int iommu_completion_wait(struct amd_iommu *iommu)
{
	struct iommu_cmd cmd;
	volatile u64 sem = 0;
	int ret;

	if (!iommu->need_sync)
		return 0;

	build_completion_wait(&cmd, (u64)&sem);

	ret = iommu_queue_command_sync(iommu, &cmd, false);
	if (ret)
		return ret;

	return wait_on_sem(&sem);
}

static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
{
	struct iommu_cmd cmd;

	build_inv_dte(&cmd, devid);

	return iommu_queue_command(iommu, &cmd);
}

static void iommu_flush_dte_all(struct amd_iommu *iommu)
{
	u32 devid;

	for (devid = 0; devid <= 0xffff; ++devid)
		iommu_flush_dte(iommu, devid);

	iommu_completion_wait(iommu);
}

/*
 * This function uses heavy locking and may disable irqs for some time. But
 * this is no issue because it is only called during resume.
 */
static void iommu_flush_tlb_all(struct amd_iommu *iommu)
{
	u32 dom_id;

	for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
		struct iommu_cmd cmd;
		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
				      dom_id, 1);
		iommu_queue_command(iommu, &cmd);
	}

	iommu_completion_wait(iommu);
}

static void iommu_flush_all(struct amd_iommu *iommu)
{
	struct iommu_cmd cmd;

	build_inv_all(&cmd);

	iommu_queue_command(iommu, &cmd);
	iommu_completion_wait(iommu);
}

static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
{
	struct iommu_cmd cmd;

	build_inv_irt(&cmd, devid);

	iommu_queue_command(iommu, &cmd);
}

static void iommu_flush_irt_all(struct amd_iommu *iommu)
{
	u32 devid;

	for (devid = 0; devid <= MAX_DEV_TABLE_ENTRIES; devid++)
		iommu_flush_irt(iommu, devid);

	iommu_completion_wait(iommu);
}

void iommu_flush_all_caches(struct amd_iommu *iommu)
{
	if (iommu_feature(iommu, FEATURE_IA)) {
		iommu_flush_all(iommu);
	} else {
		iommu_flush_dte_all(iommu);
		iommu_flush_irt_all(iommu);
		iommu_flush_tlb_all(iommu);
	}
}

/*
 * Command send function for flushing on-device TLB
 */
static int device_flush_iotlb(struct iommu_dev_data *dev_data,
			      u64 address, size_t size)
{
	struct amd_iommu *iommu;
	struct iommu_cmd cmd;
	int qdep;

	qdep  = dev_data->ats.qdep;
	iommu = amd_iommu_rlookup_table[dev_data->devid];

	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);

	return iommu_queue_command(iommu, &cmd);
}

/*
 * Command send function for invalidating a device table entry
 */
static int device_flush_dte(struct iommu_dev_data *dev_data)
{
	struct amd_iommu *iommu;
	u16 alias;
	int ret;

	iommu = amd_iommu_rlookup_table[dev_data->devid];
	alias = amd_iommu_alias_table[dev_data->devid];

	ret = iommu_flush_dte(iommu, dev_data->devid);
	if (!ret && alias != dev_data->devid)
		ret = iommu_flush_dte(iommu, alias);
	if (ret)
		return ret;

	if (dev_data->ats.enabled)
		ret = device_flush_iotlb(dev_data, 0, ~0UL);

	return ret;
}

/*
 * TLB invalidation function which is called from the mapping functions.
 * It invalidates a single PTE if the range to flush is within a single
 * page. Otherwise it flushes the whole TLB of the IOMMU.
 */
static void __domain_flush_pages(struct protection_domain *domain,
				 u64 address, size_t size, int pde)
{
	struct iommu_dev_data *dev_data;
	struct iommu_cmd cmd;
	int ret = 0, i;

	build_inv_iommu_pages(&cmd, address, size, domain->id, pde);

	for (i = 0; i < amd_iommus_present; ++i) {
		if (!domain->dev_iommu[i])
			continue;

		/*
		 * Devices of this domain are behind this IOMMU
		 * We need a TLB flush
		 */
		ret |= iommu_queue_command(amd_iommus[i], &cmd);
	}

	list_for_each_entry(dev_data, &domain->dev_list, list) {

		if (!dev_data->ats.enabled)
			continue;

		ret |= device_flush_iotlb(dev_data, address, size);
	}

	WARN_ON(ret);
}

static void domain_flush_pages(struct protection_domain *domain,
			       u64 address, size_t size)
{
	__domain_flush_pages(domain, address, size, 0);
}

/* Flush the whole IO/TLB for a given protection domain */
static void domain_flush_tlb(struct protection_domain *domain)
{
	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
}

/* Flush the whole IO/TLB for a given protection domain - including PDE */
static void domain_flush_tlb_pde(struct protection_domain *domain)
{
	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
}

static void domain_flush_complete(struct protection_domain *domain)
{
	int i;

	for (i = 0; i < amd_iommus_present; ++i) {
		if (!domain->dev_iommu[i])
			continue;

		/*
		 * Devices of this domain are behind this IOMMU
		 * We need to wait for completion of all commands.
		 */
		iommu_completion_wait(amd_iommus[i]);
	}
}


/*
 * This function flushes the DTEs for all devices in domain
 */
static void domain_flush_devices(struct protection_domain *domain)
{
	struct iommu_dev_data *dev_data;

	list_for_each_entry(dev_data, &domain->dev_list, list)
		device_flush_dte(dev_data);
}

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct protection_domain *domain,
				   gfp_t gfp)
{
	u64 *pte;

	if (domain->mode == PAGE_MODE_6_LEVEL)
		/* address space already 64 bit large */
		return false;

	pte = (void *)get_zeroed_page(gfp);
	if (!pte)
		return false;

	*pte             = PM_LEVEL_PDE(domain->mode,
					virt_to_phys(domain->pt_root));
	domain->pt_root  = pte;
	domain->mode    += 1;
	domain->updated  = true;

	return true;
}
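
/*
 * Annotation (not part of the original file): each page-table level holds
 * 512 (2^9) entries, so with 4kB leaf pages a tree of depth n covers
 * 2^(12 + 9n) bytes: 2MB at one level, 1GB at two, 512GB at three, and
 * so on. increase_address_space() above hangs the old pt_root below a
 * newly allocated root page, buying 9 more address bits per call until
 * PAGE_MODE_6_LEVEL caps the space at the full 64 bits.
 */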
static u64 *alloc_pte(struct protection_domain *domain,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp)
{
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (address > PM_LEVEL_SIZE(domain->mode))
		increase_address_space(domain, gfp);

	level   = domain->mode - 1;
	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		if (!IOMMU_PTE_PRESENT(*pte)) {
			page = (u64 *)get_zeroed_page(gfp);
			if (!page)
				return NULL;
			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
		}

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(*pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct protection_domain *domain,
		      unsigned long address,
		      unsigned long *page_size)
{
	int level;
	u64 *pte;

	if (address > PM_LEVEL_SIZE(domain->mode))
		return NULL;

	level	   = domain->mode - 1;
	pte	   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
	*page_size = PTE_LEVEL_PAGE_SIZE(level);

	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == 7 ||
		    PM_PTE_LEVEL(*pte) == 0)
			break;

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte	   = IOMMU_PTE_PAGE(*pte);
		pte	   = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	if (PM_PTE_LEVEL(*pte) == 0x07) {
		unsigned long pte_mask;

		/*
		 * If we have a series of large PTEs, make
		 * sure to return a pointer to the first one.
		 */
		*page_size = pte_mask = PTE_PAGE_SIZE(*pte);
		pte_mask   = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
		pte        = (u64 *)(((unsigned long)pte) & pte_mask);
	}

	return pte;
}
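
/*
 * Annotation (not part of the original file): example for the large-PTE
 * case above, assuming a 32kB mapping stored as 8 consecutive level-7
 * PTEs. PTE_PAGE_SIZE(*pte) recovers 0x8000, PAGE_SIZE_PTE_COUNT of
 * that is 8, so pte_mask = ~((8 << 3) - 1) = ~0x3f, and masking the
 * (8-byte-entry) pointer rounds it down to the first of the 8 identical
 * entries, which is what callers walking or clearing the range expect.
 */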
/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
static int iommu_map_page(struct protection_domain *dom,
			  unsigned long bus_addr,
			  unsigned long phys_addr,
			  int prot,
			  unsigned long page_size)
{
	u64 __pte, *pte;
	int i, count;

	BUG_ON(!IS_ALIGNED(bus_addr, page_size));
	BUG_ON(!IS_ALIGNED(phys_addr, page_size));

	if (!(prot & IOMMU_PROT_MASK))
		return -EINVAL;

	count = PAGE_SIZE_PTE_COUNT(page_size);
	pte   = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);

	if (!pte)
		return -ENOMEM;

	for (i = 0; i < count; ++i)
		if (IOMMU_PTE_PRESENT(pte[i]))
			return -EBUSY;

	if (count > 1) {
		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
	} else
		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;

	if (prot & IOMMU_PROT_IR)
		__pte |= IOMMU_PTE_IR;
	if (prot & IOMMU_PROT_IW)
		__pte |= IOMMU_PTE_IW;

	for (i = 0; i < count; ++i)
		pte[i] = __pte;

	update_domain(dom);

	return 0;
}
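
/*
 * Annotation (not part of the original file): for page sizes that fall
 * between the native levels the PTE is replicated. Mapping 32kB, for
 * instance, yields count = 8: one level-7 encoded __pte value is
 * computed once and written into 8 consecutive slots, which is exactly
 * the layout fetch_pte() above unwinds when it rounds a pointer back to
 * the first entry of such a series.
 */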
static unsigned long iommu_unmap_page(struct protection_domain *dom,
				      unsigned long bus_addr,
				      unsigned long page_size)
{
	unsigned long long unmapped;
	unsigned long unmap_size;
	u64 *pte;

	BUG_ON(!is_power_of_2(page_size));

	unmapped = 0;

	while (unmapped < page_size) {

		pte = fetch_pte(dom, bus_addr, &unmap_size);

		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		}

		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	BUG_ON(unmapped && !is_power_of_2(unmapped));

	return unmapped;
}

/****************************************************************************
 *
 * The next functions belong to the address allocator for the dma_ops
 * interface functions. They work like the allocators in the other IOMMU
 * drivers. It's basically a bitmap which marks the allocated pages in
 * the aperture. Maybe it could be enhanced in the future to a more
 * efficient allocator.
 *
 ****************************************************************************/

/*
 * The address allocator core functions.
 *
 * called with domain->lock held
 */

/*
 * Used to reserve address ranges in the aperture (e.g. for exclusion
 * ranges).
 */
static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
				      unsigned long start_page,
				      unsigned int pages)
{
	unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;

	if (start_page + pages > last_page)
		pages = last_page - start_page;

	for (i = start_page; i < start_page + pages; ++i) {
		int index = i / APERTURE_RANGE_PAGES;
		int page  = i % APERTURE_RANGE_PAGES;
		__set_bit(page, dom->aperture[index]->bitmap);
	}
}

/*
 * This function is used to add a new aperture range to an existing
 * aperture in case of dma_ops domain allocation or address allocation
 * failure.
 */
static int alloc_new_range(struct dma_ops_domain *dma_dom,
			   bool populate, gfp_t gfp)
{
	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
	struct amd_iommu *iommu;
	unsigned long i, old_size, pte_pgsize;

#ifdef CONFIG_IOMMU_STRESS
	populate = false;
#endif

	if (index >= APERTURE_MAX_RANGES)
		return -ENOMEM;

	dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
	if (!dma_dom->aperture[index])
		return -ENOMEM;

	dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
	if (!dma_dom->aperture[index]->bitmap)
		goto out_free;

	dma_dom->aperture[index]->offset = dma_dom->aperture_size;

	if (populate) {
		unsigned long address = dma_dom->aperture_size;
		int i, num_ptes = APERTURE_RANGE_PAGES / 512;
		u64 *pte, *pte_page;

		for (i = 0; i < num_ptes; ++i) {
			pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
					&pte_page, gfp);
			if (!pte)
				goto out_free;

			dma_dom->aperture[index]->pte_pages[i] = pte_page;

			address += APERTURE_RANGE_SIZE / 64;
		}
	}

	old_size                = dma_dom->aperture_size;
	dma_dom->aperture_size += APERTURE_RANGE_SIZE;

	/* Reserve address range used for MSI messages */
	if (old_size < MSI_ADDR_BASE_LO &&
	    dma_dom->aperture_size > MSI_ADDR_BASE_LO) {
		unsigned long spage;
		int pages;

		pages = iommu_num_pages(MSI_ADDR_BASE_LO, 0x10000, PAGE_SIZE);
		spage = MSI_ADDR_BASE_LO >> PAGE_SHIFT;

		dma_ops_reserve_addresses(dma_dom, spage, pages);
	}

	/* Initialize the exclusion range if necessary */
	for_each_iommu(iommu) {
		if (iommu->exclusion_start &&
		    iommu->exclusion_start >= dma_dom->aperture[index]->offset
		    && iommu->exclusion_start < dma_dom->aperture_size) {
			unsigned long startpage;
			int pages = iommu_num_pages(iommu->exclusion_start,
						    iommu->exclusion_length,
						    PAGE_SIZE);
			startpage = iommu->exclusion_start >> PAGE_SHIFT;
			dma_ops_reserve_addresses(dma_dom, startpage, pages);
		}
	}

	/*
	 * Check for areas already mapped as present in the new aperture
	 * range and mark those pages as reserved in the allocator. Such
	 * mappings may already exist as a result of requested unity
	 * mappings for devices.
	 */
	for (i = dma_dom->aperture[index]->offset;
	     i < dma_dom->aperture_size;
	     i += pte_pgsize) {
		u64 *pte = fetch_pte(&dma_dom->domain, i, &pte_pgsize);
		if (!pte || !IOMMU_PTE_PRESENT(*pte))
			continue;

		dma_ops_reserve_addresses(dma_dom, i >> PAGE_SHIFT,
					  pte_pgsize >> 12);
	}

	update_domain(&dma_dom->domain);

	return 0;

out_free:
	update_domain(&dma_dom->domain);

	free_page((unsigned long)dma_dom->aperture[index]->bitmap);

	kfree(dma_dom->aperture[index]);
	dma_dom->aperture[index] = NULL;

	return -ENOMEM;
}

static unsigned long dma_ops_area_alloc(struct device *dev,
					struct dma_ops_domain *dom,
					unsigned int pages,
					unsigned long align_mask,
					u64 dma_mask,
					unsigned long start)
{
	unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
	int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
	int i = start >> APERTURE_RANGE_SHIFT;
	unsigned long boundary_size, mask;
	unsigned long address = -1;
	unsigned long limit;

	next_bit >>= PAGE_SHIFT;

	mask = dma_get_seg_boundary(dev);

	boundary_size = mask + 1 ? ALIGN(mask + 1, PAGE_SIZE) >> PAGE_SHIFT :
				   1UL << (BITS_PER_LONG - PAGE_SHIFT);

	for (; i < max_index; ++i) {
		unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;

		if (dom->aperture[i]->offset >= dma_mask)
			break;

		limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
					       dma_mask >> PAGE_SHIFT);

		address = iommu_area_alloc(dom->aperture[i]->bitmap,
					   limit, next_bit, pages, 0,
					   boundary_size, align_mask);
		if (address != -1) {
			address = dom->aperture[i]->offset +
				  (address << PAGE_SHIFT);
			dom->next_address = address + (pages << PAGE_SHIFT);
			break;
		}

		next_bit = 0;
	}

	return address;
}

static unsigned long dma_ops_alloc_addresses(struct device *dev,
					     struct dma_ops_domain *dom,
					     unsigned int pages,
					     unsigned long align_mask,
					     u64 dma_mask)
{
	unsigned long address;

#ifdef CONFIG_IOMMU_STRESS
	dom->next_address = 0;
	dom->need_flush   = true;
#endif

	address = dma_ops_area_alloc(dev, dom, pages, align_mask,
				     dma_mask, dom->next_address);

	if (address == -1) {
		dom->next_address = 0;
		address = dma_ops_area_alloc(dev, dom, pages, align_mask,
					     dma_mask, 0);
		dom->need_flush = true;
	}

	if (unlikely(address == -1))
		address = DMA_ERROR_CODE;

	WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);

	return address;
}

/*
 * The address free function.
 *
 * called with domain->lock held
 */
static void dma_ops_free_addresses(struct dma_ops_domain *dom,
				   unsigned long address,
				   unsigned int pages)
{
	unsigned i = address >> APERTURE_RANGE_SHIFT;
	struct aperture_range *range = dom->aperture[i];

	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);

#ifdef CONFIG_IOMMU_STRESS
	if (i < 4)
		return;
#endif

	if (address >= dom->next_address)
		dom->need_flush = true;

	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;

	bitmap_clear(range->bitmap, address, pages);

}
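
/*
 * Annotation (not part of the original file): dma_get_seg_boundary()
 * returns a mask such as 0xffffffff; boundary_size above is that mask
 * plus one converted to pages (4GB / 4kB = 0x100000), and
 * iommu_area_alloc() will then refuse ranges that cross a multiple of
 * that boundary, since some devices cannot DMA across such a segment
 * boundary in a single transfer.
 */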
/****************************************************************************
 *
 * The next functions belong to the domain allocation. A domain is
 * allocated for every IOMMU as the default domain. If device isolation
 * is enabled, every device gets its own domain. The most important thing
 * about domains is the page table mapping the DMA address space they
 * contain.
 *
 ****************************************************************************/

/*
 * This function adds a protection domain to the global protection domain list
 */
static void add_domain_to_list(struct protection_domain *domain)
{
	unsigned long flags;

	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
	list_add(&domain->list, &amd_iommu_pd_list);
	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
}

/*
 * This function removes a protection domain from the global
 * protection domain list
 */
static void del_domain_from_list(struct protection_domain *domain)
{
	unsigned long flags;

	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
	list_del(&domain->list);
	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
}

static u16 domain_id_alloc(void)
{
	unsigned long flags;
	int id;

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
	BUG_ON(id == 0);
	if (id > 0 && id < MAX_DOMAIN_ID)
		__set_bit(id, amd_iommu_pd_alloc_bitmap);
	else
		id = 0;
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

	return id;
}

static void domain_id_free(int id)
{
	unsigned long flags;

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	if (id > 0 && id < MAX_DOMAIN_ID)
		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}
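
/*
 * Annotation (not part of the original file): domain ids come from a
 * global bitmap. Id 0 doubles as the error value (see the "else id = 0"
 * path above), so bit 0 must always be set before the first allocation;
 * the BUG_ON insists find_first_zero_bit() never hands it out.
 * domain_id_free() simply clears the bit so the id can be reused.
 */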
#define DEFINE_FREE_PT_FN(LVL, FN)				\
static void free_pt_##LVL (unsigned long __pt)			\
{								\
	unsigned long p;					\
	u64 *pt;						\
	int i;							\
								\
	pt = (u64 *)__pt;					\
								\
	for (i = 0; i < 512; ++i) {				\
		/* PTE present? */				\
		if (!IOMMU_PTE_PRESENT(pt[i]))			\
			continue;				\
								\
		/* Large PTE? */				\
		if (PM_PTE_LEVEL(pt[i]) == 0 ||			\
		    PM_PTE_LEVEL(pt[i]) == 7)			\
			continue;				\
								\
		p = (unsigned long)IOMMU_PTE_PAGE(pt[i]);	\
		FN(p);						\
	}							\
	free_page((unsigned long)pt);				\
}

DEFINE_FREE_PT_FN(l2, free_page)
DEFINE_FREE_PT_FN(l3, free_pt_l2)
DEFINE_FREE_PT_FN(l4, free_pt_l3)
DEFINE_FREE_PT_FN(l5, free_pt_l4)
DEFINE_FREE_PT_FN(l6, free_pt_l5)

static void free_pagetable(struct protection_domain *domain)
{
	unsigned long root = (unsigned long)domain->pt_root;

	switch (domain->mode) {
	case PAGE_MODE_NONE:
		break;
	case PAGE_MODE_1_LEVEL:
		free_page(root);
		break;
	case PAGE_MODE_2_LEVEL:
		free_pt_l2(root);
		break;
	case PAGE_MODE_3_LEVEL:
		free_pt_l3(root);
		break;
	case PAGE_MODE_4_LEVEL:
		free_pt_l4(root);
		break;
	case PAGE_MODE_5_LEVEL:
		free_pt_l5(root);
		break;
	case PAGE_MODE_6_LEVEL:
		free_pt_l6(root);
		break;
	default:
		BUG();
	}
}

static void free_gcr3_tbl_level1(u64 *tbl)
{
	u64 *ptr;
	int i;

	for (i = 0; i < 512; ++i) {
		if (!(tbl[i] & GCR3_VALID))
			continue;

		ptr = __va(tbl[i] & PAGE_MASK);

		free_page((unsigned long)ptr);
	}
}

static void free_gcr3_tbl_level2(u64 *tbl)
{
	u64 *ptr;
	int i;

	for (i = 0; i < 512; ++i) {
		if (!(tbl[i] & GCR3_VALID))
			continue;

		ptr = __va(tbl[i] & PAGE_MASK);

		free_gcr3_tbl_level1(ptr);
	}
}

static void free_gcr3_table(struct protection_domain *domain)
{
	if (domain->glx == 2)
		free_gcr3_tbl_level2(domain->gcr3_tbl);
	else if (domain->glx == 1)
		free_gcr3_tbl_level1(domain->gcr3_tbl);
	else
		BUG_ON(domain->glx != 0);

	free_page((unsigned long)domain->gcr3_tbl);
}

/*
 * Free a domain, only used if something went wrong in the
 * allocation path and we need to free an already allocated page table
 */
static void dma_ops_domain_free(struct dma_ops_domain *dom)
{
	int i;

	if (!dom)
		return;

	del_domain_from_list(&dom->domain);

	free_pagetable(&dom->domain);

	for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
		if (!dom->aperture[i])
			continue;
		free_page((unsigned long)dom->aperture[i]->bitmap);
		kfree(dom->aperture[i]);
	}

	kfree(dom);
}

/*
 * Allocates a new protection domain usable for the dma_ops functions.
 * It also initializes the page table and the address allocator data
 * structures required for the dma_ops interface
 */
static struct dma_ops_domain *dma_ops_domain_alloc(void)
{
	struct dma_ops_domain *dma_dom;

	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
	if (!dma_dom)
		return NULL;

	if (protection_domain_init(&dma_dom->domain))
		goto free_dma_dom;

	dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
	dma_dom->domain.flags = PD_DMA_OPS_MASK;
	dma_dom->domain.priv = dma_dom;
	if (!dma_dom->domain.pt_root)
		goto free_dma_dom;

	dma_dom->need_flush = false;

	add_domain_to_list(&dma_dom->domain);

	if (alloc_new_range(dma_dom, true, GFP_KERNEL))
		goto free_dma_dom;

	/*
	 * mark the first page as allocated so we never return 0 as
	 * a valid dma-address, so we can use 0 as the error value
	 */
	dma_dom->aperture[0]->bitmap[0] = 1;
	dma_dom->next_address = 0;


	return dma_dom;

free_dma_dom:
	dma_ops_domain_free(dma_dom);

	return NULL;
}

/*
 * little helper function to check whether a given protection domain is a
 * dma_ops domain
 */
static bool dma_ops_domain(struct protection_domain *domain)
{
	return domain->flags & PD_DMA_OPS_MASK;
}

static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
{
	u64 pte_root = 0;
	u64 flags = 0;

	if (domain->mode != PAGE_MODE_NONE)
		pte_root = virt_to_phys(domain->pt_root);

	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
		    << DEV_ENTRY_MODE_SHIFT;
	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;

	flags = amd_iommu_dev_table[devid].data[1];

	if (ats)
		flags |= DTE_FLAG_IOTLB;

	if (domain->flags & PD_IOMMUV2_MASK) {
		u64 gcr3 = __pa(domain->gcr3_tbl);
		u64 glx  = domain->glx;
		u64 tmp;

		pte_root |= DTE_FLAG_GV;
		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;

		/* First mask out possible old values for GCR3 table */
		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
		flags    &= ~tmp;

		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
		flags    &= ~tmp;

		/* Encode GCR3 table into DTE */
		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
		pte_root |= tmp;

		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
		flags    |= tmp;

		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
		flags    |= tmp;
	}

	flags &= ~(0xffffUL);
	flags |= domain->id;

	amd_iommu_dev_table[devid].data[1]  = flags;
	amd_iommu_dev_table[devid].data[0]  = pte_root;
}
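
/*
 * Annotation (not part of the original file): the GCR3 table pointer
 * does not fit into one contiguous DTE field, so it is split into three
 * slices. As the code above shows, the VAL_A slice lands in the
 * pte_root half (data[0]) while the VAL_B and VAL_C slices go into the
 * flags half (data[1]), after the stale B/C bits are masked out; data[1]
 * is written back first, so the half carrying the valid/mode bits
 * (data[0]) is updated last.
 */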
static void clear_dte_entry(u16 devid)
{
	/* remove entry from the device table seen by the hardware */
	amd_iommu_dev_table[devid].data[0]  = IOMMU_PTE_P | IOMMU_PTE_TV;
	amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK;

	amd_iommu_apply_erratum_63(devid);
}

static void do_attach(struct iommu_dev_data *dev_data,
		      struct protection_domain *domain)
{
	struct amd_iommu *iommu;
	u16 alias;
	bool ats;

	iommu = amd_iommu_rlookup_table[dev_data->devid];
	alias = amd_iommu_alias_table[dev_data->devid];
	ats   = dev_data->ats.enabled;

	/* Update data structures */
	dev_data->domain = domain;
	list_add(&dev_data->list, &domain->dev_list);

	/* Do reference counting */
	domain->dev_iommu[iommu->index] += 1;
	domain->dev_cnt                 += 1;

	/* Update device table */
	set_dte_entry(dev_data->devid, domain, ats);
	if (alias != dev_data->devid)
		set_dte_entry(alias, domain, ats);

	device_flush_dte(dev_data);
}

static void do_detach(struct iommu_dev_data *dev_data)
{
	struct amd_iommu *iommu;
	u16 alias;

	/*
	 * First check if the device is still attached. It might already
	 * be detached from its domain because the generic
	 * iommu_detach_group code detached it and we try again here in
	 * our alias handling.
	 */
	if (!dev_data->domain)
		return;

	iommu = amd_iommu_rlookup_table[dev_data->devid];
	alias = amd_iommu_alias_table[dev_data->devid];

	/* decrease reference counters */
	dev_data->domain->dev_iommu[iommu->index] -= 1;
	dev_data->domain->dev_cnt                 -= 1;

	/* Update data structures */
	dev_data->domain = NULL;
	list_del(&dev_data->list);
	clear_dte_entry(dev_data->devid);
	if (alias != dev_data->devid)
		clear_dte_entry(alias);

	/* Flush the DTE entry */
	device_flush_dte(dev_data);
}

/*
 * If a device is not yet associated with a domain, this function
 * assigns it to the domain and makes the change visible to the hardware
 */
static int __attach_device(struct iommu_dev_data *dev_data,
			   struct protection_domain *domain)
{
	int ret;

	/*
	 * Must be called with IRQs disabled. Warn here to detect early
	 * when it's not.
	 */
	WARN_ON(!irqs_disabled());

	/* lock domain */
	spin_lock(&domain->lock);

	ret = -EBUSY;
	if (dev_data->domain != NULL)
		goto out_unlock;

	/* Attach alias group root */
	do_attach(dev_data, domain);

	ret = 0;

out_unlock:

	/* ready */
	spin_unlock(&domain->lock);

	return ret;
}


static void pdev_iommuv2_disable(struct pci_dev *pdev)
{
	pci_disable_ats(pdev);
	pci_disable_pri(pdev);
	pci_disable_pasid(pdev);
}

/* FIXME: Change generic reset-function to do the same */
static int pri_reset_while_enabled(struct pci_dev *pdev)
{
	u16 control;
	int pos;

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
	if (!pos)
		return -EINVAL;

	pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
	control |= PCI_PRI_CTRL_RESET;
	pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);

	return 0;
}

static int pdev_iommuv2_enable(struct pci_dev *pdev)
{
	bool reset_enable;
	int reqs, ret;

	/* FIXME: Hardcode number of outstanding requests for now */
	reqs = 32;
	if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE))
		reqs = 1;
	reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET);

	/* Only allow access to user-accessible pages */
	ret = pci_enable_pasid(pdev, 0);
	if (ret)
		goto out_err;

	/* First reset the PRI state of the device */
	ret = pci_reset_pri(pdev);
	if (ret)
		goto out_err;

	/* Enable PRI */
	ret = pci_enable_pri(pdev, reqs);
	if (ret)
		goto out_err;

	if (reset_enable) {
		ret = pri_reset_while_enabled(pdev);
		if (ret)
			goto out_err;
	}

	ret = pci_enable_ats(pdev, PAGE_SHIFT);
	if (ret)
		goto out_err;

	return 0;

out_err:
	pci_disable_pri(pdev);
	pci_disable_pasid(pdev);

	return ret;
}
/* FIXME: Move this to PCI code */
#define PCI_PRI_TLP_OFF		(1 << 15)

static bool pci_pri_tlp_required(struct pci_dev *pdev)
{
	u16 status;
	int pos;

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
	if (!pos)
		return false;

	pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status);

	return (status & PCI_PRI_TLP_OFF) ? true : false;
}

/*
 * If a device is not yet associated with a domain, this function
 * assigns it to the domain and makes the change visible to the hardware
 */
static int attach_device(struct device *dev,
			 struct protection_domain *domain)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iommu_dev_data *dev_data;
	unsigned long flags;
	int ret;

	dev_data = get_dev_data(dev);

	if (domain->flags & PD_IOMMUV2_MASK) {
		if (!dev_data->passthrough)
			return -EINVAL;

		if (dev_data->iommu_v2) {
			if (pdev_iommuv2_enable(pdev) != 0)
				return -EINVAL;

			dev_data->ats.enabled = true;
			dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
			dev_data->pri_tlp     = pci_pri_tlp_required(pdev);
		}
	} else if (amd_iommu_iotlb_sup &&
		   pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
		dev_data->ats.enabled = true;
		dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
	}

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	ret = __attach_device(dev_data, domain);
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

	/*
	 * We might boot into a crash-kernel here. The crashed kernel
	 * left the caches in the IOMMU dirty. So we have to flush
	 * here to evict all dirty stuff.
	 */
	domain_flush_tlb_pde(domain);

	return ret;
}

/*
 * Removes a device from a protection domain (unlocked)
 */
static void __detach_device(struct iommu_dev_data *dev_data)
{
	struct protection_domain *domain;

	/*
	 * Must be called with IRQs disabled. Warn here to detect early
	 * when it's not.
	 */
	WARN_ON(!irqs_disabled());

	if (WARN_ON(!dev_data->domain))
		return;

	domain = dev_data->domain;

	spin_lock(&domain->lock);

	do_detach(dev_data);

	spin_unlock(&domain->lock);
}

/*
 * Removes a device from a protection domain (with devtable_lock held)
 */
static void detach_device(struct device *dev)
{
	struct protection_domain *domain;
	struct iommu_dev_data *dev_data;
	unsigned long flags;

	dev_data = get_dev_data(dev);
	domain   = dev_data->domain;

	/* lock device table */
	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	__detach_device(dev_data);
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

	if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
		pdev_iommuv2_disable(to_pci_dev(dev));
	else if (dev_data->ats.enabled)
		pci_disable_ats(to_pci_dev(dev));

	dev_data->ats.enabled = false;
}
up with */ 2197 domain = iommu_get_domain_for_dev(dev); 2198 if (domain->type == IOMMU_DOMAIN_IDENTITY) 2199 dev_data->passthrough = true; 2200 else 2201 dev->archdata.dma_ops = &amd_iommu_dma_ops; 2202 2203out: 2204 iommu_completion_wait(iommu); 2205 2206 return 0; 2207} 2208 2209static void amd_iommu_remove_device(struct device *dev) 2210{ 2211 struct amd_iommu *iommu; 2212 u16 devid; 2213 2214 if (!check_device(dev)) 2215 return; 2216 2217 devid = get_device_id(dev); 2218 iommu = amd_iommu_rlookup_table[devid]; 2219 2220 iommu_uninit_device(dev); 2221 iommu_completion_wait(iommu); 2222} 2223 2224/***************************************************************************** 2225 * 2226 * The next functions belong to the dma_ops mapping/unmapping code. 2227 * 2228 *****************************************************************************/ 2229 2230/* 2231 * In the dma_ops path we only have the struct device. This function 2232 * finds the corresponding IOMMU, the protection domain and the 2233 * requestor id for a given device. 2234 * If the device is not yet associated with a domain this is also done 2235 * in this function. 2236 */ 2237static struct protection_domain *get_domain(struct device *dev) 2238{ 2239 struct protection_domain *domain; 2240 struct iommu_domain *io_domain; 2241 2242 if (!check_device(dev)) 2243 return ERR_PTR(-EINVAL); 2244 2245 io_domain = iommu_get_domain_for_dev(dev); 2246 if (!io_domain) 2247 return NULL; 2248 2249 domain = to_pdomain(io_domain); 2250 if (!dma_ops_domain(domain)) 2251 return ERR_PTR(-EBUSY); 2252 2253 return domain; 2254} 2255 2256static void update_device_table(struct protection_domain *domain) 2257{ 2258 struct iommu_dev_data *dev_data; 2259 2260 list_for_each_entry(dev_data, &domain->dev_list, list) 2261 set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled); 2262} 2263 2264static void update_domain(struct protection_domain *domain) 2265{ 2266 if (!domain->updated) 2267 return; 2268 2269 update_device_table(domain); 2270 2271 domain_flush_devices(domain); 2272 domain_flush_tlb_pde(domain); 2273 2274 domain->updated = false; 2275} 2276 2277/* 2278 * This function fetches the PTE for a given address in the aperture 2279 */ 2280static u64* dma_ops_get_pte(struct dma_ops_domain *dom, 2281 unsigned long address) 2282{ 2283 struct aperture_range *aperture; 2284 u64 *pte, *pte_page; 2285 2286 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; 2287 if (!aperture) 2288 return NULL; 2289 2290 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; 2291 if (!pte) { 2292 pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page, 2293 GFP_ATOMIC); 2294 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; 2295 } else 2296 pte += PM_LEVEL_INDEX(0, address); 2297 2298 update_domain(&dom->domain); 2299 2300 return pte; 2301} 2302 2303/* 2304 * This is the generic map function. It maps one 4kb page at paddr to 2305 * the given address in the DMA address space for the domain. 
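 *
 * Editor's note (not in the original source): __map_single() below is the
 * only caller and drives it one page at a time, roughly:
 *
 *	ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
 *	if (ret == DMA_ERROR_CODE)
 *		goto out_unmap;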
2306 */ 2307static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom, 2308 unsigned long address, 2309 phys_addr_t paddr, 2310 int direction) 2311{ 2312 u64 *pte, __pte; 2313 2314 WARN_ON(address > dom->aperture_size); 2315 2316 paddr &= PAGE_MASK; 2317 2318 pte = dma_ops_get_pte(dom, address); 2319 if (!pte) 2320 return DMA_ERROR_CODE; 2321 2322 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 2323 2324 if (direction == DMA_TO_DEVICE) 2325 __pte |= IOMMU_PTE_IR; 2326 else if (direction == DMA_FROM_DEVICE) 2327 __pte |= IOMMU_PTE_IW; 2328 else if (direction == DMA_BIDIRECTIONAL) 2329 __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW; 2330 2331 WARN_ON(*pte); 2332 2333 *pte = __pte; 2334 2335 return (dma_addr_t)address; 2336} 2337 2338/* 2339 * The generic unmapping function for one page in the DMA address space. 2340 */ 2341static void dma_ops_domain_unmap(struct dma_ops_domain *dom, 2342 unsigned long address) 2343{ 2344 struct aperture_range *aperture; 2345 u64 *pte; 2346 2347 if (address >= dom->aperture_size) 2348 return; 2349 2350 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; 2351 if (!aperture) 2352 return; 2353 2354 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; 2355 if (!pte) 2356 return; 2357 2358 pte += PM_LEVEL_INDEX(0, address); 2359 2360 WARN_ON(!*pte); 2361 2362 *pte = 0ULL; 2363} 2364 2365/* 2366 * This function contains common code for mapping of a physically 2367 * contiguous memory region into DMA address space. It is used by all 2368 * mapping functions provided with this IOMMU driver. 2369 * Must be called with the domain lock held. 2370 */ 2371static dma_addr_t __map_single(struct device *dev, 2372 struct dma_ops_domain *dma_dom, 2373 phys_addr_t paddr, 2374 size_t size, 2375 int dir, 2376 bool align, 2377 u64 dma_mask) 2378{ 2379 dma_addr_t offset = paddr & ~PAGE_MASK; 2380 dma_addr_t address, start, ret; 2381 unsigned int pages; 2382 unsigned long align_mask = 0; 2383 int i; 2384 2385 pages = iommu_num_pages(paddr, size, PAGE_SIZE); 2386 paddr &= PAGE_MASK; 2387 2388 INC_STATS_COUNTER(total_map_requests); 2389 2390 if (pages > 1) 2391 INC_STATS_COUNTER(cross_page); 2392 2393 if (align) 2394 align_mask = (1UL << get_order(size)) - 1; 2395 2396retry: 2397 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 2398 dma_mask); 2399 if (unlikely(address == DMA_ERROR_CODE)) { 2400 /* 2401 * setting next_address here will let the address 2402 * allocator only scan the new allocated range in the 2403 * first run. This is a small optimization.
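 *
 * Editor's note: the retry path below bumps next_address, grows the
 * aperture by 128 MB via alloc_new_range() and jumps back to the
 * 'retry' label; the loop ends when an address is found or when
 * alloc_new_range() fails.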
2404 */ 2405 dma_dom->next_address = dma_dom->aperture_size; 2406 2407 if (alloc_new_range(dma_dom, false, GFP_ATOMIC)) 2408 goto out; 2409 2410 /* 2411 * aperture was successfully enlarged by 128 MB, try 2412 * allocation again 2413 */ 2414 goto retry; 2415 } 2416 2417 start = address; 2418 for (i = 0; i < pages; ++i) { 2419 ret = dma_ops_domain_map(dma_dom, start, paddr, dir); 2420 if (ret == DMA_ERROR_CODE) 2421 goto out_unmap; 2422 2423 paddr += PAGE_SIZE; 2424 start += PAGE_SIZE; 2425 } 2426 address += offset; 2427 2428 ADD_STATS_COUNTER(alloced_io_mem, size); 2429 2430 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 2431 domain_flush_tlb(&dma_dom->domain); 2432 dma_dom->need_flush = false; 2433 } else if (unlikely(amd_iommu_np_cache)) 2434 domain_flush_pages(&dma_dom->domain, address, size); 2435 2436out: 2437 return address; 2438 2439out_unmap: 2440 2441 for (--i; i >= 0; --i) { 2442 start -= PAGE_SIZE; 2443 dma_ops_domain_unmap(dma_dom, start); 2444 } 2445 2446 dma_ops_free_addresses(dma_dom, address, pages); 2447 2448 return DMA_ERROR_CODE; 2449} 2450 2451/* 2452 * Does the reverse of the __map_single function. Must be called with 2453 * the domain lock held too 2454 */ 2455static void __unmap_single(struct dma_ops_domain *dma_dom, 2456 dma_addr_t dma_addr, 2457 size_t size, 2458 int dir) 2459{ 2460 dma_addr_t flush_addr; 2461 dma_addr_t i, start; 2462 unsigned int pages; 2463 2464 if ((dma_addr == DMA_ERROR_CODE) || 2465 (dma_addr + size > dma_dom->aperture_size)) 2466 return; 2467 2468 flush_addr = dma_addr; 2469 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); 2470 dma_addr &= PAGE_MASK; 2471 start = dma_addr; 2472 2473 for (i = 0; i < pages; ++i) { 2474 dma_ops_domain_unmap(dma_dom, start); 2475 start += PAGE_SIZE; 2476 } 2477 2478 SUB_STATS_COUNTER(alloced_io_mem, size); 2479 2480 dma_ops_free_addresses(dma_dom, dma_addr, pages); 2481 2482 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 2483 domain_flush_pages(&dma_dom->domain, flush_addr, size); 2484 dma_dom->need_flush = false; 2485 } 2486} 2487 2488/* 2489 * The exported map_single function for dma_ops. 2490 */ 2491static dma_addr_t map_page(struct device *dev, struct page *page, 2492 unsigned long offset, size_t size, 2493 enum dma_data_direction dir, 2494 struct dma_attrs *attrs) 2495{ 2496 unsigned long flags; 2497 struct protection_domain *domain; 2498 dma_addr_t addr; 2499 u64 dma_mask; 2500 phys_addr_t paddr = page_to_phys(page) + offset; 2501 2502 INC_STATS_COUNTER(cnt_map_single); 2503 2504 domain = get_domain(dev); 2505 if (PTR_ERR(domain) == -EINVAL) 2506 return (dma_addr_t)paddr; 2507 else if (IS_ERR(domain)) 2508 return DMA_ERROR_CODE; 2509 2510 dma_mask = *dev->dma_mask; 2511 2512 spin_lock_irqsave(&domain->lock, flags); 2513 2514 addr = __map_single(dev, domain->priv, paddr, size, dir, false, 2515 dma_mask); 2516 if (addr == DMA_ERROR_CODE) 2517 goto out; 2518 2519 domain_flush_complete(domain); 2520 2521out: 2522 spin_unlock_irqrestore(&domain->lock, flags); 2523 2524 return addr; 2525} 2526 2527/* 2528 * The exported unmap_single function for dma_ops. 
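 *
 * Editor's note: undoes map_page() above; both are wired into the
 * amd_iommu_dma_ops table below as .map_page/.unmap_page.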
2529 */ 2530static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, 2531 enum dma_data_direction dir, struct dma_attrs *attrs) 2532{ 2533 unsigned long flags; 2534 struct protection_domain *domain; 2535 2536 INC_STATS_COUNTER(cnt_unmap_single); 2537 2538 domain = get_domain(dev); 2539 if (IS_ERR(domain)) 2540 return; 2541 2542 spin_lock_irqsave(&domain->lock, flags); 2543 2544 __unmap_single(domain->priv, dma_addr, size, dir); 2545 2546 domain_flush_complete(domain); 2547 2548 spin_unlock_irqrestore(&domain->lock, flags); 2549} 2550 2551/* 2552 * The exported map_sg function for dma_ops (handles scatter-gather 2553 * lists). 2554 */ 2555static int map_sg(struct device *dev, struct scatterlist *sglist, 2556 int nelems, enum dma_data_direction dir, 2557 struct dma_attrs *attrs) 2558{ 2559 unsigned long flags; 2560 struct protection_domain *domain; 2561 int i; 2562 struct scatterlist *s; 2563 phys_addr_t paddr; 2564 int mapped_elems = 0; 2565 u64 dma_mask; 2566 2567 INC_STATS_COUNTER(cnt_map_sg); 2568 2569 domain = get_domain(dev); 2570 if (IS_ERR(domain)) 2571 return 0; 2572 2573 dma_mask = *dev->dma_mask; 2574 2575 spin_lock_irqsave(&domain->lock, flags); 2576 2577 for_each_sg(sglist, s, nelems, i) { 2578 paddr = sg_phys(s); 2579 2580 s->dma_address = __map_single(dev, domain->priv, 2581 paddr, s->length, dir, false, 2582 dma_mask); 2583 2584 if (s->dma_address) { 2585 s->dma_length = s->length; 2586 mapped_elems++; 2587 } else 2588 goto unmap; 2589 } 2590 2591 domain_flush_complete(domain); 2592 2593out: 2594 spin_unlock_irqrestore(&domain->lock, flags); 2595 2596 return mapped_elems; 2597unmap: 2598 for_each_sg(sglist, s, mapped_elems, i) { 2599 if (s->dma_address) 2600 __unmap_single(domain->priv, s->dma_address, 2601 s->dma_length, dir); 2602 s->dma_address = s->dma_length = 0; 2603 } 2604 2605 mapped_elems = 0; 2606 2607 goto out; 2608} 2609 2610/* 2611 * The exported unmap_sg function for dma_ops (handles scatter-gather 2612 * lists). 2613 */ 2614static void unmap_sg(struct device *dev, struct scatterlist *sglist, 2615 int nelems, enum dma_data_direction dir, 2616 struct dma_attrs *attrs) 2617{ 2618 unsigned long flags; 2619 struct protection_domain *domain; 2620 struct scatterlist *s; 2621 int i; 2622 2623 INC_STATS_COUNTER(cnt_unmap_sg); 2624 2625 domain = get_domain(dev); 2626 if (IS_ERR(domain)) 2627 return; 2628 2629 spin_lock_irqsave(&domain->lock, flags); 2630 2631 for_each_sg(sglist, s, nelems, i) { 2632 __unmap_single(domain->priv, s->dma_address, 2633 s->dma_length, dir); 2634 s->dma_address = s->dma_length = 0; 2635 } 2636 2637 domain_flush_complete(domain); 2638 2639 spin_unlock_irqrestore(&domain->lock, flags); 2640} 2641 2642/* 2643 * The exported alloc_coherent function for dma_ops
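 *
 * Editor's note: the allocation strategy below is alloc_pages() with
 * __GFP_NOWARN first, then the CMA fallback dma_alloc_from_contiguous()
 * when the caller is allowed to block.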
2644 */ 2645static void *alloc_coherent(struct device *dev, size_t size, 2646 dma_addr_t *dma_addr, gfp_t flag, 2647 struct dma_attrs *attrs) 2648{ 2649 u64 dma_mask = dev->coherent_dma_mask; 2650 struct protection_domain *domain; 2651 unsigned long flags; 2652 struct page *page; 2653 2654 INC_STATS_COUNTER(cnt_alloc_coherent); 2655 2656 domain = get_domain(dev); 2657 if (PTR_ERR(domain) == -EINVAL) { 2658 page = alloc_pages(flag, get_order(size)); 2659 *dma_addr = page_to_phys(page); 2660 return page_address(page); 2661 } else if (IS_ERR(domain)) 2662 return NULL; 2663 2664 size = PAGE_ALIGN(size); 2665 dma_mask = dev->coherent_dma_mask; 2666 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 2667 flag |= __GFP_ZERO; 2668 2669 page = alloc_pages(flag | __GFP_NOWARN, get_order(size)); 2670 if (!page) { 2671 if (!gfpflags_allow_blocking(flag)) 2672 return NULL; 2673 2674 page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, 2675 get_order(size)); 2676 if (!page) 2677 return NULL; 2678 } 2679 2680 if (!dma_mask) 2681 dma_mask = *dev->dma_mask; 2682 2683 spin_lock_irqsave(&domain->lock, flags); 2684 2685 *dma_addr = __map_single(dev, domain->priv, page_to_phys(page), 2686 size, DMA_BIDIRECTIONAL, true, dma_mask); 2687 2688 if (*dma_addr == DMA_ERROR_CODE) { 2689 spin_unlock_irqrestore(&domain->lock, flags); 2690 goto out_free; 2691 } 2692 2693 domain_flush_complete(domain); 2694 2695 spin_unlock_irqrestore(&domain->lock, flags); 2696 2697 return page_address(page); 2698 2699out_free: 2700 2701 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) 2702 __free_pages(page, get_order(size)); 2703 2704 return NULL; 2705} 2706 2707/* 2708 * The exported free_coherent function for dma_ops. 2709 */ 2710static void free_coherent(struct device *dev, size_t size, 2711 void *virt_addr, dma_addr_t dma_addr, 2712 struct dma_attrs *attrs) 2713{ 2714 struct protection_domain *domain; 2715 unsigned long flags; 2716 struct page *page; 2717 2718 INC_STATS_COUNTER(cnt_free_coherent); 2719 2720 page = virt_to_page(virt_addr); 2721 size = PAGE_ALIGN(size); 2722 2723 domain = get_domain(dev); 2724 if (IS_ERR(domain)) 2725 goto free_mem; 2726 2727 spin_lock_irqsave(&domain->lock, flags); 2728 2729 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2730 2731 domain_flush_complete(domain); 2732 2733 spin_unlock_irqrestore(&domain->lock, flags); 2734 2735free_mem: 2736 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) 2737 __free_pages(page, get_order(size)); 2738} 2739 2740/* 2741 * This function is called by the DMA layer to find out if we can handle a 2742 * particular device. It is part of the dma_ops. 2743 */ 2744static int amd_iommu_dma_supported(struct device *dev, u64 mask) 2745{ 2746 return check_device(dev); 2747} 2748 2749static struct dma_map_ops amd_iommu_dma_ops = { 2750 .alloc = alloc_coherent, 2751 .free = free_coherent, 2752 .map_page = map_page, 2753 .unmap_page = unmap_page, 2754 .map_sg = map_sg, 2755 .unmap_sg = unmap_sg, 2756 .dma_supported = amd_iommu_dma_supported, 2757}; 2758 2759int __init amd_iommu_init_api(void) 2760{ 2761 return bus_set_iommu(&pci_bus_type, &amd_iommu_ops); 2762} 2763 2764int __init amd_iommu_init_dma_ops(void) 2765{ 2766 swiotlb = iommu_pass_through ? 
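	      /* editor's note: pass-through keeps SWIOTLB as the dma_ops fallback */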
1 : 0; 2767 iommu_detected = 1; 2768 2769 /* 2770 * In case we don't initialize SWIOTLB (actually the common case 2771 * when AMD IOMMU is enabled), make sure there are global 2772 * dma_ops set as a fall-back for devices not handled by this 2773 * driver (for example non-PCI devices). 2774 */ 2775 if (!swiotlb) 2776 dma_ops = &nommu_dma_ops; 2777 2778 amd_iommu_stats_init(); 2779 2780 if (amd_iommu_unmap_flush) 2781 pr_info("AMD-Vi: IO/TLB flush on unmap enabled\n"); 2782 else 2783 pr_info("AMD-Vi: Lazy IO/TLB flushing enabled\n"); 2784 2785 return 0; 2786} 2787 2788/***************************************************************************** 2789 * 2790 * The following functions belong to the exported interface of AMD IOMMU 2791 * 2792 * This interface allows access to lower level functions of the IOMMU 2793 * like protection domain handling and assignment of devices to domains 2794 * which is not possible with the dma_ops interface. 2795 * 2796 *****************************************************************************/ 2797 2798static void cleanup_domain(struct protection_domain *domain) 2799{ 2800 struct iommu_dev_data *entry; 2801 unsigned long flags; 2802 2803 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 2804 2805 while (!list_empty(&domain->dev_list)) { 2806 entry = list_first_entry(&domain->dev_list, 2807 struct iommu_dev_data, list); 2808 __detach_device(entry); 2809 } 2810 2811 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2812} 2813 2814static void protection_domain_free(struct protection_domain *domain) 2815{ 2816 if (!domain) 2817 return; 2818 2819 del_domain_from_list(domain); 2820 2821 if (domain->id) 2822 domain_id_free(domain->id); 2823 2824 kfree(domain); 2825} 2826 2827static int protection_domain_init(struct protection_domain *domain) 2828{ 2829 spin_lock_init(&domain->lock); 2830 mutex_init(&domain->api_lock); 2831 domain->id = domain_id_alloc(); 2832 if (!domain->id) 2833 return -ENOMEM; 2834 INIT_LIST_HEAD(&domain->dev_list); 2835 2836 return 0; 2837} 2838 2839static struct protection_domain *protection_domain_alloc(void) 2840{ 2841 struct protection_domain *domain; 2842 2843 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 2844 if (!domain) 2845 return NULL; 2846 2847 if (protection_domain_init(domain)) 2848 goto out_err; 2849 2850 add_domain_to_list(domain); 2851 2852 return domain; 2853 2854out_err: 2855 kfree(domain); 2856 2857 return NULL; 2858} 2859 2860static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) 2861{ 2862 struct protection_domain *pdomain; 2863 struct dma_ops_domain *dma_domain; 2864 2865 switch (type) { 2866 case IOMMU_DOMAIN_UNMANAGED: 2867 pdomain = protection_domain_alloc(); 2868 if (!pdomain) 2869 return NULL; 2870 2871 pdomain->mode = PAGE_MODE_3_LEVEL; 2872 pdomain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); 2873 if (!pdomain->pt_root) { 2874 protection_domain_free(pdomain); 2875 return NULL; 2876 } 2877 2878 pdomain->domain.geometry.aperture_start = 0; 2879 pdomain->domain.geometry.aperture_end = ~0ULL; 2880 pdomain->domain.geometry.force_aperture = true; 2881 2882 break; 2883 case IOMMU_DOMAIN_DMA: 2884 dma_domain = dma_ops_domain_alloc(); 2885 if (!dma_domain) { 2886 pr_err("AMD-Vi: Failed to allocate\n"); 2887 return NULL; 2888 } 2889 pdomain = &dma_domain->domain; 2890 break; 2891 case IOMMU_DOMAIN_IDENTITY: 2892 pdomain = protection_domain_alloc(); 2893 if (!pdomain) 2894 return NULL; 2895 2896 pdomain->mode = PAGE_MODE_NONE; 2897 break; 2898 default: 2899 return NULL; 2900 } 2901 2902 return
&pdomain->domain; 2903} 2904 2905static void amd_iommu_domain_free(struct iommu_domain *dom) 2906{ 2907 struct protection_domain *domain; 2908 2909 if (!dom) 2910 return; 2911 2912 domain = to_pdomain(dom); 2913 2914 if (domain->dev_cnt > 0) 2915 cleanup_domain(domain); 2916 2917 BUG_ON(domain->dev_cnt != 0); 2918 2919 if (domain->mode != PAGE_MODE_NONE) 2920 free_pagetable(domain); 2921 2922 if (domain->flags & PD_IOMMUV2_MASK) 2923 free_gcr3_table(domain); 2924 2925 protection_domain_free(domain); 2926} 2927 2928static void amd_iommu_detach_device(struct iommu_domain *dom, 2929 struct device *dev) 2930{ 2931 struct iommu_dev_data *dev_data = dev->archdata.iommu; 2932 struct amd_iommu *iommu; 2933 u16 devid; 2934 2935 if (!check_device(dev)) 2936 return; 2937 2938 devid = get_device_id(dev); 2939 2940 if (dev_data->domain != NULL) 2941 detach_device(dev); 2942 2943 iommu = amd_iommu_rlookup_table[devid]; 2944 if (!iommu) 2945 return; 2946 2947 iommu_completion_wait(iommu); 2948} 2949 2950static int amd_iommu_attach_device(struct iommu_domain *dom, 2951 struct device *dev) 2952{ 2953 struct protection_domain *domain = to_pdomain(dom); 2954 struct iommu_dev_data *dev_data; 2955 struct amd_iommu *iommu; 2956 int ret; 2957 2958 if (!check_device(dev)) 2959 return -EINVAL; 2960 2961 dev_data = dev->archdata.iommu; 2962 2963 iommu = amd_iommu_rlookup_table[dev_data->devid]; 2964 if (!iommu) 2965 return -EINVAL; 2966 2967 if (dev_data->domain) 2968 detach_device(dev); 2969 2970 ret = attach_device(dev, domain); 2971 2972 iommu_completion_wait(iommu); 2973 2974 return ret; 2975} 2976 2977static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, 2978 phys_addr_t paddr, size_t page_size, int iommu_prot) 2979{ 2980 struct protection_domain *domain = to_pdomain(dom); 2981 int prot = 0; 2982 int ret; 2983 2984 if (domain->mode == PAGE_MODE_NONE) 2985 return -EINVAL; 2986 2987 if (iommu_prot & IOMMU_READ) 2988 prot |= IOMMU_PROT_IR; 2989 if (iommu_prot & IOMMU_WRITE) 2990 prot |= IOMMU_PROT_IW; 2991 2992 mutex_lock(&domain->api_lock); 2993 ret = iommu_map_page(domain, iova, paddr, prot, page_size); 2994 mutex_unlock(&domain->api_lock); 2995 2996 return ret; 2997} 2998 2999static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, 3000 size_t page_size) 3001{ 3002 struct protection_domain *domain = to_pdomain(dom); 3003 size_t unmap_size; 3004 3005 if (domain->mode == PAGE_MODE_NONE) 3006 return -EINVAL; 3007 3008 mutex_lock(&domain->api_lock); 3009 unmap_size = iommu_unmap_page(domain, iova, page_size); 3010 mutex_unlock(&domain->api_lock); 3011 3012 domain_flush_tlb_pde(domain); 3013 3014 return unmap_size; 3015} 3016 3017static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 3018 dma_addr_t iova) 3019{ 3020 struct protection_domain *domain = to_pdomain(dom); 3021 unsigned long offset_mask, pte_pgsize; 3022 u64 *pte, __pte; 3023 3024 if (domain->mode == PAGE_MODE_NONE) 3025 return iova; 3026 3027 pte = fetch_pte(domain, iova, &pte_pgsize); 3028 3029 if (!pte || !IOMMU_PTE_PRESENT(*pte)) 3030 return 0; 3031 3032 offset_mask = pte_pgsize - 1; 3033 __pte = *pte & PM_ADDR_MASK; 3034 3035 return (__pte & ~offset_mask) | (iova & offset_mask); 3036} 3037 3038static bool amd_iommu_capable(enum iommu_cap cap) 3039{ 3040 switch (cap) { 3041 case IOMMU_CAP_CACHE_COHERENCY: 3042 return true; 3043 case IOMMU_CAP_INTR_REMAP: 3044 return (irq_remapping_enabled == 1); 3045 case IOMMU_CAP_NOEXEC: 3046 return false; 3047 } 3048 3049 return false; 3050} 3051 3052static void 
amd_iommu_get_dm_regions(struct device *dev, 3053 struct list_head *head) 3054{ 3055 struct unity_map_entry *entry; 3056 u16 devid; 3057 3058 devid = get_device_id(dev); 3059 3060 list_for_each_entry(entry, &amd_iommu_unity_map, list) { 3061 struct iommu_dm_region *region; 3062 3063 if (devid < entry->devid_start || devid > entry->devid_end) 3064 continue; 3065 3066 region = kzalloc(sizeof(*region), GFP_KERNEL); 3067 if (!region) { 3068 pr_err("Out of memory allocating dm-regions for %s\n", 3069 dev_name(dev)); 3070 return; 3071 } 3072 3073 region->start = entry->address_start; 3074 region->length = entry->address_end - entry->address_start; 3075 if (entry->prot & IOMMU_PROT_IR) 3076 region->prot |= IOMMU_READ; 3077 if (entry->prot & IOMMU_PROT_IW) 3078 region->prot |= IOMMU_WRITE; 3079 3080 list_add_tail(&region->list, head); 3081 } 3082} 3083 3084static void amd_iommu_put_dm_regions(struct device *dev, 3085 struct list_head *head) 3086{ 3087 struct iommu_dm_region *entry, *next; 3088 3089 list_for_each_entry_safe(entry, next, head, list) 3090 kfree(entry); 3091} 3092 3093static const struct iommu_ops amd_iommu_ops = { 3094 .capable = amd_iommu_capable, 3095 .domain_alloc = amd_iommu_domain_alloc, 3096 .domain_free = amd_iommu_domain_free, 3097 .attach_dev = amd_iommu_attach_device, 3098 .detach_dev = amd_iommu_detach_device, 3099 .map = amd_iommu_map, 3100 .unmap = amd_iommu_unmap, 3101 .map_sg = default_iommu_map_sg, 3102 .iova_to_phys = amd_iommu_iova_to_phys, 3103 .add_device = amd_iommu_add_device, 3104 .remove_device = amd_iommu_remove_device, 3105 .device_group = pci_device_group, 3106 .get_dm_regions = amd_iommu_get_dm_regions, 3107 .put_dm_regions = amd_iommu_put_dm_regions, 3108 .pgsize_bitmap = AMD_IOMMU_PGSIZES, 3109}; 3110 3111/***************************************************************************** 3112 * 3113 * The next functions do a basic initialization of IOMMU for pass through 3114 * mode 3115 * 3116 * In passthrough mode the IOMMU is initialized and enabled but not used for 3117 * DMA-API translation. 
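 *
 * Editor's note: concretely, what follows is the exported IOMMUv2
 * (PASID/GCR3) interface consumed by the amd_iommu_v2 driver.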
3118 * 3119*****************************************************************************/ 3120 3121/* IOMMUv2 specific functions */ 3122int amd_iommu_register_ppr_notifier(struct notifier_block *nb) 3123{ 3124 return atomic_notifier_chain_register(&ppr_notifier, nb); 3125} 3126EXPORT_SYMBOL(amd_iommu_register_ppr_notifier); 3127 3128int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb) 3129{ 3130 return atomic_notifier_chain_unregister(&ppr_notifier, nb); 3131} 3132EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier); 3133 3134void amd_iommu_domain_direct_map(struct iommu_domain *dom) 3135{ 3136 struct protection_domain *domain = to_pdomain(dom); 3137 unsigned long flags; 3138 3139 spin_lock_irqsave(&domain->lock, flags); 3140 3141 /* Update data structure */ 3142 domain->mode = PAGE_MODE_NONE; 3143 domain->updated = true; 3144 3145 /* Make changes visible to IOMMUs */ 3146 update_domain(domain); 3147 3148 /* Page-table is not visible to IOMMU anymore, so free it */ 3149 free_pagetable(domain); 3150 3151 spin_unlock_irqrestore(&domain->lock, flags); 3152} 3153EXPORT_SYMBOL(amd_iommu_domain_direct_map); 3154 3155int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) 3156{ 3157 struct protection_domain *domain = to_pdomain(dom); 3158 unsigned long flags; 3159 int levels, ret; 3160 3161 if (pasids <= 0 || pasids > (PASID_MASK + 1)) 3162 return -EINVAL; 3163 3164 /* Number of GCR3 table levels required */ 3165 for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9) 3166 levels += 1; 3167 3168 if (levels > amd_iommu_max_glx_val) 3169 return -EINVAL; 3170 3171 spin_lock_irqsave(&domain->lock, flags); 3172 3173 /* 3174 * Spare us all the sanity checks of whether devices already in the 3175 * domain support IOMMUv2. Just force that the domain has no 3176 * devices attached when it is switched into IOMMUv2 mode. 3177 */ 3178 ret = -EBUSY; 3179 if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK) 3180 goto out; 3181 3182 ret = -ENOMEM; 3183 domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC); 3184 if (domain->gcr3_tbl == NULL) 3185 goto out; 3186 3187 domain->glx = levels; 3188 domain->flags |= PD_IOMMUV2_MASK; 3189 domain->updated = true; 3190 3191 update_domain(domain); 3192 3193 ret = 0; 3194 3195out: 3196 spin_unlock_irqrestore(&domain->lock, flags); 3197 3198 return ret; 3199} 3200EXPORT_SYMBOL(amd_iommu_domain_enable_v2); 3201 3202static int __flush_pasid(struct protection_domain *domain, int pasid, 3203 u64 address, bool size) 3204{ 3205 struct iommu_dev_data *dev_data; 3206 struct iommu_cmd cmd; 3207 int i, ret; 3208 3209 if (!(domain->flags & PD_IOMMUV2_MASK)) 3210 return -EINVAL; 3211 3212 build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size); 3213 3214 /* 3215 * IOMMU TLB needs to be flushed before Device TLB to 3216 * prevent device TLB refill from IOMMU TLB 3217 */ 3218 for (i = 0; i < amd_iommus_present; ++i) { 3219 if (domain->dev_iommu[i] == 0) 3220 continue; 3221 3222 ret = iommu_queue_command(amd_iommus[i], &cmd); 3223 if (ret != 0) 3224 goto out; 3225 } 3226 3227 /* Wait until IOMMU TLB flushes are complete */ 3228 domain_flush_complete(domain); 3229 3230 /* Now flush device TLBs */ 3231 list_for_each_entry(dev_data, &domain->dev_list, list) { 3232 struct amd_iommu *iommu; 3233 int qdep; 3234 3235 /* 3236 * There might be non-IOMMUv2 capable devices in an IOMMUv2 3237 * domain.
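 * Such devices never get ats.enabled set, so the check below
 * quietly skips their device-TLB flush.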
3238 */ 3239 if (!dev_data->ats.enabled) 3240 continue; 3241 3242 qdep = dev_data->ats.qdep; 3243 iommu = amd_iommu_rlookup_table[dev_data->devid]; 3244 3245 build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid, 3246 qdep, address, size); 3247 3248 ret = iommu_queue_command(iommu, &cmd); 3249 if (ret != 0) 3250 goto out; 3251 } 3252 3253 /* Wait until all device TLBs are flushed */ 3254 domain_flush_complete(domain); 3255 3256 ret = 0; 3257 3258out: 3259 3260 return ret; 3261} 3262 3263static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid, 3264 u64 address) 3265{ 3266 INC_STATS_COUNTER(invalidate_iotlb); 3267 3268 return __flush_pasid(domain, pasid, address, false); 3269} 3270 3271int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, 3272 u64 address) 3273{ 3274 struct protection_domain *domain = to_pdomain(dom); 3275 unsigned long flags; 3276 int ret; 3277 3278 spin_lock_irqsave(&domain->lock, flags); 3279 ret = __amd_iommu_flush_page(domain, pasid, address); 3280 spin_unlock_irqrestore(&domain->lock, flags); 3281 3282 return ret; 3283} 3284EXPORT_SYMBOL(amd_iommu_flush_page); 3285 3286static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid) 3287{ 3288 INC_STATS_COUNTER(invalidate_iotlb_all); 3289 3290 return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 3291 true); 3292} 3293 3294int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid) 3295{ 3296 struct protection_domain *domain = to_pdomain(dom); 3297 unsigned long flags; 3298 int ret; 3299 3300 spin_lock_irqsave(&domain->lock, flags); 3301 ret = __amd_iommu_flush_tlb(domain, pasid); 3302 spin_unlock_irqrestore(&domain->lock, flags); 3303 3304 return ret; 3305} 3306EXPORT_SYMBOL(amd_iommu_flush_tlb); 3307 3308static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc) 3309{ 3310 int index; 3311 u64 *pte; 3312 3313 while (true) { 3314 3315 index = (pasid >> (9 * level)) & 0x1ff; 3316 pte = &root[index]; 3317 3318 if (level == 0) 3319 break; 3320 3321 if (!(*pte & GCR3_VALID)) { 3322 if (!alloc) 3323 return NULL; 3324 3325 root = (void *)get_zeroed_page(GFP_ATOMIC); 3326 if (root == NULL) 3327 return NULL; 3328 3329 *pte = __pa(root) | GCR3_VALID; 3330 } 3331 3332 root = __va(*pte & PAGE_MASK); 3333 3334 level -= 1; 3335 } 3336 3337 return pte; 3338} 3339 3340static int __set_gcr3(struct protection_domain *domain, int pasid, 3341 unsigned long cr3) 3342{ 3343 u64 *pte; 3344 3345 if (domain->mode != PAGE_MODE_NONE) 3346 return -EINVAL; 3347 3348 pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true); 3349 if (pte == NULL) 3350 return -ENOMEM; 3351 3352 *pte = (cr3 & PAGE_MASK) | GCR3_VALID; 3353 3354 return __amd_iommu_flush_tlb(domain, pasid); 3355} 3356 3357static int __clear_gcr3(struct protection_domain *domain, int pasid) 3358{ 3359 u64 *pte; 3360 3361 if (domain->mode != PAGE_MODE_NONE) 3362 return -EINVAL; 3363 3364 pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false); 3365 if (pte == NULL) 3366 return 0; 3367 3368 *pte = 0; 3369 3370 return __amd_iommu_flush_tlb(domain, pasid); 3371} 3372 3373int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, 3374 unsigned long cr3) 3375{ 3376 struct protection_domain *domain = to_pdomain(dom); 3377 unsigned long flags; 3378 int ret; 3379 3380 spin_lock_irqsave(&domain->lock, flags); 3381 ret = __set_gcr3(domain, pasid, cr3); 3382 spin_unlock_irqrestore(&domain->lock, flags); 3383 3384 return ret; 3385} 3386EXPORT_SYMBOL(amd_iommu_domain_set_gcr3); 3387 3388int 
amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid) 3389{ 3390 struct protection_domain *domain = to_pdomain(dom); 3391 unsigned long flags; 3392 int ret; 3393 3394 spin_lock_irqsave(&domain->lock, flags); 3395 ret = __clear_gcr3(domain, pasid); 3396 spin_unlock_irqrestore(&domain->lock, flags); 3397 3398 return ret; 3399} 3400EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3); 3401 3402int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, 3403 int status, int tag) 3404{ 3405 struct iommu_dev_data *dev_data; 3406 struct amd_iommu *iommu; 3407 struct iommu_cmd cmd; 3408 3409 INC_STATS_COUNTER(complete_ppr); 3410 3411 dev_data = get_dev_data(&pdev->dev); 3412 iommu = amd_iommu_rlookup_table[dev_data->devid]; 3413 3414 build_complete_ppr(&cmd, dev_data->devid, pasid, status, 3415 tag, dev_data->pri_tlp); 3416 3417 return iommu_queue_command(iommu, &cmd); 3418} 3419EXPORT_SYMBOL(amd_iommu_complete_ppr); 3420 3421struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev) 3422{ 3423 struct protection_domain *pdomain; 3424 3425 pdomain = get_domain(&pdev->dev); 3426 if (IS_ERR(pdomain)) 3427 return NULL; 3428 3429 /* Only return IOMMUv2 domains */ 3430 if (!(pdomain->flags & PD_IOMMUV2_MASK)) 3431 return NULL; 3432 3433 return &pdomain->domain; 3434} 3435EXPORT_SYMBOL(amd_iommu_get_v2_domain); 3436 3437void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum) 3438{ 3439 struct iommu_dev_data *dev_data; 3440 3441 if (!amd_iommu_v2_supported()) 3442 return; 3443 3444 dev_data = get_dev_data(&pdev->dev); 3445 dev_data->errata |= (1 << erratum); 3446} 3447EXPORT_SYMBOL(amd_iommu_enable_device_erratum); 3448 3449int amd_iommu_device_info(struct pci_dev *pdev, 3450 struct amd_iommu_device_info *info) 3451{ 3452 int max_pasids; 3453 int pos; 3454 3455 if (pdev == NULL || info == NULL) 3456 return -EINVAL; 3457 3458 if (!amd_iommu_v2_supported()) 3459 return -EINVAL; 3460 3461 memset(info, 0, sizeof(*info)); 3462 3463 pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS); 3464 if (pos) 3465 info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; 3466 3467 pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); 3468 if (pos) 3469 info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; 3470 3471 pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); 3472 if (pos) { 3473 int features; 3474 3475 max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1)); 3476 max_pasids = min(max_pasids, (1 << 20)); 3477 3478 info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; 3479 info->max_pasids = min(pci_max_pasids(pdev), max_pasids); 3480 3481 features = pci_pasid_features(pdev); 3482 if (features & PCI_PASID_CAP_EXEC) 3483 info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; 3484 if (features & PCI_PASID_CAP_PRIV) 3485 info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; 3486 } 3487 3488 return 0; 3489} 3490EXPORT_SYMBOL(amd_iommu_device_info); 3491 3492#ifdef CONFIG_IRQ_REMAP 3493 3494/***************************************************************************** 3495 * 3496 * Interrupt Remapping Implementation 3497 * 3498 *****************************************************************************/ 3499 3500union irte { 3501 u32 val; 3502 struct { 3503 u32 valid : 1, 3504 no_fault : 1, 3505 int_type : 3, 3506 rq_eoi : 1, 3507 dm : 1, 3508 rsvd_1 : 1, 3509 destination : 8, 3510 vector : 8, 3511 rsvd_2 : 8; 3512 } fields; 3513}; 3514 3515struct irq_2_irte { 3516 u16 devid; /* Device ID for IRTE table */ 3517 u16 index; /* Index into IRTE table*/ 3518}; 3519 3520struct amd_ir_data { 3521 struct irq_2_irte irq_2_irte; 3522 
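	/* editor's note: cached copy of the live IRTE; amd_ir_set_affinity()
	 * below rewrites its vector/destination fields and republishes it
	 * through modify_irte() */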
union irte irte_entry; 3523 union { 3524 struct msi_msg msi_entry; 3525 }; 3526}; 3527 3528static struct irq_chip amd_ir_chip; 3529 3530#define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6) 3531#define DTE_IRQ_REMAP_INTCTL (2ULL << 60) 3532#define DTE_IRQ_TABLE_LEN (8ULL << 1) 3533#define DTE_IRQ_REMAP_ENABLE 1ULL 3534 3535static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table) 3536{ 3537 u64 dte; 3538 3539 dte = amd_iommu_dev_table[devid].data[2]; 3540 dte &= ~DTE_IRQ_PHYS_ADDR_MASK; 3541 dte |= virt_to_phys(table->table); 3542 dte |= DTE_IRQ_REMAP_INTCTL; 3543 dte |= DTE_IRQ_TABLE_LEN; 3544 dte |= DTE_IRQ_REMAP_ENABLE; 3545 3546 amd_iommu_dev_table[devid].data[2] = dte; 3547} 3548 3549#define IRTE_ALLOCATED (~1U) 3550 3551static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic) 3552{ 3553 struct irq_remap_table *table = NULL; 3554 struct amd_iommu *iommu; 3555 unsigned long flags; 3556 u16 alias; 3557 3558 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 3559 3560 iommu = amd_iommu_rlookup_table[devid]; 3561 if (!iommu) 3562 goto out_unlock; 3563 3564 table = irq_lookup_table[devid]; 3565 if (table) 3566 goto out; 3567 3568 alias = amd_iommu_alias_table[devid]; 3569 table = irq_lookup_table[alias]; 3570 if (table) { 3571 irq_lookup_table[devid] = table; 3572 set_dte_irq_entry(devid, table); 3573 iommu_flush_dte(iommu, devid); 3574 goto out; 3575 } 3576 3577 /* Nothing there yet, allocate new irq remapping table */ 3578 table = kzalloc(sizeof(*table), GFP_ATOMIC); 3579 if (!table) 3580 goto out; 3581 3582 /* Initialize table spin-lock */ 3583 spin_lock_init(&table->lock); 3584 3585 if (ioapic) 3586 /* Keep the first 32 indexes free for IOAPIC interrupts */ 3587 table->min_index = 32; 3588 3589 table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC); 3590 if (!table->table) { 3591 kfree(table); 3592 table = NULL; 3593 goto out; 3594 } 3595 3596 memset(table->table, 0, MAX_IRQS_PER_TABLE * sizeof(u32)); 3597 3598 if (ioapic) { 3599 int i; 3600 3601 for (i = 0; i < 32; ++i) 3602 table->table[i] = IRTE_ALLOCATED; 3603 } 3604 3605 irq_lookup_table[devid] = table; 3606 set_dte_irq_entry(devid, table); 3607 iommu_flush_dte(iommu, devid); 3608 if (devid != alias) { 3609 irq_lookup_table[alias] = table; 3610 set_dte_irq_entry(alias, table); 3611 iommu_flush_dte(iommu, alias); 3612 } 3613 3614out: 3615 iommu_completion_wait(iommu); 3616 3617out_unlock: 3618 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 3619 3620 return table; 3621} 3622 3623static int alloc_irq_index(u16 devid, int count) 3624{ 3625 struct irq_remap_table *table; 3626 unsigned long flags; 3627 int index, c; 3628 3629 table = get_irq_table(devid, false); 3630 if (!table) 3631 return -ENODEV; 3632 3633 spin_lock_irqsave(&table->lock, flags); 3634 3635 /* Scan table for free entries */ 3636 for (c = 0, index = table->min_index; 3637 index < MAX_IRQS_PER_TABLE; 3638 ++index) { 3639 if (table->table[index] == 0) 3640 c += 1; 3641 else 3642 c = 0; 3643 3644 if (c == count) { 3645 for (; c != 0; --c) 3646 table->table[index - c + 1] = IRTE_ALLOCATED; 3647 3648 index -= count - 1; 3649 goto out; 3650 } 3651 } 3652 3653 index = -ENOSPC; 3654 3655out: 3656 spin_unlock_irqrestore(&table->lock, flags); 3657 3658 return index; 3659} 3660 3661static int modify_irte(u16 devid, int index, union irte irte) 3662{ 3663 struct irq_remap_table *table; 3664 struct amd_iommu *iommu; 3665 unsigned long flags; 3666 3667 iommu = amd_iommu_rlookup_table[devid]; 3668 if (iommu == NULL) 3669 return -EINVAL; 
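	/*
	 * Editor's note: the update protocol below is write the entry
	 * under table->lock, then iommu_flush_irt() plus a completion
	 * wait so the IOMMU drops any cached copy of the old IRTE.
	 */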
3670 3671 table = get_irq_table(devid, false); 3672 if (!table) 3673 return -ENOMEM; 3674 3675 spin_lock_irqsave(&table->lock, flags); 3676 table->table[index] = irte.val; 3677 spin_unlock_irqrestore(&table->lock, flags); 3678 3679 iommu_flush_irt(iommu, devid); 3680 iommu_completion_wait(iommu); 3681 3682 return 0; 3683} 3684 3685static void free_irte(u16 devid, int index) 3686{ 3687 struct irq_remap_table *table; 3688 struct amd_iommu *iommu; 3689 unsigned long flags; 3690 3691 iommu = amd_iommu_rlookup_table[devid]; 3692 if (iommu == NULL) 3693 return; 3694 3695 table = get_irq_table(devid, false); 3696 if (!table) 3697 return; 3698 3699 spin_lock_irqsave(&table->lock, flags); 3700 table->table[index] = 0; 3701 spin_unlock_irqrestore(&table->lock, flags); 3702 3703 iommu_flush_irt(iommu, devid); 3704 iommu_completion_wait(iommu); 3705} 3706 3707static int get_devid(struct irq_alloc_info *info) 3708{ 3709 int devid = -1; 3710 3711 switch (info->type) { 3712 case X86_IRQ_ALLOC_TYPE_IOAPIC: 3713 devid = get_ioapic_devid(info->ioapic_id); 3714 break; 3715 case X86_IRQ_ALLOC_TYPE_HPET: 3716 devid = get_hpet_devid(info->hpet_id); 3717 break; 3718 case X86_IRQ_ALLOC_TYPE_MSI: 3719 case X86_IRQ_ALLOC_TYPE_MSIX: 3720 devid = get_device_id(&info->msi_dev->dev); 3721 break; 3722 default: 3723 BUG_ON(1); 3724 break; 3725 } 3726 3727 return devid; 3728} 3729 3730static struct irq_domain *get_ir_irq_domain(struct irq_alloc_info *info) 3731{ 3732 struct amd_iommu *iommu; 3733 int devid; 3734 3735 if (!info) 3736 return NULL; 3737 3738 devid = get_devid(info); 3739 if (devid >= 0) { 3740 iommu = amd_iommu_rlookup_table[devid]; 3741 if (iommu) 3742 return iommu->ir_domain; 3743 } 3744 3745 return NULL; 3746} 3747 3748static struct irq_domain *get_irq_domain(struct irq_alloc_info *info) 3749{ 3750 struct amd_iommu *iommu; 3751 int devid; 3752 3753 if (!info) 3754 return NULL; 3755 3756 switch (info->type) { 3757 case X86_IRQ_ALLOC_TYPE_MSI: 3758 case X86_IRQ_ALLOC_TYPE_MSIX: 3759 devid = get_device_id(&info->msi_dev->dev); 3760 if (devid >= 0) { 3761 iommu = amd_iommu_rlookup_table[devid]; 3762 if (iommu) 3763 return iommu->msi_domain; 3764 } 3765 break; 3766 default: 3767 break; 3768 } 3769 3770 return NULL; 3771} 3772 3773struct irq_remap_ops amd_iommu_irq_ops = { 3774 .prepare = amd_iommu_prepare, 3775 .enable = amd_iommu_enable, 3776 .disable = amd_iommu_disable, 3777 .reenable = amd_iommu_reenable, 3778 .enable_faulting = amd_iommu_enable_faulting, 3779 .get_ir_irq_domain = get_ir_irq_domain, 3780 .get_irq_domain = get_irq_domain, 3781}; 3782 3783static void irq_remapping_prepare_irte(struct amd_ir_data *data, 3784 struct irq_cfg *irq_cfg, 3785 struct irq_alloc_info *info, 3786 int devid, int index, int sub_handle) 3787{ 3788 struct irq_2_irte *irte_info = &data->irq_2_irte; 3789 struct msi_msg *msg = &data->msi_entry; 3790 union irte *irte = &data->irte_entry; 3791 struct IO_APIC_route_entry *entry; 3792 3793 data->irq_2_irte.devid = devid; 3794 data->irq_2_irte.index = index + sub_handle; 3795 3796 /* Setup IRTE for IOMMU */ 3797 irte->val = 0; 3798 irte->fields.vector = irq_cfg->vector; 3799 irte->fields.int_type = apic->irq_delivery_mode; 3800 irte->fields.destination = irq_cfg->dest_apicid; 3801 irte->fields.dm = apic->irq_dest_mode; 3802 irte->fields.valid = 1; 3803 3804 switch (info->type) { 3805 case X86_IRQ_ALLOC_TYPE_IOAPIC: 3806 /* Setup IOAPIC entry */ 3807 entry = info->ioapic_entry; 3808 info->ioapic_entry = NULL; 3809 memset(entry, 0, sizeof(*entry)); 3810 entry->vector = index; 3811 
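		/*
		 * Editor's note: with remapping active the RTE vector
		 * field carries the IRTE index; the real CPU vector was
		 * stored in irte->fields.vector above.
		 */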
entry->mask = 0; 3812 entry->trigger = info->ioapic_trigger; 3813 entry->polarity = info->ioapic_polarity; 3814 /* Mask level triggered irqs. */ 3815 if (info->ioapic_trigger) 3816 entry->mask = 1; 3817 break; 3818 3819 case X86_IRQ_ALLOC_TYPE_HPET: 3820 case X86_IRQ_ALLOC_TYPE_MSI: 3821 case X86_IRQ_ALLOC_TYPE_MSIX: 3822 msg->address_hi = MSI_ADDR_BASE_HI; 3823 msg->address_lo = MSI_ADDR_BASE_LO; 3824 msg->data = irte_info->index; 3825 break; 3826 3827 default: 3828 BUG_ON(1); 3829 break; 3830 } 3831} 3832 3833static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, 3834 unsigned int nr_irqs, void *arg) 3835{ 3836 struct irq_alloc_info *info = arg; 3837 struct irq_data *irq_data; 3838 struct amd_ir_data *data; 3839 struct irq_cfg *cfg; 3840 int i, ret, devid; 3841 int index = -1; 3842 3843 if (!info) 3844 return -EINVAL; 3845 if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI && 3846 info->type != X86_IRQ_ALLOC_TYPE_MSIX) 3847 return -EINVAL; 3848 3849 /* 3850 * With IRQ remapping enabled, don't need contiguous CPU vectors 3851 * to support multiple MSI interrupts. 3852 */ 3853 if (info->type == X86_IRQ_ALLOC_TYPE_MSI) 3854 info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; 3855 3856 devid = get_devid(info); 3857 if (devid < 0) 3858 return -EINVAL; 3859 3860 ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); 3861 if (ret < 0) 3862 return ret; 3863 3864 if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) { 3865 if (get_irq_table(devid, true)) 3866 index = info->ioapic_pin; 3867 else 3868 ret = -ENOMEM; 3869 } else { 3870 index = alloc_irq_index(devid, nr_irqs); 3871 } 3872 if (index < 0) { 3873 pr_warn("Failed to allocate IRTE\n"); 3874 goto out_free_parent; 3875 } 3876 3877 for (i = 0; i < nr_irqs; i++) { 3878 irq_data = irq_domain_get_irq_data(domain, virq + i); 3879 cfg = irqd_cfg(irq_data); 3880 if (!irq_data || !cfg) { 3881 ret = -EINVAL; 3882 goto out_free_data; 3883 } 3884 3885 ret = -ENOMEM; 3886 data = kzalloc(sizeof(*data), GFP_KERNEL); 3887 if (!data) 3888 goto out_free_data; 3889 3890 irq_data->hwirq = (devid << 16) + i; 3891 irq_data->chip_data = data; 3892 irq_data->chip = &amd_ir_chip; 3893 irq_remapping_prepare_irte(data, cfg, info, devid, index, i); 3894 irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT); 3895 } 3896 3897 return 0; 3898 3899out_free_data: 3900 for (i--; i >= 0; i--) { 3901 irq_data = irq_domain_get_irq_data(domain, virq + i); 3902 if (irq_data) 3903 kfree(irq_data->chip_data); 3904 } 3905 for (i = 0; i < nr_irqs; i++) 3906 free_irte(devid, index + i); 3907out_free_parent: 3908 irq_domain_free_irqs_common(domain, virq, nr_irqs); 3909 return ret; 3910} 3911 3912static void irq_remapping_free(struct irq_domain *domain, unsigned int virq, 3913 unsigned int nr_irqs) 3914{ 3915 struct irq_2_irte *irte_info; 3916 struct irq_data *irq_data; 3917 struct amd_ir_data *data; 3918 int i; 3919 3920 for (i = 0; i < nr_irqs; i++) { 3921 irq_data = irq_domain_get_irq_data(domain, virq + i); 3922 if (irq_data && irq_data->chip_data) { 3923 data = irq_data->chip_data; 3924 irte_info = &data->irq_2_irte; 3925 free_irte(irte_info->devid, irte_info->index); 3926 kfree(data); 3927 } 3928 } 3929 irq_domain_free_irqs_common(domain, virq, nr_irqs); 3930} 3931 3932static void irq_remapping_activate(struct irq_domain *domain, 3933 struct irq_data *irq_data) 3934{ 3935 struct amd_ir_data *data = irq_data->chip_data; 3936 struct irq_2_irte *irte_info = &data->irq_2_irte; 3937 3938 modify_irte(irte_info->devid, irte_info->index, data->irte_entry); 3939} 3940 
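/*
 * Editor's illustration (not part of the original file): how the 32-bit
 * IRTE defined by 'union irte' above is composed before modify_irte()
 * publishes it, mirroring irq_remapping_prepare_irte(). Field values
 * here are examples only.
 */
#if 0
static u32 example_irte_encode(u8 vector, u8 dest_apicid)
{
	union irte irte = { .val = 0 };

	irte.fields.valid       = 1;			/* entry may be used */
	irte.fields.int_type    = apic->irq_delivery_mode;
	irte.fields.dm          = apic->irq_dest_mode;
	irte.fields.destination = dest_apicid;		/* target APIC ID */
	irte.fields.vector      = vector;		/* CPU vector to raise */

	return irte.val;				/* written via modify_irte() */
}
#endif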
3941static void irq_remapping_deactivate(struct irq_domain *domain, 3942 struct irq_data *irq_data) 3943{ 3944 struct amd_ir_data *data = irq_data->chip_data; 3945 struct irq_2_irte *irte_info = &data->irq_2_irte; 3946 union irte entry; 3947 3948 entry.val = 0; 3949 modify_irte(irte_info->devid, irte_info->index, entry); 3950} 3951 3952static struct irq_domain_ops amd_ir_domain_ops = { 3953 .alloc = irq_remapping_alloc, 3954 .free = irq_remapping_free, 3955 .activate = irq_remapping_activate, 3956 .deactivate = irq_remapping_deactivate, 3957}; 3958 3959static int amd_ir_set_affinity(struct irq_data *data, 3960 const struct cpumask *mask, bool force) 3961{ 3962 struct amd_ir_data *ir_data = data->chip_data; 3963 struct irq_2_irte *irte_info = &ir_data->irq_2_irte; 3964 struct irq_cfg *cfg = irqd_cfg(data); 3965 struct irq_data *parent = data->parent_data; 3966 int ret; 3967 3968 ret = parent->chip->irq_set_affinity(parent, mask, force); 3969 if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) 3970 return ret; 3971 3972 /* 3973 * Atomically updates the IRTE with the new destination and vector, 3974 * and flushes the interrupt entry cache. 3975 */ 3976 ir_data->irte_entry.fields.vector = cfg->vector; 3977 ir_data->irte_entry.fields.destination = cfg->dest_apicid; 3978 modify_irte(irte_info->devid, irte_info->index, ir_data->irte_entry); 3979 3980 /* 3981 * After this point, all the interrupts will start arriving 3982 * at the new destination. So, time to clean up the previous 3983 * vector allocation. 3984 */ 3985 send_cleanup_vector(cfg); 3986 3987 return IRQ_SET_MASK_OK_DONE; 3988} 3989 3990static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) 3991{ 3992 struct amd_ir_data *ir_data = irq_data->chip_data; 3993 3994 *msg = ir_data->msi_entry; 3995} 3996 3997static struct irq_chip amd_ir_chip = { 3998 .irq_ack = ir_ack_apic_edge, 3999 .irq_set_affinity = amd_ir_set_affinity, 4000 .irq_compose_msi_msg = ir_compose_msi_msg, 4001}; 4002 4003int amd_iommu_create_irq_domain(struct amd_iommu *iommu) 4004{ 4005 iommu->ir_domain = irq_domain_add_tree(NULL, &amd_ir_domain_ops, iommu); 4006 if (!iommu->ir_domain) 4007 return -ENOMEM; 4008 4009 iommu->ir_domain->parent = arch_get_ir_parent_domain(); 4010 iommu->msi_domain = arch_create_msi_irq_domain(iommu->ir_domain); 4011 4012 return 0; 4013} 4014#endif
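/*
 * Editor's illustration (not part of the original file): rough call
 * sequence a consumer such as the amd_iommu_v2 driver follows against
 * the exported IOMMUv2 interface above; 'dom', 'pasid' and 'cr3' are
 * hypothetical inputs.
 */
#if 0
static int example_bind_pasid(struct iommu_domain *dom, int pasid,
			      unsigned long cr3)
{
	int ret;

	/* Drop the host page-table; the domain enters PAGE_MODE_NONE. */
	amd_iommu_domain_direct_map(dom);

	/* Size the GCR3 table for the PASID range we intend to use. */
	ret = amd_iommu_domain_enable_v2(dom, pasid + 1);
	if (ret)
		return ret;

	/*
	 * Point the PASID at its page-table root; __set_gcr3() already
	 * flushes the PASID's TLB on success.
	 */
	return amd_iommu_domain_set_gcr3(dom, pasid, cr3);
}
#endif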