Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

vfio: powerpc/spapr: Support Dynamic DMA windows

This adds create/remove window ioctls to create and remove DMA windows.
sPAPR defines a Dynamic DMA windows capability which allows
para-virtualized guests to create additional DMA windows on a PCI bus.
Existing Linux kernels use this new window to map the entire guest
memory and switch to direct DMA operations, saving time on map/unmap
requests, which would otherwise happen in large numbers.

This adds 2 ioctl handlers - VFIO_IOMMU_SPAPR_TCE_CREATE and
VFIO_IOMMU_SPAPR_TCE_REMOVE - to create and remove windows.
Up to 2 windows are supported now by the hardware and by this driver.

This changes the VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return additional
information such as the number of supported windows and the maximum number
of levels of TCE tables.

DDW is added as a capability, not as a SPAPR TCE IOMMU v2 unique feature
as we still want to support v2 on platforms which cannot do DDW for
the sake of TCE acceleration in KVM (coming soon).

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

authored by

Alexey Kardashevskiy and committed by
Michael Ellerman
e633bc86 2157e7b8

+273 -5
+19
Documentation/vfio.txt
··· 464 464 465 465 This separation helps in optimizing DMA for guests. 466 466 467 + 6) sPAPR specification allows guests to have an additional DMA window(s) on 468 + a PCI bus with a variable page size. Two ioctls have been added to support 469 + this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE. 470 + The platform has to support the functionality or error will be returned to 471 + the userspace. The existing hardware supports up to 2 DMA windows, one is 472 + 2GB long, uses 4K pages and called "default 32bit window"; the other can 473 + be as big as entire RAM, use different page size, it is optional - guests 474 + create those in run-time if the guest driver supports 64bit DMA. 475 + 476 + VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and 477 + a number of TCE table levels (if a TCE table is going to be big enough and 478 + the kernel may not be able to allocate enough of physically contiguous memory). 479 + It creates a new window in the available slot and returns the bus address where 480 + the new window starts. Due to hardware limitation, the user space cannot choose 481 + the location of DMA windows. 482 + 483 + VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window 484 + and removes it. 485 + 467 486 ------------------------------------------------------------------------------- 468 487 469 488 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
+1 -1
arch/powerpc/include/asm/iommu.h
··· 149 149 */ 150 150 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, 151 151 int nid); 152 - #define IOMMU_TABLE_GROUP_MAX_TABLES 1 152 + #define IOMMU_TABLE_GROUP_MAX_TABLES 2 153 153 154 154 struct iommu_table_group; 155 155
+195 -1
drivers/vfio/vfio_iommu_spapr_tce.c
··· 211 211 return -1; 212 212 } 213 213 214 + static int tce_iommu_find_free_table(struct tce_container *container) 215 + { 216 + int i; 217 + 218 + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 219 + if (!container->tables[i]) 220 + return i; 221 + } 222 + 223 + return -ENOSPC; 224 + } 225 + 214 226 static int tce_iommu_enable(struct tce_container *container) 215 227 { 216 228 int ret = 0; ··· 605 593 decrement_locked_vm(pages); 606 594 } 607 595 596 + static long tce_iommu_create_window(struct tce_container *container, 597 + __u32 page_shift, __u64 window_size, __u32 levels, 598 + __u64 *start_addr) 599 + { 600 + struct tce_iommu_group *tcegrp; 601 + struct iommu_table_group *table_group; 602 + struct iommu_table *tbl = NULL; 603 + long ret, num; 604 + 605 + num = tce_iommu_find_free_table(container); 606 + if (num < 0) 607 + return num; 608 + 609 + /* Get the first group for ops::create_table */ 610 + tcegrp = list_first_entry(&container->group_list, 611 + struct tce_iommu_group, next); 612 + table_group = iommu_group_get_iommudata(tcegrp->grp); 613 + if (!table_group) 614 + return -EFAULT; 615 + 616 + if (!(table_group->pgsizes & (1ULL << page_shift))) 617 + return -EINVAL; 618 + 619 + if (!table_group->ops->set_window || !table_group->ops->unset_window || 620 + !table_group->ops->get_table_size || 621 + !table_group->ops->create_table) 622 + return -EPERM; 623 + 624 + /* Create TCE table */ 625 + ret = tce_iommu_create_table(container, table_group, num, 626 + page_shift, window_size, levels, &tbl); 627 + if (ret) 628 + return ret; 629 + 630 + BUG_ON(!tbl->it_ops->free); 631 + 632 + /* 633 + * Program the table to every group. 634 + * Groups have been tested for compatibility at the attach time. 
635 + */ 636 + list_for_each_entry(tcegrp, &container->group_list, next) { 637 + table_group = iommu_group_get_iommudata(tcegrp->grp); 638 + 639 + ret = table_group->ops->set_window(table_group, num, tbl); 640 + if (ret) 641 + goto unset_exit; 642 + } 643 + 644 + container->tables[num] = tbl; 645 + 646 + /* Return start address assigned by platform in create_table() */ 647 + *start_addr = tbl->it_offset << tbl->it_page_shift; 648 + 649 + return 0; 650 + 651 + unset_exit: 652 + list_for_each_entry(tcegrp, &container->group_list, next) { 653 + table_group = iommu_group_get_iommudata(tcegrp->grp); 654 + table_group->ops->unset_window(table_group, num); 655 + } 656 + tce_iommu_free_table(tbl); 657 + 658 + return ret; 659 + } 660 + 661 + static long tce_iommu_remove_window(struct tce_container *container, 662 + __u64 start_addr) 663 + { 664 + struct iommu_table_group *table_group = NULL; 665 + struct iommu_table *tbl; 666 + struct tce_iommu_group *tcegrp; 667 + int num; 668 + 669 + num = tce_iommu_find_table(container, start_addr, &tbl); 670 + if (num < 0) 671 + return -EINVAL; 672 + 673 + BUG_ON(!tbl->it_size); 674 + 675 + /* Detach groups from IOMMUs */ 676 + list_for_each_entry(tcegrp, &container->group_list, next) { 677 + table_group = iommu_group_get_iommudata(tcegrp->grp); 678 + 679 + /* 680 + * SPAPR TCE IOMMU exposes the default DMA window to 681 + * the guest via dma32_window_start/size of 682 + * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow 683 + * the userspace to remove this window, some do not so 684 + * here we check for the platform capability. 
685 + */ 686 + if (!table_group->ops || !table_group->ops->unset_window) 687 + return -EPERM; 688 + 689 + table_group->ops->unset_window(table_group, num); 690 + } 691 + 692 + /* Free table */ 693 + tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); 694 + tce_iommu_free_table(tbl); 695 + container->tables[num] = NULL; 696 + 697 + return 0; 698 + } 699 + 608 700 static long tce_iommu_ioctl(void *iommu_data, 609 701 unsigned int cmd, unsigned long arg) 610 702 { 611 703 struct tce_container *container = iommu_data; 612 - unsigned long minsz; 704 + unsigned long minsz, ddwsz; 613 705 long ret; 614 706 615 707 switch (cmd) { ··· 757 641 info.dma32_window_start = table_group->tce32_start; 758 642 info.dma32_window_size = table_group->tce32_size; 759 643 info.flags = 0; 644 + memset(&info.ddw, 0, sizeof(info.ddw)); 645 + 646 + if (table_group->max_dynamic_windows_supported && 647 + container->v2) { 648 + info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW; 649 + info.ddw.pgsizes = table_group->pgsizes; 650 + info.ddw.max_dynamic_windows_supported = 651 + table_group->max_dynamic_windows_supported; 652 + info.ddw.levels = table_group->max_levels; 653 + } 654 + 655 + ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw); 656 + 657 + if (info.argsz >= ddwsz) 658 + minsz = ddwsz; 760 659 761 660 if (copy_to_user((void __user *)arg, &info, minsz)) 762 661 return -EFAULT; ··· 965 834 return ret; 966 835 } 967 836 837 + case VFIO_IOMMU_SPAPR_TCE_CREATE: { 838 + struct vfio_iommu_spapr_tce_create create; 839 + 840 + if (!container->v2) 841 + break; 842 + 843 + if (!tce_groups_attached(container)) 844 + return -ENXIO; 845 + 846 + minsz = offsetofend(struct vfio_iommu_spapr_tce_create, 847 + start_addr); 848 + 849 + if (copy_from_user(&create, (void __user *)arg, minsz)) 850 + return -EFAULT; 851 + 852 + if (create.argsz < minsz) 853 + return -EINVAL; 854 + 855 + if (create.flags) 856 + return -EINVAL; 857 + 858 + mutex_lock(&container->lock); 859 + 860 + ret = 
tce_iommu_create_window(container, create.page_shift, 861 + create.window_size, create.levels, 862 + &create.start_addr); 863 + 864 + mutex_unlock(&container->lock); 865 + 866 + if (!ret && copy_to_user((void __user *)arg, &create, minsz)) 867 + ret = -EFAULT; 868 + 869 + return ret; 870 + } 871 + case VFIO_IOMMU_SPAPR_TCE_REMOVE: { 872 + struct vfio_iommu_spapr_tce_remove remove; 873 + 874 + if (!container->v2) 875 + break; 876 + 877 + if (!tce_groups_attached(container)) 878 + return -ENXIO; 879 + 880 + minsz = offsetofend(struct vfio_iommu_spapr_tce_remove, 881 + start_addr); 882 + 883 + if (copy_from_user(&remove, (void __user *)arg, minsz)) 884 + return -EFAULT; 885 + 886 + if (remove.argsz < minsz) 887 + return -EINVAL; 888 + 889 + if (remove.flags) 890 + return -EINVAL; 891 + 892 + mutex_lock(&container->lock); 893 + 894 + ret = tce_iommu_remove_window(container, remove.start_addr); 895 + 896 + mutex_unlock(&container->lock); 897 + 898 + return ret; 899 + } 968 900 } 969 901 970 902 return -ENOTTY;
+58 -3
include/uapi/linux/vfio.h
··· 445 445 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ 446 446 447 447 /* 448 + * The SPAPR TCE DDW info struct provides the information about 449 + * the details of Dynamic DMA window capability. 450 + * 451 + * @pgsizes contains a page size bitmask, 4K/64K/16M are supported. 452 + * @max_dynamic_windows_supported tells the maximum number of windows 453 + * which the platform can create. 454 + * @levels tells the maximum number of levels in multi-level IOMMU tables; 455 + * this allows splitting a table into smaller chunks which reduces 456 + * the amount of physically contiguous memory required for the table. 457 + */ 458 + struct vfio_iommu_spapr_tce_ddw_info { 459 + __u64 pgsizes; /* Bitmap of supported page sizes */ 460 + __u32 max_dynamic_windows_supported; 461 + __u32 levels; 462 + }; 463 + 464 + /* 448 465 * The SPAPR TCE info struct provides the information about the PCI bus 449 466 * address ranges available for DMA, these values are programmed into 450 467 * the hardware so the guest has to know that information. ··· 471 454 * addresses too so the window works as a filter rather than an offset 472 455 * for IOVA addresses. 473 456 * 474 - * A flag will need to be added if other page sizes are supported, 475 - * so as defined here, it is always 4k. 457 + * Flags supported: 458 + * - VFIO_IOMMU_SPAPR_INFO_DDW: informs the userspace that dynamic DMA windows 459 + * (DDW) support is present. @ddw is only supported when DDW is present. 
476 460 */ 477 461 struct vfio_iommu_spapr_tce_info { 478 462 __u32 argsz; 479 - __u32 flags; /* reserved for future use */ 463 + __u32 flags; 464 + #define VFIO_IOMMU_SPAPR_INFO_DDW (1 << 0) /* DDW supported */ 480 465 __u32 dma32_window_start; /* 32 bit window start (bytes) */ 481 466 __u32 dma32_window_size; /* 32 bit window size (bytes) */ 467 + struct vfio_iommu_spapr_tce_ddw_info ddw; 482 468 }; 483 469 484 470 #define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) ··· 553 533 * Uses vfio_iommu_spapr_register_memory for parameters. 554 534 */ 555 535 #define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) 536 + 537 + /** 538 + * VFIO_IOMMU_SPAPR_TCE_CREATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, struct vfio_iommu_spapr_tce_create) 539 + * 540 + * Creates an additional TCE table and programs it (sets a new DMA window) 541 + * to every IOMMU group in the container. It receives page shift, window 542 + * size and number of levels in the TCE table being created. 543 + * 544 + * It allocates and returns an offset on a PCI bus of the new DMA window. 545 + */ 546 + struct vfio_iommu_spapr_tce_create { 547 + __u32 argsz; 548 + __u32 flags; 549 + /* in */ 550 + __u32 page_shift; 551 + __u64 window_size; 552 + __u32 levels; 553 + /* out */ 554 + __u64 start_addr; 555 + }; 556 + #define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19) 557 + 558 + /** 559 + * VFIO_IOMMU_SPAPR_TCE_REMOVE - _IOW(VFIO_TYPE, VFIO_BASE + 20, struct vfio_iommu_spapr_tce_remove) 560 + * 561 + * Unprograms a TCE table from all groups in the container and destroys it. 562 + * It receives a PCI bus offset as a window id. 563 + */ 564 + struct vfio_iommu_spapr_tce_remove { 565 + __u32 argsz; 566 + __u32 flags; 567 + /* in */ 568 + __u64 start_addr; 569 + }; 570 + #define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) 556 571 557 572 /* ***************************************************************** */ 558 573