Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus-6.1-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull xen updates from Juergen Gross:

- Some minor typo fixes

- A fix for the Xen pcifront driver to support running the device model
in a Linux stub domain

- A cleanup of the pcifront driver

- A series to enable grant-based virtio with Xen on x86

- A cleanup of Xen PV guests to distinguish between safe and faulting
MSR accesses

- Two fixes for the Xen gntdev driver

- Two fixes for the new Xen grant DMA driver

* tag 'for-linus-6.1-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip:
xen: Kconfig: Fix spelling mistake "Maxmium" -> "Maximum"
xen/pv: support selecting safe/unsafe msr accesses
xen/pv: refactor msr access functions to support safe and unsafe accesses
xen/pv: fix vendor checks for pmu emulation
xen/pv: add fault recovery control to pmu msr accesses
xen/virtio: enable grant based virtio on x86
xen/virtio: use dom0 as default backend for CONFIG_XEN_VIRTIO_FORCE_GRANT
xen/virtio: restructure xen grant dma setup
xen/pcifront: move xenstore config scanning into sub-function
xen/gntdev: Accommodate VMA splitting
xen/gntdev: Prevent leaking grants
xen/virtio: Fix potential deadlock when accessing xen_grant_dma_devices
xen/virtio: Fix n_pages calculation in xen_grant_dma_map(unmap)_page()
xen/xenbus: Fix spelling mistake "hardward" -> "hardware"
xen-pcifront: Handle missed Connected state

+318 -247
+6
Documentation/admin-guide/kernel-parameters.txt
··· 6851 6851 Crash from Xen panic notifier, without executing late 6852 6852 panic() code such as dumping handler. 6853 6853 6854 + xen_msr_safe= [X86,XEN] 6855 + Format: <bool> 6856 + Select whether to always use non-faulting (safe) MSR 6857 + access functions when running as Xen PV guest. The 6858 + default value is controlled by CONFIG_XEN_PV_MSR_SAFE. 6859 + 6854 6860 xen_nopvspin [X86,XEN] 6855 6861 Disables the qspinlock slowpath using Xen PV optimizations. 6856 6862 This parameter is obsoleted by "nopvspin" parameter, which
+9
arch/x86/xen/Kconfig
··· 92 92 select X86_X2APIC if XEN_PVH && X86_64 93 93 help 94 94 Support running as a Xen Dom0 guest. 95 + 96 + config XEN_PV_MSR_SAFE 97 + bool "Always use safe MSR accesses in PV guests" 98 + default y 99 + depends on XEN_PV 100 + help 101 + Use safe (not faulting) MSR access functions even if the MSR access 102 + should not fault anyway. 103 + The default can be changed by using the "xen_msr_safe" boot parameter.
+1 -1
arch/x86/xen/enlighten_hvm.c
··· 212 212 return; 213 213 214 214 if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) 215 - virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); 215 + virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc); 216 216 217 217 init_hvm_pv_info(); 218 218
+70 -29
arch/x86/xen/enlighten_pv.c
··· 108 108 */ 109 109 static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); 110 110 111 + static __read_mostly bool xen_msr_safe = IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE); 112 + 113 + static int __init parse_xen_msr_safe(char *str) 114 + { 115 + if (str) 116 + return strtobool(str, &xen_msr_safe); 117 + return -EINVAL; 118 + } 119 + early_param("xen_msr_safe", parse_xen_msr_safe); 120 + 111 121 static void __init xen_pv_init_platform(void) 112 122 { 113 123 /* PV guests can't operate virtio devices without grants. */ 114 124 if (IS_ENABLED(CONFIG_XEN_VIRTIO)) 115 - virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); 125 + virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc); 116 126 117 127 populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); 118 128 ··· 927 917 native_write_cr4(cr4); 928 918 } 929 919 930 - static u64 xen_read_msr_safe(unsigned int msr, int *err) 920 + static u64 xen_do_read_msr(unsigned int msr, int *err) 931 921 { 932 - u64 val; 922 + u64 val = 0; /* Avoid uninitialized value for safe variant. */ 933 923 934 924 if (pmu_msr_read(msr, &val, err)) 935 925 return val; 936 926 937 - val = native_read_msr_safe(msr, err); 927 + if (err) 928 + val = native_read_msr_safe(msr, err); 929 + else 930 + val = native_read_msr(msr); 931 + 938 932 switch (msr) { 939 933 case MSR_IA32_APICBASE: 940 934 val &= ~X2APIC_ENABLE; ··· 947 933 return val; 948 934 } 949 935 950 - static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) 936 + static void set_seg(unsigned int which, unsigned int low, unsigned int high, 937 + int *err) 951 938 { 952 - int ret; 953 - unsigned int which; 954 - u64 base; 939 + u64 base = ((u64)high << 32) | low; 955 940 956 - ret = 0; 941 + if (HYPERVISOR_set_segment_base(which, base) == 0) 942 + return; 957 943 944 + if (err) 945 + *err = -EIO; 946 + else 947 + WARN(1, "Xen set_segment_base(%u, %llx) failed\n", which, base); 948 + } 949 + 950 + /* 951 + * Support write_msr_safe() and write_msr() semantics. 
952 + * With err == NULL write_msr() semantics are selected. 953 + * Supplying an err pointer requires err to be pre-initialized with 0. 954 + */ 955 + static void xen_do_write_msr(unsigned int msr, unsigned int low, 956 + unsigned int high, int *err) 957 + { 958 958 switch (msr) { 959 - case MSR_FS_BASE: which = SEGBASE_FS; goto set; 960 - case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; 961 - case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; 959 + case MSR_FS_BASE: 960 + set_seg(SEGBASE_FS, low, high, err); 961 + break; 962 962 963 - set: 964 - base = ((u64)high << 32) | low; 965 - if (HYPERVISOR_set_segment_base(which, base) != 0) 966 - ret = -EIO; 963 + case MSR_KERNEL_GS_BASE: 964 + set_seg(SEGBASE_GS_USER, low, high, err); 965 + break; 966 + 967 + case MSR_GS_BASE: 968 + set_seg(SEGBASE_GS_KERNEL, low, high, err); 967 969 break; 968 970 969 971 case MSR_STAR: ··· 995 965 break; 996 966 997 967 default: 998 - if (!pmu_msr_write(msr, low, high, &ret)) 999 - ret = native_write_msr_safe(msr, low, high); 968 + if (!pmu_msr_write(msr, low, high, err)) { 969 + if (err) 970 + *err = native_write_msr_safe(msr, low, high); 971 + else 972 + native_write_msr(msr, low, high); 973 + } 1000 974 } 975 + } 1001 976 1002 - return ret; 977 + static u64 xen_read_msr_safe(unsigned int msr, int *err) 978 + { 979 + return xen_do_read_msr(msr, err); 980 + } 981 + 982 + static int xen_write_msr_safe(unsigned int msr, unsigned int low, 983 + unsigned int high) 984 + { 985 + int err = 0; 986 + 987 + xen_do_write_msr(msr, low, high, &err); 988 + 989 + return err; 1003 990 } 1004 991 1005 992 static u64 xen_read_msr(unsigned int msr) 1006 993 { 1007 - /* 1008 - * This will silently swallow a #GP from RDMSR. It may be worth 1009 - * changing that. 1010 - */ 1011 994 int err; 1012 995 1013 - return xen_read_msr_safe(msr, &err); 996 + return xen_do_read_msr(msr, xen_msr_safe ? 
&err : NULL); 1014 997 } 1015 998 1016 999 static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) 1017 1000 { 1018 - /* 1019 - * This will silently swallow a #GP from WRMSR. It may be worth 1020 - * changing that. 1021 - */ 1022 - xen_write_msr_safe(msr, low, high); 1001 + int err; 1002 + 1003 + xen_do_write_msr(msr, low, high, xen_msr_safe ? &err : NULL); 1023 1004 } 1024 1005 1025 1006 /* This is called once we have the cpu_possible_mask */
+42 -29
arch/x86/xen/pmu.c
··· 131 131 132 132 static inline bool is_amd_pmu_msr(unsigned int msr) 133 133 { 134 + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && 135 + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) 136 + return false; 137 + 134 138 if ((msr >= MSR_F15H_PERF_CTL && 135 139 msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) || 136 140 (msr >= MSR_K7_EVNTSEL0 && ··· 144 140 return false; 145 141 } 146 142 147 - static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) 143 + static bool is_intel_pmu_msr(u32 msr_index, int *type, int *index) 148 144 { 149 145 u32 msr_index_pmc; 146 + 147 + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && 148 + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR && 149 + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) 150 + return false; 150 151 151 152 switch (msr_index) { 152 153 case MSR_CORE_PERF_FIXED_CTR_CTRL: ··· 299 290 return false; 300 291 } 301 292 293 + static bool pmu_msr_chk_emulated(unsigned int msr, uint64_t *val, bool is_read, 294 + bool *emul) 295 + { 296 + int type, index; 297 + 298 + if (is_amd_pmu_msr(msr)) 299 + *emul = xen_amd_pmu_emulate(msr, val, is_read); 300 + else if (is_intel_pmu_msr(msr, &type, &index)) 301 + *emul = xen_intel_pmu_emulate(msr, val, type, index, is_read); 302 + else 303 + return false; 304 + 305 + return true; 306 + } 307 + 302 308 bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) 303 309 { 304 - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { 305 - if (is_amd_pmu_msr(msr)) { 306 - if (!xen_amd_pmu_emulate(msr, val, 1)) 307 - *val = native_read_msr_safe(msr, err); 308 - return true; 309 - } 310 - } else { 311 - int type, index; 310 + bool emulated; 312 311 313 - if (is_intel_pmu_msr(msr, &type, &index)) { 314 - if (!xen_intel_pmu_emulate(msr, val, type, index, 1)) 315 - *val = native_read_msr_safe(msr, err); 316 - return true; 317 - } 312 + if (!pmu_msr_chk_emulated(msr, val, true, &emulated)) 313 + return false; 314 + 315 + if (!emulated) { 316 + *val = err ? 
native_read_msr_safe(msr, err) 317 + : native_read_msr(msr); 318 318 } 319 319 320 - return false; 320 + return true; 321 321 } 322 322 323 323 bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) 324 324 { 325 325 uint64_t val = ((uint64_t)high << 32) | low; 326 + bool emulated; 326 327 327 - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { 328 - if (is_amd_pmu_msr(msr)) { 329 - if (!xen_amd_pmu_emulate(msr, &val, 0)) 330 - *err = native_write_msr_safe(msr, low, high); 331 - return true; 332 - } 333 - } else { 334 - int type, index; 328 + if (!pmu_msr_chk_emulated(msr, &val, false, &emulated)) 329 + return false; 335 330 336 - if (is_intel_pmu_msr(msr, &type, &index)) { 337 - if (!xen_intel_pmu_emulate(msr, &val, type, index, 0)) 338 - *err = native_write_msr_safe(msr, low, high); 339 - return true; 340 - } 331 + if (!emulated) { 332 + if (err) 333 + *err = native_write_msr_safe(msr, low, high); 334 + else 335 + native_write_msr(msr, low, high); 341 336 } 342 337 343 - return false; 338 + return true; 344 339 } 345 340 346 341 static unsigned long long xen_amd_read_pmc(int counter)
+53 -108
drivers/pci/xen-pcifront.c
··· 521 521 int err; 522 522 struct pci_bus *b; 523 523 524 - #ifndef CONFIG_PCI_DOMAINS 525 - if (domain != 0) { 526 - dev_err(&pdev->xdev->dev, 527 - "PCI Root in non-zero PCI Domain! domain=%d\n", domain); 528 - dev_err(&pdev->xdev->dev, 529 - "Please compile with CONFIG_PCI_DOMAINS\n"); 530 - return -EINVAL; 531 - } 532 - #endif 533 - 534 - dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n", 535 - domain, bus); 536 - 537 524 b = pci_find_bus(domain, bus); 538 525 if (!b) 539 526 /* If the bus is unknown, create it. */ 540 527 return pcifront_scan_root(pdev, domain, bus); 528 + 529 + dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n", 530 + domain, bus); 541 531 542 532 err = pcifront_scan_bus(pdev, domain, bus, b); 543 533 ··· 809 819 return err; 810 820 } 811 821 812 - static int pcifront_try_connect(struct pcifront_device *pdev) 822 + static void pcifront_connect(struct pcifront_device *pdev) 813 823 { 814 - int err = -EFAULT; 824 + int err; 815 825 int i, num_roots, len; 816 826 char str[64]; 817 827 unsigned int domain, bus; 818 - 819 - 820 - /* Only connect once */ 821 - if (xenbus_read_driver_state(pdev->xdev->nodename) != 822 - XenbusStateInitialised) 823 - goto out; 824 - 825 - err = pcifront_connect_and_init_dma(pdev); 826 - if (err && err != -EEXIST) { 827 - xenbus_dev_fatal(pdev->xdev, err, 828 - "Error setting up PCI Frontend"); 829 - goto out; 830 - } 831 828 832 829 err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, 833 830 "root_num", "%d", &num_roots); 834 831 if (err == -ENOENT) { 835 832 xenbus_dev_error(pdev->xdev, err, 836 833 "No PCI Roots found, trying 0000:00"); 837 - err = pcifront_scan_root(pdev, 0, 0); 834 + err = pcifront_rescan_root(pdev, 0, 0); 838 835 if (err) { 839 836 xenbus_dev_fatal(pdev->xdev, err, 840 837 "Error scanning PCI root 0000:00"); 841 - goto out; 838 + return; 842 839 } 843 840 num_roots = 0; 844 841 } else if (err != 1) { 845 - if (err == 0) 846 - err = -EINVAL; 847 - 
xenbus_dev_fatal(pdev->xdev, err, 842 + xenbus_dev_fatal(pdev->xdev, err >= 0 ? -EINVAL : err, 848 843 "Error reading number of PCI roots"); 849 - goto out; 844 + return; 850 845 } 851 846 852 847 for (i = 0; i < num_roots; i++) { 853 848 len = snprintf(str, sizeof(str), "root-%d", i); 854 - if (unlikely(len >= (sizeof(str) - 1))) { 855 - err = -ENOMEM; 856 - goto out; 857 - } 849 + if (unlikely(len >= (sizeof(str) - 1))) 850 + return; 858 851 859 852 err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, 860 853 "%x:%x", &domain, &bus); 861 854 if (err != 2) { 862 - if (err >= 0) 863 - err = -EINVAL; 864 - xenbus_dev_fatal(pdev->xdev, err, 855 + xenbus_dev_fatal(pdev->xdev, err >= 0 ? -EINVAL : err, 865 856 "Error reading PCI root %d", i); 866 - goto out; 857 + return; 867 858 } 868 859 869 - err = pcifront_scan_root(pdev, domain, bus); 860 + err = pcifront_rescan_root(pdev, domain, bus); 870 861 if (err) { 871 862 xenbus_dev_fatal(pdev->xdev, err, 872 863 "Error scanning PCI root %04x:%02x", 873 864 domain, bus); 874 - goto out; 865 + return; 875 866 } 876 867 } 877 868 878 - err = xenbus_switch_state(pdev->xdev, XenbusStateConnected); 869 + xenbus_switch_state(pdev->xdev, XenbusStateConnected); 870 + } 879 871 880 - out: 881 - return err; 872 + static void pcifront_try_connect(struct pcifront_device *pdev) 873 + { 874 + int err; 875 + 876 + /* Only connect once */ 877 + if (xenbus_read_driver_state(pdev->xdev->nodename) != 878 + XenbusStateInitialised) 879 + return; 880 + 881 + err = pcifront_connect_and_init_dma(pdev); 882 + if (err && err != -EEXIST) { 883 + xenbus_dev_fatal(pdev->xdev, err, 884 + "Error setting up PCI Frontend"); 885 + return; 886 + } 887 + 888 + pcifront_connect(pdev); 882 889 } 883 890 884 891 static int pcifront_try_disconnect(struct pcifront_device *pdev) ··· 901 914 return err; 902 915 } 903 916 904 - static int pcifront_attach_devices(struct pcifront_device *pdev) 917 + static void pcifront_attach_devices(struct pcifront_device *pdev) 
905 918 { 906 - int err = -EFAULT; 907 - int i, num_roots, len; 908 - unsigned int domain, bus; 909 - char str[64]; 910 - 911 - if (xenbus_read_driver_state(pdev->xdev->nodename) != 919 + if (xenbus_read_driver_state(pdev->xdev->nodename) == 912 920 XenbusStateReconfiguring) 913 - goto out; 914 - 915 - err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, 916 - "root_num", "%d", &num_roots); 917 - if (err == -ENOENT) { 918 - xenbus_dev_error(pdev->xdev, err, 919 - "No PCI Roots found, trying 0000:00"); 920 - err = pcifront_rescan_root(pdev, 0, 0); 921 - if (err) { 922 - xenbus_dev_fatal(pdev->xdev, err, 923 - "Error scanning PCI root 0000:00"); 924 - goto out; 925 - } 926 - num_roots = 0; 927 - } else if (err != 1) { 928 - if (err == 0) 929 - err = -EINVAL; 930 - xenbus_dev_fatal(pdev->xdev, err, 931 - "Error reading number of PCI roots"); 932 - goto out; 933 - } 934 - 935 - for (i = 0; i < num_roots; i++) { 936 - len = snprintf(str, sizeof(str), "root-%d", i); 937 - if (unlikely(len >= (sizeof(str) - 1))) { 938 - err = -ENOMEM; 939 - goto out; 940 - } 941 - 942 - err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, 943 - "%x:%x", &domain, &bus); 944 - if (err != 2) { 945 - if (err >= 0) 946 - err = -EINVAL; 947 - xenbus_dev_fatal(pdev->xdev, err, 948 - "Error reading PCI root %d", i); 949 - goto out; 950 - } 951 - 952 - err = pcifront_rescan_root(pdev, domain, bus); 953 - if (err) { 954 - xenbus_dev_fatal(pdev->xdev, err, 955 - "Error scanning PCI root %04x:%02x", 956 - domain, bus); 957 - goto out; 958 - } 959 - } 960 - 961 - xenbus_switch_state(pdev->xdev, XenbusStateConnected); 962 - 963 - out: 964 - return err; 921 + pcifront_connect(pdev); 965 922 } 966 923 967 924 static int pcifront_detach_devices(struct pcifront_device *pdev) 968 925 { 969 926 int err = 0; 970 927 int i, num_devs; 928 + enum xenbus_state state; 971 929 unsigned int domain, bus, slot, func; 972 930 struct pci_dev *pci_dev; 973 931 char str[64]; 974 932 975 - if 
(xenbus_read_driver_state(pdev->xdev->nodename) != 976 - XenbusStateConnected) 933 + state = xenbus_read_driver_state(pdev->xdev->nodename); 934 + if (state == XenbusStateInitialised) { 935 + dev_dbg(&pdev->xdev->dev, "Handle skipped connect.\n"); 936 + /* We missed Connected and need to initialize. */ 937 + err = pcifront_connect_and_init_dma(pdev); 938 + if (err && err != -EEXIST) { 939 + xenbus_dev_fatal(pdev->xdev, err, 940 + "Error setting up PCI Frontend"); 941 + goto out; 942 + } 943 + 944 + goto out_switch_state; 945 + } else if (state != XenbusStateConnected) { 977 946 goto out; 947 + } 978 948 979 949 err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "num_devs", "%d", 980 950 &num_devs); ··· 992 1048 domain, bus, slot, func); 993 1049 } 994 1050 1051 + out_switch_state: 995 1052 err = xenbus_switch_state(pdev->xdev, XenbusStateReconfiguring); 996 1053 997 1054 out:
+1 -1
drivers/xen/Kconfig
··· 56 56 depends on XEN_HAVE_PVMMU 57 57 depends on MEMORY_HOTPLUG 58 58 help 59 - Maxmium amount of memory (in GiB) that a PV guest can be 59 + Maximum amount of memory (in GiB) that a PV guest can be 60 60 expanded to when using memory hotplug. 61 61 62 62 A PV guest can have more memory than this limit if is
+2 -1
drivers/xen/gntdev-common.h
··· 44 44 }; 45 45 46 46 struct gntdev_grant_map { 47 + atomic_t in_use; 47 48 struct mmu_interval_notifier notifier; 49 + bool notifier_init; 48 50 struct list_head next; 49 - struct vm_area_struct *vma; 50 51 int index; 51 52 int count; 52 53 int flags;
+42 -38
drivers/xen/gntdev.c
··· 286 286 */ 287 287 } 288 288 289 + if (use_ptemod && map->notifier_init) 290 + mmu_interval_notifier_remove(&map->notifier); 291 + 289 292 if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { 290 293 notify_remote_via_evtchn(map->notify.event); 291 294 evtchn_put(map->notify.event); ··· 301 298 static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data) 302 299 { 303 300 struct gntdev_grant_map *map = data; 304 - unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; 301 + unsigned int pgnr = (addr - map->pages_vm_start) >> PAGE_SHIFT; 305 302 int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte | 306 303 (1 << _GNTMAP_guest_avail0); 307 304 u64 pte_maddr; ··· 370 367 for (i = 0; i < map->count; i++) { 371 368 if (map->map_ops[i].status == GNTST_okay) { 372 369 map->unmap_ops[i].handle = map->map_ops[i].handle; 373 - if (!use_ptemod) 374 - alloced++; 370 + alloced++; 375 371 } else if (!err) 376 372 err = -EINVAL; 377 373 ··· 379 377 380 378 if (use_ptemod) { 381 379 if (map->kmap_ops[i].status == GNTST_okay) { 382 - if (map->map_ops[i].status == GNTST_okay) 383 - alloced++; 380 + alloced++; 384 381 map->kunmap_ops[i].handle = map->kmap_ops[i].handle; 385 382 } else if (!err) 386 383 err = -EINVAL; ··· 395 394 unsigned int i; 396 395 struct gntdev_grant_map *map = data->data; 397 396 unsigned int offset = data->unmap_ops - map->unmap_ops; 397 + int successful_unmaps = 0; 398 + int live_grants; 398 399 399 400 for (i = 0; i < data->count; i++) { 401 + if (map->unmap_ops[offset + i].status == GNTST_okay && 402 + map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE) 403 + successful_unmaps++; 404 + 400 405 WARN_ON(map->unmap_ops[offset + i].status != GNTST_okay && 401 406 map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE); 402 407 pr_debug("unmap handle=%d st=%d\n", ··· 410 403 map->unmap_ops[offset+i].status); 411 404 map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; 412 405 if (use_ptemod) { 406 + if 
(map->kunmap_ops[offset + i].status == GNTST_okay && 407 + map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE) 408 + successful_unmaps++; 409 + 413 410 WARN_ON(map->kunmap_ops[offset + i].status != GNTST_okay && 414 411 map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE); 415 412 pr_debug("kunmap handle=%u st=%d\n", ··· 422 411 map->kunmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; 423 412 } 424 413 } 414 + 425 415 /* 426 416 * Decrease the live-grant counter. This must happen after the loop to 427 417 * prevent premature reuse of the grants by gnttab_mmap(). 428 418 */ 429 - atomic_sub(data->count, &map->live_grants); 419 + live_grants = atomic_sub_return(successful_unmaps, &map->live_grants); 420 + if (WARN_ON(live_grants < 0)) 421 + pr_err("%s: live_grants became negative (%d) after unmapping %d pages!\n", 422 + __func__, live_grants, successful_unmaps); 430 423 431 424 /* Release reference taken by __unmap_grant_pages */ 432 425 gntdev_put_map(NULL, map); ··· 511 496 struct gntdev_priv *priv = file->private_data; 512 497 513 498 pr_debug("gntdev_vma_close %p\n", vma); 514 - if (use_ptemod) { 515 - WARN_ON(map->vma != vma); 516 - mmu_interval_notifier_remove(&map->notifier); 517 - map->vma = NULL; 518 - } 499 + 519 500 vma->vm_private_data = NULL; 520 501 gntdev_put_map(priv, map); 521 502 } ··· 539 528 struct gntdev_grant_map *map = 540 529 container_of(mn, struct gntdev_grant_map, notifier); 541 530 unsigned long mstart, mend; 531 + unsigned long map_start, map_end; 542 532 543 533 if (!mmu_notifier_range_blockable(range)) 544 534 return false; 535 + 536 + map_start = map->pages_vm_start; 537 + map_end = map->pages_vm_start + (map->count << PAGE_SHIFT); 545 538 546 539 /* 547 540 * If the VMA is split or otherwise changed the notifier is not ··· 553 538 * VMA. FIXME: It would be much more understandable to just prevent 554 539 * modifying the VMA in the first place. 
555 540 */ 556 - if (map->vma->vm_start >= range->end || 557 - map->vma->vm_end <= range->start) 541 + if (map_start >= range->end || map_end <= range->start) 558 542 return true; 559 543 560 - mstart = max(range->start, map->vma->vm_start); 561 - mend = min(range->end, map->vma->vm_end); 544 + mstart = max(range->start, map_start); 545 + mend = min(range->end, map_end); 562 546 pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", 563 - map->index, map->count, 564 - map->vma->vm_start, map->vma->vm_end, 565 - range->start, range->end, mstart, mend); 566 - unmap_grant_pages(map, 567 - (mstart - map->vma->vm_start) >> PAGE_SHIFT, 568 - (mend - mstart) >> PAGE_SHIFT); 547 + map->index, map->count, map_start, map_end, 548 + range->start, range->end, mstart, mend); 549 + unmap_grant_pages(map, (mstart - map_start) >> PAGE_SHIFT, 550 + (mend - mstart) >> PAGE_SHIFT); 569 551 570 552 return true; 571 553 } ··· 1042 1030 return -EINVAL; 1043 1031 1044 1032 pr_debug("map %d+%d at %lx (pgoff %lx)\n", 1045 - index, count, vma->vm_start, vma->vm_pgoff); 1033 + index, count, vma->vm_start, vma->vm_pgoff); 1046 1034 1047 1035 mutex_lock(&priv->lock); 1048 1036 map = gntdev_find_map_index(priv, index, count); 1049 1037 if (!map) 1050 1038 goto unlock_out; 1051 - if (use_ptemod && map->vma) 1039 + if (!atomic_add_unless(&map->in_use, 1, 1)) 1052 1040 goto unlock_out; 1053 - if (atomic_read(&map->live_grants)) { 1054 - err = -EAGAIN; 1055 - goto unlock_out; 1056 - } 1041 + 1057 1042 refcount_inc(&map->users); 1058 1043 1059 1044 vma->vm_ops = &gntdev_vmops; ··· 1071 1062 map->flags |= GNTMAP_readonly; 1072 1063 } 1073 1064 1065 + map->pages_vm_start = vma->vm_start; 1066 + 1074 1067 if (use_ptemod) { 1075 - map->vma = vma; 1076 1068 err = mmu_interval_notifier_insert_locked( 1077 1069 &map->notifier, vma->vm_mm, vma->vm_start, 1078 1070 vma->vm_end - vma->vm_start, &gntdev_mmu_ops); 1079 - if (err) { 1080 - map->vma = NULL; 1071 + if (err) 1081 1072 goto out_unlock_put; 
1082 - } 1073 + 1074 + map->notifier_init = true; 1083 1075 } 1084 1076 mutex_unlock(&priv->lock); 1085 1077 ··· 1097 1087 */ 1098 1088 mmu_interval_read_begin(&map->notifier); 1099 1089 1100 - map->pages_vm_start = vma->vm_start; 1101 1090 err = apply_to_page_range(vma->vm_mm, vma->vm_start, 1102 1091 vma->vm_end - vma->vm_start, 1103 1092 find_grant_ptes, map); ··· 1125 1116 out_unlock_put: 1126 1117 mutex_unlock(&priv->lock); 1127 1118 out_put_map: 1128 - if (use_ptemod) { 1119 + if (use_ptemod) 1129 1120 unmap_grant_pages(map, 0, map->count); 1130 - if (map->vma) { 1131 - mmu_interval_notifier_remove(&map->notifier); 1132 - map->vma = NULL; 1133 - } 1134 - } 1135 1121 gntdev_put_map(priv, map); 1136 1122 return err; 1137 1123 }
+85 -39
drivers/xen/grant-dma-ops.c
··· 25 25 bool broken; 26 26 }; 27 27 28 - static DEFINE_XARRAY(xen_grant_dma_devices); 28 + static DEFINE_XARRAY_FLAGS(xen_grant_dma_devices, XA_FLAGS_LOCK_IRQ); 29 29 30 30 #define XEN_GRANT_DMA_ADDR_OFF (1ULL << 63) 31 31 ··· 42 42 static struct xen_grant_dma_data *find_xen_grant_dma_data(struct device *dev) 43 43 { 44 44 struct xen_grant_dma_data *data; 45 + unsigned long flags; 45 46 46 - xa_lock(&xen_grant_dma_devices); 47 + xa_lock_irqsave(&xen_grant_dma_devices, flags); 47 48 data = xa_load(&xen_grant_dma_devices, (unsigned long)dev); 48 - xa_unlock(&xen_grant_dma_devices); 49 + xa_unlock_irqrestore(&xen_grant_dma_devices, flags); 49 50 50 51 return data; 52 + } 53 + 54 + static int store_xen_grant_dma_data(struct device *dev, 55 + struct xen_grant_dma_data *data) 56 + { 57 + unsigned long flags; 58 + int ret; 59 + 60 + xa_lock_irqsave(&xen_grant_dma_devices, flags); 61 + ret = xa_err(__xa_store(&xen_grant_dma_devices, (unsigned long)dev, data, 62 + GFP_ATOMIC)); 63 + xa_unlock_irqrestore(&xen_grant_dma_devices, flags); 64 + 65 + return ret; 51 66 } 52 67 53 68 /* ··· 168 153 unsigned long attrs) 169 154 { 170 155 struct xen_grant_dma_data *data; 171 - unsigned int i, n_pages = PFN_UP(size); 156 + unsigned int i, n_pages = PFN_UP(offset + size); 172 157 grant_ref_t grant; 173 158 dma_addr_t dma_handle; 174 159 ··· 200 185 unsigned long attrs) 201 186 { 202 187 struct xen_grant_dma_data *data; 203 - unsigned int i, n_pages = PFN_UP(size); 188 + unsigned long offset = dma_handle & (PAGE_SIZE - 1); 189 + unsigned int i, n_pages = PFN_UP(offset + size); 204 190 grant_ref_t grant; 205 191 206 192 if (WARN_ON(dir == DMA_NONE)) ··· 289 273 .dma_supported = xen_grant_dma_supported, 290 274 }; 291 275 292 - bool xen_is_grant_dma_device(struct device *dev) 276 + static bool xen_is_dt_grant_dma_device(struct device *dev) 293 277 { 294 278 struct device_node *iommu_np; 295 279 bool has_iommu; 296 280 297 - /* XXX Handle only DT devices for now */ 298 - if 
(!dev->of_node) 299 - return false; 300 - 301 281 iommu_np = of_parse_phandle(dev->of_node, "iommus", 0); 302 - has_iommu = iommu_np && of_device_is_compatible(iommu_np, "xen,grant-dma"); 282 + has_iommu = iommu_np && 283 + of_device_is_compatible(iommu_np, "xen,grant-dma"); 303 284 of_node_put(iommu_np); 304 285 305 286 return has_iommu; 306 287 } 307 288 289 + bool xen_is_grant_dma_device(struct device *dev) 290 + { 291 + /* XXX Handle only DT devices for now */ 292 + if (dev->of_node) 293 + return xen_is_dt_grant_dma_device(dev); 294 + 295 + return false; 296 + } 297 + 308 298 bool xen_virtio_mem_acc(struct virtio_device *dev) 309 299 { 310 - if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) 300 + if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT) || xen_pv_domain()) 311 301 return true; 312 302 313 303 return xen_is_grant_dma_device(dev->dev.parent); 314 304 } 315 305 306 + static int xen_dt_grant_init_backend_domid(struct device *dev, 307 + struct xen_grant_dma_data *data) 308 + { 309 + struct of_phandle_args iommu_spec; 310 + 311 + if (of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells", 312 + 0, &iommu_spec)) { 313 + dev_err(dev, "Cannot parse iommus property\n"); 314 + return -ESRCH; 315 + } 316 + 317 + if (!of_device_is_compatible(iommu_spec.np, "xen,grant-dma") || 318 + iommu_spec.args_count != 1) { 319 + dev_err(dev, "Incompatible IOMMU node\n"); 320 + of_node_put(iommu_spec.np); 321 + return -ESRCH; 322 + } 323 + 324 + of_node_put(iommu_spec.np); 325 + 326 + /* 327 + * The endpoint ID here means the ID of the domain where the 328 + * corresponding backend is running 329 + */ 330 + data->backend_domid = iommu_spec.args[0]; 331 + 332 + return 0; 333 + } 334 + 316 335 void xen_grant_setup_dma_ops(struct device *dev) 317 336 { 318 337 struct xen_grant_dma_data *data; 319 - struct of_phandle_args iommu_spec; 320 338 321 339 data = find_xen_grant_dma_data(dev); 322 340 if (data) { ··· 358 308 return; 359 309 } 360 310 361 - /* XXX ACPI device 
unsupported for now */ 362 - if (!dev->of_node) 363 - goto err; 364 - 365 - if (of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells", 366 - 0, &iommu_spec)) { 367 - dev_err(dev, "Cannot parse iommus property\n"); 368 - goto err; 369 - } 370 - 371 - if (!of_device_is_compatible(iommu_spec.np, "xen,grant-dma") || 372 - iommu_spec.args_count != 1) { 373 - dev_err(dev, "Incompatible IOMMU node\n"); 374 - of_node_put(iommu_spec.np); 375 - goto err; 376 - } 377 - 378 - of_node_put(iommu_spec.np); 379 - 380 311 data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); 381 312 if (!data) 382 313 goto err; 383 314 384 - /* 385 - * The endpoint ID here means the ID of the domain where the corresponding 386 - * backend is running 387 - */ 388 - data->backend_domid = iommu_spec.args[0]; 315 + if (dev->of_node) { 316 + if (xen_dt_grant_init_backend_domid(dev, data)) 317 + goto err; 318 + } else if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) { 319 + dev_info(dev, "Using dom0 as backend\n"); 320 + data->backend_domid = 0; 321 + } else { 322 + /* XXX ACPI device unsupported for now */ 323 + goto err; 324 + } 389 325 390 - if (xa_err(xa_store(&xen_grant_dma_devices, (unsigned long)dev, data, 391 - GFP_KERNEL))) { 326 + if (store_xen_grant_dma_data(dev, data)) { 392 327 dev_err(dev, "Cannot store Xen grant DMA data\n"); 393 328 goto err; 394 329 } ··· 383 348 return; 384 349 385 350 err: 351 + devm_kfree(dev, data); 386 352 dev_err(dev, "Cannot set up Xen grant DMA ops, retain platform DMA ops\n"); 353 + } 354 + 355 + bool xen_virtio_restricted_mem_acc(struct virtio_device *dev) 356 + { 357 + bool ret = xen_virtio_mem_acc(dev); 358 + 359 + if (ret) 360 + xen_grant_setup_dma_ops(dev->dev.parent); 361 + 362 + return ret; 387 363 } 388 364 389 365 MODULE_DESCRIPTION("Xen grant DMA-mapping layer");
+1 -1
drivers/xen/xen-pciback/xenbus.c
··· 31 31 " frontend (for example, a device at 06:01.b will still appear at\n"\ 32 32 " 06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\ 33 33 " exposed PCI devices to its driver domains. This may be required\n"\ 34 - " for drivers which depend on finding their hardward in certain\n"\ 34 + " for drivers which depend on finding their hardware in certain\n"\ 35 35 " bus/slot locations."); 36 36 37 37 static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)
+6
include/xen/xen-ops.h
··· 219 219 void xen_grant_setup_dma_ops(struct device *dev); 220 220 bool xen_is_grant_dma_device(struct device *dev); 221 221 bool xen_virtio_mem_acc(struct virtio_device *dev); 222 + bool xen_virtio_restricted_mem_acc(struct virtio_device *dev); 222 223 #else 223 224 static inline void xen_grant_setup_dma_ops(struct device *dev) 224 225 { ··· 232 231 struct virtio_device; 233 232 234 233 static inline bool xen_virtio_mem_acc(struct virtio_device *dev) 234 + { 235 + return false; 236 + } 237 + 238 + static inline bool xen_virtio_restricted_mem_acc(struct virtio_device *dev) 235 239 { 236 240 return false; 237 241 }