Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: s390/interrupt: do not pin adapter interrupt pages

The adapter interrupt page containing the indicator bits is currently
pinned. That means that a guest with many devices can pin a lot of
memory pages in the host. This also complicates the reference tracking
which is needed for memory management handling of protected virtual
machines. It might also have some strange side effects for madvise
MADV_DONTNEED and other things.

We can simply try to get the userspace page, set the bits, and free the
page. By storing the userspace address in the irq routing entry instead
of the guest address we can actually avoid many lookups and list walks
so that this variant is very likely not slower.

If userspace messes around with the memory slots, the worst thing that
can happen is that we write to some other memory within that process.
As we get the page with FOLL_WRITE, this also cannot be used to
write to shared read-only pages.

Signed-off-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
[borntraeger@de.ibm.com: patch simplification]
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>

authored by

Ulrich Weigand and committed by
Christian Borntraeger
f6547066 f15587c8

+51 -133
+2 -9
Documentation/virt/kvm/devices/s390_flic.rst
··· 108 108 mask or unmask the adapter, as specified in mask 109 109 110 110 KVM_S390_IO_ADAPTER_MAP 111 - perform a gmap translation for the guest address provided in addr, 112 - pin a userspace page for the translated address and add it to the 113 - list of mappings 114 - 115 - .. note:: A new mapping will be created unconditionally; therefore, 116 - the calling code should avoid making duplicate mappings. 117 - 111 + This is now a no-op. The mapping is purely done by the irq route. 118 112 KVM_S390_IO_ADAPTER_UNMAP 119 - release a userspace page for the translated address specified in addr 120 - from the list of mappings 113 + This is now a no-op. The mapping is purely done by the irq route. 121 114 122 115 KVM_DEV_FLIC_AISM 123 116 modify the adapter-interruption-suppression mode for a given isc if the
-3
arch/s390/include/asm/kvm_host.h
··· 701 701 bool masked; 702 702 bool swap; 703 703 bool suppressible; 704 - struct rw_semaphore maps_lock; 705 - struct list_head maps; 706 - atomic_t nr_maps; 707 704 }; 708 705 709 706 #define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
+49 -121
arch/s390/kvm/interrupt.c
··· 2 2 /* 3 3 * handling kvm guest interrupts 4 4 * 5 - * Copyright IBM Corp. 2008, 2015 5 + * Copyright IBM Corp. 2008, 2020 6 6 * 7 7 * Author(s): Carsten Otte <cotte@de.ibm.com> 8 8 */ ··· 2327 2327 if (!adapter) 2328 2328 return -ENOMEM; 2329 2329 2330 - INIT_LIST_HEAD(&adapter->maps); 2331 - init_rwsem(&adapter->maps_lock); 2332 - atomic_set(&adapter->nr_maps, 0); 2333 2330 adapter->id = adapter_info.id; 2334 2331 adapter->isc = adapter_info.isc; 2335 2332 adapter->maskable = adapter_info.maskable; ··· 2351 2354 return ret; 2352 2355 } 2353 2356 2354 - static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) 2355 - { 2356 - struct s390_io_adapter *adapter = get_io_adapter(kvm, id); 2357 - struct s390_map_info *map; 2358 - int ret; 2359 - 2360 - if (!adapter || !addr) 2361 - return -EINVAL; 2362 - 2363 - map = kzalloc(sizeof(*map), GFP_KERNEL); 2364 - if (!map) { 2365 - ret = -ENOMEM; 2366 - goto out; 2367 - } 2368 - INIT_LIST_HEAD(&map->list); 2369 - map->guest_addr = addr; 2370 - map->addr = gmap_translate(kvm->arch.gmap, addr); 2371 - if (map->addr == -EFAULT) { 2372 - ret = -EFAULT; 2373 - goto out; 2374 - } 2375 - ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page); 2376 - if (ret < 0) 2377 - goto out; 2378 - BUG_ON(ret != 1); 2379 - down_write(&adapter->maps_lock); 2380 - if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) { 2381 - list_add_tail(&map->list, &adapter->maps); 2382 - ret = 0; 2383 - } else { 2384 - put_page(map->page); 2385 - ret = -EINVAL; 2386 - } 2387 - up_write(&adapter->maps_lock); 2388 - out: 2389 - if (ret) 2390 - kfree(map); 2391 - return ret; 2392 - } 2393 - 2394 - static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr) 2395 - { 2396 - struct s390_io_adapter *adapter = get_io_adapter(kvm, id); 2397 - struct s390_map_info *map, *tmp; 2398 - int found = 0; 2399 - 2400 - if (!adapter || !addr) 2401 - return -EINVAL; 2402 - 2403 - down_write(&adapter->maps_lock); 
2404 - list_for_each_entry_safe(map, tmp, &adapter->maps, list) { 2405 - if (map->guest_addr == addr) { 2406 - found = 1; 2407 - atomic_dec(&adapter->nr_maps); 2408 - list_del(&map->list); 2409 - put_page(map->page); 2410 - kfree(map); 2411 - break; 2412 - } 2413 - } 2414 - up_write(&adapter->maps_lock); 2415 - 2416 - return found ? 0 : -EINVAL; 2417 - } 2418 - 2419 2357 void kvm_s390_destroy_adapters(struct kvm *kvm) 2420 2358 { 2421 2359 int i; 2422 - struct s390_map_info *map, *tmp; 2423 2360 2424 - for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) { 2425 - if (!kvm->arch.adapters[i]) 2426 - continue; 2427 - list_for_each_entry_safe(map, tmp, 2428 - &kvm->arch.adapters[i]->maps, list) { 2429 - list_del(&map->list); 2430 - put_page(map->page); 2431 - kfree(map); 2432 - } 2361 + for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) 2433 2362 kfree(kvm->arch.adapters[i]); 2434 - } 2435 2363 } 2436 2364 2437 2365 static int modify_io_adapter(struct kvm_device *dev, ··· 2378 2456 if (ret > 0) 2379 2457 ret = 0; 2380 2458 break; 2459 + /* 2460 + * The following operations are no longer needed and therefore no-ops. 2461 + * The gpa to hva translation is done when an IRQ route is set up. The 2462 + * set_irq code uses get_user_pages_remote() to do the actual write. 2463 + */ 2381 2464 case KVM_S390_IO_ADAPTER_MAP: 2382 - ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr); 2383 - break; 2384 2465 case KVM_S390_IO_ADAPTER_UNMAP: 2385 - ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr); 2466 + ret = 0; 2386 2467 break; 2387 2468 default: 2388 2469 ret = -EINVAL; ··· 2624 2699 return swap ? 
(bit ^ (BITS_PER_LONG - 1)) : bit; 2625 2700 } 2626 2701 2627 - static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter, 2628 - u64 addr) 2702 + static struct page *get_map_page(struct kvm *kvm, u64 uaddr) 2629 2703 { 2630 - struct s390_map_info *map; 2704 + struct page *page = NULL; 2631 2705 2632 - if (!adapter) 2633 - return NULL; 2634 - 2635 - list_for_each_entry(map, &adapter->maps, list) { 2636 - if (map->guest_addr == addr) 2637 - return map; 2638 - } 2639 - return NULL; 2706 + down_read(&kvm->mm->mmap_sem); 2707 + get_user_pages_remote(NULL, kvm->mm, uaddr, 1, FOLL_WRITE, 2708 + &page, NULL, NULL); 2709 + up_read(&kvm->mm->mmap_sem); 2710 + return page; 2640 2711 } 2641 2712 2642 2713 static int adapter_indicators_set(struct kvm *kvm, ··· 2641 2720 { 2642 2721 unsigned long bit; 2643 2722 int summary_set, idx; 2644 - struct s390_map_info *info; 2723 + struct page *ind_page, *summary_page; 2645 2724 void *map; 2646 2725 2647 - info = get_map_info(adapter, adapter_int->ind_addr); 2648 - if (!info) 2726 + ind_page = get_map_page(kvm, adapter_int->ind_addr); 2727 + if (!ind_page) 2649 2728 return -1; 2650 - map = page_address(info->page); 2651 - bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap); 2652 - set_bit(bit, map); 2653 - idx = srcu_read_lock(&kvm->srcu); 2654 - mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); 2655 - set_page_dirty_lock(info->page); 2656 - info = get_map_info(adapter, adapter_int->summary_addr); 2657 - if (!info) { 2658 - srcu_read_unlock(&kvm->srcu, idx); 2729 + summary_page = get_map_page(kvm, adapter_int->summary_addr); 2730 + if (!summary_page) { 2731 + put_page(ind_page); 2659 2732 return -1; 2660 2733 } 2661 - map = page_address(info->page); 2662 - bit = get_ind_bit(info->addr, adapter_int->summary_offset, 2663 - adapter->swap); 2734 + 2735 + idx = srcu_read_lock(&kvm->srcu); 2736 + map = page_address(ind_page); 2737 + bit = get_ind_bit(adapter_int->ind_addr, 2738 + adapter_int->ind_offset, 
adapter->swap); 2739 + set_bit(bit, map); 2740 + mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); 2741 + set_page_dirty_lock(ind_page); 2742 + map = page_address(summary_page); 2743 + bit = get_ind_bit(adapter_int->summary_addr, 2744 + adapter_int->summary_offset, adapter->swap); 2664 2745 summary_set = test_and_set_bit(bit, map); 2665 - mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); 2666 - set_page_dirty_lock(info->page); 2746 + mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); 2747 + set_page_dirty_lock(summary_page); 2667 2748 srcu_read_unlock(&kvm->srcu, idx); 2749 + 2750 + put_page(ind_page); 2751 + put_page(summary_page); 2668 2752 return summary_set ? 0 : 1; 2669 2753 } 2670 2754 ··· 2691 2765 adapter = get_io_adapter(kvm, e->adapter.adapter_id); 2692 2766 if (!adapter) 2693 2767 return -1; 2694 - down_read(&adapter->maps_lock); 2695 2768 ret = adapter_indicators_set(kvm, adapter, &e->adapter); 2696 - up_read(&adapter->maps_lock); 2697 2769 if ((ret > 0) && !adapter->masked) { 2698 2770 ret = kvm_s390_inject_airq(kvm, adapter); 2699 2771 if (ret == 0) ··· 2742 2818 struct kvm_kernel_irq_routing_entry *e, 2743 2819 const struct kvm_irq_routing_entry *ue) 2744 2820 { 2745 - int ret; 2821 + u64 uaddr; 2746 2822 2747 2823 switch (ue->type) { 2824 + /* we store the userspace addresses instead of the guest addresses */ 2748 2825 case KVM_IRQ_ROUTING_S390_ADAPTER: 2749 2826 e->set = set_adapter_int; 2750 - e->adapter.summary_addr = ue->u.adapter.summary_addr; 2751 - e->adapter.ind_addr = ue->u.adapter.ind_addr; 2827 + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr); 2828 + if (uaddr == -EFAULT) 2829 + return -EFAULT; 2830 + e->adapter.summary_addr = uaddr; 2831 + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr); 2832 + if (uaddr == -EFAULT) 2833 + return -EFAULT; 2834 + e->adapter.ind_addr = uaddr; 2752 2835 e->adapter.summary_offset = ue->u.adapter.summary_offset; 2753 2836 e->adapter.ind_offset = 
ue->u.adapter.ind_offset; 2754 2837 e->adapter.adapter_id = ue->u.adapter.adapter_id; 2755 - ret = 0; 2756 - break; 2838 + return 0; 2757 2839 default: 2758 - ret = -EINVAL; 2840 + return -EINVAL; 2759 2841 } 2760 - 2761 - return ret; 2762 2842 } 2763 2843 2764 2844 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,