Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

vhost-vdpa: fix page pinning leakage in error path (rework)

Pinned pages are not properly accounted for, particularly when a
mapping error occurs on IOTLB update. Clean up dangling pinned
pages in the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
Link: https://lore.kernel.org/r/1604618793-4681-1-git-send-email-si-wei.liu@oracle.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>

Authored by Si-Wei Liu and committed by Michael S. Tsirkin
ad89653f 8009b0f4

+62 -18
drivers/vhost/vdpa.c
@@ -577,6 +577,8 @@
 
 	if (r)
 		vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+	else
+		atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
 
 	return r;
 }
@@ -608,9 +610,10 @@
 	unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
 	unsigned int gup_flags = FOLL_LONGTERM;
 	unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-	unsigned long locked, lock_limit, pinned, i;
+	unsigned long lock_limit, sz2pin, nchunks, i;
 	u64 iova = msg->iova;
+	long pinned;
 	int ret = 0;
 
 	if (msg->iova < v->range.first ||
@@ -620,6 +623,7 @@
 			    msg->iova + msg->size - 1))
 		return -EEXIST;
 
+	/* Limit the use of memory for bookkeeping */
 	page_list = (struct page **) __get_free_page(GFP_KERNEL);
 	if (!page_list)
 		return -ENOMEM;
@@ -628,48 +632,73 @@
 		gup_flags |= FOLL_WRITE;
 
 	npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;
-	if (!npages)
-		return -EINVAL;
+	if (!npages) {
+		ret = -EINVAL;
+		goto free;
+	}
 
 	mmap_read_lock(dev->mm);
 
-	locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-	if (locked > lock_limit) {
+	if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
 		ret = -ENOMEM;
-		goto out;
+		goto unlock;
 	}
 
 	cur_base = msg->uaddr & PAGE_MASK;
 	iova &= PAGE_MASK;
+	nchunks = 0;
 
 	while (npages) {
-		pinned = min_t(unsigned long, npages, list_size);
-		ret = pin_user_pages(cur_base, pinned,
-				     gup_flags, page_list, NULL);
-		if (ret != pinned)
+		sz2pin = min_t(unsigned long, npages, list_size);
+		pinned = pin_user_pages(cur_base, sz2pin,
+					gup_flags, page_list, NULL);
+		if (sz2pin != pinned) {
+			if (pinned < 0) {
+				ret = pinned;
+			} else {
+				unpin_user_pages(page_list, pinned);
+				ret = -ENOMEM;
+			}
 			goto out;
+		}
+		nchunks++;
 
 		if (!last_pfn)
 			map_pfn = page_to_pfn(page_list[0]);
 
-		for (i = 0; i < ret; i++) {
+		for (i = 0; i < pinned; i++) {
 			unsigned long this_pfn = page_to_pfn(page_list[i]);
 			u64 csize;
 
 			if (last_pfn && (this_pfn != last_pfn + 1)) {
 				/* Pin a contiguous chunk of memory */
 				csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-				if (vhost_vdpa_map(v, iova, csize,
-						   map_pfn << PAGE_SHIFT,
-						   msg->perm))
+				ret = vhost_vdpa_map(v, iova, csize,
						     map_pfn << PAGE_SHIFT,
+						     msg->perm);
+				if (ret) {
+					/*
+					 * Unpin the pages that are left unmapped
+					 * from this point on in the current
+					 * page_list. The remaining outstanding
+					 * ones which may stride across several
+					 * chunks will be covered in the common
+					 * error path subsequently.
+					 */
+					unpin_user_pages(&page_list[i],
+							 pinned - i);
 					goto out;
+				}
+
 				map_pfn = this_pfn;
 				iova += csize;
+				nchunks = 0;
 			}
 
 			last_pfn = this_pfn;
 		}
 
-		cur_base += ret << PAGE_SHIFT;
-		npages -= ret;
+		cur_base += pinned << PAGE_SHIFT;
+		npages -= pinned;
 	}
 
 	/* Pin the rest chunk */
@@ -708,10 +681,27 @@
 			     map_pfn << PAGE_SHIFT, msg->perm);
 out:
 	if (ret) {
+		if (nchunks) {
+			unsigned long pfn;
+
+			/*
+			 * Unpin the outstanding pages which are yet to be
+			 * mapped but haven't due to vdpa_map() or
+			 * pin_user_pages() failure.
+			 *
+			 * Mapped pages are accounted in vdpa_map(), hence
+			 * the corresponding unpinning will be handled by
+			 * vdpa_unmap().
+			 */
+			WARN_ON(!last_pfn);
+			for (pfn = map_pfn; pfn <= last_pfn; pfn++)
+				unpin_user_page(pfn_to_page(pfn));
+		}
 		vhost_vdpa_unmap(v, msg->iova, msg->size);
-		atomic64_sub(npages, &dev->mm->pinned_vm);
 	}
+unlock:
 	mmap_read_unlock(dev->mm);
+free:
 	free_page((unsigned long)page_list);
 	return ret;
 }