Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm/mmap: use advanced maple tree API for mmap_region()

Changing mmap_region() to use the maple tree state and the advanced maple
tree interface allows for a lot less tree walking.

This change removes the last caller of munmap_vma_range(), so drop this
unused function.

Add vma_expand() to expand a VMA if possible by doing the necessary
hugepage check, uprobe_munmap of files, dcache flush, modifications,
then undoing the detaches, etc.

Link: https://lkml.kernel.org/r/20220906194824.2110408-25-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Tested-by: Yu Zhao <yuzhao@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Authored by Liam R. Howlett and committed by Andrew Morton.
Commit: 4dd1b841 (parent: abdba2dd)

+203 -48
mm/mmap.c
··· 470 470 return vma->vm_next; 471 471 } 472 472 473 - /* 474 - * munmap_vma_range() - munmap VMAs that overlap a range. 475 - * @mm: The mm struct 476 - * @start: The start of the range. 477 - * @len: The length of the range. 478 - * @pprev: pointer to the pointer that will be set to previous vm_area_struct 479 - * 480 - * Find all the vm_area_struct that overlap from @start to 481 - * @end and munmap them. Set @pprev to the previous vm_area_struct. 482 - * 483 - * Returns: -ENOMEM on munmap failure or 0 on success. 484 - */ 485 - static inline int 486 - munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, 487 - struct vm_area_struct **pprev, struct list_head *uf) 488 - { 489 - while (range_has_overlap(mm, start, start + len, pprev)) 490 - if (do_munmap(mm, start, len, uf)) 491 - return -ENOMEM; 492 - return 0; 493 - } 494 - 495 473 static unsigned long count_vma_pages_range(struct mm_struct *mm, 496 474 unsigned long addr, unsigned long end) 497 475 { ··· 594 616 vma_mas_store(vma, mas); 595 617 __vma_link_list(mm, vma, prev); 596 618 mm->map_count++; 619 + } 620 + 621 + /* 622 + * vma_expand - Expand an existing VMA 623 + * 624 + * @mas: The maple state 625 + * @vma: The vma to expand 626 + * @start: The start of the vma 627 + * @end: The exclusive end of the vma 628 + * @pgoff: The page offset of vma 629 + * @next: The current of next vma. 630 + * 631 + * Expand @vma to @start and @end. Can expand off the start and end. Will 632 + * expand over @next if it's different from @vma and @end == @next->vm_end. 633 + * Checking if the @vma can expand and merge with @next needs to be handled by 634 + * the caller. 
635 + * 636 + * Returns: 0 on success 637 + */ 638 + inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, 639 + unsigned long start, unsigned long end, pgoff_t pgoff, 640 + struct vm_area_struct *next) 641 + { 642 + struct mm_struct *mm = vma->vm_mm; 643 + struct address_space *mapping = NULL; 644 + struct rb_root_cached *root = NULL; 645 + struct anon_vma *anon_vma = vma->anon_vma; 646 + struct file *file = vma->vm_file; 647 + bool remove_next = false; 648 + 649 + if (next && (vma != next) && (end == next->vm_end)) { 650 + remove_next = true; 651 + if (next->anon_vma && !vma->anon_vma) { 652 + int error; 653 + 654 + anon_vma = next->anon_vma; 655 + vma->anon_vma = anon_vma; 656 + error = anon_vma_clone(vma, next); 657 + if (error) 658 + return error; 659 + } 660 + } 661 + 662 + /* Not merging but overwriting any part of next is not handled. */ 663 + VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start); 664 + /* Only handles expanding */ 665 + VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); 666 + 667 + if (mas_preallocate(mas, vma, GFP_KERNEL)) 668 + goto nomem; 669 + 670 + vma_adjust_trans_huge(vma, start, end, 0); 671 + 672 + if (file) { 673 + mapping = file->f_mapping; 674 + root = &mapping->i_mmap; 675 + uprobe_munmap(vma, vma->vm_start, vma->vm_end); 676 + i_mmap_lock_write(mapping); 677 + } 678 + 679 + if (anon_vma) { 680 + anon_vma_lock_write(anon_vma); 681 + anon_vma_interval_tree_pre_update_vma(vma); 682 + } 683 + 684 + if (file) { 685 + flush_dcache_mmap_lock(mapping); 686 + vma_interval_tree_remove(vma, root); 687 + } 688 + 689 + vma->vm_start = start; 690 + vma->vm_end = end; 691 + vma->vm_pgoff = pgoff; 692 + /* Note: mas must be pointing to the expanding VMA */ 693 + vma_mas_store(vma, mas); 694 + 695 + if (file) { 696 + vma_interval_tree_insert(vma, root); 697 + flush_dcache_mmap_unlock(mapping); 698 + } 699 + 700 + /* Expanding over the next vma */ 701 + if (remove_next) { 702 + /* Remove from mm linked 
list - also updates highest_vm_end */ 703 + __vma_unlink_list(mm, next); 704 + 705 + /* Kill the cache */ 706 + vmacache_invalidate(mm); 707 + 708 + if (file) 709 + __remove_shared_vm_struct(next, file, mapping); 710 + 711 + } else if (!next) { 712 + mm->highest_vm_end = vm_end_gap(vma); 713 + } 714 + 715 + if (anon_vma) { 716 + anon_vma_interval_tree_post_update_vma(vma); 717 + anon_vma_unlock_write(anon_vma); 718 + } 719 + 720 + if (file) { 721 + i_mmap_unlock_write(mapping); 722 + uprobe_mmap(vma); 723 + } 724 + 725 + if (remove_next) { 726 + if (file) { 727 + uprobe_munmap(next, next->vm_start, next->vm_end); 728 + fput(file); 729 + } 730 + if (next->anon_vma) 731 + anon_vma_merge(vma, next); 732 + mm->map_count--; 733 + mpol_put(vma_policy(next)); 734 + vm_area_free(next); 735 + } 736 + 737 + validate_mm(mm); 738 + return 0; 739 + 740 + nomem: 741 + return -ENOMEM; 597 742 } 598 743 599 744 /* ··· 1731 1630 struct list_head *uf) 1732 1631 { 1733 1632 struct mm_struct *mm = current->mm; 1734 - struct vm_area_struct *vma, *prev, *merge; 1735 - int error; 1633 + struct vm_area_struct *vma = NULL; 1634 + struct vm_area_struct *next, *prev, *merge; 1635 + pgoff_t pglen = len >> PAGE_SHIFT; 1736 1636 unsigned long charged = 0; 1637 + unsigned long end = addr + len; 1638 + unsigned long merge_start = addr, merge_end = end; 1639 + pgoff_t vm_pgoff; 1640 + int error; 1641 + MA_STATE(mas, &mm->mm_mt, addr, end - 1); 1737 1642 1738 1643 /* Check against address space limit. */ 1739 1644 if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { ··· 1749 1642 * MAP_FIXED may remove pages of mappings that intersects with 1750 1643 * requested mapping. Account for the pages it would unmap. 
1751 1644 */ 1752 - nr_pages = count_vma_pages_range(mm, addr, addr + len); 1645 + nr_pages = count_vma_pages_range(mm, addr, end); 1753 1646 1754 1647 if (!may_expand_vm(mm, vm_flags, 1755 1648 (len >> PAGE_SHIFT) - nr_pages)) 1756 1649 return -ENOMEM; 1757 1650 } 1758 1651 1759 - /* Clear old maps, set up prev and uf */ 1760 - if (munmap_vma_range(mm, addr, len, &prev, uf)) 1652 + /* Unmap any existing mapping in the area */ 1653 + if (do_munmap(mm, addr, len, uf)) 1761 1654 return -ENOMEM; 1655 + 1762 1656 /* 1763 1657 * Private writable mapping: check memory availability 1764 1658 */ ··· 1770 1662 vm_flags |= VM_ACCOUNT; 1771 1663 } 1772 1664 1773 - /* 1774 - * Can we just expand an old mapping? 1775 - */ 1776 - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, 1777 - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); 1778 - if (vma) 1779 - goto out; 1665 + next = mas_next(&mas, ULONG_MAX); 1666 + prev = mas_prev(&mas, 0); 1667 + if (vm_flags & VM_SPECIAL) 1668 + goto cannot_expand; 1780 1669 1670 + /* Attempt to expand an old mapping */ 1671 + /* Check next */ 1672 + if (next && next->vm_start == end && !vma_policy(next) && 1673 + can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, 1674 + NULL_VM_UFFD_CTX, NULL)) { 1675 + merge_end = next->vm_end; 1676 + vma = next; 1677 + vm_pgoff = next->vm_pgoff - pglen; 1678 + } 1679 + 1680 + /* Check prev */ 1681 + if (prev && prev->vm_end == addr && !vma_policy(prev) && 1682 + (vma ? 
can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, 1683 + pgoff, vma->vm_userfaultfd_ctx, NULL) : 1684 + can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, 1685 + NULL_VM_UFFD_CTX, NULL))) { 1686 + merge_start = prev->vm_start; 1687 + vma = prev; 1688 + vm_pgoff = prev->vm_pgoff; 1689 + } 1690 + 1691 + 1692 + /* Actually expand, if possible */ 1693 + if (vma && 1694 + !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { 1695 + khugepaged_enter_vma(vma, vm_flags); 1696 + goto expanded; 1697 + } 1698 + 1699 + mas.index = addr; 1700 + mas.last = end - 1; 1701 + cannot_expand: 1781 1702 /* 1782 1703 * Determine the object being mapped and call the appropriate 1783 1704 * specific mapper. the address has already been validated, but ··· 1819 1682 } 1820 1683 1821 1684 vma->vm_start = addr; 1822 - vma->vm_end = addr + len; 1685 + vma->vm_end = end; 1823 1686 vma->vm_flags = vm_flags; 1824 1687 vma->vm_page_prot = vm_get_page_prot(vm_flags); 1825 1688 vma->vm_pgoff = pgoff; ··· 1840 1703 * 1841 1704 * Answer: Yes, several device drivers can do it in their 1842 1705 * f_op->mmap method. -DaveM 1843 - * Bug: If addr is changed, prev, rb_link, rb_parent should 1844 - * be updated for vma_link() 1845 1706 */ 1846 1707 WARN_ON_ONCE(addr != vma->vm_start); 1847 1708 1848 1709 addr = vma->vm_start; 1710 + mas_reset(&mas); 1849 1711 1850 - /* If vm_flags changed after call_mmap(), we should try merge vma again 1851 - * as we may succeed this time. 1712 + /* 1713 + * If vm_flags changed after call_mmap(), we should try merge 1714 + * vma again as we may succeed this time. 1852 1715 */ 1853 1716 if (unlikely(vm_flags != vma->vm_flags && prev)) { 1854 1717 merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, 1855 1718 NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); 1856 1719 if (merge) { 1857 - /* ->mmap() can change vma->vm_file and fput the original file. 
So 1858 - * fput the vma->vm_file here or we would add an extra fput for file 1859 - * and cause general protection fault ultimately. 1720 + /* 1721 + * ->mmap() can change vma->vm_file and fput 1722 + * the original file. So fput the vma->vm_file 1723 + * here or we would add an extra fput for file 1724 + * and cause general protection fault 1725 + * ultimately. 1860 1726 */ 1861 1727 fput(vma->vm_file); 1862 1728 vm_area_free(vma); 1863 1729 vma = merge; 1864 1730 /* Update vm_flags to pick up the change. */ 1731 + addr = vma->vm_start; 1865 1732 vm_flags = vma->vm_flags; 1866 1733 goto unmap_writable; 1867 1734 } ··· 1889 1748 goto free_vma; 1890 1749 } 1891 1750 1892 - if (vma_link(mm, vma, prev)) { 1751 + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { 1893 1752 error = -ENOMEM; 1894 1753 if (file) 1895 1754 goto unmap_and_free_vma; 1896 1755 else 1897 1756 goto free_vma; 1757 + } 1758 + 1759 + if (vma->vm_file) 1760 + i_mmap_lock_write(vma->vm_file->f_mapping); 1761 + 1762 + vma_mas_store(vma, &mas); 1763 + __vma_link_list(mm, vma, prev); 1764 + mm->map_count++; 1765 + if (vma->vm_file) { 1766 + if (vma->vm_flags & VM_SHARED) 1767 + mapping_allow_writable(vma->vm_file->f_mapping); 1768 + 1769 + flush_dcache_mmap_lock(vma->vm_file->f_mapping); 1770 + vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); 1771 + flush_dcache_mmap_unlock(vma->vm_file->f_mapping); 1772 + i_mmap_unlock_write(vma->vm_file->f_mapping); 1898 1773 } 1899 1774 1900 1775 /* ··· 1924 1767 if (file && vm_flags & VM_SHARED) 1925 1768 mapping_unmap_writable(file->f_mapping); 1926 1769 file = vma->vm_file; 1927 - out: 1770 + expanded: 1928 1771 perf_event_mmap(vma); 1929 1772 1930 1773 vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); ··· 1951 1794 1952 1795 vma_set_page_prot(vma); 1953 1796 1797 + validate_mm(mm); 1954 1798 return addr; 1955 1799 1956 1800 unmap_and_free_vma: ··· 1967 1809 unacct_error: 1968 1810 if (charged) 1969 1811 vm_unacct_memory(charged); 1812 + 
validate_mm(mm); 1970 1813 return error; 1971 1814 } 1972 1815 ··· 2790 2631 return -ENOMEM; 2791 2632 prev = vma->vm_prev; 2792 2633 /* we have start < vma->vm_end */ 2793 - 2794 - /* if it doesn't overlap, we have nothing.. */ 2795 - if (vma->vm_start >= end) 2796 - return 0; 2797 2634 2798 2635 /* 2799 2636 * If we need to split any vma, do it now to save pain later.