Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs

The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally
clear the info about page table entries. The following operations are
supported in this IOCTL:
- Scan the address range and get the memory ranges matching the provided
criteria. This is performed when the output buffer is specified.
- Write-protect the pages. The PM_SCAN_WP_MATCHING flag is used to
write-protect the pages of interest. The PM_SCAN_CHECK_WPASYNC flag aborts
the operation if non-async-write-protected pages are found. PM_SCAN_WP_MATCHING
can be used with or without PM_SCAN_CHECK_WPASYNC.
- Both of those operations can be combined into one atomic operation where
we can get and write protect the pages as well.

Following flags about pages are currently supported:
- PAGE_IS_WPALLOWED - Page has async-write-protection enabled
- PAGE_IS_WRITTEN - Page has been written to since it was last write-protected
- PAGE_IS_FILE - Page is file backed
- PAGE_IS_PRESENT - Page is present in the memory
- PAGE_IS_SWAPPED - Page is swapped out
- PAGE_IS_PFNZERO - Page has zero PFN
- PAGE_IS_HUGE - Page is THP or Hugetlb backed

This IOCTL can be extended to get information about more PTE bits. The
entire address range passed by user [start, end) is scanned until either
the user provided buffer is full or max_pages have been found.

[akpm@linux-foundation.org: update it for "mm: hugetlb: add huge page size param to set_huge_pte_at()"]
[akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n warning]
[arnd@arndb.de: hide unused pagemap_scan_backout_range() function]
Link: https://lkml.kernel.org/r/20230927060257.2975412-1-arnd@kernel.org
[sfr@canb.auug.org.au: fix "fs/proc/task_mmu: hide unused pagemap_scan_backout_range() function"]
Link: https://lkml.kernel.org/r/20230928092223.0625c6bf@canb.auug.org.au
Link: https://lkml.kernel.org/r/20230821141518.870589-3-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Miroslaw <emmir@google.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Paul Gofman <pgofman@codeweavers.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yun Zhou <yun.zhou@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Muhammad Usama Anjum and committed by
Andrew Morton
52526ca7 d61ea1cb

+762 -2
+692
fs/proc/task_mmu.c
··· 20 20 #include <linux/shmem_fs.h> 21 21 #include <linux/uaccess.h> 22 22 #include <linux/pkeys.h> 23 + #include <linux/minmax.h> 24 + #include <linux/overflow.h> 23 25 24 26 #include <asm/elf.h> 25 27 #include <asm/tlb.h> ··· 1763 1761 return 0; 1764 1762 } 1765 1763 1764 + #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ 1765 + PAGE_IS_FILE | PAGE_IS_PRESENT | \ 1766 + PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ 1767 + PAGE_IS_HUGE) 1768 + #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) 1769 + 1770 + struct pagemap_scan_private { 1771 + struct pm_scan_arg arg; 1772 + unsigned long masks_of_interest, cur_vma_category; 1773 + struct page_region *vec_buf; 1774 + unsigned long vec_buf_len, vec_buf_index, found_pages; 1775 + struct page_region __user *vec_out; 1776 + }; 1777 + 1778 + static unsigned long pagemap_page_category(struct pagemap_scan_private *p, 1779 + struct vm_area_struct *vma, 1780 + unsigned long addr, pte_t pte) 1781 + { 1782 + unsigned long categories = 0; 1783 + 1784 + if (pte_present(pte)) { 1785 + struct page *page; 1786 + 1787 + categories |= PAGE_IS_PRESENT; 1788 + if (!pte_uffd_wp(pte)) 1789 + categories |= PAGE_IS_WRITTEN; 1790 + 1791 + if (p->masks_of_interest & PAGE_IS_FILE) { 1792 + page = vm_normal_page(vma, addr, pte); 1793 + if (page && !PageAnon(page)) 1794 + categories |= PAGE_IS_FILE; 1795 + } 1796 + 1797 + if (is_zero_pfn(pte_pfn(pte))) 1798 + categories |= PAGE_IS_PFNZERO; 1799 + } else if (is_swap_pte(pte)) { 1800 + swp_entry_t swp; 1801 + 1802 + categories |= PAGE_IS_SWAPPED; 1803 + if (!pte_swp_uffd_wp_any(pte)) 1804 + categories |= PAGE_IS_WRITTEN; 1805 + 1806 + if (p->masks_of_interest & PAGE_IS_FILE) { 1807 + swp = pte_to_swp_entry(pte); 1808 + if (is_pfn_swap_entry(swp) && 1809 + !PageAnon(pfn_swap_entry_to_page(swp))) 1810 + categories |= PAGE_IS_FILE; 1811 + } 1812 + } 1813 + 1814 + return categories; 1815 + } 1816 + 1817 + static void make_uffd_wp_pte(struct vm_area_struct *vma, 1818 + 
unsigned long addr, pte_t *pte) 1819 + { 1820 + pte_t ptent = ptep_get(pte); 1821 + 1822 + if (pte_present(ptent)) { 1823 + pte_t old_pte; 1824 + 1825 + old_pte = ptep_modify_prot_start(vma, addr, pte); 1826 + ptent = pte_mkuffd_wp(ptent); 1827 + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); 1828 + } else if (is_swap_pte(ptent)) { 1829 + ptent = pte_swp_mkuffd_wp(ptent); 1830 + set_pte_at(vma->vm_mm, addr, pte, ptent); 1831 + } else { 1832 + set_pte_at(vma->vm_mm, addr, pte, 1833 + make_pte_marker(PTE_MARKER_UFFD_WP)); 1834 + } 1835 + } 1836 + 1837 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1838 + static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, 1839 + struct vm_area_struct *vma, 1840 + unsigned long addr, pmd_t pmd) 1841 + { 1842 + unsigned long categories = PAGE_IS_HUGE; 1843 + 1844 + if (pmd_present(pmd)) { 1845 + struct page *page; 1846 + 1847 + categories |= PAGE_IS_PRESENT; 1848 + if (!pmd_uffd_wp(pmd)) 1849 + categories |= PAGE_IS_WRITTEN; 1850 + 1851 + if (p->masks_of_interest & PAGE_IS_FILE) { 1852 + page = vm_normal_page_pmd(vma, addr, pmd); 1853 + if (page && !PageAnon(page)) 1854 + categories |= PAGE_IS_FILE; 1855 + } 1856 + 1857 + if (is_zero_pfn(pmd_pfn(pmd))) 1858 + categories |= PAGE_IS_PFNZERO; 1859 + } else if (is_swap_pmd(pmd)) { 1860 + swp_entry_t swp; 1861 + 1862 + categories |= PAGE_IS_SWAPPED; 1863 + if (!pmd_swp_uffd_wp(pmd)) 1864 + categories |= PAGE_IS_WRITTEN; 1865 + 1866 + if (p->masks_of_interest & PAGE_IS_FILE) { 1867 + swp = pmd_to_swp_entry(pmd); 1868 + if (is_pfn_swap_entry(swp) && 1869 + !PageAnon(pfn_swap_entry_to_page(swp))) 1870 + categories |= PAGE_IS_FILE; 1871 + } 1872 + } 1873 + 1874 + return categories; 1875 + } 1876 + 1877 + static void make_uffd_wp_pmd(struct vm_area_struct *vma, 1878 + unsigned long addr, pmd_t *pmdp) 1879 + { 1880 + pmd_t old, pmd = *pmdp; 1881 + 1882 + if (pmd_present(pmd)) { 1883 + old = pmdp_invalidate_ad(vma, addr, pmdp); 1884 + pmd = pmd_mkuffd_wp(old); 1885 + 
set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 1886 + } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { 1887 + pmd = pmd_swp_mkuffd_wp(pmd); 1888 + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 1889 + } 1890 + } 1891 + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1892 + 1893 + #ifdef CONFIG_HUGETLB_PAGE 1894 + static unsigned long pagemap_hugetlb_category(pte_t pte) 1895 + { 1896 + unsigned long categories = PAGE_IS_HUGE; 1897 + 1898 + /* 1899 + * According to pagemap_hugetlb_range(), file-backed HugeTLB 1900 + * page cannot be swapped. So PAGE_IS_FILE is not checked for 1901 + * swapped pages. 1902 + */ 1903 + if (pte_present(pte)) { 1904 + categories |= PAGE_IS_PRESENT; 1905 + if (!huge_pte_uffd_wp(pte)) 1906 + categories |= PAGE_IS_WRITTEN; 1907 + if (!PageAnon(pte_page(pte))) 1908 + categories |= PAGE_IS_FILE; 1909 + if (is_zero_pfn(pte_pfn(pte))) 1910 + categories |= PAGE_IS_PFNZERO; 1911 + } else if (is_swap_pte(pte)) { 1912 + categories |= PAGE_IS_SWAPPED; 1913 + if (!pte_swp_uffd_wp_any(pte)) 1914 + categories |= PAGE_IS_WRITTEN; 1915 + } 1916 + 1917 + return categories; 1918 + } 1919 + 1920 + static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, 1921 + unsigned long addr, pte_t *ptep, 1922 + pte_t ptent) 1923 + { 1924 + unsigned long psize; 1925 + 1926 + if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) 1927 + return; 1928 + 1929 + psize = huge_page_size(hstate_vma(vma)); 1930 + 1931 + if (is_hugetlb_entry_migration(ptent)) 1932 + set_huge_pte_at(vma->vm_mm, addr, ptep, 1933 + pte_swp_mkuffd_wp(ptent), psize); 1934 + else if (!huge_pte_none(ptent)) 1935 + huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, 1936 + huge_pte_mkuffd_wp(ptent)); 1937 + else 1938 + set_huge_pte_at(vma->vm_mm, addr, ptep, 1939 + make_pte_marker(PTE_MARKER_UFFD_WP), psize); 1940 + } 1941 + #endif /* CONFIG_HUGETLB_PAGE */ 1942 + 1943 + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) 1944 + static void pagemap_scan_backout_range(struct 
pagemap_scan_private *p, 1945 + unsigned long addr, unsigned long end) 1946 + { 1947 + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; 1948 + 1949 + if (cur_buf->start != addr) 1950 + cur_buf->end = addr; 1951 + else 1952 + cur_buf->start = cur_buf->end = 0; 1953 + 1954 + p->found_pages -= (end - addr) / PAGE_SIZE; 1955 + } 1956 + #endif 1957 + 1958 + static bool pagemap_scan_is_interesting_page(unsigned long categories, 1959 + const struct pagemap_scan_private *p) 1960 + { 1961 + categories ^= p->arg.category_inverted; 1962 + if ((categories & p->arg.category_mask) != p->arg.category_mask) 1963 + return false; 1964 + if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) 1965 + return false; 1966 + 1967 + return true; 1968 + } 1969 + 1970 + static bool pagemap_scan_is_interesting_vma(unsigned long categories, 1971 + const struct pagemap_scan_private *p) 1972 + { 1973 + unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; 1974 + 1975 + categories ^= p->arg.category_inverted; 1976 + if ((categories & required) != required) 1977 + return false; 1978 + 1979 + return true; 1980 + } 1981 + 1982 + static int pagemap_scan_test_walk(unsigned long start, unsigned long end, 1983 + struct mm_walk *walk) 1984 + { 1985 + struct pagemap_scan_private *p = walk->private; 1986 + struct vm_area_struct *vma = walk->vma; 1987 + unsigned long vma_category = 0; 1988 + 1989 + if (userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma)) 1990 + vma_category |= PAGE_IS_WPALLOWED; 1991 + else if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) 1992 + return -EPERM; 1993 + 1994 + if (vma->vm_flags & VM_PFNMAP) 1995 + return 1; 1996 + 1997 + if (!pagemap_scan_is_interesting_vma(vma_category, p)) 1998 + return 1; 1999 + 2000 + p->cur_vma_category = vma_category; 2001 + 2002 + return 0; 2003 + } 2004 + 2005 + static bool pagemap_scan_push_range(unsigned long categories, 2006 + struct pagemap_scan_private *p, 2007 + unsigned long addr, unsigned long 
end) 2008 + { 2009 + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; 2010 + 2011 + /* 2012 + * When there is no output buffer provided at all, the sentinel values 2013 + * won't match here. There is no other way for `cur_buf->end` to be 2014 + * non-zero other than it being non-empty. 2015 + */ 2016 + if (addr == cur_buf->end && categories == cur_buf->categories) { 2017 + cur_buf->end = end; 2018 + return true; 2019 + } 2020 + 2021 + if (cur_buf->end) { 2022 + if (p->vec_buf_index >= p->vec_buf_len - 1) 2023 + return false; 2024 + 2025 + cur_buf = &p->vec_buf[++p->vec_buf_index]; 2026 + } 2027 + 2028 + cur_buf->start = addr; 2029 + cur_buf->end = end; 2030 + cur_buf->categories = categories; 2031 + 2032 + return true; 2033 + } 2034 + 2035 + static int pagemap_scan_output(unsigned long categories, 2036 + struct pagemap_scan_private *p, 2037 + unsigned long addr, unsigned long *end) 2038 + { 2039 + unsigned long n_pages, total_pages; 2040 + int ret = 0; 2041 + 2042 + if (!p->vec_buf) 2043 + return 0; 2044 + 2045 + categories &= p->arg.return_mask; 2046 + 2047 + n_pages = (*end - addr) / PAGE_SIZE; 2048 + if (check_add_overflow(p->found_pages, n_pages, &total_pages) || 2049 + total_pages > p->arg.max_pages) { 2050 + size_t n_too_much = total_pages - p->arg.max_pages; 2051 + *end -= n_too_much * PAGE_SIZE; 2052 + n_pages -= n_too_much; 2053 + ret = -ENOSPC; 2054 + } 2055 + 2056 + if (!pagemap_scan_push_range(categories, p, addr, *end)) { 2057 + *end = addr; 2058 + n_pages = 0; 2059 + ret = -ENOSPC; 2060 + } 2061 + 2062 + p->found_pages += n_pages; 2063 + if (ret) 2064 + p->arg.walk_end = *end; 2065 + 2066 + return ret; 2067 + } 2068 + 2069 + static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, 2070 + unsigned long end, struct mm_walk *walk) 2071 + { 2072 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2073 + struct pagemap_scan_private *p = walk->private; 2074 + struct vm_area_struct *vma = walk->vma; 2075 + unsigned long categories; 2076 + spinlock_t 
*ptl; 2077 + int ret = 0; 2078 + 2079 + ptl = pmd_trans_huge_lock(pmd, vma); 2080 + if (!ptl) 2081 + return -ENOENT; 2082 + 2083 + categories = p->cur_vma_category | 2084 + pagemap_thp_category(p, vma, start, *pmd); 2085 + 2086 + if (!pagemap_scan_is_interesting_page(categories, p)) 2087 + goto out_unlock; 2088 + 2089 + ret = pagemap_scan_output(categories, p, start, &end); 2090 + if (start == end) 2091 + goto out_unlock; 2092 + 2093 + if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2094 + goto out_unlock; 2095 + if (~categories & PAGE_IS_WRITTEN) 2096 + goto out_unlock; 2097 + 2098 + /* 2099 + * Break huge page into small pages if the WP operation 2100 + * needs to be performed on a portion of the huge page. 2101 + */ 2102 + if (end != start + HPAGE_SIZE) { 2103 + spin_unlock(ptl); 2104 + split_huge_pmd(vma, pmd, start); 2105 + pagemap_scan_backout_range(p, start, end); 2106 + /* Report as if there was no THP */ 2107 + return -ENOENT; 2108 + } 2109 + 2110 + make_uffd_wp_pmd(vma, start, pmd); 2111 + flush_tlb_range(vma, start, end); 2112 + out_unlock: 2113 + spin_unlock(ptl); 2114 + return ret; 2115 + #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ 2116 + return -ENOENT; 2117 + #endif 2118 + } 2119 + 2120 + static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, 2121 + unsigned long end, struct mm_walk *walk) 2122 + { 2123 + struct pagemap_scan_private *p = walk->private; 2124 + struct vm_area_struct *vma = walk->vma; 2125 + unsigned long addr, flush_end = 0; 2126 + pte_t *pte, *start_pte; 2127 + spinlock_t *ptl; 2128 + int ret; 2129 + 2130 + arch_enter_lazy_mmu_mode(); 2131 + 2132 + ret = pagemap_scan_thp_entry(pmd, start, end, walk); 2133 + if (ret != -ENOENT) { 2134 + arch_leave_lazy_mmu_mode(); 2135 + return ret; 2136 + } 2137 + 2138 + ret = 0; 2139 + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); 2140 + if (!pte) { 2141 + arch_leave_lazy_mmu_mode(); 2142 + walk->action = ACTION_AGAIN; 2143 + return 0; 2144 + } 2145 + 2146 + for (addr = 
start; addr != end; pte++, addr += PAGE_SIZE) { 2147 + unsigned long categories = p->cur_vma_category | 2148 + pagemap_page_category(p, vma, addr, ptep_get(pte)); 2149 + unsigned long next = addr + PAGE_SIZE; 2150 + 2151 + if (!pagemap_scan_is_interesting_page(categories, p)) 2152 + continue; 2153 + 2154 + ret = pagemap_scan_output(categories, p, addr, &next); 2155 + if (next == addr) 2156 + break; 2157 + 2158 + if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2159 + continue; 2160 + if (~categories & PAGE_IS_WRITTEN) 2161 + continue; 2162 + 2163 + make_uffd_wp_pte(vma, addr, pte); 2164 + if (!flush_end) 2165 + start = addr; 2166 + flush_end = next; 2167 + } 2168 + 2169 + if (flush_end) 2170 + flush_tlb_range(vma, start, addr); 2171 + 2172 + pte_unmap_unlock(start_pte, ptl); 2173 + arch_leave_lazy_mmu_mode(); 2174 + 2175 + cond_resched(); 2176 + return ret; 2177 + } 2178 + 2179 + #ifdef CONFIG_HUGETLB_PAGE 2180 + static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, 2181 + unsigned long start, unsigned long end, 2182 + struct mm_walk *walk) 2183 + { 2184 + struct pagemap_scan_private *p = walk->private; 2185 + struct vm_area_struct *vma = walk->vma; 2186 + unsigned long categories; 2187 + spinlock_t *ptl; 2188 + int ret = 0; 2189 + pte_t pte; 2190 + 2191 + if (~p->arg.flags & PM_SCAN_WP_MATCHING) { 2192 + /* Go the short route when not write-protecting pages. 
*/ 2193 + 2194 + pte = huge_ptep_get(ptep); 2195 + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); 2196 + 2197 + if (!pagemap_scan_is_interesting_page(categories, p)) 2198 + return 0; 2199 + 2200 + return pagemap_scan_output(categories, p, start, &end); 2201 + } 2202 + 2203 + i_mmap_lock_write(vma->vm_file->f_mapping); 2204 + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); 2205 + 2206 + pte = huge_ptep_get(ptep); 2207 + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); 2208 + 2209 + if (!pagemap_scan_is_interesting_page(categories, p)) 2210 + goto out_unlock; 2211 + 2212 + ret = pagemap_scan_output(categories, p, start, &end); 2213 + if (start == end) 2214 + goto out_unlock; 2215 + 2216 + if (~categories & PAGE_IS_WRITTEN) 2217 + goto out_unlock; 2218 + 2219 + if (end != start + HPAGE_SIZE) { 2220 + /* Partial HugeTLB page WP isn't possible. */ 2221 + pagemap_scan_backout_range(p, start, end); 2222 + p->arg.walk_end = start; 2223 + ret = 0; 2224 + goto out_unlock; 2225 + } 2226 + 2227 + make_uffd_wp_huge_pte(vma, start, ptep, pte); 2228 + flush_hugetlb_tlb_range(vma, start, end); 2229 + 2230 + out_unlock: 2231 + spin_unlock(ptl); 2232 + i_mmap_unlock_write(vma->vm_file->f_mapping); 2233 + 2234 + return ret; 2235 + } 2236 + #else 2237 + #define pagemap_scan_hugetlb_entry NULL 2238 + #endif 2239 + 2240 + static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, 2241 + int depth, struct mm_walk *walk) 2242 + { 2243 + struct pagemap_scan_private *p = walk->private; 2244 + struct vm_area_struct *vma = walk->vma; 2245 + int ret, err; 2246 + 2247 + if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) 2248 + return 0; 2249 + 2250 + ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); 2251 + if (addr == end) 2252 + return ret; 2253 + 2254 + if (~p->arg.flags & PM_SCAN_WP_MATCHING) 2255 + return ret; 2256 + 2257 + err = uffd_wp_range(vma, addr, end - addr, true); 2258 + if (err < 0) 2259 + 
ret = err; 2260 + 2261 + return ret; 2262 + } 2263 + 2264 + static const struct mm_walk_ops pagemap_scan_ops = { 2265 + .test_walk = pagemap_scan_test_walk, 2266 + .pmd_entry = pagemap_scan_pmd_entry, 2267 + .pte_hole = pagemap_scan_pte_hole, 2268 + .hugetlb_entry = pagemap_scan_hugetlb_entry, 2269 + }; 2270 + 2271 + static int pagemap_scan_get_args(struct pm_scan_arg *arg, 2272 + unsigned long uarg) 2273 + { 2274 + if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) 2275 + return -EFAULT; 2276 + 2277 + if (arg->size != sizeof(struct pm_scan_arg)) 2278 + return -EINVAL; 2279 + 2280 + /* Validate requested features */ 2281 + if (arg->flags & ~PM_SCAN_FLAGS) 2282 + return -EINVAL; 2283 + if ((arg->category_inverted | arg->category_mask | 2284 + arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) 2285 + return -EINVAL; 2286 + 2287 + arg->start = untagged_addr((unsigned long)arg->start); 2288 + arg->end = untagged_addr((unsigned long)arg->end); 2289 + arg->vec = untagged_addr((unsigned long)arg->vec); 2290 + 2291 + /* Validate memory pointers */ 2292 + if (!IS_ALIGNED(arg->start, PAGE_SIZE)) 2293 + return -EINVAL; 2294 + if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) 2295 + return -EFAULT; 2296 + if (!arg->vec && arg->vec_len) 2297 + return -EINVAL; 2298 + if (arg->vec && !access_ok((void __user *)(long)arg->vec, 2299 + arg->vec_len * sizeof(struct page_region))) 2300 + return -EFAULT; 2301 + 2302 + /* Fixup default values */ 2303 + arg->end = ALIGN(arg->end, PAGE_SIZE); 2304 + arg->walk_end = 0; 2305 + if (!arg->max_pages) 2306 + arg->max_pages = ULONG_MAX; 2307 + 2308 + return 0; 2309 + } 2310 + 2311 + static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, 2312 + unsigned long uargl) 2313 + { 2314 + struct pm_scan_arg __user *uarg = (void __user *)uargl; 2315 + 2316 + if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) 2317 + return -EFAULT; 2318 + 2319 + return 0; 2320 + } 2321 + 2322 + 
static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) 2323 + { 2324 + if (!p->arg.vec_len) 2325 + return 0; 2326 + 2327 + p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, 2328 + p->arg.vec_len); 2329 + p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), 2330 + GFP_KERNEL); 2331 + if (!p->vec_buf) 2332 + return -ENOMEM; 2333 + 2334 + p->vec_buf->start = p->vec_buf->end = 0; 2335 + p->vec_out = (struct page_region __user *)(long)p->arg.vec; 2336 + 2337 + return 0; 2338 + } 2339 + 2340 + static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) 2341 + { 2342 + const struct page_region *buf = p->vec_buf; 2343 + long n = p->vec_buf_index; 2344 + 2345 + if (!p->vec_buf) 2346 + return 0; 2347 + 2348 + if (buf[n].end != buf[n].start) 2349 + n++; 2350 + 2351 + if (!n) 2352 + return 0; 2353 + 2354 + if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) 2355 + return -EFAULT; 2356 + 2357 + p->arg.vec_len -= n; 2358 + p->vec_out += n; 2359 + 2360 + p->vec_buf_index = 0; 2361 + p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); 2362 + p->vec_buf->start = p->vec_buf->end = 0; 2363 + 2364 + return n; 2365 + } 2366 + 2367 + static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) 2368 + { 2369 + struct mmu_notifier_range range; 2370 + struct pagemap_scan_private p = {0}; 2371 + unsigned long walk_start; 2372 + size_t n_ranges_out = 0; 2373 + int ret; 2374 + 2375 + ret = pagemap_scan_get_args(&p.arg, uarg); 2376 + if (ret) 2377 + return ret; 2378 + 2379 + p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | 2380 + p.arg.return_mask; 2381 + ret = pagemap_scan_init_bounce_buffer(&p); 2382 + if (ret) 2383 + return ret; 2384 + 2385 + /* Protection change for the range is going to happen. 
*/ 2386 + if (p.arg.flags & PM_SCAN_WP_MATCHING) { 2387 + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, 2388 + mm, p.arg.start, p.arg.end); 2389 + mmu_notifier_invalidate_range_start(&range); 2390 + } 2391 + 2392 + for (walk_start = p.arg.start; walk_start < p.arg.end; 2393 + walk_start = p.arg.walk_end) { 2394 + long n_out; 2395 + 2396 + if (fatal_signal_pending(current)) { 2397 + ret = -EINTR; 2398 + break; 2399 + } 2400 + 2401 + ret = mmap_read_lock_killable(mm); 2402 + if (ret) 2403 + break; 2404 + ret = walk_page_range(mm, walk_start, p.arg.end, 2405 + &pagemap_scan_ops, &p); 2406 + mmap_read_unlock(mm); 2407 + 2408 + n_out = pagemap_scan_flush_buffer(&p); 2409 + if (n_out < 0) 2410 + ret = n_out; 2411 + else 2412 + n_ranges_out += n_out; 2413 + 2414 + if (ret != -ENOSPC) 2415 + break; 2416 + 2417 + if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) 2418 + break; 2419 + } 2420 + 2421 + /* ENOSPC signifies early stop (buffer full) from the walk. */ 2422 + if (!ret || ret == -ENOSPC) 2423 + ret = n_ranges_out; 2424 + 2425 + /* The walk_end isn't set when ret is zero */ 2426 + if (!p.arg.walk_end) 2427 + p.arg.walk_end = p.arg.end; 2428 + if (pagemap_scan_writeback_args(&p.arg, uarg)) 2429 + ret = -EFAULT; 2430 + 2431 + if (p.arg.flags & PM_SCAN_WP_MATCHING) 2432 + mmu_notifier_invalidate_range_end(&range); 2433 + 2434 + kfree(p.vec_buf); 2435 + return ret; 2436 + } 2437 + 2438 + static long do_pagemap_cmd(struct file *file, unsigned int cmd, 2439 + unsigned long arg) 2440 + { 2441 + struct mm_struct *mm = file->private_data; 2442 + 2443 + switch (cmd) { 2444 + case PAGEMAP_SCAN: 2445 + return do_pagemap_scan(mm, arg); 2446 + 2447 + default: 2448 + return -EINVAL; 2449 + } 2450 + } 2451 + 1766 2452 const struct file_operations proc_pagemap_operations = { 1767 2453 .llseek = mem_lseek, /* borrow this */ 1768 2454 .read = pagemap_read, 1769 2455 .open = pagemap_open, 1770 2456 .release = pagemap_release, 2457 + .unlocked_ioctl = 
do_pagemap_cmd, 2458 + .compat_ioctl = do_pagemap_cmd, 1771 2459 }; 1772 2460 #endif /* CONFIG_PROC_PAGE_MONITOR */ 1773 2461
+1
include/linux/hugetlb.h
··· 280 280 unsigned long cp_flags); 281 281 282 282 bool is_hugetlb_entry_migration(pte_t pte); 283 + bool is_hugetlb_entry_hwpoisoned(pte_t pte); 283 284 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); 284 285 285 286 #else /* !CONFIG_HUGETLB_PAGE */
+7
include/linux/userfaultfd_k.h
··· 221 221 return VM_FAULT_SIGBUS; 222 222 } 223 223 224 + static inline long uffd_wp_range(struct vm_area_struct *vma, 225 + unsigned long start, unsigned long len, 226 + bool enable_wp) 227 + { 228 + return false; 229 + } 230 + 224 231 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, 225 232 struct vm_userfaultfd_ctx vm_ctx) 226 233 {
+59
include/uapi/linux/fs.h
··· 305 305 #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ 306 306 RWF_APPEND) 307 307 308 + /* Pagemap ioctl */ 309 + #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) 310 + 311 + /* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */ 312 + #define PAGE_IS_WPALLOWED (1 << 0) 313 + #define PAGE_IS_WRITTEN (1 << 1) 314 + #define PAGE_IS_FILE (1 << 2) 315 + #define PAGE_IS_PRESENT (1 << 3) 316 + #define PAGE_IS_SWAPPED (1 << 4) 317 + #define PAGE_IS_PFNZERO (1 << 5) 318 + #define PAGE_IS_HUGE (1 << 6) 319 + 320 + /* 321 + * struct page_region - Page region with flags 322 + * @start: Start of the region 323 + * @end: End of the region (exclusive) 324 + * @categories: PAGE_IS_* category bitmask for the region 325 + */ 326 + struct page_region { 327 + __u64 start; 328 + __u64 end; 329 + __u64 categories; 330 + }; 331 + 332 + /* Flags for PAGEMAP_SCAN ioctl */ 333 + #define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ 334 + #define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ 335 + 336 + /* 337 + * struct pm_scan_arg - Pagemap ioctl argument 338 + * @size: Size of the structure 339 + * @flags: Flags for the IOCTL 340 + * @start: Starting address of the region 341 + * @end: Ending address of the region 342 + * @walk_end Address where the scan stopped (written by kernel). 343 + * walk_end == end (address tags cleared) informs that the scan completed on entire range. 
344 + * @vec: Address of page_region struct array for output 345 + * @vec_len: Length of the page_region struct array 346 + * @max_pages: Optional limit for number of returned pages (0 = disabled) 347 + * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 348 + * @category_mask: Skip pages for which any category doesn't match 349 + * @category_anyof_mask: Skip pages for which no category matches 350 + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned 351 + */ 352 + struct pm_scan_arg { 353 + __u64 size; 354 + __u64 flags; 355 + __u64 start; 356 + __u64 end; 357 + __u64 walk_end; 358 + __u64 vec; 359 + __u64 vec_len; 360 + __u64 max_pages; 361 + __u64 category_inverted; 362 + __u64 category_mask; 363 + __u64 category_anyof_mask; 364 + __u64 return_mask; 365 + }; 366 + 308 367 #endif /* _UAPI_LINUX_FS_H */
+3 -2
mm/hugetlb.c
··· 5044 5044 return false; 5045 5045 } 5046 5046 5047 - static bool is_hugetlb_entry_hwpoisoned(pte_t pte) 5047 + bool is_hugetlb_entry_hwpoisoned(pte_t pte) 5048 5048 { 5049 5049 swp_entry_t swp; 5050 5050 ··· 6266 6266 } 6267 6267 6268 6268 entry = huge_pte_clear_uffd_wp(entry); 6269 - set_huge_pte_at(mm, haddr, ptep, entry); 6269 + set_huge_pte_at(mm, haddr, ptep, entry, 6270 + huge_page_size(hstate_vma(vma))); 6270 6271 /* Fallthrough to CoW */ 6271 6272 } 6272 6273