Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm/gup: Make gup_get_pte() more generic

In order to write another lockless page-table walker, we need
gup_get_pte() exposed. While doing that, rename it to
ptep_get_lockless() to match the existing ptep_get() naming.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201126121121.036370527@infradead.org

+56 -57
+55
include/linux/pgtable.h
··· 258 258 } 259 259 #endif 260 260 261 + #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH 262 + /* 263 + * WARNING: only to be used in the get_user_pages_fast() implementation. 264 + * 265 + * With get_user_pages_fast(), we walk down the pagetables without taking any 266 + * locks. For this we would like to load the pointers atomically, but sometimes 267 + * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What 268 + * we do have is the guarantee that a PTE will only either go from not present 269 + * to present, or present to not present or both -- it will not switch to a 270 + * completely different present page without a TLB flush in between; something 271 + * that we are blocking by holding interrupts off. 272 + * 273 + * Setting ptes from not present to present goes: 274 + * 275 + * ptep->pte_high = h; 276 + * smp_wmb(); 277 + * ptep->pte_low = l; 278 + * 279 + * And present to not present goes: 280 + * 281 + * ptep->pte_low = 0; 282 + * smp_wmb(); 283 + * ptep->pte_high = 0; 284 + * 285 + * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. 286 + * We load pte_high *after* loading pte_low, which ensures we don't see an older 287 + * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't 288 + * picked up a changed pte high. We might have gotten rubbish values from 289 + * pte_low and pte_high, but we are guaranteed that pte_low will not have the 290 + * present bit set *unless* it is 'l'. Because get_user_pages_fast() only 291 + * operates on present ptes we're safe. 292 + */ 293 + static inline pte_t ptep_get_lockless(pte_t *ptep) 294 + { 295 + pte_t pte; 296 + 297 + do { 298 + pte.pte_low = ptep->pte_low; 299 + smp_rmb(); 300 + pte.pte_high = ptep->pte_high; 301 + smp_rmb(); 302 + } while (unlikely(pte.pte_low != ptep->pte_low)); 303 + 304 + return pte; 305 + } 306 + #else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ 307 + /* 308 + * We require that the PTE can be read atomically. 
309 + */ 310 + static inline pte_t ptep_get_lockless(pte_t *ptep) 311 + { 312 + return ptep_get(ptep); 313 + } 314 + #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ 315 + 261 316 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 262 317 #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR 263 318 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+1 -57
mm/gup.c
··· 2085 2085 put_page(page); 2086 2086 } 2087 2087 2088 - #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH 2089 - 2090 - /* 2091 - * WARNING: only to be used in the get_user_pages_fast() implementation. 2092 - * 2093 - * With get_user_pages_fast(), we walk down the pagetables without taking any 2094 - * locks. For this we would like to load the pointers atomically, but sometimes 2095 - * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What 2096 - * we do have is the guarantee that a PTE will only either go from not present 2097 - * to present, or present to not present or both -- it will not switch to a 2098 - * completely different present page without a TLB flush in between; something 2099 - * that we are blocking by holding interrupts off. 2100 - * 2101 - * Setting ptes from not present to present goes: 2102 - * 2103 - * ptep->pte_high = h; 2104 - * smp_wmb(); 2105 - * ptep->pte_low = l; 2106 - * 2107 - * And present to not present goes: 2108 - * 2109 - * ptep->pte_low = 0; 2110 - * smp_wmb(); 2111 - * ptep->pte_high = 0; 2112 - * 2113 - * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. 2114 - * We load pte_high *after* loading pte_low, which ensures we don't see an older 2115 - * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't 2116 - * picked up a changed pte high. We might have gotten rubbish values from 2117 - * pte_low and pte_high, but we are guaranteed that pte_low will not have the 2118 - * present bit set *unless* it is 'l'. Because get_user_pages_fast() only 2119 - * operates on present ptes we're safe. 
2120 - */ 2121 - static inline pte_t gup_get_pte(pte_t *ptep) 2122 - { 2123 - pte_t pte; 2124 - 2125 - do { 2126 - pte.pte_low = ptep->pte_low; 2127 - smp_rmb(); 2128 - pte.pte_high = ptep->pte_high; 2129 - smp_rmb(); 2130 - } while (unlikely(pte.pte_low != ptep->pte_low)); 2131 - 2132 - return pte; 2133 - } 2134 - #else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ 2135 - /* 2136 - * We require that the PTE can be read atomically. 2137 - */ 2138 - static inline pte_t gup_get_pte(pte_t *ptep) 2139 - { 2140 - return ptep_get(ptep); 2141 - } 2142 - #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ 2143 - 2144 2088 static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, 2145 2089 unsigned int flags, 2146 2090 struct page **pages) ··· 2110 2166 2111 2167 ptem = ptep = pte_offset_map(&pmd, addr); 2112 2168 do { 2113 - pte_t pte = gup_get_pte(ptep); 2169 + pte_t pte = ptep_get_lockless(ptep); 2114 2170 struct page *head, *page; 2115 2171 2116 2172 /*