Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm/gup: fix gup_fast with dynamic page table folding

Currently, to make sure that every page table entry is read just once,
gup_fast walks perform READ_ONCE and pass the pXd value down to the next
gup_pXd_range function by value, e.g.:

static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
...
pudp = pud_offset(&p4d, addr);

This function passes a reference on that local value copy to pXd_offset,
and might get the very same pointer in return. This happens when the
level is folded (on most arches), and that pointer should not be
iterated.

On s390, due to the fact that each task might have a different 5-, 4- or
3-level address translation and hence different levels folded, the logic
is more complex, and a non-iterable pointer to a local copy leads to
severe problems.

Here is an example of what happens with gup_fast on s390, for a task
with 3-level paging, crossing a 2 GB pud boundary:

// addr = 0x1007ffff000, end = 0x10080001000
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;

// pud_offset returns &p4d itself (a pointer to a value on stack)
pudp = pud_offset(&p4d, addr);
do {
// on the second iteration this reads a "random" stack value
pud_t pud = READ_ONCE(*pudp);

// next = 0x10080000000, due to PUD_SIZE/MASK != PGDIR_SIZE/MASK on s390
next = pud_addr_end(addr, end);
...
} while (pudp++, addr = next, addr != end); // pudp++ iterating over stack

return 1;
}

This happens since s390 moved to common gup code with commit
d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust") and
commit 1a42010cdc26 ("s390/mm: convert to the generic
get_user_pages_fast code").

s390 tried to mimic static level folding by changing pXd_offset
primitives to always calculate top level page table offset in pgd_offset
and just return the value passed when pXd_offset has to act as folded.

What is crucial for gup_fast and what has been overlooked is that
PxD_SIZE/MASK and thus pXd_addr_end should also change correspondingly.
And the latter is not possible with dynamic folding.

To fix the issue in addition to pXd values pass original pXdp pointers
down to gup_pXd_range functions. And introduce pXd_offset_lockless
helpers, which take an additional pXd entry value parameter. This has
already been discussed in

https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1

Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast code")
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: <stable@vger.kernel.org> [5.2+]
Link: https://lkml.kernel.org/r/patch.git-943f1e5dcff2.your-ad-here.call-01599856292-ext-8676@work.hours
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Vasily Gorbik, committed by Linus Torvalds.
Commit d3f7b1bb (parent 8d3fe09d).

+49 -21
+30 -12
arch/s390/include/asm/pgtable.h
···
 #define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address)

-static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address)
 {
-	if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
-		return (p4d_t *) pgd_deref(*pgd) + p4d_index(address);
-	return (p4d_t *) pgd;
+	if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
+		return (p4d_t *) pgd_deref(pgd) + p4d_index(address);
+	return (p4d_t *) pgdp;
+}
+#define p4d_offset_lockless p4d_offset_lockless
+
+static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address)
+{
+	return p4d_offset_lockless(pgdp, *pgdp, address);
 }

-static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address)
 {
-	if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
-		return (pud_t *) p4d_deref(*p4d) + pud_index(address);
-	return (pud_t *) p4d;
+	if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
+		return (pud_t *) p4d_deref(p4d) + pud_index(address);
+	return (pud_t *) p4dp;
+}
+#define pud_offset_lockless pud_offset_lockless
+
+static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address)
+{
+	return pud_offset_lockless(p4dp, *p4dp, address);
 }
 #define pud_offset pud_offset

-static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+static inline pmd_t *pmd_offset_lockless(pud_t *pudp, pud_t pud, unsigned long address)
 {
-	if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
-		return (pmd_t *) pud_deref(*pud) + pmd_index(address);
-	return (pmd_t *) pud;
+	if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
+		return (pmd_t *) pud_deref(pud) + pmd_index(address);
+	return (pmd_t *) pudp;
+}
+#define pmd_offset_lockless pmd_offset_lockless
+
+static inline pmd_t *pmd_offset(pud_t *pudp, unsigned long address)
+{
+	return pmd_offset_lockless(pudp, *pudp, address);
 }
 #define pmd_offset pmd_offset
+10
include/linux/pgtable.h
···
 #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED)
 #endif

+#ifndef p4d_offset_lockless
+#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
+#endif
+#ifndef pud_offset_lockless
+#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
+#endif
+#ifndef pmd_offset_lockless
+#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
+#endif
+
 /*
  * p?d_leaf() - true if this entry is a final mapping to a physical address.
  * This differs from p?d_huge() by the fact that they are always available (if
+9 -9
mm/gup.c
···
 	return 1;
 }

-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
 		unsigned int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pmd_t *pmdp;

-	pmdp = pmd_offset(&pud, addr);
+	pmdp = pmd_offset_lockless(pudp, pud, addr);
 	do {
 		pmd_t pmd = READ_ONCE(*pmdp);
···
 	return 1;
 }

-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
+static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
 		unsigned int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pud_t *pudp;

-	pudp = pud_offset(&p4d, addr);
+	pudp = pud_offset_lockless(p4dp, p4d, addr);
 	do {
 		pud_t pud = READ_ONCE(*pudp);
···
 			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
 					 PUD_SHIFT, next, flags, pages, nr))
 				return 0;
-		} else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
+		} else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
 			return 0;
 	} while (pudp++, addr = next, addr != end);

 	return 1;
 }

-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
+static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
 		unsigned int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	p4d_t *p4dp;

-	p4dp = p4d_offset(&pgd, addr);
+	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
 	do {
 		p4d_t p4d = READ_ONCE(*p4dp);
···
 			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
 					 P4D_SHIFT, next, flags, pages, nr))
 				return 0;
-		} else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
+		} else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
 			return 0;
 	} while (p4dp++, addr = next, addr != end);
···
 		if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
 				 PGDIR_SHIFT, next, flags, pages, nr))
 			return;
-	} else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
+	} else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
 		return;
 	} while (pgdp++, addr = next, addr != end);
 }