Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: lift the x86_32 PAE version of gup_get_pte to common code

The split low/high access is the only non-READ_ONCE version of gup_get_pte
that did show up in the various arch implementations. Lift it to common
code and drop the ifdef based arch override.

Link: http://lkml.kernel.org/r/20190625143715.1689-4-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Miller <davem@davemloft.net>
Cc: James Hogan <jhogan@kernel.org>
Cc: Khalid Aziz <khalid.aziz@oracle.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Christoph Hellwig and committed by
Linus Torvalds
39656e83 26f4c328

+53 -53
+1
arch/x86/Kconfig
··· 123 123 select GENERIC_STRNLEN_USER 124 124 select GENERIC_TIME_VSYSCALL 125 125 select GENERIC_GETTIMEOFDAY 126 + select GUP_GET_PTE_LOW_HIGH if X86_PAE 126 127 select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 127 128 select HAVE_ACPI_APEI if ACPI 128 129 select HAVE_ACPI_APEI_NMI if ACPI
-47
arch/x86/include/asm/pgtable-3level.h
··· 285 285 #define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \ 286 286 __pteval_swp_offset(pte))) 287 287 288 - #define gup_get_pte gup_get_pte 289 - /* 290 - * WARNING: only to be used in the get_user_pages_fast() implementation. 291 - * 292 - * With get_user_pages_fast(), we walk down the pagetables without taking 293 - * any locks. For this we would like to load the pointers atomically, 294 - * but that is not possible (without expensive cmpxchg8b) on PAE. What 295 - * we do have is the guarantee that a PTE will only either go from not 296 - * present to present, or present to not present or both -- it will not 297 - * switch to a completely different present page without a TLB flush in 298 - * between; something that we are blocking by holding interrupts off. 299 - * 300 - * Setting ptes from not present to present goes: 301 - * 302 - * ptep->pte_high = h; 303 - * smp_wmb(); 304 - * ptep->pte_low = l; 305 - * 306 - * And present to not present goes: 307 - * 308 - * ptep->pte_low = 0; 309 - * smp_wmb(); 310 - * ptep->pte_high = 0; 311 - * 312 - * We must ensure here that the load of pte_low sees 'l' iff pte_high 313 - * sees 'h'. We load pte_high *after* loading pte_low, which ensures we 314 - * don't see an older value of pte_high. *Then* we recheck pte_low, 315 - * which ensures that we haven't picked up a changed pte high. We might 316 - * have gotten rubbish values from pte_low and pte_high, but we are 317 - * guaranteed that pte_low will not have the present bit set *unless* 318 - * it is 'l'. Because get_user_pages_fast() only operates on present ptes 319 - * we're safe. 
320 - */ 321 - static inline pte_t gup_get_pte(pte_t *ptep) 322 - { 323 - pte_t pte; 324 - 325 - do { 326 - pte.pte_low = ptep->pte_low; 327 - smp_rmb(); 328 - pte.pte_high = ptep->pte_high; 329 - smp_rmb(); 330 - } while (unlikely(pte.pte_low != ptep->pte_low)); 331 - 332 - return pte; 333 - } 334 - 335 288 #include <asm/pgtable-invert.h> 336 289 337 290 #endif /* _ASM_X86_PGTABLE_3LEVEL_H */
+1 -1
arch/x86/kvm/mmu.c
··· 650 650 651 651 /* 652 652 * The idea using the light way get the spte on x86_32 guest is from 653 - * gup_get_pte(arch/x86/mm/gup.c). 653 + * gup_get_pte (mm/gup.c). 654 654 * 655 655 * An spte tlb flush may be pending, because kvm_set_pte_rmapp 656 656 * coalesces them and we are running out of the MMU lock. Therefore
+3
mm/Kconfig
··· 762 762 763 763 See tools/testing/selftests/vm/gup_benchmark.c 764 764 765 + config GUP_GET_PTE_LOW_HIGH 766 + bool 767 + 765 768 config ARCH_HAS_PTE_SPECIAL 766 769 bool 767 770
+48 -5
mm/gup.c
··· 1684 1684 * This code is based heavily on the PowerPC implementation by Nick Piggin. 1685 1685 */ 1686 1686 #ifdef CONFIG_HAVE_GENERIC_GUP 1687 - 1688 - #ifndef gup_get_pte 1687 + #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH 1689 1688 /* 1690 - * We assume that the PTE can be read atomically. If this is not the case for 1691 - * your architecture, please provide the helper. 1689 + * WARNING: only to be used in the get_user_pages_fast() implementation. 1690 + * 1691 + * With get_user_pages_fast(), we walk down the pagetables without taking any 1692 + * locks. For this we would like to load the pointers atomically, but sometimes 1693 + * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What 1694 + * we do have is the guarantee that a PTE will only either go from not present 1695 + * to present, or present to not present or both -- it will not switch to a 1696 + * completely different present page without a TLB flush in between; something 1697 + * that we are blocking by holding interrupts off. 1698 + * 1699 + * Setting ptes from not present to present goes: 1700 + * 1701 + * ptep->pte_high = h; 1702 + * smp_wmb(); 1703 + * ptep->pte_low = l; 1704 + * 1705 + * And present to not present goes: 1706 + * 1707 + * ptep->pte_low = 0; 1708 + * smp_wmb(); 1709 + * ptep->pte_high = 0; 1710 + * 1711 + * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. 1712 + * We load pte_high *after* loading pte_low, which ensures we don't see an older 1713 + * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't 1714 + * picked up a changed pte high. We might have gotten rubbish values from 1715 + * pte_low and pte_high, but we are guaranteed that pte_low will not have the 1716 + * present bit set *unless* it is 'l'. Because get_user_pages_fast() only 1717 + * operates on present ptes we're safe. 
1718 + */ 1719 + static inline pte_t gup_get_pte(pte_t *ptep) 1720 + { 1721 + pte_t pte; 1722 + 1723 + do { 1724 + pte.pte_low = ptep->pte_low; 1725 + smp_rmb(); 1726 + pte.pte_high = ptep->pte_high; 1727 + smp_rmb(); 1728 + } while (unlikely(pte.pte_low != ptep->pte_low)); 1729 + 1730 + return pte; 1731 + } 1732 + #else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ 1733 + /* 1734 + * We require that the PTE can be read atomically. 1692 1735 */ 1693 1736 static inline pte_t gup_get_pte(pte_t *ptep) 1694 1737 { 1695 1738 return READ_ONCE(*ptep); 1696 1739 } 1697 - #endif 1740 + #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ 1698 1741 1699 1742 static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) 1700 1743 {