Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc: Use 64k pages without needing cache-inhibited large pages

Some POWER5+ machines can do 64k hardware pages for normal memory but
not for cache-inhibited pages. This patch lets us use 64k hardware
pages for most user processes on such machines (assuming the kernel
has been configured with CONFIG_PPC_64K_PAGES=y). User processes
start out using 64k pages and get switched to 4k pages if they use any
non-cacheable mappings.

With this, we use 64k pages for the vmalloc region and 4k pages for
the imalloc region. If anything creates a non-cacheable mapping in
the vmalloc region, the vmalloc region will get switched to 4k pages.
I don't know of any driver other than the DRM that would do this,
though, and these machines don't have AGP.

When a region gets switched from 64k pages to 4k pages, we do not have
to clear out all the 64k HPTEs from the hash table immediately. We
use the _PAGE_COMBO bit in the Linux PTE to indicate whether the page
was hashed in as a 64k page or a set of 4k pages. If hash_page is
trying to insert a 4k page for a Linux PTE and sees that the PTE has
already been inserted as a 64k page, it first invalidates the 64k HPTE
before inserting the 4k HPTE. The hash invalidation routines also use
the _PAGE_COMBO bit, to determine whether to look for a 64k HPTE or a
set of 4k HPTEs to remove. With those two changes, we can tolerate a
mix of 4k and 64k HPTEs in the hash table, and they will all get
removed when the address space is torn down.

Signed-off-by: Paul Mackerras <paulus@samba.org>

+160 -39
+2
arch/powerpc/kernel/asm-offsets.c
··· 122 122 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); 123 123 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); 124 124 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); 125 + DEFINE(PACACONTEXTSLLP, offsetof(struct paca_struct, context.sllp)); 126 + DEFINE(PACAVMALLOCSLLP, offsetof(struct paca_struct, vmalloc_sllp)); 125 127 #ifdef CONFIG_HUGETLB_PAGE 126 128 DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas)); 127 129 DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas));
+3
arch/powerpc/kernel/prom.c
··· 948 948 {CPU_FTR_CTRL, 0, 0, 3, 0}, 949 949 {CPU_FTR_NOEXECUTE, 0, 0, 6, 0}, 950 950 {CPU_FTR_NODSISRALIGN, 0, 1, 1, 1}, 951 + #if 0 952 + /* put this back once we know how to test if firmware does 64k IO */ 951 953 {CPU_FTR_CI_LARGE_PAGE, 0, 1, 2, 0}, 954 + #endif 952 955 }; 953 956 954 957 static void __init check_cpu_pa_features(unsigned long node)
+28
arch/powerpc/mm/hash_low_64.S
··· 369 369 rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ 370 370 or r30,r30,r31 371 371 ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE 372 + oris r30,r30,_PAGE_COMBO@h 372 373 /* Write the linux PTE atomically (setting busy) */ 373 374 stdcx. r30,0,r6 374 375 bne- 1b ··· 429 428 andi. r0,r31,_PAGE_HASHPTE 430 429 li r26,0 /* Default hidx */ 431 430 beq htab_insert_pte 431 + 432 + /* 433 + * Check if the pte was already inserted into the hash table 434 + * as a 64k HW page, and invalidate the 64k HPTE if so. 435 + */ 436 + andis. r0,r31,_PAGE_COMBO@h 437 + beq htab_inval_old_hpte 438 + 432 439 ld r6,STK_PARM(r6)(r1) 433 440 ori r26,r6,0x8000 /* Load the hidx mask */ 434 441 ld r26,0(r26) ··· 507 498 /* Try all again */ 508 499 b htab_insert_pte 509 500 501 + /* 502 + * Call out to C code to invalidate an 64k HW HPTE that is 503 + * useless now that the segment has been switched to 4k pages. 504 + */ 505 + htab_inval_old_hpte: 506 + mr r3,r29 /* virtual addr */ 507 + mr r4,r31 /* PTE.pte */ 508 + li r5,0 /* PTE.hidx */ 509 + li r6,MMU_PAGE_64K /* psize */ 510 + ld r7,STK_PARM(r8)(r1) /* local */ 511 + bl .flush_hash_page 512 + b htab_insert_pte 513 + 510 514 htab_bail_ok: 511 515 li r3,0 512 516 b htab_bail ··· 660 638 * is changing this PTE anyway and might hash it. 661 639 */ 662 640 bne- ht64_bail_ok 641 + BEGIN_FTR_SECTION 642 + /* Check if PTE has the cache-inhibit bit set */ 643 + andi. r0,r31,_PAGE_NO_CACHE 644 + /* If so, bail out and refault as a 4k page */ 645 + bne- ht64_bail_ok 646 + END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE) 663 647 /* Prepare new PTE value (turn access RW into DIRTY, then 664 648 * add BUSY,HASHPTE and ACCESSED) 665 649 */
+75 -9
arch/powerpc/mm/hash_utils_64.c
··· 92 92 unsigned long htab_hash_mask; 93 93 int mmu_linear_psize = MMU_PAGE_4K; 94 94 int mmu_virtual_psize = MMU_PAGE_4K; 95 + int mmu_vmalloc_psize = MMU_PAGE_4K; 96 + int mmu_io_psize = MMU_PAGE_4K; 95 97 #ifdef CONFIG_HUGETLB_PAGE 96 98 int mmu_huge_psize = MMU_PAGE_16M; 97 99 unsigned int HPAGE_SHIFT; 100 + #endif 101 + #ifdef CONFIG_PPC_64K_PAGES 102 + int mmu_ci_restrictions; 98 103 #endif 99 104 100 105 /* There are definitions of page sizes arrays to be used when none ··· 313 308 else if (mmu_psize_defs[MMU_PAGE_1M].shift) 314 309 mmu_linear_psize = MMU_PAGE_1M; 315 310 311 + #ifdef CONFIG_PPC_64K_PAGES 316 312 /* 317 313 * Pick a size for the ordinary pages. Default is 4K, we support 318 - * 64K if cache inhibited large pages are supported by the 319 - * processor 314 + * 64K for user mappings and vmalloc if supported by the processor. 315 + * We only use 64k for ioremap if the processor 316 + * (and firmware) support cache-inhibited large pages. 317 + * If not, we use 4k and set mmu_ci_restrictions so that 318 + * hash_page knows to switch processes that use cache-inhibited 319 + * mappings to 4k pages. 320 320 */ 321 - #ifdef CONFIG_PPC_64K_PAGES 322 - if (mmu_psize_defs[MMU_PAGE_64K].shift && 323 - cpu_has_feature(CPU_FTR_CI_LARGE_PAGE)) 321 + if (mmu_psize_defs[MMU_PAGE_64K].shift) { 324 322 mmu_virtual_psize = MMU_PAGE_64K; 323 + mmu_vmalloc_psize = MMU_PAGE_64K; 324 + if (cpu_has_feature(CPU_FTR_CI_LARGE_PAGE)) 325 + mmu_io_psize = MMU_PAGE_64K; 326 + else 327 + mmu_ci_restrictions = 1; 328 + } 325 329 #endif 326 330 327 - printk(KERN_DEBUG "Page orders: linear mapping = %d, others = %d\n", 331 + printk(KERN_DEBUG "Page orders: linear mapping = %d, " 332 + "virtual = %d, io = %d\n", 328 333 mmu_psize_defs[mmu_linear_psize].shift, 329 - mmu_psize_defs[mmu_virtual_psize].shift); 334 + mmu_psize_defs[mmu_virtual_psize].shift, 335 + mmu_psize_defs[mmu_io_psize].shift); 330 336 331 337 #ifdef CONFIG_HUGETLB_PAGE 332 338 /* Init large page size. 
Currently, we pick 16M or 1M depending ··· 572 556 pte_t *ptep; 573 557 cpumask_t tmp; 574 558 int rc, user_region = 0, local = 0; 559 + int psize; 575 560 576 561 DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", 577 562 ea, access, trap); ··· 592 575 return 1; 593 576 } 594 577 vsid = get_vsid(mm->context.id, ea); 578 + psize = mm->context.user_psize; 595 579 break; 596 580 case VMALLOC_REGION_ID: 597 581 mm = &init_mm; 598 582 vsid = get_kernel_vsid(ea); 583 + if (ea < VMALLOC_END) 584 + psize = mmu_vmalloc_psize; 585 + else 586 + psize = mmu_io_psize; 599 587 break; 600 588 default: 601 589 /* Not a valid range ··· 651 629 #ifndef CONFIG_PPC_64K_PAGES 652 630 rc = __hash_page_4K(ea, access, vsid, ptep, trap, local); 653 631 #else 654 - if (mmu_virtual_psize == MMU_PAGE_64K) 632 + if (mmu_ci_restrictions) { 633 + /* If this PTE is non-cacheable, switch to 4k */ 634 + if (psize == MMU_PAGE_64K && 635 + (pte_val(*ptep) & _PAGE_NO_CACHE)) { 636 + if (user_region) { 637 + psize = MMU_PAGE_4K; 638 + mm->context.user_psize = MMU_PAGE_4K; 639 + mm->context.sllp = SLB_VSID_USER | 640 + mmu_psize_defs[MMU_PAGE_4K].sllp; 641 + } else if (ea < VMALLOC_END) { 642 + /* 643 + * some driver did a non-cacheable mapping 644 + * in vmalloc space, so switch vmalloc 645 + * to 4k pages 646 + */ 647 + printk(KERN_ALERT "Reducing vmalloc segment " 648 + "to 4kB pages because of " 649 + "non-cacheable mapping\n"); 650 + psize = mmu_vmalloc_psize = MMU_PAGE_4K; 651 + } 652 + } 653 + if (user_region) { 654 + if (psize != get_paca()->context.user_psize) { 655 + get_paca()->context = mm->context; 656 + slb_flush_and_rebolt(); 657 + } 658 + } else if (get_paca()->vmalloc_sllp != 659 + mmu_psize_defs[mmu_vmalloc_psize].sllp) { 660 + get_paca()->vmalloc_sllp = 661 + mmu_psize_defs[mmu_vmalloc_psize].sllp; 662 + slb_flush_and_rebolt(); 663 + } 664 + } 665 + if (psize == MMU_PAGE_64K) 655 666 rc = __hash_page_64K(ea, access, vsid, ptep, trap, local); 656 667 else 657 668 rc = 
__hash_page_4K(ea, access, vsid, ptep, trap, local); ··· 736 681 #ifndef CONFIG_PPC_64K_PAGES 737 682 __hash_page_4K(ea, access, vsid, ptep, trap, local); 738 683 #else 739 - if (mmu_virtual_psize == MMU_PAGE_64K) 684 + if (mmu_ci_restrictions) { 685 + /* If this PTE is non-cacheable, switch to 4k */ 686 + if (mm->context.user_psize == MMU_PAGE_64K && 687 + (pte_val(*ptep) & _PAGE_NO_CACHE)) { 688 + mm->context.user_psize = MMU_PAGE_4K; 689 + mm->context.sllp = SLB_VSID_USER | 690 + mmu_psize_defs[MMU_PAGE_4K].sllp; 691 + get_paca()->context = mm->context; 692 + slb_flush_and_rebolt(); 693 + } 694 + } 695 + if (mm->context.user_psize == MMU_PAGE_64K) 740 696 __hash_page_64K(ea, access, vsid, ptep, trap, local); 741 697 else 742 698 __hash_page_4K(ea, access, vsid, ptep, trap, local);
+3
arch/powerpc/mm/mmu_context_64.c
··· 49 49 } 50 50 51 51 mm->context.id = index; 52 + mm->context.user_psize = mmu_virtual_psize; 53 + mm->context.sllp = SLB_VSID_USER | 54 + mmu_psize_defs[mmu_virtual_psize].sllp; 52 55 53 56 return 0; 54 57 }
+14 -15
arch/powerpc/mm/slb.c
··· 60 60 : "memory" ); 61 61 } 62 62 63 - static void slb_flush_and_rebolt(void) 63 + void slb_flush_and_rebolt(void) 64 64 { 65 65 /* If you change this make sure you change SLB_NUM_BOLTED 66 66 * appropriately too. */ 67 - unsigned long linear_llp, virtual_llp, lflags, vflags; 67 + unsigned long linear_llp, vmalloc_llp, lflags, vflags; 68 68 unsigned long ksp_esid_data; 69 69 70 70 WARN_ON(!irqs_disabled()); 71 71 72 72 linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; 73 - virtual_llp = mmu_psize_defs[mmu_virtual_psize].sllp; 73 + vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; 74 74 lflags = SLB_VSID_KERNEL | linear_llp; 75 - vflags = SLB_VSID_KERNEL | virtual_llp; 75 + vflags = SLB_VSID_KERNEL | vmalloc_llp; 76 76 77 77 ksp_esid_data = mk_esid_data(get_paca()->kstack, 2); 78 78 if ((ksp_esid_data & ESID_MASK) == PAGE_OFFSET) ··· 164 164 165 165 void slb_initialize(void) 166 166 { 167 - unsigned long linear_llp, virtual_llp; 167 + unsigned long linear_llp, vmalloc_llp, io_llp; 168 168 static int slb_encoding_inited; 169 169 extern unsigned int *slb_miss_kernel_load_linear; 170 - extern unsigned int *slb_miss_kernel_load_virtual; 171 - extern unsigned int *slb_miss_user_load_normal; 170 + extern unsigned int *slb_miss_kernel_load_io; 172 171 #ifdef CONFIG_HUGETLB_PAGE 173 172 extern unsigned int *slb_miss_user_load_huge; 174 173 unsigned long huge_llp; ··· 177 178 178 179 /* Prepare our SLB miss handler based on our page size */ 179 180 linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; 180 - virtual_llp = mmu_psize_defs[mmu_virtual_psize].sllp; 181 + io_llp = mmu_psize_defs[mmu_io_psize].sllp; 182 + vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; 183 + get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp; 184 + 181 185 if (!slb_encoding_inited) { 182 186 slb_encoding_inited = 1; 183 187 patch_slb_encoding(slb_miss_kernel_load_linear, 184 188 SLB_VSID_KERNEL | linear_llp); 185 - patch_slb_encoding(slb_miss_kernel_load_virtual, 186 - 
SLB_VSID_KERNEL | virtual_llp); 187 - patch_slb_encoding(slb_miss_user_load_normal, 188 - SLB_VSID_USER | virtual_llp); 189 + patch_slb_encoding(slb_miss_kernel_load_io, 190 + SLB_VSID_KERNEL | io_llp); 189 191 190 192 DBG("SLB: linear LLP = %04x\n", linear_llp); 191 - DBG("SLB: virtual LLP = %04x\n", virtual_llp); 193 + DBG("SLB: io LLP = %04x\n", io_llp); 192 194 #ifdef CONFIG_HUGETLB_PAGE 193 195 patch_slb_encoding(slb_miss_user_load_huge, 194 196 SLB_VSID_USER | huge_llp); ··· 204 204 unsigned long lflags, vflags; 205 205 206 206 lflags = SLB_VSID_KERNEL | linear_llp; 207 - vflags = SLB_VSID_KERNEL | virtual_llp; 207 + vflags = SLB_VSID_KERNEL | vmalloc_llp; 208 208 209 209 /* Invalidate the entire SLB (even slot 0) & all the ERATS */ 210 210 asm volatile("isync":::"memory"); ··· 212 212 asm volatile("isync; slbia; isync":::"memory"); 213 213 create_slbe(PAGE_OFFSET, lflags, 0); 214 214 215 - /* VMALLOC space has 4K pages always for now */ 216 215 create_slbe(VMALLOC_START, vflags, 1); 217 216 218 217 /* We don't bolt the stack for the time being - we're in boot,
+12 -5
arch/powerpc/mm/slb_low.S
··· 59 59 li r11,0 60 60 b slb_finish_load 61 61 62 - 1: /* vmalloc/ioremap mapping encoding bits, the "li" instruction below 62 + 1: /* vmalloc/ioremap mapping encoding bits, the "li" instructions below 63 63 * will be patched by the kernel at boot 64 64 */ 65 - _GLOBAL(slb_miss_kernel_load_virtual) 65 + BEGIN_FTR_SECTION 66 + /* check whether this is in vmalloc or ioremap space */ 67 + clrldi r11,r10,48 68 + cmpldi r11,(VMALLOC_SIZE >> 28) - 1 69 + bgt 5f 70 + lhz r11,PACAVMALLOCSLLP(r13) 71 + b slb_finish_load 72 + 5: 73 + END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE) 74 + _GLOBAL(slb_miss_kernel_load_io) 66 75 li r11,0 67 76 b slb_finish_load 68 77 ··· 105 96 1: 106 97 #endif /* CONFIG_HUGETLB_PAGE */ 107 98 108 - _GLOBAL(slb_miss_user_load_normal) 109 - li r11,0 110 - 99 + lhz r11,PACACONTEXTSLLP(r13) 111 100 2: 112 101 ld r9,PACACONTEXTID(r13) 113 102 rldimi r10,r9,USER_ESID_BITS,0
+3 -2
arch/powerpc/mm/tlb_64.c
··· 131 131 { 132 132 struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); 133 133 unsigned long vsid; 134 - unsigned int psize = mmu_virtual_psize; 134 + unsigned int psize; 135 135 int i; 136 136 137 137 i = batch->index; ··· 148 148 #else 149 149 BUG(); 150 150 #endif 151 - } 151 + } else 152 + psize = pte_pagesize_index(pte); 152 153 153 154 /* 154 155 * This can happen when we are in the middle of a TLB batch and
+13
include/asm-powerpc/mmu.h
··· 165 165 extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; 166 166 extern int mmu_linear_psize; 167 167 extern int mmu_virtual_psize; 168 + extern int mmu_vmalloc_psize; 169 + extern int mmu_io_psize; 170 + 171 + /* 172 + * If the processor supports 64k normal pages but not 64k cache 173 + * inhibited pages, we have to be prepared to switch processes 174 + * to use 4k pages when they create cache-inhibited mappings. 175 + * If this is the case, mmu_ci_restrictions will be set to 1. 176 + */ 177 + extern int mmu_ci_restrictions; 168 178 169 179 #ifdef CONFIG_HUGETLB_PAGE 170 180 /* ··· 266 256 267 257 extern void stabs_alloc(void); 268 258 extern void slb_initialize(void); 259 + extern void slb_flush_and_rebolt(void); 269 260 extern void stab_initialize(unsigned long stab); 270 261 271 262 #endif /* __ASSEMBLY__ */ ··· 370 359 371 360 typedef struct { 372 361 mm_context_id_t id; 362 + u16 user_psize; /* page size index */ 363 + u16 sllp; /* SLB entry page size encoding */ 373 364 #ifdef CONFIG_HUGETLB_PAGE 374 365 u16 low_htlb_areas, high_htlb_areas; 375 366 #endif
+1
include/asm-powerpc/paca.h
··· 81 81 * on the linear mapping */ 82 82 83 83 mm_context_t context; 84 + u16 vmalloc_sllp; 84 85 u16 slb_cache[SLB_CACHE_ENTRIES]; 85 86 u16 slb_cache_ptr; 86 87
+2
include/asm-powerpc/pgtable-4k.h
··· 78 78 79 79 #define pte_iterate_hashed_end() } while(0) 80 80 81 + #define pte_pagesize_index(pte) MMU_PAGE_4K 82 + 81 83 /* 82 84 * 4-level page tables related bits 83 85 */
+2
include/asm-powerpc/pgtable-64k.h
··· 90 90 91 91 #define pte_iterate_hashed_end() } while(0); } } while(0) 92 92 93 + #define pte_pagesize_index(pte) \ 94 + (((pte) & _PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K) 93 95 94 96 #endif /* __ASSEMBLY__ */ 95 97 #endif /* __KERNEL__ */
+2 -8
include/asm-powerpc/pgtable.h
··· 47 47 /* 48 48 * Define the address range of the vmalloc VM area. 49 49 */ 50 - #define VMALLOC_START (0xD000000000000000ul) 51 - #define VMALLOC_SIZE (0x80000000000UL) 50 + #define VMALLOC_START ASM_CONST(0xD000000000000000) 51 + #define VMALLOC_SIZE ASM_CONST(0x80000000000) 52 52 #define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) 53 53 54 54 /* ··· 413 413 flush_tlb_pending(); 414 414 } 415 415 pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); 416 - 417 - #ifdef CONFIG_PPC_64K_PAGES 418 - if (mmu_virtual_psize != MMU_PAGE_64K) 419 - pte = __pte(pte_val(pte) | _PAGE_COMBO); 420 - #endif /* CONFIG_PPC_64K_PAGES */ 421 - 422 416 *ptep = pte; 423 417 } 424 418