Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/mm: THP is only available on hash64 as of now

Only code movement in this patch. No functionality change.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

Authored by Aneesh Kumar K.V and committed by Michael Ellerman
6a1ea362 c0a6c719

+369 -372
+11 -13
arch/powerpc/include/asm/book3s/64/pgtable.h
··· 777 777 #endif 778 778 struct page *realmode_pfn_to_page(unsigned long pfn); 779 779 780 - #ifdef CONFIG_TRANSPARENT_HUGEPAGE 781 - extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); 782 - extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); 783 - extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); 784 - extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, 785 - pmd_t *pmdp, pmd_t pmd); 786 - extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, 787 - pmd_t *pmd); 788 - extern int has_transparent_hugepage(void); 789 - #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 790 - 791 - 792 780 static inline pte_t pmd_pte(pmd_t pmd) 793 781 { 794 782 return __pte(pmd_val(pmd)); ··· 791 803 { 792 804 return (pte_t *)pmd; 793 805 } 794 - 795 806 #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) 796 807 #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) 797 808 #define pmd_young(pmd) pte_young(pmd_pte(pmd)) ··· 816 829 817 830 #define __HAVE_ARCH_PMD_WRITE 818 831 #define pmd_write(pmd) pte_write(pmd_pte(pmd)) 832 + 833 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 834 + extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); 835 + extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); 836 + extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); 837 + extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, 838 + pmd_t *pmdp, pmd_t pmd); 839 + extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, 840 + pmd_t *pmd); 841 + extern int has_transparent_hugepage(void); 819 842 820 843 static inline pmd_t pmd_mkhuge(pmd_t pmd) 821 844 { ··· 875 878 */ 876 879 return true; 877 880 } 881 + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 878 882 #endif /* __ASSEMBLY__ */ 879 883 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
+358
arch/powerpc/mm/pgtable-hash64.c
··· 15 15 16 16 #include "mmu_decl.h" 17 17 18 + #define CREATE_TRACE_POINTS 19 + #include <trace/events/thp.h> 20 + 18 21 #ifdef CONFIG_SPARSEMEM_VMEMMAP 19 22 /* 20 23 * On hash-based CPUs, the vmemmap is bolted in the hash table. ··· 96 93 smp_wmb(); 97 94 return 0; 98 95 } 96 + 97 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 98 + 99 + /* 100 + * This is called when relaxing access to a hugepage. It's also called in the page 101 + * fault path when we don't hit any of the major fault cases, ie, a minor 102 + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have 103 + * handled those two for us, we additionally deal with missing execute 104 + * permission here on some processors 105 + */ 106 + int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, 107 + pmd_t *pmdp, pmd_t entry, int dirty) 108 + { 109 + int changed; 110 + #ifdef CONFIG_DEBUG_VM 111 + WARN_ON(!pmd_trans_huge(*pmdp)); 112 + assert_spin_locked(&vma->vm_mm->page_table_lock); 113 + #endif 114 + changed = !pmd_same(*(pmdp), entry); 115 + if (changed) { 116 + __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); 117 + /* 118 + * Since we are not supporting SW TLB systems, we don't 119 + * have any thing similar to flush_tlb_page_nohash() 120 + */ 121 + } 122 + return changed; 123 + } 124 + 125 + unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, 126 + pmd_t *pmdp, unsigned long clr, 127 + unsigned long set) 128 + { 129 + __be64 old_be, tmp; 130 + unsigned long old; 131 + 132 + #ifdef CONFIG_DEBUG_VM 133 + WARN_ON(!pmd_trans_huge(*pmdp)); 134 + assert_spin_locked(&mm->page_table_lock); 135 + #endif 136 + 137 + __asm__ __volatile__( 138 + "1: ldarx %0,0,%3\n\ 139 + and. %1,%0,%6\n\ 140 + bne- 1b \n\ 141 + andc %1,%0,%4 \n\ 142 + or %1,%1,%7\n\ 143 + stdcx. 
%1,0,%3 \n\ 144 + bne- 1b" 145 + : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) 146 + : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), 147 + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) 148 + : "cc" ); 149 + 150 + old = be64_to_cpu(old_be); 151 + 152 + trace_hugepage_update(addr, old, clr, set); 153 + if (old & H_PAGE_HASHPTE) 154 + hpte_do_hugepage_flush(mm, addr, pmdp, old); 155 + return old; 156 + } 157 + 158 + pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, 159 + pmd_t *pmdp) 160 + { 161 + pmd_t pmd; 162 + 163 + VM_BUG_ON(address & ~HPAGE_PMD_MASK); 164 + VM_BUG_ON(pmd_trans_huge(*pmdp)); 165 + 166 + pmd = *pmdp; 167 + pmd_clear(pmdp); 168 + /* 169 + * Wait for all pending hash_page to finish. This is needed 170 + * in case of subpage collapse. When we collapse normal pages 171 + * to hugepage, we first clear the pmd, then invalidate all 172 + * the PTE entries. The assumption here is that any low level 173 + * page fault will see a none pmd and take the slow path that 174 + * will wait on mmap_sem. But we could very well be in a 175 + * hash_page with local ptep pointer value. Such a hash page 176 + * can result in adding new HPTE entries for normal subpages. 177 + * That means we could be modifying the page content as we 178 + * copy them to a huge page. So wait for parallel hash_page 179 + * to finish before invalidating HPTE entries. We can do this 180 + * by sending an IPI to all the cpus and executing a dummy 181 + * function there. 182 + */ 183 + kick_all_cpus_sync(); 184 + /* 185 + * Now invalidate the hpte entries in the range 186 + * covered by pmd. This make sure we take a 187 + * fault and will find the pmd as none, which will 188 + * result in a major fault which takes mmap_sem and 189 + * hence wait for collapse to complete. Without this 190 + * the __collapse_huge_page_copy can result in copying 191 + * the old content. 
192 + */ 193 + flush_tlb_pmd_range(vma->vm_mm, &pmd, address); 194 + return pmd; 195 + } 196 + 197 + /* 198 + * We currently remove entries from the hashtable regardless of whether 199 + * the entry was young or dirty. 200 + * 201 + * We should be more intelligent about this but for the moment we override 202 + * these functions and force a tlb flush unconditionally 203 + */ 204 + int pmdp_test_and_clear_young(struct vm_area_struct *vma, 205 + unsigned long address, pmd_t *pmdp) 206 + { 207 + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); 208 + } 209 + 210 + /* 211 + * We want to put the pgtable in pmd and use pgtable for tracking 212 + * the base page size hptes 213 + */ 214 + void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 215 + pgtable_t pgtable) 216 + { 217 + pgtable_t *pgtable_slot; 218 + assert_spin_locked(&mm->page_table_lock); 219 + /* 220 + * we store the pgtable in the second half of PMD 221 + */ 222 + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; 223 + *pgtable_slot = pgtable; 224 + /* 225 + * expose the deposited pgtable to other cpus. 226 + * before we set the hugepage PTE at pmd level 227 + * hash fault code looks at the deposted pgtable 228 + * to store hash index values. 229 + */ 230 + smp_wmb(); 231 + } 232 + 233 + pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) 234 + { 235 + pgtable_t pgtable; 236 + pgtable_t *pgtable_slot; 237 + 238 + assert_spin_locked(&mm->page_table_lock); 239 + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; 240 + pgtable = *pgtable_slot; 241 + /* 242 + * Once we withdraw, mark the entry NULL. 243 + */ 244 + *pgtable_slot = NULL; 245 + /* 246 + * We store HPTE information in the deposited PTE fragment. 247 + * zero out the content on withdraw. 
248 + */ 249 + memset(pgtable, 0, PTE_FRAG_SIZE); 250 + return pgtable; 251 + } 252 + 253 + void pmdp_huge_split_prepare(struct vm_area_struct *vma, 254 + unsigned long address, pmd_t *pmdp) 255 + { 256 + VM_BUG_ON(address & ~HPAGE_PMD_MASK); 257 + VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); 258 + 259 + /* 260 + * We can't mark the pmd none here, because that will cause a race 261 + * against exit_mmap. We need to continue mark pmd TRANS HUGE, while 262 + * we spilt, but at the same time we wan't rest of the ppc64 code 263 + * not to insert hash pte on this, because we will be modifying 264 + * the deposited pgtable in the caller of this function. Hence 265 + * clear the _PAGE_USER so that we move the fault handling to 266 + * higher level function and that will serialize against ptl. 267 + * We need to flush existing hash pte entries here even though, 268 + * the translation is still valid, because we will withdraw 269 + * pgtable_t after this. 270 + */ 271 + pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); 272 + } 273 + 274 + 275 + /* 276 + * set a new huge pmd. We should not be called for updating 277 + * an existing pmd entry. That should go via pmd_hugepage_update. 278 + */ 279 + void set_pmd_at(struct mm_struct *mm, unsigned long addr, 280 + pmd_t *pmdp, pmd_t pmd) 281 + { 282 + #ifdef CONFIG_DEBUG_VM 283 + WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); 284 + assert_spin_locked(&mm->page_table_lock); 285 + WARN_ON(!pmd_trans_huge(pmd)); 286 + #endif 287 + trace_hugepage_set_pmd(addr, pmd_val(pmd)); 288 + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); 289 + } 290 + 291 + /* 292 + * We use this to invalidate a pmdp entry before switching from a 293 + * hugepte to regular pmd entry. 
294 + */ 295 + void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, 296 + pmd_t *pmdp) 297 + { 298 + pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); 299 + 300 + /* 301 + * This ensures that generic code that rely on IRQ disabling 302 + * to prevent a parallel THP split work as expected. 303 + */ 304 + kick_all_cpus_sync(); 305 + } 306 + 307 + /* 308 + * A linux hugepage PMD was changed and the corresponding hash table entries 309 + * neesd to be flushed. 310 + */ 311 + void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, 312 + pmd_t *pmdp, unsigned long old_pmd) 313 + { 314 + int ssize; 315 + unsigned int psize; 316 + unsigned long vsid; 317 + unsigned long flags = 0; 318 + const struct cpumask *tmp; 319 + 320 + /* get the base page size,vsid and segment size */ 321 + #ifdef CONFIG_DEBUG_VM 322 + psize = get_slice_psize(mm, addr); 323 + BUG_ON(psize == MMU_PAGE_16M); 324 + #endif 325 + if (old_pmd & H_PAGE_COMBO) 326 + psize = MMU_PAGE_4K; 327 + else 328 + psize = MMU_PAGE_64K; 329 + 330 + if (!is_kernel_addr(addr)) { 331 + ssize = user_segment_size(addr); 332 + vsid = get_vsid(mm->context.id, addr, ssize); 333 + WARN_ON(vsid == 0); 334 + } else { 335 + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); 336 + ssize = mmu_kernel_ssize; 337 + } 338 + 339 + tmp = cpumask_of(smp_processor_id()); 340 + if (cpumask_equal(mm_cpumask(mm), tmp)) 341 + flags |= HPTE_LOCAL_UPDATE; 342 + 343 + return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); 344 + } 345 + 346 + static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) 347 + { 348 + return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); 349 + } 350 + 351 + pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) 352 + { 353 + unsigned long pmdv; 354 + 355 + pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; 356 + return pmd_set_protbits(__pmd(pmdv), pgprot); 357 + } 358 + 359 + pmd_t mk_pmd(struct page *page, pgprot_t pgprot) 360 + { 361 + return pfn_pmd(page_to_pfn(page), 
pgprot); 362 + } 363 + 364 + pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) 365 + { 366 + unsigned long pmdv; 367 + 368 + pmdv = pmd_val(pmd); 369 + pmdv &= _HPAGE_CHG_MASK; 370 + return pmd_set_protbits(__pmd(pmdv), newprot); 371 + } 372 + 373 + /* 374 + * This is called at the end of handling a user page fault, when the 375 + * fault has been handled by updating a HUGE PMD entry in the linux page tables. 376 + * We use it to preload an HPTE into the hash table corresponding to 377 + * the updated linux HUGE PMD entry. 378 + */ 379 + void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, 380 + pmd_t *pmd) 381 + { 382 + return; 383 + } 384 + 385 + pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, 386 + unsigned long addr, pmd_t *pmdp) 387 + { 388 + pmd_t old_pmd; 389 + pgtable_t pgtable; 390 + unsigned long old; 391 + pgtable_t *pgtable_slot; 392 + 393 + old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); 394 + old_pmd = __pmd(old); 395 + /* 396 + * We have pmd == none and we are holding page_table_lock. 397 + * So we can safely go and clear the pgtable hash 398 + * index info. 399 + */ 400 + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; 401 + pgtable = *pgtable_slot; 402 + /* 403 + * Let's zero out old valid and hash index details 404 + * hash fault look at them. 405 + */ 406 + memset(pgtable, 0, PTE_FRAG_SIZE); 407 + /* 408 + * Serialize against find_linux_pte_or_hugepte which does lock-less 409 + * lookup in page tables with local interrupts disabled. For huge pages 410 + * it casts pmd_t to pte_t. Since format of pte_t is different from 411 + * pmd_t we want to prevent transit from pmd pointing to page table 412 + * to pmd pointing to huge page (and back) while interrupts are disabled. 413 + * We clear pmd to possibly replace it with page table pointer in 414 + * different code paths. So make sure we wait for the parallel 415 + * find_linux_pte_or_hugepage to finish. 
416 + */ 417 + kick_all_cpus_sync(); 418 + return old_pmd; 419 + } 420 + 421 + int has_transparent_hugepage(void) 422 + { 423 + 424 + if (!mmu_has_feature(MMU_FTR_16M_PAGE)) 425 + return 0; 426 + /* 427 + * We support THP only if PMD_SIZE is 16MB. 428 + */ 429 + if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) 430 + return 0; 431 + /* 432 + * We need to make sure that we support 16MB hugepage in a segement 433 + * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE 434 + * of 64K. 435 + */ 436 + /* 437 + * If we have 64K HPTE, we will be using that by default 438 + */ 439 + if (mmu_psize_defs[MMU_PAGE_64K].shift && 440 + (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) 441 + return 0; 442 + /* 443 + * Ok we only have 4K HPTE 444 + */ 445 + if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) 446 + return 0; 447 + 448 + return 1; 449 + } 450 + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-359
arch/powerpc/mm/pgtable_64.c
··· 55 55 56 56 #include "mmu_decl.h" 57 57 58 - #define CREATE_TRACE_POINTS 59 - #include <trace/events/thp.h> 60 - 61 58 #ifdef CONFIG_PPC_STD_MMU_64 62 59 #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) 63 60 #error TASK_SIZE_USER64 exceeds user VSID range ··· 432 435 } 433 436 } 434 437 #endif 435 - 436 - #ifdef CONFIG_TRANSPARENT_HUGEPAGE 437 - 438 - /* 439 - * This is called when relaxing access to a hugepage. It's also called in the page 440 - * fault path when we don't hit any of the major fault cases, ie, a minor 441 - * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have 442 - * handled those two for us, we additionally deal with missing execute 443 - * permission here on some processors 444 - */ 445 - int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, 446 - pmd_t *pmdp, pmd_t entry, int dirty) 447 - { 448 - int changed; 449 - #ifdef CONFIG_DEBUG_VM 450 - WARN_ON(!pmd_trans_huge(*pmdp)); 451 - assert_spin_locked(&vma->vm_mm->page_table_lock); 452 - #endif 453 - changed = !pmd_same(*(pmdp), entry); 454 - if (changed) { 455 - __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); 456 - /* 457 - * Since we are not supporting SW TLB systems, we don't 458 - * have any thing similar to flush_tlb_page_nohash() 459 - */ 460 - } 461 - return changed; 462 - } 463 - 464 - unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, 465 - pmd_t *pmdp, unsigned long clr, 466 - unsigned long set) 467 - { 468 - 469 - __be64 old_be, tmp; 470 - unsigned long old; 471 - 472 - #ifdef CONFIG_DEBUG_VM 473 - WARN_ON(!pmd_trans_huge(*pmdp)); 474 - assert_spin_locked(&mm->page_table_lock); 475 - #endif 476 - 477 - __asm__ __volatile__( 478 - "1: ldarx %0,0,%3\n\ 479 - and. %1,%0,%6\n\ 480 - bne- 1b \n\ 481 - andc %1,%0,%4 \n\ 482 - or %1,%1,%7\n\ 483 - stdcx. 
%1,0,%3 \n\ 484 - bne- 1b" 485 - : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) 486 - : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), 487 - "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) 488 - : "cc" ); 489 - 490 - old = be64_to_cpu(old_be); 491 - 492 - trace_hugepage_update(addr, old, clr, set); 493 - if (old & H_PAGE_HASHPTE) 494 - hpte_do_hugepage_flush(mm, addr, pmdp, old); 495 - return old; 496 - } 497 - 498 - pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, 499 - pmd_t *pmdp) 500 - { 501 - pmd_t pmd; 502 - 503 - VM_BUG_ON(address & ~HPAGE_PMD_MASK); 504 - VM_BUG_ON(pmd_trans_huge(*pmdp)); 505 - 506 - pmd = *pmdp; 507 - pmd_clear(pmdp); 508 - /* 509 - * Wait for all pending hash_page to finish. This is needed 510 - * in case of subpage collapse. When we collapse normal pages 511 - * to hugepage, we first clear the pmd, then invalidate all 512 - * the PTE entries. The assumption here is that any low level 513 - * page fault will see a none pmd and take the slow path that 514 - * will wait on mmap_sem. But we could very well be in a 515 - * hash_page with local ptep pointer value. Such a hash page 516 - * can result in adding new HPTE entries for normal subpages. 517 - * That means we could be modifying the page content as we 518 - * copy them to a huge page. So wait for parallel hash_page 519 - * to finish before invalidating HPTE entries. We can do this 520 - * by sending an IPI to all the cpus and executing a dummy 521 - * function there. 522 - */ 523 - kick_all_cpus_sync(); 524 - /* 525 - * Now invalidate the hpte entries in the range 526 - * covered by pmd. This make sure we take a 527 - * fault and will find the pmd as none, which will 528 - * result in a major fault which takes mmap_sem and 529 - * hence wait for collapse to complete. Without this 530 - * the __collapse_huge_page_copy can result in copying 531 - * the old content. 
532 - */ 533 - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); 534 - return pmd; 535 - } 536 - 537 - /* 538 - * We currently remove entries from the hashtable regardless of whether 539 - * the entry was young or dirty. 540 - * 541 - * We should be more intelligent about this but for the moment we override 542 - * these functions and force a tlb flush unconditionally 543 - */ 544 - int pmdp_test_and_clear_young(struct vm_area_struct *vma, 545 - unsigned long address, pmd_t *pmdp) 546 - { 547 - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); 548 - } 549 - 550 - /* 551 - * We want to put the pgtable in pmd and use pgtable for tracking 552 - * the base page size hptes 553 - */ 554 - void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 555 - pgtable_t pgtable) 556 - { 557 - pgtable_t *pgtable_slot; 558 - assert_spin_locked(&mm->page_table_lock); 559 - /* 560 - * we store the pgtable in the second half of PMD 561 - */ 562 - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; 563 - *pgtable_slot = pgtable; 564 - /* 565 - * expose the deposited pgtable to other cpus. 566 - * before we set the hugepage PTE at pmd level 567 - * hash fault code looks at the deposted pgtable 568 - * to store hash index values. 569 - */ 570 - smp_wmb(); 571 - } 572 - 573 - pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) 574 - { 575 - pgtable_t pgtable; 576 - pgtable_t *pgtable_slot; 577 - 578 - assert_spin_locked(&mm->page_table_lock); 579 - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; 580 - pgtable = *pgtable_slot; 581 - /* 582 - * Once we withdraw, mark the entry NULL. 583 - */ 584 - *pgtable_slot = NULL; 585 - /* 586 - * We store HPTE information in the deposited PTE fragment. 587 - * zero out the content on withdraw. 
588 - */ 589 - memset(pgtable, 0, PTE_FRAG_SIZE); 590 - return pgtable; 591 - } 592 - 593 - void pmdp_huge_split_prepare(struct vm_area_struct *vma, 594 - unsigned long address, pmd_t *pmdp) 595 - { 596 - VM_BUG_ON(address & ~HPAGE_PMD_MASK); 597 - VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); 598 - 599 - /* 600 - * We can't mark the pmd none here, because that will cause a race 601 - * against exit_mmap. We need to continue mark pmd TRANS HUGE, while 602 - * we spilt, but at the same time we wan't rest of the ppc64 code 603 - * not to insert hash pte on this, because we will be modifying 604 - * the deposited pgtable in the caller of this function. Hence 605 - * clear the _PAGE_USER so that we move the fault handling to 606 - * higher level function and that will serialize against ptl. 607 - * We need to flush existing hash pte entries here even though, 608 - * the translation is still valid, because we will withdraw 609 - * pgtable_t after this. 610 - */ 611 - pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); 612 - } 613 - 614 - 615 - /* 616 - * set a new huge pmd. We should not be called for updating 617 - * an existing pmd entry. That should go via pmd_hugepage_update. 618 - */ 619 - void set_pmd_at(struct mm_struct *mm, unsigned long addr, 620 - pmd_t *pmdp, pmd_t pmd) 621 - { 622 - #ifdef CONFIG_DEBUG_VM 623 - WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); 624 - assert_spin_locked(&mm->page_table_lock); 625 - WARN_ON(!pmd_trans_huge(pmd)); 626 - #endif 627 - trace_hugepage_set_pmd(addr, pmd_val(pmd)); 628 - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); 629 - } 630 - 631 - /* 632 - * We use this to invalidate a pmdp entry before switching from a 633 - * hugepte to regular pmd entry. 
634 - */ 635 - void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, 636 - pmd_t *pmdp) 637 - { 638 - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); 639 - 640 - /* 641 - * This ensures that generic code that rely on IRQ disabling 642 - * to prevent a parallel THP split work as expected. 643 - */ 644 - kick_all_cpus_sync(); 645 - } 646 - 647 - /* 648 - * A linux hugepage PMD was changed and the corresponding hash table entries 649 - * neesd to be flushed. 650 - */ 651 - void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, 652 - pmd_t *pmdp, unsigned long old_pmd) 653 - { 654 - int ssize; 655 - unsigned int psize; 656 - unsigned long vsid; 657 - unsigned long flags = 0; 658 - const struct cpumask *tmp; 659 - 660 - /* get the base page size,vsid and segment size */ 661 - #ifdef CONFIG_DEBUG_VM 662 - psize = get_slice_psize(mm, addr); 663 - BUG_ON(psize == MMU_PAGE_16M); 664 - #endif 665 - if (old_pmd & H_PAGE_COMBO) 666 - psize = MMU_PAGE_4K; 667 - else 668 - psize = MMU_PAGE_64K; 669 - 670 - if (!is_kernel_addr(addr)) { 671 - ssize = user_segment_size(addr); 672 - vsid = get_vsid(mm->context.id, addr, ssize); 673 - WARN_ON(vsid == 0); 674 - } else { 675 - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); 676 - ssize = mmu_kernel_ssize; 677 - } 678 - 679 - tmp = cpumask_of(smp_processor_id()); 680 - if (cpumask_equal(mm_cpumask(mm), tmp)) 681 - flags |= HPTE_LOCAL_UPDATE; 682 - 683 - return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); 684 - } 685 - 686 - static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) 687 - { 688 - return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); 689 - } 690 - 691 - pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) 692 - { 693 - unsigned long pmdv; 694 - 695 - pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; 696 - return pmd_set_protbits(__pmd(pmdv), pgprot); 697 - } 698 - 699 - pmd_t mk_pmd(struct page *page, pgprot_t pgprot) 700 - { 701 - return pfn_pmd(page_to_pfn(page), 
pgprot); 702 - } 703 - 704 - pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) 705 - { 706 - unsigned long pmdv; 707 - 708 - pmdv = pmd_val(pmd); 709 - pmdv &= _HPAGE_CHG_MASK; 710 - return pmd_set_protbits(__pmd(pmdv), newprot); 711 - } 712 - 713 - /* 714 - * This is called at the end of handling a user page fault, when the 715 - * fault has been handled by updating a HUGE PMD entry in the linux page tables. 716 - * We use it to preload an HPTE into the hash table corresponding to 717 - * the updated linux HUGE PMD entry. 718 - */ 719 - void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, 720 - pmd_t *pmd) 721 - { 722 - return; 723 - } 724 - 725 - pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, 726 - unsigned long addr, pmd_t *pmdp) 727 - { 728 - pmd_t old_pmd; 729 - pgtable_t pgtable; 730 - unsigned long old; 731 - pgtable_t *pgtable_slot; 732 - 733 - old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); 734 - old_pmd = __pmd(old); 735 - /* 736 - * We have pmd == none and we are holding page_table_lock. 737 - * So we can safely go and clear the pgtable hash 738 - * index info. 739 - */ 740 - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; 741 - pgtable = *pgtable_slot; 742 - /* 743 - * Let's zero out old valid and hash index details 744 - * hash fault look at them. 745 - */ 746 - memset(pgtable, 0, PTE_FRAG_SIZE); 747 - /* 748 - * Serialize against find_linux_pte_or_hugepte which does lock-less 749 - * lookup in page tables with local interrupts disabled. For huge pages 750 - * it casts pmd_t to pte_t. Since format of pte_t is different from 751 - * pmd_t we want to prevent transit from pmd pointing to page table 752 - * to pmd pointing to huge page (and back) while interrupts are disabled. 753 - * We clear pmd to possibly replace it with page table pointer in 754 - * different code paths. So make sure we wait for the parallel 755 - * find_linux_pte_or_hugepage to finish. 
756 - */ 757 - kick_all_cpus_sync(); 758 - return old_pmd; 759 - } 760 - 761 - int has_transparent_hugepage(void) 762 - { 763 - 764 - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) 765 - return 0; 766 - /* 767 - * We support THP only if PMD_SIZE is 16MB. 768 - */ 769 - if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) 770 - return 0; 771 - /* 772 - * We need to make sure that we support 16MB hugepage in a segement 773 - * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE 774 - * of 64K. 775 - */ 776 - /* 777 - * If we have 64K HPTE, we will be using that by default 778 - */ 779 - if (mmu_psize_defs[MMU_PAGE_64K].shift && 780 - (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) 781 - return 0; 782 - /* 783 - * Ok we only have 4K HPTE 784 - */ 785 - if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) 786 - return 0; 787 - 788 - return 1; 789 - } 790 - #endif /* CONFIG_TRANSPARENT_HUGEPAGE */