[PATCH] Four level pagetables for ppc64

Implement 4-level pagetables for ppc64

This patch implements full four-level page tables for ppc64, thereby
extending the usable user address range to 44 bits (16T).

The patch uses a full page for the tables at the bottom and top level,
and a quarter page for the intermediate levels. It uses full 64-bit
pointers at every level, thus also increasing the addressable range of
physical memory. This patch also tweaks the VSID allocation to allow
matching range for user addresses (this halves the number of available
contexts) and adds some #if and BUILD_BUG sanity checks.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>

Authored by David Gibson, committed by Paul Mackerras · e28f7faf decd300b
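For reference, the 44-bit (16T) figure quoted above follows directly from the page-table index sizes this patch introduces in include/asm-ppc64/pgtable.h. A minimal sketch of the arithmetic, assuming the usual 4K base page (PAGE_SHIFT = 12); the values are the ones defined by the patch, the sketch itself is illustrative only:

	/* Illustrative only: index sizes as defined by this patch, 4K base pages. */
	#define PAGE_SHIFT	12
	#define PTE_INDEX_SIZE	9	/* bottom level: 512 entries * 8 bytes = full page    */
	#define PMD_INDEX_SIZE	7	/* intermediate: 128 entries * 8 bytes = quarter page */
	#define PUD_INDEX_SIZE	7	/* intermediate: 128 entries * 8 bytes = quarter page */
	#define PGD_INDEX_SIZE	9	/* top level:    512 entries * 8 bytes = full page    */

	/* 12 + 9 + 7 + 7 + 9 = 44 address bits, i.e. 1UL << 44 = 16TB of range. */
	#define PGTABLE_EADDR_SIZE	(PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
					 PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
	#define PGTABLE_RANGE		(1UL << PGTABLE_EADDR_SIZE)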

+305 -289
+1 -1
arch/ppc64/mm/hash_utils.c
···
 	int local = 0;
 	cpumask_t tmp;

-	if ((ea & ~REGION_MASK) > EADDR_MASK)
+	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
 		return 1;

 	switch (REGION_ID(ea)) {
+69 -138
arch/ppc64/mm/hugetlbpage.c
···

 #include <linux/sysctl.h>

-#define HUGEPGDIR_SHIFT	(HPAGE_SHIFT + PAGE_SHIFT - 3)
-#define HUGEPGDIR_SIZE	(1UL << HUGEPGDIR_SHIFT)
-#define HUGEPGDIR_MASK	(~(HUGEPGDIR_SIZE-1))
-
-#define HUGEPTE_INDEX_SIZE	9
-#define HUGEPGD_INDEX_SIZE	10
-
-#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
-#define PTRS_PER_HUGEPGD	(1 << HUGEPGD_INDEX_SIZE)
-
-static inline int hugepgd_index(unsigned long addr)
+/* Modelled after find_linux_pte() */
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-	return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
-}
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt;

-static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
-{
-	int index;
-
-	if (! mm->context.huge_pgdir)
-		return NULL;
-
-
-	index = hugepgd_index(addr);
-	BUG_ON(index >= PTRS_PER_HUGEPGD);
-	return (pud_t *)(mm->context.huge_pgdir + index);
-}
-
-static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr)
-{
-	int index;
-
-	if (pud_none(*dir))
-		return NULL;
-
-	index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
-	return (pte_t *)pud_page(*dir) + index;
-}
-
-static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
-{
 	BUG_ON(! in_hugepage_area(mm->context, addr));

-	if (! mm->context.huge_pgdir) {
-		pgd_t *new;
-		spin_unlock(&mm->page_table_lock);
-		/* Don't use pgd_alloc(), because we want __GFP_REPEAT */
-		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-		spin_lock(&mm->page_table_lock);
+	addr &= HPAGE_MASK;

-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (mm->context.huge_pgdir)
-			pgd_free(new);
-		else
-			mm->context.huge_pgdir = new;
-	}
-	return hugepgd_offset(mm, addr);
-}
-
-static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr)
-{
-	if (! pud_present(*dir)) {
-		pte_t *new;
-
-		spin_unlock(&mm->page_table_lock);
-		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-		spin_lock(&mm->page_table_lock);
-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (pud_present(*dir)) {
-			if (new)
-				kmem_cache_free(zero_cache, new);
-		} else {
-			struct page *ptepage;
-
-			if (! new)
-				return NULL;
-			ptepage = virt_to_page(new);
-			ptepage->mapping = (void *) mm;
-			ptepage->index = addr & HUGEPGDIR_MASK;
-			pud_populate(mm, dir, new);
+	pg = pgd_offset(mm, addr);
+	if (!pgd_none(*pg)) {
+		pu = pud_offset(pg, addr);
+		if (!pud_none(*pu)) {
+			pm = pmd_offset(pu, addr);
+			pt = (pte_t *)pm;
+			BUG_ON(!pmd_none(*pm)
+			       && !(pte_present(*pt) && pte_huge(*pt)));
+			return pt;
 		}
 	}

-	return hugepte_offset(dir, addr);
-}
-
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-	pud_t *pud;
-
-	BUG_ON(! in_hugepage_area(mm->context, addr));
-
-	pud = hugepgd_offset(mm, addr);
-	if (! pud)
-		return NULL;
-
-	return hugepte_offset(pud, addr);
+	return NULL;
 }

 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
-	pud_t *pud;
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt;

 	BUG_ON(! in_hugepage_area(mm->context, addr));

-	pud = hugepgd_alloc(mm, addr);
-	if (! pud)
-		return NULL;
+	addr &= HPAGE_MASK;

-	return hugepte_alloc(mm, pud, addr);
+	pg = pgd_offset(mm, addr);
+	pu = pud_alloc(mm, pg, addr);
+
+	if (pu) {
+		pm = pmd_alloc(mm, pu, addr);
+		if (pm) {
+			pt = (pte_t *)pm;
+			BUG_ON(!pmd_none(*pm)
+			       && !(pte_present(*pt) && pte_huge(*pt)));
+			return pt;
+		}
+	}
+
+	return NULL;
+}
+
+#define HUGEPTE_BATCH_SIZE	(HPAGE_SIZE / PMD_SIZE)
+
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t pte)
+{
+	int i;
+
+	if (pte_present(*ptep)) {
+		pte_clear(mm, addr, ptep);
+		flush_tlb_pending();
+	}
+
+	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
+		*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+		ptep++;
+	}
+}
+
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep)
+{
+	unsigned long old = pte_update(ptep, ~0UL);
+	int i;
+
+	if (old & _PAGE_HASHPTE)
+		hpte_update(mm, addr, old, 0);
+
+	for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
+		ptep[i] = __pte(0);
+
+	return __pte(old);
 }

 /*
···
 	} else {
 		return htlb_get_high_area(len);
 	}
-}
-
-void hugetlb_mm_free_pgd(struct mm_struct *mm)
-{
-	int i;
-	pgd_t *pgdir;
-
-	spin_lock(&mm->page_table_lock);
-
-	pgdir = mm->context.huge_pgdir;
-	if (! pgdir)
-		goto out;
-
-	mm->context.huge_pgdir = NULL;
-
-	/* cleanup any hugepte pages leftover */
-	for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
-		pud_t *pud = (pud_t *)(pgdir + i);
-
-		if (! pud_none(*pud)) {
-			pte_t *pte = (pte_t *)pud_page(*pud);
-			struct page *ptepage = virt_to_page(pte);
-
-			ptepage->mapping = NULL;
-
-			BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
-			kmem_cache_free(zero_cache, pte);
-		}
-		pud_clear(pud);
-	}
-
-	BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
-	kmem_cache_free(zero_cache, pgdir);
-
-out:
-	spin_unlock(&mm->page_table_lock);
 }

 int hash_huge_page(struct mm_struct *mm, unsigned long access,
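A side note on the new hugepage helpers above: huge PTEs now live at the pmd level, and because a ppc64 huge page covers more address space than one pmd entry maps, set_huge_pte_at() and huge_ptep_get_and_clear() replicate or clear the entry across HUGEPTE_BATCH_SIZE consecutive slots. A rough sketch of that batch size, assuming the usual 16MB ppc64 huge page (HPAGE_SHIFT = 24):

	/* Illustrative only: HPAGE_SHIFT = 24 (16MB huge pages) is assumed here. */
	#define PAGE_SHIFT		12
	#define PTE_INDEX_SIZE		9
	#define PMD_SHIFT		(PAGE_SHIFT + PTE_INDEX_SIZE)	/* 21: one pmd entry maps 2MB */
	#define HPAGE_SHIFT		24				/* one huge page is 16MB */
	#define PMD_SIZE		(1UL << PMD_SHIFT)
	#define HPAGE_SIZE		(1UL << HPAGE_SHIFT)

	/* 16MB / 2MB = 8 pmd entries written or cleared per huge page. */
	#define HUGEPTE_BATCH_SIZE	(HPAGE_SIZE / PMD_SIZE)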
+1 -1
arch/ppc64/mm/imalloc.c
···
 			break;
 		if ((unsigned long)tmp->addr >= ioremap_bot)
 			addr = tmp->size + (unsigned long) tmp->addr;
-		if (addr > IMALLOC_END-size)
+		if (addr >= IMALLOC_END-size)
 			return 1;
 	}
 	*im_addr = addr;
+41 -21
arch/ppc64/mm/init.c
···
 #include <asm/vdso.h>
 #include <asm/imalloc.h>

+#if PGTABLE_RANGE > USER_VSID_RANGE
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+
+#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
+#warning TASK_SIZE is smaller than it needs to be.
+#endif
+
 int mem_init_done;
 unsigned long ioremap_bot = IMALLOC_BASE;
 static unsigned long phbs_io_bot = PHBS_IO_BASE;
···
 	 * Before that, we map using addresses going
 	 * up from ioremap_bot.  imalloc will use
 	 * the addresses from ioremap_bot through
-	 * IMALLOC_END (0xE000001fffffffff)
+	 * IMALLOC_END
 	 *
 	 */
 	pa = addr & PAGE_MASK;
···
 	int index;
 	int err;

-#ifdef CONFIG_HUGETLB_PAGE
-	/* We leave htlb_segs as it was, but for a fork, we need to
-	 * clear the huge_pgdir. */
-	mm->context.huge_pgdir = NULL;
-#endif
-
 again:
 	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
 		return -ENOMEM;
···
 	spin_unlock(&mmu_context_lock);

 	mm->context.id = NO_CONTEXT;
-
-	hugetlb_mm_free_pgd(mm);
 }

 /*
···
 	return virt_addr;
 }

-kmem_cache_t *zero_cache;
-
-static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
 {
-	memset(pte, 0, PAGE_SIZE);
+	memset(addr, 0, kmem_cache_size(cache));
 }
+
+static const int pgtable_cache_size[2] = {
+	PTE_TABLE_SIZE, PMD_TABLE_SIZE
+};
+static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
+	"pgd_pte_cache", "pud_pmd_cache",
+};
+
+kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];

 void pgtable_cache_init(void)
 {
-	zero_cache = kmem_cache_create("zero",
-				       PAGE_SIZE,
-				       0,
-				       SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
-				       zero_ctor,
-				       NULL);
-	if (!zero_cache)
-		panic("pgtable_cache_init(): could not create zero_cache!\n");
+	int i;
+
+	BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
+	BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
+	BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
+	BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
+
+	for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
+		int size = pgtable_cache_size[i];
+		const char *name = pgtable_cache_name[i];
+
+		pgtable_cache[i] = kmem_cache_create(name,
+						     size, size,
+						     SLAB_HWCACHE_ALIGN
+						     | SLAB_MUST_HWCACHE_ALIGN,
+						     zero_ctor,
+						     NULL);
+		if (! pgtable_cache[i])
+			panic("pgtable_cache_init(): could not create %s!\n",
+			      name);
+	}
 }

 pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
+1 -1
arch/ppc64/mm/slb_low.S
···
 0:	/* user address: proto-VSID = context<<15 | ESID */
 	li	r11,SLB_VSID_USER

-	srdi.	r9,r3,13
+	srdi.	r9,r3,USER_ESID_BITS
 	bne-	8f			/* invalid ea bits set */

 #ifdef CONFIG_HUGETLB_PAGE
+55 -40
arch/ppc64/mm/tlb.c
···
 DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
 unsigned long pte_freelist_forced_free;

-void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
+struct pte_freelist_batch
+{
+	struct rcu_head	rcu;
+	unsigned int	index;
+	pgtable_free_t	tables[0];
+};
+
+DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+unsigned long pte_freelist_forced_free;
+
+#define PTE_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
+	  / sizeof(pgtable_free_t))
+
+#ifdef CONFIG_SMP
+static void pte_free_smp_sync(void *arg)
+{
+	/* Do nothing, just ensure we sync with all CPUs */
+}
+#endif
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+static void pgtable_free_now(pgtable_free_t pgf)
+{
+	pte_freelist_forced_free++;
+
+	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+	pgtable_free(pgf);
+}
+
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+	struct pte_freelist_batch *batch =
+		container_of(head, struct pte_freelist_batch, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		pgtable_free(batch->tables[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void pte_free_submit(struct pte_freelist_batch *batch)
+{
+	INIT_RCU_HEAD(&batch->rcu);
+	call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
 {
 	/* This is safe as we are holding page_table_lock */
 	cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
···

 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
 	    cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
-		pte_free(ptepage);
+		pgtable_free(pgf);
 		return;
 	}

 	if (*batchp == NULL) {
 		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
 		if (*batchp == NULL) {
-			pte_free_now(ptepage);
+			pgtable_free_now(pgf);
 			return;
 		}
 		(*batchp)->index = 0;
 	}
-	(*batchp)->pages[(*batchp)->index++] = ptepage;
+	(*batchp)->tables[(*batchp)->index++] = pgf;
 	if ((*batchp)->index == PTE_FREELIST_SIZE) {
 		pte_free_submit(*batchp);
 		*batchp = NULL;
···
 	flush_hash_range(batch->context, i, local);
 	batch->index = 0;
 	put_cpu();
-}
-
-#ifdef CONFIG_SMP
-static void pte_free_smp_sync(void *arg)
-{
-	/* Do nothing, just ensure we sync with all CPUs */
-}
-#endif
-
-/* This is only called when we are critically out of memory
- * (and fail to get a page in pte_free_tlb).
- */
-void pte_free_now(struct page *ptepage)
-{
-	pte_freelist_forced_free++;
-
-	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
-
-	pte_free(ptepage);
-}
-
-static void pte_free_rcu_callback(struct rcu_head *head)
-{
-	struct pte_freelist_batch *batch =
-		container_of(head, struct pte_freelist_batch, rcu);
-	unsigned int i;
-
-	for (i = 0; i < batch->index; i++)
-		pte_free(batch->pages[i]);
-	free_page((unsigned long)batch);
-}
-
-void pte_free_submit(struct pte_freelist_batch *batch)
-{
-	INIT_RCU_HEAD(&batch->rcu);
-	call_rcu(&batch->rcu, pte_free_rcu_callback);
 }

 void pte_free_finish(void)
+1 -1
include/asm-ppc64/imalloc.h
···
  */
 #define PHBS_IO_BASE	VMALLOC_END
 #define IMALLOC_BASE	(PHBS_IO_BASE + 0x80000000ul)	/* Reserve 2 gigs for PHBs */
-#define IMALLOC_END	(VMALLOC_START + EADDR_MASK)
+#define IMALLOC_END	(VMALLOC_START + PGTABLE_RANGE)


 /* imalloc region types */
+4 -3
include/asm-ppc64/mmu.h
···
 #define VSID_BITS	36
 #define VSID_MODULUS	((1UL<<VSID_BITS)-1)

-#define CONTEXT_BITS	20
-#define USER_ESID_BITS	15
+#define CONTEXT_BITS	19
+#define USER_ESID_BITS	16
+
+#define USER_VSID_RANGE	(1UL << (USER_ESID_BITS + SID_SHIFT))

 /*
  * This macro generates asm code to compute the VSID scramble
···
 typedef struct {
 	mm_context_id_t id;
 #ifdef CONFIG_HUGETLB_PAGE
-	pgd_t *huge_pgdir;
 	u16 htlb_segs; /* bitmask */
 #endif
 } mm_context_t;
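This is where the halving of available contexts mentioned in the description comes from: one bit moves from the context field into the ESID field, so that the segment-addressable user range matches the new 16TB pagetable range. A rough sketch of the arithmetic, assuming 256MB segments (SID_SHIFT = 28):

	/* Illustrative only: SID_SHIFT = 28 (256MB segments) is assumed here.
	 *
	 * Before: CONTEXT_BITS = 20, USER_ESID_BITS = 15
	 *         -> 2^20 contexts, user ESID range 1UL << (15 + 28) = 8TB
	 * After:  CONTEXT_BITS = 19, USER_ESID_BITS = 16
	 *         -> 2^19 contexts (halved), user ESID range 1UL << (16 + 28) = 16TB
	 *
	 * Either way the proto-VSID (context << USER_ESID_BITS | ESID) stays
	 * at 35 bits, inside the 36-bit VSID space shown above.
	 */
	#define SID_SHIFT	28
	#define CONTEXT_BITS	19
	#define USER_ESID_BITS	16
	#define USER_VSID_RANGE	(1UL << (USER_ESID_BITS + SID_SHIFT))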
+15 -11
include/asm-ppc64/page.h
···

 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
+#define ARCH_HAS_SETCLEAR_HUGE_PTE

 #define touches_hugepage_low_range(mm, addr, len) \
 	(LOW_ESID_MASK((addr), (len)) & mm->context.htlb_segs)
···
  * Entries in the pte table are 64b, while entries in the pgd & pmd are 32b.
  */
 typedef struct { unsigned long pte; } pte_t;
-typedef struct { unsigned int pmd; } pmd_t;
-typedef struct { unsigned int pgd; } pgd_t;
+typedef struct { unsigned long pmd; } pmd_t;
+typedef struct { unsigned long pud; } pud_t;
+typedef struct { unsigned long pgd; } pgd_t;
 typedef struct { unsigned long pgprot; } pgprot_t;

 #define pte_val(x)	((x).pte)
 #define pmd_val(x)	((x).pmd)
+#define pud_val(x)	((x).pud)
 #define pgd_val(x)	((x).pgd)
 #define pgprot_val(x)	((x).pgprot)

-#define __pte(x)	((pte_t) { (x) } )
-#define __pmd(x)	((pmd_t) { (x) } )
-#define __pgd(x)	((pgd_t) { (x) } )
-#define __pgprot(x)	((pgprot_t) { (x) } )
+#define __pte(x)	((pte_t) { (x) })
+#define __pmd(x)	((pmd_t) { (x) })
+#define __pud(x)	((pud_t) { (x) })
+#define __pgd(x)	((pgd_t) { (x) })
+#define __pgprot(x)	((pgprot_t) { (x) })

 #else
 /*
  * .. while these make it easier on the compiler
  */
 typedef unsigned long pte_t;
-typedef unsigned int pmd_t;
-typedef unsigned int pgd_t;
+typedef unsigned long pmd_t;
+typedef unsigned long pud_t;
+typedef unsigned long pgd_t;
 typedef unsigned long pgprot_t;

 #define pte_val(x)	(x)
 #define pmd_val(x)	(x)
+#define pud_val(x)	(x)
 #define pgd_val(x)	(x)
 #define pgprot_val(x)	(x)

 #define __pte(x)	(x)
 #define __pmd(x)	(x)
+#define __pud(x)	(x)
 #define __pgd(x)	(x)
 #define __pgprot(x)	(x)

···
 #define KERNEL_REGION_ID	(KERNELBASE >> REGION_SHIFT)
 #define USER_REGION_ID		(0UL)
 #define REGION_ID(ea)		(((unsigned long)(ea)) >> REGION_SHIFT)
-
-#define __bpn_to_ba(x) ((((unsigned long)(x)) << PAGE_SHIFT) + KERNELBASE)
-#define __ba_to_bpn(x) ((((unsigned long)(x)) & ~REGION_MASK) >> PAGE_SHIFT)

 #define __va(x) ((void *)((unsigned long)(x) + KERNELBASE))

+61 -32
include/asm-ppc64/pgalloc.h
···
 #include <linux/cpumask.h>
 #include <linux/percpu.h>

-extern kmem_cache_t *zero_cache;
+extern kmem_cache_t *pgtable_cache[];
+
+#define PTE_CACHE_NUM	0
+#define PMD_CACHE_NUM	1
+#define PUD_CACHE_NUM	1
+#define PGD_CACHE_NUM	0

 /*
  * This program is free software; you can redistribute it and/or
···
  * 2 of the License, or (at your option) any later version.
  */

-static inline pgd_t *
-pgd_alloc(struct mm_struct *mm)
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	return kmem_cache_alloc(zero_cache, GFP_KERNEL);
+	return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL);
 }

-static inline void
-pgd_free(pgd_t *pgd)
+static inline void pgd_free(pgd_t *pgd)
 {
-	kmem_cache_free(zero_cache, pgd);
+	kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd);
+}
+
+#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, PUD)
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM],
+				GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pud_free(pud_t *pud)
+{
+	kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud);
 }

 #define pud_populate(MM, PUD, PMD)	pud_set(PUD, PMD)

-static inline pmd_t *
-pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
+	return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM],
+				GFP_KERNEL|__GFP_REPEAT);
 }

-static inline void
-pmd_free(pmd_t *pmd)
+static inline void pmd_free(pmd_t *pmd)
 {
-	kmem_cache_free(zero_cache, pmd);
+	kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd);
 }

 #define pmd_populate_kernel(mm, pmd, pte) pmd_set(pmd, pte)
···

 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
-	return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
+	return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM],
+				GFP_KERNEL|__GFP_REPEAT);
 }

 static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-	pte_t *pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
-	if (pte)
-		return virt_to_page(pte);
-	return NULL;
+	return virt_to_page(pte_alloc_one_kernel(mm, address));
 }

 static inline void pte_free_kernel(pte_t *pte)
 {
-	kmem_cache_free(zero_cache, pte);
+	kmem_cache_free(pgtable_cache[PTE_CACHE_NUM], pte);
 }

 static inline void pte_free(struct page *ptepage)
 {
-	kmem_cache_free(zero_cache, page_address(ptepage));
+	pte_free_kernel(page_address(ptepage));
 }

-struct pte_freelist_batch
+#define PGF_CACHENUM_MASK	0xf
+
+typedef struct pgtable_free {
+	unsigned long val;
+} pgtable_free_t;
+
+static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum,
+						unsigned long mask)
 {
-	struct rcu_head	rcu;
-	unsigned int	index;
-	struct page *	pages[0];
-};
+	BUG_ON(cachenum > PGF_CACHENUM_MASK);

-#define PTE_FREELIST_SIZE ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) / \
-			   sizeof(struct page *))
+	return (pgtable_free_t){.val = ((unsigned long) p & ~mask) | cachenum};
+}

-extern void pte_free_now(struct page *ptepage);
-extern void pte_free_submit(struct pte_freelist_batch *batch);
+static inline void pgtable_free(pgtable_free_t pgf)
+{
+	void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK);
+	int cachenum = pgf.val & PGF_CACHENUM_MASK;

-DECLARE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+	kmem_cache_free(pgtable_cache[cachenum], p);
+}

-void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage);
-#define __pmd_free_tlb(tlb, pmd)	__pte_free_tlb(tlb, virt_to_page(pmd))
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf);
+
+#define __pte_free_tlb(tlb, ptepage)	\
+	pgtable_free_tlb(tlb, pgtable_free_cache(page_address(ptepage), \
+		PTE_CACHE_NUM, PTE_TABLE_SIZE-1))
+#define __pmd_free_tlb(tlb, pmd)	\
+	pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \
+		PMD_CACHE_NUM, PMD_TABLE_SIZE-1))
+#define __pud_free_tlb(tlb, pmd)	\
+	pgtable_free_tlb(tlb, pgtable_free_cache(pud, \
+		PUD_CACHE_NUM, PUD_TABLE_SIZE-1))

 #define check_pgt_cache()	do { } while (0)

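The pgtable_free_t encoding above works because each table comes from a slab whose alignment equals its size, so the low bits of the table's address are guaranteed to be zero and can carry the owning cache's index until the deferred free runs. A small stand-alone illustration of that packing (not kernel code; the names mirror the patch and the 1KB value assumes the new pmd/pud table size):

	/* Illustrative only: the low-bit packing done by pgtable_free_cache()
	 * and pgtable_free(), demonstrated with a size-aligned allocation. */
	#include <assert.h>
	#include <stdlib.h>

	#define PGF_CACHENUM_MASK 0xf

	int main(void)
	{
		unsigned long mask = 1024 - 1;		/* PMD_TABLE_SIZE - 1 */
		unsigned long cachenum = 1;		/* PMD_CACHE_NUM */
		void *table = aligned_alloc(1024, 1024);	/* size-aligned, like the slab */
		unsigned long val;

		/* pgtable_free_cache(): pointer and cache number share one word */
		val = ((unsigned long)table & ~mask) | cachenum;

		/* pgtable_free(): both pieces are recovered again */
		assert((void *)(val & ~PGF_CACHENUM_MASK) == table);
		assert((val & PGF_CACHENUM_MASK) == cachenum);

		free(table);
		return 0;
	}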
+54 -38
include/asm-ppc64/pgtable.h
···
 #include <asm/tlbflush.h>
 #endif /* __ASSEMBLY__ */

-#include <asm-generic/pgtable-nopud.h>
-
 /*
  * Entries per page directory level.  The PTE level must use a 64b record
  * for each page table entry.  The PMD and PGD level use a 32b record for
  * each entry by assuming that each entry is page aligned.
  */
 #define PTE_INDEX_SIZE	9
-#define PMD_INDEX_SIZE	10
-#define PGD_INDEX_SIZE	10
+#define PMD_INDEX_SIZE	7
+#define PUD_INDEX_SIZE	7
+#define PGD_INDEX_SIZE	9
+
+#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
+#define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
+#define PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
+#define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)

 #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
 #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
+#define PTRS_PER_PUD	(1 << PMD_INDEX_SIZE)
 #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)

 /* PMD_SHIFT determines what a second-level page table entry can map */
···
 #define PMD_SIZE	(1UL << PMD_SHIFT)
 #define PMD_MASK	(~(PMD_SIZE-1))

-/* PGDIR_SHIFT determines what a third-level page table entry can map */
-#define PGDIR_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+/* PUD_SHIFT determines what a third-level page table entry can map */
+#define PUD_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+#define PUD_SIZE	(1UL << PUD_SHIFT)
+#define PUD_MASK	(~(PUD_SIZE-1))
+
+/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
+#define PGDIR_SHIFT	(PUD_SHIFT + PUD_INDEX_SIZE)
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))

···
 /*
  * Size of EA range mapped by our pagetables.
  */
-#define EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
-		    PGD_INDEX_SIZE + PAGE_SHIFT)
-#define EADDR_MASK ((1UL << EADDR_SIZE) - 1)
+#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
+			    PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
+#define PGTABLE_RANGE (1UL << PGTABLE_EADDR_SIZE)
+
+#if TASK_SIZE_USER64 > PGTABLE_RANGE
+#error TASK_SIZE_USER64 exceeds pagetable range
+#endif
+
+#if TASK_SIZE_USER64 > (1UL << (USER_ESID_BITS + SID_SHIFT))
+#error TASK_SIZE_USER64 exceeds user VSID range
+#endif

 /*
  * Define the address range of the vmalloc VM area.
  */
 #define VMALLOC_START (0xD000000000000000ul)
-#define VMALLOC_SIZE  (0x10000000000UL)
+#define VMALLOC_SIZE  (0x80000000000UL)
 #define VMALLOC_END   (VMALLOC_START + VMALLOC_SIZE)

 /*
···
 #ifndef __ASSEMBLY__
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 		   unsigned long ea, unsigned long vsid, int local);
-
-void hugetlb_mm_free_pgd(struct mm_struct *mm);
 #endif /* __ASSEMBLY__ */

 #define HAVE_ARCH_UNMAPPED_AREA
···
 #else

 #define hash_huge_page(mm,a,ea,vsid,local)	-1
-#define hugetlb_mm_free_pgd(mm)			do {} while (0)

 #endif

···
 #define pte_pfn(x)		((unsigned long)((pte_val(x) >> PTE_SHIFT)))
 #define pte_page(x)		pfn_to_page(pte_pfn(x))

-#define pmd_set(pmdp, ptep) 	\
-	(pmd_val(*(pmdp)) = __ba_to_bpn(ptep))
+#define pmd_set(pmdp, ptep) ({BUG_ON((u64)ptep < KERNELBASE); pmd_val(*(pmdp)) = (unsigned long)(ptep);})
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define pmd_bad(pmd)		(pmd_val(pmd) == 0)
 #define pmd_present(pmd)	(pmd_val(pmd) != 0)
 #define pmd_clear(pmdp)		(pmd_val(*(pmdp)) = 0)
-#define pmd_page_kernel(pmd)	(__bpn_to_ba(pmd_val(pmd)))
+#define pmd_page_kernel(pmd)	(pmd_val(pmd))
 #define pmd_page(pmd)		virt_to_page(pmd_page_kernel(pmd))

-#define pud_set(pudp, pmdp)	(pud_val(*(pudp)) = (__ba_to_bpn(pmdp)))
+#define pud_set(pudp, pmdp)	(pud_val(*(pudp)) = (unsigned long)(pmdp))
 #define pud_none(pud)		(!pud_val(pud))
-#define pud_bad(pud)		((pud_val(pud)) == 0UL)
-#define pud_present(pud)	(pud_val(pud) != 0UL)
-#define pud_clear(pudp)		(pud_val(*(pudp)) = 0UL)
-#define pud_page(pud)		(__bpn_to_ba(pud_val(pud)))
+#define pud_bad(pud)		((pud_val(pud)) == 0)
+#define pud_present(pud)	(pud_val(pud) != 0)
+#define pud_clear(pudp)		(pud_val(*(pudp)) = 0)
+#define pud_page(pud)		(pud_val(pud))
+
+#define pgd_set(pgdp, pudp)	({pgd_val(*(pgdp)) = (unsigned long)(pudp);})
+#define pgd_none(pgd)		(!pgd_val(pgd))
+#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
+#define pgd_present(pgd)	(pgd_val(pgd) != 0)
+#define pgd_clear(pgdp)		(pgd_val(*(pgdp)) = 0)
+#define pgd_page(pgd)		(pgd_val(pgd))

 /*
  * Find an entry in a page-table-directory.  We combine the address region
  * (the high order N bits) and the pgd portion of the address.
  */
 /* to avoid overflow in free_pgtables we don't use PTRS_PER_PGD here */
-#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x7ff)
+#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x1ff)

 #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))

-/* Find an entry in the second-level page table.. */
-#define pmd_offset(pudp,addr) \
-  ((pmd_t *) pud_page(*(pudp)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
+#define pud_offset(pgdp, addr)	\
+  (((pud_t *) pgd_page(*(pgdp))) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))

-/* Find an entry in the third-level page table.. */
+#define pmd_offset(pudp,addr) \
+  (((pmd_t *) pud_page(*(pudp))) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
+
 #define pte_offset_kernel(dir,addr) \
-  ((pte_t *) pmd_page_kernel(*(dir)) \
-  + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)))
+  (((pte_t *) pmd_page_kernel(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)))

 #define pte_offset_map(dir,addr)		pte_offset_kernel((dir), (addr))
 #define pte_offset_map_nested(dir,addr)		pte_offset_kernel((dir), (addr))
···
 #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)

 #define pmd_ERROR(e) \
-	printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e))
+	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
+#define pud_ERROR(e) \
+	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e))
 #define pgd_ERROR(e) \
-	printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e))
+	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))

 extern pgd_t swapper_pg_dir[];

 extern void paging_init(void);

-/*
- * Because the huge pgtables are only 2 level, they can take
- * at most around 4M, much less than one hugepage which the
- * process is presumably entitled to use.  So we don't bother
- * freeing up the pagetables on unmap, and wait until
- * destroy_context() to clean up the lot.
- */
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
-	do { } while (0)
+	free_pgd_range(tlb, addr, end, floor, ceiling)

 /*
  * This gets called at the end of handling a page fault, when
+2 -2
include/asm-ppc64/processor.h
···
 extern struct task_struct *last_task_used_math;
 extern struct task_struct *last_task_used_altivec;

-/* 64-bit user address space is 41-bits (2TBs user VM) */
-#define TASK_SIZE_USER64 (0x0000020000000000UL)
+/* 64-bit user address space is 44-bits (16TB user VM) */
+#define TASK_SIZE_USER64 (0x0000100000000000UL)

 /*
  * 32-bit user address space is 4GB - 1 page