Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/mm: Cleanup management of kmem_caches for pagetables

Currently we have a fair bit of rather fiddly code to manage the
various kmem_caches used to store page tables of various levels. We
generally have two caches holding some combination of PGD, PUD and PMD
tables, plus several more for the special hugepage pagetables.

This patch cleans this all up by taking a different approach. Rather
than the caches being designated as for PUDs or for hugeptes for 16M
pages, the caches are simply allocated to be a specific size. Thus
sharing of caches between different types/levels of pagetables happens
naturally. The pagetable size, where needed, is passed around encoded
in the same way as {PGD,PUD,PMD}_INDEX_SIZE; that is n where the
pagetable contains 2^n pointers.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Authored by David Gibson; committed by Benjamin Herrenschmidt.
a0668cdc f71dc176

+124 -121
+5 -5
arch/powerpc/include/asm/pgalloc-32.h
··· 3 3 4 4 #include <linux/threads.h> 5 5 6 - #define PTE_NONCACHE_NUM 0 /* dummy for now to share code w/ppc64 */ 6 + /* For 32-bit, all levels of page tables are just drawn from get_free_page() */ 7 + #define MAX_PGTABLE_INDEX_SIZE 0 7 8 8 9 extern void __bad_pte(pmd_t *pmd); 9 10 ··· 37 36 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); 38 37 extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); 39 38 40 - static inline void pgtable_free(pgtable_free_t pgf) 39 + static inline void pgtable_free(void *table, unsigned index_size) 41 40 { 42 - void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK); 43 - 44 - free_page((unsigned long)p); 41 + BUG_ON(index_size); /* 32-bit doesn't use this */ 42 + free_page((unsigned long)table); 45 43 } 46 44 47 45 #define check_pgt_cache() do { } while (0)
+35 -25
arch/powerpc/include/asm/pgalloc-64.h
··· 11 11 #include <linux/cpumask.h> 12 12 #include <linux/percpu.h> 13 13 14 + /* 15 + * Functions that deal with pagetables that could be at any level of 16 + * the table need to be passed an "index_size" so they know how to 17 + * handle allocation. For PTE pages (which are linked to a struct 18 + * page for now, and drawn from the main get_free_pages() pool), the 19 + * allocation size will be (2^index_size * sizeof(pointer)) and 20 + * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). 21 + * 22 + * The maximum index size needs to be big enough to allow any 23 + * pagetable sizes we need, but small enough to fit in the low bits of 24 + * any page table pointer. In other words all pagetables, even tiny 25 + * ones, must be aligned to allow at least enough low 0 bits to 26 + * contain this value. This value is also used as a mask, so it must 27 + * be one less than a power of two. 28 + */ 29 + #define MAX_PGTABLE_INDEX_SIZE 0xf 30 + 14 31 #ifndef CONFIG_PPC_SUBPAGE_PROT 15 32 static inline void subpage_prot_free(pgd_t *pgd) {} 16 33 #endif 17 34 18 35 extern struct kmem_cache *pgtable_cache[]; 19 - 20 - #define PGD_CACHE_NUM 0 21 - #define PUD_CACHE_NUM 1 22 - #define PMD_CACHE_NUM 1 23 - #define HUGEPTE_CACHE_NUM 2 24 - #define PTE_NONCACHE_NUM 7 /* from GFP rather than kmem_cache */ 36 + #define PGT_CACHE(shift) (pgtable_cache[(shift)-1]) 25 37 26 38 static inline pgd_t *pgd_alloc(struct mm_struct *mm) 27 39 { 28 - return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL); 40 + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); 29 41 } 30 42 31 43 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) 32 44 { 33 45 subpage_prot_free(pgd); 34 - kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd); 46 + kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); 35 47 } 36 48 37 49 #ifndef CONFIG_PPC_64K_PAGES ··· 52 40 53 41 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) 54 42 { 55 - return 
kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM], 43 + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), 56 44 GFP_KERNEL|__GFP_REPEAT); 57 45 } 58 46 59 47 static inline void pud_free(struct mm_struct *mm, pud_t *pud) 60 48 { 61 - kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud); 49 + kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); 62 50 } 63 51 64 52 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) ··· 90 78 91 79 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) 92 80 { 93 - return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM], 81 + return kmem_cache_alloc(PGT_CACHE(PMD_INDEX_SIZE), 94 82 GFP_KERNEL|__GFP_REPEAT); 95 83 } 96 84 97 85 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) 98 86 { 99 - kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd); 87 + kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd); 100 88 } 101 89 102 90 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, ··· 119 107 return page; 120 108 } 121 109 122 - static inline void pgtable_free(pgtable_free_t pgf) 110 + static inline void pgtable_free(void *table, unsigned index_size) 123 111 { 124 - void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK); 125 - int cachenum = pgf.val & PGF_CACHENUM_MASK; 126 - 127 - if (cachenum == PTE_NONCACHE_NUM) 128 - free_page((unsigned long)p); 129 - else 130 - kmem_cache_free(pgtable_cache[cachenum], p); 112 + if (!index_size) 113 + free_page((unsigned long)table); 114 + else { 115 + BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE); 116 + kmem_cache_free(PGT_CACHE(index_size), table); 117 + } 131 118 } 132 119 133 - #define __pmd_free_tlb(tlb, pmd,addr) \ 134 - pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \ 135 - PMD_CACHE_NUM, PMD_TABLE_SIZE-1)) 120 + #define __pmd_free_tlb(tlb, pmd, addr) \ 121 + pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE) 136 122 #ifndef CONFIG_PPC_64K_PAGES 137 123 #define __pud_free_tlb(tlb, pud, addr) \ 138 - pgtable_free_tlb(tlb, pgtable_free_cache(pud, \ 139 - 
PUD_CACHE_NUM, PUD_TABLE_SIZE-1)) 124 + pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) 125 + 140 126 #endif /* CONFIG_PPC_64K_PAGES */ 141 127 142 128 #define check_pgt_cache() do { } while (0)
+4 -26
arch/powerpc/include/asm/pgalloc.h
··· 24 24 __free_page(ptepage); 25 25 } 26 26 27 - typedef struct pgtable_free { 28 - unsigned long val; 29 - } pgtable_free_t; 30 - 31 - /* This needs to be big enough to allow for MMU_PAGE_COUNT + 2 to be stored 32 - * and small enough to fit in the low bits of any naturally aligned page 33 - * table cache entry. Arbitrarily set to 0x1f, that should give us some 34 - * room to grow 35 - */ 36 - #define PGF_CACHENUM_MASK 0x1f 37 - 38 - static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum, 39 - unsigned long mask) 40 - { 41 - BUG_ON(cachenum > PGF_CACHENUM_MASK); 42 - 43 - return (pgtable_free_t){.val = ((unsigned long) p & ~mask) | cachenum}; 44 - } 45 - 46 27 #ifdef CONFIG_PPC64 47 28 #include <asm/pgalloc-64.h> 48 29 #else ··· 31 50 #endif 32 51 33 52 #ifdef CONFIG_SMP 34 - extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf); 53 + extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift); 35 54 extern void pte_free_finish(void); 36 55 #else /* CONFIG_SMP */ 37 - static inline void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) 56 + static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift) 38 57 { 39 - pgtable_free(pgf); 58 + pgtable_free(table, shift); 40 59 } 41 60 static inline void pte_free_finish(void) { } 42 61 #endif /* !CONFIG_SMP */ ··· 44 63 static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage, 45 64 unsigned long address) 46 65 { 47 - pgtable_free_t pgf = pgtable_free_cache(page_address(ptepage), 48 - PTE_NONCACHE_NUM, 49 - PTE_TABLE_SIZE-1); 50 66 tlb_flush_pgtable(tlb, address); 51 67 pgtable_page_dtor(ptepage); 52 - pgtable_free_tlb(tlb, pgf); 68 + pgtable_free_tlb(tlb, page_address(ptepage), 0); 53 69 } 54 70 55 71 #endif /* __KERNEL__ */
+1
arch/powerpc/include/asm/pgtable-ppc64.h
··· 354 354 #define pgoff_to_pte(off) ((pte_t) {((off) << PTE_RPN_SHIFT)|_PAGE_FILE}) 355 355 #define PTE_FILE_MAX_BITS (BITS_PER_LONG - PTE_RPN_SHIFT) 356 356 357 + void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); 357 358 void pgtable_cache_init(void); 358 359 359 360 /*
+15 -36
arch/powerpc/mm/hugetlbpage.c
··· 43 43 unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ 44 44 45 45 #define hugepte_shift mmu_huge_psizes 46 - #define PTRS_PER_HUGEPTE(psize) (1 << hugepte_shift[psize]) 47 - #define HUGEPTE_TABLE_SIZE(psize) (sizeof(pte_t) << hugepte_shift[psize]) 46 + #define HUGEPTE_INDEX_SIZE(psize) (mmu_huge_psizes[(psize)]) 47 + #define PTRS_PER_HUGEPTE(psize) (1 << mmu_huge_psizes[psize]) 48 48 49 49 #define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \ 50 - + hugepte_shift[psize]) 50 + + HUGEPTE_INDEX_SIZE(psize)) 51 51 #define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize)) 52 52 #define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1)) 53 - 54 - /* Subtract one from array size because we don't need a cache for 4K since 55 - * is not a huge page size */ 56 - #define HUGE_PGTABLE_INDEX(psize) (HUGEPTE_CACHE_NUM + psize - 1) 57 - #define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize]) 58 - 59 - static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = { 60 - [MMU_PAGE_64K] = "hugepte_cache_64K", 61 - [MMU_PAGE_1M] = "hugepte_cache_1M", 62 - [MMU_PAGE_16M] = "hugepte_cache_16M", 63 - [MMU_PAGE_16G] = "hugepte_cache_16G", 64 - }; 65 53 66 54 /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() 67 55 * will choke on pointers to hugepte tables, which is handy for ··· 102 114 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, 103 115 unsigned long address, unsigned int psize) 104 116 { 105 - pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], 106 - GFP_KERNEL|__GFP_REPEAT); 117 + pte_t *new = kmem_cache_zalloc(PGT_CACHE(hugepte_shift[psize]), 118 + GFP_KERNEL|__GFP_REPEAT); 107 119 108 120 if (! 
new) 109 121 return -ENOMEM; 110 122 111 123 spin_lock(&mm->page_table_lock); 112 124 if (!hugepd_none(*hpdp)) 113 - kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new); 125 + kmem_cache_free(PGT_CACHE(hugepte_shift[psize]), new); 114 126 else 115 127 hpdp->pd = (unsigned long)new | HUGEPD_OK; 116 128 spin_unlock(&mm->page_table_lock); ··· 259 271 260 272 hpdp->pd = 0; 261 273 tlb->need_flush = 1; 262 - pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, 263 - HUGEPTE_CACHE_NUM+psize-1, 264 - PGF_CACHENUM_MASK)); 274 + pgtable_free_tlb(tlb, hugepte, hugepte_shift[psize]); 265 275 } 266 276 267 277 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, ··· 684 698 if (mmu_huge_psizes[psize] || 685 699 mmu_psize_defs[psize].shift == PAGE_SHIFT) 686 700 return; 687 - if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL)) 688 - return; 689 701 hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); 690 702 691 703 switch (mmu_psize_defs[psize].shift) { ··· 737 753 if (!cpu_has_feature(CPU_FTR_16M_PAGE)) 738 754 return -ENODEV; 739 755 740 - /* Add supported huge page sizes. Need to change HUGE_MAX_HSTATE 741 - * and adjust PTE_NONCACHE_NUM if the number of supported huge page 742 - * sizes changes. 756 + /* Add supported huge page sizes. Need to change 757 + * HUGE_MAX_HSTATE if the number of supported huge page sizes 758 + * changes. 
743 759 */ 744 760 set_huge_psize(MMU_PAGE_16M); 745 761 set_huge_psize(MMU_PAGE_16G); ··· 753 769 754 770 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { 755 771 if (mmu_huge_psizes[psize]) { 756 - pgtable_cache[HUGE_PGTABLE_INDEX(psize)] = 757 - kmem_cache_create( 758 - HUGEPTE_CACHE_NAME(psize), 759 - HUGEPTE_TABLE_SIZE(psize), 760 - HUGEPTE_TABLE_SIZE(psize), 761 - 0, 762 - NULL); 763 - if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)]) 764 - panic("hugetlbpage_init(): could not create %s"\ 765 - "\n", HUGEPTE_CACHE_NAME(psize)); 772 + pgtable_cache_add(hugepte_shift[psize], NULL); 773 + if (!PGT_CACHE(hugepte_shift[psize])) 774 + panic("hugetlbpage_init(): could not create " 775 + "pgtable cache for %d bit pagesize\n", 776 + mmu_psize_to_shift(psize)); 766 777 } 767 778 } 768 779
+48 -20
arch/powerpc/mm/init_64.c
··· 119 119 memset(addr, 0, PMD_TABLE_SIZE); 120 120 } 121 121 122 - static const unsigned int pgtable_cache_size[2] = { 123 - PGD_TABLE_SIZE, PMD_TABLE_SIZE 124 - }; 125 - static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { 126 - #ifdef CONFIG_PPC_64K_PAGES 127 - "pgd_cache", "pmd_cache", 128 - #else 129 - "pgd_cache", "pud_pmd_cache", 130 - #endif /* CONFIG_PPC_64K_PAGES */ 131 - }; 122 + struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; 132 123 133 - #ifdef CONFIG_HUGETLB_PAGE 134 - /* Hugepages need an extra cache per hugepagesize, initialized in 135 - * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT 136 - * is not compile time constant. */ 137 - struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT]; 138 - #else 139 - struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; 140 - #endif 124 + /* 125 + * Create a kmem_cache() for pagetables. This is not used for PTE 126 + * pages - they're linked to struct page, come from the normal free 127 + * pages pool and have a different entry size (see real_pte_t) to 128 + * everything else. Caches created by this function are used for all 129 + * the higher level pagetables, and for hugepage pagetables. 130 + */ 131 + void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) 132 + { 133 + char *name; 134 + unsigned long table_size = sizeof(void *) << shift; 135 + unsigned long align = table_size; 136 + 137 + /* When batching pgtable pointers for RCU freeing, we store 138 + * the index size in the low bits. Table alignment must be 139 + * big enough to fit it */ 140 + unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1; 141 + struct kmem_cache *new; 142 + 143 + /* It would be nice if this was a BUILD_BUG_ON(), but at the 144 + * moment, gcc doesn't seem to recognize is_power_of_2 as a 145 + * constant expression, so so much for that. 
*/ 146 + BUG_ON(!is_power_of_2(minalign)); 147 + BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE)); 148 + 149 + if (PGT_CACHE(shift)) 150 + return; /* Already have a cache of this size */ 151 + 152 + align = max_t(unsigned long, align, minalign); 153 + name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift); 154 + new = kmem_cache_create(name, table_size, align, 0, ctor); 155 + PGT_CACHE(shift) = new; 156 + 157 + pr_debug("Allocated pgtable cache for order %d\n", shift); 158 + } 159 + 141 160 142 161 void pgtable_cache_init(void) 143 162 { 144 - pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); 145 - pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); 163 + pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor); 164 + pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor); 165 + if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE)) 166 + panic("Couldn't allocate pgtable caches"); 167 + 168 + /* In all current configs, when the PUD index exists it's the 169 + * same size as either the pgd or pmd index. Verify that the 170 + * initialization above has also created a PUD cache. This 171 + * will need re-examiniation if we add new possibilities for 172 + * the pagetable layout. */ 173 + BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE)); 146 174 } 147 175 148 176 #ifdef CONFIG_SPARSEMEM_VMEMMAP
+16 -9
arch/powerpc/mm/pgtable.c
··· 49 49 { 50 50 struct rcu_head rcu; 51 51 unsigned int index; 52 - pgtable_free_t tables[0]; 52 + unsigned long tables[0]; 53 53 }; 54 54 55 55 #define PTE_FREELIST_SIZE \ 56 56 ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ 57 - / sizeof(pgtable_free_t)) 57 + / sizeof(unsigned long)) 58 58 59 59 static void pte_free_smp_sync(void *arg) 60 60 { ··· 64 64 /* This is only called when we are critically out of memory 65 65 * (and fail to get a page in pte_free_tlb). 66 66 */ 67 - static void pgtable_free_now(pgtable_free_t pgf) 67 + static void pgtable_free_now(void *table, unsigned shift) 68 68 { 69 69 pte_freelist_forced_free++; 70 70 71 71 smp_call_function(pte_free_smp_sync, NULL, 1); 72 72 73 - pgtable_free(pgf); 73 + pgtable_free(table, shift); 74 74 } 75 75 76 76 static void pte_free_rcu_callback(struct rcu_head *head) ··· 79 79 container_of(head, struct pte_freelist_batch, rcu); 80 80 unsigned int i; 81 81 82 - for (i = 0; i < batch->index; i++) 83 - pgtable_free(batch->tables[i]); 82 + for (i = 0; i < batch->index; i++) { 83 + void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE); 84 + unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE; 85 + 86 + pgtable_free(table, shift); 87 + } 84 88 85 89 free_page((unsigned long)batch); 86 90 } ··· 95 91 call_rcu(&batch->rcu, pte_free_rcu_callback); 96 92 } 97 93 98 - void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) 94 + void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift) 99 95 { 100 96 /* This is safe since tlb_gather_mmu has disabled preemption */ 101 97 struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); 98 + unsigned long pgf; 102 99 103 100 if (atomic_read(&tlb->mm->mm_users) < 2 || 104 101 cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){ 105 - pgtable_free(pgf); 102 + pgtable_free(table, shift); 106 103 return; 107 104 } 108 105 109 106 if (*batchp == NULL) { 110 107 *batchp = (struct pte_freelist_batch 
*)__get_free_page(GFP_ATOMIC); 111 108 if (*batchp == NULL) { 112 - pgtable_free_now(pgf); 109 + pgtable_free_now(table, shift); 113 110 return; 114 111 } 115 112 (*batchp)->index = 0; 116 113 } 114 + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); 115 + pgf = (unsigned long)table | shift; 117 116 (*batchp)->tables[(*batchp)->index++] = pgf; 118 117 if ((*batchp)->index == PTE_FREELIST_SIZE) { 119 118 pte_free_submit(*batchp);