Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arch/tile: support 4KB page size as well as 64KB

The Tilera architecture traditionally supports 64KB page sizes
to improve TLB utilization and performance when the
hardware is being used primarily to run a single application.

For more generic server scenarios, it can be beneficial to run
with 4KB page sizes, so this commit allows that to be specified
(by modifying the arch/tile/include/hv/pagesize.h header).

As part of this change, we also re-worked the PTE management
slightly so that PTE writes all go through a __set_pte() function
where we can do some additional validation. The set_pte_order()
function was eliminated since the "order" argument wasn't being used.

One bug uncovered was in the PCI DMA code, which wasn't properly
flushing the specified range. This was benign with 64KB pages,
but with 4KB pages we were getting some larger flushes wrong.

The per-cpu memory reservation code also needed updating to
conform with the newer percpu allocator; before it always chose 64KB,
and that was always correct, but with 4KB granularity we now have
to pay closer attention and reserve the amount of memory that will
be requested when the percpu code starts allocating.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>

+237 -137
-6
arch/tile/Kconfig
··· 202 202 By default, 2, i.e. 2^2 == 4 DDR2 controllers. 203 203 In a system with more controllers, this value should be raised. 204 204 205 - # Need 16MB areas to enable hugetlb 206 - # See build-time check in arch/tile/mm/init.c. 207 - config FORCE_MAX_ZONEORDER 208 - int 209 - default 9 210 - 211 205 choice 212 206 depends on !TILEGX 213 207 prompt "Memory split" if EXPERT
+1 -1
arch/tile/include/asm/hugetlb.h
··· 54 54 static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 55 55 pte_t *ptep, pte_t pte) 56 56 { 57 - set_pte_order(ptep, pte, HUGETLB_PAGE_ORDER); 57 + set_pte(ptep, pte); 58 58 } 59 59 60 60 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+12 -24
arch/tile/include/asm/page.h
··· 16 16 #define _ASM_TILE_PAGE_H 17 17 18 18 #include <linux/const.h> 19 + #include <hv/pagesize.h> 19 20 20 21 /* PAGE_SHIFT and HPAGE_SHIFT determine the page sizes. */ 21 - #define PAGE_SHIFT 16 22 - #define HPAGE_SHIFT 24 22 + #define PAGE_SHIFT HV_LOG2_PAGE_SIZE_SMALL 23 + #define HPAGE_SHIFT HV_LOG2_PAGE_SIZE_LARGE 23 24 24 25 #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) 25 26 #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) ··· 30 29 31 30 #ifdef __KERNEL__ 32 31 32 + /* 33 + * If the Kconfig doesn't specify, set a maximum zone order that 34 + * is enough so that we can create huge pages from small pages given 35 + * the respective sizes of the two page types. See <linux/mmzone.h>. 36 + */ 37 + #ifndef CONFIG_FORCE_MAX_ZONEORDER 38 + #define CONFIG_FORCE_MAX_ZONEORDER (HPAGE_SHIFT - PAGE_SHIFT + 1) 39 + #endif 40 + 33 41 #include <hv/hypervisor.h> 34 42 #include <arch/chip.h> 35 - 36 - /* 37 - * The {,H}PAGE_SHIFT values must match the HV_LOG2_PAGE_SIZE_xxx 38 - * definitions in <hv/hypervisor.h>. We validate this at build time 39 - * here, and again at runtime during early boot. We provide a 40 - * separate definition since userspace doesn't have <hv/hypervisor.h>. 41 - * 42 - * Be careful to distinguish PAGE_SHIFT from HV_PTE_INDEX_PFN, since 43 - * they are the same on i386 but not TILE. 44 - */ 45 - #if HV_LOG2_PAGE_SIZE_SMALL != PAGE_SHIFT 46 - # error Small page size mismatch in Linux 47 - #endif 48 - #if HV_LOG2_PAGE_SIZE_LARGE != HPAGE_SHIFT 49 - # error Huge page size mismatch in Linux 50 - #endif 51 43 52 44 #ifndef __ASSEMBLY__ 53 45 ··· 74 80 /* 75 81 * Hypervisor page tables are made of the same basic structure. 76 82 */ 77 - 78 - typedef __u64 pteval_t; 79 - typedef __u64 pmdval_t; 80 - typedef __u64 pudval_t; 81 - typedef __u64 pgdval_t; 82 - typedef __u64 pgprotval_t; 83 83 84 84 typedef HV_PTE pte_t; 85 85 typedef HV_PTE pgd_t;
+5 -2
arch/tile/include/asm/pgalloc.h
··· 41 41 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 42 42 { 43 43 #ifdef CONFIG_64BIT 44 - set_pte_order(pmdp, pmd, L2_USER_PGTABLE_ORDER); 44 + set_pte(pmdp, pmd); 45 45 #else 46 - set_pte_order(&pmdp->pud.pgd, pmd.pud.pgd, L2_USER_PGTABLE_ORDER); 46 + set_pte(&pmdp->pud.pgd, pmd.pud.pgd); 47 47 #endif 48 48 } 49 49 ··· 99 99 100 100 /* During init, we can shatter kernel huge pages if needed. */ 101 101 void shatter_pmd(pmd_t *pmd); 102 + 103 + /* After init, a more complex technique is required. */ 104 + void shatter_huge_page(unsigned long addr); 102 105 103 106 #ifdef __tilegx__ 104 107 /* We share a single page allocator for both L1 and L2 page tables. */
+12 -19
arch/tile/include/asm/pgtable.h
··· 233 233 #define pgd_ERROR(e) \ 234 234 pr_err("%s:%d: bad pgd 0x%016llx.\n", __FILE__, __LINE__, pgd_val(e)) 235 235 236 + /* Return PA and protection info for a given kernel VA. */ 237 + int va_to_cpa_and_pte(void *va, phys_addr_t *cpa, pte_t *pte); 238 + 236 239 /* 237 - * set_pte_order() sets the given PTE and also sanity-checks the 240 + * __set_pte() ensures we write the 64-bit PTE with 32-bit words in 241 + * the right order on 32-bit platforms and also allows us to write 242 + * hooks to check valid PTEs, etc., if we want. 243 + */ 244 + void __set_pte(pte_t *ptep, pte_t pte); 245 + 246 + /* 247 + * set_pte() sets the given PTE and also sanity-checks the 238 248 * requested PTE against the page homecaching. Unspecified parts 239 249 * of the PTE are filled in when it is written to memory, i.e. all 240 250 * caching attributes if "!forcecache", or the home cpu if "anyhome". 241 251 */ 242 - extern void set_pte_order(pte_t *ptep, pte_t pte, int order); 243 - 244 - #define set_pte(ptep, pteval) set_pte_order(ptep, pteval, 0) 252 + extern void set_pte(pte_t *ptep, pte_t pte); 245 253 #define set_pte_at(mm, addr, ptep, pteval) set_pte(ptep, pteval) 246 254 #define set_pte_atomic(pteptr, pteval) set_pte(pteptr, pteval) 247 255 ··· 299 291 #define __swp_entry(type, off) ((swp_entry_t) { (type) | ((off) << 5) }) 300 292 #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).val >> 32 }) 301 293 #define __swp_entry_to_pte(swp) ((pte_t) { (((long long) ((swp).val)) << 32) }) 302 - 303 - /* 304 - * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 305 - * 306 - * dst - pointer to pgd range anwhere on a pgd page 307 - * src - "" 308 - * count - the number of pgds to copy. 309 - * 310 - * dst and src can be on the same page, but the range must not overlap, 311 - * and must not cross a page boundary. 
312 - */ 313 - static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) 314 - { 315 - memcpy(dst, src, count * sizeof(pgd_t)); 316 - } 317 294 318 295 /* 319 296 * Conversion functions: convert a page and protection to a page entry,
+7 -1
arch/tile/include/asm/pgtable_32.h
··· 24 24 #define PGDIR_SIZE HV_PAGE_SIZE_LARGE 25 25 #define PGDIR_MASK (~(PGDIR_SIZE-1)) 26 26 #define PTRS_PER_PGD (1 << (32 - PGDIR_SHIFT)) 27 + #define SIZEOF_PGD (PTRS_PER_PGD * sizeof(pgd_t)) 27 28 28 29 /* 29 30 * The level-2 index is defined by the difference between the huge ··· 34 33 * this nomenclature is somewhat confusing. 35 34 */ 36 35 #define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL)) 36 + #define SIZEOF_PTE (PTRS_PER_PTE * sizeof(pte_t)) 37 37 38 38 #ifndef __ASSEMBLY__ 39 39 ··· 96 94 */ 97 95 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 98 96 #define __HAVE_ARCH_PTEP_SET_WRPROTECT 99 - #define __HAVE_ARCH_PTEP_GET_AND_CLEAR 100 97 101 98 extern int ptep_test_and_clear_young(struct vm_area_struct *, 102 99 unsigned long addr, pte_t *); ··· 109 108 pte_t pte = *ptep; 110 109 pte_clear(_mm, addr, ptep); 111 110 return pte; 111 + } 112 + 113 + static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval) 114 + { 115 + set_pte(&pmdp->pud.pgd, pmdval.pud.pgd); 112 116 } 113 117 114 118 /* Create a pmd from a PTFN. */
+2 -1
arch/tile/include/asm/stack.h
··· 18 18 #include <linux/types.h> 19 19 #include <linux/sched.h> 20 20 #include <asm/backtrace.h> 21 + #include <asm/page.h> 21 22 #include <hv/hypervisor.h> 22 23 23 24 /* Everything we need to keep track of a backtrace iteration */ 24 25 struct KBacktraceIterator { 25 26 BacktraceIterator it; 26 27 struct task_struct *task; /* task we are backtracing */ 27 - HV_PTE *pgtable; /* page table for user space access */ 28 + pte_t *pgtable; /* page table for user space access */ 28 29 int end; /* iteration complete. */ 29 30 int new_context; /* new context is starting */ 30 31 int profile; /* profiling, so stop on async intrpt */
+1
arch/tile/include/asm/thread_info.h
··· 68 68 #else 69 69 #define THREAD_SIZE_ORDER (0) 70 70 #endif 71 + #define THREAD_SIZE_PAGES (1 << THREAD_SIZE_ORDER) 71 72 72 73 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) 73 74 #define LOG2_THREAD_SIZE (PAGE_SHIFT + THREAD_SIZE_ORDER)
+13 -3
arch/tile/kernel/intvec_32.S
··· 1556 1556 .align 64 1557 1557 /* Align much later jump on the start of a cache line. */ 1558 1558 #if !ATOMIC_LOCKS_FOUND_VIA_TABLE() 1559 - nop; nop 1559 + nop 1560 + #if PAGE_SIZE >= 0x10000 1561 + nop 1562 + #endif 1560 1563 #endif 1561 1564 ENTRY(sys_cmpxchg) 1562 1565 ··· 1589 1586 * both a cmpxchg64() and a cmpxchg() on either its low or high word. 1590 1587 * NOTE: this must match __atomic_hashed_lock() in lib/atomic_32.c. 1591 1588 */ 1589 + 1590 + #if (PAGE_OFFSET & 0xffff) != 0 1591 + # error Code here assumes PAGE_OFFSET can be loaded with just hi16() 1592 + #endif 1592 1593 1593 1594 #if ATOMIC_LOCKS_FOUND_VIA_TABLE() 1594 1595 { ··· 1686 1679 lw r26, r0 1687 1680 } 1688 1681 { 1689 - /* atomic_locks is page aligned so this suffices to get its addr. */ 1690 - auli r21, zero, hi16(atomic_locks) 1682 + auli r21, zero, ha16(atomic_locks) 1691 1683 1692 1684 bbns r23, .Lcmpxchg_badaddr 1693 1685 } 1686 + #if PAGE_SIZE < 0x10000 1687 + /* atomic_locks is page-aligned so for big pages we don't need this. */ 1688 + addli r21, r21, lo16(atomic_locks) 1689 + #endif 1694 1690 { 1695 1691 /* 1696 1692 * Insert the hash bits into the page-aligned pointer.
+5 -2
arch/tile/kernel/machine_kexec.c
··· 240 240 pte = hv_pte(_PAGE_KERNEL | _PAGE_HUGE_PAGE); 241 241 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3); 242 242 243 - for (i = 0; i < pgd_index(PAGE_OFFSET); i++) 244 - pgtable[i] = pfn_pte(i << (HPAGE_SHIFT - PAGE_SHIFT), pte); 243 + for (i = 0; i < pgd_index(PAGE_OFFSET); i++) { 244 + unsigned long pfn = i << (HPAGE_SHIFT - PAGE_SHIFT); 245 + if (pfn_valid(pfn)) 246 + __set_pte(&pgtable[i], pfn_pte(pfn, pte)); 247 + } 245 248 } 246 249 247 250
+19 -19
arch/tile/kernel/pci-dma.c
··· 86 86 * can count on nothing having been touched. 87 87 */ 88 88 89 + /* Flush a PA range from cache page by page. */ 90 + static void __dma_map_pa_range(dma_addr_t dma_addr, size_t size) 91 + { 92 + struct page *page = pfn_to_page(PFN_DOWN(dma_addr)); 93 + size_t bytesleft = PAGE_SIZE - (dma_addr & (PAGE_SIZE - 1)); 94 + 95 + while ((ssize_t)size > 0) { 96 + /* Flush the page. */ 97 + homecache_flush_cache(page++, 0); 98 + 99 + /* Figure out if we need to continue on the next page. */ 100 + size -= bytesleft; 101 + bytesleft = PAGE_SIZE; 102 + } 103 + } 89 104 90 105 /* 91 106 * dma_map_single can be passed any memory address, and there appear ··· 112 97 dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, 113 98 enum dma_data_direction direction) 114 99 { 115 - struct page *page; 116 - dma_addr_t dma_addr; 117 - int thispage; 100 + dma_addr_t dma_addr = __pa(ptr); 118 101 119 102 BUG_ON(!valid_dma_direction(direction)); 120 103 WARN_ON(size == 0); 121 104 122 - dma_addr = __pa(ptr); 123 - 124 - /* We might have been handed a buffer that wraps a page boundary */ 125 - while ((int)size > 0) { 126 - /* The amount to flush that's on this page */ 127 - thispage = PAGE_SIZE - ((unsigned long)ptr & (PAGE_SIZE - 1)); 128 - thispage = min((int)thispage, (int)size); 129 - /* Is this valid for any page we could be handed? 
*/ 130 - page = pfn_to_page(kaddr_to_pfn(ptr)); 131 - homecache_flush_cache(page, 0); 132 - ptr += thispage; 133 - size -= thispage; 134 - } 105 + __dma_map_pa_range(dma_addr, size); 135 106 136 107 return dma_addr; 137 108 } ··· 141 140 WARN_ON(nents == 0 || sglist->length == 0); 142 141 143 142 for_each_sg(sglist, sg, nents, i) { 144 - struct page *page; 145 143 sg->dma_address = sg_phys(sg); 146 - page = pfn_to_page(sg->dma_address >> PAGE_SHIFT); 147 - homecache_flush_cache(page, 0); 144 + __dma_map_pa_range(sg->dma_address, sg->length); 148 145 } 149 146 150 147 return nents; ··· 162 163 { 163 164 BUG_ON(!valid_dma_direction(direction)); 164 165 166 + BUG_ON(offset + size > PAGE_SIZE); 165 167 homecache_flush_cache(page, 0); 166 168 167 169 return page_to_pa(page) + offset;
+1 -1
arch/tile/kernel/process.c
··· 165 165 kfree(step_state); 166 166 } 167 167 168 - free_page((unsigned long)info); 168 + free_pages((unsigned long)info, THREAD_SIZE_ORDER); 169 169 } 170 170 171 171 static void save_arch_state(struct thread_struct *t);
+12 -8
arch/tile/kernel/setup.c
··· 59 59 unsigned long __initdata node_percpu_pfn[MAX_NUMNODES]; 60 60 unsigned long __initdata node_free_pfn[MAX_NUMNODES]; 61 61 62 + static unsigned long __initdata node_percpu[MAX_NUMNODES]; 63 + 62 64 #ifdef CONFIG_HIGHMEM 63 65 /* Page frame index of end of lowmem on each controller. */ 64 66 unsigned long __cpuinitdata node_lowmem_end_pfn[MAX_NUMNODES]; ··· 556 554 reserve_bootmem(crashk_res.start, 557 555 crashk_res.end - crashk_res.start + 1, 0); 558 556 #endif 559 - 560 557 } 561 558 562 559 void *__init alloc_remap(int nid, unsigned long size) ··· 569 568 570 569 static int __init percpu_size(void) 571 570 { 572 - int size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE); 573 - #ifdef CONFIG_MODULES 574 - if (size < PERCPU_ENOUGH_ROOM) 575 - size = PERCPU_ENOUGH_ROOM; 576 - #endif 571 + int size = __per_cpu_end - __per_cpu_start; 572 + size += PERCPU_MODULE_RESERVE; 573 + size += PERCPU_DYNAMIC_EARLY_SIZE; 574 + if (size < PCPU_MIN_UNIT_SIZE) 575 + size = PCPU_MIN_UNIT_SIZE; 576 + size = roundup(size, PAGE_SIZE); 577 + 577 578 /* In several places we assume the per-cpu data fits on a huge page. 
*/ 578 579 BUG_ON(kdata_huge && size > HPAGE_SIZE); 579 580 return size; ··· 592 589 static void __init zone_sizes_init(void) 593 590 { 594 591 unsigned long zones_size[MAX_NR_ZONES] = { 0 }; 595 - unsigned long node_percpu[MAX_NUMNODES] = { 0 }; 596 592 int size = percpu_size(); 597 593 int num_cpus = smp_height * smp_width; 598 594 int i; ··· 676 674 NODE_DATA(i)->bdata = NODE_DATA(0)->bdata; 677 675 678 676 free_area_init_node(i, zones_size, start, NULL); 679 - printk(KERN_DEBUG " DMA zone: %ld per-cpu pages\n", 677 + printk(KERN_DEBUG " Normal zone: %ld per-cpu pages\n", 680 678 PFN_UP(node_percpu[i])); 681 679 682 680 /* Track the type of memory on each node */ ··· 1314 1312 1315 1313 BUG_ON(size % PAGE_SIZE != 0); 1316 1314 pfn_offset[nid] += size / PAGE_SIZE; 1315 + BUG_ON(node_percpu[nid] < size); 1316 + node_percpu[nid] -= size; 1317 1317 if (percpu_pfn[cpu] == 0) 1318 1318 percpu_pfn[cpu] = pfn; 1319 1319 return pfn_to_kaddr(pfn);
+2 -2
arch/tile/lib/memcpy_tile64.c
··· 96 96 newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1)); 97 97 pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc); 98 98 ptep = pte_offset_kernel(pmdp, newsrc); 99 - *ptep = src_pte; /* set_pte() would be confused by this */ 99 + __set_pte(ptep, src_pte); /* set_pte() would be confused by this */ 100 100 local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); 101 101 102 102 /* Actually move the data. */ ··· 109 109 */ 110 110 src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3); 111 111 src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */ 112 - *ptep = src_pte; /* set_pte() would be confused by this */ 112 + __set_pte(ptep, src_pte); /* set_pte() would be confused by this */ 113 113 local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); 114 114 115 115 /*
+1 -1
arch/tile/mm/homecache.c
··· 412 412 pte_t *ptep = virt_to_pte(NULL, kva); 413 413 pte_t pteval = *ptep; 414 414 BUG_ON(!pte_present(pteval) || pte_huge(pteval)); 415 - *ptep = pte_set_home(pteval, home); 415 + __set_pte(ptep, pte_set_home(pteval, home)); 416 416 } 417 417 } 418 418
+1 -17
arch/tile/mm/init.c
··· 53 53 54 54 #include "migrate.h" 55 55 56 - /* 57 - * We could set FORCE_MAX_ZONEORDER to "(HPAGE_SHIFT - PAGE_SHIFT + 1)" 58 - * in the Tile Kconfig, but this generates configure warnings. 59 - * Do it here and force people to get it right to compile this file. 60 - * The problem is that with 4KB small pages and 16MB huge pages, 61 - * the default value doesn't allow us to group enough small pages 62 - * together to make up a huge page. 63 - */ 64 - #if CONFIG_FORCE_MAX_ZONEORDER < HPAGE_SHIFT - PAGE_SHIFT + 1 65 - # error "Change FORCE_MAX_ZONEORDER in arch/tile/Kconfig to match page size" 66 - #endif 67 - 68 56 #define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0)) 69 57 70 58 #ifndef __tilegx__ ··· 950 962 951 963 void __init pgtable_cache_init(void) 952 964 { 953 - pgd_cache = kmem_cache_create("pgd", 954 - PTRS_PER_PGD*sizeof(pgd_t), 955 - PTRS_PER_PGD*sizeof(pgd_t), 956 - 0, 957 - NULL); 965 + pgd_cache = kmem_cache_create("pgd", SIZEOF_PGD, SIZEOF_PGD, 0, NULL); 958 966 if (!pgd_cache) 959 967 panic("pgtable_cache_init(): Cannot create pgd cache"); 960 968 }
+1
arch/tile/mm/migrate_32.S
··· 18 18 #include <linux/linkage.h> 19 19 #include <linux/threads.h> 20 20 #include <asm/page.h> 21 + #include <asm/thread_info.h> 21 22 #include <asm/types.h> 22 23 #include <asm/asm-offsets.h> 23 24 #include <hv/hypervisor.h>
+142 -30
arch/tile/mm/pgtable.c
··· 142 142 } 143 143 #endif 144 144 145 + /** 146 + * shatter_huge_page() - ensure a given address is mapped by a small page. 147 + * 148 + * This function converts a huge PTE mapping kernel LOWMEM into a bunch 149 + * of small PTEs with the same caching. No cache flush required, but we 150 + * must do a global TLB flush. 151 + * 152 + * Any caller that wishes to modify a kernel mapping that might 153 + * have been made with a huge page should call this function, 154 + * since doing so properly avoids race conditions with installing the 155 + * newly-shattered page and then flushing all the TLB entries. 156 + * 157 + * @addr: Address at which to shatter any existing huge page. 158 + */ 159 + void shatter_huge_page(unsigned long addr) 160 + { 161 + pgd_t *pgd; 162 + pud_t *pud; 163 + pmd_t *pmd; 164 + unsigned long flags = 0; /* happy compiler */ 165 + #ifdef __PAGETABLE_PMD_FOLDED 166 + struct list_head *pos; 167 + #endif 168 + 169 + /* Get a pointer to the pmd entry that we need to change. */ 170 + addr &= HPAGE_MASK; 171 + BUG_ON(pgd_addr_invalid(addr)); 172 + BUG_ON(addr < PAGE_OFFSET); /* only for kernel LOWMEM */ 173 + pgd = swapper_pg_dir + pgd_index(addr); 174 + pud = pud_offset(pgd, addr); 175 + BUG_ON(!pud_present(*pud)); 176 + pmd = pmd_offset(pud, addr); 177 + BUG_ON(!pmd_present(*pmd)); 178 + if (!pmd_huge_page(*pmd)) 179 + return; 180 + 181 + /* 182 + * Grab the pgd_lock, since we may need it to walk the pgd_list, 183 + * and since we need some kind of lock here to avoid races. 184 + */ 185 + spin_lock_irqsave(&pgd_lock, flags); 186 + if (!pmd_huge_page(*pmd)) { 187 + /* Lost the race to convert the huge page. */ 188 + spin_unlock_irqrestore(&pgd_lock, flags); 189 + return; 190 + } 191 + 192 + /* Shatter the huge page into the preallocated L2 page table. */ 193 + pmd_populate_kernel(&init_mm, pmd, 194 + get_prealloc_pte(pte_pfn(*(pte_t *)pmd))); 195 + 196 + #ifdef __PAGETABLE_PMD_FOLDED 197 + /* Walk every pgd on the system and update the pmd there. 
*/ 198 + list_for_each(pos, &pgd_list) { 199 + pmd_t *copy_pmd; 200 + pgd = list_to_pgd(pos) + pgd_index(addr); 201 + pud = pud_offset(pgd, addr); 202 + copy_pmd = pmd_offset(pud, addr); 203 + __set_pmd(copy_pmd, *pmd); 204 + } 205 + #endif 206 + 207 + /* Tell every cpu to notice the change. */ 208 + flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE, 209 + cpu_possible_mask, NULL, 0); 210 + 211 + /* Hold the lock until the TLB flush is finished to avoid races. */ 212 + spin_unlock_irqrestore(&pgd_lock, flags); 213 + } 214 + 145 215 /* 146 216 * List of all pgd's needed so it can invalidate entries in both cached 147 217 * and uncached pgd's. This is essentially codepath-based locking ··· 254 184 BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0); 255 185 #endif 256 186 257 - clone_pgd_range(pgd + KERNEL_PGD_INDEX_START, 258 - swapper_pg_dir + KERNEL_PGD_INDEX_START, 259 - KERNEL_PGD_PTRS); 187 + memcpy(pgd + KERNEL_PGD_INDEX_START, 188 + swapper_pg_dir + KERNEL_PGD_INDEX_START, 189 + KERNEL_PGD_PTRS * sizeof(pgd_t)); 260 190 261 191 pgd_list_add(pgd); 262 192 spin_unlock_irqrestore(&pgd_lock, flags); ··· 290 220 291 221 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) 292 222 { 293 - gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP; 223 + gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO; 294 224 struct page *p; 225 + #if L2_USER_PGTABLE_ORDER > 0 226 + int i; 227 + #endif 295 228 296 229 #ifdef CONFIG_HIGHPTE 297 230 flags |= __GFP_HIGHMEM; ··· 303 230 p = alloc_pages(flags, L2_USER_PGTABLE_ORDER); 304 231 if (p == NULL) 305 232 return NULL; 233 + 234 + #if L2_USER_PGTABLE_ORDER > 0 235 + /* 236 + * Make every page have a page_count() of one, not just the first. 237 + * We don't use __GFP_COMP since it doesn't look like it works 238 + * correctly with tlb_remove_page(). 
239 + */ 240 + for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) { 241 + init_page_count(p+i); 242 + inc_zone_page_state(p+i, NR_PAGETABLE); 243 + } 244 + #endif 306 245 307 246 pgtable_page_ctor(p); 308 247 return p; ··· 327 242 */ 328 243 void pte_free(struct mm_struct *mm, struct page *p) 329 244 { 245 + int i; 246 + 330 247 pgtable_page_dtor(p); 331 - __free_pages(p, L2_USER_PGTABLE_ORDER); 248 + __free_page(p); 249 + 250 + for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) { 251 + __free_page(p+i); 252 + dec_zone_page_state(p+i, NR_PAGETABLE); 253 + } 332 254 } 333 255 334 256 void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, ··· 344 252 int i; 345 253 346 254 pgtable_page_dtor(pte); 347 - for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) 255 + tlb_remove_page(tlb, pte); 256 + 257 + for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) { 348 258 tlb_remove_page(tlb, pte + i); 259 + dec_zone_page_state(pte + i, NR_PAGETABLE); 260 + } 349 261 } 350 262 351 263 #ifndef __tilegx__ ··· 431 335 return x + y * smp_width; 432 336 } 433 337 434 - void set_pte_order(pte_t *ptep, pte_t pte, int order) 338 + /* 339 + * Convert a kernel VA to a PA and homing information. 340 + */ 341 + int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte) 435 342 { 436 - unsigned long pfn = pte_pfn(pte); 437 - struct page *page = pfn_to_page(pfn); 343 + struct page *page = virt_to_page(va); 344 + pte_t null_pte = { 0 }; 345 + 346 + *cpa = __pa(va); 347 + 348 + /* Note that this is not writing a page table, just returning a pte. */ 349 + *pte = pte_set_home(null_pte, page_home(page)); 350 + 351 + return 0; /* return non-zero if not hfh? 
*/ 352 + } 353 + EXPORT_SYMBOL(va_to_cpa_and_pte); 354 + 355 + void __set_pte(pte_t *ptep, pte_t pte) 356 + { 357 + #ifdef __tilegx__ 358 + *ptep = pte; 359 + #else 360 + # if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32 361 + # error Must write the present and migrating bits last 362 + # endif 363 + if (pte_present(pte)) { 364 + ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); 365 + barrier(); 366 + ((u32 *)ptep)[0] = (u32)(pte_val(pte)); 367 + } else { 368 + ((u32 *)ptep)[0] = (u32)(pte_val(pte)); 369 + barrier(); 370 + ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); 371 + } 372 + #endif /* __tilegx__ */ 373 + } 374 + 375 + void set_pte(pte_t *ptep, pte_t pte) 376 + { 377 + struct page *page = pfn_to_page(pte_pfn(pte)); 438 378 439 379 /* Update the home of a PTE if necessary */ 440 380 pte = pte_set_home(pte, page_home(page)); 441 381 442 - #ifdef __tilegx__ 443 - *ptep = pte; 444 - #else 445 - /* 446 - * When setting a PTE, write the high bits first, then write 447 - * the low bits. This sets the "present" bit only after the 448 - * other bits are in place. If a particular PTE update 449 - * involves transitioning from one valid PTE to another, it 450 - * may be necessary to call set_pte_order() more than once, 451 - * transitioning via a suitable intermediate state. 452 - * Note that this sequence also means that if we are transitioning 453 - * from any migrating PTE to a non-migrating one, we will not 454 - * see a half-updated PTE with the migrating bit off. 455 - */ 456 - #if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32 457 - # error Must write the present and migrating bits last 458 - #endif 459 - ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); 460 - barrier(); 461 - ((u32 *)ptep)[0] = (u32)(pte_val(pte)); 462 - #endif 382 + __set_pte(ptep, pte); 463 383 } 464 384 465 385 /* Can this mm load a PTE with cached_priority set? */