Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arch/tile: support 4KB page size as well as 64KB

The Tilera architecture traditionally supports 64KB page sizes
to improve TLB utilization and performance when the
hardware is being used primarily to run a single application.

For more generic server scenarios, it can be beneficial to run
with 4KB page sizes, so this commit allows that to be specified
(by modifying the arch/tile/include/hv/pagesize.h header).

As part of this change, we also re-worked the PTE management
slightly so that PTE writes all go through a __set_pte() function
where we can do some additional validation. The set_pte_order()
function was eliminated since the "order" argument wasn't being used.

One bug uncovered was in the PCI DMA code, which wasn't properly
flushing the specified range. This was benign with 64KB pages,
but with 4KB pages we were getting some larger flushes wrong.

The per-cpu memory reservation code also needed updating to
conform with the newer percpu allocator; before it always chose 64KB,
and that was always correct, but with 4KB granularity we now have
to pay closer attention and reserve the amount of memory that will
be requested when the percpu code starts allocating.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>

+237 -137
-6
arch/tile/Kconfig
··· 202 202 By default, 2, i.e. 2^2 == 4 DDR2 controllers. 203 203 In a system with more controllers, this value should be raised. 204 204 205 - # Need 16MB areas to enable hugetlb 206 - # See build-time check in arch/tile/mm/init.c. 207 - config FORCE_MAX_ZONEORDER 208 - int 209 - default 9 210 - 211 205 choice 212 206 depends on !TILEGX 213 207 prompt "Memory split" if EXPERT
+1 -1
arch/tile/include/asm/hugetlb.h
··· 54 54 static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 55 55 pte_t *ptep, pte_t pte) 56 56 { 57 - set_pte_order(ptep, pte, HUGETLB_PAGE_ORDER); 57 + set_pte(ptep, pte); 58 58 } 59 59 60 60 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+12 -24
arch/tile/include/asm/page.h
··· 16 16 #define _ASM_TILE_PAGE_H 17 17 18 18 #include <linux/const.h> 19 + #include <hv/pagesize.h> 19 20 20 21 /* PAGE_SHIFT and HPAGE_SHIFT determine the page sizes. */ 21 - #define PAGE_SHIFT 16 22 - #define HPAGE_SHIFT 24 22 + #define PAGE_SHIFT HV_LOG2_PAGE_SIZE_SMALL 23 + #define HPAGE_SHIFT HV_LOG2_PAGE_SIZE_LARGE 23 24 24 25 #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) 25 26 #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) ··· 30 29 31 30 #ifdef __KERNEL__ 32 31 32 + /* 33 + * If the Kconfig doesn't specify, set a maximum zone order that 34 + * is enough so that we can create huge pages from small pages given 35 + * the respective sizes of the two page types. See <linux/mmzone.h>. 36 + */ 37 + #ifndef CONFIG_FORCE_MAX_ZONEORDER 38 + #define CONFIG_FORCE_MAX_ZONEORDER (HPAGE_SHIFT - PAGE_SHIFT + 1) 39 + #endif 40 + 33 41 #include <hv/hypervisor.h> 34 42 #include <arch/chip.h> 35 - 36 - /* 37 - * The {,H}PAGE_SHIFT values must match the HV_LOG2_PAGE_SIZE_xxx 38 - * definitions in <hv/hypervisor.h>. We validate this at build time 39 - * here, and again at runtime during early boot. We provide a 40 - * separate definition since userspace doesn't have <hv/hypervisor.h>. 41 - * 42 - * Be careful to distinguish PAGE_SHIFT from HV_PTE_INDEX_PFN, since 43 - * they are the same on i386 but not TILE. 44 - */ 45 - #if HV_LOG2_PAGE_SIZE_SMALL != PAGE_SHIFT 46 - # error Small page size mismatch in Linux 47 - #endif 48 - #if HV_LOG2_PAGE_SIZE_LARGE != HPAGE_SHIFT 49 - # error Huge page size mismatch in Linux 50 - #endif 51 43 52 44 #ifndef __ASSEMBLY__ 53 45 ··· 74 80 /* 75 81 * Hypervisor page tables are made of the same basic structure. 76 82 */ 77 - 78 - typedef __u64 pteval_t; 79 - typedef __u64 pmdval_t; 80 - typedef __u64 pudval_t; 81 - typedef __u64 pgdval_t; 82 - typedef __u64 pgprotval_t; 83 83 84 84 typedef HV_PTE pte_t; 85 85 typedef HV_PTE pgd_t;
+5 -2
arch/tile/include/asm/pgalloc.h
··· 41 41 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 42 42 { 43 43 #ifdef CONFIG_64BIT 44 - set_pte_order(pmdp, pmd, L2_USER_PGTABLE_ORDER); 44 + set_pte(pmdp, pmd); 45 45 #else 46 - set_pte_order(&pmdp->pud.pgd, pmd.pud.pgd, L2_USER_PGTABLE_ORDER); 46 + set_pte(&pmdp->pud.pgd, pmd.pud.pgd); 47 47 #endif 48 48 } 49 49 ··· 99 99 100 100 /* During init, we can shatter kernel huge pages if needed. */ 101 101 void shatter_pmd(pmd_t *pmd); 102 + 103 + /* After init, a more complex technique is required. */ 104 + void shatter_huge_page(unsigned long addr); 102 105 103 106 #ifdef __tilegx__ 104 107 /* We share a single page allocator for both L1 and L2 page tables. */
+12 -19
arch/tile/include/asm/pgtable.h
··· 233 233 #define pgd_ERROR(e) \ 234 234 pr_err("%s:%d: bad pgd 0x%016llx.\n", __FILE__, __LINE__, pgd_val(e)) 235 235 236 + /* Return PA and protection info for a given kernel VA. */ 237 + int va_to_cpa_and_pte(void *va, phys_addr_t *cpa, pte_t *pte); 238 + 236 239 /* 237 - * set_pte_order() sets the given PTE and also sanity-checks the 240 + * __set_pte() ensures we write the 64-bit PTE with 32-bit words in 241 + * the right order on 32-bit platforms and also allows us to write 242 + * hooks to check valid PTEs, etc., if we want. 243 + */ 244 + void __set_pte(pte_t *ptep, pte_t pte); 245 + 246 + /* 247 + * set_pte() sets the given PTE and also sanity-checks the 238 248 * requested PTE against the page homecaching. Unspecified parts 239 249 * of the PTE are filled in when it is written to memory, i.e. all 240 250 * caching attributes if "!forcecache", or the home cpu if "anyhome". 241 251 */ 242 - extern void set_pte_order(pte_t *ptep, pte_t pte, int order); 243 - 244 - #define set_pte(ptep, pteval) set_pte_order(ptep, pteval, 0) 252 + extern void set_pte(pte_t *ptep, pte_t pte); 245 253 #define set_pte_at(mm, addr, ptep, pteval) set_pte(ptep, pteval) 246 254 #define set_pte_atomic(pteptr, pteval) set_pte(pteptr, pteval) 247 255 ··· 299 291 #define __swp_entry(type, off) ((swp_entry_t) { (type) | ((off) << 5) }) 300 292 #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).val >> 32 }) 301 293 #define __swp_entry_to_pte(swp) ((pte_t) { (((long long) ((swp).val)) << 32) }) 302 - 303 - /* 304 - * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 305 - * 306 - * dst - pointer to pgd range anwhere on a pgd page 307 - * src - "" 308 - * count - the number of pgds to copy. 309 - * 310 - * dst and src can be on the same page, but the range must not overlap, 311 - * and must not cross a page boundary. 
312 - */ 313 - static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) 314 - { 315 - memcpy(dst, src, count * sizeof(pgd_t)); 316 - } 317 294 318 295 /* 319 296 * Conversion functions: convert a page and protection to a page entry,
+7 -1
arch/tile/include/asm/pgtable_32.h
··· 24 24 #define PGDIR_SIZE HV_PAGE_SIZE_LARGE 25 25 #define PGDIR_MASK (~(PGDIR_SIZE-1)) 26 26 #define PTRS_PER_PGD (1 << (32 - PGDIR_SHIFT)) 27 + #define SIZEOF_PGD (PTRS_PER_PGD * sizeof(pgd_t)) 27 28 28 29 /* 29 30 * The level-2 index is defined by the difference between the huge ··· 34 33 * this nomenclature is somewhat confusing. 35 34 */ 36 35 #define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL)) 36 + #define SIZEOF_PTE (PTRS_PER_PTE * sizeof(pte_t)) 37 37 38 38 #ifndef __ASSEMBLY__ 39 39 ··· 96 94 */ 97 95 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 98 96 #define __HAVE_ARCH_PTEP_SET_WRPROTECT 99 - #define __HAVE_ARCH_PTEP_GET_AND_CLEAR 100 97 101 98 extern int ptep_test_and_clear_young(struct vm_area_struct *, 102 99 unsigned long addr, pte_t *); ··· 109 108 pte_t pte = *ptep; 110 109 pte_clear(_mm, addr, ptep); 111 110 return pte; 111 + } 112 + 113 + static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval) 114 + { 115 + set_pte(&pmdp->pud.pgd, pmdval.pud.pgd); 112 116 } 113 117 114 118 /* Create a pmd from a PTFN. */
+2 -1
arch/tile/include/asm/stack.h
··· 18 18 #include <linux/types.h> 19 19 #include <linux/sched.h> 20 20 #include <asm/backtrace.h> 21 + #include <asm/page.h> 21 22 #include <hv/hypervisor.h> 22 23 23 24 /* Everything we need to keep track of a backtrace iteration */ 24 25 struct KBacktraceIterator { 25 26 BacktraceIterator it; 26 27 struct task_struct *task; /* task we are backtracing */ 27 - HV_PTE *pgtable; /* page table for user space access */ 28 + pte_t *pgtable; /* page table for user space access */ 28 29 int end; /* iteration complete. */ 29 30 int new_context; /* new context is starting */ 30 31 int profile; /* profiling, so stop on async intrpt */
+1
arch/tile/include/asm/thread_info.h
··· 68 68 #else 69 69 #define THREAD_SIZE_ORDER (0) 70 70 #endif 71 + #define THREAD_SIZE_PAGES (1 << THREAD_SIZE_ORDER) 71 72 72 73 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) 73 74 #define LOG2_THREAD_SIZE (PAGE_SHIFT + THREAD_SIZE_ORDER)
+13 -3
arch/tile/kernel/intvec_32.S
··· 1556 1556 .align 64 1557 1557 /* Align much later jump on the start of a cache line. */ 1558 1558 #if !ATOMIC_LOCKS_FOUND_VIA_TABLE() 1559 - nop; nop 1559 + nop 1560 + #if PAGE_SIZE >= 0x10000 1561 + nop 1562 + #endif 1560 1563 #endif 1561 1564 ENTRY(sys_cmpxchg) 1562 1565 ··· 1589 1586 * both a cmpxchg64() and a cmpxchg() on either its low or high word. 1590 1587 * NOTE: this must match __atomic_hashed_lock() in lib/atomic_32.c. 1591 1588 */ 1589 + 1590 + #if (PAGE_OFFSET & 0xffff) != 0 1591 + # error Code here assumes PAGE_OFFSET can be loaded with just hi16() 1592 + #endif 1592 1593 1593 1594 #if ATOMIC_LOCKS_FOUND_VIA_TABLE() 1594 1595 { ··· 1686 1679 lw r26, r0 1687 1680 } 1688 1681 { 1689 - /* atomic_locks is page aligned so this suffices to get its addr. */ 1690 - auli r21, zero, hi16(atomic_locks) 1682 + auli r21, zero, ha16(atomic_locks) 1691 1683 1692 1684 bbns r23, .Lcmpxchg_badaddr 1693 1685 } 1686 + #if PAGE_SIZE < 0x10000 1687 + /* atomic_locks is page-aligned so for big pages we don't need this. */ 1688 + addli r21, r21, lo16(atomic_locks) 1689 + #endif 1694 1690 { 1695 1691 /* 1696 1692 * Insert the hash bits into the page-aligned pointer.
+5 -2
arch/tile/kernel/machine_kexec.c
··· 240 240 pte = hv_pte(_PAGE_KERNEL | _PAGE_HUGE_PAGE); 241 241 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3); 242 242 243 - for (i = 0; i < pgd_index(PAGE_OFFSET); i++) 244 - pgtable[i] = pfn_pte(i << (HPAGE_SHIFT - PAGE_SHIFT), pte); 243 + for (i = 0; i < pgd_index(PAGE_OFFSET); i++) { 244 + unsigned long pfn = i << (HPAGE_SHIFT - PAGE_SHIFT); 245 + if (pfn_valid(pfn)) 246 + __set_pte(&pgtable[i], pfn_pte(pfn, pte)); 247 + } 245 248 } 246 249 247 250
+19 -19
arch/tile/kernel/pci-dma.c
··· 86 86 * can count on nothing having been touched. 87 87 */ 88 88 89 + /* Flush a PA range from cache page by page. */ 90 + static void __dma_map_pa_range(dma_addr_t dma_addr, size_t size) 91 + { 92 + struct page *page = pfn_to_page(PFN_DOWN(dma_addr)); 93 + size_t bytesleft = PAGE_SIZE - (dma_addr & (PAGE_SIZE - 1)); 94 + 95 + while ((ssize_t)size > 0) { 96 + /* Flush the page. */ 97 + homecache_flush_cache(page++, 0); 98 + 99 + /* Figure out if we need to continue on the next page. */ 100 + size -= bytesleft; 101 + bytesleft = PAGE_SIZE; 102 + } 103 + } 89 104 90 105 /* 91 106 * dma_map_single can be passed any memory address, and there appear ··· 112 97 dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, 113 98 enum dma_data_direction direction) 114 99 { 115 - struct page *page; 116 - dma_addr_t dma_addr; 117 - int thispage; 100 + dma_addr_t dma_addr = __pa(ptr); 118 101 119 102 BUG_ON(!valid_dma_direction(direction)); 120 103 WARN_ON(size == 0); 121 104 122 - dma_addr = __pa(ptr); 123 - 124 - /* We might have been handed a buffer that wraps a page boundary */ 125 - while ((int)size > 0) { 126 - /* The amount to flush that's on this page */ 127 - thispage = PAGE_SIZE - ((unsigned long)ptr & (PAGE_SIZE - 1)); 128 - thispage = min((int)thispage, (int)size); 129 - /* Is this valid for any page we could be handed? 
*/ 130 - page = pfn_to_page(kaddr_to_pfn(ptr)); 131 - homecache_flush_cache(page, 0); 132 - ptr += thispage; 133 - size -= thispage; 134 - } 105 + __dma_map_pa_range(dma_addr, size); 135 106 136 107 return dma_addr; 137 108 } ··· 141 140 WARN_ON(nents == 0 || sglist->length == 0); 142 141 143 142 for_each_sg(sglist, sg, nents, i) { 144 - struct page *page; 145 143 sg->dma_address = sg_phys(sg); 146 - page = pfn_to_page(sg->dma_address >> PAGE_SHIFT); 147 - homecache_flush_cache(page, 0); 144 + __dma_map_pa_range(sg->dma_address, sg->length); 148 145 } 149 146 150 147 return nents; ··· 162 163 { 163 164 BUG_ON(!valid_dma_direction(direction)); 164 165 166 + BUG_ON(offset + size > PAGE_SIZE); 165 167 homecache_flush_cache(page, 0); 166 168 167 169 return page_to_pa(page) + offset;
+1 -1
arch/tile/kernel/process.c
··· 165 165 kfree(step_state); 166 166 } 167 167 168 - free_page((unsigned long)info); 168 + free_pages((unsigned long)info, THREAD_SIZE_ORDER); 169 169 } 170 170 171 171 static void save_arch_state(struct thread_struct *t);
+12 -8
arch/tile/kernel/setup.c
··· 59 59 unsigned long __initdata node_percpu_pfn[MAX_NUMNODES]; 60 60 unsigned long __initdata node_free_pfn[MAX_NUMNODES]; 61 61 62 + static unsigned long __initdata node_percpu[MAX_NUMNODES]; 63 + 62 64 #ifdef CONFIG_HIGHMEM 63 65 /* Page frame index of end of lowmem on each controller. */ 64 66 unsigned long __cpuinitdata node_lowmem_end_pfn[MAX_NUMNODES]; ··· 556 554 reserve_bootmem(crashk_res.start, 557 555 crashk_res.end - crashk_res.start + 1, 0); 558 556 #endif 559 - 560 557 } 561 558 562 559 void *__init alloc_remap(int nid, unsigned long size) ··· 569 568 570 569 static int __init percpu_size(void) 571 570 { 572 - int size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE); 573 - #ifdef CONFIG_MODULES 574 - if (size < PERCPU_ENOUGH_ROOM) 575 - size = PERCPU_ENOUGH_ROOM; 576 - #endif 571 + int size = __per_cpu_end - __per_cpu_start; 572 + size += PERCPU_MODULE_RESERVE; 573 + size += PERCPU_DYNAMIC_EARLY_SIZE; 574 + if (size < PCPU_MIN_UNIT_SIZE) 575 + size = PCPU_MIN_UNIT_SIZE; 576 + size = roundup(size, PAGE_SIZE); 577 + 577 578 /* In several places we assume the per-cpu data fits on a huge page. 
*/ 578 579 BUG_ON(kdata_huge && size > HPAGE_SIZE); 579 580 return size; ··· 592 589 static void __init zone_sizes_init(void) 593 590 { 594 591 unsigned long zones_size[MAX_NR_ZONES] = { 0 }; 595 - unsigned long node_percpu[MAX_NUMNODES] = { 0 }; 596 592 int size = percpu_size(); 597 593 int num_cpus = smp_height * smp_width; 598 594 int i; ··· 676 674 NODE_DATA(i)->bdata = NODE_DATA(0)->bdata; 677 675 678 676 free_area_init_node(i, zones_size, start, NULL); 679 - printk(KERN_DEBUG " DMA zone: %ld per-cpu pages\n", 677 + printk(KERN_DEBUG " Normal zone: %ld per-cpu pages\n", 680 678 PFN_UP(node_percpu[i])); 681 679 682 680 /* Track the type of memory on each node */ ··· 1314 1312 1315 1313 BUG_ON(size % PAGE_SIZE != 0); 1316 1314 pfn_offset[nid] += size / PAGE_SIZE; 1315 + BUG_ON(node_percpu[nid] < size); 1316 + node_percpu[nid] -= size; 1317 1317 if (percpu_pfn[cpu] == 0) 1318 1318 percpu_pfn[cpu] = pfn; 1319 1319 return pfn_to_kaddr(pfn);
+2 -2
arch/tile/lib/memcpy_tile64.c
··· 96 96 newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1)); 97 97 pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc); 98 98 ptep = pte_offset_kernel(pmdp, newsrc); 99 - *ptep = src_pte; /* set_pte() would be confused by this */ 99 + __set_pte(ptep, src_pte); /* set_pte() would be confused by this */ 100 100 local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); 101 101 102 102 /* Actually move the data. */ ··· 109 109 */ 110 110 src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3); 111 111 src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */ 112 - *ptep = src_pte; /* set_pte() would be confused by this */ 112 + __set_pte(ptep, src_pte); /* set_pte() would be confused by this */ 113 113 local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); 114 114 115 115 /*
+1 -1
arch/tile/mm/homecache.c
··· 412 412 pte_t *ptep = virt_to_pte(NULL, kva); 413 413 pte_t pteval = *ptep; 414 414 BUG_ON(!pte_present(pteval) || pte_huge(pteval)); 415 - *ptep = pte_set_home(pteval, home); 415 + __set_pte(ptep, pte_set_home(pteval, home)); 416 416 } 417 417 } 418 418
+1 -17
arch/tile/mm/init.c
··· 53 53 54 54 #include "migrate.h" 55 55 56 - /* 57 - * We could set FORCE_MAX_ZONEORDER to "(HPAGE_SHIFT - PAGE_SHIFT + 1)" 58 - * in the Tile Kconfig, but this generates configure warnings. 59 - * Do it here and force people to get it right to compile this file. 60 - * The problem is that with 4KB small pages and 16MB huge pages, 61 - * the default value doesn't allow us to group enough small pages 62 - * together to make up a huge page. 63 - */ 64 - #if CONFIG_FORCE_MAX_ZONEORDER < HPAGE_SHIFT - PAGE_SHIFT + 1 65 - # error "Change FORCE_MAX_ZONEORDER in arch/tile/Kconfig to match page size" 66 - #endif 67 - 68 56 #define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0)) 69 57 70 58 #ifndef __tilegx__ ··· 950 962 951 963 void __init pgtable_cache_init(void) 952 964 { 953 - pgd_cache = kmem_cache_create("pgd", 954 - PTRS_PER_PGD*sizeof(pgd_t), 955 - PTRS_PER_PGD*sizeof(pgd_t), 956 - 0, 957 - NULL); 965 + pgd_cache = kmem_cache_create("pgd", SIZEOF_PGD, SIZEOF_PGD, 0, NULL); 958 966 if (!pgd_cache) 959 967 panic("pgtable_cache_init(): Cannot create pgd cache"); 960 968 }
+1
arch/tile/mm/migrate_32.S
··· 18 18 #include <linux/linkage.h> 19 19 #include <linux/threads.h> 20 20 #include <asm/page.h> 21 + #include <asm/thread_info.h> 21 22 #include <asm/types.h> 22 23 #include <asm/asm-offsets.h> 23 24 #include <hv/hypervisor.h>
+142 -30
arch/tile/mm/pgtable.c
··· 142 142 } 143 143 #endif 144 144 145 + /** 146 + * shatter_huge_page() - ensure a given address is mapped by a small page. 147 + * 148 + * This function converts a huge PTE mapping kernel LOWMEM into a bunch 149 + * of small PTEs with the same caching. No cache flush required, but we 150 + * must do a global TLB flush. 151 + * 152 + * Any caller that wishes to modify a kernel mapping that might 153 + * have been made with a huge page should call this function, 154 + * since doing so properly avoids race conditions with installing the 155 + * newly-shattered page and then flushing all the TLB entries. 156 + * 157 + * @addr: Address at which to shatter any existing huge page. 158 + */ 159 + void shatter_huge_page(unsigned long addr) 160 + { 161 + pgd_t *pgd; 162 + pud_t *pud; 163 + pmd_t *pmd; 164 + unsigned long flags = 0; /* happy compiler */ 165 + #ifdef __PAGETABLE_PMD_FOLDED 166 + struct list_head *pos; 167 + #endif 168 + 169 + /* Get a pointer to the pmd entry that we need to change. */ 170 + addr &= HPAGE_MASK; 171 + BUG_ON(pgd_addr_invalid(addr)); 172 + BUG_ON(addr < PAGE_OFFSET); /* only for kernel LOWMEM */ 173 + pgd = swapper_pg_dir + pgd_index(addr); 174 + pud = pud_offset(pgd, addr); 175 + BUG_ON(!pud_present(*pud)); 176 + pmd = pmd_offset(pud, addr); 177 + BUG_ON(!pmd_present(*pmd)); 178 + if (!pmd_huge_page(*pmd)) 179 + return; 180 + 181 + /* 182 + * Grab the pgd_lock, since we may need it to walk the pgd_list, 183 + * and since we need some kind of lock here to avoid races. 184 + */ 185 + spin_lock_irqsave(&pgd_lock, flags); 186 + if (!pmd_huge_page(*pmd)) { 187 + /* Lost the race to convert the huge page. */ 188 + spin_unlock_irqrestore(&pgd_lock, flags); 189 + return; 190 + } 191 + 192 + /* Shatter the huge page into the preallocated L2 page table. */ 193 + pmd_populate_kernel(&init_mm, pmd, 194 + get_prealloc_pte(pte_pfn(*(pte_t *)pmd))); 195 + 196 + #ifdef __PAGETABLE_PMD_FOLDED 197 + /* Walk every pgd on the system and update the pmd there. 
*/ 198 + list_for_each(pos, &pgd_list) { 199 + pmd_t *copy_pmd; 200 + pgd = list_to_pgd(pos) + pgd_index(addr); 201 + pud = pud_offset(pgd, addr); 202 + copy_pmd = pmd_offset(pud, addr); 203 + __set_pmd(copy_pmd, *pmd); 204 + } 205 + #endif 206 + 207 + /* Tell every cpu to notice the change. */ 208 + flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE, 209 + cpu_possible_mask, NULL, 0); 210 + 211 + /* Hold the lock until the TLB flush is finished to avoid races. */ 212 + spin_unlock_irqrestore(&pgd_lock, flags); 213 + } 214 + 145 215 /* 146 216 * List of all pgd's needed so it can invalidate entries in both cached 147 217 * and uncached pgd's. This is essentially codepath-based locking ··· 254 184 BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0); 255 185 #endif 256 186 257 - clone_pgd_range(pgd + KERNEL_PGD_INDEX_START, 258 - swapper_pg_dir + KERNEL_PGD_INDEX_START, 259 - KERNEL_PGD_PTRS); 187 + memcpy(pgd + KERNEL_PGD_INDEX_START, 188 + swapper_pg_dir + KERNEL_PGD_INDEX_START, 189 + KERNEL_PGD_PTRS * sizeof(pgd_t)); 260 190 261 191 pgd_list_add(pgd); 262 192 spin_unlock_irqrestore(&pgd_lock, flags); ··· 290 220 291 221 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) 292 222 { 293 - gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP; 223 + gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO; 294 224 struct page *p; 225 + #if L2_USER_PGTABLE_ORDER > 0 226 + int i; 227 + #endif 295 228 296 229 #ifdef CONFIG_HIGHPTE 297 230 flags |= __GFP_HIGHMEM; ··· 303 230 p = alloc_pages(flags, L2_USER_PGTABLE_ORDER); 304 231 if (p == NULL) 305 232 return NULL; 233 + 234 + #if L2_USER_PGTABLE_ORDER > 0 235 + /* 236 + * Make every page have a page_count() of one, not just the first. 237 + * We don't use __GFP_COMP since it doesn't look like it works 238 + * correctly with tlb_remove_page(). 
239 + */ 240 + for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) { 241 + init_page_count(p+i); 242 + inc_zone_page_state(p+i, NR_PAGETABLE); 243 + } 244 + #endif 306 245 307 246 pgtable_page_ctor(p); 308 247 return p; ··· 327 242 */ 328 243 void pte_free(struct mm_struct *mm, struct page *p) 329 244 { 245 + int i; 246 + 330 247 pgtable_page_dtor(p); 331 - __free_pages(p, L2_USER_PGTABLE_ORDER); 248 + __free_page(p); 249 + 250 + for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) { 251 + __free_page(p+i); 252 + dec_zone_page_state(p+i, NR_PAGETABLE); 253 + } 332 254 } 333 255 334 256 void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, ··· 344 252 int i; 345 253 346 254 pgtable_page_dtor(pte); 347 - for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) 255 + tlb_remove_page(tlb, pte); 256 + 257 + for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) { 348 258 tlb_remove_page(tlb, pte + i); 259 + dec_zone_page_state(pte + i, NR_PAGETABLE); 260 + } 349 261 } 350 262 351 263 #ifndef __tilegx__ ··· 431 335 return x + y * smp_width; 432 336 } 433 337 434 - void set_pte_order(pte_t *ptep, pte_t pte, int order) 338 + /* 339 + * Convert a kernel VA to a PA and homing information. 340 + */ 341 + int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte) 435 342 { 436 - unsigned long pfn = pte_pfn(pte); 437 - struct page *page = pfn_to_page(pfn); 343 + struct page *page = virt_to_page(va); 344 + pte_t null_pte = { 0 }; 345 + 346 + *cpa = __pa(va); 347 + 348 + /* Note that this is not writing a page table, just returning a pte. */ 349 + *pte = pte_set_home(null_pte, page_home(page)); 350 + 351 + return 0; /* return non-zero if not hfh? 
*/ 352 + } 353 + EXPORT_SYMBOL(va_to_cpa_and_pte); 354 + 355 + void __set_pte(pte_t *ptep, pte_t pte) 356 + { 357 + #ifdef __tilegx__ 358 + *ptep = pte; 359 + #else 360 + # if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32 361 + # error Must write the present and migrating bits last 362 + # endif 363 + if (pte_present(pte)) { 364 + ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); 365 + barrier(); 366 + ((u32 *)ptep)[0] = (u32)(pte_val(pte)); 367 + } else { 368 + ((u32 *)ptep)[0] = (u32)(pte_val(pte)); 369 + barrier(); 370 + ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); 371 + } 372 + #endif /* __tilegx__ */ 373 + } 374 + 375 + void set_pte(pte_t *ptep, pte_t pte) 376 + { 377 + struct page *page = pfn_to_page(pte_pfn(pte)); 438 378 439 379 /* Update the home of a PTE if necessary */ 440 380 pte = pte_set_home(pte, page_home(page)); 441 381 442 - #ifdef __tilegx__ 443 - *ptep = pte; 444 - #else 445 - /* 446 - * When setting a PTE, write the high bits first, then write 447 - * the low bits. This sets the "present" bit only after the 448 - * other bits are in place. If a particular PTE update 449 - * involves transitioning from one valid PTE to another, it 450 - * may be necessary to call set_pte_order() more than once, 451 - * transitioning via a suitable intermediate state. 452 - * Note that this sequence also means that if we are transitioning 453 - * from any migrating PTE to a non-migrating one, we will not 454 - * see a half-updated PTE with the migrating bit off. 455 - */ 456 - #if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32 457 - # error Must write the present and migrating bits last 458 - #endif 459 - ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); 460 - barrier(); 461 - ((u32 *)ptep)[0] = (u32)(pte_val(pte)); 462 - #endif 382 + __set_pte(ptep, pte); 463 383 } 464 384 465 385 /* Can this mm load a PTE with cached_priority set? */