[PATCH] Dynamic hugepage addresses for ppc64

Paulus, I think this is now a reasonable candidate for the post-2.6.13
queue.

Relax address restrictions for hugepages on ppc64

Presently, 64-bit applications on ppc64 may use hugepages only in the
address region from 1TB to 1.5TB. Furthermore, if hugepages are
enabled in the kernel config, that region may only be used for
hugepages, never for normal pages. This patch relaxes the
restriction, allowing any address to be used with hugepages, but with
1TB granularity. That is, if you map a hugepage anywhere in the
region 1TB-2TB, that entire 1TB area will be reserved exclusively for
hugepages for the remainder of the process's lifetime. This works
analogously to hugepages in 32-bit applications, where hugepages can
be mapped anywhere, but with 256MB (mmu segment) granularity.
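
To illustrate what this buys userspace, here is a minimal sketch (not
part of the patch): it assumes hugetlbfs mounted at /mnt/huge, 16MB
hugepages, and a 64-bit process; the file name and hint address are
invented for the example.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#define HPAGE_SIZE	(16UL * 1024 * 1024)	/* 16MB hugepages on ppc64 */

int main(void)
{
	/* a file created on a hugetlbfs mount is backed by hugepages */
	int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600);
	if (fd < 0)
		return 1;

	/* Hint an address in the 1TB-2TB range.  Previously only
	 * 1TB-1.5TB could hold hugepages; with this patch the kernel
	 * may place the mapping in any 1TB area, and that whole area
	 * then becomes hugepage-only for the process's lifetime. */
	void *p = mmap((void *)(1UL << 40), HPAGE_SIZE,
		       PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	printf("hugepage mapped at %p\n", p);
	munmap(p, HPAGE_SIZE);
	close(fd);
	return 0;
}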

This patch applies on top of the four level pagetable patch
(http://patchwork.ozlabs.org/linuxppc64/patch?id=1936).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>


---
 arch/ppc64/kernel/asm-offsets.c |    3 +-
 arch/ppc64/mm/hugetlbpage.c     |  211 +++++++++++++++++++++++-----------
 arch/ppc64/mm/slb_low.S         |   25 ++--
 include/asm-ppc64/mmu.h         |    2 +-
 include/asm-ppc64/page.h        |   29 +++--
 5 files changed, 191 insertions(+), 79 deletions(-)

--- a/arch/ppc64/kernel/asm-offsets.c
+++ b/arch/ppc64/kernel/asm-offsets.c
···
 	DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
 #ifdef CONFIG_HUGETLB_PAGE
-	DEFINE(PACAHTLBSEGS, offsetof(struct paca_struct, context.htlb_segs));
+	DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas));
+	DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas));
 #endif /* CONFIG_HUGETLB_PAGE */
 	DEFINE(PACADEFAULTDECR, offsetof(struct paca_struct, default_decr));
 	DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen));
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
···
 
 #include <linux/sysctl.h>
 
+#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
+#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
+
 /* Modelled after find_linux_pte() */
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
···
 	return 0;
 }
 
-static void flush_segments(void *parm)
+static void flush_low_segments(void *parm)
 {
-	u16 segs = (unsigned long) parm;
+	u16 areas = (unsigned long) parm;
 	unsigned long i;
 
 	asm volatile("isync" : : : "memory");
 
-	for (i = 0; i < 16; i++) {
-		if (! (segs & (1U << i)))
+	BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
+
+	for (i = 0; i < NUM_LOW_AREAS; i++) {
+		if (! (areas & (1U << i)))
 			continue;
 		asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
 	}
···
 	asm volatile("isync" : : : "memory");
 }
 
-static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
+static void flush_high_segments(void *parm)
 {
-	unsigned long start = seg << SID_SHIFT;
-	unsigned long end = (seg+1) << SID_SHIFT;
+	u16 areas = (unsigned long) parm;
+	unsigned long i, j;
+
+	asm volatile("isync" : : : "memory");
+
+	BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
+
+	for (i = 0; i < NUM_HIGH_AREAS; i++) {
+		if (! (areas & (1U << i)))
+			continue;
+		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
+			asm volatile("slbie %0"
+				     :: "r" ((i << HTLB_AREA_SHIFT) + (j << SID_SHIFT)));
+	}
+
+	asm volatile("isync" : : : "memory");
+}
+
+static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+	unsigned long start = area << SID_SHIFT;
+	unsigned long end = (area+1) << SID_SHIFT;
 	struct vm_area_struct *vma;
 
-	BUG_ON(seg >= 16);
+	BUG_ON(area >= NUM_LOW_AREAS);
 
 	/* Check no VMAs are in the region */
 	vma = find_vma(mm, start);
···
 	return 0;
 }
 
-static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
+static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+	unsigned long start = area << HTLB_AREA_SHIFT;
+	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
+	struct vm_area_struct *vma;
+
+	BUG_ON(area >= NUM_HIGH_AREAS);
+
+	/* Check no VMAs are in the region */
+	vma = find_vma(mm, start);
+	if (vma && (vma->vm_start < end))
+		return -EBUSY;
+
+	return 0;
+}
+
+static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
 	unsigned long i;
 
-	newsegs &= ~(mm->context.htlb_segs);
-	if (! newsegs)
+	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
+	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
+
+	newareas &= ~(mm->context.low_htlb_areas);
+	if (! newareas)
 		return 0; /* The segments we want are already open */
 
-	for (i = 0; i < 16; i++)
-		if ((1 << i) & newsegs)
-			if (prepare_low_seg_for_htlb(mm, i) != 0)
+	for (i = 0; i < NUM_LOW_AREAS; i++)
+		if ((1 << i) & newareas)
+			if (prepare_low_area_for_htlb(mm, i) != 0)
 				return -EBUSY;
 
-	mm->context.htlb_segs |= newsegs;
+	mm->context.low_htlb_areas |= newareas;
 
 	/* update the paca copy of the context struct */
 	get_paca()->context = mm->context;
···
 	/* the context change must make it to memory before the flush,
 	 * so that further SLB misses do the right thing. */
 	mb();
-	on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);
+	on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
+
+	return 0;
+}
+
+static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
+{
+	unsigned long i;
+
+	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
+	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
+		     != NUM_HIGH_AREAS);
+
+	newareas &= ~(mm->context.high_htlb_areas);
+	if (! newareas)
+		return 0; /* The areas we want are already open */
+
+	for (i = 0; i < NUM_HIGH_AREAS; i++)
+		if ((1 << i) & newareas)
+			if (prepare_high_area_for_htlb(mm, i) != 0)
+				return -EBUSY;
+
+	mm->context.high_htlb_areas |= newareas;
+
+	/* update the paca copy of the context struct */
+	get_paca()->context = mm->context;
+
+	/* the context change must make it to memory before the flush,
+	 * so that further SLB misses do the right thing. */
+	mb();
+	on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
 
 	return 0;
 }
 
 int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
-	if (within_hugepage_high_range(addr, len))
-		return 0;
-	else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
-		int err;
-		/* Yes, we need both tests, in case addr+len overflows
-		 * 64-bit arithmetic */
-		err = open_low_hpage_segs(current->mm,
+	int err;
+
+	if ( (addr+len) < addr )
+		return -EINVAL;
+
+	if ((addr + len) < 0x100000000UL)
+		err = open_low_hpage_areas(current->mm,
 					  LOW_ESID_MASK(addr, len));
-		if (err)
-			printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
-			       " failed (segs: 0x%04hx)\n", addr, len,
-			       LOW_ESID_MASK(addr, len));
+	else
+		err = open_high_hpage_areas(current->mm,
+					    HTLB_AREA_MASK(addr, len));
+	if (err) {
+		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
+		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
+		       addr, len,
+		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
 		return err;
 	}
 
-	return -EINVAL;
+	return 0;
 }
 
 struct page *
···
 			vma = find_vma(mm, addr);
 			continue;
 		}
-		if (touches_hugepage_high_range(addr, len)) {
-			addr = TASK_HPAGE_END;
+		if (touches_hugepage_high_range(mm, addr, len)) {
+			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
 			vma = find_vma(mm, addr);
 			continue;
 		}
···
 		if (touches_hugepage_low_range(mm, addr, len)) {
 			addr = (addr & ((~0) << SID_SHIFT)) - len;
 			goto hugepage_recheck;
-		} else if (touches_hugepage_high_range(addr, len)) {
-			addr = TASK_HPAGE_BASE - len;
+		} else if (touches_hugepage_high_range(mm, addr, len)) {
+			addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
+			goto hugepage_recheck;
 		}
 
 		/*
···
 	return -ENOMEM;
 }
 
-static unsigned long htlb_get_high_area(unsigned long len)
+static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
 {
-	unsigned long addr = TASK_HPAGE_BASE;
+	unsigned long addr = 0x100000000UL;
 	struct vm_area_struct *vma;
 
 	vma = find_vma(current->mm, addr);
-	for (vma = find_vma(current->mm, addr);
-	     addr + len <= TASK_HPAGE_END;
-	     vma = vma->vm_next) {
+	while (addr + len <= TASK_SIZE_USER64) {
 		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
-		BUG_ON(! within_hugepage_high_range(addr, len));
+
+		if (! __within_hugepage_high_range(addr, len, areamask)) {
+			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
+			vma = find_vma(current->mm, addr);
+			continue;
+		}
 
 		if (!vma || (addr + len) <= vma->vm_start)
 			return addr;
 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
-		/* Because we're in a hugepage region, this alignment
-		 * should not skip us over any VMAs */
+		/* Depending on segmask this might not be a confirmed
+		 * hugepage region, so the ALIGN could have skipped
+		 * some VMAs */
+		vma = find_vma(current->mm, addr);
 	}
 
 	return -ENOMEM;
···
 				unsigned long len, unsigned long pgoff,
 				unsigned long flags)
 {
+	int lastshift;
+	u16 areamask, curareas;
+
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
 
···
 		return -EINVAL;
 
 	if (test_thread_flag(TIF_32BIT)) {
-		int lastshift = 0;
-		u16 segmask, cursegs = current->mm->context.htlb_segs;
+		curareas = current->mm->context.low_htlb_areas;
 
 		/* First see if we can do the mapping in the existing
-		 * low hpage segments */
-		addr = htlb_get_low_area(len, cursegs);
+		 * low areas */
+		addr = htlb_get_low_area(len, curareas);
 		if (addr != -ENOMEM)
 			return addr;
 
-		for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
-		     ! lastshift; segmask >>=1) {
-			if (segmask & 1)
+		lastshift = 0;
+		for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
+		     ! lastshift; areamask >>=1) {
+			if (areamask & 1)
 				lastshift = 1;
 
-			addr = htlb_get_low_area(len, cursegs | segmask);
+			addr = htlb_get_low_area(len, curareas | areamask);
 			if ((addr != -ENOMEM)
-			    && open_low_hpage_segs(current->mm, segmask) == 0)
+			    && open_low_hpage_areas(current->mm, areamask) == 0)
 				return addr;
 		}
-		printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
-		       " enough segments\n");
-		return -ENOMEM;
 	} else {
-		return htlb_get_high_area(len);
+		curareas = current->mm->context.high_htlb_areas;
+
+		/* First see if we can do the mapping in the existing
+		 * high areas */
+		addr = htlb_get_high_area(len, curareas);
+		if (addr != -ENOMEM)
+			return addr;
+
+		lastshift = 0;
+		for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
+		     ! lastshift; areamask >>=1) {
+			if (areamask & 1)
+				lastshift = 1;
+
+			addr = htlb_get_high_area(len, curareas | areamask);
+			if ((addr != -ENOMEM)
+			    && open_high_hpage_areas(current->mm, areamask) == 0)
+				return addr;
+		}
 	}
+	printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
+	       " enough areas\n");
+	return -ENOMEM;
 }
 
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
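
The mask-widening search in hugetlb_get_unmapped_area() is easiest to
see in isolation. Below is a toy userspace model (not kernel code):
the hypothetical try_in_areas() stands in for the VMA walk done by
htlb_get_low_area()/htlb_get_high_area(), and the busy/curareas masks
are invented for the demo.

#include <stdio.h>
#include <stdint.h>

/* Toy stand-in for the VMA checks: an area can satisfy the request
 * if it is in the candidate mask and not already occupied. */
static const uint16_t busy = 0x0007;	/* areas 0-2 already in use */

static int try_in_areas(uint16_t candidates)
{
	for (int i = 15; i >= 0; i--)
		if ((candidates & (1U << i)) && !(busy & (1U << i)))
			return i;
	return -1;			/* -ENOMEM stand-in */
}

int main(void)
{
	uint16_t curareas = 0x0004;	/* areas already open for hugepages */
	uint16_t areamask;
	int lastshift = 0, area = try_in_areas(curareas);

	/* Try the already-open areas first, then widen the candidate
	 * mask one area at a time from the top of the range downward,
	 * exactly as the for(areamask ...) loops above do. */
	for (areamask = 0x8000; !lastshift && area < 0; areamask >>= 1) {
		if (areamask & 1)
			lastshift = 1;
		area = try_in_areas(curareas | areamask);
		/* the real code would now open_*_hpage_areas(areamask) */
	}
	printf("would map in area %d\n", area);	/* prints: area 15 */
	return 0;
}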
--- a/arch/ppc64/mm/slb_low.S
+++ b/arch/ppc64/mm/slb_low.S
···
 	b	9f
 
 0:	/* user address: proto-VSID = context<<15 | ESID */
-	li	r11,SLB_VSID_USER
-
 	srdi.	r9,r3,USER_ESID_BITS
 	bne-	8f			/* invalid ea bits set */
 
 #ifdef CONFIG_HUGETLB_PAGE
 BEGIN_FTR_SECTION
-	/* check against the hugepage ranges */
-	cmpldi	r3,(TASK_HPAGE_END>>SID_SHIFT)
-	bge	6f			/* >= TASK_HPAGE_END */
-	cmpldi	r3,(TASK_HPAGE_BASE>>SID_SHIFT)
-	bge	5f			/* TASK_HPAGE_BASE..TASK_HPAGE_END */
-	cmpldi	r3,16
-	bge	6f			/* 4GB..TASK_HPAGE_BASE */
+	lhz	r9,PACAHIGHHTLBAREAS(r13)
+	srdi	r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
+	srd	r9,r9,r11
+	andi.	r9,r9,1
+	bne	5f
 
-	lhz	r9,PACAHTLBSEGS(r13)
+	li	r11,SLB_VSID_USER
+
+	cmpldi	r3,16
+	bge	6f
+
+	lhz	r9,PACALOWHTLBAREAS(r13)
 	srd	r9,r9,r3
 	andi.	r9,r9,1
+
 	beq	6f
 
-5:	/* this is a hugepage user address */
-	li	r11,(SLB_VSID_USER|SLB_VSID_L)
+5:	li	r11,SLB_VSID_USER|SLB_VSID_L
 END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 #endif /* CONFIG_HUGETLB_PAGE */
 
--- a/include/asm-ppc64/mmu.h
+++ b/include/asm-ppc64/mmu.h
···
 typedef struct {
 	mm_context_id_t id;
 #ifdef CONFIG_HUGETLB_PAGE
-	u16 htlb_segs; /* bitmask */
+	u16 low_htlb_areas, high_htlb_areas;
 #endif
 } mm_context_t;
 
--- a/include/asm-ppc64/page.h
+++ b/include/asm-ppc64/page.h
···
 
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 
-/* For 64-bit processes the hugepage range is 1T-1.5T */
-#define TASK_HPAGE_BASE 	ASM_CONST(0x0000010000000000)
-#define TASK_HPAGE_END  	ASM_CONST(0x0000018000000000)
+#define HTLB_AREA_SHIFT		40
+#define HTLB_AREA_SIZE		(1UL << HTLB_AREA_SHIFT)
+#define GET_HTLB_AREA(x)	((x) >> HTLB_AREA_SHIFT)
 
 #define LOW_ESID_MASK(addr, len)	(((1U << (GET_ESID(addr+len-1)+1)) \
 					- (1U << GET_ESID(addr))) & 0xffff)
+#define HTLB_AREA_MASK(addr, len)	(((1U << (GET_HTLB_AREA(addr+len-1)+1)) \
+					- (1U << GET_HTLB_AREA(addr))) & 0xffff)
 
 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
 #define ARCH_HAS_SETCLEAR_HUGE_PTE
 
 #define touches_hugepage_low_range(mm, addr, len) \
-	(LOW_ESID_MASK((addr), (len)) & mm->context.htlb_segs)
-#define touches_hugepage_high_range(addr, len) \
-	(((addr) > (TASK_HPAGE_BASE-(len))) && ((addr) < TASK_HPAGE_END))
+	(LOW_ESID_MASK((addr), (len)) & (mm)->context.low_htlb_areas)
+#define touches_hugepage_high_range(mm, addr, len) \
+	(HTLB_AREA_MASK((addr), (len)) & (mm)->context.high_htlb_areas)
 
 #define __within_hugepage_low_range(addr, len, segmask) \
 	((LOW_ESID_MASK((addr), (len)) | (segmask)) == (segmask))
 #define within_hugepage_low_range(addr, len) \
 	__within_hugepage_low_range((addr), (len), \
-				    current->mm->context.htlb_segs)
-#define within_hugepage_high_range(addr, len) (((addr) >= TASK_HPAGE_BASE) \
-	  && ((addr)+(len) <= TASK_HPAGE_END) && ((addr)+(len) >= (addr)))
+				    current->mm->context.low_htlb_areas)
+#define __within_hugepage_high_range(addr, len, zonemask) \
+	((HTLB_AREA_MASK((addr), (len)) | (zonemask)) == (zonemask))
+#define within_hugepage_high_range(addr, len) \
+	__within_hugepage_high_range((addr), (len), \
+				    current->mm->context.high_htlb_areas)
 
 #define is_hugepage_only_range(mm, addr, len) \
-	(touches_hugepage_high_range((addr), (len)) || \
+	(touches_hugepage_high_range((mm), (addr), (len)) || \
 	 touches_hugepage_low_range((mm), (addr), (len)))
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 
 #define in_hugepage_area(context, addr) \
 	(cpu_has_feature(CPU_FTR_16M_PAGE) && \
-	 ( (((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \
+	 ( ((1 << GET_HTLB_AREA(addr)) & (context).high_htlb_areas) || \
 	   ( ((addr) < 0x100000000L) && \
-	     ((1 << GET_ESID(addr)) & (context).htlb_segs) ) ) )
+	     ((1 << GET_ESID(addr)) & (context).low_htlb_areas) ) ) )
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
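
To make the mask arithmetic concrete, here is a userspace rendition
of the two mask macros (SID_SHIFT taken as 28 for 256MB segments, and
GET_ESID reduced to a plain shift, valid only below 4GB; both
simplifications are mine, not the kernel's definitions):

#include <stdio.h>

#define SID_SHIFT		28	/* 256MB mmu segments */
#define HTLB_AREA_SHIFT		40	/* 1TB areas */
/* demo-only reductions; the kernel's GET_ESID also masks with SID_MASK */
#define GET_ESID(x)		((x) >> SID_SHIFT)
#define GET_HTLB_AREA(x)	((x) >> HTLB_AREA_SHIFT)

#define LOW_ESID_MASK(addr, len)	(((1U << (GET_ESID(addr+len-1)+1)) \
					- (1U << GET_ESID(addr))) & 0xffff)
#define HTLB_AREA_MASK(addr, len)	(((1U << (GET_HTLB_AREA(addr+len-1)+1)) \
					- (1U << GET_HTLB_AREA(addr))) & 0xffff)

int main(void)
{
	unsigned long low_addr  = 0x10000000UL;	/* 256MB */
	unsigned long low_len   = 0x20000000UL;	/* 512MB: segments 1 and 2 */
	unsigned long high_addr = 1UL << 40;	/* 1TB */
	unsigned long high_len  = 16UL << 20;	/* 16MB: 1TB area 1 only */

	/* prints 0x0006: bits 1 and 2, one per touched 256MB segment */
	printf("LOW_ESID_MASK  = 0x%04x\n", LOW_ESID_MASK(low_addr, low_len));
	/* prints 0x0002: bit 1, the single touched 1TB area */
	printf("HTLB_AREA_MASK = 0x%04x\n", HTLB_AREA_MASK(high_addr, high_len));
	return 0;
}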