Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc: Update kernel VSID range

This patch changes the kernel VSID range so that we limit VSID_BITS to 37.
This enables us to support 64TB with 65 bit VA (37+28). Without this patch
we have boot hangs on platforms that only support 65 bit VA.

With this patch, the proto-VSID is now generated as described below:

We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated
from mmu context id and effective segment id of the address.

For user processes, the max context id is limited to ((1ul << 19) - 5).
For kernel space, we use the top 4 context ids to map addresses as below:
0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ]
0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ]
0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ]
0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ]

Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Tested-by: Geoff Levand <geoff@infradead.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: <stable@vger.kernel.org> [v3.8]

authored by

Aneesh Kumar K.V and committed by
Benjamin Herrenschmidt
c60ac569 e39d1a47

+126 -106
+59 -56
arch/powerpc/include/asm/mmu-hash64.h
··· 343 343 /* 344 344 * VSID allocation (256MB segment) 345 345 * 346 - * We first generate a 38-bit "proto-VSID". For kernel addresses this 347 - * is equal to the ESID | 1 << 37, for user addresses it is: 348 - * (context << USER_ESID_BITS) | (esid & ((1U << USER_ESID_BITS) - 1) 346 + * We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated 347 + * from mmu context id and effective segment id of the address. 349 348 * 350 - * This splits the proto-VSID into the below range 351 - * 0 - (2^(CONTEXT_BITS + USER_ESID_BITS) - 1) : User proto-VSID range 352 - * 2^(CONTEXT_BITS + USER_ESID_BITS) - 2^(VSID_BITS) : Kernel proto-VSID range 353 - * 354 - * We also have CONTEXT_BITS + USER_ESID_BITS = VSID_BITS - 1 355 - * That is, we assign half of the space to user processes and half 356 - * to the kernel. 349 + * For user processes max context id is limited to ((1ul << 19) - 5) 350 + * for kernel space, we use the top 4 context ids to map address as below 351 + * NOTE: each context only support 64TB now. 352 + * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ] 353 + * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ] 354 + * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ] 355 + * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ] 357 356 * 358 357 * The proto-VSIDs are then scrambled into real VSIDs with the 359 358 * multiplicative hash: ··· 362 363 * VSID_MULTIPLIER is prime, so in particular it is 363 364 * co-prime to VSID_MODULUS, making this a 1:1 scrambling function. 364 365 * Because the modulus is 2^n-1 we can compute it efficiently without 365 - * a divide or extra multiply (see below). 366 + * a divide or extra multiply (see below). The scramble function gives 367 + * robust scattering in the hash table (at least based on some initial 368 + * results). 366 369 * 367 - * This scheme has several advantages over older methods: 370 + * We also consider VSID 0 special. We use VSID 0 for slb entries mapping 371 + * bad address. 
This enables us to consolidate bad address handling in 372 + * hash_page. 368 373 * 369 - * - We have VSIDs allocated for every kernel address 370 - * (i.e. everything above 0xC000000000000000), except the very top 371 - * segment, which simplifies several things. 372 - * 373 - * - We allow for USER_ESID_BITS significant bits of ESID and 374 - * CONTEXT_BITS bits of context for user addresses. 375 - * i.e. 64T (46 bits) of address space for up to half a million contexts. 376 - * 377 - * - The scramble function gives robust scattering in the hash 378 - * table (at least based on some initial results). The previous 379 - * method was more susceptible to pathological cases giving excessive 380 - * hash collisions. 374 + * We also need to avoid the last segment of the last context, because that 375 + * would give a protovsid of 0x1fffffffff. That will result in a VSID 0 376 + * because of the modulo operation in vsid scramble. But the vmemmap 377 + * (which is what uses region 0xf) will never be close to 64TB in size 378 + * (it's 56 bytes per page of system memory). 381 379 */ 382 380 383 381 #define CONTEXT_BITS 19 ··· 382 386 #define USER_ESID_BITS_1T 6 383 387 384 388 /* 389 + * 256MB segment 390 + * The proto-VSID space has 2^(CONTEX_BITS + USER_ESID_BITS) - 1 segments 391 + * available for user + kernel mapping. The top 4 contexts are used for 392 + * kernel mapping. Each segment contains 2^28 bytes. Each 393 + * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts 394 + * (19 == 37 + 28 - 46). 395 + */ 396 + #define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 5) 397 + 398 + /* 385 399 * This should be computed such that protovosid * vsid_mulitplier 386 400 * doesn't overflow 64 bits. 
It should also be co-prime to vsid_modulus 387 401 */ 388 402 #define VSID_MULTIPLIER_256M ASM_CONST(12538073) /* 24-bit prime */ 389 - #define VSID_BITS_256M (CONTEXT_BITS + USER_ESID_BITS + 1) 403 + #define VSID_BITS_256M (CONTEXT_BITS + USER_ESID_BITS) 390 404 #define VSID_MODULUS_256M ((1UL<<VSID_BITS_256M)-1) 391 405 392 406 #define VSID_MULTIPLIER_1T ASM_CONST(12538073) /* 24-bit prime */ 393 - #define VSID_BITS_1T (CONTEXT_BITS + USER_ESID_BITS_1T + 1) 407 + #define VSID_BITS_1T (CONTEXT_BITS + USER_ESID_BITS_1T) 394 408 #define VSID_MODULUS_1T ((1UL<<VSID_BITS_1T)-1) 395 409 396 410 ··· 428 422 srdi rx,rt,VSID_BITS_##size; \ 429 423 clrldi rt,rt,(64-VSID_BITS_##size); \ 430 424 add rt,rt,rx; /* add high and low bits */ \ 431 - /* Now, r3 == VSID (mod 2^36-1), and lies between 0 and \ 425 + /* NOTE: explanation based on VSID_BITS_##size = 36 \ 426 + * Now, r3 == VSID (mod 2^36-1), and lies between 0 and \ 432 427 * 2^36-1+2^28-1. That in particular means that if r3 >= \ 433 428 * 2^36-1, then r3+1 has the 2^36 bit set. So, if r3+1 has \ 434 429 * the bit clear, r3 already has the answer we want, if it \ ··· 521 514 }) 522 515 #endif /* 1 */ 523 516 524 - /* 525 - * This is only valid for addresses >= PAGE_OFFSET 526 - * The proto-VSID space is divided into two class 527 - * User: 0 to 2^(CONTEXT_BITS + USER_ESID_BITS) -1 528 - * kernel: 2^(CONTEXT_BITS + USER_ESID_BITS) to 2^(VSID_BITS) - 1 529 - * 530 - * With KERNEL_START at 0xc000000000000000, the proto vsid for 531 - * the kernel ends up with 0xc00000000 (36 bits). With 64TB 532 - * support we need to have kernel proto-VSID in the 533 - * [2^37 to 2^38 - 1] range due to the increased USER_ESID_BITS. 
534 - */ 535 - static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) 536 - { 537 - unsigned long proto_vsid; 538 - /* 539 - * We need to make sure proto_vsid for the kernel is 540 - * >= 2^(CONTEXT_BITS + USER_ESID_BITS[_1T]) 541 - */ 542 - if (ssize == MMU_SEGSIZE_256M) { 543 - proto_vsid = ea >> SID_SHIFT; 544 - proto_vsid |= (1UL << (CONTEXT_BITS + USER_ESID_BITS)); 545 - return vsid_scramble(proto_vsid, 256M); 546 - } 547 - proto_vsid = ea >> SID_SHIFT_1T; 548 - proto_vsid |= (1UL << (CONTEXT_BITS + USER_ESID_BITS_1T)); 549 - return vsid_scramble(proto_vsid, 1T); 550 - } 551 - 552 517 /* Returns the segment size indicator for a user address */ 553 518 static inline int user_segment_size(unsigned long addr) 554 519 { ··· 530 551 return MMU_SEGSIZE_256M; 531 552 } 532 553 533 - /* This is only valid for user addresses (which are below 2^44) */ 534 554 static inline unsigned long get_vsid(unsigned long context, unsigned long ea, 535 555 int ssize) 536 556 { 557 + /* 558 + * Bad address. 
We return VSID 0 for that 559 + */ 560 + if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) 561 + return 0; 562 + 537 563 if (ssize == MMU_SEGSIZE_256M) 538 564 return vsid_scramble((context << USER_ESID_BITS) 539 565 | (ea >> SID_SHIFT), 256M); ··· 546 562 | (ea >> SID_SHIFT_1T), 1T); 547 563 } 548 564 565 + /* 566 + * This is only valid for addresses >= PAGE_OFFSET 567 + * 568 + * For kernel space, we use the top 4 context ids to map address as below 569 + * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ] 570 + * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ] 571 + * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ] 572 + * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ] 573 + */ 574 + static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) 575 + { 576 + unsigned long context; 577 + 578 + /* 579 + * kernel take the top 4 context from the available range 580 + */ 581 + context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1; 582 + return get_vsid(context, ea, ssize); 583 + } 549 584 #endif /* __ASSEMBLY__ */ 550 585 551 586 #endif /* _ASM_POWERPC_MMU_HASH64_H_ */
+25 -9
arch/powerpc/kernel/exceptions-64s.S
··· 1452 1452 _GLOBAL(do_stab_bolted) 1453 1453 stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ 1454 1454 std r11,PACA_EXSLB+EX_SRR0(r13) /* save SRR0 in exc. frame */ 1455 + mfspr r11,SPRN_DAR /* ea */ 1455 1456 1457 + /* 1458 + * check for bad kernel/user address 1459 + * (ea & ~REGION_MASK) >= PGTABLE_RANGE 1460 + */ 1461 + rldicr. r9,r11,4,(63 - 46 - 4) 1462 + li r9,0 /* VSID = 0 for bad address */ 1463 + bne- 0f 1464 + 1465 + /* 1466 + * Calculate VSID: 1467 + * This is the kernel vsid, we take the top for context from 1468 + * the range. context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 1469 + * Here we know that (ea >> 60) == 0xc 1470 + */ 1471 + lis r9,(MAX_USER_CONTEXT + 1)@ha 1472 + addi r9,r9,(MAX_USER_CONTEXT + 1)@l 1473 + 1474 + srdi r10,r11,SID_SHIFT 1475 + rldimi r10,r9,USER_ESID_BITS,0 /* proto vsid */ 1476 + ASM_VSID_SCRAMBLE(r10, r9, 256M) 1477 + rldic r9,r10,12,16 /* r9 = vsid << 12 */ 1478 + 1479 + 0: 1456 1480 /* Hash to the primary group */ 1457 1481 ld r10,PACASTABVIRT(r13) 1458 - mfspr r11,SPRN_DAR 1459 - srdi r11,r11,28 1482 + srdi r11,r11,SID_SHIFT 1460 1483 rldimi r10,r11,7,52 /* r10 = first ste of the group */ 1461 - 1462 - /* Calculate VSID */ 1463 - /* This is a kernel address, so protovsid = ESID | 1 << 37 */ 1464 - li r9,0x1 1465 - rldimi r11,r9,(CONTEXT_BITS + USER_ESID_BITS),0 1466 - ASM_VSID_SCRAMBLE(r11, r9, 256M) 1467 - rldic r9,r11,12,16 /* r9 = vsid << 12 */ 1468 1484 1469 1485 /* Search the primary group for a free entry */ 1470 1486 1: ld r11,0(r10) /* Test valid bit of the current ste */
+15 -5
arch/powerpc/mm/hash_utils_64.c
··· 195 195 unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); 196 196 unsigned long tprot = prot; 197 197 198 + /* 199 + * If we hit a bad address return error. 200 + */ 201 + if (!vsid) 202 + return -1; 198 203 /* Make kernel text executable */ 199 204 if (overlaps_kernel_text(vaddr, vaddr + step)) 200 205 tprot &= ~HPTE_R_N; ··· 929 924 DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", 930 925 ea, access, trap); 931 926 932 - if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) { 933 - DBG_LOW(" out of pgtable range !\n"); 934 - return 1; 935 - } 936 - 937 927 /* Get region & vsid */ 938 928 switch (REGION_ID(ea)) { 939 929 case USER_REGION_ID: ··· 959 959 } 960 960 DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); 961 961 962 + /* Bad address. */ 963 + if (!vsid) { 964 + DBG_LOW("Bad address!\n"); 965 + return 1; 966 + } 962 967 /* Get pgdir */ 963 968 pgdir = mm->pgd; 964 969 if (pgdir == NULL) ··· 1133 1128 /* Get VSID */ 1134 1129 ssize = user_segment_size(ea); 1135 1130 vsid = get_vsid(mm->context.id, ea, ssize); 1131 + if (!vsid) 1132 + return; 1136 1133 1137 1134 /* Hash doesn't like irqs */ 1138 1135 local_irq_save(flags); ··· 1242 1235 hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); 1243 1236 hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); 1244 1237 1238 + /* Don't create HPTE entries for bad address */ 1239 + if (!vsid) 1240 + return; 1245 1241 ret = ppc_md.hpte_insert(hpteg, vpn, __pa(vaddr), 1246 1242 mode, HPTE_V_BOLTED, 1247 1243 mmu_linear_psize, mmu_kernel_ssize);
+1 -10
arch/powerpc/mm/mmu_context_hash64.c
··· 29 29 static DEFINE_SPINLOCK(mmu_context_lock); 30 30 static DEFINE_IDA(mmu_context_ida); 31 31 32 - /* 33 - * 256MB segment 34 - * The proto-VSID space has 2^(CONTEX_BITS + USER_ESID_BITS) - 1 segments 35 - * available for user mappings. Each segment contains 2^28 bytes. Each 36 - * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts 37 - * (19 == 37 + 28 - 46). 38 - */ 39 - #define MAX_CONTEXT ((1UL << CONTEXT_BITS) - 1) 40 - 41 32 int __init_new_context(void) 42 33 { 43 34 int index; ··· 47 56 else if (err) 48 57 return err; 49 58 50 - if (index > MAX_CONTEXT) { 59 + if (index > MAX_USER_CONTEXT) { 51 60 spin_lock(&mmu_context_lock); 52 61 ida_remove(&mmu_context_ida, index); 53 62 spin_unlock(&mmu_context_lock);
+25 -25
arch/powerpc/mm/slb_low.S
··· 31 31 * No other registers are examined or changed. 32 32 */ 33 33 _GLOBAL(slb_allocate_realmode) 34 - /* r3 = faulting address */ 34 + /* 35 + * check for bad kernel/user address 36 + * (ea & ~REGION_MASK) >= PGTABLE_RANGE 37 + */ 38 + rldicr. r9,r3,4,(63 - 46 - 4) 39 + bne- 8f 35 40 36 41 srdi r9,r3,60 /* get region */ 37 - srdi r10,r3,28 /* get esid */ 42 + srdi r10,r3,SID_SHIFT /* get esid */ 38 43 cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ 39 44 40 45 /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ ··· 61 56 */ 62 57 _GLOBAL(slb_miss_kernel_load_linear) 63 58 li r11,0 64 - li r9,0x1 65 59 /* 66 - * for 1T we shift 12 bits more. slb_finish_load_1T will do 67 - * the necessary adjustment 60 + * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 61 + * r9 = region id. 68 62 */ 69 - rldimi r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0 63 + addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha 64 + addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l 65 + 66 + 70 67 BEGIN_FTR_SECTION 71 68 b slb_finish_load 72 69 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) ··· 98 91 _GLOBAL(slb_miss_kernel_load_io) 99 92 li r11,0 100 93 6: 101 - li r9,0x1 102 94 /* 103 - * for 1T we shift 12 bits more. slb_finish_load_1T will do 104 - * the necessary adjustment 95 + * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 96 + * r9 = region id. 105 97 */ 106 - rldimi r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0 98 + addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha 99 + addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l 100 + 107 101 BEGIN_FTR_SECTION 108 102 b slb_finish_load 109 103 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) 110 104 b slb_finish_load_1T 111 105 112 - 0: /* user address: proto-VSID = context << 15 | ESID. First check 113 - * if the address is within the boundaries of the user region 114 - */ 115 - srdi. 
r9,r10,USER_ESID_BITS 116 - bne- 8f /* invalid ea bits set */ 117 - 118 - 106 + 0: 119 107 /* when using slices, we extract the psize off the slice bitmaps 120 108 * and then we need to get the sllp encoding off the mmu_psize_defs 121 109 * array. ··· 166 164 ld r9,PACACONTEXTID(r13) 167 165 BEGIN_FTR_SECTION 168 166 cmpldi r10,0x1000 169 - END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) 170 - rldimi r10,r9,USER_ESID_BITS,0 171 - BEGIN_FTR_SECTION 172 167 bge slb_finish_load_1T 173 168 END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) 174 169 b slb_finish_load 175 170 176 171 8: /* invalid EA */ 177 172 li r10,0 /* BAD_VSID */ 173 + li r9,0 /* BAD_VSID */ 178 174 li r11,SLB_VSID_USER /* flags don't much matter */ 179 175 b slb_finish_load 180 176 ··· 221 221 222 222 /* get context to calculate proto-VSID */ 223 223 ld r9,PACACONTEXTID(r13) 224 - rldimi r10,r9,USER_ESID_BITS,0 225 - 226 224 /* fall through slb_finish_load */ 227 225 228 226 #endif /* __DISABLED__ */ ··· 229 231 /* 230 232 * Finish loading of an SLB entry and return 231 233 * 232 - * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET 234 + * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET 233 235 */ 234 236 slb_finish_load: 237 + rldimi r10,r9,USER_ESID_BITS,0 235 238 ASM_VSID_SCRAMBLE(r10,r9,256M) 236 239 /* 237 240 * bits above VSID_BITS_256M need to be ignored from r10 ··· 297 298 /* 298 299 * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return. 299 300 * 300 - * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9 301 + * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9 301 302 */ 302 303 slb_finish_load_1T: 303 - srdi r10,r10,40-28 /* get 1T ESID */ 304 + srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ 305 + rldimi r10,r9,USER_ESID_BITS_1T,0 304 306 ASM_VSID_SCRAMBLE(r10,r9,1T) 305 307 /* 306 308 * bits above VSID_BITS_1T need to be ignored from r10
+1 -1
arch/powerpc/mm/tlb_hash64.c
··· 82 82 if (!is_kernel_addr(addr)) { 83 83 ssize = user_segment_size(addr); 84 84 vsid = get_vsid(mm->context.id, addr, ssize); 85 - WARN_ON(vsid == 0); 86 85 } else { 87 86 vsid = get_kernel_vsid(addr, mmu_kernel_ssize); 88 87 ssize = mmu_kernel_ssize; 89 88 } 89 + WARN_ON(vsid == 0); 90 90 vpn = hpt_vpn(addr, vsid, ssize); 91 91 rpte = __real_pte(__pte(pte), ptep); 92 92