[IA64] Ensure cpu0 can access per-cpu variables in early boot code

ia64 handles per-cpu variables a little differently from other architectures
in that it maps the physical memory allocated for each cpu at a constant
virtual address (0xffffffffffff0000). This mapping is not enabled until
the architecture-specific cpu_init() function runs, which causes problems
since some generic code runs before that point. In particular, when
CONFIG_PRINTK_TIME is enabled, the boot cpu traps on its access to
per-cpu memory at the first printk() call, so the boot fails without
the kernel printing anything to the console.
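To make the failure mode concrete, here is a small user-space model of the
addressing scheme (a sketch; the addresses in percpu_phys_base and the
variable offset are stand-ins, not the kernel's values). Every cpu resolves
a per-cpu variable through the same fixed virtual address, and only the
pinned TLB entry installed by cpu_init() steers that address to the cpu's
own physical page; before that entry exists, any dereference of the fixed
address faults:

/* percpu_model.c - illustrative only; assumes 64-bit unsigned long */
#include <stdio.h>

#define PERCPU_ADDR      0xffffffffffff0000UL  /* fixed virtual base */
#define PERCPU_PAGE_SIZE 0x10000UL             /* one 64KB page per cpu */

/* stand-in physical pages for two cpus (hypothetical values) */
static const unsigned long percpu_phys_base[2] = { 0x4000000UL, 0x4010000UL };

int main(void)
{
	unsigned long off = 0x40;	/* offset of some per-cpu variable */
	int cpu;

	for (cpu = 0; cpu < 2; cpu++)
		printf("cpu%d: virt %#lx -> phys %#lx\n", cpu,
		       PERCPU_ADDR + off,		/* same on every cpu */
		       percpu_phys_base[cpu] + off);	/* picked by the TLB entry */
	return 0;
}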

Fix this by allocating percpu memory for cpu0 in the kernel data section
and doing all initialization to enable percpu access in head.S before
calling any generic code.
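The layout trick can be sketched in a few lines (a model with a stand-in
load address; PERCPU_ADDR is the real link address quoted above, and in the
patch the reservation is done by vmlinux.lds.S while the offset computation
lands in contig.c/discontig.c). cpu0's page is carved out of the kernel data
image immediately below __phys_per_cpu_start, so it is mapped and
initialized before any generic code runs:

/* cpu0_layout.c - illustrative only; assumes 64-bit unsigned long */
#include <stdio.h>

#define PERCPU_PAGE_SIZE 0x10000UL
#define PERCPU_ADDR      0xffffffffffff0000UL	/* __per_cpu_start link address */

int main(void)
{
	unsigned long phys_per_cpu_start = 0x900000UL;	/* stand-in __phys_per_cpu_start */

	/* the page vmlinux.lds.S reserves just below __phys_per_cpu_start */
	unsigned long cpu0_data = phys_per_cpu_start - PERCPU_PAGE_SIZE;

	/* same formula the patch adds to contig.c and discontig.c */
	unsigned long offset0 = cpu0_data - PERCPU_ADDR;

	/* wraps around: == cpu0_data + PERCPU_PAGE_SIZE */
	printf("__per_cpu_offset[0] = %#lx\n", offset0);
	return 0;
}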

Other cpus must take care not to access per-cpu variables too early, but
their code path from start_secondary() to cpu_init() is entirely within
arch/ia64, so the ordering can be enforced there.

Signed-off-by: Tony Luck <tony.luck@intel.com>


---
 arch/ia64/kernel/head.S        | 26 +++++++++++++++++++++++++-
 arch/ia64/kernel/setup.c       | 18 ++++++++++--------
 arch/ia64/kernel/smpboot.c     |  2 ++
 arch/ia64/kernel/vmlinux.lds.S |  3 +++
 arch/ia64/mm/contig.c          | 10 ++++++++--
 arch/ia64/mm/discontig.c       |  6 +++++-
 6 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
--- a/arch/ia64/kernel/head.S
+++ b/arch/ia64/kernel/head.S
@@ -359,7 +359,31 @@
 	mov ar.rsc=0		// place RSE in enforced lazy mode
 	;;
 	loadrs			// clear the dirty partition
-	mov IA64_KR(PER_CPU_DATA)=r0	// clear physical per-CPU base
+	movl r19=__phys_per_cpu_start
+	mov r18=PERCPU_PAGE_SIZE
+	;;
+#ifndef CONFIG_SMP
+	add r19=r19,r18
+	;;
+#else
+(isAP)	br.few 2f
+	mov r20=r19
+	sub r19=r19,r18
+	;;
+	shr.u r18=r18,3
+1:
+	ld8 r21=[r20],8;;
+	st8 [r19]=r21,8
+	adds r18=-1,r18;;
+	cmp4.lt p7,p6=0,r18
+(p7)	br.cond.dptk.few 1b
+2:
+#endif
+	tpa r19=r19
+	;;
+	.pred.rel.mutex isBP,isAP
+(isBP)	mov IA64_KR(PER_CPU_DATA)=r19	// per-CPU base for cpu0
+(isAP)	mov IA64_KR(PER_CPU_DATA)=r0	// clear physical per-CPU base
 	;;
 	mov ar.bspstore=r2	// establish the new RSE stack
 	;;
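One subtlety in the hunk above is worth spelling out: after the copy loop,
the post-incremented r19 has walked all the way back up to
__phys_per_cpu_start, and that is exactly the value ar.k3 needs. Because
the per-cpu section is linked at the top PERCPU_PAGE_SIZE of the address
space, "physical base of cpu0's page minus __per_cpu_start" wraps around to
"physical base plus PERCPU_PAGE_SIZE". A stand-alone check of the
arithmetic (stand-in physical address, real PERCPU_ADDR, assuming 64-bit
unsigned long):

/* k3_check.c - arithmetic check only, not kernel code */
#include <stdio.h>

#define PERCPU_PAGE_SIZE 0x10000UL
#define PERCPU_ADDR      0xffffffffffff0000UL	/* __per_cpu_start link address */

int main(void)
{
	unsigned long phys_per_cpu_start = 0x900000UL;	/* stand-in */
	unsigned long cpu0_page = phys_per_cpu_start - PERCPU_PAGE_SIZE;

	/* ar.k3 must satisfy: phys = ar.k3 + &per_cpu_var (mod 2^64) */
	unsigned long k3 = cpu0_page - PERCPU_ADDR;

	printf("ar.k3            = %#lx\n", k3);		   /* == phys_per_cpu_start */
	printf("k3 + PERCPU_ADDR = %#lx\n", k3 + PERCPU_ADDR); /* == cpu0_page */
	return 0;
}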
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -927,16 +927,18 @@
 	if (smp_processor_id() == 0) {
 		cpu_set(0, per_cpu(cpu_sibling_map, 0));
 		cpu_set(0, cpu_core_map[0]);
+	} else {
+		/*
+		 * Set ar.k3 so that assembly code in MCA handler can compute
+		 * physical addresses of per cpu variables with a simple:
+		 *	phys = ar.k3 + &per_cpu_var
+		 * and the alt-dtlb-miss handler can set per-cpu mapping into
+		 * the TLB when needed. head.S already did this for cpu0.
+		 */
+		ia64_set_kr(IA64_KR_PER_CPU_DATA,
+			    ia64_tpa(cpu_data) - (long) __per_cpu_start);
 	}
 #endif
-
-	/*
-	 * We set ar.k3 so that assembly code in MCA handler can compute
-	 * physical addresses of per cpu variables with a simple:
-	 *	phys = ar.k3 + &per_cpu_var
-	 */
-	ia64_set_kr(IA64_KR_PER_CPU_DATA,
-		    ia64_tpa(cpu_data) - (long) __per_cpu_start);
 
 	get_max_cacheline_size();
 
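The same identity, seen from the consumer side that the new comment
describes: once cpu_init() has set ar.k3 = ia64_tpa(cpu_data) -
__per_cpu_start for an application processor, a handler gets the physical
address of any per-cpu variable with a single add. A stand-in calculation
(hypothetical addresses, assuming 64-bit unsigned long):

/* mca_addr.c - illustrative arithmetic only */
#include <stdio.h>

#define PERCPU_ADDR 0xffffffffffff0000UL	/* where per-cpu vars are linked */

int main(void)
{
	unsigned long tpa_cpu_data = 0x4010000UL;	  /* stand-in tpa(cpu_data) */
	unsigned long per_cpu_var  = PERCPU_ADDR + 0x120; /* &some_per_cpu_var */

	unsigned long k3   = tpa_cpu_data - PERCPU_ADDR;  /* what cpu_init() sets */
	unsigned long phys = k3 + per_cpu_var;		  /* what the handler computes */

	printf("phys = %#lx\n", phys);	/* == tpa_cpu_data + 0x120 */
	return 0;
}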
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -467,7 +467,9 @@
 {
 	/* Early console may use I/O ports */
 	ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase));
+#ifndef CONFIG_PRINTK_TIME
 	Dprintk("start_secondary: starting CPU 0x%x\n", hard_smp_processor_id());
+#endif
 	efi_map_pal_code();
 	cpu_init();
 	preempt_disable();
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -215,6 +215,9 @@
 	/* Per-cpu data: */
 	percpu : { } :percpu
 	. = ALIGN(PERCPU_PAGE_SIZE);
+#ifdef CONFIG_SMP
+	. = . + PERCPU_PAGE_SIZE;	/* cpu0 per-cpu space */
+#endif
 	__phys_per_cpu_start = .;
 	.data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET)
 	{
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -163,8 +163,14 @@
 	 * get_zeroed_page().
 	 */
 	if (first_time) {
+		void *cpu0_data = __phys_per_cpu_start - PERCPU_PAGE_SIZE;
+
 		first_time=0;
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+
+		__per_cpu_offset[0] = (char *) cpu0_data - __per_cpu_start;
+		per_cpu(local_per_cpu_offset, 0) = __per_cpu_offset[0];
+
+		for (cpu = 1; cpu < NR_CPUS; cpu++) {
 			memcpy(cpu_data, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start);
 			__per_cpu_offset[cpu] = (char *) cpu_data - __per_cpu_start;
 			cpu_data += PERCPU_PAGE_SIZE;
@@ -177,7 +183,7 @@
 static inline void
 alloc_per_cpu_data(void)
 {
-	cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS,
+	cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * (NR_CPUS - 1),
 				   PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 }
 #else
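A note on the alloc_per_cpu_data() change: with cpu0 served from the kernel
image, the bootmem block only needs NR_CPUS - 1 pages, and the parentheses
matter. Written as PERCPU_PAGE_SIZE * NR_CPUS-1, the expression would shave
one byte off NR_CPUS pages instead of dropping a whole page. A quick check
(stand-in NR_CPUS):

/* alloc_size.c - operator-precedence check */
#include <stdio.h>

#define PERCPU_PAGE_SIZE 0x10000UL
#define NR_CPUS 4

int main(void)
{
	printf("PERCPU_PAGE_SIZE * NR_CPUS-1   = %#lx\n",
	       PERCPU_PAGE_SIZE * NR_CPUS - 1);	  /* 0x3ffff: one byte short */
	printf("PERCPU_PAGE_SIZE * (NR_CPUS-1) = %#lx\n",
	       PERCPU_PAGE_SIZE * (NR_CPUS - 1)); /* 0x30000: one page fewer */
	return 0;
}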
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -143,7 +143,11 @@
 	int cpu;
 
 	for_each_possible_early_cpu(cpu) {
-		if (node == node_cpuid[cpu].nid) {
+		if (cpu == 0) {
+			void *cpu0_data = __phys_per_cpu_start - PERCPU_PAGE_SIZE;
+			__per_cpu_offset[cpu] = (char*)cpu0_data -
+				__per_cpu_start;
+		} else if (node == node_cpuid[cpu].nid) {
 			memcpy(__va(cpu_data), __phys_per_cpu_start,
 				__per_cpu_end - __per_cpu_start);
 			__per_cpu_offset[cpu] = (char*)__va(cpu_data) -