[IA64] Ensure cpu0 can access per-cpu variables in early boot code

ia64 handles per-cpu variables a little differently from other architectures
in that it maps the physical memory allocated for each cpu at a constant
virtual address (0xffffffffffff0000). This mapping is not enabled until
the architecture-specific cpu_init() function is run, which causes problems
because some generic code runs before that point. In particular, when
CONFIG_PRINTK_TIME is enabled, the boot cpu traps on the per-cpu access in
its first printk() call, so the boot fails without the kernel printing
anything to the console.
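To make the scheme concrete, here is a minimal user-space sketch of the
address arithmetic, assuming a 64KB per-cpu page; the fixed virtual base is
the constant quoted above, while the physical base and the 0x40 variable
offset are made-up example values. ar.k3 (the PER_CPU_DATA kernel register)
holds the difference between a cpu's physical per-cpu area and the virtual
base, so a physical address is recovered with one add; dereferencing the
virtual address itself faults until cpu_init() installs the TLB mapping.

	/* Standalone sketch, not kernel code. */
	#include <stdio.h>

	#define PERCPU_ADDR	0xffffffffffff0000UL	/* fixed virtual per-cpu base */

	int main(void)
	{
		unsigned long phys_base = 0x04000000UL;		/* hypothetical per-cpu area */
		unsigned long var_addr  = PERCPU_ADDR + 0x40;	/* &some_per_cpu_var */

		/* ar.k3 = physical base - virtual base, so that
		 * phys = ar.k3 + &per_cpu_var takes a single add. */
		unsigned long ar_k3 = phys_base - PERCPU_ADDR;

		printf("phys of var = 0x%lx\n", ar_k3 + var_addr);	/* 0x4000040 */
		return 0;
	}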

Fix this by allocating percpu memory for cpu0 in the kernel data section
and doing all initialization to enable percpu access in head.S before
calling any generic code.
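Concretely, cpu0's area ends up one PERCPU_PAGE_SIZE below the
__phys_per_cpu_start linker symbol. A minimal sketch of the offset
computation this implies, mirroring the contig.c and discontig.c hunks
below (the include and extern declarations are added here only so the
fragment stands alone):

	#include <asm/page.h>	/* PERCPU_PAGE_SIZE */

	extern char __per_cpu_start[];
	extern char __phys_per_cpu_start[];

	static unsigned long cpu0_percpu_offset(void)
	{
		/* The linker script reserves one per-cpu page for cpu0
		 * immediately below __phys_per_cpu_start. */
		void *cpu0_data = __phys_per_cpu_start - PERCPU_PAGE_SIZE;

		/* Adding this offset to a per-cpu variable's link address
		 * yields cpu0's instance of that variable. */
		return (unsigned long)((char *)cpu0_data - __per_cpu_start);
	}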

Other cpus must take care not to access per-cpu variables too early, but
their entire code path from start_secondary() to cpu_init() is within
arch/ia64.

Signed-off-by: Tony Luck <tony.luck@intel.com>


6 files changed, 53 insertions(+), 12 deletions(-)
arch/ia64/kernel/head.S | +25 -1
@@ -359,7 +359,31 @@
 	mov ar.rsc=0		// place RSE in enforced lazy mode
 	;;
 	loadrs			// clear the dirty partition
-	mov IA64_KR(PER_CPU_DATA)=r0	// clear physical per-CPU base
+	movl r19=__phys_per_cpu_start
+	mov r18=PERCPU_PAGE_SIZE
+	;;
+#ifndef CONFIG_SMP
+	add r19=r19,r18
+	;;
+#else
+(isAP)	br.few 2f
+	mov r20=r19
+	sub r19=r19,r18
+	;;
+	shr.u r18=r18,3
+1:
+	ld8 r21=[r20],8;;
+	st8 [r19]=r21,8
+	adds r18=-1,r18;;
+	cmp4.lt p7,p6=0,r18
+(p7)	br.cond.dptk.few 1b
+2:
+#endif
+	tpa r19=r19
+	;;
+	.pred.rel.mutex isBP,isAP
+(isBP)	mov IA64_KR(PER_CPU_DATA)=r19	// per-CPU base for cpu0
+(isAP)	mov IA64_KR(PER_CPU_DATA)=r0	// clear physical per-CPU base
 	;;
 	mov ar.bspstore=r2	// establish the new RSE stack
 	;;
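For readers who do not speak ia64 assembly, the boot-cpu (isBP) path of the
SMP branch above amounts to the following C; this is a paraphrase for
readability, not a drop-in replacement (src and dst correspond to r20 and
r19):

	u64 *src = (u64 *)__phys_per_cpu_start;
	u64 *dst = src - PERCPU_PAGE_SIZE / 8;	/* cpu0's page just below */
	long n;

	/* The ld8/st8 loop: copy the per-cpu template into cpu0's
	 * reserved page, eight bytes at a time. */
	for (n = PERCPU_PAGE_SIZE / 8; n > 0; n--)
		*dst++ = *src++;

	/*
	 * The loop leaves dst back at __phys_per_cpu_start, i.e. cpu0's
	 * page base plus PERCPU_PAGE_SIZE.  Because the virtual base is
	 * -PERCPU_PAGE_SIZE, this equals phys_base - PERCPU_ADDR, so after
	 * tpa it is exactly the value ar.k3 (PER_CPU_DATA) needs.
	 */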
arch/ia64/kernel/setup.c | +10 -8
@@ -927,15 +927,17 @@
 	if (smp_processor_id() == 0) {
 		cpu_set(0, per_cpu(cpu_sibling_map, 0));
 		cpu_set(0, cpu_core_map[0]);
+	} else {
+		/*
+		 * Set ar.k3 so that assembly code in MCA handler can compute
+		 * physical addresses of per cpu variables with a simple:
+		 * phys = ar.k3 + &per_cpu_var
+		 * and the alt-dtlb-miss handler can set per-cpu mapping into
+		 * the TLB when needed. head.S already did this for cpu0.
+		 */
+		ia64_set_kr(IA64_KR_PER_CPU_DATA,
+			    ia64_tpa(cpu_data) - (long) __per_cpu_start);
 	}
 #endif
-
-	/*
-	 * We set ar.k3 so that assembly code in MCA handler can compute
-	 * physical addresses of per cpu variables with a simple:
-	 * phys = ar.k3 + &per_cpu_var
-	 */
-	ia64_set_kr(IA64_KR_PER_CPU_DATA,
-		    ia64_tpa(cpu_data) - (long) __per_cpu_start);
 
 	get_max_cacheline_size();
arch/ia64/kernel/smpboot.c | +2 -0
@@ -467,7 +467,9 @@
 {
 	/* Early console may use I/O ports */
 	ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase));
+#ifndef CONFIG_PRINTK_TIME
 	Dprintk("start_secondary: starting CPU 0x%x\n", hard_smp_processor_id());
+#endif
 	efi_map_pal_code();
 	cpu_init();
 	preempt_disable();
arch/ia64/kernel/vmlinux.lds.S | +3 -0
@@ -215,6 +215,9 @@
 	/* Per-cpu data: */
 	percpu : { } :percpu
 	. = ALIGN(PERCPU_PAGE_SIZE);
+#ifdef CONFIG_SMP
+	. = . + PERCPU_PAGE_SIZE;	/* cpu0 per-cpu space */
+#endif
 	__phys_per_cpu_start = .;
 	.data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET)
 	{
arch/ia64/mm/contig.c | +8 -2
@@ -163,8 +163,14 @@
 	 * get_zeroed_page().
 	 */
 	if (first_time) {
+		void *cpu0_data = __phys_per_cpu_start - PERCPU_PAGE_SIZE;
+
 		first_time=0;
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+
+		__per_cpu_offset[0] = (char *) cpu0_data - __per_cpu_start;
+		per_cpu(local_per_cpu_offset, 0) = __per_cpu_offset[0];
+
+		for (cpu = 1; cpu < NR_CPUS; cpu++) {
 			memcpy(cpu_data, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start);
 			__per_cpu_offset[cpu] = (char *) cpu_data - __per_cpu_start;
 			cpu_data += PERCPU_PAGE_SIZE;
@@ -183,7 +189,7 @@
 static inline void
 alloc_per_cpu_data(void)
 {
-	cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS,
+	cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS-1,
 				   PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 }
 #else
arch/ia64/mm/discontig.c | +5 -1
@@ -143,7 +143,11 @@
 	int cpu;
 
 	for_each_possible_early_cpu(cpu) {
-		if (node == node_cpuid[cpu].nid) {
+		if (cpu == 0) {
+			void *cpu0_data = __phys_per_cpu_start - PERCPU_PAGE_SIZE;
+			__per_cpu_offset[cpu] = (char*)cpu0_data -
+				__per_cpu_start;
+		} else if (node == node_cpuid[cpu].nid) {
 			memcpy(__va(cpu_data), __phys_per_cpu_start,
 			       __per_cpu_end - __per_cpu_start);
 			__per_cpu_offset[cpu] = (char*)__va(cpu_data) -