Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'x86-boot-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 boot and percpu updates from Ingo Molnar:
"This tree contains a bootable-images documentation update plus three
slightly misplaced x86/asm percpu changes/optimizations"

* 'x86-boot-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86-64: Use RIP-relative addressing for most per-CPU accesses
x86-64: Handle PC-relative relocations on per-CPU data
x86: Convert a few more per-CPU items to read-mostly ones
x86, boot: Document intermediates more clearly

+103 -30
+12
arch/x86/boot/compressed/Makefile
··· 3 3 # 4 4 # create a compressed vmlinux image from the original vmlinux 5 5 # 6 + # vmlinuz is: 7 + # decompression code (*.o) 8 + # asm globals (piggy.S), including: 9 + # vmlinux.bin.(gz|bz2|lzma|...) 10 + # 11 + # vmlinux.bin is: 12 + # vmlinux stripped of debugging and comments 13 + # vmlinux.bin.all is: 14 + # vmlinux.bin + vmlinux.relocs 15 + # vmlinux.bin.(gz|bz2|lzma|...) is: 16 + # (see scripts/Makefile.lib size_append) 17 + # compressed vmlinux.bin.all + u32 size of vmlinux.bin.all 6 18 7 19 targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ 8 20 vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
+13 -1
arch/x86/boot/compressed/misc.c
··· 260 260 261 261 /* 262 262 * Process relocations: 32 bit relocations first then 64 bit after. 263 - * Two sets of binary relocations are added to the end of the kernel 263 + * Three sets of binary relocations are added to the end of the kernel 264 264 * before compression. Each relocation table entry is the kernel 265 265 * address of the location which needs to be updated stored as a 266 266 * 32-bit value which is sign extended to 64 bits. ··· 270 270 * kernel bits... 271 271 * 0 - zero terminator for 64 bit relocations 272 272 * 64 bit relocation repeated 273 + * 0 - zero terminator for inverse 32 bit relocations 274 + * 32 bit inverse relocation repeated 273 275 * 0 - zero terminator for 32 bit relocations 274 276 * 32 bit relocation repeated 275 277 * ··· 288 286 *(uint32_t *)ptr += delta; 289 287 } 290 288 #ifdef CONFIG_X86_64 289 + while (*--reloc) { 290 + long extended = *reloc; 291 + extended += map; 292 + 293 + ptr = (unsigned long)extended; 294 + if (ptr < min_addr || ptr > max_addr) 295 + error("inverse 32-bit relocation outside of kernel!\n"); 296 + 297 + *(int32_t *)ptr -= delta; 298 + } 291 299 for (reloc--; *reloc; reloc--) { 292 300 long extended = *reloc; 293 301 extended += map;
+45 -16
arch/x86/include/asm/percpu.h
··· 64 64 #define __percpu_prefix "" 65 65 #endif 66 66 67 - #define __percpu_arg(x) __percpu_prefix "%P" #x 67 + #define __percpu_arg(x) __percpu_prefix "%" #x 68 68 69 69 /* 70 70 * Initialized pointers to per-cpu variables needed for the boot ··· 179 179 } \ 180 180 } while (0) 181 181 182 - #define percpu_from_op(op, var, constraint) \ 182 + #define percpu_from_op(op, var) \ 183 183 ({ \ 184 184 typeof(var) pfo_ret__; \ 185 185 switch (sizeof(var)) { \ 186 186 case 1: \ 187 187 asm(op "b "__percpu_arg(1)",%0" \ 188 188 : "=q" (pfo_ret__) \ 189 - : constraint); \ 189 + : "m" (var)); \ 190 190 break; \ 191 191 case 2: \ 192 192 asm(op "w "__percpu_arg(1)",%0" \ 193 193 : "=r" (pfo_ret__) \ 194 - : constraint); \ 194 + : "m" (var)); \ 195 195 break; \ 196 196 case 4: \ 197 197 asm(op "l "__percpu_arg(1)",%0" \ 198 198 : "=r" (pfo_ret__) \ 199 - : constraint); \ 199 + : "m" (var)); \ 200 200 break; \ 201 201 case 8: \ 202 202 asm(op "q "__percpu_arg(1)",%0" \ 203 203 : "=r" (pfo_ret__) \ 204 - : constraint); \ 204 + : "m" (var)); \ 205 + break; \ 206 + default: __bad_percpu_size(); \ 207 + } \ 208 + pfo_ret__; \ 209 + }) 210 + 211 + #define percpu_stable_op(op, var) \ 212 + ({ \ 213 + typeof(var) pfo_ret__; \ 214 + switch (sizeof(var)) { \ 215 + case 1: \ 216 + asm(op "b "__percpu_arg(P1)",%0" \ 217 + : "=q" (pfo_ret__) \ 218 + : "p" (&(var))); \ 219 + break; \ 220 + case 2: \ 221 + asm(op "w "__percpu_arg(P1)",%0" \ 222 + : "=r" (pfo_ret__) \ 223 + : "p" (&(var))); \ 224 + break; \ 225 + case 4: \ 226 + asm(op "l "__percpu_arg(P1)",%0" \ 227 + : "=r" (pfo_ret__) \ 228 + : "p" (&(var))); \ 229 + break; \ 230 + case 8: \ 231 + asm(op "q "__percpu_arg(P1)",%0" \ 232 + : "=r" (pfo_ret__) \ 233 + : "p" (&(var))); \ 205 234 break; \ 206 235 default: __bad_percpu_size(); \ 207 236 } \ ··· 388 359 * per-thread variables implemented as per-cpu variables and thus 389 360 * stable for the duration of the respective task. 
390 361 */ 391 - #define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) 362 + #define this_cpu_read_stable(var) percpu_stable_op("mov", var) 392 363 393 - #define raw_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 394 - #define raw_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 395 - #define raw_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 364 + #define raw_cpu_read_1(pcp) percpu_from_op("mov", pcp) 365 + #define raw_cpu_read_2(pcp) percpu_from_op("mov", pcp) 366 + #define raw_cpu_read_4(pcp) percpu_from_op("mov", pcp) 396 367 397 368 #define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) 398 369 #define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) ··· 410 381 #define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) 411 382 #define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) 412 383 413 - #define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 414 - #define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 415 - #define this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 384 + #define this_cpu_read_1(pcp) percpu_from_op("mov", pcp) 385 + #define this_cpu_read_2(pcp) percpu_from_op("mov", pcp) 386 + #define this_cpu_read_4(pcp) percpu_from_op("mov", pcp) 416 387 #define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) 417 388 #define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) 418 389 #define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) ··· 464 435 * 32 bit must fall back to generic operations. 
465 436 */ 466 437 #ifdef CONFIG_X86_64 467 - #define raw_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 438 + #define raw_cpu_read_8(pcp) percpu_from_op("mov", pcp) 468 439 #define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 469 440 #define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val) 470 441 #define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) ··· 473 444 #define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) 474 445 #define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 475 446 476 - #define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 447 + #define this_cpu_read_8(pcp) percpu_from_op("mov", pcp) 477 448 #define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 478 449 #define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) 479 450 #define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) ··· 551 522 #include <asm-generic/percpu.h> 552 523 553 524 /* We can use this directly for local CPU (faster). */ 554 - DECLARE_PER_CPU(unsigned long, this_cpu_off); 525 + DECLARE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off); 555 526 556 527 #endif /* !__ASSEMBLY__ */ 557 528
+2 -2
arch/x86/include/asm/processor.h
··· 127 127 /* Index into per_cpu list: */ 128 128 u16 cpu_index; 129 129 u32 microcode; 130 - } __attribute__((__aligned__(SMP_CACHE_BYTES))); 130 + }; 131 131 132 132 #define X86_VENDOR_INTEL 0 133 133 #define X86_VENDOR_CYRIX 1 ··· 151 151 extern __u32 cpu_caps_set[NCAPINTS]; 152 152 153 153 #ifdef CONFIG_SMP 154 - DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 154 + DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); 155 155 #define cpu_data(cpu) per_cpu(cpu_info, cpu) 156 156 #else 157 157 #define cpu_info boot_cpu_data
+1 -1
arch/x86/kernel/setup_percpu.c
··· 30 30 #define BOOT_PERCPU_OFFSET 0 31 31 #endif 32 32 33 - DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; 33 + DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; 34 34 EXPORT_PER_CPU_SYMBOL(this_cpu_off); 35 35 36 36 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+1 -1
arch/x86/kernel/smpboot.c
··· 99 99 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); 100 100 101 101 /* Per CPU bogomips and other parameters */ 102 - DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 102 + DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); 103 103 EXPORT_PER_CPU_SYMBOL(cpu_info); 104 104 105 105 atomic_t init_deasserted;
+2
arch/x86/kernel/vmlinux.lds.S
··· 186 186 * start another segment - init. 187 187 */ 188 188 PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) 189 + ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START, 190 + "per-CPU data too large - increase CONFIG_PHYSICAL_START") 189 191 #endif 190 192 191 193 INIT_TEXT_SECTION(PAGE_SIZE)
+27 -9
arch/x86/tools/relocs.c
··· 20 20 21 21 static struct relocs relocs16; 22 22 static struct relocs relocs32; 23 + #if ELF_BITS == 64 24 + static struct relocs relocs32neg; 23 25 static struct relocs relocs64; 26 + #endif 24 27 25 28 struct section { 26 29 Elf_Shdr shdr; ··· 765 762 766 763 switch (r_type) { 767 764 case R_X86_64_NONE: 765 + /* NONE can be ignored. */ 766 + break; 767 + 768 768 case R_X86_64_PC32: 769 769 /* 770 - * NONE can be ignored and PC relative relocations don't 771 - * need to be adjusted. 770 + * PC relative relocations don't need to be adjusted unless 771 + * referencing a percpu symbol. 772 772 */ 773 + if (is_percpu_sym(sym, symname)) 774 + add_reloc(&relocs32neg, offset); 773 775 break; 774 776 775 777 case R_X86_64_32: ··· 994 986 /* Order the relocations for more efficient processing */ 995 987 sort_relocs(&relocs16); 996 988 sort_relocs(&relocs32); 989 + #if ELF_BITS == 64 990 + sort_relocs(&relocs32neg); 997 991 sort_relocs(&relocs64); 992 + #endif 998 993 999 994 /* Print the relocations */ 1000 995 if (as_text) { ··· 1018 1007 for (i = 0; i < relocs32.count; i++) 1019 1008 write_reloc(relocs32.offset[i], stdout); 1020 1009 } else { 1021 - if (ELF_BITS == 64) { 1022 - /* Print a stop */ 1023 - write_reloc(0, stdout); 1010 + #if ELF_BITS == 64 1011 + /* Print a stop */ 1012 + write_reloc(0, stdout); 1024 1013 1025 - /* Now print each relocation */ 1026 - for (i = 0; i < relocs64.count; i++) 1027 - write_reloc(relocs64.offset[i], stdout); 1028 - } 1014 + /* Now print each relocation */ 1015 + for (i = 0; i < relocs64.count; i++) 1016 + write_reloc(relocs64.offset[i], stdout); 1017 + 1018 + /* Print a stop */ 1019 + write_reloc(0, stdout); 1020 + 1021 + /* Now print each inverse 32-bit relocation */ 1022 + for (i = 0; i < relocs32neg.count; i++) 1023 + write_reloc(relocs32neg.offset[i], stdout); 1024 + #endif 1029 1025 1030 1026 /* Print a stop */ 1031 1027 write_reloc(0, stdout);