Linux kernel mirror: git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

define new percpu interface for shared data

The per cpu data section contains two types of data. One set is accessed
exclusively by the local cpu, while the other set is per cpu but also
shared by remote cpus. In the current kernel these two sets are not
clearly separated. This can cause the same cacheline to be shared
between the two sets of data, resulting in unnecessary bouncing of the
cacheline between cpus.
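
For example (a hypothetical pair of variables for illustration; not code
from this patch):

/* Both definitions land in .data.percpu, so the linker may place them
 * in the same cacheline.  A remote cpu writing remote_wakeup_pending
 * then bounces the line holding local_tick_count as well, even though
 * local_tick_count is only ever touched by its owning cpu. */
DEFINE_PER_CPU(unsigned long, local_tick_count);      /* local cpu only */
DEFINE_PER_CPU(unsigned long, remote_wakeup_pending); /* written by remote cpus */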

One way to fix the problem is to cacheline align the remotely accessed
per cpu data, both at the beginning and at the end. Because of the
padding at both ends this will likely waste some memory, and the
interface needed to achieve it is not clean.
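
A sketch of that padded approach (illustration only, not code from this
patch): wrapping the variable in a cacheline-aligned struct pads it at
both ends, because the alignment fixes the start and sizeof() rounds the
size up to a multiple of the alignment.

/* Even a 4-byte counter now occupies a full cacheline of per cpu
 * space, which is the memory wastage described above. */
struct padded_counter {
	unsigned int val;
} ____cacheline_aligned_in_smp;

DEFINE_PER_CPU(struct padded_counter, shared_counter);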

This patch:

Moves the remotely accessed per cpu data (currently marked
____cacheline_aligned_in_smp) into a separate section, in which all the
data elements are cacheline aligned. This cleanly separates the local-only
data from the remotely accessed data.
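
Converting the hypothetical example above to the new interface:

/* Local-only data keeps DEFINE_PER_CPU and stays densely packed in
 * .data.percpu. */
DEFINE_PER_CPU(unsigned long, local_tick_count);

/* Remotely accessed data moves to .data.percpu.shared_aligned, where
 * each element starts on its own cacheline. */
DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, remote_wakeup_pending);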

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: <linux-arch@vger.kernel.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Fenghua Yu, committed by Linus Torvalds
5fb7dc37 3d7e3382

+84 -53
+1 -4
arch/alpha/kernel/vmlinux.lds.S
@@ -69,10 +69,7 @@
   . = ALIGN(8);
   SECURITY_INIT
 
-  . = ALIGN(8192);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(8192)
 
   . = ALIGN(2*8192);
   __init_end = .;
+1
arch/arm/kernel/vmlinux.lds.S
@@ -66,6 +66,7 @@
     . = ALIGN(4096);
     __per_cpu_start = .;
     *(.data.percpu)
+    *(.data.percpu.shared_aligned)
     __per_cpu_end = .;
 #ifndef CONFIG_XIP_KERNEL
     __init_begin = _stext;
+1 -4
arch/cris/arch-v32/vmlinux.lds.S
@@ -91,10 +91,7 @@
   }
   SECURITY_INIT
 
-  . = ALIGN (8192);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(8192)
 
 #ifdef CONFIG_BLK_DEV_INITRD
   .init.ramfs : {
+1 -4
arch/frv/kernel/vmlinux.lds.S
@@ -57,10 +57,7 @@
   __alt_instructions_end = .;
   .altinstr_replacement : { *(.altinstr_replacement) }
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);
+1
arch/i386/kernel/vmlinux.lds.S
@@ -181,6 +181,7 @@
   .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
     __per_cpu_start = .;
     *(.data.percpu)
+    *(.data.percpu.shared_aligned)
     __per_cpu_end = .;
   }
   . = ALIGN(4096);
+1
arch/ia64/kernel/vmlinux.lds.S
@@ -206,6 +206,7 @@
   {
     __per_cpu_start = .;
     *(.data.percpu)
+    *(.data.percpu.shared_aligned)
     __per_cpu_end = .;
   }
   . = __phys_per_cpu_start + PERCPU_PAGE_SIZE;	/* ensure percpu data fits
+1 -4
arch/m32r/kernel/vmlinux.lds.S
@@ -110,10 +110,7 @@
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   /* freed after init ends here */
+1 -4
arch/mips/kernel/vmlinux.lds.S
@@ -119,10 +119,7 @@
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
 #endif
-  . = ALIGN(_PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(_PAGE_SIZE)
   . = ALIGN(_PAGE_SIZE);
   __init_end = .;
   /* freed after init ends here */
+3 -4
arch/parisc/kernel/vmlinux.lds.S
@@ -181,10 +181,9 @@
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
 #endif
-  . = ALIGN(ASM_PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+
+  PERCPU(ASM_PAGE_SIZE)
+
   . = ALIGN(ASM_PAGE_SIZE);
   __init_end = .;
   /* freed after init ends here */
+1
arch/powerpc/kernel/vmlinux.lds.S
@@ -144,6 +144,7 @@
   .data.percpu : {
     __per_cpu_start = .;
     *(.data.percpu)
+    *(.data.percpu.shared_aligned)
     __per_cpu_end = .;
   }
 
+1 -4
arch/ppc/kernel/vmlinux.lds.S
@@ -130,10 +130,7 @@
   __ftr_fixup : { *(__ftr_fixup) }
   __stop___ftr_fixup = .;
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);
+1 -4
arch/s390/kernel/vmlinux.lds.S
@@ -107,10 +107,7 @@
   . = ALIGN(2);
   __initramfs_end = .;
 #endif
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   /* freed after init ends here */
+1 -4
arch/sh/kernel/vmlinux.lds.S
@@ -60,10 +60,7 @@
   . = ALIGN(PAGE_SIZE);
   __nosave_end = .;
 
-  . = ALIGN(PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(PAGE_SIZE)
   .data.cacheline_aligned : { *(.data.cacheline_aligned) }
 
   _edata = .;			/* End of data section */
+4 -1
arch/sh64/kernel/vmlinux.lds.S
@@ -87,7 +87,10 @@
 
   . = ALIGN(PAGE_SIZE);
   __per_cpu_start = .;
-  .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) }
+  .data.percpu : C_PHYS(.data.percpu) {
+    *(.data.percpu)
+    *(.data.percpu.shared_aligned)
+  }
   __per_cpu_end = . ;
   .data.cacheline_aligned : C_PHYS(.data.cacheline_aligned) { *(.data.cacheline_aligned) }
 
+1 -4
arch/sparc/kernel/vmlinux.lds.S
@@ -65,10 +65,7 @@
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   . = ALIGN(32);
+2 -4
arch/sparc64/kernel/vmlinux.lds.S
@@ -90,10 +90,8 @@
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(PAGE_SIZE)
+
   . = ALIGN(PAGE_SIZE);
   __init_end = .;
   __bss_start = .;
+2 -4
arch/x86_64/kernel/vmlinux.lds.S
@@ -194,10 +194,8 @@
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
+
   . = ALIGN(4096);
   __init_end = .;
 
+1 -4
arch/xtensa/kernel/vmlinux.lds.S
@@ -190,10 +190,7 @@
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
 
 
   /* We need this dummy segment here */
+8
include/asm-generic/percpu.h
@@ -14,6 +14,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name \
+    ____cacheline_aligned_in_smp
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*({ \
 	extern int simple_identifier_##var(void); \
@@ -33,6 +38,9 @@
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var) per_cpu__##var
+8
include/asm-generic/vmlinux.lds.h
@@ -245,3 +245,11 @@
 	*(.initcall7.init)					\
 	*(.initcall7s.init)
 
+#define PERCPU(align)						\
+	. = ALIGN(align);					\
+	__per_cpu_start = .;					\
+	.data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {	\
+		*(.data.percpu)					\
+		*(.data.percpu.shared_aligned)			\
+	}							\
+	__per_cpu_end = .;
+5
include/asm-i386/percpu.h
@@ -54,6 +54,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name \
+    ____cacheline_aligned_in_smp
+
 /* We can use this directly for local CPU (faster). */
 DECLARE_PER_CPU(unsigned long, this_cpu_off);
 
+10
include/asm-ia64/percpu.h
@@ -29,6 +29,16 @@
 	__attribute__((__section__(".data.percpu")))		\
 	__SMALL_ADDR_AREA __typeof__(type) per_cpu__##name
 
+#ifdef CONFIG_SMP
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)			\
+	__attribute__((__section__(".data.percpu.shared_aligned")))	\
+	__SMALL_ADDR_AREA __typeof__(type) per_cpu__##name		\
+	____cacheline_aligned_in_smp
+#else
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+	DEFINE_PER_CPU(type, name)
+#endif
+
 /*
  * Pretty much a literal copy of asm-generic/percpu.h, except that percpu_modcopy() is an
  * external routine, to avoid include-hell.
+7
include/asm-powerpc/percpu.h
@@ -20,6 +20,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name \
+    ____cacheline_aligned_in_smp
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
@@ -40,6 +45,8 @@
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var) per_cpu__##var
+7
include/asm-s390/percpu.h
@@ -41,6 +41,11 @@
 	__attribute__((__section__(".data.percpu"))) \
 	__typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
+	__attribute__((__section__(".data.percpu.shared_aligned"))) \
+	__typeof__(type) per_cpu__##name \
+	____cacheline_aligned_in_smp
+
 #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
 #define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
 #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu])
@@ -59,6 +64,8 @@
 
 #define DEFINE_PER_CPU(type, name) \
 	__typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
+	DEFINE_PER_CPU(type, name)
 
 #define __get_cpu_var(var) __reloc_hide(var,0)
 #define __raw_get_cpu_var(var) __reloc_hide(var,0)
+7
include/asm-sparc64/percpu.h
@@ -18,6 +18,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name \
+    ____cacheline_aligned_in_smp
+
 register unsigned long __local_per_cpu_offset asm("g5");
 
 /* var is in discarded region: offset to particular copy we want */
@@ -38,6 +43,8 @@
 #define real_setup_per_cpu_areas()	do { } while (0)
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var))
 #define __get_cpu_var(var) per_cpu__##var
+7
include/asm-x86_64/percpu.h
@@ -20,6 +20,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name \
+    ____cacheline_internodealigned_in_smp
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*({ \
 	extern int simple_identifier_##var(void); \
@@ -46,6 +51,8 @@
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
+    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var) per_cpu__##var