Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc: Optimise per cpu accesses on 64bit

Now that we dynamically allocate the paca array, it takes an extra load
whenever we want to access another cpu's paca. One place we do that a lot
is per cpu variables. A simple example:

DEFINE_PER_CPU(unsigned long, vara);
unsigned long test4(int cpu)
{
return per_cpu(vara, cpu);
}

This takes 4 loads, 5 if you include the actual load of the per cpu variable:

ld r11,-32760(r30) # load address of paca pointer
ld r9,-32768(r30) # load link address of percpu variable
sldi r3,r29,9 # get offset into paca (each entry is 512 bytes)
ld r0,0(r11) # load paca pointer
add r3,r0,r3 # paca + offset
ld r11,64(r3) # load paca[cpu].data_offset

ldx r3,r9,r11 # load per cpu variable

If we remove the ppc64 specific per_cpu_offset(), we get the generic one
which indexes into a statically allocated array. This removes one load and
one add:

ld r11,-32760(r30) # load address of __per_cpu_offset
ld r9,-32768(r30) # load link address of percpu variable
sldi r3,r29,3 # get offset into __per_cpu_offset (each entry 8 bytes)
ldx r11,r11,r3 # load __per_cpu_offset[cpu]

ldx r3,r9,r11 # load per cpu variable

Having all the offsets in one array also helps when iterating over a per cpu
variable across a number of cpus, such as in the scheduler. Before we would
need to load one paca cacheline when calculating each per cpu offset. Now we
have 16 (128 / sizeof(long)) per cpu offsets in each cacheline.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Authored by Anton Blanchard; committed by Benjamin Herrenschmidt.
ae01f84b 51c7fdba

+7 -6
-3
arch/powerpc/include/asm/percpu.h
··· 1 1 #ifndef _ASM_POWERPC_PERCPU_H_ 2 2 #define _ASM_POWERPC_PERCPU_H_ 3 3 #ifdef __powerpc64__ 4 - #include <linux/compiler.h> 5 4 6 5 /* 7 6 * Same as asm-generic/percpu.h, except that we store the per cpu offset ··· 11 12 12 13 #include <asm/paca.h> 13 14 14 - #define __per_cpu_offset(cpu) (paca[cpu].data_offset) 15 15 #define __my_cpu_offset local_paca->data_offset 16 - #define per_cpu_offset(x) (__per_cpu_offset(x)) 17 16 18 17 #endif /* CONFIG_SMP */ 19 18 #endif /* __powerpc64__ */
-1
arch/powerpc/kernel/asm-offsets.c
··· 194 194 DEFINE(PACA_STARTSPURR, offsetof(struct paca_struct, startspurr)); 195 195 DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time)); 196 196 DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time)); 197 - DEFINE(PACA_DATA_OFFSET, offsetof(struct paca_struct, data_offset)); 198 197 DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save)); 199 198 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER 200 199 DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
+7 -2
arch/powerpc/kernel/setup_64.c
··· 600 600 return REMOTE_DISTANCE; 601 601 } 602 602 603 + unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 604 + EXPORT_SYMBOL(__per_cpu_offset); 605 + 603 606 void __init setup_per_cpu_areas(void) 604 607 { 605 608 const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; ··· 627 624 panic("cannot initialize percpu area (err=%d)", rc); 628 625 629 626 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 630 - for_each_possible_cpu(cpu) 631 - paca[cpu].data_offset = delta + pcpu_unit_offsets[cpu]; 627 + for_each_possible_cpu(cpu) { 628 + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; 629 + paca[cpu].data_offset = __per_cpu_offset[cpu]; 630 + } 632 631 } 633 632 #endif 634 633