Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/64: enhance memcmp() with VMX instruction for long bytes comparison

This patch adds VMX primitives to do memcmp() in case the compare size
is equal to or greater than 4K bytes. The KSM feature can benefit from this.

Test result with following test program(replace the "^>" with ""):
------
># cat tools/testing/selftests/powerpc/stringloops/memcmp.c
>#include <malloc.h>
>#include <stdlib.h>
>#include <string.h>
>#include <time.h>
>#include "utils.h"
>#define SIZE (1024 * 1024 * 900)
>#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

/*
 * Allocate two identical 128-byte-aligned buffers of SIZE bytes, then run
 * test_memcmp() over them ITERATIONS times.  Since the buffers hold the
 * same byte pattern, any non-zero return is a bug in the memcmp
 * implementation under test; report it and abort.
 *
 * Returns 0 on success (required by the selftest harness).
 */
static int testcase(void)
{
	char *s1;
	char *s2;
	unsigned long i;

	/* 128-byte alignment so the buffers start on a cache-line boundary */
	s1 = memalign(128, SIZE);
	if (!s1) {
		perror("memalign");
		exit(1);
	}

	s2 = memalign(128, SIZE);
	if (!s2) {
		perror("memalign");
		exit(1);
	}

	/* Fill both buffers with the same repeating 0x00..0xff pattern */
	for (i = 0; i < SIZE; i++) {
		s1[i] = i & 0xff;
		s2[i] = i & 0xff;
	}

	for (i = 0; i < ITERATIONS; i++) {
		int ret = test_memcmp(s1, s2, SIZE);

		if (ret) {
			/* %lu: i is unsigned long (was %ld, a specifier mismatch) */
			printf("return %d at[%lu]! should have returned zero\n",
			       ret, i);
			abort();
		}
	}

	/* Release the buffers on the success path (previously leaked) */
	free(s1);
	free(s2);

	return 0;
}

/* Entry point: run testcase() under the powerpc selftest harness. */
int main(void)
{
	int rc = test_harness(testcase, "memcmp");

	return rc;
}
------
Without this patch (but with the first patch "powerpc/64: Align bytes
before fall back to .Lshort in powerpc64 memcmp()." in the series):
4.726728762 seconds time elapsed ( +- 3.54%)
With VMX patch:
4.234335473 seconds time elapsed ( +- 2.63%)
There is ~+10% improvement.

Testing with unaligned and different-offset versions (making s1 and s2 shift
by a random offset within 16 bytes) can achieve a higher improvement than 10%.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

authored by

Simon Guo and committed by
Michael Ellerman
d58badfb f1ecbaf4

+249 -12
+2 -2
arch/powerpc/include/asm/asm-prototypes.h
··· 48 48 /* VMX copying */ 49 49 int enter_vmx_usercopy(void); 50 50 int exit_vmx_usercopy(void); 51 - int enter_vmx_copy(void); 52 - void * exit_vmx_copy(void *dest); 51 + int enter_vmx_ops(void); 52 + void *exit_vmx_ops(void *dest); 53 53 54 54 /* Traps */ 55 55 long machine_check_early(struct pt_regs *regs);
+2 -2
arch/powerpc/lib/copypage_power7.S
··· 57 57 std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) 58 58 std r0,16(r1) 59 59 stdu r1,-STACKFRAMESIZE(r1) 60 - bl enter_vmx_copy 60 + bl enter_vmx_ops 61 61 cmpwi r3,0 62 62 ld r0,STACKFRAMESIZE+16(r1) 63 63 ld r3,STK_REG(R31)(r1) ··· 100 100 addi r3,r3,128 101 101 bdnz 1b 102 102 103 - b exit_vmx_copy /* tail call optimise */ 103 + b exit_vmx_ops /* tail call optimise */ 104 104 105 105 #else 106 106 li r0,(PAGE_SIZE/128)
+240 -3
arch/powerpc/lib/memcmp_64.S
··· 9 9 */ 10 10 #include <asm/ppc_asm.h> 11 11 #include <asm/export.h> 12 + #include <asm/ppc-opcode.h> 12 13 13 14 #define off8 r6 14 15 #define off16 r7 ··· 28 27 #define LH lhbrx 29 28 #define LW lwbrx 30 29 #define LD ldbrx 30 + #define LVS lvsr 31 + #define VPERM(_VRT,_VRA,_VRB,_VRC) \ 32 + vperm _VRT,_VRB,_VRA,_VRC 31 33 #else 32 34 #define LH lhzx 33 35 #define LW lwzx 34 36 #define LD ldx 37 + #define LVS lvsl 38 + #define VPERM(_VRT,_VRA,_VRB,_VRC) \ 39 + vperm _VRT,_VRA,_VRB,_VRC 35 40 #endif 41 + 42 + #define VMX_THRESH 4096 43 + #define ENTER_VMX_OPS \ 44 + mflr r0; \ 45 + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ 46 + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ 47 + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ 48 + std r0,16(r1); \ 49 + stdu r1,-STACKFRAMESIZE(r1); \ 50 + bl enter_vmx_ops; \ 51 + cmpwi cr1,r3,0; \ 52 + ld r0,STACKFRAMESIZE+16(r1); \ 53 + ld r3,STK_REG(R31)(r1); \ 54 + ld r4,STK_REG(R30)(r1); \ 55 + ld r5,STK_REG(R29)(r1); \ 56 + addi r1,r1,STACKFRAMESIZE; \ 57 + mtlr r0 58 + 59 + #define EXIT_VMX_OPS \ 60 + mflr r0; \ 61 + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ 62 + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ 63 + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ 64 + std r0,16(r1); \ 65 + stdu r1,-STACKFRAMESIZE(r1); \ 66 + bl exit_vmx_ops; \ 67 + ld r0,STACKFRAMESIZE+16(r1); \ 68 + ld r3,STK_REG(R31)(r1); \ 69 + ld r4,STK_REG(R30)(r1); \ 70 + ld r5,STK_REG(R29)(r1); \ 71 + addi r1,r1,STACKFRAMESIZE; \ 72 + mtlr r0 73 + 74 + /* 75 + * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with 76 + * 16 bytes boundary and permute the result with the 1st 16 bytes. 77 + 78 + * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z | 79 + * ^ ^ ^ 80 + * 0xbbbb10 0xbbbb20 0xbbb30 81 + * ^ 82 + * _vaddr 83 + * 84 + * 85 + * _vmask is the mask generated by LVS 86 + * _v1st_qw is the 1st aligned QW of current addr which is already loaded. 
87 + * for example: 0xyyyyyyyyyyyyy012 for big endian 88 + * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded. 89 + * for example: 0x3456789abcdefzzz for big endian 90 + * The permute result is saved in _v_res. 91 + * for example: 0x0123456789abcdef for big endian. 92 + */ 93 + #define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \ 94 + lvx _v2nd_qw,_vaddr,off16; \ 95 + VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask) 36 96 37 97 /* 38 98 * There are 2 categories for memcmp: ··· 102 40 * 2) src/dst has different offset to the 8 bytes boundary. The handlers 103 41 * are named like .Ldiffoffset_xxxx 104 42 */ 105 - _GLOBAL(memcmp) 43 + _GLOBAL_TOC(memcmp) 106 44 cmpdi cr1,r5,0 107 45 108 46 /* Use the short loop if the src/dst addresses are not ··· 194 132 bgt cr6,.Llong 195 133 196 134 .Lcmp_lt32bytes: 197 - /* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */ 135 + /* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */ 198 136 cmpdi cr5,r5,7 199 137 srdi r0,r5,3 200 138 ble cr5,.Lcmp_rest_lt8bytes ··· 235 173 blr 236 174 237 175 .Llong: 176 + #ifdef CONFIG_ALTIVEC 177 + BEGIN_FTR_SECTION 178 + /* Try to use vmx loop if length is equal or greater than 4K */ 179 + cmpldi cr6,r5,VMX_THRESH 180 + bge cr6,.Lsameoffset_vmx_cmp 181 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 182 + 183 + .Llong_novmx_cmp: 184 + #endif 238 185 /* At least s1 addr is aligned with 8 bytes */ 239 186 li off8,8 240 187 li off16,16 ··· 401 330 li r3,-1 402 331 blr 403 332 333 + #ifdef CONFIG_ALTIVEC 334 + .Lsameoffset_vmx_cmp: 335 + /* Enter with src/dst addrs has the same offset with 8 bytes 336 + * align boundary 337 + */ 338 + ENTER_VMX_OPS 339 + beq cr1,.Llong_novmx_cmp 340 + 341 + 3: 342 + /* need to check whether r4 has the same offset with r3 343 + * for 16 bytes boundary. 344 + */ 345 + xor r0,r3,r4 346 + andi. r0,r0,0xf 347 + bne .Ldiffoffset_vmx_cmp_start 348 + 349 + /* len is no less than 4KB. Need to align with 16 bytes further. 
350 + */ 351 + andi. rA,r3,8 352 + LD rA,0,r3 353 + beq 4f 354 + LD rB,0,r4 355 + cmpld cr0,rA,rB 356 + addi r3,r3,8 357 + addi r4,r4,8 358 + addi r5,r5,-8 359 + 360 + beq cr0,4f 361 + /* save and restore cr0 */ 362 + mfocrf r5,128 363 + EXIT_VMX_OPS 364 + mtocrf 128,r5 365 + b .LcmpAB_lightweight 366 + 367 + 4: 368 + /* compare 32 bytes for each loop */ 369 + srdi r0,r5,5 370 + mtctr r0 371 + clrldi r5,r5,59 372 + li off16,16 373 + 374 + .balign 16 375 + 5: 376 + lvx v0,0,r3 377 + lvx v1,0,r4 378 + VCMPEQUD_RC(v0,v0,v1) 379 + bnl cr6,7f 380 + lvx v0,off16,r3 381 + lvx v1,off16,r4 382 + VCMPEQUD_RC(v0,v0,v1) 383 + bnl cr6,6f 384 + addi r3,r3,32 385 + addi r4,r4,32 386 + bdnz 5b 387 + 388 + EXIT_VMX_OPS 389 + cmpdi r5,0 390 + beq .Lzero 391 + b .Lcmp_lt32bytes 392 + 393 + 6: 394 + addi r3,r3,16 395 + addi r4,r4,16 396 + 397 + 7: 398 + /* diff the last 16 bytes */ 399 + EXIT_VMX_OPS 400 + LD rA,0,r3 401 + LD rB,0,r4 402 + cmpld cr0,rA,rB 403 + li off8,8 404 + bne cr0,.LcmpAB_lightweight 405 + 406 + LD rA,off8,r3 407 + LD rB,off8,r4 408 + cmpld cr0,rA,rB 409 + bne cr0,.LcmpAB_lightweight 410 + b .Lzero 411 + #endif 412 + 404 413 .Ldiffoffset_8bytes_make_align_start: 414 + #ifdef CONFIG_ALTIVEC 415 + BEGIN_FTR_SECTION 416 + /* only do vmx ops when the size equal or greater than 4K bytes */ 417 + cmpdi cr5,r5,VMX_THRESH 418 + bge cr5,.Ldiffoffset_vmx_cmp 419 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 420 + 421 + .Ldiffoffset_novmx_cmp: 422 + #endif 423 + 405 424 /* now try to align s1 with 8 bytes */ 406 425 rlwinm r6,r3,3,26,28 407 426 beq .Ldiffoffset_align_s1_8bytes ··· 517 356 /* now s1 is aligned with 8 bytes. 
*/ 518 357 cmpdi cr5,r5,31 519 358 ble cr5,.Lcmp_lt32bytes 520 - b .Llong 521 359 360 + #ifdef CONFIG_ALTIVEC 361 + b .Llong_novmx_cmp 362 + #else 363 + b .Llong 364 + #endif 365 + 366 + #ifdef CONFIG_ALTIVEC 367 + .Ldiffoffset_vmx_cmp: 368 + ENTER_VMX_OPS 369 + beq cr1,.Ldiffoffset_novmx_cmp 370 + 371 + .Ldiffoffset_vmx_cmp_start: 372 + /* Firstly try to align r3 with 16 bytes */ 373 + andi. r6,r3,0xf 374 + li off16,16 375 + beq .Ldiffoffset_vmx_s1_16bytes_align 376 + 377 + LVS v3,0,r3 378 + LVS v4,0,r4 379 + 380 + lvx v5,0,r3 381 + lvx v6,0,r4 382 + LD_VSR_CROSS16B(r3,v3,v5,v7,v9) 383 + LD_VSR_CROSS16B(r4,v4,v6,v8,v10) 384 + 385 + VCMPEQUB_RC(v7,v9,v10) 386 + bnl cr6,.Ldiffoffset_vmx_diff_found 387 + 388 + subfic r6,r6,16 389 + subf r5,r6,r5 390 + add r3,r3,r6 391 + add r4,r4,r6 392 + 393 + .Ldiffoffset_vmx_s1_16bytes_align: 394 + /* now s1 is aligned with 16 bytes */ 395 + lvx v6,0,r4 396 + LVS v4,0,r4 397 + srdi r6,r5,5 /* loop for 32 bytes each */ 398 + clrldi r5,r5,59 399 + mtctr r6 400 + 401 + .balign 16 402 + .Ldiffoffset_vmx_32bytesloop: 403 + /* the first qw of r4 was saved in v6 */ 404 + lvx v9,0,r3 405 + LD_VSR_CROSS16B(r4,v4,v6,v8,v10) 406 + VCMPEQUB_RC(v7,v9,v10) 407 + vor v6,v8,v8 408 + bnl cr6,.Ldiffoffset_vmx_diff_found 409 + 410 + addi r3,r3,16 411 + addi r4,r4,16 412 + 413 + lvx v9,0,r3 414 + LD_VSR_CROSS16B(r4,v4,v6,v8,v10) 415 + VCMPEQUB_RC(v7,v9,v10) 416 + vor v6,v8,v8 417 + bnl cr6,.Ldiffoffset_vmx_diff_found 418 + 419 + addi r3,r3,16 420 + addi r4,r4,16 421 + 422 + bdnz .Ldiffoffset_vmx_32bytesloop 423 + 424 + EXIT_VMX_OPS 425 + 426 + cmpdi r5,0 427 + beq .Lzero 428 + b .Lcmp_lt32bytes 429 + 430 + .Ldiffoffset_vmx_diff_found: 431 + EXIT_VMX_OPS 432 + /* anyway, the diff will appear in next 16 bytes */ 433 + li r5,16 434 + b .Lcmp_lt32bytes 435 + 436 + #endif 522 437 EXPORT_SYMBOL(memcmp)
+3 -3
arch/powerpc/lib/memcpy_power7.S
··· 230 230 std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) 231 231 std r0,16(r1) 232 232 stdu r1,-STACKFRAMESIZE(r1) 233 - bl enter_vmx_copy 233 + bl enter_vmx_ops 234 234 cmpwi cr1,r3,0 235 235 ld r0,STACKFRAMESIZE+16(r1) 236 236 ld r3,STK_REG(R31)(r1) ··· 445 445 446 446 15: addi r1,r1,STACKFRAMESIZE 447 447 ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) 448 - b exit_vmx_copy /* tail call optimise */ 448 + b exit_vmx_ops /* tail call optimise */ 449 449 450 450 .Lvmx_unaligned_copy: 451 451 /* Get the destination 16B aligned */ ··· 649 649 650 650 15: addi r1,r1,STACKFRAMESIZE 651 651 ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) 652 - b exit_vmx_copy /* tail call optimise */ 652 + b exit_vmx_ops /* tail call optimise */ 653 653 #endif /* CONFIG_ALTIVEC */
+2 -2
arch/powerpc/lib/vmx-helper.c
··· 53 53 return 0; 54 54 } 55 55 56 - int enter_vmx_copy(void) 56 + int enter_vmx_ops(void) 57 57 { 58 58 if (in_interrupt()) 59 59 return 0; ··· 70 70 * passed a pointer to the destination which we return as required by a 71 71 * memcpy implementation. 72 72 */ 73 - void *exit_vmx_copy(void *dest) 73 + void *exit_vmx_ops(void *dest) 74 74 { 75 75 disable_kernel_altivec(); 76 76 preempt_enable();