Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()

Currently memcmp() 64bytes version in powerpc will fall back to .Lshort
(compare per byte mode) if either src or dst address is not 8 bytes aligned.
It can be optimized in 2 situations:

1) if both addresses have the same offset from the 8 bytes boundary:
memcmp() can compare the unaligned bytes within 8 bytes boundary firstly
and then compare the rest 8-bytes-aligned content with .Llong mode.

2) If src/dst addrs do not have the same offset from the 8 bytes boundary:
memcmp() can align src addr with 8 bytes, increment dst addr accordingly,
then load src with aligned mode and load dst with unaligned mode.

This patch optimizes memcmp() behavior in the above 2 situations.

Tested with both little/big endian. Performance result below is based on
little endian.

Following is the test result for the case where src/dst have the same offset
(a similar result was observed when src/dst have different offsets):
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
29.773018302 seconds time elapsed ( +- 0.09% )
- with patch
16.485568173 seconds time elapsed ( +- 0.02% )
-> There is ~80% improvement

(2) 32 bytes
To observe performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
#include <string.h>
#include "utils.h"

-#define SIZE 256
+#define SIZE 32
#define ITERATIONS 10000

int test_memcmp(const void *s1, const void *s2, size_t n);
--------

- Without patch
0.244746482 seconds time elapsed ( +- 0.36%)
- with patch
0.215069477 seconds time elapsed ( +- 0.51%)
-> There is ~+13% improvement

(3) 0~8 bytes
To observe <8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
#include <string.h>
#include "utils.h"

-#define SIZE 256
-#define ITERATIONS 10000
+#define SIZE 8
+#define ITERATIONS 1000000

int test_memcmp(const void *s1, const void *s2, size_t n);
-------
- Without patch
1.845642503 seconds time elapsed ( +- 0.12% )
- With patch
1.849767135 seconds time elapsed ( +- 0.26% )
-> They are nearly the same. (-0.2%)

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

Authored by Simon Guo and committed by Michael Ellerman
2d9ee327 ca42d8d2

+133 -7
+133 -7
arch/powerpc/lib/memcmp_64.S
··· 24 24 #define rH r31 25 25 26 26 #ifdef __LITTLE_ENDIAN__ 27 + #define LH lhbrx 28 + #define LW lwbrx 27 29 #define LD ldbrx 28 30 #else 31 + #define LH lhzx 32 + #define LW lwzx 29 33 #define LD ldx 30 34 #endif 31 35 36 + /* 37 + * There are 2 categories for memcmp: 38 + * 1) src/dst has the same offset to the 8 bytes boundary. The handlers 39 + * are named like .Lsameoffset_xxxx 40 + * 2) src/dst has different offset to the 8 bytes boundary. The handlers 41 + * are named like .Ldiffoffset_xxxx 42 + */ 32 43 _GLOBAL(memcmp) 33 44 cmpdi cr1,r5,0 34 45 35 - /* Use the short loop if both strings are not 8B aligned */ 36 - or r6,r3,r4 46 + /* Use the short loop if the src/dst addresses are not 47 + * with the same offset of 8 bytes align boundary. 48 + */ 49 + xor r6,r3,r4 37 50 andi. r6,r6,7 38 51 39 - /* Use the short loop if length is less than 32B */ 40 - cmpdi cr6,r5,31 52 + /* Fall back to short loop if compare at aligned addrs 53 + * with less than 8 bytes. 54 + */ 55 + cmpdi cr6,r5,7 41 56 42 57 beq cr1,.Lzero 43 - bne .Lshort 44 - bgt cr6,.Llong 58 + bgt cr6,.Lno_short 45 59 46 60 .Lshort: 47 61 mtctr r5 48 - 49 62 1: lbz rA,0(r3) 50 63 lbz rB,0(r4) 51 64 subf. rC,rB,rA ··· 91 78 li r3,0 92 79 blr 93 80 81 + .Lno_short: 82 + dcbt 0,r3 83 + dcbt 0,r4 84 + bne .Ldiffoffset_8bytes_make_align_start 85 + 86 + 87 + .Lsameoffset_8bytes_make_align_start: 88 + /* attempt to compare bytes not aligned with 8 bytes so that 89 + * rest comparison can run based on 8 bytes alignment. 90 + */ 91 + andi. r6,r3,7 92 + 93 + /* Try to compare the first double word which is not 8 bytes aligned: 94 + * load the first double word at (src & ~7UL) and shift left appropriate 95 + * bits before comparision. 
96 + */ 97 + rlwinm r6,r3,3,26,28 98 + beq .Lsameoffset_8bytes_aligned 99 + clrrdi r3,r3,3 100 + clrrdi r4,r4,3 101 + LD rA,0,r3 102 + LD rB,0,r4 103 + sld rA,rA,r6 104 + sld rB,rB,r6 105 + cmpld cr0,rA,rB 106 + srwi r6,r6,3 107 + bne cr0,.LcmpAB_lightweight 108 + subfic r6,r6,8 109 + subf. r5,r6,r5 110 + addi r3,r3,8 111 + addi r4,r4,8 112 + beq .Lzero 113 + 114 + .Lsameoffset_8bytes_aligned: 115 + /* now we are aligned with 8 bytes. 116 + * Use .Llong loop if left cmp bytes are equal or greater than 32B. 117 + */ 118 + cmpdi cr6,r5,31 119 + bgt cr6,.Llong 120 + 121 + .Lcmp_lt32bytes: 122 + /* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */ 123 + cmpdi cr5,r5,7 124 + srdi r0,r5,3 125 + ble cr5,.Lcmp_rest_lt8bytes 126 + 127 + /* handle 8 ~ 31 bytes */ 128 + clrldi r5,r5,61 129 + mtctr r0 130 + 2: 131 + LD rA,0,r3 132 + LD rB,0,r4 133 + cmpld cr0,rA,rB 134 + addi r3,r3,8 135 + addi r4,r4,8 136 + bne cr0,.LcmpAB_lightweight 137 + bdnz 2b 138 + 139 + cmpwi r5,0 140 + beq .Lzero 141 + 142 + .Lcmp_rest_lt8bytes: 143 + /* Here we have only less than 8 bytes to compare with. at least s1 144 + * Address is aligned with 8 bytes. 145 + * The next double words are load and shift right with appropriate 146 + * bits. 
147 + */ 148 + subfic r6,r5,8 149 + slwi r6,r6,3 150 + LD rA,0,r3 151 + LD rB,0,r4 152 + srd rA,rA,r6 153 + srd rB,rB,r6 154 + cmpld cr0,rA,rB 155 + bne cr0,.LcmpAB_lightweight 156 + b .Lzero 157 + 94 158 .Lnon_zero: 95 159 mr r3,rC 96 160 blr 97 161 98 162 .Llong: 163 + /* At least s1 addr is aligned with 8 bytes */ 99 164 li off8,8 100 165 li off16,16 101 166 li off24,24 ··· 323 232 ld r28,-32(r1) 324 233 ld r27,-40(r1) 325 234 blr 235 + 236 + .LcmpAB_lightweight: /* skip NV GPRS restore */ 237 + li r3,1 238 + bgtlr 239 + li r3,-1 240 + blr 241 + 242 + .Ldiffoffset_8bytes_make_align_start: 243 + /* now try to align s1 with 8 bytes */ 244 + rlwinm r6,r3,3,26,28 245 + beq .Ldiffoffset_align_s1_8bytes 246 + 247 + clrrdi r3,r3,3 248 + LD rA,0,r3 249 + LD rB,0,r4 /* unaligned load */ 250 + sld rA,rA,r6 251 + srd rA,rA,r6 252 + srd rB,rB,r6 253 + cmpld cr0,rA,rB 254 + srwi r6,r6,3 255 + bne cr0,.LcmpAB_lightweight 256 + 257 + subfic r6,r6,8 258 + subf. r5,r6,r5 259 + addi r3,r3,8 260 + add r4,r4,r6 261 + 262 + beq .Lzero 263 + 264 + .Ldiffoffset_align_s1_8bytes: 265 + /* now s1 is aligned with 8 bytes. */ 266 + cmpdi cr5,r5,31 267 + ble cr5,.Lcmp_lt32bytes 268 + b .Llong 269 + 326 270 EXPORT_SYMBOL(memcmp)