Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()

Currently memcmp() 64bytes version in powerpc will fall back to .Lshort
(compare per byte mode) if either src or dst address is not 8 bytes aligned.
It can be optimized in 2 situations:

1) if both addresses have the same offset from the 8 bytes boundary:
memcmp() can compare the unaligned bytes within 8 bytes boundary firstly
and then compare the rest 8-bytes-aligned content with .Llong mode.

2) If src/dst addrs do not have the same offset from the 8 bytes boundary:
memcmp() can align src addr with 8 bytes, increment dst addr accordingly,
then load src with aligned mode and load dst with unaligned mode.

This patch optimizes memcmp() behavior in the above 2 situations.

Tested with both little/big endian. Performance result below is based on
little endian.

Following is the test result for the case where src/dst have the same offset
(a similar result was observed when src/dst have different offsets):
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
29.773018302 seconds time elapsed ( +- 0.09% )
- with patch
16.485568173 seconds time elapsed ( +- 0.02% )
-> There is ~80% improvement

(2) 32 bytes
To observe performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
#include <string.h>
#include "utils.h"

-#define SIZE 256
+#define SIZE 32
#define ITERATIONS 10000

int test_memcmp(const void *s1, const void *s2, size_t n);
--------

- Without patch
0.244746482 seconds time elapsed ( +- 0.36%)
- with patch
0.215069477 seconds time elapsed ( +- 0.51%)
-> There is ~+13% improvement

(3) 0~8 bytes
To observe <8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
#include <string.h>
#include "utils.h"

-#define SIZE 256
-#define ITERATIONS 10000
+#define SIZE 8
+#define ITERATIONS 1000000

int test_memcmp(const void *s1, const void *s2, size_t n);
-------
- Without patch
1.845642503 seconds time elapsed ( +- 0.12% )
- With patch
1.849767135 seconds time elapsed ( +- 0.26% )
-> They are nearly the same. (-0.2%)

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

Authored by Simon Guo and committed by Michael Ellerman
2d9ee327 ca42d8d2

+133 -7
+133 -7
arch/powerpc/lib/memcmp_64.S
··· 24 24 #define rH r31 25 25 26 26 #ifdef __LITTLE_ENDIAN__ 27 + #define LH lhbrx 28 + #define LW lwbrx 27 29 #define LD ldbrx 28 30 #else 31 + #define LH lhzx 32 + #define LW lwzx 29 33 #define LD ldx 30 34 #endif 31 35 36 + /* 37 + * There are 2 categories for memcmp: 38 + * 1) src/dst has the same offset to the 8 bytes boundary. The handlers 39 + * are named like .Lsameoffset_xxxx 40 + * 2) src/dst has different offset to the 8 bytes boundary. The handlers 41 + * are named like .Ldiffoffset_xxxx 42 + */ 32 43 _GLOBAL(memcmp) 33 44 cmpdi cr1,r5,0 34 45 35 - /* Use the short loop if both strings are not 8B aligned */ 36 - or r6,r3,r4 46 + /* Use the short loop if the src/dst addresses are not 47 + * with the same offset of 8 bytes align boundary. 48 + */ 49 + xor r6,r3,r4 37 50 andi. r6,r6,7 38 51 39 - /* Use the short loop if length is less than 32B */ 40 - cmpdi cr6,r5,31 52 + /* Fall back to short loop if compare at aligned addrs 53 + * with less than 8 bytes. 54 + */ 55 + cmpdi cr6,r5,7 41 56 42 57 beq cr1,.Lzero 43 - bne .Lshort 44 - bgt cr6,.Llong 58 + bgt cr6,.Lno_short 45 59 46 60 .Lshort: 47 61 mtctr r5 48 - 49 62 1: lbz rA,0(r3) 50 63 lbz rB,0(r4) 51 64 subf. rC,rB,rA ··· 91 78 li r3,0 92 79 blr 93 80 81 + .Lno_short: 82 + dcbt 0,r3 83 + dcbt 0,r4 84 + bne .Ldiffoffset_8bytes_make_align_start 85 + 86 + 87 + .Lsameoffset_8bytes_make_align_start: 88 + /* attempt to compare bytes not aligned with 8 bytes so that 89 + * rest comparison can run based on 8 bytes alignment. 90 + */ 91 + andi. r6,r3,7 92 + 93 + /* Try to compare the first double word which is not 8 bytes aligned: 94 + * load the first double word at (src & ~7UL) and shift left appropriate 95 + * bits before comparision. 
96 + */ 97 + rlwinm r6,r3,3,26,28 98 + beq .Lsameoffset_8bytes_aligned 99 + clrrdi r3,r3,3 100 + clrrdi r4,r4,3 101 + LD rA,0,r3 102 + LD rB,0,r4 103 + sld rA,rA,r6 104 + sld rB,rB,r6 105 + cmpld cr0,rA,rB 106 + srwi r6,r6,3 107 + bne cr0,.LcmpAB_lightweight 108 + subfic r6,r6,8 109 + subf. r5,r6,r5 110 + addi r3,r3,8 111 + addi r4,r4,8 112 + beq .Lzero 113 + 114 + .Lsameoffset_8bytes_aligned: 115 + /* now we are aligned with 8 bytes. 116 + * Use .Llong loop if left cmp bytes are equal or greater than 32B. 117 + */ 118 + cmpdi cr6,r5,31 119 + bgt cr6,.Llong 120 + 121 + .Lcmp_lt32bytes: 122 + /* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */ 123 + cmpdi cr5,r5,7 124 + srdi r0,r5,3 125 + ble cr5,.Lcmp_rest_lt8bytes 126 + 127 + /* handle 8 ~ 31 bytes */ 128 + clrldi r5,r5,61 129 + mtctr r0 130 + 2: 131 + LD rA,0,r3 132 + LD rB,0,r4 133 + cmpld cr0,rA,rB 134 + addi r3,r3,8 135 + addi r4,r4,8 136 + bne cr0,.LcmpAB_lightweight 137 + bdnz 2b 138 + 139 + cmpwi r5,0 140 + beq .Lzero 141 + 142 + .Lcmp_rest_lt8bytes: 143 + /* Here we have only less than 8 bytes to compare with. at least s1 144 + * Address is aligned with 8 bytes. 145 + * The next double words are load and shift right with appropriate 146 + * bits. 
147 + */ 148 + subfic r6,r5,8 149 + slwi r6,r6,3 150 + LD rA,0,r3 151 + LD rB,0,r4 152 + srd rA,rA,r6 153 + srd rB,rB,r6 154 + cmpld cr0,rA,rB 155 + bne cr0,.LcmpAB_lightweight 156 + b .Lzero 157 + 94 158 .Lnon_zero: 95 159 mr r3,rC 96 160 blr 97 161 98 162 .Llong: 163 + /* At least s1 addr is aligned with 8 bytes */ 99 164 li off8,8 100 165 li off16,16 101 166 li off24,24 ··· 323 232 ld r28,-32(r1) 324 233 ld r27,-40(r1) 325 234 blr 235 + 236 + .LcmpAB_lightweight: /* skip NV GPRS restore */ 237 + li r3,1 238 + bgtlr 239 + li r3,-1 240 + blr 241 + 242 + .Ldiffoffset_8bytes_make_align_start: 243 + /* now try to align s1 with 8 bytes */ 244 + rlwinm r6,r3,3,26,28 245 + beq .Ldiffoffset_align_s1_8bytes 246 + 247 + clrrdi r3,r3,3 248 + LD rA,0,r3 249 + LD rB,0,r4 /* unaligned load */ 250 + sld rA,rA,r6 251 + srd rA,rA,r6 252 + srd rB,rB,r6 253 + cmpld cr0,rA,rB 254 + srwi r6,r6,3 255 + bne cr0,.LcmpAB_lightweight 256 + 257 + subfic r6,r6,8 258 + subf. r5,r6,r5 259 + addi r3,r3,8 260 + add r4,r4,r6 261 + 262 + beq .Lzero 263 + 264 + .Ldiffoffset_align_s1_8bytes: 265 + /* now s1 is aligned with 8 bytes. */ 266 + cmpdi cr5,r5,31 267 + ble cr5,.Lcmp_lt32bytes 268 + b .Llong 269 + 326 270 EXPORT_SYMBOL(memcmp)