arm64: lib: Implement optimized string compare routines

+6

arch/arm64/include/asm/string.h

··· 22 22 #define __HAVE_ARCH_STRCHR 23 23 extern char *strchr(const char *, int c); 24 24 25 + #define __HAVE_ARCH_STRCMP 26 + extern int strcmp(const char *, const char *); 27 + 28 + #define __HAVE_ARCH_STRNCMP 29 + extern int strncmp(const char *, const char *, __kernel_size_t); 30 + 25 31 #define __HAVE_ARCH_MEMCPY 26 32 extern void *memcpy(void *, const void *, __kernel_size_t); 27 33

+2

arch/arm64/kernel/arm64ksyms.c

··· 44 44 /* string / mem functions */ 45 45 EXPORT_SYMBOL(strchr); 46 46 EXPORT_SYMBOL(strrchr); 47 + EXPORT_SYMBOL(strcmp); 48 + EXPORT_SYMBOL(strncmp); 47 49 EXPORT_SYMBOL(memset); 48 50 EXPORT_SYMBOL(memcpy); 49 51 EXPORT_SYMBOL(memmove);

+1 -1

arch/arm64/lib/Makefile

··· 1 1 lib-y := bitops.o clear_user.o delay.o copy_from_user.o \ 2 2 copy_to_user.o copy_in_user.o copy_page.o \ 3 3 clear_page.o memchr.o memcpy.o memmove.o memset.o \ 4 - memcmp.o strchr.o strrchr.o 4 + memcmp.o strcmp.o strncmp.o strchr.o strrchr.o

+234

arch/arm64/lib/strcmp.S

··· 1 + /* 2 + * Copyright (C) 2013 ARM Ltd. 3 + * Copyright (C) 2013 Linaro. 4 + * 5 + * This code is based on glibc cortex strings work originally authored by Linaro 6 + * and re-licensed under GPLv2 for the Linux kernel. The original code can 7 + * be found @ 8 + * 9 + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 + * files/head:/src/aarch64/ 11 + * 12 + * This program is free software; you can redistribute it and/or modify 13 + * it under the terms of the GNU General Public License version 2 as 14 + * published by the Free Software Foundation. 15 + * 16 + * This program is distributed in the hope that it will be useful, 17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 + * GNU General Public License for more details. 20 + * 21 + * You should have received a copy of the GNU General Public License 22 + * along with this program. If not, see <http://www.gnu.org/licenses/>. 23 + */ 24 + 25 + #include <linux/linkage.h> 26 + #include <asm/assembler.h> 27 + 28 + /* 29 + * compare two strings 30 + * 31 + * Parameters: 32 + * x0 - const string 1 pointer 33 + * x1 - const string 2 pointer 34 + * Returns: 35 + * x0 - an integer less than, equal to, or greater than zero 36 + * if s1 is found, respectively, to be less than, to match, 37 + * or be greater than s2. 38 + */ 39 + 40 + #define REP8_01 0x0101010101010101 41 + #define REP8_7f 0x7f7f7f7f7f7f7f7f 42 + #define REP8_80 0x8080808080808080 43 + 44 + /* Parameters and result. */ 45 + src1 .req x0 46 + src2 .req x1 47 + result .req x0 48 + 49 + /* Internal variables. */ 50 + data1 .req x2 51 + data1w .req w2 52 + data2 .req x3 53 + data2w .req w3 54 + has_nul .req x4 55 + diff .req x5 56 + syndrome .req x6 57 + tmp1 .req x7 58 + tmp2 .req x8 59 + tmp3 .req x9 60 + zeroones .req x10 61 + pos .req x11 62 + 63 + ENTRY(strcmp) 64 + eor tmp1, src1, src2 65 + mov zeroones, #REP8_01 66 + tst tmp1, #7 67 + b.ne .Lmisaligned8 68 + ands tmp1, src1, #7 69 + b.ne .Lmutual_align 70 + 71 + /* 72 + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 73 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 74 + * can be done in parallel across the entire word. 75 + */ 76 + .Lloop_aligned: 77 + ldr data1, [src1], #8 78 + ldr data2, [src2], #8 79 + .Lstart_realigned: 80 + sub tmp1, data1, zeroones 81 + orr tmp2, data1, #REP8_7f 82 + eor diff, data1, data2 /* Non-zero if differences found. */ 83 + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 84 + orr syndrome, diff, has_nul 85 + cbz syndrome, .Lloop_aligned 86 + b .Lcal_cmpresult 87 + 88 + .Lmutual_align: 89 + /* 90 + * Sources are mutually aligned, but are not currently at an 91 + * alignment boundary. Round down the addresses and then mask off 92 + * the bytes that preceed the start point. 93 + */ 94 + bic src1, src1, #7 95 + bic src2, src2, #7 96 + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ 97 + ldr data1, [src1], #8 98 + neg tmp1, tmp1 /* Bits to alignment -64. */ 99 + ldr data2, [src2], #8 100 + mov tmp2, #~0 101 + /* Big-endian. Early bytes are at MSB. */ 102 + CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ 103 + /* Little-endian. Early bytes are at LSB. */ 104 + CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ 105 + 106 + orr data1, data1, tmp2 107 + orr data2, data2, tmp2 108 + b .Lstart_realigned 109 + 110 + .Lmisaligned8: 111 + /* 112 + * Get the align offset length to compare per byte first. 113 + * After this process, one string's address will be aligned. 114 + */ 115 + and tmp1, src1, #7 116 + neg tmp1, tmp1 117 + add tmp1, tmp1, #8 118 + and tmp2, src2, #7 119 + neg tmp2, tmp2 120 + add tmp2, tmp2, #8 121 + subs tmp3, tmp1, tmp2 122 + csel pos, tmp1, tmp2, hi /*Choose the maximum. */ 123 + .Ltinycmp: 124 + ldrb data1w, [src1], #1 125 + ldrb data2w, [src2], #1 126 + subs pos, pos, #1 127 + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ 128 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 129 + b.eq .Ltinycmp 130 + cbnz pos, 1f /*find the null or unequal...*/ 131 + cmp data1w, #1 132 + ccmp data1w, data2w, #0, cs 133 + b.eq .Lstart_align /*the last bytes are equal....*/ 134 + 1: 135 + sub result, data1, data2 136 + ret 137 + 138 + .Lstart_align: 139 + ands xzr, src1, #7 140 + b.eq .Lrecal_offset 141 + /*process more leading bytes to make str1 aligned...*/ 142 + add src1, src1, tmp3 143 + add src2, src2, tmp3 144 + /*load 8 bytes from aligned str1 and non-aligned str2..*/ 145 + ldr data1, [src1], #8 146 + ldr data2, [src2], #8 147 + 148 + sub tmp1, data1, zeroones 149 + orr tmp2, data1, #REP8_7f 150 + bic has_nul, tmp1, tmp2 151 + eor diff, data1, data2 /* Non-zero if differences found. */ 152 + orr syndrome, diff, has_nul 153 + cbnz syndrome, .Lcal_cmpresult 154 + /*How far is the current str2 from the alignment boundary...*/ 155 + and tmp3, tmp3, #7 156 + .Lrecal_offset: 157 + neg pos, tmp3 158 + .Lloopcmp_proc: 159 + /* 160 + * Divide the eight bytes into two parts. First,backwards the src2 161 + * to an alignment boundary,load eight bytes from the SRC2 alignment 162 + * boundary,then compare with the relative bytes from SRC1. 163 + * If all 8 bytes are equal,then start the second part's comparison. 164 + * Otherwise finish the comparison. 165 + * This special handle can garantee all the accesses are in the 166 + * thread/task space in avoid to overrange access. 167 + */ 168 + ldr data1, [src1,pos] 169 + ldr data2, [src2,pos] 170 + sub tmp1, data1, zeroones 171 + orr tmp2, data1, #REP8_7f 172 + bic has_nul, tmp1, tmp2 173 + eor diff, data1, data2 /* Non-zero if differences found. */ 174 + orr syndrome, diff, has_nul 175 + cbnz syndrome, .Lcal_cmpresult 176 + 177 + /*The second part process*/ 178 + ldr data1, [src1], #8 179 + ldr data2, [src2], #8 180 + sub tmp1, data1, zeroones 181 + orr tmp2, data1, #REP8_7f 182 + bic has_nul, tmp1, tmp2 183 + eor diff, data1, data2 /* Non-zero if differences found. */ 184 + orr syndrome, diff, has_nul 185 + cbz syndrome, .Lloopcmp_proc 186 + 187 + .Lcal_cmpresult: 188 + /* 189 + * reversed the byte-order as big-endian,then CLZ can find the most 190 + * significant zero bits. 191 + */ 192 + CPU_LE( rev syndrome, syndrome ) 193 + CPU_LE( rev data1, data1 ) 194 + CPU_LE( rev data2, data2 ) 195 + 196 + /* 197 + * For big-endian we cannot use the trick with the syndrome value 198 + * as carry-propagation can corrupt the upper bits if the trailing 199 + * bytes in the string contain 0x01. 200 + * However, if there is no NUL byte in the dword, we can generate 201 + * the result directly. We ca not just subtract the bytes as the 202 + * MSB might be significant. 203 + */ 204 + CPU_BE( cbnz has_nul, 1f ) 205 + CPU_BE( cmp data1, data2 ) 206 + CPU_BE( cset result, ne ) 207 + CPU_BE( cneg result, result, lo ) 208 + CPU_BE( ret ) 209 + CPU_BE( 1: ) 210 + /*Re-compute the NUL-byte detection, using a byte-reversed value. */ 211 + CPU_BE( rev tmp3, data1 ) 212 + CPU_BE( sub tmp1, tmp3, zeroones ) 213 + CPU_BE( orr tmp2, tmp3, #REP8_7f ) 214 + CPU_BE( bic has_nul, tmp1, tmp2 ) 215 + CPU_BE( rev has_nul, has_nul ) 216 + CPU_BE( orr syndrome, diff, has_nul ) 217 + 218 + clz pos, syndrome 219 + /* 220 + * The MS-non-zero bit of the syndrome marks either the first bit 221 + * that is different, or the top bit of the first zero byte. 222 + * Shifting left now will bring the critical information into the 223 + * top bits. 224 + */ 225 + lsl data1, data1, pos 226 + lsl data2, data2, pos 227 + /* 228 + * But we need to zero-extend (char is unsigned) the value and then 229 + * perform a signed 32-bit subtraction. 230 + */ 231 + lsr data1, data1, #56 232 + sub result, data1, data2, lsr #56 233 + ret 234 + ENDPROC(strcmp)

+310

arch/arm64/lib/strncmp.S

··· 1 + /* 2 + * Copyright (C) 2013 ARM Ltd. 3 + * Copyright (C) 2013 Linaro. 4 + * 5 + * This code is based on glibc cortex strings work originally authored by Linaro 6 + * and re-licensed under GPLv2 for the Linux kernel. The original code can 7 + * be found @ 8 + * 9 + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 + * files/head:/src/aarch64/ 11 + * 12 + * This program is free software; you can redistribute it and/or modify 13 + * it under the terms of the GNU General Public License version 2 as 14 + * published by the Free Software Foundation. 15 + * 16 + * This program is distributed in the hope that it will be useful, 17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 + * GNU General Public License for more details. 20 + * 21 + * You should have received a copy of the GNU General Public License 22 + * along with this program. If not, see <http://www.gnu.org/licenses/>. 23 + */ 24 + 25 + #include <linux/linkage.h> 26 + #include <asm/assembler.h> 27 + 28 + /* 29 + * compare two strings 30 + * 31 + * Parameters: 32 + * x0 - const string 1 pointer 33 + * x1 - const string 2 pointer 34 + * x2 - the maximal length to be compared 35 + * Returns: 36 + * x0 - an integer less than, equal to, or greater than zero if s1 is found, 37 + * respectively, to be less than, to match, or be greater than s2. 38 + */ 39 + 40 + #define REP8_01 0x0101010101010101 41 + #define REP8_7f 0x7f7f7f7f7f7f7f7f 42 + #define REP8_80 0x8080808080808080 43 + 44 + /* Parameters and result. */ 45 + src1 .req x0 46 + src2 .req x1 47 + limit .req x2 48 + result .req x0 49 + 50 + /* Internal variables. */ 51 + data1 .req x3 52 + data1w .req w3 53 + data2 .req x4 54 + data2w .req w4 55 + has_nul .req x5 56 + diff .req x6 57 + syndrome .req x7 58 + tmp1 .req x8 59 + tmp2 .req x9 60 + tmp3 .req x10 61 + zeroones .req x11 62 + pos .req x12 63 + limit_wd .req x13 64 + mask .req x14 65 + endloop .req x15 66 + 67 + ENTRY(strncmp) 68 + cbz limit, .Lret0 69 + eor tmp1, src1, src2 70 + mov zeroones, #REP8_01 71 + tst tmp1, #7 72 + b.ne .Lmisaligned8 73 + ands tmp1, src1, #7 74 + b.ne .Lmutual_align 75 + /* Calculate the number of full and partial words -1. */ 76 + /* 77 + * when limit is mulitply of 8, if not sub 1, 78 + * the judgement of last dword will wrong. 79 + */ 80 + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 81 + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ 82 + 83 + /* 84 + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 85 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 86 + * can be done in parallel across the entire word. 87 + */ 88 + .Lloop_aligned: 89 + ldr data1, [src1], #8 90 + ldr data2, [src2], #8 91 + .Lstart_realigned: 92 + subs limit_wd, limit_wd, #1 93 + sub tmp1, data1, zeroones 94 + orr tmp2, data1, #REP8_7f 95 + eor diff, data1, data2 /* Non-zero if differences found. */ 96 + csinv endloop, diff, xzr, pl /* Last Dword or differences.*/ 97 + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 98 + ccmp endloop, #0, #0, eq 99 + b.eq .Lloop_aligned 100 + 101 + /*Not reached the limit, must have found the end or a diff. */ 102 + tbz limit_wd, #63, .Lnot_limit 103 + 104 + /* Limit % 8 == 0 => all bytes significant. */ 105 + ands limit, limit, #7 106 + b.eq .Lnot_limit 107 + 108 + lsl limit, limit, #3 /* Bits -> bytes. */ 109 + mov mask, #~0 110 + CPU_BE( lsr mask, mask, limit ) 111 + CPU_LE( lsl mask, mask, limit ) 112 + bic data1, data1, mask 113 + bic data2, data2, mask 114 + 115 + /* Make sure that the NUL byte is marked in the syndrome. */ 116 + orr has_nul, has_nul, mask 117 + 118 + .Lnot_limit: 119 + orr syndrome, diff, has_nul 120 + b .Lcal_cmpresult 121 + 122 + .Lmutual_align: 123 + /* 124 + * Sources are mutually aligned, but are not currently at an 125 + * alignment boundary. Round down the addresses and then mask off 126 + * the bytes that precede the start point. 127 + * We also need to adjust the limit calculations, but without 128 + * overflowing if the limit is near ULONG_MAX. 129 + */ 130 + bic src1, src1, #7 131 + bic src2, src2, #7 132 + ldr data1, [src1], #8 133 + neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ 134 + ldr data2, [src2], #8 135 + mov tmp2, #~0 136 + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 137 + /* Big-endian. Early bytes are at MSB. */ 138 + CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ 139 + /* Little-endian. Early bytes are at LSB. */ 140 + CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ 141 + 142 + and tmp3, limit_wd, #7 143 + lsr limit_wd, limit_wd, #3 144 + /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/ 145 + add limit, limit, tmp1 146 + add tmp3, tmp3, tmp1 147 + orr data1, data1, tmp2 148 + orr data2, data2, tmp2 149 + add limit_wd, limit_wd, tmp3, lsr #3 150 + b .Lstart_realigned 151 + 152 + /*when src1 offset is not equal to src2 offset...*/ 153 + .Lmisaligned8: 154 + cmp limit, #8 155 + b.lo .Ltiny8proc /*limit < 8... */ 156 + /* 157 + * Get the align offset length to compare per byte first. 158 + * After this process, one string's address will be aligned.*/ 159 + and tmp1, src1, #7 160 + neg tmp1, tmp1 161 + add tmp1, tmp1, #8 162 + and tmp2, src2, #7 163 + neg tmp2, tmp2 164 + add tmp2, tmp2, #8 165 + subs tmp3, tmp1, tmp2 166 + csel pos, tmp1, tmp2, hi /*Choose the maximum. */ 167 + /* 168 + * Here, limit is not less than 8, so directly run .Ltinycmp 169 + * without checking the limit.*/ 170 + sub limit, limit, pos 171 + .Ltinycmp: 172 + ldrb data1w, [src1], #1 173 + ldrb data2w, [src2], #1 174 + subs pos, pos, #1 175 + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ 176 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 177 + b.eq .Ltinycmp 178 + cbnz pos, 1f /*find the null or unequal...*/ 179 + cmp data1w, #1 180 + ccmp data1w, data2w, #0, cs 181 + b.eq .Lstart_align /*the last bytes are equal....*/ 182 + 1: 183 + sub result, data1, data2 184 + ret 185 + 186 + .Lstart_align: 187 + lsr limit_wd, limit, #3 188 + cbz limit_wd, .Lremain8 189 + /*process more leading bytes to make str1 aligned...*/ 190 + ands xzr, src1, #7 191 + b.eq .Lrecal_offset 192 + add src1, src1, tmp3 /*tmp3 is positive in this branch.*/ 193 + add src2, src2, tmp3 194 + ldr data1, [src1], #8 195 + ldr data2, [src2], #8 196 + 197 + sub limit, limit, tmp3 198 + lsr limit_wd, limit, #3 199 + subs limit_wd, limit_wd, #1 200 + 201 + sub tmp1, data1, zeroones 202 + orr tmp2, data1, #REP8_7f 203 + eor diff, data1, data2 /* Non-zero if differences found. */ 204 + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ 205 + bics has_nul, tmp1, tmp2 206 + ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ 207 + b.ne .Lunequal_proc 208 + /*How far is the current str2 from the alignment boundary...*/ 209 + and tmp3, tmp3, #7 210 + .Lrecal_offset: 211 + neg pos, tmp3 212 + .Lloopcmp_proc: 213 + /* 214 + * Divide the eight bytes into two parts. First,backwards the src2 215 + * to an alignment boundary,load eight bytes from the SRC2 alignment 216 + * boundary,then compare with the relative bytes from SRC1. 217 + * If all 8 bytes are equal,then start the second part's comparison. 218 + * Otherwise finish the comparison. 219 + * This special handle can garantee all the accesses are in the 220 + * thread/task space in avoid to overrange access. 221 + */ 222 + ldr data1, [src1,pos] 223 + ldr data2, [src2,pos] 224 + sub tmp1, data1, zeroones 225 + orr tmp2, data1, #REP8_7f 226 + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 227 + eor diff, data1, data2 /* Non-zero if differences found. */ 228 + csinv endloop, diff, xzr, eq 229 + cbnz endloop, .Lunequal_proc 230 + 231 + /*The second part process*/ 232 + ldr data1, [src1], #8 233 + ldr data2, [src2], #8 234 + subs limit_wd, limit_wd, #1 235 + sub tmp1, data1, zeroones 236 + orr tmp2, data1, #REP8_7f 237 + eor diff, data1, data2 /* Non-zero if differences found. */ 238 + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ 239 + bics has_nul, tmp1, tmp2 240 + ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ 241 + b.eq .Lloopcmp_proc 242 + 243 + .Lunequal_proc: 244 + orr syndrome, diff, has_nul 245 + cbz syndrome, .Lremain8 246 + .Lcal_cmpresult: 247 + /* 248 + * reversed the byte-order as big-endian,then CLZ can find the most 249 + * significant zero bits. 250 + */ 251 + CPU_LE( rev syndrome, syndrome ) 252 + CPU_LE( rev data1, data1 ) 253 + CPU_LE( rev data2, data2 ) 254 + /* 255 + * For big-endian we cannot use the trick with the syndrome value 256 + * as carry-propagation can corrupt the upper bits if the trailing 257 + * bytes in the string contain 0x01. 258 + * However, if there is no NUL byte in the dword, we can generate 259 + * the result directly. We can't just subtract the bytes as the 260 + * MSB might be significant. 261 + */ 262 + CPU_BE( cbnz has_nul, 1f ) 263 + CPU_BE( cmp data1, data2 ) 264 + CPU_BE( cset result, ne ) 265 + CPU_BE( cneg result, result, lo ) 266 + CPU_BE( ret ) 267 + CPU_BE( 1: ) 268 + /* Re-compute the NUL-byte detection, using a byte-reversed value.*/ 269 + CPU_BE( rev tmp3, data1 ) 270 + CPU_BE( sub tmp1, tmp3, zeroones ) 271 + CPU_BE( orr tmp2, tmp3, #REP8_7f ) 272 + CPU_BE( bic has_nul, tmp1, tmp2 ) 273 + CPU_BE( rev has_nul, has_nul ) 274 + CPU_BE( orr syndrome, diff, has_nul ) 275 + /* 276 + * The MS-non-zero bit of the syndrome marks either the first bit 277 + * that is different, or the top bit of the first zero byte. 278 + * Shifting left now will bring the critical information into the 279 + * top bits. 280 + */ 281 + clz pos, syndrome 282 + lsl data1, data1, pos 283 + lsl data2, data2, pos 284 + /* 285 + * But we need to zero-extend (char is unsigned) the value and then 286 + * perform a signed 32-bit subtraction. 287 + */ 288 + lsr data1, data1, #56 289 + sub result, data1, data2, lsr #56 290 + ret 291 + 292 + .Lremain8: 293 + /* Limit % 8 == 0 => all bytes significant. */ 294 + ands limit, limit, #7 295 + b.eq .Lret0 296 + .Ltiny8proc: 297 + ldrb data1w, [src1], #1 298 + ldrb data2w, [src2], #1 299 + subs limit, limit, #1 300 + 301 + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ 302 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 303 + b.eq .Ltiny8proc 304 + sub result, data1, data2 305 + ret 306 + 307 + .Lret0: 308 + mov result, #0 309 + ret 310 + ENDPROC(strncmp)