Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arm64: Import latest memcpy()/memmove() implementation

Import the latest implementation of memcpy(), based on the
upstream code of string/aarch64/memcpy.S at commit afd6244 from
https://github.com/ARM-software/optimized-routines, and subsuming
memmove() in the process.

Note that for simplicity Arm have chosen to contribute this code
to Linux under GPLv2 rather than the original MIT license.

Note also that the needs of the usercopy routines vs. regular memcpy()
have now diverged so far that we abandon the shared template idea,
and with it the damage it incurred to the tuning of the LDP/STP loops.
We'll be back to tackle those routines separately in future.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/3c953af43506581b2422f61952261e76949ba711.1622128527.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>

authored by

Robin Murphy and committed by
Will Deacon
28513304 b6c4ea48

+230 -233
+1 -1
arch/arm64/lib/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 2 lib-y := clear_user.o delay.o copy_from_user.o \ 3 3 copy_to_user.o copy_in_user.o copy_page.o \ 4 - clear_page.o csum.o memchr.o memcpy.o memmove.o \ 4 + clear_page.o csum.o memchr.o memcpy.o \ 5 5 memset.o memcmp.o strcmp.o strncmp.o strlen.o \ 6 6 strnlen.o strchr.o strrchr.o tishift.o 7 7
+229 -43
arch/arm64/lib/memcpy.S
··· 1 1 /* SPDX-License-Identifier: GPL-2.0-only */ 2 2 /* 3 - * Copyright (C) 2013 ARM Ltd. 4 - * Copyright (C) 2013 Linaro. 3 + * Copyright (c) 2012-2020, Arm Limited. 5 4 * 6 - * This code is based on glibc cortex strings work originally authored by Linaro 7 - * be found @ 8 - * 9 - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 - * files/head:/src/aarch64/ 5 + * Adapted from the original at: 6 + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcpy.S 11 7 */ 12 8 13 9 #include <linux/linkage.h> 14 10 #include <asm/assembler.h> 15 - #include <asm/cache.h> 16 11 17 - /* 18 - * Copy a buffer from src to dest (alignment handled by the hardware) 12 + /* Assumptions: 19 13 * 20 - * Parameters: 21 - * x0 - dest 22 - * x1 - src 23 - * x2 - n 24 - * Returns: 25 - * x0 - dest 14 + * ARMv8-a, AArch64, unaligned accesses. 15 + * 26 16 */ 27 - .macro ldrb1 reg, ptr, val 28 - ldrb \reg, [\ptr], \val 29 - .endm 30 17 31 - .macro strb1 reg, ptr, val 32 - strb \reg, [\ptr], \val 33 - .endm 18 + #define L(label) .L ## label 34 19 35 - .macro ldrh1 reg, ptr, val 36 - ldrh \reg, [\ptr], \val 37 - .endm 20 + #define dstin x0 21 + #define src x1 22 + #define count x2 23 + #define dst x3 24 + #define srcend x4 25 + #define dstend x5 26 + #define A_l x6 27 + #define A_lw w6 28 + #define A_h x7 29 + #define B_l x8 30 + #define B_lw w8 31 + #define B_h x9 32 + #define C_l x10 33 + #define C_lw w10 34 + #define C_h x11 35 + #define D_l x12 36 + #define D_h x13 37 + #define E_l x14 38 + #define E_h x15 39 + #define F_l x16 40 + #define F_h x17 41 + #define G_l count 42 + #define G_h dst 43 + #define H_l src 44 + #define H_h srcend 45 + #define tmp1 x14 38 46 39 - .macro strh1 reg, ptr, val 40 - strh \reg, [\ptr], \val 41 - .endm 47 + /* This implementation handles overlaps and supports both memcpy and memmove 48 + from a single entry point. 
It uses unaligned accesses and branchless 49 + sequences to keep the code small, simple and improve performance. 42 50 43 - .macro ldr1 reg, ptr, val 44 - ldr \reg, [\ptr], \val 45 - .endm 51 + Copies are split into 3 main cases: small copies of up to 32 bytes, medium 52 + copies of up to 128 bytes, and large copies. The overhead of the overlap 53 + check is negligible since it is only required for large copies. 46 54 47 - .macro str1 reg, ptr, val 48 - str \reg, [\ptr], \val 49 - .endm 55 + Large copies use a software pipelined loop processing 64 bytes per iteration. 56 + The destination pointer is 16-byte aligned to minimize unaligned accesses. 57 + The loop tail is handled by always copying 64 bytes from the end. 58 + */ 50 59 51 - .macro ldp1 reg1, reg2, ptr, val 52 - ldp \reg1, \reg2, [\ptr], \val 53 - .endm 54 - 55 - .macro stp1 reg1, reg2, ptr, val 56 - stp \reg1, \reg2, [\ptr], \val 57 - .endm 58 - 60 + SYM_FUNC_START_ALIAS(__memmove) 61 + SYM_FUNC_START_WEAK_ALIAS_PI(memmove) 59 62 SYM_FUNC_START_ALIAS(__memcpy) 60 63 SYM_FUNC_START_WEAK_PI(memcpy) 61 - #include "copy_template.S" 64 + add srcend, src, count 65 + add dstend, dstin, count 66 + cmp count, 128 67 + b.hi L(copy_long) 68 + cmp count, 32 69 + b.hi L(copy32_128) 70 + 71 + /* Small copies: 0..32 bytes. */ 72 + cmp count, 16 73 + b.lo L(copy16) 74 + ldp A_l, A_h, [src] 75 + ldp D_l, D_h, [srcend, -16] 76 + stp A_l, A_h, [dstin] 77 + stp D_l, D_h, [dstend, -16] 62 78 ret 79 + 80 + /* Copy 8-15 bytes. */ 81 + L(copy16): 82 + tbz count, 3, L(copy8) 83 + ldr A_l, [src] 84 + ldr A_h, [srcend, -8] 85 + str A_l, [dstin] 86 + str A_h, [dstend, -8] 87 + ret 88 + 89 + .p2align 3 90 + /* Copy 4-7 bytes. */ 91 + L(copy8): 92 + tbz count, 2, L(copy4) 93 + ldr A_lw, [src] 94 + ldr B_lw, [srcend, -4] 95 + str A_lw, [dstin] 96 + str B_lw, [dstend, -4] 97 + ret 98 + 99 + /* Copy 0..3 bytes using a branchless sequence. 
*/ 100 + L(copy4): 101 + cbz count, L(copy0) 102 + lsr tmp1, count, 1 103 + ldrb A_lw, [src] 104 + ldrb C_lw, [srcend, -1] 105 + ldrb B_lw, [src, tmp1] 106 + strb A_lw, [dstin] 107 + strb B_lw, [dstin, tmp1] 108 + strb C_lw, [dstend, -1] 109 + L(copy0): 110 + ret 111 + 112 + .p2align 4 113 + /* Medium copies: 33..128 bytes. */ 114 + L(copy32_128): 115 + ldp A_l, A_h, [src] 116 + ldp B_l, B_h, [src, 16] 117 + ldp C_l, C_h, [srcend, -32] 118 + ldp D_l, D_h, [srcend, -16] 119 + cmp count, 64 120 + b.hi L(copy128) 121 + stp A_l, A_h, [dstin] 122 + stp B_l, B_h, [dstin, 16] 123 + stp C_l, C_h, [dstend, -32] 124 + stp D_l, D_h, [dstend, -16] 125 + ret 126 + 127 + .p2align 4 128 + /* Copy 65..128 bytes. */ 129 + L(copy128): 130 + ldp E_l, E_h, [src, 32] 131 + ldp F_l, F_h, [src, 48] 132 + cmp count, 96 133 + b.ls L(copy96) 134 + ldp G_l, G_h, [srcend, -64] 135 + ldp H_l, H_h, [srcend, -48] 136 + stp G_l, G_h, [dstend, -64] 137 + stp H_l, H_h, [dstend, -48] 138 + L(copy96): 139 + stp A_l, A_h, [dstin] 140 + stp B_l, B_h, [dstin, 16] 141 + stp E_l, E_h, [dstin, 32] 142 + stp F_l, F_h, [dstin, 48] 143 + stp C_l, C_h, [dstend, -32] 144 + stp D_l, D_h, [dstend, -16] 145 + ret 146 + 147 + .p2align 4 148 + /* Copy more than 128 bytes. */ 149 + L(copy_long): 150 + /* Use backwards copy if there is an overlap. */ 151 + sub tmp1, dstin, src 152 + cbz tmp1, L(copy0) 153 + cmp tmp1, count 154 + b.lo L(copy_long_backwards) 155 + 156 + /* Copy 16 bytes and then align dst to 16-byte alignment. */ 157 + 158 + ldp D_l, D_h, [src] 159 + and tmp1, dstin, 15 160 + bic dst, dstin, 15 161 + sub src, src, tmp1 162 + add count, count, tmp1 /* Count is now 16 too large. */ 163 + ldp A_l, A_h, [src, 16] 164 + stp D_l, D_h, [dstin] 165 + ldp B_l, B_h, [src, 32] 166 + ldp C_l, C_h, [src, 48] 167 + ldp D_l, D_h, [src, 64]! 168 + subs count, count, 128 + 16 /* Test and readjust count. 
*/ 169 + b.ls L(copy64_from_end) 170 + 171 + L(loop64): 172 + stp A_l, A_h, [dst, 16] 173 + ldp A_l, A_h, [src, 16] 174 + stp B_l, B_h, [dst, 32] 175 + ldp B_l, B_h, [src, 32] 176 + stp C_l, C_h, [dst, 48] 177 + ldp C_l, C_h, [src, 48] 178 + stp D_l, D_h, [dst, 64]! 179 + ldp D_l, D_h, [src, 64]! 180 + subs count, count, 64 181 + b.hi L(loop64) 182 + 183 + /* Write the last iteration and copy 64 bytes from the end. */ 184 + L(copy64_from_end): 185 + ldp E_l, E_h, [srcend, -64] 186 + stp A_l, A_h, [dst, 16] 187 + ldp A_l, A_h, [srcend, -48] 188 + stp B_l, B_h, [dst, 32] 189 + ldp B_l, B_h, [srcend, -32] 190 + stp C_l, C_h, [dst, 48] 191 + ldp C_l, C_h, [srcend, -16] 192 + stp D_l, D_h, [dst, 64] 193 + stp E_l, E_h, [dstend, -64] 194 + stp A_l, A_h, [dstend, -48] 195 + stp B_l, B_h, [dstend, -32] 196 + stp C_l, C_h, [dstend, -16] 197 + ret 198 + 199 + .p2align 4 200 + 201 + /* Large backwards copy for overlapping copies. 202 + Copy 16 bytes and then align dst to 16-byte alignment. */ 203 + L(copy_long_backwards): 204 + ldp D_l, D_h, [srcend, -16] 205 + and tmp1, dstend, 15 206 + sub srcend, srcend, tmp1 207 + sub count, count, tmp1 208 + ldp A_l, A_h, [srcend, -16] 209 + stp D_l, D_h, [dstend, -16] 210 + ldp B_l, B_h, [srcend, -32] 211 + ldp C_l, C_h, [srcend, -48] 212 + ldp D_l, D_h, [srcend, -64]! 213 + sub dstend, dstend, tmp1 214 + subs count, count, 128 215 + b.ls L(copy64_from_start) 216 + 217 + L(loop64_backwards): 218 + stp A_l, A_h, [dstend, -16] 219 + ldp A_l, A_h, [srcend, -16] 220 + stp B_l, B_h, [dstend, -32] 221 + ldp B_l, B_h, [srcend, -32] 222 + stp C_l, C_h, [dstend, -48] 223 + ldp C_l, C_h, [srcend, -48] 224 + stp D_l, D_h, [dstend, -64]! 225 + ldp D_l, D_h, [srcend, -64]! 226 + subs count, count, 64 227 + b.hi L(loop64_backwards) 228 + 229 + /* Write the last iteration and copy 64 bytes from the start. 
*/ 230 + L(copy64_from_start): 231 + ldp G_l, G_h, [src, 48] 232 + stp A_l, A_h, [dstend, -16] 233 + ldp A_l, A_h, [src, 32] 234 + stp B_l, B_h, [dstend, -32] 235 + ldp B_l, B_h, [src, 16] 236 + stp C_l, C_h, [dstend, -48] 237 + ldp C_l, C_h, [src] 238 + stp D_l, D_h, [dstend, -64] 239 + stp G_l, G_h, [dstin, 48] 240 + stp A_l, A_h, [dstin, 32] 241 + stp B_l, B_h, [dstin, 16] 242 + stp C_l, C_h, [dstin] 243 + ret 244 + 63 245 SYM_FUNC_END_PI(memcpy) 64 246 EXPORT_SYMBOL(memcpy) 65 247 SYM_FUNC_END_ALIAS(__memcpy) 66 248 EXPORT_SYMBOL(__memcpy) 249 + SYM_FUNC_END_ALIAS_PI(memmove) 250 + EXPORT_SYMBOL(memmove) 251 + SYM_FUNC_END_ALIAS(__memmove) 252 + EXPORT_SYMBOL(__memmove)
-189
arch/arm64/lib/memmove.S
··· 1 - /* SPDX-License-Identifier: GPL-2.0-only */ 2 - /* 3 - * Copyright (C) 2013 ARM Ltd. 4 - * Copyright (C) 2013 Linaro. 5 - * 6 - * This code is based on glibc cortex strings work originally authored by Linaro 7 - * be found @ 8 - * 9 - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 - * files/head:/src/aarch64/ 11 - */ 12 - 13 - #include <linux/linkage.h> 14 - #include <asm/assembler.h> 15 - #include <asm/cache.h> 16 - 17 - /* 18 - * Move a buffer from src to test (alignment handled by the hardware). 19 - * If dest <= src, call memcpy, otherwise copy in reverse order. 20 - * 21 - * Parameters: 22 - * x0 - dest 23 - * x1 - src 24 - * x2 - n 25 - * Returns: 26 - * x0 - dest 27 - */ 28 - dstin .req x0 29 - src .req x1 30 - count .req x2 31 - tmp1 .req x3 32 - tmp1w .req w3 33 - tmp2 .req x4 34 - tmp2w .req w4 35 - tmp3 .req x5 36 - tmp3w .req w5 37 - dst .req x6 38 - 39 - A_l .req x7 40 - A_h .req x8 41 - B_l .req x9 42 - B_h .req x10 43 - C_l .req x11 44 - C_h .req x12 45 - D_l .req x13 46 - D_h .req x14 47 - 48 - SYM_FUNC_START_ALIAS(__memmove) 49 - SYM_FUNC_START_WEAK_PI(memmove) 50 - cmp dstin, src 51 - b.lo __memcpy 52 - add tmp1, src, count 53 - cmp dstin, tmp1 54 - b.hs __memcpy /* No overlap. */ 55 - 56 - add dst, dstin, count 57 - add src, src, count 58 - cmp count, #16 59 - b.lo .Ltail15 /*probably non-alignment accesses.*/ 60 - 61 - ands tmp2, src, #15 /* Bytes to reach alignment. */ 62 - b.eq .LSrcAligned 63 - sub count, count, tmp2 64 - /* 65 - * process the aligned offset length to make the src aligned firstly. 66 - * those extra instructions' cost is acceptable. It also make the 67 - * coming accesses are based on aligned address. 68 - */ 69 - tbz tmp2, #0, 1f 70 - ldrb tmp1w, [src, #-1]! 71 - strb tmp1w, [dst, #-1]! 72 - 1: 73 - tbz tmp2, #1, 2f 74 - ldrh tmp1w, [src, #-2]! 75 - strh tmp1w, [dst, #-2]! 76 - 2: 77 - tbz tmp2, #2, 3f 78 - ldr tmp1w, [src, #-4]! 79 - str tmp1w, [dst, #-4]! 
80 - 3: 81 - tbz tmp2, #3, .LSrcAligned 82 - ldr tmp1, [src, #-8]! 83 - str tmp1, [dst, #-8]! 84 - 85 - .LSrcAligned: 86 - cmp count, #64 87 - b.ge .Lcpy_over64 88 - 89 - /* 90 - * Deal with small copies quickly by dropping straight into the 91 - * exit block. 92 - */ 93 - .Ltail63: 94 - /* 95 - * Copy up to 48 bytes of data. At this point we only need the 96 - * bottom 6 bits of count to be accurate. 97 - */ 98 - ands tmp1, count, #0x30 99 - b.eq .Ltail15 100 - cmp tmp1w, #0x20 101 - b.eq 1f 102 - b.lt 2f 103 - ldp A_l, A_h, [src, #-16]! 104 - stp A_l, A_h, [dst, #-16]! 105 - 1: 106 - ldp A_l, A_h, [src, #-16]! 107 - stp A_l, A_h, [dst, #-16]! 108 - 2: 109 - ldp A_l, A_h, [src, #-16]! 110 - stp A_l, A_h, [dst, #-16]! 111 - 112 - .Ltail15: 113 - tbz count, #3, 1f 114 - ldr tmp1, [src, #-8]! 115 - str tmp1, [dst, #-8]! 116 - 1: 117 - tbz count, #2, 2f 118 - ldr tmp1w, [src, #-4]! 119 - str tmp1w, [dst, #-4]! 120 - 2: 121 - tbz count, #1, 3f 122 - ldrh tmp1w, [src, #-2]! 123 - strh tmp1w, [dst, #-2]! 124 - 3: 125 - tbz count, #0, .Lexitfunc 126 - ldrb tmp1w, [src, #-1] 127 - strb tmp1w, [dst, #-1] 128 - 129 - .Lexitfunc: 130 - ret 131 - 132 - .Lcpy_over64: 133 - subs count, count, #128 134 - b.ge .Lcpy_body_large 135 - /* 136 - * Less than 128 bytes to copy, so handle 64 bytes here and then jump 137 - * to the tail. 138 - */ 139 - ldp A_l, A_h, [src, #-16] 140 - stp A_l, A_h, [dst, #-16] 141 - ldp B_l, B_h, [src, #-32] 142 - ldp C_l, C_h, [src, #-48] 143 - stp B_l, B_h, [dst, #-32] 144 - stp C_l, C_h, [dst, #-48] 145 - ldp D_l, D_h, [src, #-64]! 146 - stp D_l, D_h, [dst, #-64]! 147 - 148 - tst count, #0x3f 149 - b.ne .Ltail63 150 - ret 151 - 152 - /* 153 - * Critical loop. Start at a new cache line boundary. Assuming 154 - * 64 bytes per line this ensures the entire loop is in one line. 155 - */ 156 - .p2align L1_CACHE_SHIFT 157 - .Lcpy_body_large: 158 - /* pre-load 64 bytes data. 
*/ 159 - ldp A_l, A_h, [src, #-16] 160 - ldp B_l, B_h, [src, #-32] 161 - ldp C_l, C_h, [src, #-48] 162 - ldp D_l, D_h, [src, #-64]! 163 - 1: 164 - /* 165 - * interlace the load of next 64 bytes data block with store of the last 166 - * loaded 64 bytes data. 167 - */ 168 - stp A_l, A_h, [dst, #-16] 169 - ldp A_l, A_h, [src, #-16] 170 - stp B_l, B_h, [dst, #-32] 171 - ldp B_l, B_h, [src, #-32] 172 - stp C_l, C_h, [dst, #-48] 173 - ldp C_l, C_h, [src, #-48] 174 - stp D_l, D_h, [dst, #-64]! 175 - ldp D_l, D_h, [src, #-64]! 176 - subs count, count, #64 177 - b.ge 1b 178 - stp A_l, A_h, [dst, #-16] 179 - stp B_l, B_h, [dst, #-32] 180 - stp C_l, C_h, [dst, #-48] 181 - stp D_l, D_h, [dst, #-64]! 182 - 183 - tst count, #0x3f 184 - b.ne .Ltail63 185 - ret 186 - SYM_FUNC_END_PI(memmove) 187 - EXPORT_SYMBOL(memmove) 188 - SYM_FUNC_END_ALIAS(__memmove) 189 - EXPORT_SYMBOL(__memmove)