Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arm64: Change memcpy in kernel to use the copy template file

This converts memcpy.S to use the copy template file. The copy
template file was originally based on memcpy.S.

Signed-off-by: Feng Kan <fkan@apm.com>
Signed-off-by: Balamurugan Shanmugam <bshanmugam@apm.com>
[catalin.marinas@arm.com: removed tmp3(w) .req statements as they are not used]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

authored by

Feng Kan and committed by
Catalin Marinas
e5c88e3f efa773fe

+224 -158
+193
arch/arm64/lib/copy_template.S
··· 1 + /* 2 + * Copyright (C) 2013 ARM Ltd. 3 + * Copyright (C) 2013 Linaro. 4 + * 5 + * This code is based on glibc cortex strings work originally authored by Linaro 6 + * and re-licensed under GPLv2 for the Linux kernel. The original code can 7 + * be found @ 8 + * 9 + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 + * files/head:/src/aarch64/ 11 + * 12 + * This program is free software; you can redistribute it and/or modify 13 + * it under the terms of the GNU General Public License version 2 as 14 + * published by the Free Software Foundation. 15 + * 16 + * This program is distributed in the hope that it will be useful, 17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 + * GNU General Public License for more details. 20 + * 21 + * You should have received a copy of the GNU General Public License 22 + * along with this program. If not, see <http://www.gnu.org/licenses/>. 23 + */ 24 + 25 + 26 + /* 27 + * Copy a buffer from src to dest (alignment handled by the hardware) 28 + * 29 + * Parameters: 30 + * x0 - dest 31 + * x1 - src 32 + * x2 - n 33 + * Returns: 34 + * x0 - dest 35 + */ 36 + dstin .req x0 37 + src .req x1 38 + count .req x2 39 + tmp1 .req x3 40 + tmp1w .req w3 41 + tmp2 .req x4 42 + tmp2w .req w4 43 + dst .req x6 44 + 45 + A_l .req x7 46 + A_h .req x8 47 + B_l .req x9 48 + B_h .req x10 49 + C_l .req x11 50 + C_h .req x12 51 + D_l .req x13 52 + D_h .req x14 53 + 54 + mov dst, dstin 55 + cmp count, #16 56 + /*When memory length is less than 16, the accessed are not aligned.*/ 57 + b.lo .Ltiny15 58 + 59 + neg tmp2, src 60 + ands tmp2, tmp2, #15/* Bytes to reach alignment. 
*/ 61 + b.eq .LSrcAligned 62 + sub count, count, tmp2 63 + /* 64 + * Copy the leading memory data from src to dst in an increasing 65 + * address order.By this way,the risk of overwritting the source 66 + * memory data is eliminated when the distance between src and 67 + * dst is less than 16. The memory accesses here are alignment. 68 + */ 69 + tbz tmp2, #0, 1f 70 + ldrb1 tmp1w, src, #1 71 + strb1 tmp1w, dst, #1 72 + 1: 73 + tbz tmp2, #1, 2f 74 + ldrh1 tmp1w, src, #2 75 + strh1 tmp1w, dst, #2 76 + 2: 77 + tbz tmp2, #2, 3f 78 + ldr1 tmp1w, src, #4 79 + str1 tmp1w, dst, #4 80 + 3: 81 + tbz tmp2, #3, .LSrcAligned 82 + ldr1 tmp1, src, #8 83 + str1 tmp1, dst, #8 84 + 85 + .LSrcAligned: 86 + cmp count, #64 87 + b.ge .Lcpy_over64 88 + /* 89 + * Deal with small copies quickly by dropping straight into the 90 + * exit block. 91 + */ 92 + .Ltail63: 93 + /* 94 + * Copy up to 48 bytes of data. At this point we only need the 95 + * bottom 6 bits of count to be accurate. 96 + */ 97 + ands tmp1, count, #0x30 98 + b.eq .Ltiny15 99 + cmp tmp1w, #0x20 100 + b.eq 1f 101 + b.lt 2f 102 + ldp1 A_l, A_h, src, #16 103 + stp1 A_l, A_h, dst, #16 104 + 1: 105 + ldp1 A_l, A_h, src, #16 106 + stp1 A_l, A_h, dst, #16 107 + 2: 108 + ldp1 A_l, A_h, src, #16 109 + stp1 A_l, A_h, dst, #16 110 + .Ltiny15: 111 + /* 112 + * Prefer to break one ldp/stp into several load/store to access 113 + * memory in an increasing address order,rather than to load/store 16 114 + * bytes from (src-16) to (dst-16) and to backward the src to aligned 115 + * address,which way is used in original cortex memcpy. If keeping 116 + * the original memcpy process here, memmove need to satisfy the 117 + * precondition that src address is at least 16 bytes bigger than dst 118 + * address,otherwise some source data will be overwritten when memove 119 + * call memcpy directly. To make memmove simpler and decouple the 120 + * memcpy's dependency on memmove, withdrew the original process. 
121 + */ 122 + tbz count, #3, 1f 123 + ldr1 tmp1, src, #8 124 + str1 tmp1, dst, #8 125 + 1: 126 + tbz count, #2, 2f 127 + ldr1 tmp1w, src, #4 128 + str1 tmp1w, dst, #4 129 + 2: 130 + tbz count, #1, 3f 131 + ldrh1 tmp1w, src, #2 132 + strh1 tmp1w, dst, #2 133 + 3: 134 + tbz count, #0, .Lexitfunc 135 + ldrb1 tmp1w, src, #1 136 + strb1 tmp1w, dst, #1 137 + 138 + b .Lexitfunc 139 + 140 + .Lcpy_over64: 141 + subs count, count, #128 142 + b.ge .Lcpy_body_large 143 + /* 144 + * Less than 128 bytes to copy, so handle 64 here and then jump 145 + * to the tail. 146 + */ 147 + ldp1 A_l, A_h, src, #16 148 + stp1 A_l, A_h, dst, #16 149 + ldp1 B_l, B_h, src, #16 150 + ldp1 C_l, C_h, src, #16 151 + stp1 B_l, B_h, dst, #16 152 + stp1 C_l, C_h, dst, #16 153 + ldp1 D_l, D_h, src, #16 154 + stp1 D_l, D_h, dst, #16 155 + 156 + tst count, #0x3f 157 + b.ne .Ltail63 158 + b .Lexitfunc 159 + 160 + /* 161 + * Critical loop. Start at a new cache line boundary. Assuming 162 + * 64 bytes per line this ensures the entire loop is in one line. 163 + */ 164 + .p2align L1_CACHE_SHIFT 165 + .Lcpy_body_large: 166 + /* pre-get 64 bytes data. */ 167 + ldp1 A_l, A_h, src, #16 168 + ldp1 B_l, B_h, src, #16 169 + ldp1 C_l, C_h, src, #16 170 + ldp1 D_l, D_h, src, #16 171 + 1: 172 + /* 173 + * interlace the load of next 64 bytes data block with store of the last 174 + * loaded 64 bytes data. 175 + */ 176 + stp1 A_l, A_h, dst, #16 177 + ldp1 A_l, A_h, src, #16 178 + stp1 B_l, B_h, dst, #16 179 + ldp1 B_l, B_h, src, #16 180 + stp1 C_l, C_h, dst, #16 181 + ldp1 C_l, C_h, src, #16 182 + stp1 D_l, D_h, dst, #16 183 + ldp1 D_l, D_h, src, #16 184 + subs count, count, #64 185 + b.ge 1b 186 + stp1 A_l, A_h, dst, #16 187 + stp1 B_l, B_h, dst, #16 188 + stp1 C_l, C_h, dst, #16 189 + stp1 D_l, D_h, dst, #16 190 + 191 + tst count, #0x3f 192 + b.ne .Ltail63 193 + .Lexitfunc:
+31 -158
arch/arm64/lib/memcpy.S
··· 36 36 * Returns: 37 37 * x0 - dest 38 38 */ 39 - dstin .req x0 40 - src .req x1 41 - count .req x2 42 - tmp1 .req x3 43 - tmp1w .req w3 44 - tmp2 .req x4 45 - tmp2w .req w4 46 - tmp3 .req x5 47 - tmp3w .req w5 48 - dst .req x6 39 + .macro ldrb1 ptr, regB, val 40 + ldrb \ptr, [\regB], \val 41 + .endm 49 42 50 - A_l .req x7 51 - A_h .req x8 52 - B_l .req x9 53 - B_h .req x10 54 - C_l .req x11 55 - C_h .req x12 56 - D_l .req x13 57 - D_h .req x14 43 + .macro strb1 ptr, regB, val 44 + strb \ptr, [\regB], \val 45 + .endm 46 + 47 + .macro ldrh1 ptr, regB, val 48 + ldrh \ptr, [\regB], \val 49 + .endm 50 + 51 + .macro strh1 ptr, regB, val 52 + strh \ptr, [\regB], \val 53 + .endm 54 + 55 + .macro ldr1 ptr, regB, val 56 + ldr \ptr, [\regB], \val 57 + .endm 58 + 59 + .macro str1 ptr, regB, val 60 + str \ptr, [\regB], \val 61 + .endm 62 + 63 + .macro ldp1 ptr, regB, regC, val 64 + ldp \ptr, \regB, [\regC], \val 65 + .endm 66 + 67 + .macro stp1 ptr, regB, regC, val 68 + stp \ptr, \regB, [\regC], \val 69 + .endm 58 70 59 71 ENTRY(memcpy) 60 - mov dst, dstin 61 - cmp count, #16 62 - /*When memory length is less than 16, the accessed are not aligned.*/ 63 - b.lo .Ltiny15 64 - 65 - neg tmp2, src 66 - ands tmp2, tmp2, #15/* Bytes to reach alignment. */ 67 - b.eq .LSrcAligned 68 - sub count, count, tmp2 69 - /* 70 - * Copy the leading memory data from src to dst in an increasing 71 - * address order.By this way,the risk of overwritting the source 72 - * memory data is eliminated when the distance between src and 73 - * dst is less than 16. The memory accesses here are alignment. 
74 - */ 75 - tbz tmp2, #0, 1f 76 - ldrb tmp1w, [src], #1 77 - strb tmp1w, [dst], #1 78 - 1: 79 - tbz tmp2, #1, 2f 80 - ldrh tmp1w, [src], #2 81 - strh tmp1w, [dst], #2 82 - 2: 83 - tbz tmp2, #2, 3f 84 - ldr tmp1w, [src], #4 85 - str tmp1w, [dst], #4 86 - 3: 87 - tbz tmp2, #3, .LSrcAligned 88 - ldr tmp1, [src],#8 89 - str tmp1, [dst],#8 90 - 91 - .LSrcAligned: 92 - cmp count, #64 93 - b.ge .Lcpy_over64 94 - /* 95 - * Deal with small copies quickly by dropping straight into the 96 - * exit block. 97 - */ 98 - .Ltail63: 99 - /* 100 - * Copy up to 48 bytes of data. At this point we only need the 101 - * bottom 6 bits of count to be accurate. 102 - */ 103 - ands tmp1, count, #0x30 104 - b.eq .Ltiny15 105 - cmp tmp1w, #0x20 106 - b.eq 1f 107 - b.lt 2f 108 - ldp A_l, A_h, [src], #16 109 - stp A_l, A_h, [dst], #16 110 - 1: 111 - ldp A_l, A_h, [src], #16 112 - stp A_l, A_h, [dst], #16 113 - 2: 114 - ldp A_l, A_h, [src], #16 115 - stp A_l, A_h, [dst], #16 116 - .Ltiny15: 117 - /* 118 - * Prefer to break one ldp/stp into several load/store to access 119 - * memory in an increasing address order,rather than to load/store 16 120 - * bytes from (src-16) to (dst-16) and to backward the src to aligned 121 - * address,which way is used in original cortex memcpy. If keeping 122 - * the original memcpy process here, memmove need to satisfy the 123 - * precondition that src address is at least 16 bytes bigger than dst 124 - * address,otherwise some source data will be overwritten when memove 125 - * call memcpy directly. To make memmove simpler and decouple the 126 - * memcpy's dependency on memmove, withdrew the original process. 
127 - */ 128 - tbz count, #3, 1f 129 - ldr tmp1, [src], #8 130 - str tmp1, [dst], #8 131 - 1: 132 - tbz count, #2, 2f 133 - ldr tmp1w, [src], #4 134 - str tmp1w, [dst], #4 135 - 2: 136 - tbz count, #1, 3f 137 - ldrh tmp1w, [src], #2 138 - strh tmp1w, [dst], #2 139 - 3: 140 - tbz count, #0, .Lexitfunc 141 - ldrb tmp1w, [src] 142 - strb tmp1w, [dst] 143 - 144 - .Lexitfunc: 145 - ret 146 - 147 - .Lcpy_over64: 148 - subs count, count, #128 149 - b.ge .Lcpy_body_large 150 - /* 151 - * Less than 128 bytes to copy, so handle 64 here and then jump 152 - * to the tail. 153 - */ 154 - ldp A_l, A_h, [src],#16 155 - stp A_l, A_h, [dst],#16 156 - ldp B_l, B_h, [src],#16 157 - ldp C_l, C_h, [src],#16 158 - stp B_l, B_h, [dst],#16 159 - stp C_l, C_h, [dst],#16 160 - ldp D_l, D_h, [src],#16 161 - stp D_l, D_h, [dst],#16 162 - 163 - tst count, #0x3f 164 - b.ne .Ltail63 165 - ret 166 - 167 - /* 168 - * Critical loop. Start at a new cache line boundary. Assuming 169 - * 64 bytes per line this ensures the entire loop is in one line. 170 - */ 171 - .p2align L1_CACHE_SHIFT 172 - .Lcpy_body_large: 173 - /* pre-get 64 bytes data. */ 174 - ldp A_l, A_h, [src],#16 175 - ldp B_l, B_h, [src],#16 176 - ldp C_l, C_h, [src],#16 177 - ldp D_l, D_h, [src],#16 178 - 1: 179 - /* 180 - * interlace the load of next 64 bytes data block with store of the last 181 - * loaded 64 bytes data. 182 - */ 183 - stp A_l, A_h, [dst],#16 184 - ldp A_l, A_h, [src],#16 185 - stp B_l, B_h, [dst],#16 186 - ldp B_l, B_h, [src],#16 187 - stp C_l, C_h, [dst],#16 188 - ldp C_l, C_h, [src],#16 189 - stp D_l, D_h, [dst],#16 190 - ldp D_l, D_h, [src],#16 191 - subs count, count, #64 192 - b.ge 1b 193 - stp A_l, A_h, [dst],#16 194 - stp B_l, B_h, [dst],#16 195 - stp C_l, C_h, [dst],#16 196 - stp D_l, D_h, [dst],#16 197 - 198 - tst count, #0x3f 199 - b.ne .Ltail63 72 + #include "copy_template.S" 200 73 ret 201 74 ENDPROC(memcpy)