Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arm64: Change memcpy in kernel to use the copy template file

This converts memcpy.S to use the copy template file. The copy
template file was originally based on memcpy.S.

Signed-off-by: Feng Kan <fkan@apm.com>
Signed-off-by: Balamurugan Shanmugam <bshanmugam@apm.com>
[catalin.marinas@arm.com: removed tmp3(w) .req statements as they are not used]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

authored by

Feng Kan and committed by
Catalin Marinas
e5c88e3f efa773fe

+224 -158
+193
arch/arm64/lib/copy_template.S
··· 1 + /* 2 + * Copyright (C) 2013 ARM Ltd. 3 + * Copyright (C) 2013 Linaro. 4 + * 5 + * This code is based on glibc cortex strings work originally authored by Linaro 6 + * and re-licensed under GPLv2 for the Linux kernel. The original code can 7 + * be found @ 8 + * 9 + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 + * files/head:/src/aarch64/ 11 + * 12 + * This program is free software; you can redistribute it and/or modify 13 + * it under the terms of the GNU General Public License version 2 as 14 + * published by the Free Software Foundation. 15 + * 16 + * This program is distributed in the hope that it will be useful, 17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 + * GNU General Public License for more details. 20 + * 21 + * You should have received a copy of the GNU General Public License 22 + * along with this program. If not, see <http://www.gnu.org/licenses/>. 23 + */ 24 + 25 + 26 + /* 27 + * Copy a buffer from src to dest (alignment handled by the hardware) 28 + * 29 + * Parameters: 30 + * x0 - dest 31 + * x1 - src 32 + * x2 - n 33 + * Returns: 34 + * x0 - dest 35 + */ 36 + dstin .req x0 37 + src .req x1 38 + count .req x2 39 + tmp1 .req x3 40 + tmp1w .req w3 41 + tmp2 .req x4 42 + tmp2w .req w4 43 + dst .req x6 44 + 45 + A_l .req x7 46 + A_h .req x8 47 + B_l .req x9 48 + B_h .req x10 49 + C_l .req x11 50 + C_h .req x12 51 + D_l .req x13 52 + D_h .req x14 53 + 54 + mov dst, dstin 55 + cmp count, #16 56 + /*When memory length is less than 16, the accessed are not aligned.*/ 57 + b.lo .Ltiny15 58 + 59 + neg tmp2, src 60 + ands tmp2, tmp2, #15/* Bytes to reach alignment. 
*/ 61 + b.eq .LSrcAligned 62 + sub count, count, tmp2 63 + /* 64 + * Copy the leading memory data from src to dst in an increasing 65 + * address order.By this way,the risk of overwritting the source 66 + * memory data is eliminated when the distance between src and 67 + * dst is less than 16. The memory accesses here are alignment. 68 + */ 69 + tbz tmp2, #0, 1f 70 + ldrb1 tmp1w, src, #1 71 + strb1 tmp1w, dst, #1 72 + 1: 73 + tbz tmp2, #1, 2f 74 + ldrh1 tmp1w, src, #2 75 + strh1 tmp1w, dst, #2 76 + 2: 77 + tbz tmp2, #2, 3f 78 + ldr1 tmp1w, src, #4 79 + str1 tmp1w, dst, #4 80 + 3: 81 + tbz tmp2, #3, .LSrcAligned 82 + ldr1 tmp1, src, #8 83 + str1 tmp1, dst, #8 84 + 85 + .LSrcAligned: 86 + cmp count, #64 87 + b.ge .Lcpy_over64 88 + /* 89 + * Deal with small copies quickly by dropping straight into the 90 + * exit block. 91 + */ 92 + .Ltail63: 93 + /* 94 + * Copy up to 48 bytes of data. At this point we only need the 95 + * bottom 6 bits of count to be accurate. 96 + */ 97 + ands tmp1, count, #0x30 98 + b.eq .Ltiny15 99 + cmp tmp1w, #0x20 100 + b.eq 1f 101 + b.lt 2f 102 + ldp1 A_l, A_h, src, #16 103 + stp1 A_l, A_h, dst, #16 104 + 1: 105 + ldp1 A_l, A_h, src, #16 106 + stp1 A_l, A_h, dst, #16 107 + 2: 108 + ldp1 A_l, A_h, src, #16 109 + stp1 A_l, A_h, dst, #16 110 + .Ltiny15: 111 + /* 112 + * Prefer to break one ldp/stp into several load/store to access 113 + * memory in an increasing address order,rather than to load/store 16 114 + * bytes from (src-16) to (dst-16) and to backward the src to aligned 115 + * address,which way is used in original cortex memcpy. If keeping 116 + * the original memcpy process here, memmove need to satisfy the 117 + * precondition that src address is at least 16 bytes bigger than dst 118 + * address,otherwise some source data will be overwritten when memove 119 + * call memcpy directly. To make memmove simpler and decouple the 120 + * memcpy's dependency on memmove, withdrew the original process. 
121 + */ 122 + tbz count, #3, 1f 123 + ldr1 tmp1, src, #8 124 + str1 tmp1, dst, #8 125 + 1: 126 + tbz count, #2, 2f 127 + ldr1 tmp1w, src, #4 128 + str1 tmp1w, dst, #4 129 + 2: 130 + tbz count, #1, 3f 131 + ldrh1 tmp1w, src, #2 132 + strh1 tmp1w, dst, #2 133 + 3: 134 + tbz count, #0, .Lexitfunc 135 + ldrb1 tmp1w, src, #1 136 + strb1 tmp1w, dst, #1 137 + 138 + b .Lexitfunc 139 + 140 + .Lcpy_over64: 141 + subs count, count, #128 142 + b.ge .Lcpy_body_large 143 + /* 144 + * Less than 128 bytes to copy, so handle 64 here and then jump 145 + * to the tail. 146 + */ 147 + ldp1 A_l, A_h, src, #16 148 + stp1 A_l, A_h, dst, #16 149 + ldp1 B_l, B_h, src, #16 150 + ldp1 C_l, C_h, src, #16 151 + stp1 B_l, B_h, dst, #16 152 + stp1 C_l, C_h, dst, #16 153 + ldp1 D_l, D_h, src, #16 154 + stp1 D_l, D_h, dst, #16 155 + 156 + tst count, #0x3f 157 + b.ne .Ltail63 158 + b .Lexitfunc 159 + 160 + /* 161 + * Critical loop. Start at a new cache line boundary. Assuming 162 + * 64 bytes per line this ensures the entire loop is in one line. 163 + */ 164 + .p2align L1_CACHE_SHIFT 165 + .Lcpy_body_large: 166 + /* pre-get 64 bytes data. */ 167 + ldp1 A_l, A_h, src, #16 168 + ldp1 B_l, B_h, src, #16 169 + ldp1 C_l, C_h, src, #16 170 + ldp1 D_l, D_h, src, #16 171 + 1: 172 + /* 173 + * interlace the load of next 64 bytes data block with store of the last 174 + * loaded 64 bytes data. 175 + */ 176 + stp1 A_l, A_h, dst, #16 177 + ldp1 A_l, A_h, src, #16 178 + stp1 B_l, B_h, dst, #16 179 + ldp1 B_l, B_h, src, #16 180 + stp1 C_l, C_h, dst, #16 181 + ldp1 C_l, C_h, src, #16 182 + stp1 D_l, D_h, dst, #16 183 + ldp1 D_l, D_h, src, #16 184 + subs count, count, #64 185 + b.ge 1b 186 + stp1 A_l, A_h, dst, #16 187 + stp1 B_l, B_h, dst, #16 188 + stp1 C_l, C_h, dst, #16 189 + stp1 D_l, D_h, dst, #16 190 + 191 + tst count, #0x3f 192 + b.ne .Ltail63 193 + .Lexitfunc:
+31 -158
arch/arm64/lib/memcpy.S
··· 36 36 * Returns: 37 37 * x0 - dest 38 38 */ 39 - dstin .req x0 40 - src .req x1 41 - count .req x2 42 - tmp1 .req x3 43 - tmp1w .req w3 44 - tmp2 .req x4 45 - tmp2w .req w4 46 - tmp3 .req x5 47 - tmp3w .req w5 48 - dst .req x6 39 + .macro ldrb1 ptr, regB, val 40 + ldrb \ptr, [\regB], \val 41 + .endm 49 42 50 - A_l .req x7 51 - A_h .req x8 52 - B_l .req x9 53 - B_h .req x10 54 - C_l .req x11 55 - C_h .req x12 56 - D_l .req x13 57 - D_h .req x14 43 + .macro strb1 ptr, regB, val 44 + strb \ptr, [\regB], \val 45 + .endm 46 + 47 + .macro ldrh1 ptr, regB, val 48 + ldrh \ptr, [\regB], \val 49 + .endm 50 + 51 + .macro strh1 ptr, regB, val 52 + strh \ptr, [\regB], \val 53 + .endm 54 + 55 + .macro ldr1 ptr, regB, val 56 + ldr \ptr, [\regB], \val 57 + .endm 58 + 59 + .macro str1 ptr, regB, val 60 + str \ptr, [\regB], \val 61 + .endm 62 + 63 + .macro ldp1 ptr, regB, regC, val 64 + ldp \ptr, \regB, [\regC], \val 65 + .endm 66 + 67 + .macro stp1 ptr, regB, regC, val 68 + stp \ptr, \regB, [\regC], \val 69 + .endm 58 70 59 71 ENTRY(memcpy) 60 - mov dst, dstin 61 - cmp count, #16 62 - /*When memory length is less than 16, the accessed are not aligned.*/ 63 - b.lo .Ltiny15 64 - 65 - neg tmp2, src 66 - ands tmp2, tmp2, #15/* Bytes to reach alignment. */ 67 - b.eq .LSrcAligned 68 - sub count, count, tmp2 69 - /* 70 - * Copy the leading memory data from src to dst in an increasing 71 - * address order.By this way,the risk of overwritting the source 72 - * memory data is eliminated when the distance between src and 73 - * dst is less than 16. The memory accesses here are alignment. 
74 - */ 75 - tbz tmp2, #0, 1f 76 - ldrb tmp1w, [src], #1 77 - strb tmp1w, [dst], #1 78 - 1: 79 - tbz tmp2, #1, 2f 80 - ldrh tmp1w, [src], #2 81 - strh tmp1w, [dst], #2 82 - 2: 83 - tbz tmp2, #2, 3f 84 - ldr tmp1w, [src], #4 85 - str tmp1w, [dst], #4 86 - 3: 87 - tbz tmp2, #3, .LSrcAligned 88 - ldr tmp1, [src],#8 89 - str tmp1, [dst],#8 90 - 91 - .LSrcAligned: 92 - cmp count, #64 93 - b.ge .Lcpy_over64 94 - /* 95 - * Deal with small copies quickly by dropping straight into the 96 - * exit block. 97 - */ 98 - .Ltail63: 99 - /* 100 - * Copy up to 48 bytes of data. At this point we only need the 101 - * bottom 6 bits of count to be accurate. 102 - */ 103 - ands tmp1, count, #0x30 104 - b.eq .Ltiny15 105 - cmp tmp1w, #0x20 106 - b.eq 1f 107 - b.lt 2f 108 - ldp A_l, A_h, [src], #16 109 - stp A_l, A_h, [dst], #16 110 - 1: 111 - ldp A_l, A_h, [src], #16 112 - stp A_l, A_h, [dst], #16 113 - 2: 114 - ldp A_l, A_h, [src], #16 115 - stp A_l, A_h, [dst], #16 116 - .Ltiny15: 117 - /* 118 - * Prefer to break one ldp/stp into several load/store to access 119 - * memory in an increasing address order,rather than to load/store 16 120 - * bytes from (src-16) to (dst-16) and to backward the src to aligned 121 - * address,which way is used in original cortex memcpy. If keeping 122 - * the original memcpy process here, memmove need to satisfy the 123 - * precondition that src address is at least 16 bytes bigger than dst 124 - * address,otherwise some source data will be overwritten when memove 125 - * call memcpy directly. To make memmove simpler and decouple the 126 - * memcpy's dependency on memmove, withdrew the original process. 
127 - */ 128 - tbz count, #3, 1f 129 - ldr tmp1, [src], #8 130 - str tmp1, [dst], #8 131 - 1: 132 - tbz count, #2, 2f 133 - ldr tmp1w, [src], #4 134 - str tmp1w, [dst], #4 135 - 2: 136 - tbz count, #1, 3f 137 - ldrh tmp1w, [src], #2 138 - strh tmp1w, [dst], #2 139 - 3: 140 - tbz count, #0, .Lexitfunc 141 - ldrb tmp1w, [src] 142 - strb tmp1w, [dst] 143 - 144 - .Lexitfunc: 145 - ret 146 - 147 - .Lcpy_over64: 148 - subs count, count, #128 149 - b.ge .Lcpy_body_large 150 - /* 151 - * Less than 128 bytes to copy, so handle 64 here and then jump 152 - * to the tail. 153 - */ 154 - ldp A_l, A_h, [src],#16 155 - stp A_l, A_h, [dst],#16 156 - ldp B_l, B_h, [src],#16 157 - ldp C_l, C_h, [src],#16 158 - stp B_l, B_h, [dst],#16 159 - stp C_l, C_h, [dst],#16 160 - ldp D_l, D_h, [src],#16 161 - stp D_l, D_h, [dst],#16 162 - 163 - tst count, #0x3f 164 - b.ne .Ltail63 165 - ret 166 - 167 - /* 168 - * Critical loop. Start at a new cache line boundary. Assuming 169 - * 64 bytes per line this ensures the entire loop is in one line. 170 - */ 171 - .p2align L1_CACHE_SHIFT 172 - .Lcpy_body_large: 173 - /* pre-get 64 bytes data. */ 174 - ldp A_l, A_h, [src],#16 175 - ldp B_l, B_h, [src],#16 176 - ldp C_l, C_h, [src],#16 177 - ldp D_l, D_h, [src],#16 178 - 1: 179 - /* 180 - * interlace the load of next 64 bytes data block with store of the last 181 - * loaded 64 bytes data. 182 - */ 183 - stp A_l, A_h, [dst],#16 184 - ldp A_l, A_h, [src],#16 185 - stp B_l, B_h, [dst],#16 186 - ldp B_l, B_h, [src],#16 187 - stp C_l, C_h, [dst],#16 188 - ldp C_l, C_h, [src],#16 189 - stp D_l, D_h, [dst],#16 190 - ldp D_l, D_h, [src],#16 191 - subs count, count, #64 192 - b.ge 1b 193 - stp A_l, A_h, [dst],#16 194 - stp B_l, B_h, [dst],#16 195 - stp C_l, C_h, [dst],#16 196 - stp D_l, D_h, [dst],#16 197 - 198 - tst count, #0x3f 199 - b.ne .Ltail63 72 + #include "copy_template.S" 200 73 ret 201 74 ENDPROC(memcpy)