/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
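/*
 * Note: ldrb1/strb1, ldrh1/strh1, ldr1/str1, ldp1/stp1 (and cpy1) below are
 * not instructions but macros expected to be supplied by the file that
 * includes this template, so the same body can be reused for the plain and
 * the uaccess copy routines.
 */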
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

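/*
 * A..D hold four 16-byte pairs, i.e. the 64 bytes kept in flight by the
 * main copy loop below.
 */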
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

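	/* Advance dst during the copy; dstin (x0) stays intact as the return value. */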
	mov	dst, dstin

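/*
 * If the CPU implements FEAT_MOPS, the cpy1 macro is expected to expand to a
 * CPY*-style prologue/main/epilogue sequence and the hardware performs the
 * whole copy; otherwise fall through to the software copy below.
 */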
#ifdef CONFIG_AS_HAS_MOPS
alternative_if_not ARM64_HAS_MOPS
	b	.Lno_mops
alternative_else_nop_endif
	cpy1	dst, src, count
	b	.Lexitfunc
.Lno_mops:
#endif

	cmp	count, #16
	/* If the length is less than 16 bytes, copy it with small, possibly unaligned accesses. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order. This eliminates the risk of overwriting source data when
	 * the distance between src and dst is less than 16. Each access
	 * here is naturally aligned: copying in increasing size order
	 * (1, 2, 4, 8) aligns src a little more at every step.
	 */
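	/*
	 * For example, if src is 5 bytes past a 16-byte boundary then
	 * tmp2 == 11 (0b1011): copy 1 byte, then 2, skip the 4, then 8.
	 */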
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
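	/*
	 * tmp1 == 0x30: fall through all three pairs (48 bytes);
	 * tmp1 == 0x20: enter at 1f (32 bytes);
	 * tmp1 == 0x10: enter at 2f (16 bytes).
	 */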
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to split one ldp/stp into several smaller loads/stores in
	 * increasing address order, rather than loading/storing 16 bytes
	 * from (src-16) to (dst-16) after walking src back to an aligned
	 * address, as the original cortex strings memcpy does. Keeping the
	 * original scheme here would force memmove to satisfy the
	 * precondition that src is at least 16 bytes above dst, otherwise
	 * some source data would be overwritten when memmove calls memcpy
	 * directly. To keep memmove simple and to decouple memcpy from
	 * memmove, that scheme was dropped.
	 */
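	/*
	 * Bits 3..0 of count select 8-, 4-, 2- and 1-byte copies, e.g.
	 * count == 13 (0b1101) copies 8 + 4 + 1 bytes.
	 */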
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

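	/*
	 * count is now (original count - 128) and thus negative, but its
	 * low 6 bits still equal the number of bytes left to copy (0..63),
	 * which is all that .Ltail63 looks at.
	 */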
	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Preload the first 64 bytes. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave loads of the next 64-byte block with stores of the
	 * previous one.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
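	/* Drain the 64 bytes loaded by the final loop iteration. */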
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: