/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
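/*
 * Note: ldrb1/strb1, ldrh1/strh1, ldr1/str1, ldp1/stp1 (and cpy1) below are
 * not instructions but macros expected to be supplied by the file that
 * includes this template, so the same body can be reused for the plain and
 * the uaccess copy routines.
 */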
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

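/*
 * A..D hold four 16-byte pairs, i.e. the 64 bytes kept in flight by the
 * main copy loop below.
 */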
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

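	/* Advance dst during the copy; dstin (x0) stays intact as the return value. */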
	mov	dst, dstin

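/*
 * If the CPU implements FEAT_MOPS, the cpy1 macro is expected to expand to a
 * CPY*-style prologue/main/epilogue sequence and the hardware performs the
 * whole copy; otherwise fall through to the software copy below.
 */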
#ifdef CONFIG_AS_HAS_MOPS
alternative_if_not ARM64_HAS_MOPS
	b	.Lno_mops
alternative_else_nop_endif
	cpy1	dst, src, count
	b	.Lexitfunc
.Lno_mops:
#endif

	cmp	count, #16
	/* If the length is less than 16 bytes, copy it with small, possibly unaligned accesses. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order. This eliminates the risk of overwriting source data when
	 * the distance between src and dst is less than 16. Each access
	 * here is naturally aligned: copying in increasing size order
	 * (1, 2, 4, 8) aligns src a little more at every step.
	 */
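	/*
	 * For example, if src is 5 bytes past a 16-byte boundary then
	 * tmp2 == 11 (0b1011): copy 1 byte, then 2, skip the 4, then 8.
	 */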
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
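	/*
	 * tmp1 == 0x30: fall through all three pairs (48 bytes);
	 * tmp1 == 0x20: enter at 1f (32 bytes);
	 * tmp1 == 0x10: enter at 2f (16 bytes).
	 */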
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to split one ldp/stp into several smaller loads/stores in
	 * increasing address order, rather than loading/storing 16 bytes
	 * from (src-16) to (dst-16) after walking src back to an aligned
	 * address, as the original cortex strings memcpy does. Keeping the
	 * original scheme here would force memmove to satisfy the
	 * precondition that src is at least 16 bytes above dst, otherwise
	 * some source data would be overwritten when memmove calls memcpy
	 * directly. To keep memmove simple and to decouple memcpy from
	 * memmove, that scheme was dropped.
	 */
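	/*
	 * Bits 3..0 of count select 8-, 4-, 2- and 1-byte copies, e.g.
	 * count == 13 (0b1101) copies 8 + 4 + 1 bytes.
	 */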
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

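	/*
	 * count is now (original count - 128) and thus negative, but its
	 * low 6 bits still equal the number of bytes left to copy (0..63),
	 * which is all that .Ltail63 looks at.
	 */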
	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Preload the first 64 bytes. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave loads of the next 64-byte block with stores of the
	 * previous one.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
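	/* Drain the 64 bytes loaded by the final loop iteration. */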
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: