Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
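
/*
 * In C terms the contract above is (a sketch of the mapping, per the
 * x86-64 SysV calling convention the kernel follows here):
 *
 *	void *memcpy(void *dest, const void *src, size_t count);
 *
 *	dest  - first argument,  arrives in %rdi
 *	src   - second argument, arrives in %rsi
 *	count - third argument,  arrives in %rdx
 *	the return value (the original dest) leaves in %rax
 */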

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $3, %ecx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e:
	.previous
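
/*
 * A C sketch of the variant above (illustrative only; the helper name
 * is hypothetical, not kernel code). REP MOVSQ copies count/8
 * quadwords, then REP MOVSB mops up the remaining count%8 bytes:
 *
 *	void *memcpy_c_sketch(void *dest, const void *src, size_t count)
 *	{
 *		unsigned long *dq = dest;
 *		const unsigned long *sq = src;
 *		unsigned char *db;
 *		const unsigned char *sb;
 *		size_t n;
 *
 *		for (n = count >> 3; n; n--)	-- one "rep movsq" step
 *			*dq++ = *sq++;
 *		db = (unsigned char *)dq;
 *		sb = (const unsigned char *)sq;
 *		for (n = count & 7; n; n--)	-- one "rep movsb" step
 *			*db++ = *sb++;
 *		return dest;
 *	}
 */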

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
	movq %rdi, %rax
	movl %edx, %ecx
	shrl $6, %ecx
	jz .Lhandle_tail
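
	/*
	 * Worked example (illustrative): for count = 200, %ecx becomes
	 * 200 >> 6 = 3 full 64-byte blocks, and the remaining
	 * 200 & 63 = 8 bytes are left for the tail code below.
	 */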

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero flag is
	 * checked at the end of the loop (the instructions in between
	 * do not change the zero flag):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r8
	movq %r11, 0*8(%rdi)
	movq %r8, 1*8(%rdi)

	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r10
	movq %r9, 2*8(%rdi)
	movq %r10, 3*8(%rdi)

	movq 4*8(%rsi), %r11
	movq 5*8(%rsi), %r8
	movq %r11, 4*8(%rdi)
	movq %r8, 5*8(%rdi)

	movq 6*8(%rsi), %r9
	movq 7*8(%rsi), %r10
	movq %r9, 6*8(%rdi)
	movq %r10, 7*8(%rdi)

	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz .Lloop_64
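
	/*
	 * C sketch of the loop above (illustrative only, assuming dq/sq
	 * are unsigned long pointers into dest/src and blocks = count/64):
	 *
	 *	while (blocks--) {
	 *		for (i = 0; i < 8; i += 2) {
	 *			a = sq[i];
	 *			b = sq[i + 1];
	 *			dq[i] = a;
	 *			dq[i + 1] = b;
	 *		}
	 *		sq += 8;
	 *		dq += 8;
	 *	}
	 *
	 * Loads and stores are interleaved in pairs, so two reads are in
	 * flight before their writes - which is what the r8-r11 register
	 * rotation above implements.
	 */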

.Lhandle_tail:
	movl %edx, %ecx
	andl $63, %ecx
	shrl $3, %ecx
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rdi), %rdi
	leaq 8(%rsi), %rsi
	jnz .Lloop_8

.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1
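
	/*
	 * Tail handling in C terms (a sketch, not kernel code; dq/sq and
	 * db/sb are quadword and byte pointers at the current position):
	 *
	 *	tail = count & 63;
	 *	for (n = tail >> 3; n; n--)	-- .Lloop_8
	 *		*dq++ = *sq++;
	 *	for (n = tail & 7; n; n--)	-- .Lloop_1
	 *		*db++ = *sb++;
	 */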

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 */

	.section .altinstructions, "a"
	.align 8
	.quad memcpy
	.quad .Lmemcpy_c
	.byte X86_FEATURE_REP_GOOD

	/*
	 * Replace only the beginning: memcpy is itself used to apply
	 * the alternatives, so it would be silly to overwrite it with
	 * NOPs mid-patch - a reboot would be the only outcome...
	 */
	.byte .Lmemcpy_e - .Lmemcpy_c
	.byte .Lmemcpy_e - .Lmemcpy_c
	.previous
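
/*
 * Field by field, the record above matches the kernel's struct
 * alt_instr as sketched below (illustrative; exact trailing padding
 * differs between kernel versions):
 *
 *	struct alt_instr {
 *		u8 *instr;		-- .quad memcpy
 *		u8 *replacement;	-- .quad .Lmemcpy_c
 *		u8  cpuid;		-- .byte X86_FEATURE_REP_GOOD
 *		u8  instrlen;		-- .byte .Lmemcpy_e - .Lmemcpy_c
 *		u8  replacementlen;	-- .byte .Lmemcpy_e - .Lmemcpy_c
 *	};
 *
 * Because instrlen is set equal to replacementlen, the patcher
 * rewrites exactly that many bytes at memcpy and pads nothing.
 */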