Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'sparc64-M7-memcpy'

Babu Moger says:

====================
sparc64: Update memcpy, memset etc. for M7/M8 architectures

This series of patches updates memcpy, memset, copy_to_user, copy_from_user,
etc. for the SPARC M7/M8 architectures.

The new algorithm takes advantage of the M7/M8 block-init store ASIs in a much
more optimized way to improve performance. More details are in the code comments.

Tested and compared the latency measured in ticks(NG4memcpy vs new M7memcpy).

1. Memset numbers(Aligned memset)

No.of bytes NG4memset M7memset Delta ((B-A)/A)*100
(Avg.Ticks A) (Avg.Ticks B) (latency reduction)
3 77 25 -67.53
7 43 33 -23.25
32 72 68 -5.55
128 164 44 -73.17
256 335 68 -79.70
512 511 220 -56.94
1024 1552 627 -59.60
2048 3515 1322 -62.38
4096 6303 2472 -60.78
8192 13118 4867 -62.89
16384 26206 10371 -60.42
32768 52501 18569 -64.63
65536 100219 35899 -64.17

2. Memcpy numbers(Aligned memcpy)

No.of bytes NG4memcpy M7memcpy Delta ((B-A)/A)*100
(Avg.Ticks A) (Avg.Ticks B) (latency reduction)
3 20 19 -5
7 29 27 -6.89
32 30 28 -6.66
128 89 69 -22.47
256 142 143 0.70
512 341 283 -17.00
1024 1588 655 -58.75
2048 3553 1357 -61.80
4096 7218 2590 -64.11
8192 13701 5231 -61.82
16384 28304 10716 -62.13
32768 56516 22995 -59.31
65536 115443 50840 -55.96

3. Memset numbers(un-aligned memset)

No.of bytes NG4memset M7memset Delta ((B-A)/A)*100
(Avg.Ticks A) (Avg.Ticks B) (latency reduction)
3 40 31 -22.5
7 52 29 -44.2307692308
32 89 86 -3.3707865169
128 201 74 -63.184079602
256 340 154 -54.7058823529
512 961 335 -65.1404786681
1024 1799 686 -61.8677042802
2048 3575 1260 -64.7552447552
4096 6560 2627 -59.9542682927
8192 13161 6018 -54.273991338
16384 26465 10439 -60.5554505951
32768 52119 18649 -64.2184232238
65536 101593 35724 -64.8361599717

4. Memcpy numbers(un-aligned memcpy)

No.of bytes NG4memcpy M7memcpy Delta ((B-A)/A)*100
(Avg.Ticks A) (Avg.Ticks B) (latency reduction)
3 26 19 -26.9230769231
7 48 45 -6.25
32 52 49 -5.7692307692
128 284 334 17.6056338028
256 430 482 12.0930232558
512 646 690 6.8111455108
1024 1051 1016 -3.3301617507
2048 1787 1818 1.7347509793
4096 3309 3376 2.0247809006
8192 8151 7444 -8.673782358
16384 34222 34556 0.9759803635
32768 87851 95044 8.1877269468
65536 158331 159572 0.7838010244

There is not much difference in the numbers for un-aligned copies
between NG4memcpy and M7memcpy because they both mostly use the
same algorithms.

v2:
1. Fixed indentation issues found by David Miller
2. Used ENTRY and ENDPROC for the labels in M7patch.S as suggested by David Miller
3. Now M8 also will use M7memcpy. Also tested on M8 config.
4. These patches are created on top of below M8 patches
https://patchwork.ozlabs.org/patch/792661/
https://patchwork.ozlabs.org/patch/792662/
However, I did not see these patches in the sparc-next tree; they may be in the
queue now. It is possible these patches might cause some build problems, which
will resolve once all the M8 patches are in the sparc-next tree.

v0: Initial version
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+1845 -215
+14 -2
arch/sparc/kernel/head_64.S
··· 603 603 be,pt %xcc, niagara4_patch 604 604 nop 605 605 cmp %g1, SUN4V_CHIP_SPARC_M7 606 - be,pt %xcc, niagara4_patch 606 + be,pt %xcc, sparc_m7_patch 607 607 nop 608 608 cmp %g1, SUN4V_CHIP_SPARC_M8 609 - be,pt %xcc, niagara4_patch 609 + be,pt %xcc, sparc_m7_patch 610 610 nop 611 611 cmp %g1, SUN4V_CHIP_SPARC_SN 612 612 be,pt %xcc, niagara4_patch ··· 621 621 622 622 ba,a,pt %xcc, 80f 623 623 nop 624 + 625 + sparc_m7_patch: 626 + call m7_patch_copyops 627 + nop 628 + call m7_patch_bzero 629 + nop 630 + call m7_patch_pageops 631 + nop 632 + 633 + ba,a,pt %xcc, 80f 634 + nop 635 + 624 636 niagara4_patch: 625 637 call niagara4_patch_copyops 626 638 nop
+40
arch/sparc/lib/M7copy_from_user.S
··· 1 + /* 2 + * M7copy_from_user.S: SPARC M7 optimized copy from userspace. 3 + * 4 + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. 5 + */ 6 + 7 + 8 + #define EX_LD(x, y) \ 9 + 98: x; \ 10 + .section __ex_table,"a"; \ 11 + .align 4; \ 12 + .word 98b, y; \ 13 + .text; \ 14 + .align 4; 15 + 16 + #define EX_LD_FP(x, y) \ 17 + 98: x; \ 18 + .section __ex_table,"a"; \ 19 + .align 4; \ 20 + .word 98b, y##_fp; \ 21 + .text; \ 22 + .align 4; 23 + 24 + #ifndef ASI_AIUS 25 + #define ASI_AIUS 0x11 26 + #endif 27 + 28 + #define FUNC_NAME M7copy_from_user 29 + #define LOAD(type,addr,dest) type##a [addr] %asi, dest 30 + #define EX_RETVAL(x) 0 31 + 32 + #ifdef __KERNEL__ 33 + #define PREAMBLE \ 34 + rd %asi, %g1; \ 35 + cmp %g1, ASI_AIUS; \ 36 + bne,pn %icc, raw_copy_in_user; \ 37 + nop 38 + #endif 39 + 40 + #include "M7memcpy.S"
+51
arch/sparc/lib/M7copy_to_user.S
··· 1 + /* 2 + * M7copy_to_user.S: SPARC M7 optimized copy to userspace. 3 + * 4 + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. 5 + */ 6 + 7 + 8 + #define EX_ST(x, y) \ 9 + 98: x; \ 10 + .section __ex_table,"a"; \ 11 + .align 4; \ 12 + .word 98b, y; \ 13 + .text; \ 14 + .align 4; 15 + 16 + #define EX_ST_FP(x, y) \ 17 + 98: x; \ 18 + .section __ex_table,"a"; \ 19 + .align 4; \ 20 + .word 98b, y##_fp; \ 21 + .text; \ 22 + .align 4; 23 + 24 + 25 + #ifndef ASI_AIUS 26 + #define ASI_AIUS 0x11 27 + #endif 28 + 29 + #ifndef ASI_BLK_INIT_QUAD_LDD_AIUS 30 + #define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23 31 + #endif 32 + 33 + #define FUNC_NAME M7copy_to_user 34 + #define STORE(type,src,addr) type##a src, [addr] %asi 35 + #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS 36 + #define STORE_MRU_ASI ASI_ST_BLKINIT_MRU_S 37 + #define EX_RETVAL(x) 0 38 + 39 + #ifdef __KERNEL__ 40 + /* Writing to %asi is _expensive_ so we hardcode it. 41 + * Reading %asi to check for KERNEL_DS is comparatively 42 + * cheap. 43 + */ 44 + #define PREAMBLE \ 45 + rd %asi, %g1; \ 46 + cmp %g1, ASI_AIUS; \ 47 + bne,pn %icc, raw_copy_in_user; \ 48 + nop 49 + #endif 50 + 51 + #include "M7memcpy.S"
+923
arch/sparc/lib/M7memcpy.S
··· 1 + /* 2 + * M7memcpy: Optimized SPARC M7 memcpy 3 + * 4 + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. 5 + */ 6 + 7 + .file "M7memcpy.S" 8 + 9 + /* 10 + * memcpy(s1, s2, len) 11 + * 12 + * Copy s2 to s1, always copy n bytes. 13 + * Note: this C code does not work for overlapped copies. 14 + * 15 + * Fast assembler language version of the following C-program for memcpy 16 + * which represents the `standard' for the C-library. 17 + * 18 + * void * 19 + * memcpy(void *s, const void *s0, size_t n) 20 + * { 21 + * if (n != 0) { 22 + * char *s1 = s; 23 + * const char *s2 = s0; 24 + * do { 25 + * *s1++ = *s2++; 26 + * } while (--n != 0); 27 + * } 28 + * return (s); 29 + * } 30 + * 31 + * 32 + * SPARC T7/M7 Flow : 33 + * 34 + * if (count < SMALL_MAX) { 35 + * if count < SHORTCOPY (SHORTCOPY=3) 36 + * copy bytes; exit with dst addr 37 + * if src & dst aligned on word boundary but not long word boundary, 38 + * copy with ldw/stw; branch to finish_up 39 + * if src & dst aligned on long word boundary 40 + * copy with ldx/stx; branch to finish_up 41 + * if src & dst not aligned and length <= SHORTCHECK (SHORTCHECK=14) 42 + * copy bytes; exit with dst addr 43 + * move enough bytes to get src to word boundary 44 + * if dst now on word boundary 45 + * move_words: 46 + * copy words; branch to finish_up 47 + * if dst now on half word boundary 48 + * load words, shift half words, store words; branch to finish_up 49 + * if dst on byte 1 50 + * load words, shift 3 bytes, store words; branch to finish_up 51 + * if dst on byte 3 52 + * load words, shift 1 byte, store words; branch to finish_up 53 + * finish_up: 54 + * copy bytes; exit with dst addr 55 + * } else { More than SMALL_MAX bytes 56 + * move bytes until dst is on long word boundary 57 + * if( src is on long word boundary ) { 58 + * if (count < MED_MAX) { 59 + * finish_long: src/dst aligned on 8 bytes 60 + * copy with ldx/stx in 8-way unrolled loop; 61 + * copy final 0-63 bytes; exit with dst 
addr 62 + * } else { src/dst aligned; count > MED_MAX 63 + * align dst on 64 byte boundary; for main data movement: 64 + * prefetch src data to L2 cache; let HW prefetch move data to L1 cache 65 + * Use BIS (block initializing store) to avoid copying store cache 66 + * lines from memory. But pre-store first element of each cache line 67 + * ST_CHUNK lines in advance of the rest of that cache line. That 68 + * gives time for replacement cache lines to be written back without 69 + * excess STQ and Miss Buffer filling. Repeat until near the end, 70 + * then finish up storing before going to finish_long. 71 + * } 72 + * } else { src/dst not aligned on 8 bytes 73 + * if src is word aligned and count < MED_WMAX 74 + * move words in 8-way unrolled loop 75 + * move final 0-31 bytes; exit with dst addr 76 + * if count < MED_UMAX 77 + * use alignaddr/faligndata combined with ldd/std in 8-way 78 + * unrolled loop to move data. 79 + * go to unalign_done 80 + * else 81 + * setup alignaddr for faligndata instructions 82 + * align dst on 64 byte boundary; prefetch src data to L1 cache 83 + * loadx8, falign, block-store, prefetch loop 84 + * (only use block-init-store when src/dst on 8 byte boundaries.) 85 + * unalign_done: 86 + * move remaining bytes for unaligned cases. exit with dst addr. 
87 + * } 88 + * 89 + */ 90 + 91 + #include <asm/visasm.h> 92 + #include <asm/asi.h> 93 + 94 + #if !defined(EX_LD) && !defined(EX_ST) 95 + #define NON_USER_COPY 96 + #endif 97 + 98 + #ifndef EX_LD 99 + #define EX_LD(x,y) x 100 + #endif 101 + #ifndef EX_LD_FP 102 + #define EX_LD_FP(x,y) x 103 + #endif 104 + 105 + #ifndef EX_ST 106 + #define EX_ST(x,y) x 107 + #endif 108 + #ifndef EX_ST_FP 109 + #define EX_ST_FP(x,y) x 110 + #endif 111 + 112 + #ifndef EX_RETVAL 113 + #define EX_RETVAL(x) x 114 + #endif 115 + 116 + #ifndef LOAD 117 + #define LOAD(type,addr,dest) type [addr], dest 118 + #endif 119 + 120 + #ifndef STORE 121 + #define STORE(type,src,addr) type src, [addr] 122 + #endif 123 + 124 + /* 125 + * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache 126 + * line as "least recently used" which means if many threads are 127 + * active, it has a high probability of being pushed out of the cache 128 + * between the first initializing store and the final stores. 129 + * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which 130 + * marks the cache line as "most recently used" for all 131 + * but the last cache line 132 + */ 133 + #ifndef STORE_ASI 134 + #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA 135 + #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P 136 + #else 137 + #define STORE_ASI 0x80 /* ASI_P */ 138 + #endif 139 + #endif 140 + 141 + #ifndef STORE_MRU_ASI 142 + #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA 143 + #define STORE_MRU_ASI ASI_ST_BLKINIT_MRU_P 144 + #else 145 + #define STORE_MRU_ASI 0x80 /* ASI_P */ 146 + #endif 147 + #endif 148 + 149 + #ifndef STORE_INIT 150 + #define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI 151 + #endif 152 + 153 + #ifndef STORE_INIT_MRU 154 + #define STORE_INIT_MRU(src,addr) stxa src, [addr] STORE_MRU_ASI 155 + #endif 156 + 157 + #ifndef FUNC_NAME 158 + #define FUNC_NAME M7memcpy 159 + #endif 160 + 161 + #ifndef PREAMBLE 162 + #define PREAMBLE 163 + #endif 164 + 165 + #define BLOCK_SIZE 64 166 + #define SHORTCOPY 3 167 + 
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define	SMALL_MAX	128
#define	MED_UMAX	1024	/* max copy for medium un-aligned case */
#define	MED_WMAX	1024	/* max copy for medium word-aligned case */
#define	MED_MAX		1024	/* max copy for medium longword-aligned case */
#define	ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
#define	ALIGN_PRE	24	/* distance for aligned prefetch loop */

	.register	%g2,#scratch

	.section	".text"
	.global		FUNC_NAME
	.type		FUNC_NAME, #function
	.align		16
FUNC_NAME:
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5		! trap if length has bit 31 set
	PREAMBLE
	mov		%o0, %g1	! save %o0
	brz,pn		%o2, .Lsmallx
	cmp		%o2, 3
	ble,pn		%icc, .Ltiny_cp
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_cp
	 or		%o0, %o1, %g2
	cmp		%o2, SMALL_MAX
	bl,pn		%icc, .Lmedium_cp
	 nop

.Lmedium:
	neg		%o0, %o5
	andcc		%o5, 7, %o5	! bytes till DST 8 byte aligned
	brz,pt		%o5, .Ldst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub		%o2, %o5, %o2
	sub		%o1, %o0, %o1	! %o1 gets the difference
7:					! dst aligning loop
	add		%o1, %o0, %o4
	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)	! load one byte
	subcc		%o5, 1, %o5
	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
	bgu,pt		%xcc, 7b
	 add		%o0, 1, %o0	! advance dst
	add		%o1, %o0, %o1	! restore %o1
.Ldst_aligned_on_8:
	andcc		%o1, 7, %o5
	brnz,pt		%o5, .Lsrc_dst_unaligned_on_8
	 nop

.Lsrc_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	set		MED_MAX, %o3
	cmp		%o2, %o3	! limit to store buffer size
	bgu,pn		%xcc, .Llarge_align8_copy
	 nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
	subcc		%o2, 63, %o2	! adjust length to allow cc test
	ble,pn		%xcc, .Lmedl63	! skip big loop if less than 64 bytes
	 nop
.Lmedl64:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)	! load
	subcc		%o2, 64, %o2	! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56)	! a block of 64
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store
	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64
	add		%o1, 64, %o1	! increase src ptr by 64
	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
	add		%o0, 64, %o0	! increase dst ptr by 64
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
	bgu,pt		%xcc, .Lmedl64	! repeat if at least 64 bytes left
	 EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
	addcc		%o2, 32, %o2	! adjust remaining count
	ble,pt		%xcc, .Lmedl31	! to skip if 31 or fewer bytes left
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)	! load
	sub		%o2, 32, %o2	! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24)	! a block of 32
	add		%o1, 32, %o1	! increase src ptr by 32
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	add		%o0, 32, %o0	! increase dst ptr by 32
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
	addcc		%o2, 16, %o2	! adjust remaining count
	ble,pt		%xcc, .Lmedl15	! skip if 15 or fewer bytes left
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
	add		%o1, 16, %o1	! increase src ptr by 16
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
	sub		%o2, 16, %o2	! decrease count by 16
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
	add		%o0, 16, %o0	! increase dst ptr by 16
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
	addcc		%o2, 15, %o2	! restore count
	bz,pt		%xcc, .Lsmallx	! exit if finished
	 cmp		%o2, 8
	blt,pt		%xcc, .Lmedw7	! skip if 7 or fewer bytes left
	 tst		%o2
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	! load 8 bytes
	add		%o1, 8, %o1	! increase src ptr by 8
	add		%o0, 8, %o0	! increase dst ptr by 8
	subcc		%o2, 8, %o2	! decrease count by 8
	bnz,pn		%xcc, .Lmedw7
	 EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)	! and store 8
	retl
	 mov		EX_RETVAL(%g1), %o0	! restore %o0

	.align 16
.Lsrc_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc		%o1, 0x3, %o5	! test word alignment
	bnz,pt		%xcc, .Lunalignsetup	! branch to skip if not word aligned
	 nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination is in cache for .Lmedium
 * to short data moves.
 */
	set		MED_WMAX, %o3
	cmp		%o2, %o3	! limit to store buffer size
	bge,pt		%xcc, .Lunalignrejoin	! otherwise rejoin main loop
	 nop

	subcc		%o2, 31, %o2	! adjust length to allow cc test
					! for end of loop
	ble,pt		%xcc, .Lmedw31	! skip big loop if less than 16
.Lmedw32:
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32
	sllx		%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
	or		%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
	subcc		%o2, 32, %o2	! decrement length count
	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
	sllx		%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
	or		%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
	add		%o1, 32, %o1	! increase src ptr by 32
	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	sllx		%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
	or		%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
	add		%o0, 32, %o0	! increase dst ptr by 32
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
	sllx		%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
	or		%o4, %o5, %o5
	bgu,pt		%xcc, .Lmedw32	! repeat if at least 32 bytes left
	 EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
	addcc		%o2, 31, %o2	! restore count

	bz,pt		%xcc, .Lsmallx	! exit if finished
	 nop
	cmp		%o2, 16
	blt,pt		%xcc, .Lmedw15
	 nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes
	sllx		%o4, 32, %o5
	subcc		%o2, 16, %o2	! decrement length count
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
	or		%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
	add		%o1, 16, %o1	! increase src ptr by 16
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add		%o0, 16, %o0	! increase dst ptr by 16
	sllx		%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
	or		%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
	bz,pt		%xcc, .Lsmallx	! exit if finished
	 cmp		%o2, 8
	blt,pn		%xcc, .Lmedw7	! skip if 7 or fewer bytes left
	 tst		%o2
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	subcc		%o2, 8, %o2	! decrease count by 8
	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes
	add		%o1, 8, %o1	! increase src ptr by 8
	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)	! load 4 bytes
	add		%o0, 8, %o0	! increase dst ptr by 8
	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
	bz,pt		%xcc, .Lsmallx	! exit if finished
.Lmedw7:				! count is ge 1, less than 8
	cmp		%o2, 4		! check for 4 bytes left
	blt,pn		%xcc, .Lsmallleft3	! skip if 3 or fewer bytes left
	 nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	add		%o1, 4, %o1	! increase src ptr by 4
	add		%o0, 4, %o0	! increase dst ptr by 4
	subcc		%o2, 4, %o2	! decrease count by 4
	bnz		.Lsmallleft3
	 EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
	retl
	 mov		EX_RETVAL(%g1), %o0

	.align 16
.Llarge_align8_copy:			! Src and dst share 8 byte alignment
	! align dst to 64 byte boundary
	andcc		%o0, 0x3f, %o3	! %o3 == 0 means dst is 64 byte aligned
	brz,pn		%o3, .Laligned_to_64
	 andcc		%o0, 8, %o3	! odd long words to move?
	brz,pt		%o3, .Laligned_to_16
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub		%o2, 8, %o2
	add		%o1, 8, %o1	! increment src ptr
	add		%o0, 8, %o0	! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
	andcc		%o0, 16, %o3	! pair of long words to move?
	brz,pt		%o3, .Laligned_to_32
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub		%o2, 16, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
	add		%o1, 16, %o1	! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add		%o0, 16, %o0	! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
	andcc		%o0, 32, %o3	! four long words to move?
	brz,pt		%o3, .Laligned_to_64
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub		%o2, 32, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
	add		%o1, 32, %o1	! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add		%o0, 32, %o0	! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
	!
	! Using block init store (BIS) instructions to avoid fetching cache
	! lines from memory. Use ST_CHUNK stores to first element of each cache
	! line (similar to prefetching) to avoid overfilling STQ or miss buffers.
	! Gives existing cache lines time to be moved out of L1/L2/L3 cache.
	! Initial stores using MRU version of BIS to keep cache line in
	! cache until we are ready to store final element of cache line.
	! Then store last element using the LRU version of BIS.
	!
	andn		%o2, 0x3f, %o5	! %o5 is multiple of block size
	and		%o2, 0x3f, %o2	! residue bytes in %o2
	!
	! We use STORE_MRU_ASI for the first seven stores to each cache line
	! followed by STORE_ASI (mark as LRU) for the last store. That
	! mixed approach reduces the probability that the cache line is removed
	! before we finish setting it, while minimizing the effects on
	! other cached values during a large memcpy
	!
	! ST_CHUNK batches up initial BIS operations for several cache lines
	! to allow multiple requests to not be blocked by overflowing the
	! store miss buffer. Then the matching stores for all those
	! BIS operations are executed.
	!

	sub		%o0, 8, %o0	! adjust %o0 for ASI alignment
.Lalign_loop:
	cmp		%o5, ST_CHUNK*64
	blu,pt		%xcc, .Lalign_loop_fin
	 mov		ST_CHUNK,%o3
.Lalign_loop_start:
	prefetch	[%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc		%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	add		%o1, 64, %o1
	add		%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	bgu		%xcc,.Lalign_loop_start
	 add		%o0, 56, %o0

	mov		ST_CHUNK,%o3
	sllx		%o3, 6, %o4	! ST_CHUNK*64
	sub		%o1, %o4, %o1	! reset %o1
	sub		%o0, %o4, %o0	! reset %o0

.Lalign_loop_rest:
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	add		%o0, 16, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	add		%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	subcc		%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
	add		%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
	add		%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
	add		%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
	add		%o1, 64, %o1
	add		%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	add		%o0, 8, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
	sub		%o5, 64, %o5
	bgu		%xcc,.Lalign_loop_rest
	! mark cache line as LRU
	 EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

	cmp		%o5, ST_CHUNK*64
	bgu,pt		%xcc, .Lalign_loop_start
	 mov		ST_CHUNK,%o3

	cmp		%o5, 0
	beq		.Lalign_done
	 nop
.Lalign_loop_fin:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
	subcc		%o5, 64, %o5
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
	add		%o1, 64, %o1
	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
	add		%o0, 64, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
	bgu		%xcc,.Lalign_loop_fin
	 EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
	add		%o0, 8, %o0	! restore %o0 from ASI alignment
	membar		#StoreStore
	sub		%o2, 63, %o2	! adjust length to allow cc test
	ba		.Lmedl63	! in .Lmedl63
	 nop

	.align 16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
	mov		%g1, %o3	! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
	VISEntryHalf
#endif
	mov		%o3, %g1	! restore %g1

	set		MED_UMAX, %o3
	cmp		%o2, %o3	! check for .Lmedium unaligned limit
	bge,pt		%xcc,.Lunalign_large
	 prefetch	[%o1 + (4 * BLOCK_SIZE)], 20
	andn		%o2, 0x3f, %o5	! %o5 is multiple of block size
	and		%o2, 0x3f, %o2	! residue bytes in %o2
	cmp		%o2, 8		! Insure we do not load beyond
	bgt		.Lunalign_adjust	! end of source buffer
	 andn		%o1, 0x7, %o4	! %o4 has long word aligned src address
	add		%o2, 64, %o2	! adjust to leave loop
	sub		%o5, 64, %o5	! early if necessary
.Lunalign_adjust:
	alignaddr	%o1, %g0, %g0	! generate %gsr
	add		%o1, %o5, %o1	! advance %o1 to after blocks
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
	subcc		%o5, BLOCK_SIZE, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
	faligndata	%f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
	add		%o4, BLOCK_SIZE, %o4
	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
	add		%o0, BLOCK_SIZE, %o0
	bgu,pt		%xcc, .Lunalign_loop
	 prefetch	[%o4 + (5 * BLOCK_SIZE)], 20
	ba		.Lunalign_done
	 nop

.Lunalign_large:
	andcc		%o0, 0x3f, %o3	! is dst 64-byte block aligned?
	bz		%xcc, .Lunalignsrc
	 sub		%o3, 64, %o3	! %o3 will be multiple of 8
	neg		%o3		! bytes until dest is 64 byte aligned
	sub		%o2, %o3, %o2	! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc		%o1, 0x1, %o5
	bnz		%xcc, .Lunalignbyte	! check for byte alignment
	 nop
	andcc		%o1, 2, %o5	! check for half word alignment
	bnz		%xcc, .Lunalignhalf
	 nop
	! Src is word aligned
.Lunalignword:
	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 4 bytes
	add		%o1, 8, %o1	! increase src ptr by 8
	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)	! and store 4
	subcc		%o3, 8, %o3	! decrease count by 8
	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
	add		%o0, 8, %o0	! increase dst ptr by 8
	bnz		%xcc, .Lunalignword
	 EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
	ba		.Lunalignsrc
	 nop

	! Src is half-word aligned
.Lunalignhalf:
	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 2 bytes
	sllx		%o4, 32, %o5	! shift left
	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
	or		%o4, %o5, %o5
	sllx		%o5, 16, %o5
	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
	or		%o4, %o5, %o5
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	add		%o1, 8, %o1
	subcc		%o3, 8, %o3
	bnz		%xcc, .Lunalignhalf
	 add		%o0, 8, %o0
	ba		.Lunalignsrc
	 nop

	! Src is Byte aligned
.Lunalignbyte:
	sub		%o0, %o1, %o0	! share pointer advance
.Lunalignbyte_loop:
	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
	sllx		%o4, 56, %o5
	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
	sllx		%o4, 40, %o4
	or		%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
	sllx		%o4, 24, %o4
	or		%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
	sllx		%o4, 8, %o4
	or		%o4, %o5, %o5
	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
	or		%o4, %o5, %o5
	add		%o0, %o1, %o0
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	sub		%o0, %o1, %o0
	subcc		%o3, 8, %o3
	bnz		%xcc, .Lunalignbyte_loop
	 add		%o1, 8, %o1
	add		%o0,%o1, %o0	! restore pointer

	! Destination is now block (64 byte aligned)
.Lunalignsrc:
	andn		%o2, 0x3f, %o5	! %o5 is multiple of block size
	and		%o2, 0x3f, %o2	! residue bytes in %o2
	add		%o2, 64, %o2	! Insure we do not load beyond
	sub		%o5, 64, %o5	! end of source buffer

	andn		%o1, 0x7, %o4	! %o4 has long word aligned src address
	alignaddr	%o1, %g0, %g0	! generate %gsr
	add		%o1, %o5, %o1	! advance %o1 to after blocks

	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
	add		%o4, 8, %o4
.Lunalign_sloop:
	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
	faligndata	%f14, %f16, %f0
	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
	faligndata	%f16, %f18, %f2
	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
	faligndata	%f18, %f20, %f4
	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
	subcc		%o5, 64, %o5
	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
	faligndata	%f20, %f22, %f6
	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
	faligndata	%f22, %f24, %f8
	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
	faligndata	%f24, %f26, %f10
	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
	faligndata	%f26, %f28, %f12
	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
	add		%o4, 64, %o4
	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
	faligndata	%f28, %f30, %f14
	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
	add		%o0, 64, %o0
	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
	fsrc2		%f30, %f14
	bgu,pt		%xcc, .Lunalign_sloop
	 prefetch	[%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp		%o2, 15
	bleu		%xcc, .Lunalign_short

	 andn		%o2, 0x7, %o5	! %o5 is multiple of 8
	and		%o2, 0x7, %o2	! residue bytes in %o2
	add		%o2, 8, %o2
	sub		%o5, 8, %o5	! insure we do not load past end of src
	andn		%o1, 0x7, %o4	! %o4 has long word aligned src address
	add		%o1, %o5, %o1	! advance %o1 to after multiple of 8
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partialword
.Lunalign_by8:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	add		%o4, 8, %o4
	faligndata	%f0, %f2, %f16
	subcc		%o5, 8, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
	fsrc2		%f2, %f0
	bgu,pt		%xcc, .Lunalign_by8
	 add		%o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	ba		.Lsmallrest
	 nop

/*
 * This is a special case of nested memcpy. This can happen when kernel
 * calls unaligned memcpy back to back without saving FP registers. We need
 * traps(context switch) to save/restore FP registers. If the kernel calls
 * memcpy without this trap sequence we will hit FP corruption. Let's use
 * the normal integer load/store method in this case.
 */

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
	or		%o0, %o1, %g2
#endif
.Lmedium_cp:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc		%g2, 0x7, %g0
	bne,pn		%xcc, .Lmedium_unaligned_cp
	 nop

.Lmedium_noprefetch_cp:
	andncc		%o2, 0x20 - 1, %o5
	be,pn		%xcc, 2f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add		%o1, 0x20, %o1
	subcc		%o5, 0x20, %o5
	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt		%xcc, 1b
	 add		%o0, 0x20, %o0
2:	andcc		%o2, 0x18, %o5
	be,pt		%xcc, 3f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add		%o1, 0x08, %o1
	add		%o0, 0x08, %o0
	subcc		%o5, 0x08, %o5
	bne,pt		%xcc, 1b
	 EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:	brz,pt		%o2, .Lexit_cp
	 cmp		%o2, 0x04
	bl,pn		%xcc, .Ltiny_cp
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
	add		%o1, 0x04, %o1
	add		%o0, 0x04, %o0
	subcc		%o2, 0x04, %o2
	bne,pn		%xcc, .Ltiny_cp
	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt		%xcc, .Lexit_cp

.Lmedium_unaligned_cp:
	/* First get dest 8 byte aligned. */
	sub		%g0, %o0, %o3
	and		%o3, 0x7, %o3
	brz,pt		%o3, 2f
	 sub		%o2, %o3, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add		%o1, 1, %o1
	subcc		%o3, 1, %o3
	add		%o0, 1, %o0
	bne,pt		%xcc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
	and		%o1, 0x7, %o3
	brz,pn		%o3, .Lmedium_noprefetch_cp
	 sll		%o3, 3, %o3
	mov		64, %g2
	sub		%g2, %o3, %g2
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx		%o4, %o3, %o4
	andn		%o2, 0x08 - 1, %o5
	sub		%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add		%o1, 0x08, %o1
	subcc		%o5, 0x08, %o5
	srlx		%g3, %g2, %g7
	or		%g7, %o4, %g7
	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add		%o0, 0x08, %o0
	bne,pt		%xcc, 1b
	 sllx		%g3, %o3, %o4
	srl		%o3, 3, %o3
	add		%o1, %o3, %o1
	brz,pn		%o2, .Lexit_cp
	 nop
	ba,pt		%xcc, .Lsmall_unaligned_cp

.Ltiny_cp:
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	subcc		%o2, 1, %o2
	be,pn		%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
	subcc		%o2, 1, %o2
	be,pn		%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
	ba,pt		%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
	andcc		%g2, 0x3, %g0
	bne,pn		%xcc, .Lsmall_unaligned_cp
	 andn		%o2, 0x4 - 1, %o5
	sub		%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add		%o1, 0x04, %o1
	subcc		%o5, 0x04, %o5
	add		%o0, 0x04, %o0
	bne,pt		%xcc, 1b
	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt		%o2, .Lexit_cp
	 nop
	ba,a,pt		%xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	add		%o1, 1, %o1
	add		%o0, 1, %o0
	subcc		%o2, 1, %o2
	bne,pt		%xcc, 1b
	 EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt		%xcc, .Lexit_cp

.Lsmallrest:
	tst		%o2
	bz,pt		%xcc, .Lsmallx
	 cmp		%o2, 4
	blt,pn		%xcc, .Lsmallleft3
	 nop
	sub		%o2, 3, %o2
.Lsmallnotalign4:
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte
	subcc		%o2, 4, %o2	! reduce count by 4
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4
	add		%o1, 4, %o1	! advance SRC by 4
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
	add		%o0, 4, %o0	! advance DST by 4
	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
	bgu,pt		%xcc, .Lsmallnotalign4	! loop til 3 or fewer bytes remain
	 EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
	addcc		%o2, 3, %o2	! restore count
	bz,pt		%xcc, .Lsmallx
.Lsmallleft3:				! 1, 2, or 3 bytes remain
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
	bz,pt		%xcc, .Lsmallx
	 EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
	subcc		%o2, 1, %o2
	bz,pt		%xcc, .Lsmallx
	 EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)	! load third byte
	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)	! store third byte
.Lsmallx:
	retl
	 mov		EX_RETVAL(%g1), %o0
.Lsmallfin:
	tst		%o2
	bnz,pn		%xcc, .Lsmallleft3
	 nop
	retl
	 mov		EX_RETVAL(%g1), %o0	! restore %o0
.Lexit_cp:
	retl
	 mov		EX_RETVAL(%g1), %o0
	.size		FUNC_NAME, .-FUNC_NAME
+352
arch/sparc/lib/M7memset.S
/*
 * M7memset.S: SPARC M7 optimized memset.
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * M7memset.S: M7 optimized memset.
 *
 * char *memset(sp, c, n)
 *
 * Set an array of n chars starting at sp to the character c.
 * Return sp.
 *
 * Fast assembler language version of the following C-program for memset
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memset(void *sp1, int c, size_t n)
 *	{
 *	    if (n != 0) {
 *		char *sp = sp1;
 *		do {
 *		    *sp++ = (char)c;
 *		} while (--n != 0);
 *	    }
 *	    return (sp1);
 *	}
 *
 * The algorithm is as follows :
 *
 *	For small 6 or fewer bytes stores, bytes will be stored.
 *
 *	For less than 32 bytes stores, align the address on 4 byte boundary.
 *	Then store as many 4-byte chunks, followed by trailing bytes.
 *
 *	For sizes greater than 32 bytes, align the address on 8 byte boundary.
 *	if (count >= 64) {
 *		store 8-bytes chunks to align the address on 64 byte boundary
 *		if (value to be set is zero && count >= MIN_ZERO) {
 *			Using BIS stores, set the first long word of each
 *			64-byte cache line to zero which will also clear the
 *			other seven long words of the cache line.
 *		}
 *		else if (count >= MIN_LOOP) {
 *			Using BIS stores, set the first long word of each of
 *			ST_CHUNK cache lines (64 bytes each) before the main
 *			loop is entered.
 *			In the main loop, continue pre-setting the first long
 *			word of each cache line ST_CHUNK lines in advance while
 *			setting the other seven long words (56 bytes) of each
 *			cache line until fewer than ST_CHUNK*64 bytes remain.
 *			Then set the remaining seven long words of each cache
 *			line that has already had its first long word set.
 *		}
 *		store remaining data in 64-byte chunks until less than
 *		64 bytes remain.
 *	}
 *	Store as many 8-byte chunks, followed by trailing bytes.
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a single
 *   instruction in the pipeline. That avoids various pipeline delays,
 *   such as filling the miss buffer. The performance effect is
 *   similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer instruction
 *   cycles than the normal memset loop.
 *
 * We only use BIS for memset of greater than MIN_LOOP bytes because a
 * sequence of BIS stores must be followed by a membar #StoreStore. The
 * benefit of the BIS store must be balanced against the cost of the membar
 * operation.
 */

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P which marks the cache line as
 * "most recently used" for all but the last store to the cache line.
 */

#include <asm/asi.h>
#include <asm/page.h>

#define	ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define	ASI_STBIMRU_P	ASI_ST_BLKINIT_MRU_P


#define	ST_CHUNK	24   /* multiple of 4 due to loop unrolling */
#define	MIN_LOOP	16320
#define	MIN_ZERO	512

	.section	".text"
	.align 32

/*
 * Define clear_page(dest) as memset(dest, 0, PAGE_SIZE)
 * (can create a more optimized version later.)
 */
	.globl		M7clear_page
	.globl		M7clear_user_page
M7clear_page:		/* clear_page(dest) */
M7clear_user_page:
	set	PAGE_SIZE, %o1
	/* fall through into bzero code */

	.size		M7clear_page,.-M7clear_page
	.size		M7clear_user_page,.-M7clear_user_page

/*
 * Define bzero(dest, n) as memset(dest, 0, n)
 * (can create a more optimized version later.)
 */
	.globl		M7bzero
M7bzero:		/* bzero(dest, size) */
	mov	%o1, %o2
	mov	0, %o1
	/* fall through into memset code */

	.size		M7bzero,.-M7bzero

	.global		M7memset
	.type		M7memset, #function
	.register	%g3, #scratch
M7memset:
	/* %o0 = dst (returned unchanged), %o1 = c, %o2 = count */
	mov	%o0, %o5		! copy sp1 before using it
	cmp	%o2, 7			! if small counts, just write bytes
	bleu,pn	%xcc, .wrchar
	 and	%o1, 0xff, %o1		! o1 is (char)c

	sll	%o1, 8, %o3
	or	%o1, %o3, %o1		! now o1 has 2 bytes of c
	sll	%o1, 16, %o3
	cmp	%o2, 32
	blu,pn	%xcc, .wdalign
	 or	%o1, %o3, %o1		! now o1 has 4 bytes of c

	sllx	%o1, 32, %o3
	or	%o1, %o3, %o1		! now o1 has 8 bytes of c

.dbalign:
	andcc	%o5, 7, %o3		! is sp1 aligned on a 8 byte bound?
	bz,pt	%xcc, .blkalign		! already long word aligned
	 sub	%o3, 8, %o3		! -(bytes till long word aligned)

	add	%o2, %o3, %o2		! update o2 with new count
	! Set -(%o3) bytes till sp1 long word aligned
1:	stb	%o1, [%o5]		! there is at least 1 byte to set
	inccc	%o3			! byte clearing loop
	bl,pt	%xcc, 1b
	 inc	%o5

	! Now sp1 is long word aligned (sp1 is found in %o5)
.blkalign:
	cmp	%o2, 64			! check if there are 64 bytes to set
	blu,pn	%xcc, .wrshort
	 mov	%o2, %o3

	andcc	%o5, 63, %o3		! is sp1 block aligned?
	bz,pt	%xcc, .blkwr		! now block aligned
	 sub	%o3, 64, %o3		! o3 is -(bytes till block aligned)
	add	%o2, %o3, %o2		! o2 is the remainder

	! Store -(%o3) bytes till dst is block (64 byte) aligned.
	! Use long word stores.
	! Recall that dst is already long word aligned
1:
	addcc	%o3, 8, %o3
	stx	%o1, [%o5]
	bl,pt	%xcc, 1b
	 add	%o5, 8, %o5

	! Now sp1 is block aligned
.blkwr:
	andn	%o2, 63, %o4		! calculate size of blocks in bytes
	brz,pn	%o1, .wrzero		! special case if c == 0
	 and	%o2, 63, %o3		! %o3 = bytes left after blk stores.

	set	MIN_LOOP, %g1
	cmp	%o4, %g1		! check there are enough bytes to set
	blu,pn	%xcc, .short_set	! to justify cost of membar
					! must be > pre-cleared lines
	 nop

	! initial cache-clearing stores
	! get store pipeline moving
	rd	%asi, %g3		! save %asi to be restored later
	wr	%g0, ASI_STBIMRU_P, %asi

	! Primary memset loop for large memsets
.wr_loop:
	sub	%o5, 8, %o5		! adjust %o5 for ASI store alignment
	mov	ST_CHUNK, %g1
.wr_loop_start:
	! Pre-set the first long word of ST_CHUNK cache lines (4 per pass).
	stxa	%o1, [%o5+8]%asi
	subcc	%g1, 4, %g1
	stxa	%o1, [%o5+8+64]%asi
	add	%o5, 256, %o5
	stxa	%o1, [%o5+8-128]%asi
	bgu	%xcc, .wr_loop_start
	 stxa	%o1, [%o5+8-64]%asi

	sub	%o5, ST_CHUNK*64, %o5	! reset %o5
	mov	ST_CHUNK, %g1

.wr_loop_rest:
	! Fill in the remaining 7 long words of each pre-set cache line;
	! final store per line uses ASI_STBI_P (marks line LRU, see header).
	stxa	%o1, [%o5+8+8]%asi
	sub	%o4, 64, %o4
	stxa	%o1, [%o5+16+8]%asi
	subcc	%g1, 1, %g1
	stxa	%o1, [%o5+24+8]%asi
	stxa	%o1, [%o5+32+8]%asi
	stxa	%o1, [%o5+40+8]%asi
	add	%o5, 64, %o5
	stxa	%o1, [%o5-8]%asi
	bgu	%xcc, .wr_loop_rest
	 stxa	%o1, [%o5]ASI_STBI_P

	! If more than ST_CHUNK*64 bytes remain to set, continue
	! setting the first long word of each cache line in advance
	! to keep the store pipeline moving.

	cmp	%o4, ST_CHUNK*64
	bge,pt	%xcc, .wr_loop_start
	 mov	ST_CHUNK, %g1

	brz,a,pn %o4, .asi_done
	 add	%o5, 8, %o5		! restore %o5 offset

.wr_loop_small:
	! Tail: fewer than ST_CHUNK lines left; set whole lines directly.
	stxa	%o1, [%o5+8]%asi
	stxa	%o1, [%o5+8+8]%asi
	stxa	%o1, [%o5+16+8]%asi
	stxa	%o1, [%o5+24+8]%asi
	stxa	%o1, [%o5+32+8]%asi
	subcc	%o4, 64, %o4
	stxa	%o1, [%o5+40+8]%asi
	add	%o5, 64, %o5
	stxa	%o1, [%o5-8]%asi
	bgu,pt	%xcc, .wr_loop_small
	 stxa	%o1, [%o5]ASI_STBI_P

	ba	.asi_done
	 add	%o5, 8, %o5		! restore %o5 offset

	! Special case loop for zero fill memsets
	! For each 64 byte cache line, single STBI to first element
	! clears line
.wrzero:
	cmp	%o4, MIN_ZERO		! check if enough bytes to set
					! to pay %asi + membar cost
	blu	%xcc, .short_set
	 nop
	sub	%o4, 256, %o4

.wrzero_loop:
	! 256 bytes (4 cache lines) per iteration, one BIS store per line.
	mov	64, %g3
	stxa	%o1, [%o5]ASI_STBI_P
	subcc	%o4, 256, %o4
	stxa	%o1, [%o5+%g3]ASI_STBI_P
	add	%o5, 256, %o5
	sub	%g3, 192, %g3
	stxa	%o1, [%o5+%g3]ASI_STBI_P
	add	%g3, 64, %g3
	bge,pt	%xcc, .wrzero_loop
	 stxa	%o1, [%o5+%g3]ASI_STBI_P
	add	%o4, 256, %o4

	brz,pn	%o4, .bsi_done
	 nop

.wrzero_small:
	stxa	%o1, [%o5]ASI_STBI_P
	subcc	%o4, 64, %o4
	bgu,pt	%xcc, .wrzero_small
	 add	%o5, 64, %o5
	ba,a	.bsi_done

.asi_done:
	wr	%g3, 0x0, %asi		! restore saved %asi
.bsi_done:
	membar	#StoreStore		! required by use of Block Store Init

.short_set:
	cmp	%o4, 64			! check if 64 bytes to set
	blu	%xcc, 5f
	 nop
4:					! set final blocks of 64 bytes
	stx	%o1, [%o5]
	stx	%o1, [%o5+8]
	stx	%o1, [%o5+16]
	stx	%o1, [%o5+24]
	subcc	%o4, 64, %o4
	stx	%o1, [%o5+32]
	stx	%o1, [%o5+40]
	add	%o5, 64, %o5
	stx	%o1, [%o5-16]
	bgu,pt	%xcc, 4b
	 stx	%o1, [%o5-8]

5:
	! Set the remaining long words
.wrshort:
	subcc	%o3, 8, %o3		! Can we store any long words?
	blu,pn	%xcc, .wrchars
	 and	%o2, 7, %o2		! calc bytes left after long words
6:
	subcc	%o3, 8, %o3
	stx	%o1, [%o5]		! store the long words
	bgeu,pt	%xcc, 6b
	 add	%o5, 8, %o5

.wrchars:				! check for extra chars
	brnz	%o2, .wrfin
	 nop
	retl
	 nop

.wdalign:
	andcc	%o5, 3, %o3		! is sp1 aligned on a word boundary
	bz,pn	%xcc, .wrword
	 andn	%o2, 3, %o3		! create word sized count in %o3

	dec	%o2			! decrement count
	stb	%o1, [%o5]		! clear a byte
	b	.wdalign
	 inc	%o5			! next byte

.wrword:
	subcc	%o3, 4, %o3
	st	%o1, [%o5]		! 4-byte writing loop
	bnz,pt	%xcc, .wrword
	 add	%o5, 4, %o5

	and	%o2, 3, %o2		! leftover count, if any

.wrchar:
	! Set the remaining bytes, if any
	brz	%o2, .exit
	 nop
.wrfin:
	deccc	%o2
	stb	%o1, [%o5]
	bgu,pt	%xcc, .wrfin
	 inc	%o5
.exit:
	retl				! %o0 was preserved
	 nop

	.size		M7memset,.-M7memset
+51
arch/sparc/lib/M7patch.S
/*
 * M7patch.S: Patch generic routines with M7 variant.
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

#include <linux/linkage.h>

/* Instruction templates; displacement / operands are OR'd in at runtime. */
#define BRANCH_ALWAYS	0x10680000
#define NOP		0x01000000

/*
 * Overwrite the first two instructions of OLD with an unconditional
 * branch to NEW followed by a NOP (delay slot), then flush the I-cache
 * line so the patched instructions are fetched.
 * The sll/srl pair truncates the byte displacement to the branch
 * instruction's signed immediate field (word displacement, hence the
 * extra shift by 2).
 */
#define NG_DO_PATCH(OLD, NEW)	\
	sethi	%hi(NEW), %g1; \
	or	%g1, %lo(NEW), %g1; \
	sethi	%hi(OLD), %g2; \
	or	%g2, %lo(OLD), %g2; \
	sub	%g1, %g2, %g1; \
	sethi	%hi(BRANCH_ALWAYS), %g3; \
	sll	%g1, 11, %g1; \
	srl	%g1, 11 + 2, %g1; \
	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
	or	%g3, %g1, %g3; \
	stw	%g3, [%g2]; \
	sethi	%hi(NOP), %g3; \
	or	%g3, %lo(NOP), %g3; \
	stw	%g3, [%g2 + 0x4]; \
	flush	%g2;

	/* Redirect the generic copy routines to the M7 implementations. */
ENTRY(m7_patch_copyops)
	NG_DO_PATCH(memcpy, M7memcpy)
	NG_DO_PATCH(raw_copy_from_user, M7copy_from_user)
	NG_DO_PATCH(raw_copy_to_user, M7copy_to_user)
	retl
	 nop
ENDPROC(m7_patch_copyops)

	/*
	 * memset/bzero get the M7 versions; clear_user/tsb_init reuse
	 * the Niagara (NG) variants.
	 */
ENTRY(m7_patch_bzero)
	NG_DO_PATCH(memset, M7memset)
	NG_DO_PATCH(__bzero, M7bzero)
	NG_DO_PATCH(__clear_user, NGclear_user)
	NG_DO_PATCH(tsb_init, NGtsb_init)
	retl
	 nop
ENDPROC(m7_patch_bzero)

	/* Page copy stays NG4; page clear uses the M7 memset fall-through. */
ENTRY(m7_patch_pageops)
	NG_DO_PATCH(copy_user_page, NG4copy_user_page)
	NG_DO_PATCH(_clear_page, M7clear_page)
	NG_DO_PATCH(clear_user_page, M7clear_user_page)
	retl
	 nop
ENDPROC(m7_patch_pageops)
+5
arch/sparc/lib/Makefile
··· 36 36 lib-$(CONFIG_SPARC64) += NG4memcpy.o NG4copy_from_user.o NG4copy_to_user.o 37 37 lib-$(CONFIG_SPARC64) += NG4patch.o NG4copy_page.o NG4clear_page.o NG4memset.o 38 38 39 + lib-$(CONFIG_SPARC64) += Memcpy_utils.o 40 + 41 + lib-$(CONFIG_SPARC64) += M7memcpy.o M7copy_from_user.o M7copy_to_user.o 42 + lib-$(CONFIG_SPARC64) += M7patch.o M7memset.o 43 + 39 44 lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o 40 45 lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o 41 46
+345
arch/sparc/lib/Memcpy_utils.S
#ifndef __ASM_MEMCPY_UTILS
#define __ASM_MEMCPY_UTILS

#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/visasm.h>

/*
 * Shared exception-fixup return stubs for the SPARC64 memcpy family.
 * Each memcpy_retl_o2[_plus_X[_Y]] stub computes %o2 plus the named
 * offset(s) into %o0 — presumably the count of bytes not yet copied
 * at the faulting instruction (matches the raw_copy_* return
 * convention; verify against the M7memcpy call sites) — then restores
 * the user %asi. The *_fp variants additionally leave the half-FPU
 * state via VISExitHalf before restoring %asi.
 */

ENTRY(__restore_asi_fp)
	VISExitHalf
	retl
	 wr	%g0, ASI_AIUS, %asi
ENDPROC(__restore_asi_fp)

ENTRY(__restore_asi)
	retl
	 wr	%g0, ASI_AIUS, %asi
ENDPROC(__restore_asi)

ENTRY(memcpy_retl_o2)
	ba,pt	%xcc, __restore_asi
	 mov	%o2, %o0
ENDPROC(memcpy_retl_o2)
ENTRY(memcpy_retl_o2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 1, %o0
ENDPROC(memcpy_retl_o2_plus_1)
ENTRY(memcpy_retl_o2_plus_3)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 3, %o0
ENDPROC(memcpy_retl_o2_plus_3)
ENTRY(memcpy_retl_o2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 4, %o0
ENDPROC(memcpy_retl_o2_plus_4)
ENTRY(memcpy_retl_o2_plus_5)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 5, %o0
ENDPROC(memcpy_retl_o2_plus_5)
ENTRY(memcpy_retl_o2_plus_6)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 6, %o0
ENDPROC(memcpy_retl_o2_plus_6)
ENTRY(memcpy_retl_o2_plus_7)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 7, %o0
ENDPROC(memcpy_retl_o2_plus_7)
ENTRY(memcpy_retl_o2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 8, %o0
ENDPROC(memcpy_retl_o2_plus_8)
ENTRY(memcpy_retl_o2_plus_15)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 15, %o0
ENDPROC(memcpy_retl_o2_plus_15)
ENTRY(memcpy_retl_o2_plus_15_8)
	add	%o2, 15, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 8, %o0
ENDPROC(memcpy_retl_o2_plus_15_8)
ENTRY(memcpy_retl_o2_plus_16)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 16, %o0
ENDPROC(memcpy_retl_o2_plus_16)
ENTRY(memcpy_retl_o2_plus_24)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 24, %o0
ENDPROC(memcpy_retl_o2_plus_24)
ENTRY(memcpy_retl_o2_plus_31)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 31, %o0
ENDPROC(memcpy_retl_o2_plus_31)

/*
 * Continuation of the memcpy exception-fixup return stubs: each one
 * returns %o2 plus the offset(s) named in its symbol, via __restore_asi
 * (or __restore_asi_fp for the *_fp variants, which also exit half-FPU
 * state first).
 *
 * FIX: memcpy_retl_o2_plus_o5_64 previously added 32 to %o5, which
 * contradicts its _64 suffix, duplicates memcpy_retl_o2_plus_o5_plus_32,
 * and would under-report the uncopied byte count by 32 on any fault
 * routed through it. It now adds 64, consistent with every other
 * *_plus_N stub (e.g. memcpy_retl_o2_plus_o5_plus_64 below).
 */
ENTRY(memcpy_retl_o2_plus_32)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 32, %o0
ENDPROC(memcpy_retl_o2_plus_32)
ENTRY(memcpy_retl_o2_plus_31_32)
	add	%o2, 31, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 32, %o0
ENDPROC(memcpy_retl_o2_plus_31_32)
ENTRY(memcpy_retl_o2_plus_31_24)
	add	%o2, 31, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 24, %o0
ENDPROC(memcpy_retl_o2_plus_31_24)
ENTRY(memcpy_retl_o2_plus_31_16)
	add	%o2, 31, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 16, %o0
ENDPROC(memcpy_retl_o2_plus_31_16)
ENTRY(memcpy_retl_o2_plus_31_8)
	add	%o2, 31, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 8, %o0
ENDPROC(memcpy_retl_o2_plus_31_8)
ENTRY(memcpy_retl_o2_plus_63)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 63, %o0
ENDPROC(memcpy_retl_o2_plus_63)
ENTRY(memcpy_retl_o2_plus_63_64)
	add	%o2, 63, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 64, %o0
ENDPROC(memcpy_retl_o2_plus_63_64)
ENTRY(memcpy_retl_o2_plus_63_56)
	add	%o2, 63, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 56, %o0
ENDPROC(memcpy_retl_o2_plus_63_56)
ENTRY(memcpy_retl_o2_plus_63_48)
	add	%o2, 63, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 48, %o0
ENDPROC(memcpy_retl_o2_plus_63_48)
ENTRY(memcpy_retl_o2_plus_63_40)
	add	%o2, 63, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 40, %o0
ENDPROC(memcpy_retl_o2_plus_63_40)
ENTRY(memcpy_retl_o2_plus_63_32)
	add	%o2, 63, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 32, %o0
ENDPROC(memcpy_retl_o2_plus_63_32)
ENTRY(memcpy_retl_o2_plus_63_24)
	add	%o2, 63, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 24, %o0
ENDPROC(memcpy_retl_o2_plus_63_24)
ENTRY(memcpy_retl_o2_plus_63_16)
	add	%o2, 63, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 16, %o0
ENDPROC(memcpy_retl_o2_plus_63_16)
ENTRY(memcpy_retl_o2_plus_63_8)
	add	%o2, 63, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, 8, %o0
ENDPROC(memcpy_retl_o2_plus_63_8)
ENTRY(memcpy_retl_o2_plus_o5)
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5)
ENTRY(memcpy_retl_o2_plus_o5_plus_1)
	add	%o5, 1, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_1)
ENTRY(memcpy_retl_o2_plus_o5_plus_4)
	add	%o5, 4, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_4)
ENTRY(memcpy_retl_o2_plus_o5_plus_8)
	add	%o5, 8, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_8)
ENTRY(memcpy_retl_o2_plus_o5_plus_16)
	add	%o5, 16, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_16)
ENTRY(memcpy_retl_o2_plus_o5_plus_24)
	add	%o5, 24, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_24)
ENTRY(memcpy_retl_o2_plus_o5_plus_32)
	add	%o5, 32, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_32)
ENTRY(memcpy_retl_o2_plus_o5_64)
	add	%o5, 64, %o5		! was 32: typo vs. the _64 suffix
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_64)
ENTRY(memcpy_retl_o2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%o2, %g1, %o0
ENDPROC(memcpy_retl_o2_plus_g1)
ENTRY(memcpy_retl_o2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%o2, %g1, %o0
ENDPROC(memcpy_retl_o2_plus_g1_plus_1)
ENTRY(memcpy_retl_o2_plus_g1_plus_8)
	add	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%o2, %g1, %o0
ENDPROC(memcpy_retl_o2_plus_g1_plus_8)
ENTRY(memcpy_retl_o2_plus_o4)
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4)
ENTRY(memcpy_retl_o2_plus_o4_plus_8)
	add	%o4, 8, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_8)
ENTRY(memcpy_retl_o2_plus_o4_plus_16)
	add	%o4, 16, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_16)
ENTRY(memcpy_retl_o2_plus_o4_plus_24)
	add	%o4, 24, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_24)
ENTRY(memcpy_retl_o2_plus_o4_plus_32)
	add	%o4, 32, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_32)
ENTRY(memcpy_retl_o2_plus_o4_plus_40)
	add	%o4, 40, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_40)
ENTRY(memcpy_retl_o2_plus_o4_plus_48)
	add	%o4, 48, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_48)
ENTRY(memcpy_retl_o2_plus_o4_plus_56)
	add	%o4, 56, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_56)
ENTRY(memcpy_retl_o2_plus_o4_plus_64)
	add	%o4, 64, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_64)
ENTRY(memcpy_retl_o2_plus_o5_plus_64)
	add	%o5, 64, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_64)
ENTRY(memcpy_retl_o2_plus_o3_fp)
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o3, %o0
ENDPROC(memcpy_retl_o2_plus_o3_fp)
ENTRY(memcpy_retl_o2_plus_o3_plus_1_fp)
	add	%o3, 1, %o3
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o3, %o0
ENDPROC(memcpy_retl_o2_plus_o3_plus_1_fp)
ENTRY(memcpy_retl_o2_plus_o3_plus_4_fp)
	add	%o3, 4, %o3
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o3, %o0
ENDPROC(memcpy_retl_o2_plus_o3_plus_4_fp)
ENTRY(memcpy_retl_o2_plus_o4_fp)
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_fp)
ENTRY(memcpy_retl_o2_plus_o4_plus_8_fp)
	add	%o4, 8, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_8_fp)
ENTRY(memcpy_retl_o2_plus_o4_plus_16_fp)
	add	%o4, 16, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_16_fp)
ENTRY(memcpy_retl_o2_plus_o4_plus_24_fp)
	add	%o4, 24, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_24_fp)
ENTRY(memcpy_retl_o2_plus_o4_plus_32_fp)
	add	%o4, 32, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_32_fp)
ENTRY(memcpy_retl_o2_plus_o4_plus_40_fp)
	add	%o4, 40, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_40_fp)
ENTRY(memcpy_retl_o2_plus_o4_plus_48_fp)
	add	%o4, 48, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_48_fp)
ENTRY(memcpy_retl_o2_plus_o4_plus_56_fp)
	add	%o4, 56, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_56_fp)
ENTRY(memcpy_retl_o2_plus_o4_plus_64_fp)
	add	%o4, 64, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(memcpy_retl_o2_plus_o4_plus_64_fp)
ENTRY(memcpy_retl_o2_plus_o5_fp)
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_fp)
ENTRY(memcpy_retl_o2_plus_o5_plus_64_fp)
	add	%o5, 64, %o5
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_64_fp)
ENTRY(memcpy_retl_o2_plus_o5_plus_56_fp)
	add	%o5, 56, %o5
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_56_fp)
ENTRY(memcpy_retl_o2_plus_o5_plus_48_fp)
	add	%o5, 48, %o5
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_48_fp)
ENTRY(memcpy_retl_o2_plus_o5_plus_40_fp)
	add	%o5, 40, %o5
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_40_fp)
ENTRY(memcpy_retl_o2_plus_o5_plus_32_fp)
	add	%o5, 32, %o5
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_32_fp)
ENTRY(memcpy_retl_o2_plus_o5_plus_24_fp)
	add	%o5, 24, %o5
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_24_fp)
ENTRY(memcpy_retl_o2_plus_o5_plus_16_fp)
	add	%o5, 16, %o5
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_16_fp)
ENTRY(memcpy_retl_o2_plus_o5_plus_8_fp)
	add	%o5, 8, %o5
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o5, %o0
ENDPROC(memcpy_retl_o2_plus_o5_plus_8_fp)

#endif
+64 -213
arch/sparc/lib/NG4memcpy.S
··· 94 94 .text 95 95 #ifndef EX_RETVAL 96 96 #define EX_RETVAL(x) x 97 - __restore_asi_fp: 98 - VISExitHalf 99 - __restore_asi: 100 - retl 101 - wr %g0, ASI_AIUS, %asi 102 - 103 - ENTRY(NG4_retl_o2) 104 - ba,pt %xcc, __restore_asi 105 - mov %o2, %o0 106 - ENDPROC(NG4_retl_o2) 107 - ENTRY(NG4_retl_o2_plus_1) 108 - ba,pt %xcc, __restore_asi 109 - add %o2, 1, %o0 110 - ENDPROC(NG4_retl_o2_plus_1) 111 - ENTRY(NG4_retl_o2_plus_4) 112 - ba,pt %xcc, __restore_asi 113 - add %o2, 4, %o0 114 - ENDPROC(NG4_retl_o2_plus_4) 115 - ENTRY(NG4_retl_o2_plus_o5) 116 - ba,pt %xcc, __restore_asi 117 - add %o2, %o5, %o0 118 - ENDPROC(NG4_retl_o2_plus_o5) 119 - ENTRY(NG4_retl_o2_plus_o5_plus_4) 120 - add %o5, 4, %o5 121 - ba,pt %xcc, __restore_asi 122 - add %o2, %o5, %o0 123 - ENDPROC(NG4_retl_o2_plus_o5_plus_4) 124 - ENTRY(NG4_retl_o2_plus_o5_plus_8) 125 - add %o5, 8, %o5 126 - ba,pt %xcc, __restore_asi 127 - add %o2, %o5, %o0 128 - ENDPROC(NG4_retl_o2_plus_o5_plus_8) 129 - ENTRY(NG4_retl_o2_plus_o5_plus_16) 130 - add %o5, 16, %o5 131 - ba,pt %xcc, __restore_asi 132 - add %o2, %o5, %o0 133 - ENDPROC(NG4_retl_o2_plus_o5_plus_16) 134 - ENTRY(NG4_retl_o2_plus_o5_plus_24) 135 - add %o5, 24, %o5 136 - ba,pt %xcc, __restore_asi 137 - add %o2, %o5, %o0 138 - ENDPROC(NG4_retl_o2_plus_o5_plus_24) 139 - ENTRY(NG4_retl_o2_plus_o5_plus_32) 140 - add %o5, 32, %o5 141 - ba,pt %xcc, __restore_asi 142 - add %o2, %o5, %o0 143 - ENDPROC(NG4_retl_o2_plus_o5_plus_32) 144 - ENTRY(NG4_retl_o2_plus_g1) 145 - ba,pt %xcc, __restore_asi 146 - add %o2, %g1, %o0 147 - ENDPROC(NG4_retl_o2_plus_g1) 148 - ENTRY(NG4_retl_o2_plus_g1_plus_1) 149 - add %g1, 1, %g1 150 - ba,pt %xcc, __restore_asi 151 - add %o2, %g1, %o0 152 - ENDPROC(NG4_retl_o2_plus_g1_plus_1) 153 - ENTRY(NG4_retl_o2_plus_g1_plus_8) 154 - add %g1, 8, %g1 155 - ba,pt %xcc, __restore_asi 156 - add %o2, %g1, %o0 157 - ENDPROC(NG4_retl_o2_plus_g1_plus_8) 158 - ENTRY(NG4_retl_o2_plus_o4) 159 - ba,pt %xcc, __restore_asi 160 - add %o2, %o4, %o0 161 - 
ENDPROC(NG4_retl_o2_plus_o4) 162 - ENTRY(NG4_retl_o2_plus_o4_plus_8) 163 - add %o4, 8, %o4 164 - ba,pt %xcc, __restore_asi 165 - add %o2, %o4, %o0 166 - ENDPROC(NG4_retl_o2_plus_o4_plus_8) 167 - ENTRY(NG4_retl_o2_plus_o4_plus_16) 168 - add %o4, 16, %o4 169 - ba,pt %xcc, __restore_asi 170 - add %o2, %o4, %o0 171 - ENDPROC(NG4_retl_o2_plus_o4_plus_16) 172 - ENTRY(NG4_retl_o2_plus_o4_plus_24) 173 - add %o4, 24, %o4 174 - ba,pt %xcc, __restore_asi 175 - add %o2, %o4, %o0 176 - ENDPROC(NG4_retl_o2_plus_o4_plus_24) 177 - ENTRY(NG4_retl_o2_plus_o4_plus_32) 178 - add %o4, 32, %o4 179 - ba,pt %xcc, __restore_asi 180 - add %o2, %o4, %o0 181 - ENDPROC(NG4_retl_o2_plus_o4_plus_32) 182 - ENTRY(NG4_retl_o2_plus_o4_plus_40) 183 - add %o4, 40, %o4 184 - ba,pt %xcc, __restore_asi 185 - add %o2, %o4, %o0 186 - ENDPROC(NG4_retl_o2_plus_o4_plus_40) 187 - ENTRY(NG4_retl_o2_plus_o4_plus_48) 188 - add %o4, 48, %o4 189 - ba,pt %xcc, __restore_asi 190 - add %o2, %o4, %o0 191 - ENDPROC(NG4_retl_o2_plus_o4_plus_48) 192 - ENTRY(NG4_retl_o2_plus_o4_plus_56) 193 - add %o4, 56, %o4 194 - ba,pt %xcc, __restore_asi 195 - add %o2, %o4, %o0 196 - ENDPROC(NG4_retl_o2_plus_o4_plus_56) 197 - ENTRY(NG4_retl_o2_plus_o4_plus_64) 198 - add %o4, 64, %o4 199 - ba,pt %xcc, __restore_asi 200 - add %o2, %o4, %o0 201 - ENDPROC(NG4_retl_o2_plus_o4_plus_64) 202 - ENTRY(NG4_retl_o2_plus_o4_fp) 203 - ba,pt %xcc, __restore_asi_fp 204 - add %o2, %o4, %o0 205 - ENDPROC(NG4_retl_o2_plus_o4_fp) 206 - ENTRY(NG4_retl_o2_plus_o4_plus_8_fp) 207 - add %o4, 8, %o4 208 - ba,pt %xcc, __restore_asi_fp 209 - add %o2, %o4, %o0 210 - ENDPROC(NG4_retl_o2_plus_o4_plus_8_fp) 211 - ENTRY(NG4_retl_o2_plus_o4_plus_16_fp) 212 - add %o4, 16, %o4 213 - ba,pt %xcc, __restore_asi_fp 214 - add %o2, %o4, %o0 215 - ENDPROC(NG4_retl_o2_plus_o4_plus_16_fp) 216 - ENTRY(NG4_retl_o2_plus_o4_plus_24_fp) 217 - add %o4, 24, %o4 218 - ba,pt %xcc, __restore_asi_fp 219 - add %o2, %o4, %o0 220 - ENDPROC(NG4_retl_o2_plus_o4_plus_24_fp) 221 - 
ENTRY(NG4_retl_o2_plus_o4_plus_32_fp) 222 - add %o4, 32, %o4 223 - ba,pt %xcc, __restore_asi_fp 224 - add %o2, %o4, %o0 225 - ENDPROC(NG4_retl_o2_plus_o4_plus_32_fp) 226 - ENTRY(NG4_retl_o2_plus_o4_plus_40_fp) 227 - add %o4, 40, %o4 228 - ba,pt %xcc, __restore_asi_fp 229 - add %o2, %o4, %o0 230 - ENDPROC(NG4_retl_o2_plus_o4_plus_40_fp) 231 - ENTRY(NG4_retl_o2_plus_o4_plus_48_fp) 232 - add %o4, 48, %o4 233 - ba,pt %xcc, __restore_asi_fp 234 - add %o2, %o4, %o0 235 - ENDPROC(NG4_retl_o2_plus_o4_plus_48_fp) 236 - ENTRY(NG4_retl_o2_plus_o4_plus_56_fp) 237 - add %o4, 56, %o4 238 - ba,pt %xcc, __restore_asi_fp 239 - add %o2, %o4, %o0 240 - ENDPROC(NG4_retl_o2_plus_o4_plus_56_fp) 241 - ENTRY(NG4_retl_o2_plus_o4_plus_64_fp) 242 - add %o4, 64, %o4 243 - ba,pt %xcc, __restore_asi_fp 244 - add %o2, %o4, %o0 245 - ENDPROC(NG4_retl_o2_plus_o4_plus_64_fp) 246 97 #endif 247 98 .align 64 248 99 ··· 126 275 sub %o2, %g1, %o2 127 276 128 277 129 - 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1) 278 + 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1) 130 279 add %o1, 1, %o1 131 280 subcc %g1, 1, %g1 132 281 add %o0, 1, %o0 133 282 bne,pt %icc, 1b 134 - EX_ST(STORE(stb, %g2, %o0 - 0x01), NG4_retl_o2_plus_g1_plus_1) 283 + EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1) 135 284 136 285 51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong) 137 286 LOAD(prefetch, %o1 + 0x080, #n_reads_strong) ··· 156 305 brz,pt %g1, .Llarge_aligned 157 306 sub %o2, %g1, %o2 158 307 159 - 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1) 308 + 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1) 160 309 add %o1, 8, %o1 161 310 subcc %g1, 8, %g1 162 311 add %o0, 8, %o0 163 312 bne,pt %icc, 1b 164 - EX_ST(STORE(stx, %g2, %o0 - 0x08), NG4_retl_o2_plus_g1_plus_8) 313 + EX_ST(STORE(stx, %g2, %o0 - 0x08), memcpy_retl_o2_plus_g1_plus_8) 165 314 166 315 .Llarge_aligned: 167 316 /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */ 168 317 andn %o2, 0x3f, 
%o4 169 318 sub %o2, %o4, %o2 170 319 171 - 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o4) 320 + 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o4) 172 321 add %o1, 0x40, %o1 173 - EX_LD(LOAD(ldx, %o1 - 0x38, %g2), NG4_retl_o2_plus_o4) 322 + EX_LD(LOAD(ldx, %o1 - 0x38, %g2), memcpy_retl_o2_plus_o4) 174 323 subcc %o4, 0x40, %o4 175 - EX_LD(LOAD(ldx, %o1 - 0x30, %g3), NG4_retl_o2_plus_o4_plus_64) 176 - EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE), NG4_retl_o2_plus_o4_plus_64) 177 - EX_LD(LOAD(ldx, %o1 - 0x20, %o5), NG4_retl_o2_plus_o4_plus_64) 178 - EX_ST(STORE_INIT(%g1, %o0), NG4_retl_o2_plus_o4_plus_64) 324 + EX_LD(LOAD(ldx, %o1 - 0x30, %g3), memcpy_retl_o2_plus_o4_plus_64) 325 + EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE), memcpy_retl_o2_plus_o4_plus_64) 326 + EX_LD(LOAD(ldx, %o1 - 0x20, %o5), memcpy_retl_o2_plus_o4_plus_64) 327 + EX_ST(STORE_INIT(%g1, %o0), memcpy_retl_o2_plus_o4_plus_64) 179 328 add %o0, 0x08, %o0 180 - EX_ST(STORE_INIT(%g2, %o0), NG4_retl_o2_plus_o4_plus_56) 329 + EX_ST(STORE_INIT(%g2, %o0), memcpy_retl_o2_plus_o4_plus_56) 181 330 add %o0, 0x08, %o0 182 - EX_LD(LOAD(ldx, %o1 - 0x18, %g2), NG4_retl_o2_plus_o4_plus_48) 183 - EX_ST(STORE_INIT(%g3, %o0), NG4_retl_o2_plus_o4_plus_48) 331 + EX_LD(LOAD(ldx, %o1 - 0x18, %g2), memcpy_retl_o2_plus_o4_plus_48) 332 + EX_ST(STORE_INIT(%g3, %o0), memcpy_retl_o2_plus_o4_plus_48) 184 333 add %o0, 0x08, %o0 185 - EX_LD(LOAD(ldx, %o1 - 0x10, %g3), NG4_retl_o2_plus_o4_plus_40) 186 - EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), NG4_retl_o2_plus_o4_plus_40) 334 + EX_LD(LOAD(ldx, %o1 - 0x10, %g3), memcpy_retl_o2_plus_o4_plus_40) 335 + EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), memcpy_retl_o2_plus_o4_plus_40) 187 336 add %o0, 0x08, %o0 188 - EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE), NG4_retl_o2_plus_o4_plus_32) 189 - EX_ST(STORE_INIT(%o5, %o0), NG4_retl_o2_plus_o4_plus_32) 337 + EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE), memcpy_retl_o2_plus_o4_plus_32) 338 + EX_ST(STORE_INIT(%o5, %o0), 
memcpy_retl_o2_plus_o4_plus_32) 190 339 add %o0, 0x08, %o0 191 - EX_ST(STORE_INIT(%g2, %o0), NG4_retl_o2_plus_o4_plus_24) 340 + EX_ST(STORE_INIT(%g2, %o0), memcpy_retl_o2_plus_o4_plus_24) 192 341 add %o0, 0x08, %o0 193 - EX_ST(STORE_INIT(%g3, %o0), NG4_retl_o2_plus_o4_plus_16) 342 + EX_ST(STORE_INIT(%g3, %o0), memcpy_retl_o2_plus_o4_plus_16) 194 343 add %o0, 0x08, %o0 195 - EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), NG4_retl_o2_plus_o4_plus_8) 344 + EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), memcpy_retl_o2_plus_o4_plus_8) 196 345 add %o0, 0x08, %o0 197 346 bne,pt %icc, 1b 198 347 LOAD(prefetch, %o1 + 0x200, #n_reads_strong) ··· 218 367 sub %o2, %o4, %o2 219 368 alignaddr %o1, %g0, %g1 220 369 add %o1, %o4, %o1 221 - EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0), NG4_retl_o2_plus_o4) 222 - 1: EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2), NG4_retl_o2_plus_o4) 370 + EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0), memcpy_retl_o2_plus_o4) 371 + 1: EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2), memcpy_retl_o2_plus_o4) 223 372 subcc %o4, 0x40, %o4 224 - EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4), NG4_retl_o2_plus_o4_plus_64) 225 - EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6), NG4_retl_o2_plus_o4_plus_64) 226 - EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8), NG4_retl_o2_plus_o4_plus_64) 227 - EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10), NG4_retl_o2_plus_o4_plus_64) 228 - EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12), NG4_retl_o2_plus_o4_plus_64) 229 - EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14), NG4_retl_o2_plus_o4_plus_64) 373 + EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4), memcpy_retl_o2_plus_o4_plus_64) 374 + EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6), memcpy_retl_o2_plus_o4_plus_64) 375 + EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8), memcpy_retl_o2_plus_o4_plus_64) 376 + EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10), memcpy_retl_o2_plus_o4_plus_64) 377 + EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12), memcpy_retl_o2_plus_o4_plus_64) 378 + EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14), memcpy_retl_o2_plus_o4_plus_64) 230 379 faligndata %f0, %f2, %f16 231 - EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0), NG4_retl_o2_plus_o4_plus_64) 
380 + EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0), memcpy_retl_o2_plus_o4_plus_64) 232 381 faligndata %f2, %f4, %f18 233 382 add %g1, 0x40, %g1 234 383 faligndata %f4, %f6, %f20 ··· 237 386 faligndata %f10, %f12, %f26 238 387 faligndata %f12, %f14, %f28 239 388 faligndata %f14, %f0, %f30 240 - EX_ST_FP(STORE(std, %f16, %o0 + 0x00), NG4_retl_o2_plus_o4_plus_64) 241 - EX_ST_FP(STORE(std, %f18, %o0 + 0x08), NG4_retl_o2_plus_o4_plus_56) 242 - EX_ST_FP(STORE(std, %f20, %o0 + 0x10), NG4_retl_o2_plus_o4_plus_48) 243 - EX_ST_FP(STORE(std, %f22, %o0 + 0x18), NG4_retl_o2_plus_o4_plus_40) 244 - EX_ST_FP(STORE(std, %f24, %o0 + 0x20), NG4_retl_o2_plus_o4_plus_32) 245 - EX_ST_FP(STORE(std, %f26, %o0 + 0x28), NG4_retl_o2_plus_o4_plus_24) 246 - EX_ST_FP(STORE(std, %f28, %o0 + 0x30), NG4_retl_o2_plus_o4_plus_16) 247 - EX_ST_FP(STORE(std, %f30, %o0 + 0x38), NG4_retl_o2_plus_o4_plus_8) 389 + EX_ST_FP(STORE(std, %f16, %o0 + 0x00), memcpy_retl_o2_plus_o4_plus_64) 390 + EX_ST_FP(STORE(std, %f18, %o0 + 0x08), memcpy_retl_o2_plus_o4_plus_56) 391 + EX_ST_FP(STORE(std, %f20, %o0 + 0x10), memcpy_retl_o2_plus_o4_plus_48) 392 + EX_ST_FP(STORE(std, %f22, %o0 + 0x18), memcpy_retl_o2_plus_o4_plus_40) 393 + EX_ST_FP(STORE(std, %f24, %o0 + 0x20), memcpy_retl_o2_plus_o4_plus_32) 394 + EX_ST_FP(STORE(std, %f26, %o0 + 0x28), memcpy_retl_o2_plus_o4_plus_24) 395 + EX_ST_FP(STORE(std, %f28, %o0 + 0x30), memcpy_retl_o2_plus_o4_plus_16) 396 + EX_ST_FP(STORE(std, %f30, %o0 + 0x38), memcpy_retl_o2_plus_o4_plus_8) 248 397 add %o0, 0x40, %o0 249 398 bne,pt %icc, 1b 250 399 LOAD(prefetch, %g1 + 0x200, #n_reads_strong) ··· 272 421 andncc %o2, 0x20 - 1, %o5 273 422 be,pn %icc, 2f 274 423 sub %o2, %o5, %o2 275 - 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5) 276 - EX_LD(LOAD(ldx, %o1 + 0x08, %g2), NG4_retl_o2_plus_o5) 277 - EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE), NG4_retl_o2_plus_o5) 278 - EX_LD(LOAD(ldx, %o1 + 0x18, %o4), NG4_retl_o2_plus_o5) 424 + 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), 
memcpy_retl_o2_plus_o5) 425 + EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5) 426 + EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE), memcpy_retl_o2_plus_o5) 427 + EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5) 279 428 add %o1, 0x20, %o1 280 429 subcc %o5, 0x20, %o5 281 - EX_ST(STORE(stx, %g1, %o0 + 0x00), NG4_retl_o2_plus_o5_plus_32) 282 - EX_ST(STORE(stx, %g2, %o0 + 0x08), NG4_retl_o2_plus_o5_plus_24) 283 - EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10), NG4_retl_o2_plus_o5_plus_24) 284 - EX_ST(STORE(stx, %o4, %o0 + 0x18), NG4_retl_o2_plus_o5_plus_8) 430 + EX_ST(STORE(stx, %g1, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32) 431 + EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24) 432 + EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24) 433 + EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8) 285 434 bne,pt %icc, 1b 286 435 add %o0, 0x20, %o0 287 436 2: andcc %o2, 0x18, %o5 288 437 be,pt %icc, 3f 289 438 sub %o2, %o5, %o2 290 439 291 - 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5) 440 + 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o5) 292 441 add %o1, 0x08, %o1 293 442 add %o0, 0x08, %o0 294 443 subcc %o5, 0x08, %o5 295 444 bne,pt %icc, 1b 296 - EX_ST(STORE(stx, %g1, %o0 - 0x08), NG4_retl_o2_plus_o5_plus_8) 445 + EX_ST(STORE(stx, %g1, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8) 297 446 3: brz,pt %o2, .Lexit 298 447 cmp %o2, 0x04 299 448 bl,pn %icc, .Ltiny 300 449 nop 301 - EX_LD(LOAD(lduw, %o1 + 0x00, %g1), NG4_retl_o2) 450 + EX_LD(LOAD(lduw, %o1 + 0x00, %g1), memcpy_retl_o2) 302 451 add %o1, 0x04, %o1 303 452 add %o0, 0x04, %o0 304 453 subcc %o2, 0x04, %o2 305 454 bne,pn %icc, .Ltiny 306 - EX_ST(STORE(stw, %g1, %o0 - 0x04), NG4_retl_o2_plus_4) 455 + EX_ST(STORE(stw, %g1, %o0 - 0x04), memcpy_retl_o2_plus_4) 307 456 ba,a,pt %icc, .Lexit 308 457 .Lmedium_unaligned: 309 458 /* First get dest 8 byte aligned. 
*/ ··· 312 461 brz,pt %g1, 2f 313 462 sub %o2, %g1, %o2 314 463 315 - 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1) 464 + 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1) 316 465 add %o1, 1, %o1 317 466 subcc %g1, 1, %g1 318 467 add %o0, 1, %o0 319 468 bne,pt %icc, 1b 320 - EX_ST(STORE(stb, %g2, %o0 - 0x01), NG4_retl_o2_plus_g1_plus_1) 469 + EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1) 321 470 2: 322 471 and %o1, 0x7, %g1 323 472 brz,pn %g1, .Lmedium_noprefetch ··· 325 474 mov 64, %g2 326 475 sub %g2, %g1, %g2 327 476 andn %o1, 0x7, %o1 328 - EX_LD(LOAD(ldx, %o1 + 0x00, %o4), NG4_retl_o2) 477 + EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2) 329 478 sllx %o4, %g1, %o4 330 479 andn %o2, 0x08 - 1, %o5 331 480 sub %o2, %o5, %o2 332 - 1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3), NG4_retl_o2_plus_o5) 481 + 1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5) 333 482 add %o1, 0x08, %o1 334 483 subcc %o5, 0x08, %o5 335 484 srlx %g3, %g2, GLOBAL_SPARE 336 485 or GLOBAL_SPARE, %o4, GLOBAL_SPARE 337 - EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00), NG4_retl_o2_plus_o5_plus_8) 486 + EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8) 338 487 add %o0, 0x08, %o0 339 488 bne,pt %icc, 1b 340 489 sllx %g3, %g1, %o4 ··· 345 494 ba,pt %icc, .Lsmall_unaligned 346 495 347 496 .Ltiny: 348 - EX_LD(LOAD(ldub, %o1 + 0x00, %g1), NG4_retl_o2) 497 + EX_LD(LOAD(ldub, %o1 + 0x00, %g1), memcpy_retl_o2) 349 498 subcc %o2, 1, %o2 350 499 be,pn %icc, .Lexit 351 - EX_ST(STORE(stb, %g1, %o0 + 0x00), NG4_retl_o2_plus_1) 352 - EX_LD(LOAD(ldub, %o1 + 0x01, %g1), NG4_retl_o2) 500 + EX_ST(STORE(stb, %g1, %o0 + 0x00), memcpy_retl_o2_plus_1) 501 + EX_LD(LOAD(ldub, %o1 + 0x01, %g1), memcpy_retl_o2) 353 502 subcc %o2, 1, %o2 354 503 be,pn %icc, .Lexit 355 - EX_ST(STORE(stb, %g1, %o0 + 0x01), NG4_retl_o2_plus_1) 356 - EX_LD(LOAD(ldub, %o1 + 0x02, %g1), NG4_retl_o2) 504 + EX_ST(STORE(stb, %g1, %o0 + 0x01), memcpy_retl_o2_plus_1) 505 + 
EX_LD(LOAD(ldub, %o1 + 0x02, %g1), memcpy_retl_o2) 357 506 ba,pt %icc, .Lexit 358 - EX_ST(STORE(stb, %g1, %o0 + 0x02), NG4_retl_o2) 507 + EX_ST(STORE(stb, %g1, %o0 + 0x02), memcpy_retl_o2) 359 508 360 509 .Lsmall: 361 510 andcc %g2, 0x3, %g0 ··· 363 512 andn %o2, 0x4 - 1, %o5 364 513 sub %o2, %o5, %o2 365 514 1: 366 - EX_LD(LOAD(lduw, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5) 515 + EX_LD(LOAD(lduw, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o5) 367 516 add %o1, 0x04, %o1 368 517 subcc %o5, 0x04, %o5 369 518 add %o0, 0x04, %o0 370 519 bne,pt %icc, 1b 371 - EX_ST(STORE(stw, %g1, %o0 - 0x04), NG4_retl_o2_plus_o5_plus_4) 520 + EX_ST(STORE(stw, %g1, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4) 372 521 brz,pt %o2, .Lexit 373 522 nop 374 523 ba,a,pt %icc, .Ltiny 375 524 376 525 .Lsmall_unaligned: 377 - 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1), NG4_retl_o2) 526 + 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1), memcpy_retl_o2) 378 527 add %o1, 1, %o1 379 528 add %o0, 1, %o0 380 529 subcc %o2, 1, %o2 381 530 bne,pt %icc, 1b 382 - EX_ST(STORE(stb, %g1, %o0 - 0x01), NG4_retl_o2_plus_1) 531 + EX_ST(STORE(stb, %g1, %o0 - 0x01), memcpy_retl_o2_plus_1) 383 532 ba,a,pt %icc, .Lexit 384 533 nop 385 534 .size FUNC_NAME, .-FUNC_NAME