Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arch/sparc: Optimized memcpy, memset, copy_to_user, copy_from_user for M7/M8

New algorithm that takes advantage of the M7/M8 block init store
ASI, i.e., overlapping pipelines and miss buffer filling.
Full details in code comments.

Signed-off-by: Babu Moger <babu.moger@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Babu Moger and committed by David S. Miller
b3a04ed5 1ab32693

+1435 -2
+14 -2
arch/sparc/kernel/head_64.S
··· 603 603 be,pt %xcc, niagara4_patch 604 604 nop 605 605 cmp %g1, SUN4V_CHIP_SPARC_M7 606 - be,pt %xcc, niagara4_patch 606 + be,pt %xcc, sparc_m7_patch 607 607 nop 608 608 cmp %g1, SUN4V_CHIP_SPARC_M8 609 - be,pt %xcc, niagara4_patch 609 + be,pt %xcc, sparc_m7_patch 610 610 nop 611 611 cmp %g1, SUN4V_CHIP_SPARC_SN 612 612 be,pt %xcc, niagara4_patch ··· 621 621 622 622 ba,a,pt %xcc, 80f 623 623 nop 624 + 625 + sparc_m7_patch: 626 + call m7_patch_copyops 627 + nop 628 + call m7_patch_bzero 629 + nop 630 + call m7_patch_pageops 631 + nop 632 + 633 + ba,a,pt %xcc, 80f 634 + nop 635 + 624 636 niagara4_patch: 625 637 call niagara4_patch_copyops 626 638 nop
+41
arch/sparc/lib/M7copy_from_user.S
··· 1 + /* 2 + * M7copy_from_user.S: SPARC M7 optimized copy from userspace. 3 + * 4 + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. 5 + */ 6 + 7 + 8 + #define EX_LD(x) \ 9 + 98: x; \ 10 + .section __ex_table,"a"; \ 11 + .align 4; \ 12 + .word 98b, __restore_asi; \ 13 + .text; \ 14 + .align 4; 15 + 16 + #define EX_LD_FP(x) \ 17 + 98: x; \ 18 + .section __ex_table,"a"; \ 19 + .align 4; \ 20 + .word 98b, __restore_asi_fp; \ 21 + .text; \ 22 + .align 4; 23 + 24 + 25 + #ifndef ASI_AIUS 26 + #define ASI_AIUS 0x11 27 + #endif 28 + 29 + #define FUNC_NAME M7copy_from_user 30 + #define LOAD(type,addr,dest) type##a [addr] %asi, dest 31 + #define EX_RETVAL(x) 0 32 + 33 + #ifdef __KERNEL__ 34 + #define PREAMBLE \ 35 + rd %asi, %g1; \ 36 + cmp %g1, ASI_AIUS; \ 37 + bne,pn %icc, raw_copy_in_user; \ 38 + nop 39 + #endif 40 + 41 + #include "M7memcpy.S"
+51
arch/sparc/lib/M7copy_to_user.S
··· 1 + /* 2 + * M7copy_to_user.S: SPARC M7 optimized copy to userspace. 3 + * 4 + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. 5 + */ 6 + 7 + 8 + #define EX_ST(x) \ 9 + 98: x; \ 10 + .section __ex_table,"a"; \ 11 + .align 4; \ 12 + .word 98b, __restore_asi; \ 13 + .text; \ 14 + .align 4; 15 + 16 + #define EX_ST_FP(x) \ 17 + 98: x; \ 18 + .section __ex_table,"a"; \ 19 + .align 4; \ 20 + .word 98b, __restore_asi_fp; \ 21 + .text; \ 22 + .align 4; 23 + 24 + 25 + #ifndef ASI_AIUS 26 + #define ASI_AIUS 0x11 27 + #endif 28 + 29 + #ifndef ASI_BLK_INIT_QUAD_LDD_AIUS 30 + #define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23 31 + #endif 32 + 33 + #define FUNC_NAME M7copy_to_user 34 + #define STORE(type,src,addr) type##a src, [addr] %asi 35 + #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS 36 + #define STORE_MRU_ASI ASI_ST_BLKINIT_MRU_S 37 + #define EX_RETVAL(x) 0 38 + 39 + #ifdef __KERNEL__ 40 + /* Writing to %asi is _expensive_ so we hardcode it. 41 + * Reading %asi to check for KERNEL_DS is comparatively 42 + * cheap. 43 + */ 44 + #define PREAMBLE \ 45 + rd %asi, %g1; \ 46 + cmp %g1, ASI_AIUS; \ 47 + bne,pn %icc, raw_copy_in_user; \ 48 + nop 49 + #endif 50 + 51 + #include "M7memcpy.S"
+923
arch/sparc/lib/M7memcpy.S
··· 1 + /* 2 + * M7memcpy: Optimized SPARC M7 memcpy 3 + * 4 + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. 5 + */ 6 + 7 + .file "M7memcpy.S" 8 + 9 + /* 10 + * memcpy(s1, s2, len) 11 + * 12 + * Copy s2 to s1, always copy n bytes. 13 + * Note: this C code does not work for overlapped copies. 14 + * 15 + * Fast assembler language version of the following C-program for memcpy 16 + * which represents the `standard' for the C-library. 17 + * 18 + * void * 19 + * memcpy(void *s, const void *s0, size_t n) 20 + * { 21 + * if (n != 0) { 22 + * char *s1 = s; 23 + * const char *s2 = s0; 24 + * do { 25 + * *s1++ = *s2++; 26 + * } while (--n != 0); 27 + * } 28 + * return (s); 29 + * } 30 + * 31 + * 32 + * SPARC T7/M7 Flow : 33 + * 34 + * if (count < SMALL_MAX) { 35 + * if count < SHORTCOPY (SHORTCOPY=3) 36 + * copy bytes; exit with dst addr 37 + * if src & dst aligned on word boundary but not long word boundary, 38 + * copy with ldw/stw; branch to finish_up 39 + * if src & dst aligned on long word boundary 40 + * copy with ldx/stx; branch to finish_up 41 + * if src & dst not aligned and length <= SHORTCHECK (SHORTCHECK=14) 42 + * copy bytes; exit with dst addr 43 + * move enough bytes to get src to word boundary 44 + * if dst now on word boundary 45 + * move_words: 46 + * copy words; branch to finish_up 47 + * if dst now on half word boundary 48 + * load words, shift half words, store words; branch to finish_up 49 + * if dst on byte 1 50 + * load words, shift 3 bytes, store words; branch to finish_up 51 + * if dst on byte 3 52 + * load words, shift 1 byte, store words; branch to finish_up 53 + * finish_up: 54 + * copy bytes; exit with dst addr 55 + * } else { More than SMALL_MAX bytes 56 + * move bytes until dst is on long word boundary 57 + * if( src is on long word boundary ) { 58 + * if (count < MED_MAX) { 59 + * finish_long: src/dst aligned on 8 bytes 60 + * copy with ldx/stx in 8-way unrolled loop; 61 + * copy final 0-63 bytes; exit with dst 
addr 62 + * } else { src/dst aligned; count > MED_MAX 63 + * align dst on 64 byte boundary; for main data movement: 64 + * prefetch src data to L2 cache; let HW prefetch move data to L1 cache 65 + * Use BIS (block initializing store) to avoid copying store cache 66 + * lines from memory. But pre-store first element of each cache line 67 + * ST_CHUNK lines in advance of the rest of that cache line. That 68 + * gives time for replacement cache lines to be written back without 69 + * excess STQ and Miss Buffer filling. Repeat until near the end, 70 + * then finish up storing before going to finish_long. 71 + * } 72 + * } else { src/dst not aligned on 8 bytes 73 + * if src is word aligned and count < MED_WMAX 74 + * move words in 8-way unrolled loop 75 + * move final 0-31 bytes; exit with dst addr 76 + * if count < MED_UMAX 77 + * use alignaddr/faligndata combined with ldd/std in 8-way 78 + * unrolled loop to move data. 79 + * go to unalign_done 80 + * else 81 + * setup alignaddr for faligndata instructions 82 + * align dst on 64 byte boundary; prefetch src data to L1 cache 83 + * loadx8, falign, block-store, prefetch loop 84 + * (only use block-init-store when src/dst on 8 byte boundaries.) 85 + * unalign_done: 86 + * move remaining bytes for unaligned cases. exit with dst addr. 
87 + * } 88 + * 89 + */ 90 + 91 + #include <asm/visasm.h> 92 + #include <asm/asi.h> 93 + 94 + #if !defined(EX_LD) && !defined(EX_ST) 95 + #define NON_USER_COPY 96 + #endif 97 + 98 + #ifndef EX_LD 99 + #define EX_LD(x) x 100 + #endif 101 + #ifndef EX_LD_FP 102 + #define EX_LD_FP(x) x 103 + #endif 104 + 105 + #ifndef EX_ST 106 + #define EX_ST(x) x 107 + #endif 108 + #ifndef EX_ST_FP 109 + #define EX_ST_FP(x) x 110 + #endif 111 + 112 + #ifndef EX_RETVAL 113 + #define EX_RETVAL(x) x 114 + #endif 115 + 116 + #ifndef LOAD 117 + #define LOAD(type,addr,dest) type [addr], dest 118 + #endif 119 + 120 + #ifndef STORE 121 + #define STORE(type,src,addr) type src, [addr] 122 + #endif 123 + 124 + /* 125 + * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache 126 + * line as "least recently used" which means if many threads are 127 + * active, it has a high probability of being pushed out of the cache 128 + * between the first initializing store and the final stores. 129 + * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which 130 + * marks the cache line as "most recently used" for all 131 + * but the last cache line 132 + */ 133 + #ifndef STORE_ASI 134 + #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA 135 + #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P 136 + #else 137 + #define STORE_ASI 0x80 /* ASI_P */ 138 + #endif 139 + #endif 140 + 141 + #ifndef STORE_MRU_ASI 142 + #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA 143 + #define STORE_MRU_ASI ASI_ST_BLKINIT_MRU_P 144 + #else 145 + #define STORE_MRU_ASI 0x80 /* ASI_P */ 146 + #endif 147 + #endif 148 + 149 + #ifndef STORE_INIT 150 + #define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI 151 + #endif 152 + 153 + #ifndef STORE_INIT_MRU 154 + #define STORE_INIT_MRU(src,addr) stxa src, [addr] STORE_MRU_ASI 155 + #endif 156 + 157 + #ifndef FUNC_NAME 158 + #define FUNC_NAME M7memcpy 159 + #endif 160 + 161 + #ifndef PREAMBLE 162 + #define PREAMBLE 163 + #endif 164 + 165 + #define BLOCK_SIZE 64 166 + #define SHORTCOPY 3 167 + #define 
SHORTCHECK 14 168 + #define SHORT_LONG 64 /* max copy for short longword-aligned case */ 169 + /* must be at least 64 */ 170 + #define SMALL_MAX 128 171 + #define MED_UMAX 1024 /* max copy for medium un-aligned case */ 172 + #define MED_WMAX 1024 /* max copy for medium word-aligned case */ 173 + #define MED_MAX 1024 /* max copy for medium longword-aligned case */ 174 + #define ST_CHUNK 24 /* ST_CHUNK - block of values for BIS Store */ 175 + #define ALIGN_PRE 24 /* distance for aligned prefetch loop */ 176 + 177 + .register %g2,#scratch 178 + 179 + .section ".text" 180 + .global FUNC_NAME 181 + .type FUNC_NAME, #function 182 + .align 16 183 + FUNC_NAME: 184 + srlx %o2, 31, %g2 185 + cmp %g2, 0 186 + tne %xcc, 5 187 + PREAMBLE 188 + mov %o0, %g1 ! save %o0 189 + brz,pn %o2, .Lsmallx 190 + cmp %o2, 3 191 + ble,pn %icc, .Ltiny_cp 192 + cmp %o2, 19 193 + ble,pn %icc, .Lsmall_cp 194 + or %o0, %o1, %g2 195 + cmp %o2, SMALL_MAX 196 + bl,pn %icc, .Lmedium_cp 197 + nop 198 + 199 + .Lmedium: 200 + neg %o0, %o5 201 + andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned 202 + brz,pt %o5, .Ldst_aligned_on_8 203 + 204 + ! %o5 has the bytes to be written in partial store. 205 + sub %o2, %o5, %o2 206 + sub %o1, %o0, %o1 ! %o1 gets the difference 207 + 7: ! dst aligning loop 208 + add %o1, %o0, %o4 209 + EX_LD(LOAD(ldub, %o4, %o4)) ! load one byte 210 + subcc %o5, 1, %o5 211 + EX_ST(STORE(stb, %o4, %o0)) 212 + bgu,pt %xcc, 7b 213 + add %o0, 1, %o0 ! advance dst 214 + add %o1, %o0, %o1 ! restore %o1 215 + .Ldst_aligned_on_8: 216 + andcc %o1, 7, %o5 217 + brnz,pt %o5, .Lsrc_dst_unaligned_on_8 218 + nop 219 + 220 + .Lsrc_dst_aligned_on_8: 221 + ! check if we are copying MED_MAX or more bytes 222 + set MED_MAX, %o3 223 + cmp %o2, %o3 ! 
limit to store buffer size 224 + bgu,pn %xcc, .Llarge_align8_copy 225 + nop 226 + 227 + /* 228 + * Special case for handling when src and dest are both long word aligned 229 + * and total data to move is less than MED_MAX bytes 230 + */ 231 + .Lmedlong: 232 + subcc %o2, 63, %o2 ! adjust length to allow cc test 233 + ble,pn %xcc, .Lmedl63 ! skip big loop if less than 64 bytes 234 + nop 235 + .Lmedl64: 236 + EX_LD(LOAD(ldx, %o1, %o4)) ! load 237 + subcc %o2, 64, %o2 ! decrement length count 238 + EX_ST(STORE(stx, %o4, %o0)) ! and store 239 + EX_LD(LOAD(ldx, %o1+8, %o3)) ! a block of 64 bytes 240 + EX_ST(STORE(stx, %o3, %o0+8)) 241 + EX_LD(LOAD(ldx, %o1+16, %o4)) 242 + EX_ST(STORE(stx, %o4, %o0+16)) 243 + EX_LD(LOAD(ldx, %o1+24, %o3)) 244 + EX_ST(STORE(stx, %o3, %o0+24)) 245 + EX_LD(LOAD(ldx, %o1+32, %o4)) ! load 246 + EX_ST(STORE(stx, %o4, %o0+32)) ! and store 247 + EX_LD(LOAD(ldx, %o1+40, %o3)) ! a block of 64 bytes 248 + add %o1, 64, %o1 ! increase src ptr by 64 249 + EX_ST(STORE(stx, %o3, %o0+40)) 250 + EX_LD(LOAD(ldx, %o1-16, %o4)) 251 + add %o0, 64, %o0 ! increase dst ptr by 64 252 + EX_ST(STORE(stx, %o4, %o0-16)) 253 + EX_LD(LOAD(ldx, %o1-8, %o3)) 254 + bgu,pt %xcc, .Lmedl64 ! repeat if at least 64 bytes left 255 + EX_ST(STORE(stx, %o3, %o0-8)) 256 + .Lmedl63: 257 + addcc %o2, 32, %o2 ! adjust remaining count 258 + ble,pt %xcc, .Lmedl31 ! to skip if 31 or fewer bytes left 259 + nop 260 + EX_LD(LOAD(ldx, %o1, %o4)) ! load 261 + sub %o2, 32, %o2 ! decrement length count 262 + EX_ST(STORE(stx, %o4, %o0)) ! and store 263 + EX_LD(LOAD(ldx, %o1+8, %o3)) ! a block of 32 bytes 264 + add %o1, 32, %o1 ! increase src ptr by 32 265 + EX_ST(STORE(stx, %o3, %o0+8)) 266 + EX_LD(LOAD(ldx, %o1-16, %o4)) 267 + add %o0, 32, %o0 ! increase dst ptr by 32 268 + EX_ST(STORE(stx, %o4, %o0-16)) 269 + EX_LD(LOAD(ldx, %o1-8, %o3)) 270 + EX_ST(STORE(stx, %o3, %o0-8)) 271 + .Lmedl31: 272 + addcc %o2, 16, %o2 ! adjust remaining count 273 + ble,pt %xcc, .Lmedl15 ! 
skip if 15 or fewer bytes left 274 + nop ! 275 + EX_LD(LOAD(ldx, %o1, %o4)) 276 + add %o1, 16, %o1 ! increase src ptr by 16 277 + EX_ST(STORE(stx, %o4, %o0)) 278 + sub %o2, 16, %o2 ! decrease count by 16 279 + EX_LD(LOAD(ldx, %o1-8, %o3)) 280 + add %o0, 16, %o0 ! increase dst ptr by 16 281 + EX_ST(STORE(stx, %o3, %o0-8)) 282 + .Lmedl15: 283 + addcc %o2, 15, %o2 ! restore count 284 + bz,pt %xcc, .Lsmallx ! exit if finished 285 + cmp %o2, 8 286 + blt,pt %xcc, .Lmedw7 ! skip if 7 or fewer bytes left 287 + tst %o2 288 + EX_LD(LOAD(ldx, %o1, %o4)) ! load 8 bytes 289 + add %o1, 8, %o1 ! increase src ptr by 8 290 + add %o0, 8, %o0 ! increase dst ptr by 8 291 + subcc %o2, 8, %o2 ! decrease count by 8 292 + bnz,pn %xcc, .Lmedw7 293 + EX_ST(STORE(stx, %o4, %o0-8)) ! and store 8 bytes 294 + retl 295 + mov EX_RETVAL(%g1), %o0 ! restore %o0 296 + 297 + .align 16 298 + .Lsrc_dst_unaligned_on_8: 299 + ! DST is 8-byte aligned, src is not 300 + 2: 301 + andcc %o1, 0x3, %o5 ! test word alignment 302 + bnz,pt %xcc, .Lunalignsetup ! branch to skip if not word aligned 303 + nop 304 + 305 + /* 306 + * Handle all cases where src and dest are aligned on word 307 + * boundaries. Use unrolled loops for better performance. 308 + * This option wins over standard large data move when 309 + * source and destination is in cache for.Lmedium 310 + * to short data moves. 311 + */ 312 + set MED_WMAX, %o3 313 + cmp %o2, %o3 ! limit to store buffer size 314 + bge,pt %xcc, .Lunalignrejoin ! otherwise rejoin main loop 315 + nop 316 + 317 + subcc %o2, 31, %o2 ! adjust length to allow cc test 318 + ! for end of loop 319 + ble,pt %xcc, .Lmedw31 ! skip big loop if less than 16 320 + .Lmedw32: 321 + EX_LD(LOAD(ld, %o1, %o4)) ! move a block of 32 bytes 322 + sllx %o4, 32, %o5 323 + EX_LD(LOAD(ld, %o1+4, %o4)) 324 + or %o4, %o5, %o5 325 + EX_ST(STORE(stx, %o5, %o0)) 326 + subcc %o2, 32, %o2 ! 
decrement length count 327 + EX_LD(LOAD(ld, %o1+8, %o4)) 328 + sllx %o4, 32, %o5 329 + EX_LD(LOAD(ld, %o1+12, %o4)) 330 + or %o4, %o5, %o5 331 + EX_ST(STORE(stx, %o5, %o0+8)) 332 + add %o1, 32, %o1 ! increase src ptr by 32 333 + EX_LD(LOAD(ld, %o1-16, %o4)) 334 + sllx %o4, 32, %o5 335 + EX_LD(LOAD(ld, %o1-12, %o4)) 336 + or %o4, %o5, %o5 337 + EX_ST(STORE(stx, %o5, %o0+16)) 338 + add %o0, 32, %o0 ! increase dst ptr by 32 339 + EX_LD(LOAD(ld, %o1-8, %o4)) 340 + sllx %o4, 32, %o5 341 + EX_LD(LOAD(ld, %o1-4, %o4)) 342 + or %o4, %o5, %o5 343 + bgu,pt %xcc, .Lmedw32 ! repeat if at least 32 bytes left 344 + EX_ST(STORE(stx, %o5, %o0-8)) 345 + .Lmedw31: 346 + addcc %o2, 31, %o2 ! restore count 347 + 348 + bz,pt %xcc, .Lsmallx ! exit if finished 349 + nop 350 + cmp %o2, 16 351 + blt,pt %xcc, .Lmedw15 352 + nop 353 + EX_LD(LOAD(ld, %o1, %o4)) ! move a block of 16 bytes 354 + sllx %o4, 32, %o5 355 + subcc %o2, 16, %o2 ! decrement length count 356 + EX_LD(LOAD(ld, %o1+4, %o4)) 357 + or %o4, %o5, %o5 358 + EX_ST(STORE(stx, %o5, %o0)) 359 + add %o1, 16, %o1 ! increase src ptr by 16 360 + EX_LD(LOAD(ld, %o1-8, %o4)) 361 + add %o0, 16, %o0 ! increase dst ptr by 16 362 + sllx %o4, 32, %o5 363 + EX_LD(LOAD(ld, %o1-4, %o4)) 364 + or %o4, %o5, %o5 365 + EX_ST(STORE(stx, %o5, %o0-8)) 366 + .Lmedw15: 367 + bz,pt %xcc, .Lsmallx ! exit if finished 368 + cmp %o2, 8 369 + blt,pn %xcc, .Lmedw7 ! skip if 7 or fewer bytes left 370 + tst %o2 371 + EX_LD(LOAD(ld, %o1, %o4)) ! load 4 bytes 372 + subcc %o2, 8, %o2 ! decrease count by 8 373 + EX_ST(STORE(stw, %o4, %o0)) ! and store 4 bytes 374 + add %o1, 8, %o1 ! increase src ptr by 8 375 + EX_LD(LOAD(ld, %o1-4, %o3)) ! load 4 bytes 376 + add %o0, 8, %o0 ! increase dst ptr by 8 377 + EX_ST(STORE(stw, %o3, %o0-4)) ! and store 4 bytes 378 + bz,pt %xcc, .Lsmallx ! exit if finished 379 + .Lmedw7: ! count is ge 1, less than 8 380 + cmp %o2, 4 ! check for 4 bytes left 381 + blt,pn %xcc, .Lsmallleft3 ! skip if 3 or fewer bytes left 382 + nop ! 
383 + EX_LD(LOAD(ld, %o1, %o4)) ! load 4 bytes 384 + add %o1, 4, %o1 ! increase src ptr by 4 385 + add %o0, 4, %o0 ! increase dst ptr by 4 386 + subcc %o2, 4, %o2 ! decrease count by 4 387 + bnz .Lsmallleft3 388 + EX_ST(STORE(stw, %o4, %o0-4))! and store 4 bytes 389 + retl 390 + mov EX_RETVAL(%g1), %o0 391 + 392 + .align 16 393 + .Llarge_align8_copy: ! Src and dst share 8 byte alignment 394 + ! align dst to 64 byte boundary 395 + andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned 396 + brz,pn %o3, .Laligned_to_64 397 + andcc %o0, 8, %o3 ! odd long words to move? 398 + brz,pt %o3, .Laligned_to_16 399 + nop 400 + EX_LD(LOAD(ldx, %o1, %o4)) 401 + sub %o2, 8, %o2 402 + add %o1, 8, %o1 ! increment src ptr 403 + add %o0, 8, %o0 ! increment dst ptr 404 + EX_ST(STORE(stx, %o4, %o0-8)) 405 + .Laligned_to_16: 406 + andcc %o0, 16, %o3 ! pair of long words to move? 407 + brz,pt %o3, .Laligned_to_32 408 + nop 409 + EX_LD(LOAD(ldx, %o1, %o4)) 410 + sub %o2, 16, %o2 411 + EX_ST(STORE(stx, %o4, %o0)) 412 + add %o1, 16, %o1 ! increment src ptr 413 + EX_LD(LOAD(ldx, %o1-8, %o4)) 414 + add %o0, 16, %o0 ! increment dst ptr 415 + EX_ST(STORE(stx, %o4, %o0-8)) 416 + .Laligned_to_32: 417 + andcc %o0, 32, %o3 ! four long words to move? 418 + brz,pt %o3, .Laligned_to_64 419 + nop 420 + EX_LD(LOAD(ldx, %o1, %o4)) 421 + sub %o2, 32, %o2 422 + EX_ST(STORE(stx, %o4, %o0)) 423 + EX_LD(LOAD(ldx, %o1+8, %o4)) 424 + EX_ST(STORE(stx, %o4, %o0+8)) 425 + EX_LD(LOAD(ldx, %o1+16, %o4)) 426 + EX_ST(STORE(stx, %o4, %o0+16)) 427 + add %o1, 32, %o1 ! increment src ptr 428 + EX_LD(LOAD(ldx, %o1-8, %o4)) 429 + add %o0, 32, %o0 ! increment dst ptr 430 + EX_ST(STORE(stx, %o4, %o0-8)) 431 + .Laligned_to_64: 432 + ! 433 + ! Using block init store (BIS) instructions to avoid fetching cache 434 + ! lines from memory. Use ST_CHUNK stores to first element of each cache 435 + ! line (similar to prefetching) to avoid overfilling STQ or miss buffers. 436 + ! 
Gives existing cache lines time to be moved out of L1/L2/L3 cache. 437 + ! Initial stores using MRU version of BIS to keep cache line in 438 + ! cache until we are ready to store final element of cache line. 439 + ! Then store last element using the LRU version of BIS. 440 + ! 441 + andn %o2, 0x3f, %o5 ! %o5 is multiple of block size 442 + and %o2, 0x3f, %o2 ! residue bytes in %o2 443 + ! 444 + ! We use STORE_MRU_ASI for the first seven stores to each cache line 445 + ! followed by STORE_ASI (mark as LRU) for the last store. That 446 + ! mixed approach reduces the probability that the cache line is removed 447 + ! before we finish setting it, while minimizing the effects on 448 + ! other cached values during a large memcpy 449 + ! 450 + ! ST_CHUNK batches up initial BIS operations for several cache lines 451 + ! to allow multiple requests to not be blocked by overflowing the 452 + ! the store miss buffer. Then the matching stores for all those 453 + ! BIS operations are executed. 454 + ! 455 + 456 + sub %o0, 8, %o0 ! adjust %o0 for ASI alignment 457 + .Lalign_loop: 458 + cmp %o5, ST_CHUNK*64 459 + blu,pt %xcc, .Lalign_loop_fin 460 + mov ST_CHUNK,%o3 461 + .Lalign_loop_start: 462 + prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21 463 + subcc %o3, 1, %o3 464 + EX_LD(LOAD(ldx, %o1, %o4)) 465 + add %o1, 64, %o1 466 + add %o0, 8, %o0 467 + EX_ST(STORE_INIT_MRU(%o4, %o0)) 468 + bgu %xcc,.Lalign_loop_start 469 + add %o0, 56, %o0 470 + 471 + mov ST_CHUNK,%o3 472 + sllx %o3, 6, %o4 ! ST_CHUNK*64 473 + sub %o1, %o4, %o1 ! reset %o1 474 + sub %o0, %o4, %o0 ! 
reset %o0 475 + 476 + .Lalign_loop_rest: 477 + EX_LD(LOAD(ldx, %o1+8, %o4)) 478 + add %o0, 16, %o0 479 + EX_ST(STORE_INIT_MRU(%o4, %o0)) 480 + EX_LD(LOAD(ldx, %o1+16, %o4)) 481 + add %o0, 8, %o0 482 + EX_ST(STORE_INIT_MRU(%o4, %o0)) 483 + subcc %o3, 1, %o3 484 + EX_LD(LOAD(ldx, %o1+24, %o4)) 485 + add %o0, 8, %o0 486 + EX_ST(STORE_INIT_MRU(%o4, %o0)) 487 + EX_LD(LOAD(ldx, %o1+32, %o4)) 488 + add %o0, 8, %o0 489 + EX_ST(STORE_INIT_MRU(%o4, %o0)) 490 + EX_LD(LOAD(ldx, %o1+40, %o4)) 491 + add %o0, 8, %o0 492 + EX_ST(STORE_INIT_MRU(%o4, %o0)) 493 + EX_LD(LOAD(ldx, %o1+48, %o4)) 494 + add %o1, 64, %o1 495 + add %o0, 8, %o0 496 + EX_ST(STORE_INIT_MRU(%o4, %o0)) 497 + add %o0, 8, %o0 498 + EX_LD(LOAD(ldx, %o1-8, %o4)) 499 + sub %o5, 64, %o5 500 + bgu %xcc,.Lalign_loop_rest 501 + ! mark cache line as LRU 502 + EX_ST(STORE_INIT(%o4, %o0)) 503 + 504 + cmp %o5, ST_CHUNK*64 505 + bgu,pt %xcc, .Lalign_loop_start 506 + mov ST_CHUNK,%o3 507 + 508 + cmp %o5, 0 509 + beq .Lalign_done 510 + nop 511 + .Lalign_loop_fin: 512 + EX_LD(LOAD(ldx, %o1, %o4)) 513 + EX_ST(STORE(stx, %o4, %o0+8)) 514 + EX_LD(LOAD(ldx, %o1+8, %o4)) 515 + EX_ST(STORE(stx, %o4, %o0+8+8)) 516 + EX_LD(LOAD(ldx, %o1+16, %o4)) 517 + EX_ST(STORE(stx, %o4, %o0+8+16)) 518 + subcc %o5, 64, %o5 519 + EX_LD(LOAD(ldx, %o1+24, %o4)) 520 + EX_ST(STORE(stx, %o4, %o0+8+24)) 521 + EX_LD(LOAD(ldx, %o1+32, %o4)) 522 + EX_ST(STORE(stx, %o4, %o0+8+32)) 523 + EX_LD(LOAD(ldx, %o1+40, %o4)) 524 + EX_ST(STORE(stx, %o4, %o0+8+40)) 525 + EX_LD(LOAD(ldx, %o1+48, %o4)) 526 + add %o1, 64, %o1 527 + EX_ST(STORE(stx, %o4, %o0+8+48)) 528 + add %o0, 64, %o0 529 + EX_LD(LOAD(ldx, %o1-8, %o4)) 530 + bgu %xcc,.Lalign_loop_fin 531 + EX_ST(STORE(stx, %o4, %o0)) 532 + 533 + .Lalign_done: 534 + add %o0, 8, %o0 ! restore %o0 from ASI alignment 535 + membar #StoreStore 536 + sub %o2, 63, %o2 ! adjust length to allow cc test 537 + ba .Lmedl63 ! in .Lmedl63 538 + nop 539 + 540 + .align 16 541 + ! 
Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX 542 + .Lunalignsetup: 543 + .Lunalignrejoin: 544 + mov %g1, %o3 ! save %g1 as VISEntryHalf clobbers it 545 + #ifdef NON_USER_COPY 546 + VISEntryHalfFast(.Lmedium_vis_entry_fail_cp) 547 + #else 548 + VISEntryHalf 549 + #endif 550 + mov %o3, %g1 ! restore %g1 551 + 552 + set MED_UMAX, %o3 553 + cmp %o2, %o3 ! check for.Lmedium unaligned limit 554 + bge,pt %xcc,.Lunalign_large 555 + prefetch [%o1 + (4 * BLOCK_SIZE)], 20 556 + andn %o2, 0x3f, %o5 ! %o5 is multiple of block size 557 + and %o2, 0x3f, %o2 ! residue bytes in %o2 558 + cmp %o2, 8 ! Insure we do not load beyond 559 + bgt .Lunalign_adjust ! end of source buffer 560 + andn %o1, 0x7, %o4 ! %o4 has long word aligned src address 561 + add %o2, 64, %o2 ! adjust to leave loop 562 + sub %o5, 64, %o5 ! early if necessary 563 + .Lunalign_adjust: 564 + alignaddr %o1, %g0, %g0 ! generate %gsr 565 + add %o1, %o5, %o1 ! advance %o1 to after blocks 566 + EX_LD_FP(LOAD(ldd, %o4, %f0)) 567 + .Lunalign_loop: 568 + EX_LD_FP(LOAD(ldd, %o4+8, %f2)) 569 + faligndata %f0, %f2, %f16 570 + EX_LD_FP(LOAD(ldd, %o4+16, %f4)) 571 + subcc %o5, BLOCK_SIZE, %o5 572 + EX_ST_FP(STORE(std, %f16, %o0)) 573 + faligndata %f2, %f4, %f18 574 + EX_LD_FP(LOAD(ldd, %o4+24, %f6)) 575 + EX_ST_FP(STORE(std, %f18, %o0+8)) 576 + faligndata %f4, %f6, %f20 577 + EX_LD_FP(LOAD(ldd, %o4+32, %f8)) 578 + EX_ST_FP(STORE(std, %f20, %o0+16)) 579 + faligndata %f6, %f8, %f22 580 + EX_LD_FP(LOAD(ldd, %o4+40, %f10)) 581 + EX_ST_FP(STORE(std, %f22, %o0+24)) 582 + faligndata %f8, %f10, %f24 583 + EX_LD_FP(LOAD(ldd, %o4+48, %f12)) 584 + EX_ST_FP(STORE(std, %f24, %o0+32)) 585 + faligndata %f10, %f12, %f26 586 + EX_LD_FP(LOAD(ldd, %o4+56, %f14)) 587 + add %o4, BLOCK_SIZE, %o4 588 + EX_ST_FP(STORE(std, %f26, %o0+40)) 589 + faligndata %f12, %f14, %f28 590 + EX_LD_FP(LOAD(ldd, %o4, %f0)) 591 + EX_ST_FP(STORE(std, %f28, %o0+48)) 592 + faligndata %f14, %f0, %f30 593 + EX_ST_FP(STORE(std, %f30, %o0+56)) 594 + add 
%o0, BLOCK_SIZE, %o0 595 + bgu,pt %xcc, .Lunalign_loop 596 + prefetch [%o4 + (5 * BLOCK_SIZE)], 20 597 + ba .Lunalign_done 598 + nop 599 + 600 + .Lunalign_large: 601 + andcc %o0, 0x3f, %o3 ! is dst 64-byte block aligned? 602 + bz %xcc, .Lunalignsrc 603 + sub %o3, 64, %o3 ! %o3 will be multiple of 8 604 + neg %o3 ! bytes until dest is 64 byte aligned 605 + sub %o2, %o3, %o2 ! update cnt with bytes to be moved 606 + ! Move bytes according to source alignment 607 + andcc %o1, 0x1, %o5 608 + bnz %xcc, .Lunalignbyte ! check for byte alignment 609 + nop 610 + andcc %o1, 2, %o5 ! check for half word alignment 611 + bnz %xcc, .Lunalignhalf 612 + nop 613 + ! Src is word aligned 614 + .Lunalignword: 615 + EX_LD_FP(LOAD(ld, %o1, %o4)) ! load 4 bytes 616 + add %o1, 8, %o1 ! increase src ptr by 8 617 + EX_ST_FP(STORE(stw, %o4, %o0)) ! and store 4 bytes 618 + subcc %o3, 8, %o3 ! decrease count by 8 619 + EX_LD_FP(LOAD(ld, %o1-4, %o4)) ! load 4 bytes 620 + add %o0, 8, %o0 ! increase dst ptr by 8 621 + bnz %xcc, .Lunalignword 622 + EX_ST_FP(STORE(stw, %o4, %o0-4))! and store 4 bytes 623 + ba .Lunalignsrc 624 + nop 625 + 626 + ! Src is half-word aligned 627 + .Lunalignhalf: 628 + EX_LD_FP(LOAD(lduh, %o1, %o4)) ! load 2 bytes 629 + sllx %o4, 32, %o5 ! shift left 630 + EX_LD_FP(LOAD(lduw, %o1+2, %o4)) 631 + or %o4, %o5, %o5 632 + sllx %o5, 16, %o5 633 + EX_LD_FP(LOAD(lduh, %o1+6, %o4)) 634 + or %o4, %o5, %o5 635 + EX_ST_FP(STORE(stx, %o5, %o0)) 636 + add %o1, 8, %o1 637 + subcc %o3, 8, %o3 638 + bnz %xcc, .Lunalignhalf 639 + add %o0, 8, %o0 640 + ba .Lunalignsrc 641 + nop 642 + 643 + ! Src is Byte aligned 644 + .Lunalignbyte: 645 + sub %o0, %o1, %o0 ! 
share pointer advance 646 + .Lunalignbyte_loop: 647 + EX_LD_FP(LOAD(ldub, %o1, %o4)) 648 + sllx %o4, 56, %o5 649 + EX_LD_FP(LOAD(lduh, %o1+1, %o4)) 650 + sllx %o4, 40, %o4 651 + or %o4, %o5, %o5 652 + EX_LD_FP(LOAD(lduh, %o1+3, %o4)) 653 + sllx %o4, 24, %o4 654 + or %o4, %o5, %o5 655 + EX_LD_FP(LOAD(lduh, %o1+5, %o4)) 656 + sllx %o4, 8, %o4 657 + or %o4, %o5, %o5 658 + EX_LD_FP(LOAD(ldub, %o1+7, %o4)) 659 + or %o4, %o5, %o5 660 + add %o0, %o1, %o0 661 + EX_ST_FP(STORE(stx, %o5, %o0)) 662 + sub %o0, %o1, %o0 663 + subcc %o3, 8, %o3 664 + bnz %xcc, .Lunalignbyte_loop 665 + add %o1, 8, %o1 666 + add %o0,%o1, %o0 ! restore pointer 667 + 668 + ! Destination is now block (64 byte aligned) 669 + .Lunalignsrc: 670 + andn %o2, 0x3f, %o5 ! %o5 is multiple of block size 671 + and %o2, 0x3f, %o2 ! residue bytes in %o2 672 + add %o2, 64, %o2 ! Insure we do not load beyond 673 + sub %o5, 64, %o5 ! end of source buffer 674 + 675 + andn %o1, 0x7, %o4 ! %o4 has long word aligned src address 676 + alignaddr %o1, %g0, %g0 ! generate %gsr 677 + add %o1, %o5, %o1 ! 
advance %o1 to after blocks 678 + 679 + EX_LD_FP(LOAD(ldd, %o4, %f14)) 680 + add %o4, 8, %o4 681 + .Lunalign_sloop: 682 + EX_LD_FP(LOAD(ldd, %o4, %f16)) 683 + faligndata %f14, %f16, %f0 684 + EX_LD_FP(LOAD(ldd, %o4+8, %f18)) 685 + faligndata %f16, %f18, %f2 686 + EX_LD_FP(LOAD(ldd, %o4+16, %f20)) 687 + faligndata %f18, %f20, %f4 688 + EX_ST_FP(STORE(std, %f0, %o0)) 689 + subcc %o5, 64, %o5 690 + EX_LD_FP(LOAD(ldd, %o4+24, %f22)) 691 + faligndata %f20, %f22, %f6 692 + EX_ST_FP(STORE(std, %f2, %o0+8)) 693 + EX_LD_FP(LOAD(ldd, %o4+32, %f24)) 694 + faligndata %f22, %f24, %f8 695 + EX_ST_FP(STORE(std, %f4, %o0+16)) 696 + EX_LD_FP(LOAD(ldd, %o4+40, %f26)) 697 + faligndata %f24, %f26, %f10 698 + EX_ST_FP(STORE(std, %f6, %o0+24)) 699 + EX_LD_FP(LOAD(ldd, %o4+48, %f28)) 700 + faligndata %f26, %f28, %f12 701 + EX_ST_FP(STORE(std, %f8, %o0+32)) 702 + add %o4, 64, %o4 703 + EX_LD_FP(LOAD(ldd, %o4-8, %f30)) 704 + faligndata %f28, %f30, %f14 705 + EX_ST_FP(STORE(std, %f10, %o0+40)) 706 + EX_ST_FP(STORE(std, %f12, %o0+48)) 707 + add %o0, 64, %o0 708 + EX_ST_FP(STORE(std, %f14, %o0-8)) 709 + fsrc2 %f30, %f14 710 + bgu,pt %xcc, .Lunalign_sloop 711 + prefetch [%o4 + (8 * BLOCK_SIZE)], 20 712 + 713 + .Lunalign_done: 714 + ! Handle trailing bytes, 64 to 127 715 + ! Dest long word aligned, Src not long word aligned 716 + cmp %o2, 15 717 + bleu %xcc, .Lunalign_short 718 + 719 + andn %o2, 0x7, %o5 ! %o5 is multiple of 8 720 + and %o2, 0x7, %o2 ! residue bytes in %o2 721 + add %o2, 8, %o2 722 + sub %o5, 8, %o5 ! insure we do not load past end of src 723 + andn %o1, 0x7, %o4 ! %o4 has long word aligned src address 724 + add %o1, %o5, %o1 ! advance %o1 to after multiple of 8 725 + EX_LD_FP(LOAD(ldd, %o4, %f0)) ! 
fetch partial word 726 + .Lunalign_by8: 727 + EX_LD_FP(LOAD(ldd, %o4+8, %f2)) 728 + add %o4, 8, %o4 729 + faligndata %f0, %f2, %f16 730 + subcc %o5, 8, %o5 731 + EX_ST_FP(STORE(std, %f16, %o0)) 732 + fsrc2 %f2, %f0 733 + bgu,pt %xcc, .Lunalign_by8 734 + add %o0, 8, %o0 735 + 736 + .Lunalign_short: 737 + #ifdef NON_USER_COPY 738 + VISExitHalfFast 739 + #else 740 + VISExitHalf 741 + #endif 742 + ba .Lsmallrest 743 + nop 744 + 745 + /* 746 + * This is a special case of nested memcpy. This can happen when kernel 747 + * calls unaligned memcpy back to back without saving FP registers. We need 748 + * traps(context switch) to save/restore FP registers. If the kernel calls 749 + * memcpy without this trap sequence we will hit FP corruption. Let's use 750 + * the normal integer load/store method in this case. 751 + */ 752 + 753 + #ifdef NON_USER_COPY 754 + .Lmedium_vis_entry_fail_cp: 755 + or %o0, %o1, %g2 756 + #endif 757 + .Lmedium_cp: 758 + LOAD(prefetch, %o1 + 0x40, #n_reads_strong) 759 + andcc %g2, 0x7, %g0 760 + bne,pn %xcc, .Lmedium_unaligned_cp 761 + nop 762 + 763 + .Lmedium_noprefetch_cp: 764 + andncc %o2, 0x20 - 1, %o5 765 + be,pn %xcc, 2f 766 + sub %o2, %o5, %o2 767 + 1: EX_LD(LOAD(ldx, %o1 + 0x00, %o3)) 768 + EX_LD(LOAD(ldx, %o1 + 0x08, %g2)) 769 + EX_LD(LOAD(ldx, %o1 + 0x10, %g7)) 770 + EX_LD(LOAD(ldx, %o1 + 0x18, %o4)) 771 + add %o1, 0x20, %o1 772 + subcc %o5, 0x20, %o5 773 + EX_ST(STORE(stx, %o3, %o0 + 0x00)) 774 + EX_ST(STORE(stx, %g2, %o0 + 0x08)) 775 + EX_ST(STORE(stx, %g7, %o0 + 0x10)) 776 + EX_ST(STORE(stx, %o4, %o0 + 0x18)) 777 + bne,pt %xcc, 1b 778 + add %o0, 0x20, %o0 779 + 2: andcc %o2, 0x18, %o5 780 + be,pt %xcc, 3f 781 + sub %o2, %o5, %o2 782 + 1: EX_LD(LOAD(ldx, %o1 + 0x00, %o3)) 783 + add %o1, 0x08, %o1 784 + add %o0, 0x08, %o0 785 + subcc %o5, 0x08, %o5 786 + bne,pt %xcc, 1b 787 + EX_ST(STORE(stx, %o3, %o0 - 0x08)) 788 + 3: brz,pt %o2, .Lexit_cp 789 + cmp %o2, 0x04 790 + bl,pn %xcc, .Ltiny_cp 791 + nop 792 + EX_LD(LOAD(lduw, %o1 + 0x00, %o3)) 
793 + add %o1, 0x04, %o1 794 + add %o0, 0x04, %o0 795 + subcc %o2, 0x04, %o2 796 + bne,pn %xcc, .Ltiny_cp 797 + EX_ST(STORE(stw, %o3, %o0 - 0x04)) 798 + ba,a,pt %xcc, .Lexit_cp 799 + 800 + .Lmedium_unaligned_cp: 801 + /* First get dest 8 byte aligned. */ 802 + sub %g0, %o0, %o3 803 + and %o3, 0x7, %o3 804 + brz,pt %o3, 2f 805 + sub %o2, %o3, %o2 806 + 807 + 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) 808 + add %o1, 1, %o1 809 + subcc %o3, 1, %o3 810 + add %o0, 1, %o0 811 + bne,pt %xcc, 1b 812 + EX_ST(STORE(stb, %g2, %o0 - 0x01)) 813 + 2: 814 + and %o1, 0x7, %o3 815 + brz,pn %o3, .Lmedium_noprefetch_cp 816 + sll %o3, 3, %o3 817 + mov 64, %g2 818 + sub %g2, %o3, %g2 819 + andn %o1, 0x7, %o1 820 + EX_LD(LOAD(ldx, %o1 + 0x00, %o4)) 821 + sllx %o4, %o3, %o4 822 + andn %o2, 0x08 - 1, %o5 823 + sub %o2, %o5, %o2 824 + 825 + 1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3)) 826 + add %o1, 0x08, %o1 827 + subcc %o5, 0x08, %o5 828 + srlx %g3, %g2, %g7 829 + or %g7, %o4, %g7 830 + EX_ST(STORE(stx, %g7, %o0 + 0x00)) 831 + add %o0, 0x08, %o0 832 + bne,pt %xcc, 1b 833 + sllx %g3, %o3, %o4 834 + srl %o3, 3, %o3 835 + add %o1, %o3, %o1 836 + brz,pn %o2, .Lexit_cp 837 + nop 838 + ba,pt %xcc, .Lsmall_unaligned_cp 839 + 840 + .Ltiny_cp: 841 + EX_LD(LOAD(ldub, %o1 + 0x00, %o3)) 842 + subcc %o2, 1, %o2 843 + be,pn %xcc, .Lexit_cp 844 + EX_ST(STORE(stb, %o3, %o0 + 0x00)) 845 + EX_LD(LOAD(ldub, %o1 + 0x01, %o3)) 846 + subcc %o2, 1, %o2 847 + be,pn %xcc, .Lexit_cp 848 + EX_ST(STORE(stb, %o3, %o0 + 0x01)) 849 + EX_LD(LOAD(ldub, %o1 + 0x02, %o3)) 850 + ba,pt %xcc, .Lexit_cp 851 + EX_ST(STORE(stb, %o3, %o0 + 0x02)) 852 + 853 + .Lsmall_cp: 854 + andcc %g2, 0x3, %g0 855 + bne,pn %xcc, .Lsmall_unaligned_cp 856 + andn %o2, 0x4 - 1, %o5 857 + sub %o2, %o5, %o2 858 + 1: 859 + EX_LD(LOAD(lduw, %o1 + 0x00, %o3)) 860 + add %o1, 0x04, %o1 861 + subcc %o5, 0x04, %o5 862 + add %o0, 0x04, %o0 863 + bne,pt %xcc, 1b 864 + EX_ST(STORE(stw, %o3, %o0 - 0x04)) 865 + brz,pt %o2, .Lexit_cp 866 + nop 867 + ba,a,pt %xcc, .Ltiny_cp 
868 + 869 + .Lsmall_unaligned_cp: 870 + 1: EX_LD(LOAD(ldub, %o1 + 0x00, %o3)) 871 + add %o1, 1, %o1 872 + add %o0, 1, %o0 873 + subcc %o2, 1, %o2 874 + bne,pt %xcc, 1b 875 + EX_ST(STORE(stb, %o3, %o0 - 0x01)) 876 + ba,a,pt %xcc, .Lexit_cp 877 + 878 + .Lsmallrest: 879 + tst %o2 880 + bz,pt %xcc, .Lsmallx 881 + cmp %o2, 4 882 + blt,pn %xcc, .Lsmallleft3 883 + nop 884 + sub %o2, 3, %o2 885 + .Lsmallnotalign4: 886 + EX_LD(LOAD(ldub, %o1, %o3))! read byte 887 + subcc %o2, 4, %o2 ! reduce count by 4 888 + EX_ST(STORE(stb, %o3, %o0)) ! write byte 889 + EX_LD(LOAD(ldub, %o1+1, %o3))! repeat for total of 4 bytes 890 + add %o1, 4, %o1 ! advance SRC by 4 891 + EX_ST(STORE(stb, %o3, %o0+1)) 892 + EX_LD(LOAD(ldub, %o1-2, %o3)) 893 + add %o0, 4, %o0 ! advance DST by 4 894 + EX_ST(STORE(stb, %o3, %o0-2)) 895 + EX_LD(LOAD(ldub, %o1-1, %o3)) 896 + bgu,pt %xcc, .Lsmallnotalign4 ! loop til 3 or fewer bytes remain 897 + EX_ST(STORE(stb, %o3, %o0-1)) 898 + addcc %o2, 3, %o2 ! restore count 899 + bz,pt %xcc, .Lsmallx 900 + .Lsmallleft3: ! 1, 2, or 3 bytes remain 901 + subcc %o2, 1, %o2 902 + EX_LD(LOAD(ldub, %o1, %o3)) ! load one byte 903 + bz,pt %xcc, .Lsmallx 904 + EX_ST(STORE(stb, %o3, %o0)) ! store one byte 905 + EX_LD(LOAD(ldub, %o1+1, %o3)) ! load second byte 906 + subcc %o2, 1, %o2 907 + bz,pt %xcc, .Lsmallx 908 + EX_ST(STORE(stb, %o3, %o0+1))! store second byte 909 + EX_LD(LOAD(ldub, %o1+2, %o3)) ! load third byte 910 + EX_ST(STORE(stb, %o3, %o0+2)) ! store third byte 911 + .Lsmallx: 912 + retl 913 + mov EX_RETVAL(%g1), %o0 914 + .Lsmallfin: 915 + tst %o2 916 + bnz,pn %xcc, .Lsmallleft3 917 + nop 918 + retl 919 + mov EX_RETVAL(%g1), %o0 ! restore %o0 920 + .Lexit_cp: 921 + retl 922 + mov EX_RETVAL(%g1), %o0 923 + .size FUNC_NAME, .-FUNC_NAME
+352
arch/sparc/lib/M7memset.S
··· 1 + /* 2 + * M7memset.S: SPARC M7 optimized memset. 3 + * 4 + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. 5 + */ 6 + 7 + /* 8 + * M7memset.S: M7 optimized memset. 9 + * 10 + * char *memset(sp, c, n) 11 + * 12 + * Set an array of n chars starting at sp to the character c. 13 + * Return sp. 14 + * 15 + * Fast assembler language version of the following C-program for memset 16 + * which represents the `standard' for the C-library. 17 + * 18 + * void * 19 + * memset(void *sp1, int c, size_t n) 20 + * { 21 + * if (n != 0) { 22 + * char *sp = sp1; 23 + * do { 24 + * *sp++ = (char)c; 25 + * } while (--n != 0); 26 + * } 27 + * return (sp1); 28 + * } 29 + * 30 + * The algorithm is as follows : 31 + * 32 + * For small stores of 7 or fewer bytes, the bytes are stored individually. 33 + * 34 + * For less than 32 bytes stores, align the address on 4 byte boundary. 35 + * Then store as many 4-byte chunks, followed by trailing bytes. 36 + * 37 + * For sizes greater than 32 bytes, align the address on 8 byte boundary. 38 + * if (count >= 64) { 39 + * store 8-bytes chunks to align the address on 64 byte boundary 40 + * if (value to be set is zero && count >= MIN_ZERO) { 41 + * Using BIS stores, set the first long word of each 42 + * 64-byte cache line to zero which will also clear the 43 + * other seven long words of the cache line. 44 + * } 45 + * else if (count >= MIN_LOOP) { 46 + * Using BIS stores, set the first long word of each of 47 + * ST_CHUNK cache lines (64 bytes each) before the main 48 + * loop is entered. 49 + * In the main loop, continue pre-setting the first long 50 + * word of each cache line ST_CHUNK lines in advance while 51 + * setting the other seven long words (56 bytes) of each 52 + * cache line until fewer than ST_CHUNK*64 bytes remain. 53 + * Then set the remaining seven long words of each cache 54 + * line that has already had its first long word set. 
55 + * } 56 + * store remaining data in 64-byte chunks until less than 57 + * 64 bytes remain. 58 + * } 59 + * Store as many 8-byte chunks, followed by trailing bytes. 60 + * 61 + * BIS = Block Init Store 62 + * Doing the advance store of the first element of the cache line 63 + * initiates the displacement of a cache line while only using a single 64 + * instruction in the pipeline. That avoids various pipeline delays, 65 + * such as filling the miss buffer. The performance effect is 66 + * similar to prefetching for normal stores. 67 + * The special case for zero fills runs faster and uses fewer instruction 68 + * cycles than the normal memset loop. 69 + * 70 + * We only use BIS for memset of greater than MIN_LOOP bytes because a sequence 71 + * of BIS stores must be followed by a membar #StoreStore. The benefit of 72 + * the BIS store must be balanced against the cost of the membar operation. 73 + */ 74 + 75 + /* 76 + * ASI_STBI_P marks the cache line as "least recently used" 77 + * which means if many threads are active, it has a high chance 78 + * of being pushed out of the cache between the first initializing 79 + * store and the final stores. 80 + * Thus, we use ASI_STBIMRU_P which marks the cache line as 81 + * "most recently used" for all but the last store to the cache line. 82 + */ 83 + 84 + #include <asm/asi.h> 85 + #include <asm/page.h> 86 + 87 + #define ASI_STBI_P ASI_BLK_INIT_QUAD_LDD_P 88 + #define ASI_STBIMRU_P ASI_ST_BLKINIT_MRU_P 89 + 90 + 91 + #define ST_CHUNK 24 /* multiple of 4 due to loop unrolling */ 92 + #define MIN_LOOP 16320 93 + #define MIN_ZERO 512 94 + 95 + .section ".text" 96 + .align 32 97 + 98 + /* 99 + * Define clear_page(dest) as memset(dest, 0, PAGE_SIZE) 100 + * (can create a more optimized version later.) 
101 + */ 102 + .globl M7clear_page 103 + .globl M7clear_user_page 104 + M7clear_page: /* clear_page(dest) */ 105 + M7clear_user_page: 106 + set PAGE_SIZE, %o1 107 + /* fall through into bzero code */ 108 + 109 + .size M7clear_page,.-M7clear_page 110 + .size M7clear_user_page,.-M7clear_user_page 111 + 112 + /* 113 + * Define bzero(dest, n) as memset(dest, 0, n) 114 + * (can create a more optimized version later.) 115 + */ 116 + .globl M7bzero 117 + M7bzero: /* bzero(dest, size) */ 118 + mov %o1, %o2 119 + mov 0, %o1 120 + /* fall through into memset code */ 121 + 122 + .size M7bzero,.-M7bzero 123 + 124 + .global M7memset 125 + .type M7memset, #function 126 + .register %g3, #scratch 127 + M7memset: 128 + mov %o0, %o5 ! copy sp1 before using it 129 + cmp %o2, 7 ! if small counts, just write bytes 130 + bleu,pn %xcc, .wrchar 131 + and %o1, 0xff, %o1 ! o1 is (char)c 132 + 133 + sll %o1, 8, %o3 134 + or %o1, %o3, %o1 ! now o1 has 2 bytes of c 135 + sll %o1, 16, %o3 136 + cmp %o2, 32 137 + blu,pn %xcc, .wdalign 138 + or %o1, %o3, %o1 ! now o1 has 4 bytes of c 139 + 140 + sllx %o1, 32, %o3 141 + or %o1, %o3, %o1 ! now o1 has 8 bytes of c 142 + 143 + .dbalign: 144 + andcc %o5, 7, %o3 ! is sp1 aligned on a 8 byte bound? 145 + bz,pt %xcc, .blkalign ! already long word aligned 146 + sub %o3, 8, %o3 ! -(bytes till long word aligned) 147 + 148 + add %o2, %o3, %o2 ! update o2 with new count 149 + ! Set -(%o3) bytes till sp1 long word aligned 150 + 1: stb %o1, [%o5] ! there is at least 1 byte to set 151 + inccc %o3 ! byte clearing loop 152 + bl,pt %xcc, 1b 153 + inc %o5 154 + 155 + ! Now sp1 is long word aligned (sp1 is found in %o5) 156 + .blkalign: 157 + cmp %o2, 64 ! check if there are 64 bytes to set 158 + blu,pn %xcc, .wrshort 159 + mov %o2, %o3 160 + 161 + andcc %o5, 63, %o3 ! is sp1 block aligned? 162 + bz,pt %xcc, .blkwr ! now block aligned 163 + sub %o3, 64, %o3 ! o3 is -(bytes till block aligned) 164 + add %o2, %o3, %o2 ! o2 is the remainder 165 + 166 + ! 
Store -(%o3) bytes till dst is block (64 byte) aligned. 167 + ! Use long word stores. 168 + ! Recall that dst is already long word aligned 169 + 1: 170 + addcc %o3, 8, %o3 171 + stx %o1, [%o5] 172 + bl,pt %xcc, 1b 173 + add %o5, 8, %o5 174 + 175 + ! Now sp1 is block aligned 176 + .blkwr: 177 + andn %o2, 63, %o4 ! calculate size of blocks in bytes 178 + brz,pn %o1, .wrzero ! special case if c == 0 179 + and %o2, 63, %o3 ! %o3 = bytes left after blk stores. 180 + 181 + set MIN_LOOP, %g1 182 + cmp %o4, %g1 ! check there are enough bytes to set 183 + blu,pn %xcc, .short_set ! to justify cost of membar 184 + ! must be > pre-cleared lines 185 + nop 186 + 187 + ! initial cache-clearing stores 188 + ! get store pipeline moving 189 + rd %asi, %g3 ! save %asi to be restored later 190 + wr %g0, ASI_STBIMRU_P, %asi 191 + 192 + ! Primary memset loop for large memsets 193 + .wr_loop: 194 + sub %o5, 8, %o5 ! adjust %o5 for ASI store alignment 195 + mov ST_CHUNK, %g1 196 + .wr_loop_start: 197 + stxa %o1, [%o5+8]%asi 198 + subcc %g1, 4, %g1 199 + stxa %o1, [%o5+8+64]%asi 200 + add %o5, 256, %o5 201 + stxa %o1, [%o5+8-128]%asi 202 + bgu %xcc, .wr_loop_start 203 + stxa %o1, [%o5+8-64]%asi 204 + 205 + sub %o5, ST_CHUNK*64, %o5 ! reset %o5 206 + mov ST_CHUNK, %g1 207 + 208 + .wr_loop_rest: 209 + stxa %o1, [%o5+8+8]%asi 210 + sub %o4, 64, %o4 211 + stxa %o1, [%o5+16+8]%asi 212 + subcc %g1, 1, %g1 213 + stxa %o1, [%o5+24+8]%asi 214 + stxa %o1, [%o5+32+8]%asi 215 + stxa %o1, [%o5+40+8]%asi 216 + add %o5, 64, %o5 217 + stxa %o1, [%o5-8]%asi 218 + bgu %xcc, .wr_loop_rest 219 + stxa %o1, [%o5]ASI_STBI_P 220 + 221 + ! If more than ST_CHUNK*64 bytes remain to set, continue 222 + ! setting the first long word of each cache line in advance 223 + ! to keep the store pipeline moving. 224 + 225 + cmp %o4, ST_CHUNK*64 226 + bge,pt %xcc, .wr_loop_start 227 + mov ST_CHUNK, %g1 228 + 229 + brz,a,pn %o4, .asi_done 230 + add %o5, 8, %o5 ! 
restore %o5 offset 231 + 232 + .wr_loop_small: 233 + stxa %o1, [%o5+8]%asi 234 + stxa %o1, [%o5+8+8]%asi 235 + stxa %o1, [%o5+16+8]%asi 236 + stxa %o1, [%o5+24+8]%asi 237 + stxa %o1, [%o5+32+8]%asi 238 + subcc %o4, 64, %o4 239 + stxa %o1, [%o5+40+8]%asi 240 + add %o5, 64, %o5 241 + stxa %o1, [%o5-8]%asi 242 + bgu,pt %xcc, .wr_loop_small 243 + stxa %o1, [%o5]ASI_STBI_P 244 + 245 + ba .asi_done 246 + add %o5, 8, %o5 ! restore %o5 offset 247 + 248 + ! Special case loop for zero fill memsets 249 + ! For each 64 byte cache line, single STBI to first element 250 + ! clears line 251 + .wrzero: 252 + cmp %o4, MIN_ZERO ! check if enough bytes to set 253 + ! to pay %asi + membar cost 254 + blu %xcc, .short_set 255 + nop 256 + sub %o4, 256, %o4 257 + 258 + .wrzero_loop: 259 + mov 64, %g3 260 + stxa %o1, [%o5]ASI_STBI_P 261 + subcc %o4, 256, %o4 262 + stxa %o1, [%o5+%g3]ASI_STBI_P 263 + add %o5, 256, %o5 264 + sub %g3, 192, %g3 265 + stxa %o1, [%o5+%g3]ASI_STBI_P 266 + add %g3, 64, %g3 267 + bge,pt %xcc, .wrzero_loop 268 + stxa %o1, [%o5+%g3]ASI_STBI_P 269 + add %o4, 256, %o4 270 + 271 + brz,pn %o4, .bsi_done 272 + nop 273 + 274 + .wrzero_small: 275 + stxa %o1, [%o5]ASI_STBI_P 276 + subcc %o4, 64, %o4 277 + bgu,pt %xcc, .wrzero_small 278 + add %o5, 64, %o5 279 + ba,a .bsi_done 280 + 281 + .asi_done: 282 + wr %g3, 0x0, %asi ! restore saved %asi 283 + .bsi_done: 284 + membar #StoreStore ! required by use of Block Store Init 285 + 286 + .short_set: 287 + cmp %o4, 64 ! check if 64 bytes to set 288 + blu %xcc, 5f 289 + nop 290 + 4: ! set final blocks of 64 bytes 291 + stx %o1, [%o5] 292 + stx %o1, [%o5+8] 293 + stx %o1, [%o5+16] 294 + stx %o1, [%o5+24] 295 + subcc %o4, 64, %o4 296 + stx %o1, [%o5+32] 297 + stx %o1, [%o5+40] 298 + add %o5, 64, %o5 299 + stx %o1, [%o5-16] 300 + bgu,pt %xcc, 4b 301 + stx %o1, [%o5-8] 302 + 303 + 5: 304 + ! Set the remaining long words 305 + .wrshort: 306 + subcc %o3, 8, %o3 ! Can we store any long words? 
307 + blu,pn %xcc, .wrchars 308 + and %o2, 7, %o2 ! calc bytes left after long words 309 + 6: 310 + subcc %o3, 8, %o3 311 + stx %o1, [%o5] ! store the long words 312 + bgeu,pt %xcc, 6b 313 + add %o5, 8, %o5 314 + 315 + .wrchars: ! check for extra chars 316 + brnz %o2, .wrfin 317 + nop 318 + retl 319 + nop 320 + 321 + .wdalign: 322 + andcc %o5, 3, %o3 ! is sp1 aligned on a word boundary 323 + bz,pn %xcc, .wrword 324 + andn %o2, 3, %o3 ! create word sized count in %o3 325 + 326 + dec %o2 ! decrement count 327 + stb %o1, [%o5] ! clear a byte 328 + b .wdalign 329 + inc %o5 ! next byte 330 + 331 + .wrword: 332 + subcc %o3, 4, %o3 333 + st %o1, [%o5] ! 4-byte writing loop 334 + bnz,pt %xcc, .wrword 335 + add %o5, 4, %o5 336 + 337 + and %o2, 3, %o2 ! leftover count, if any 338 + 339 + .wrchar: 340 + ! Set the remaining bytes, if any 341 + brz %o2, .exit 342 + nop 343 + .wrfin: 344 + deccc %o2 345 + stb %o1, [%o5] 346 + bgu,pt %xcc, .wrfin 347 + inc %o5 348 + .exit: 349 + retl ! %o0 was preserved 350 + nop 351 + 352 + .size M7memset,.-M7memset
+51
arch/sparc/lib/M7patch.S
··· 1 + /* 2 + * M7patch.S: Patch generic routines with M7 variant. 3 + * 4 + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. 5 + */ 6 + 7 + #include <linux/linkage.h> 8 + 9 + #define BRANCH_ALWAYS 0x10680000 10 + #define NOP 0x01000000 11 + #define NG_DO_PATCH(OLD, NEW) \ 12 + sethi %hi(NEW), %g1; \ 13 + or %g1, %lo(NEW), %g1; \ 14 + sethi %hi(OLD), %g2; \ 15 + or %g2, %lo(OLD), %g2; \ 16 + sub %g1, %g2, %g1; \ 17 + sethi %hi(BRANCH_ALWAYS), %g3; \ 18 + sll %g1, 11, %g1; \ 19 + srl %g1, 11 + 2, %g1; \ 20 + or %g3, %lo(BRANCH_ALWAYS), %g3; \ 21 + or %g3, %g1, %g3; \ 22 + stw %g3, [%g2]; \ 23 + sethi %hi(NOP), %g3; \ 24 + or %g3, %lo(NOP), %g3; \ 25 + stw %g3, [%g2 + 0x4]; \ 26 + flush %g2; 27 + 28 + ENTRY(m7_patch_copyops) 29 + NG_DO_PATCH(memcpy, M7memcpy) 30 + NG_DO_PATCH(raw_copy_from_user, M7copy_from_user) 31 + NG_DO_PATCH(raw_copy_to_user, M7copy_to_user) 32 + retl 33 + nop 34 + ENDPROC(m7_patch_copyops) 35 + 36 + ENTRY(m7_patch_bzero) 37 + NG_DO_PATCH(memset, M7memset) 38 + NG_DO_PATCH(__bzero, M7bzero) 39 + NG_DO_PATCH(__clear_user, NGclear_user) 40 + NG_DO_PATCH(tsb_init, NGtsb_init) 41 + retl 42 + nop 43 + ENDPROC(m7_patch_bzero) 44 + 45 + ENTRY(m7_patch_pageops) 46 + NG_DO_PATCH(copy_user_page, NG4copy_user_page) 47 + NG_DO_PATCH(_clear_page, M7clear_page) 48 + NG_DO_PATCH(clear_user_page, M7clear_user_page) 49 + retl 50 + nop 51 + ENDPROC(m7_patch_pageops)
+3
arch/sparc/lib/Makefile
··· 38 38 39 39 lib-$(CONFIG_SPARC64) += Memcpy_utils.o 40 40 41 + lib-$(CONFIG_SPARC64) += M7memcpy.o M7copy_from_user.o M7copy_to_user.o 42 + lib-$(CONFIG_SPARC64) += M7patch.o M7memset.o 43 + 41 44 lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o 42 45 lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o 43 46