Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ARCv2: optimised string/mem lib routines

Signed-off-by: Vineet Gupta <vgupta@synopsys.com>

authored by

Claudiu Zissulescu and committed by
Vineet Gupta
1f7e3dc0 bcc4d65a

+411 -2
+4 -2
arch/arc/lib/Makefile
··· 5 5 # it under the terms of the GNU General Public License version 2 as 6 6 # published by the Free Software Foundation. 7 7 8 - lib-y := strchr-700.o strcmp.o strcpy-700.o strlen.o 9 - lib-y += memcmp.o memcpy-700.o memset.o 8 + lib-y := strchr-700.o strcpy-700.o strlen.o memcmp.o 9 + 10 + lib-$(CONFIG_ISA_ARCOMPACT) += memcpy-700.o memset.o strcmp.o 11 + lib-$(CONFIG_ISA_ARCV2) += memcpy-archs.o memset-archs.o strcmp-archs.o
+236
arch/arc/lib/memcpy-archs.S
/*
 * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>

;-----------------------------------------------------------------------
; void *memcpy(void *dst, const void *src, size_t n)
; In:    r0 = dst, r1 = src, r2 = n
; Out:   r0 = dst (original value, preserved in r3 during the copy)
; Uses ARC zero-overhead loops (lp_count/lpnz). Strategy: align dst to
; 32 bit first, then dispatch on src alignment (0/1/2/3 bytes off) and
; copy word-wise with shift-merge; tails are copied byte-wise.
;-----------------------------------------------------------------------

#ifdef __LITTLE_ENDIAN__
# define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM
# define MERGE_2(RX,RY,IMM)
# define EXTRACT_1(RX,RY,IMM)	and	RX, RY, 0xFFFF
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, IMM
#else
# define SHIFT_1(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define SHIFT_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define EXTRACT_1(RX,RY,IMM)	lsr	RX, RY, IMM
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, 0x08
#endif

#ifdef CONFIG_ARC_HAS_LL64
; 64-bit loads/stores available: inner loop moves 64 bytes/iteration
# define PREFETCH_READ(RX)	prefetch   [RX, 56]
# define PREFETCH_WRITE(RX)	prefetchw  [RX, 64]
# define LOADX(DST,RX)		ldd.ab	DST, [RX, 8]
# define STOREX(SRC,RX)		std.ab	SRC, [RX, 8]
# define ZOLSHFT		5
# define ZOLAND			0x1F
#else
; 32-bit only: inner loop moves 32 bytes/iteration
# define PREFETCH_READ(RX)	prefetch   [RX, 28]
# define PREFETCH_WRITE(RX)	prefetchw  [RX, 32]
# define LOADX(DST,RX)		ld.ab	DST, [RX, 4]
# define STOREX(SRC,RX)		st.ab	SRC, [RX, 4]
# define ZOLSHFT		4
# define ZOLAND			0xF
#endif

ENTRY(memcpy)
	prefetch [r1]		; Prefetch the read location
	prefetchw [r0]		; Prefetch the write location
	mov.f	0, r2
	;;; if size is zero
	jz.d	[blink]
	mov	r3, r0		; don't clobber ret val (r3 = working dst)

	;;; if size <= 8, copy byte-wise
	cmp	r2, 8
	bls.d	@smallchunk
	mov.f	lp_count, r2

	;; Align the destination to a 32-bit boundary (copy 0..3 bytes)
	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4
	lpnz	@aligndestination
	;; LOOP BEGIN
	ldb.ab	r5, [r1,1]
	sub	r2, r2, 1
	stb.ab	r5, [r3,1]
aligndestination:

	;;; Check the alignment of the source
	and.f	r4, r1, 0x03
	bnz.d	@sourceunaligned
	;; NOTE(review): the lsr.f below sits in the delay slot of bnz.d and
	;; thus also executes on the taken path; lp_count/flags are re-set
	;; before use at sourceunaligned, so this is harmless.

	;;; CASE 0: Both source and destination are 32bit aligned
	;;; Convert len to Dwords, unfold x4
	lsr.f	lp_count, r2, ZOLSHFT
	lpnz	@copy32_64bytes
	;; LOOP START
	LOADX (r6, r1)
	PREFETCH_READ (r1)
	PREFETCH_WRITE (r3)
	LOADX (r8, r1)
	LOADX (r10, r1)
	LOADX (r4, r1)
	STOREX (r6, r3)
	STOREX (r8, r3)
	STOREX (r10, r3)
	STOREX (r4, r3)
copy32_64bytes:

	and.f	lp_count, r2, ZOLAND	;Last remaining 31 bytes
smallchunk:
	lpnz	@copyremainingbytes
	;; LOOP START
	ldb.ab	r5, [r1,1]
	stb.ab	r5, [r3,1]
copyremainingbytes:

	j	[blink]
	;;; END CASE 0

sourceunaligned:
	cmp	r4, 2
	beq.d	@unalignedOffby2
	sub	r2, r2, 1

	bhi.d	@unalignedOffby3
	ldb.ab	r5, [r1, 1]	; delay slot: first odd byte (cases 1 and 3)

	;;; CASE 1: The source is unaligned, off by 1
	;; Hence I need to read 1 byte for a 16bit alignment
	;; and 2bytes to reach 32bit alignment
	ldh.ab	r6, [r1, 2]
	sub	r2, r2, 2
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
	MERGE_1 (r6, r6, 8)
	MERGE_2 (r5, r5, 24)
	or	r5, r5, r6	; r5 carries the partial word across iterations

	;; Both src and dst are aligned
	lpnz	@copy8bytes_1
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	;Prefetch the next write location

	SHIFT_1	(r7, r6, 24)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 8)

	SHIFT_1	(r9, r8, 24)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 8)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
copy8bytes_1:

	;; Write back the remaining 16bits
	EXTRACT_1 (r6, r5, 16)
	sth.ab	r6, [r3, 2]
	;; Write back the remaining 8bits
	EXTRACT_2 (r5, r5, 16)
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07	;Last 8bytes
	lpnz	@copybytewise_1
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
copybytewise_1:
	j	[blink]

unalignedOffby2:
	;;; CASE 2: The source is unaligned, off by 2
	ldh.ab	r5, [r1, 2]
	sub	r2, r2, 1

	;; Both src and dst are aligned
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	asl.nz	r5, r5, 16
#endif
	lpnz	@copy8bytes_2
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	;Prefetch the next write location

	SHIFT_1	(r7, r6, 16)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 16)

	SHIFT_1	(r9, r8, 16)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 16)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
copy8bytes_2:

#ifdef __BIG_ENDIAN__
	lsr.nz	r5, r5, 16
#endif
	sth.ab	r5, [r3, 2]

	and.f	lp_count, r2, 0x07	;Last 8bytes
	lpnz	@copybytewise_2
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
copybytewise_2:
	j	[blink]

unalignedOffby3:
	;;; CASE 3: The source is unaligned, off by 3
	;;; Hence, I need to read 1byte to achieve the 32bit alignment
	;;; (byte already loaded into r5 in the bhi.d delay slot above)

	;; Both src and dst are aligned
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	asl.ne	r5, r5, 24
#endif
	lpnz	@copy8bytes_3
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]
	;; FIX: r3 is the write pointer -- use prefetchw like every other
	;; copy loop in this file, not a read prefetch
	prefetchw [r3, 32]	;Prefetch the next write location

	SHIFT_1	(r7, r6, 8)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 24)

	SHIFT_1	(r9, r8, 8)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 24)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
copy8bytes_3:

#ifdef __BIG_ENDIAN__
	lsr.nz	r5, r5, 24
#endif
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07	;Last 8bytes
	lpnz	@copybytewise_3
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
copybytewise_3:
	j	[blink]

END(memcpy)
+93
arch/arc/lib/memset-archs.S
/*
 * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>

;-----------------------------------------------------------------------
; void *memset(void *dst, int c, size_t n)
; In:    r0 = dst, r1 = fill byte, r2 = n
; Out:   r0 = dst (original value; r3 is the working pointer)
;
; Fast path: align dst, replicate the byte across a word pair, then
; blast 64-byte chunks with std.ab inside a zero-overhead loop, with a
; 32-byte loop and a byte loop for the tail.
;-----------------------------------------------------------------------

#undef PREALLOC_NOT_AVAIL

#ifdef PREALLOC_NOT_AVAIL
#define PREWRITE(A,B)	prefetchw [(A),(B)]
#else
; prealloc: allocate the target line without fetching it -- we are
; about to overwrite the whole 64-byte chunk anyway
#define PREWRITE(A,B)	prealloc [(A),(B)]
#endif

ENTRY(memset)
	prefetchw [r0]		; warm the first write location
	mov.f	0, r2
	;; nothing to do for n == 0
	jz.d	[blink]
	mov	r3, r0		; keep r0 intact for the return value

	;; small requests (n <= 8) go straight to the byte loop
	brls.d.nt	r2, 8, .Lsmallchunk
	mov.f	lp_count,r2

	;; store single bytes until dst is word aligned
	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4
	lpnz	@.Laligndestination
	;; LOOP BEGIN
	stb.ab	r1, [r3,1]
	sub	r2, r2, 1
.Laligndestination:

	;; replicate the fill byte into all 4 lanes of r4 (and r5)
	and	r1, r1, 0xFF
	asl	r4, r1, 8
	or	r4, r4, r1
	asl	r5, r4, 16
	or	r5, r5, r4
	mov	r4, r5

	;; carve r2 into a 64-byte-loop count and a <64-byte remainder
	sub3	lp_count, r2, 8
	cmp	r2, 64
	bmsk.hi	r2, r2, 5
	mov.ls	lp_count, 0
	add3.hi	r2, r2, 8

	;; 64 bytes per iteration, 8 x 64-bit stores
	lsr.f	lp_count, lp_count, 6
	lpnz	@.Lset64bytes
	;; LOOP START
	PREWRITE(r3, 64)	; prepare the next write line
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
.Lset64bytes:

	;; up to 124 bytes left: 32 bytes per iteration
	lsr.f	lp_count, r2, 5
	lpnz	.Lset32bytes
	;; LOOP START
	prefetchw [r3, 32]	; warm the next write location
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
.Lset32bytes:

	;; final 0..31 bytes, one at a time
	and.f	lp_count, r2, 0x1F
.Lsmallchunk:
	lpnz	.Lcopy3bytes
	;; LOOP START
	stb.ab	r1, [r3, 1]
.Lcopy3bytes:

	j	[blink]

END(memset)

;-----------------------------------------------------------------------
; void memzero(void *dst, size_t n) -- bzero-style wrapper over memset
;-----------------------------------------------------------------------
ENTRY(memzero)
	;; shuffle bzero args into memset's (dst, 0, n) and tail-call;
	;; b.d leaves blink untouched, so memset returns to our caller
	mov	r2, r1
	b.d	memset
	mov	r1, 0		; delay slot: fill byte = 0
END(memzero)
+78
arch/arc/lib/strcmp-archs.S
/*
 * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>

;-----------------------------------------------------------------------
; int strcmp(const char *s1, const char *s2)
; In:    r0 = s1, r1 = s2
; Out:   r0  <0 / 0 / >0 per the usual strcmp contract
;
; Word-at-a-time compare when both pointers are 32-bit aligned, using
; the classic (x - 0x01010101) & ~x & 0x80808080 NUL detector; falls
; back to a byte loop for unaligned inputs.
;-----------------------------------------------------------------------

ENTRY(strcmp)
	;; any low bits set in either pointer => take the byte loop
	or	r2, r0, r1
	bmsk_s	r2, r2, 1
	brne	r2, 0, @.Lcharloop

	;;; s1 and s2 are word aligned
	ld.ab	r2, [r0, 4]

	mov_s	r12, 0x01010101	; per-byte 0x01 mask
	ror	r11, r12	; 0x80808080: per-byte high-bit mask
	.align	4
.LwordLoop:
	ld.ab	r3, [r1, 4]
	;; r4 = NUL-byte detector for the s1 word (r2)
	sub	r4, r2, r12
	ld.ab	r5, [r0, 4]	; pre-load next s1 word
	bic	r4, r4, r2
	and	r4, r4, r11
	brne.d.nt	r4, 0, .LfoundNULL
	;; no NUL: do the two words match?
	cmp	r2, r3
	beq.d	.LwordLoop
	mov.eq	r2, r5		; delay slot: advance s1 word on match

	;; words differ (no NUL seen yet): order them byte-lexically
#ifdef __LITTLE_ENDIAN__
	swape	r3, r3		; byte-swap so MSB holds the first char
	mov_s	r0, 1
	swape	r2, r2
#else
	mov_s	r0, 1
#endif
	cmp_s	r2, r3
	j_s.d	[blink]
	bset.lo	r0, r0, 31	; delay slot: make result negative if r2 < r3

	.align 4
.LfoundNULL:
#ifdef __BIG_ENDIAN__
	swape	r4, r4
	swape	r2, r2
	swape	r3, r3
#endif
	;; locate the first NUL byte and mask off everything after it
	ffs	r0, r4
	bmsk	r2, r2, r0
	bmsk	r3, r3, r0
	swape	r2, r2
	swape	r3, r3
	;; build the signed return value from the truncated words
	sub.f	r0, r2, r3
	mov.hi	r0, 1
	j_s.d	[blink]
	bset.lo	r0, r0, 31

	.align 4
.Lcharloop:
	;; unaligned fallback: compare one byte per iteration
	ldb.ab	r2, [r0, 1]
	ldb.ab	r3, [r1, 1]
	nop
	breq	r2, 0, .Lcmpend	; end of s1 reached
	breq	r2, r3, .Lcharloop	; bytes equal: keep going

	.align 4
.Lcmpend:
	j_s.d	[blink]
	sub	r0, r2, r3	; delay slot: signed byte difference
END(strcmp)