Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

csky: Add C based string functions

Try to access RAM with the largest bit width possible, but without
doing unaligned accesses.

A further improvement could be to use multiple reads and writes, as the
assembly version was trying to do.

Tested on a BeagleV Starlight with a SiFive U74 core, where the
improvement is noticeable.

Signed-off-by: Matteo Croce <mcroce@microsoft.com>
Co-developed-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>

authored by

Matteo Croce and committed by
Guo Ren
e4df2d5e cfb24463

+150 -356
+8
arch/csky/Kconfig
··· 320 320 controlled through /sys/devices/system/cpu/cpu1/hotplug/target. 321 321 322 322 Say N if you want to disable CPU hotplug. 323 + 324 + config HAVE_EFFICIENT_UNALIGNED_STRING_OPS 325 + bool "Enable EFFICIENT_UNALIGNED_STRING_OPS for abiv2" 326 + depends on CPU_CK807 || CPU_CK810 || CPU_CK860 327 + help 328 + Say Y here to enable EFFICIENT_UNALIGNED_STRING_OPS. Some CPU models could 329 + deal with unaligned access by hardware. 330 + 323 331 endmenu 324 332 325 333 source "arch/csky/Kconfig.platforms"
-2
arch/csky/abiv1/Makefile
··· 4 4 obj-y += bswapsi.o 5 5 obj-y += cacheflush.o 6 6 obj-y += mmap.o 7 - obj-y += memcpy.o 8 - obj-y += strksyms.o
-347
arch/csky/abiv1/memcpy.S
··· 1 - /* SPDX-License-Identifier: GPL-2.0 */ 2 - // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. 3 - 4 - #include <linux/linkage.h> 5 - 6 - .macro GET_FRONT_BITS rx y 7 - #ifdef __cskyLE__ 8 - lsri \rx, \y 9 - #else 10 - lsli \rx, \y 11 - #endif 12 - .endm 13 - 14 - .macro GET_AFTER_BITS rx y 15 - #ifdef __cskyLE__ 16 - lsli \rx, \y 17 - #else 18 - lsri \rx, \y 19 - #endif 20 - .endm 21 - 22 - /* void *memcpy(void *dest, const void *src, size_t n); */ 23 - ENTRY(memcpy) 24 - mov r7, r2 25 - cmplti r4, 4 26 - bt .L_copy_by_byte 27 - mov r6, r2 28 - andi r6, 3 29 - cmpnei r6, 0 30 - jbt .L_dest_not_aligned 31 - mov r6, r3 32 - andi r6, 3 33 - cmpnei r6, 0 34 - jbt .L_dest_aligned_but_src_not_aligned 35 - .L0: 36 - cmplti r4, 16 37 - jbt .L_aligned_and_len_less_16bytes 38 - subi sp, 8 39 - stw r8, (sp, 0) 40 - .L_aligned_and_len_larger_16bytes: 41 - ldw r1, (r3, 0) 42 - ldw r5, (r3, 4) 43 - ldw r8, (r3, 8) 44 - stw r1, (r7, 0) 45 - ldw r1, (r3, 12) 46 - stw r5, (r7, 4) 47 - stw r8, (r7, 8) 48 - stw r1, (r7, 12) 49 - subi r4, 16 50 - addi r3, 16 51 - addi r7, 16 52 - cmplti r4, 16 53 - jbf .L_aligned_and_len_larger_16bytes 54 - ldw r8, (sp, 0) 55 - addi sp, 8 56 - cmpnei r4, 0 57 - jbf .L_return 58 - 59 - .L_aligned_and_len_less_16bytes: 60 - cmplti r4, 4 61 - bt .L_copy_by_byte 62 - .L1: 63 - ldw r1, (r3, 0) 64 - stw r1, (r7, 0) 65 - subi r4, 4 66 - addi r3, 4 67 - addi r7, 4 68 - cmplti r4, 4 69 - jbf .L1 70 - br .L_copy_by_byte 71 - 72 - .L_return: 73 - rts 74 - 75 - .L_copy_by_byte: /* len less than 4 bytes */ 76 - cmpnei r4, 0 77 - jbf .L_return 78 - .L4: 79 - ldb r1, (r3, 0) 80 - stb r1, (r7, 0) 81 - addi r3, 1 82 - addi r7, 1 83 - decne r4 84 - jbt .L4 85 - rts 86 - 87 - /* 88 - * If dest is not aligned, just copying some bytes makes the dest align. 89 - * Afther that, we judge whether the src is aligned. 
90 - */ 91 - .L_dest_not_aligned: 92 - mov r5, r3 93 - rsub r5, r5, r7 94 - abs r5, r5 95 - cmplt r5, r4 96 - bt .L_copy_by_byte 97 - mov r5, r7 98 - sub r5, r3 99 - cmphs r5, r4 100 - bf .L_copy_by_byte 101 - mov r5, r6 102 - .L5: 103 - ldb r1, (r3, 0) /* makes the dest align. */ 104 - stb r1, (r7, 0) 105 - addi r5, 1 106 - subi r4, 1 107 - addi r3, 1 108 - addi r7, 1 109 - cmpnei r5, 4 110 - jbt .L5 111 - cmplti r4, 4 112 - jbt .L_copy_by_byte 113 - mov r6, r3 /* judge whether the src is aligned. */ 114 - andi r6, 3 115 - cmpnei r6, 0 116 - jbf .L0 117 - 118 - /* Judge the number of misaligned, 1, 2, 3? */ 119 - .L_dest_aligned_but_src_not_aligned: 120 - mov r5, r3 121 - rsub r5, r5, r7 122 - abs r5, r5 123 - cmplt r5, r4 124 - bt .L_copy_by_byte 125 - bclri r3, 0 126 - bclri r3, 1 127 - ldw r1, (r3, 0) 128 - addi r3, 4 129 - cmpnei r6, 2 130 - bf .L_dest_aligned_but_src_not_aligned_2bytes 131 - cmpnei r6, 3 132 - bf .L_dest_aligned_but_src_not_aligned_3bytes 133 - 134 - .L_dest_aligned_but_src_not_aligned_1byte: 135 - mov r5, r7 136 - sub r5, r3 137 - cmphs r5, r4 138 - bf .L_copy_by_byte 139 - cmplti r4, 16 140 - bf .L11 141 - .L10: /* If the len is less than 16 bytes */ 142 - GET_FRONT_BITS r1 8 143 - mov r5, r1 144 - ldw r6, (r3, 0) 145 - mov r1, r6 146 - GET_AFTER_BITS r6 24 147 - or r5, r6 148 - stw r5, (r7, 0) 149 - subi r4, 4 150 - addi r3, 4 151 - addi r7, 4 152 - cmplti r4, 4 153 - bf .L10 154 - subi r3, 3 155 - br .L_copy_by_byte 156 - .L11: 157 - subi sp, 16 158 - stw r8, (sp, 0) 159 - stw r9, (sp, 4) 160 - stw r10, (sp, 8) 161 - stw r11, (sp, 12) 162 - .L12: 163 - ldw r5, (r3, 0) 164 - ldw r11, (r3, 4) 165 - ldw r8, (r3, 8) 166 - ldw r9, (r3, 12) 167 - 168 - GET_FRONT_BITS r1 8 /* little or big endian? 
*/ 169 - mov r10, r5 170 - GET_AFTER_BITS r5 24 171 - or r5, r1 172 - 173 - GET_FRONT_BITS r10 8 174 - mov r1, r11 175 - GET_AFTER_BITS r11 24 176 - or r11, r10 177 - 178 - GET_FRONT_BITS r1 8 179 - mov r10, r8 180 - GET_AFTER_BITS r8 24 181 - or r8, r1 182 - 183 - GET_FRONT_BITS r10 8 184 - mov r1, r9 185 - GET_AFTER_BITS r9 24 186 - or r9, r10 187 - 188 - stw r5, (r7, 0) 189 - stw r11, (r7, 4) 190 - stw r8, (r7, 8) 191 - stw r9, (r7, 12) 192 - subi r4, 16 193 - addi r3, 16 194 - addi r7, 16 195 - cmplti r4, 16 196 - jbf .L12 197 - ldw r8, (sp, 0) 198 - ldw r9, (sp, 4) 199 - ldw r10, (sp, 8) 200 - ldw r11, (sp, 12) 201 - addi sp , 16 202 - cmplti r4, 4 203 - bf .L10 204 - subi r3, 3 205 - br .L_copy_by_byte 206 - 207 - .L_dest_aligned_but_src_not_aligned_2bytes: 208 - cmplti r4, 16 209 - bf .L21 210 - .L20: 211 - GET_FRONT_BITS r1 16 212 - mov r5, r1 213 - ldw r6, (r3, 0) 214 - mov r1, r6 215 - GET_AFTER_BITS r6 16 216 - or r5, r6 217 - stw r5, (r7, 0) 218 - subi r4, 4 219 - addi r3, 4 220 - addi r7, 4 221 - cmplti r4, 4 222 - bf .L20 223 - subi r3, 2 224 - br .L_copy_by_byte 225 - rts 226 - 227 - .L21: /* n > 16 */ 228 - subi sp, 16 229 - stw r8, (sp, 0) 230 - stw r9, (sp, 4) 231 - stw r10, (sp, 8) 232 - stw r11, (sp, 12) 233 - 234 - .L22: 235 - ldw r5, (r3, 0) 236 - ldw r11, (r3, 4) 237 - ldw r8, (r3, 8) 238 - ldw r9, (r3, 12) 239 - 240 - GET_FRONT_BITS r1 16 241 - mov r10, r5 242 - GET_AFTER_BITS r5 16 243 - or r5, r1 244 - 245 - GET_FRONT_BITS r10 16 246 - mov r1, r11 247 - GET_AFTER_BITS r11 16 248 - or r11, r10 249 - 250 - GET_FRONT_BITS r1 16 251 - mov r10, r8 252 - GET_AFTER_BITS r8 16 253 - or r8, r1 254 - 255 - GET_FRONT_BITS r10 16 256 - mov r1, r9 257 - GET_AFTER_BITS r9 16 258 - or r9, r10 259 - 260 - stw r5, (r7, 0) 261 - stw r11, (r7, 4) 262 - stw r8, (r7, 8) 263 - stw r9, (r7, 12) 264 - subi r4, 16 265 - addi r3, 16 266 - addi r7, 16 267 - cmplti r4, 16 268 - jbf .L22 269 - ldw r8, (sp, 0) 270 - ldw r9, (sp, 4) 271 - ldw r10, (sp, 8) 272 - ldw r11, 
(sp, 12) 273 - addi sp, 16 274 - cmplti r4, 4 275 - bf .L20 276 - subi r3, 2 277 - br .L_copy_by_byte 278 - 279 - 280 - .L_dest_aligned_but_src_not_aligned_3bytes: 281 - cmplti r4, 16 282 - bf .L31 283 - .L30: 284 - GET_FRONT_BITS r1 24 285 - mov r5, r1 286 - ldw r6, (r3, 0) 287 - mov r1, r6 288 - GET_AFTER_BITS r6 8 289 - or r5, r6 290 - stw r5, (r7, 0) 291 - subi r4, 4 292 - addi r3, 4 293 - addi r7, 4 294 - cmplti r4, 4 295 - bf .L30 296 - subi r3, 1 297 - br .L_copy_by_byte 298 - .L31: 299 - subi sp, 16 300 - stw r8, (sp, 0) 301 - stw r9, (sp, 4) 302 - stw r10, (sp, 8) 303 - stw r11, (sp, 12) 304 - .L32: 305 - ldw r5, (r3, 0) 306 - ldw r11, (r3, 4) 307 - ldw r8, (r3, 8) 308 - ldw r9, (r3, 12) 309 - 310 - GET_FRONT_BITS r1 24 311 - mov r10, r5 312 - GET_AFTER_BITS r5 8 313 - or r5, r1 314 - 315 - GET_FRONT_BITS r10 24 316 - mov r1, r11 317 - GET_AFTER_BITS r11 8 318 - or r11, r10 319 - 320 - GET_FRONT_BITS r1 24 321 - mov r10, r8 322 - GET_AFTER_BITS r8 8 323 - or r8, r1 324 - 325 - GET_FRONT_BITS r10 24 326 - mov r1, r9 327 - GET_AFTER_BITS r9 8 328 - or r9, r10 329 - 330 - stw r5, (r7, 0) 331 - stw r11, (r7, 4) 332 - stw r8, (r7, 8) 333 - stw r9, (r7, 12) 334 - subi r4, 16 335 - addi r3, 16 336 - addi r7, 16 337 - cmplti r4, 16 338 - jbf .L32 339 - ldw r8, (sp, 0) 340 - ldw r9, (sp, 4) 341 - ldw r10, (sp, 8) 342 - ldw r11, (sp, 12) 343 - addi sp, 16 344 - cmplti r4, 4 345 - bf .L30 346 - subi r3, 1 347 - br .L_copy_by_byte
-6
arch/csky/abiv1/strksyms.c
··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. 3 - 4 - #include <linux/module.h> 5 - 6 - EXPORT_SYMBOL(memcpy);
+2
arch/csky/abiv2/Makefile
··· 2 2 obj-y += cacheflush.o 3 3 obj-$(CONFIG_CPU_HAS_FPU) += fpu.o 4 4 obj-y += memcmp.o 5 + ifeq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS), y) 5 6 obj-y += memcpy.o 6 7 obj-y += memmove.o 7 8 obj-y += memset.o 9 + endif 8 10 obj-y += strcmp.o 9 11 obj-y += strcpy.o 10 12 obj-y += strlen.o
+3 -1
arch/csky/abiv2/strksyms.c
··· 3 3 4 4 #include <linux/module.h> 5 5 6 + #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS 6 7 EXPORT_SYMBOL(memcpy); 7 8 EXPORT_SYMBOL(memset); 8 - EXPORT_SYMBOL(memcmp); 9 9 EXPORT_SYMBOL(memmove); 10 + #endif 11 + EXPORT_SYMBOL(memcmp); 10 12 EXPORT_SYMBOL(strcmp); 11 13 EXPORT_SYMBOL(strcpy); 12 14 EXPORT_SYMBOL(strlen);
+3
arch/csky/lib/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 lib-y := usercopy.o delay.o 3 3 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o 4 + ifneq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS), y) 5 + lib-y += string.o 6 + endif
+134
arch/csky/lib/string.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * String functions optimized for hardware which doesn't 4 + * handle unaligned memory accesses efficiently. 5 + * 6 + * Copyright (C) 2021 Matteo Croce 7 + */ 8 + 9 + #include <linux/types.h> 10 + #include <linux/module.h> 11 + 12 + /* Minimum size for a word copy to be convenient */ 13 + #define BYTES_LONG sizeof(long) 14 + #define WORD_MASK (BYTES_LONG - 1) 15 + #define MIN_THRESHOLD (BYTES_LONG * 2) 16 + 17 + /* convenience union to avoid cast between different pointer types */ 18 + union types { 19 + u8 *as_u8; 20 + unsigned long *as_ulong; 21 + uintptr_t as_uptr; 22 + }; 23 + 24 + union const_types { 25 + const u8 *as_u8; 26 + unsigned long *as_ulong; 27 + uintptr_t as_uptr; 28 + }; 29 + 30 + void *memcpy(void *dest, const void *src, size_t count) 31 + { 32 + union const_types s = { .as_u8 = src }; 33 + union types d = { .as_u8 = dest }; 34 + int distance = 0; 35 + 36 + if (count < MIN_THRESHOLD) 37 + goto copy_remainder; 38 + 39 + /* Copy a byte at time until destination is aligned. */ 40 + for (; d.as_uptr & WORD_MASK; count--) 41 + *d.as_u8++ = *s.as_u8++; 42 + 43 + distance = s.as_uptr & WORD_MASK; 44 + 45 + if (distance) { 46 + unsigned long last, next; 47 + 48 + /* 49 + * s is distance bytes ahead of d, and d just reached 50 + * the alignment boundary. Move s backward to word align it 51 + * and shift data to compensate for distance, in order to do 52 + * word-by-word copy. 53 + */ 54 + s.as_u8 -= distance; 55 + 56 + next = s.as_ulong[0]; 57 + for (; count >= BYTES_LONG; count -= BYTES_LONG) { 58 + last = next; 59 + next = s.as_ulong[1]; 60 + 61 + d.as_ulong[0] = last >> (distance * 8) | 62 + next << ((BYTES_LONG - distance) * 8); 63 + 64 + d.as_ulong++; 65 + s.as_ulong++; 66 + } 67 + 68 + /* Restore s with the original offset. */ 69 + s.as_u8 += distance; 70 + } else { 71 + /* 72 + * If the source and dest lower bits are the same, do a simple 73 + * 32/64 bit wide copy. 
74 + */ 75 + for (; count >= BYTES_LONG; count -= BYTES_LONG) 76 + *d.as_ulong++ = *s.as_ulong++; 77 + } 78 + 79 + copy_remainder: 80 + while (count--) 81 + *d.as_u8++ = *s.as_u8++; 82 + 83 + return dest; 84 + } 85 + EXPORT_SYMBOL(memcpy); 86 + 87 + /* 88 + * Simply check if the buffer overlaps an call memcpy() in case, 89 + * otherwise do a simple one byte at time backward copy. 90 + */ 91 + void *memmove(void *dest, const void *src, size_t count) 92 + { 93 + if (dest < src || src + count <= dest) 94 + return memcpy(dest, src, count); 95 + 96 + if (dest > src) { 97 + const char *s = src + count; 98 + char *tmp = dest + count; 99 + 100 + while (count--) 101 + *--tmp = *--s; 102 + } 103 + return dest; 104 + } 105 + EXPORT_SYMBOL(memmove); 106 + 107 + void *memset(void *s, int c, size_t count) 108 + { 109 + union types dest = { .as_u8 = s }; 110 + 111 + if (count >= MIN_THRESHOLD) { 112 + unsigned long cu = (unsigned long)c; 113 + 114 + /* Compose an ulong with 'c' repeated 4/8 times */ 115 + cu |= cu << 8; 116 + cu |= cu << 16; 117 + /* Suppress warning on 32 bit machines */ 118 + cu |= (cu << 16) << 16; 119 + 120 + for (; count && dest.as_uptr & WORD_MASK; count--) 121 + *dest.as_u8++ = c; 122 + 123 + /* Copy using the largest size allowed */ 124 + for (; count >= BYTES_LONG; count -= BYTES_LONG) 125 + *dest.as_ulong++ = cu; 126 + } 127 + 128 + /* copy the remainder */ 129 + while (count--) 130 + *dest.as_u8++ = c; 131 + 132 + return s; 133 + } 134 + EXPORT_SYMBOL(memset);