Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

cris: import memset.c from newlib: fixes compile error with newer (pre-4.3) gcc

Adrian Bunk reported the following compile error with a SVN head GCC:

...
CC arch/cris/arch-v10/lib/memset.o
/home/bunk/linux/kernel-2.6/git/linux-2.6/arch/cris/arch-v10/lib/memset.c: In function 'memset':
/home/bunk/linux/kernel-2.6/git/linux-2.6/arch/cris/arch-v10/lib/memset.c:164: error: lvalue required as increment operand
/home/bunk/linux/kernel-2.6/git/linux-2.6/arch/cris/arch-v10/lib/memset.c:165: error: lvalue required as increment operand
/home/bunk/linux/kernel-2.6/git/linux-2.6/arch/cris/arch-v10/lib/memset.c:166: error: lvalue required as increment operand
/home/bunk/linux/kernel-2.6/git/linux-2.6/arch/cris/arch-v10/lib/memset.c:167: error: lvalue required as increment operand
/home/bunk/linux/kernel-2.6/git/linux-2.6/arch/cris/arch-v10/lib/memset.c:185: error: lvalue required as increment operand
/home/bunk/linux/kernel-2.6/git/linux-2.6/arch/cris/arch-v10/lib/memset.c:189: error: lvalue required as increment operand
/home/bunk/linux/kernel-2.6/git/linux-2.6/arch/cris/arch-v10/lib/memset.c:192: error: lvalue required as increment operand
... etc ...

This is due to the use of the construct:

*((long*)dst)++ = lc;

Which is no longer legal since casts don't return an lvalue.

The solution is to import the implementation from newlib,
which is continually autotested together with GCC mainline,
and uses the construct:

*(long *) dst = lc; dst += 4;

With this change, the generated code actually shrinks by 76 bytes,
since gcc notices that it can use autoincrement for the move
instruction on CRIS.

text data bss dec hex filename
304 0 0 304 130 memset.old.o
text data bss dec hex filename
228 0 0 228 e4 memset.o

Since this is an import of a file from newlib, I'm not touching
the formatting or correcting any checkpatch errors.

Note also that even though the two files for CRIS v10 and CRIS v32
are identical at the moment, it might be possible to tweak the
CRIS v32 version. Thus, I'm not yet folding them into the same file,
at least not until we've done some research on it.

Signed-off-by: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Mikael Starvik <mikael.starvik@axis.com>
Cc: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Jesper Nilsson and committed by
Linus Torvalds
77a746ce 3c828e49

+391 -378
+196 -189
arch/cris/arch-v10/lib/memset.c
··· 1 - /*#************************************************************************#*/ 2 - /*#-------------------------------------------------------------------------*/ 3 - /*# */ 4 - /*# FUNCTION NAME: memset() */ 5 - /*# */ 6 - /*# PARAMETERS: void* dst; Destination address. */ 7 - /*# int c; Value of byte to write. */ 8 - /*# int len; Number of bytes to write. */ 9 - /*# */ 10 - /*# RETURNS: dst. */ 11 - /*# */ 12 - /*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */ 13 - /*# Framework taken from memcpy. This routine is */ 14 - /*# very sensitive to compiler changes in register allocation. */ 15 - /*# Should really be rewritten to avoid this problem. */ 16 - /*# */ 17 - /*#-------------------------------------------------------------------------*/ 18 - /*# */ 19 - /*# HISTORY */ 20 - /*# */ 21 - /*# DATE NAME CHANGES */ 22 - /*# ---- ---- ------- */ 23 - /*# 990713 HP Tired of watching this function (or */ 24 - /*# really, the nonoptimized generic */ 25 - /*# implementation) take up 90% of simulator */ 26 - /*# output. Measurements needed. */ 27 - /*# */ 28 - /*#-------------------------------------------------------------------------*/ 1 + /* A memset for CRIS. 2 + Copyright (C) 1999-2005 Axis Communications. 3 + All rights reserved. 29 4 30 - #include <linux/types.h> 5 + Redistribution and use in source and binary forms, with or without 6 + modification, are permitted provided that the following conditions 7 + are met: 31 8 32 - /* No, there's no macro saying 12*4, since it is "hard" to get it into 33 - the asm in a good way. Thus better to expose the problem everywhere. 34 - */ 9 + 1. Redistributions of source code must retain the above copyright 10 + notice, this list of conditions and the following disclaimer. 35 11 36 - /* Assuming 1 cycle per dword written or read (ok, not really true), and 37 - one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1) 38 - so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. 
*/ 12 + 2. Neither the name of Axis Communications nor the names of its 13 + contributors may be used to endorse or promote products derived 14 + from this software without specific prior written permission. 39 15 40 - #define ZERO_BLOCK_SIZE (1*12*4) 16 + THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS 17 + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS 20 + COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 21 + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 25 + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 26 + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 + POSSIBILITY OF SUCH DAMAGE. */ 41 28 42 - void *memset(void *pdst, 43 - int c, 44 - size_t plen) 29 + /* FIXME: This file should really only be used for reference, as the 30 + result is somewhat depending on gcc generating what we expect rather 31 + than what we describe. An assembly file should be used instead. */ 32 + 33 + /* Note the multiple occurrence of the expression "12*4", including the 34 + asm. It is hard to get it into the asm in a good way. Thus better to 35 + expose the problem everywhere: no macro. */ 36 + 37 + /* Assuming one cycle per dword written or read (ok, not really true; the 38 + world is not ideal), and one cycle per instruction, then 43+3*(n/48-1) 39 + <= 24+24*(n/48-1) so n >= 45.7; n >= 0.9; we win on the first full 40 + 48-byte block to set. */ 41 + 42 + #define MEMSET_BY_BLOCK_THRESHOLD (1 * 48) 43 + 44 + /* No name ambiguities in this file. 
*/ 45 + __asm__ (".syntax no_register_prefix"); 46 + 47 + void *memset(void *pdst, int c, unsigned int plen) 45 48 { 46 - /* Ok. Now we want the parameters put in special registers. 47 - Make sure the compiler is able to make something useful of this. */ 49 + /* Now we want the parameters in special registers. Make sure the 50 + compiler does something usable with this. */ 48 51 49 52 register char *return_dst __asm__ ("r10") = pdst; 50 53 register int n __asm__ ("r12") = plen; 51 54 register int lc __asm__ ("r11") = c; 52 55 53 - /* Most apps use memset sanely. Only those memsetting about 3..4 54 - bytes or less get penalized compared to the generic implementation 55 - - and that's not really sane use. */ 56 + /* Most apps use memset sanely. Memsetting about 3..4 bytes or less get 57 + penalized here compared to the generic implementation. */ 56 58 57 - /* Ugh. This is fragile at best. Check with newer GCC releases, if 58 - they compile cascaded "x |= x << 8" sanely! */ 59 - __asm__("movu.b %0,$r13\n\t" 60 - "lslq 8,$r13\n\t" 61 - "move.b %0,$r13\n\t" 62 - "move.d $r13,%0\n\t" 63 - "lslq 16,$r13\n\t" 64 - "or.d $r13,%0" 65 - : "=r" (lc) : "0" (lc) : "r13"); 59 + /* This is fragile performancewise at best. Check with newer GCC 60 + releases, if they compile cascaded "x |= x << 8" to sane code. */ 61 + __asm__("movu.b %0,r13 \n\ 62 + lslq 8,r13 \n\ 63 + move.b %0,r13 \n\ 64 + move.d r13,%0 \n\ 65 + lslq 16,r13 \n\ 66 + or.d r13,%0" 67 + : "=r" (lc) /* Inputs. */ 68 + : "0" (lc) /* Outputs. */ 69 + : "r13"); /* Trash. */ 66 70 67 71 { 68 72 register char *dst __asm__ ("r13") = pdst; 69 73 70 - /* This is NONPORTABLE, but since this whole routine is */ 71 - /* grossly nonportable that doesn't matter. */ 74 + if (((unsigned long) pdst & 3) != 0 75 + /* Oops! n = 0 must be a valid call, regardless of alignment. 
*/ 76 + && n >= 3) 77 + { 78 + if ((unsigned long) dst & 1) 79 + { 80 + *dst = (char) lc; 81 + n--; 82 + dst++; 83 + } 72 84 73 - if (((unsigned long) pdst & 3) != 0 74 - /* Oops! n=0 must be a legal call, regardless of alignment. */ 75 - && n >= 3) 76 - { 77 - if ((unsigned long)dst & 1) 78 - { 79 - *dst = (char) lc; 80 - n--; 81 - dst++; 82 - } 85 + if ((unsigned long) dst & 2) 86 + { 87 + *(short *) dst = lc; 88 + n -= 2; 89 + dst += 2; 90 + } 91 + } 83 92 84 - if ((unsigned long)dst & 2) 85 - { 86 - *(short *)dst = lc; 87 - n -= 2; 88 - dst += 2; 89 - } 90 - } 91 - 92 - /* Now the fun part. For the threshold value of this, check the equation 93 - above. */ 94 - /* Decide which copying method to use. */ 95 - if (n >= ZERO_BLOCK_SIZE) 96 - { 97 - /* For large copies we use 'movem' */ 98 - 99 - /* It is not optimal to tell the compiler about clobbering any 100 - registers; that will move the saving/restoring of those registers 101 - to the function prologue/epilogue, and make non-movem sizes 102 - suboptimal. 103 - 104 - This method is not foolproof; it assumes that the "asm reg" 105 - declarations at the beginning of the function really are used 106 - here (beware: they may be moved to temporary registers). 107 - This way, we do not have to save/move the registers around into 108 - temporaries; we can safely use them straight away. 109 - 110 - If you want to check that the allocation was right; then 111 - check the equalities in the first comment. It should say 112 - "r13=r13, r12=r12, r11=r11" */ 113 - __asm__ volatile ("\n\ 114 - ;; Check that the following is true (same register names on \n\ 115 - ;; both sides of equal sign, as in r8=r8): \n\ 116 - ;; %0=r13, %1=r12, %4=r11 \n\ 117 - ;; \n\ 118 - ;; Save the registers we'll clobber in the movem process \n\ 119 - ;; on the stack. Don't mention them to gcc, it will only be \n\ 120 - ;; upset. \n\ 121 - subq 11*4,$sp \n\ 122 - movem $r10,[$sp] \n\ 93 + /* Decide which setting method to use. 
*/ 94 + if (n >= MEMSET_BY_BLOCK_THRESHOLD) 95 + { 96 + /* It is not optimal to tell the compiler about clobbering any 97 + registers; that will move the saving/restoring of those registers 98 + to the function prologue/epilogue, and make non-block sizes 99 + suboptimal. */ 100 + __asm__ volatile 101 + ("\ 102 + ;; GCC does promise correct register allocations, but let's \n\ 103 + ;; make sure it keeps its promises. \n\ 104 + .ifnc %0-%1-%4,$r13-$r12-$r11 \n\ 105 + .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\ 106 + .endif \n\ 123 107 \n\ 124 - move.d $r11,$r0 \n\ 125 - move.d $r11,$r1 \n\ 126 - move.d $r11,$r2 \n\ 127 - move.d $r11,$r3 \n\ 128 - move.d $r11,$r4 \n\ 129 - move.d $r11,$r5 \n\ 130 - move.d $r11,$r6 \n\ 131 - move.d $r11,$r7 \n\ 132 - move.d $r11,$r8 \n\ 133 - move.d $r11,$r9 \n\ 134 - move.d $r11,$r10 \n\ 108 + ;; Save the registers we'll clobber in the movem process \n\ 109 + ;; on the stack. Don't mention them to gcc, it will only be \n\ 110 + ;; upset. \n\ 111 + subq 11*4,sp \n\ 112 + movem r10,[sp] \n\ 135 113 \n\ 136 - ;; Now we've got this: \n\ 137 - ;; r13 - dst \n\ 138 - ;; r12 - n \n\ 114 + move.d r11,r0 \n\ 115 + move.d r11,r1 \n\ 116 + move.d r11,r2 \n\ 117 + move.d r11,r3 \n\ 118 + move.d r11,r4 \n\ 119 + move.d r11,r5 \n\ 120 + move.d r11,r6 \n\ 121 + move.d r11,r7 \n\ 122 + move.d r11,r8 \n\ 123 + move.d r11,r9 \n\ 124 + move.d r11,r10 \n\ 139 125 \n\ 140 - ;; Update n for the first loop \n\ 141 - subq 12*4,$r12 \n\ 126 + ;; Now we've got this: \n\ 127 + ;; r13 - dst \n\ 128 + ;; r12 - n \n\ 129 + \n\ 130 + ;; Update n for the first loop \n\ 131 + subq 12*4,r12 \n\ 142 132 0: \n\ 143 - subq 12*4,$r12 \n\ 144 - bge 0b \n\ 145 - movem $r11,[$r13+] \n\ 133 + " 134 + #ifdef __arch_common_v10_v32 135 + /* Cater to branch offset difference between v32 and v10. We 136 + assume the branch below has an 8-bit offset. 
*/ 137 + " setf\n" 138 + #endif 139 + " subq 12*4,r12 \n\ 140 + bge 0b \n\ 141 + movem r11,[r13+] \n\ 146 142 \n\ 147 - addq 12*4,$r12 ;; compensate for last loop underflowing n \n\ 143 + ;; Compensate for last loop underflowing n. \n\ 144 + addq 12*4,r12 \n\ 148 145 \n\ 149 - ;; Restore registers from stack \n\ 150 - movem [$sp+],$r10" 146 + ;; Restore registers from stack. \n\ 147 + movem [sp+],r10" 151 148 152 - /* Outputs */ : "=r" (dst), "=r" (n) 153 - /* Inputs */ : "0" (dst), "1" (n), "r" (lc)); 149 + /* Outputs. */ 150 + : "=r" (dst), "=r" (n) 154 151 155 - } 152 + /* Inputs. */ 153 + : "0" (dst), "1" (n), "r" (lc)); 154 + } 156 155 157 - /* Either we directly starts copying, using dword copying 158 - in a loop, or we copy as much as possible with 'movem' 159 - and then the last block (<44 bytes) is copied here. 160 - This will work since 'movem' will have updated src,dst,n. */ 156 + /* An ad-hoc unroll, used for 4*12-1..16 bytes. */ 157 + while (n >= 16) 158 + { 159 + *(long *) dst = lc; dst += 4; 160 + *(long *) dst = lc; dst += 4; 161 + *(long *) dst = lc; dst += 4; 162 + *(long *) dst = lc; dst += 4; 163 + n -= 16; 164 + } 161 165 162 - while ( n >= 16 ) 163 - { 164 - *((long*)dst)++ = lc; 165 - *((long*)dst)++ = lc; 166 - *((long*)dst)++ = lc; 167 - *((long*)dst)++ = lc; 168 - n -= 16; 169 - } 170 - 171 - /* A switch() is definitely the fastest although it takes a LOT of code. 172 - * Particularly if you inline code this. 
173 - */ 174 166 switch (n) 175 - { 167 + { 176 168 case 0: 177 169 break; 170 + 178 171 case 1: 179 - *(char*)dst = (char) lc; 172 + *dst = (char) lc; 180 173 break; 174 + 181 175 case 2: 182 - *(short*)dst = (short) lc; 176 + *(short *) dst = (short) lc; 183 177 break; 178 + 184 179 case 3: 185 - *((short*)dst)++ = (short) lc; 186 - *(char*)dst = (char) lc; 180 + *(short *) dst = (short) lc; dst += 2; 181 + *dst = (char) lc; 187 182 break; 183 + 188 184 case 4: 189 - *((long*)dst)++ = lc; 185 + *(long *) dst = lc; 190 186 break; 187 + 191 188 case 5: 192 - *((long*)dst)++ = lc; 193 - *(char*)dst = (char) lc; 189 + *(long *) dst = lc; dst += 4; 190 + *dst = (char) lc; 194 191 break; 192 + 195 193 case 6: 196 - *((long*)dst)++ = lc; 197 - *(short*)dst = (short) lc; 194 + *(long *) dst = lc; dst += 4; 195 + *(short *) dst = (short) lc; 198 196 break; 197 + 199 198 case 7: 200 - *((long*)dst)++ = lc; 201 - *((short*)dst)++ = (short) lc; 202 - *(char*)dst = (char) lc; 199 + *(long *) dst = lc; dst += 4; 200 + *(short *) dst = (short) lc; dst += 2; 201 + *dst = (char) lc; 203 202 break; 203 + 204 204 case 8: 205 - *((long*)dst)++ = lc; 206 - *((long*)dst)++ = lc; 205 + *(long *) dst = lc; dst += 4; 206 + *(long *) dst = lc; 207 207 break; 208 + 208 209 case 9: 209 - *((long*)dst)++ = lc; 210 - *((long*)dst)++ = lc; 211 - *(char*)dst = (char) lc; 210 + *(long *) dst = lc; dst += 4; 211 + *(long *) dst = lc; dst += 4; 212 + *dst = (char) lc; 212 213 break; 214 + 213 215 case 10: 214 - *((long*)dst)++ = lc; 215 - *((long*)dst)++ = lc; 216 - *(short*)dst = (short) lc; 216 + *(long *) dst = lc; dst += 4; 217 + *(long *) dst = lc; dst += 4; 218 + *(short *) dst = (short) lc; 217 219 break; 220 + 218 221 case 11: 219 - *((long*)dst)++ = lc; 220 - *((long*)dst)++ = lc; 221 - *((short*)dst)++ = (short) lc; 222 - *(char*)dst = (char) lc; 222 + *(long *) dst = lc; dst += 4; 223 + *(long *) dst = lc; dst += 4; 224 + *(short *) dst = (short) lc; dst += 2; 225 + *dst = (char) lc; 
223 226 break; 227 + 224 228 case 12: 225 - *((long*)dst)++ = lc; 226 - *((long*)dst)++ = lc; 227 - *((long*)dst)++ = lc; 229 + *(long *) dst = lc; dst += 4; 230 + *(long *) dst = lc; dst += 4; 231 + *(long *) dst = lc; 228 232 break; 233 + 229 234 case 13: 230 - *((long*)dst)++ = lc; 231 - *((long*)dst)++ = lc; 232 - *((long*)dst)++ = lc; 233 - *(char*)dst = (char) lc; 235 + *(long *) dst = lc; dst += 4; 236 + *(long *) dst = lc; dst += 4; 237 + *(long *) dst = lc; dst += 4; 238 + *dst = (char) lc; 234 239 break; 240 + 235 241 case 14: 236 - *((long*)dst)++ = lc; 237 - *((long*)dst)++ = lc; 238 - *((long*)dst)++ = lc; 239 - *(short*)dst = (short) lc; 242 + *(long *) dst = lc; dst += 4; 243 + *(long *) dst = lc; dst += 4; 244 + *(long *) dst = lc; dst += 4; 245 + *(short *) dst = (short) lc; 240 246 break; 247 + 241 248 case 15: 242 - *((long*)dst)++ = lc; 243 - *((long*)dst)++ = lc; 244 - *((long*)dst)++ = lc; 245 - *((short*)dst)++ = (short) lc; 246 - *(char*)dst = (char) lc; 249 + *(long *) dst = lc; dst += 4; 250 + *(long *) dst = lc; dst += 4; 251 + *(long *) dst = lc; dst += 4; 252 + *(short *) dst = (short) lc; dst += 2; 253 + *dst = (char) lc; 247 254 break; 248 - } 255 + } 249 256 } 250 257 251 - return return_dst; /* destination pointer. */ 252 - } /* memset() */ 258 + return return_dst; 259 + }
+195 -189
arch/cris/arch-v32/lib/memset.c
··· 1 - /*#************************************************************************#*/ 2 - /*#-------------------------------------------------------------------------*/ 3 - /*# */ 4 - /*# FUNCTION NAME: memset() */ 5 - /*# */ 6 - /*# PARAMETERS: void* dst; Destination address. */ 7 - /*# int c; Value of byte to write. */ 8 - /*# int len; Number of bytes to write. */ 9 - /*# */ 10 - /*# RETURNS: dst. */ 11 - /*# */ 12 - /*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */ 13 - /*# Framework taken from memcpy. This routine is */ 14 - /*# very sensitive to compiler changes in register allocation. */ 15 - /*# Should really be rewritten to avoid this problem. */ 16 - /*# */ 17 - /*#-------------------------------------------------------------------------*/ 18 - /*# */ 19 - /*# HISTORY */ 20 - /*# */ 21 - /*# DATE NAME CHANGES */ 22 - /*# ---- ---- ------- */ 23 - /*# 990713 HP Tired of watching this function (or */ 24 - /*# really, the nonoptimized generic */ 25 - /*# implementation) take up 90% of simulator */ 26 - /*# output. Measurements needed. */ 27 - /*# */ 28 - /*#-------------------------------------------------------------------------*/ 1 + /* A memset for CRIS. 2 + Copyright (C) 1999-2005 Axis Communications. 3 + All rights reserved. 29 4 30 - #include <linux/types.h> 5 + Redistribution and use in source and binary forms, with or without 6 + modification, are permitted provided that the following conditions 7 + are met: 31 8 32 - /* No, there's no macro saying 12*4, since it is "hard" to get it into 33 - the asm in a good way. Thus better to expose the problem everywhere. 34 - */ 9 + 1. Redistributions of source code must retain the above copyright 10 + notice, this list of conditions and the following disclaimer. 35 11 36 - /* Assuming 1 cycle per dword written or read (ok, not really true), and 37 - one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1) 38 - so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. 
*/ 12 + 2. Neither the name of Axis Communications nor the names of its 13 + contributors may be used to endorse or promote products derived 14 + from this software without specific prior written permission. 39 15 40 - #define ZERO_BLOCK_SIZE (1*12*4) 16 + THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS 17 + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS 20 + COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 21 + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 25 + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 26 + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 + POSSIBILITY OF SUCH DAMAGE. */ 41 28 42 - void *memset(void *pdst, 43 - int c, 44 - size_t plen) 29 + /* FIXME: This file should really only be used for reference, as the 30 + result is somewhat depending on gcc generating what we expect rather 31 + than what we describe. An assembly file should be used instead. */ 32 + 33 + /* Note the multiple occurrence of the expression "12*4", including the 34 + asm. It is hard to get it into the asm in a good way. Thus better to 35 + expose the problem everywhere: no macro. */ 36 + 37 + /* Assuming one cycle per dword written or read (ok, not really true; the 38 + world is not ideal), and one cycle per instruction, then 43+3*(n/48-1) 39 + <= 24+24*(n/48-1) so n >= 45.7; n >= 0.9; we win on the first full 40 + 48-byte block to set. */ 41 + 42 + #define MEMSET_BY_BLOCK_THRESHOLD (1 * 48) 43 + 44 + /* No name ambiguities in this file. 
*/ 45 + __asm__ (".syntax no_register_prefix"); 46 + 47 + void *memset(void *pdst, int c, unsigned int plen) 45 48 { 46 - /* Ok. Now we want the parameters put in special registers. 47 - Make sure the compiler is able to make something useful of this. */ 49 + /* Now we want the parameters in special registers. Make sure the 50 + compiler does something usable with this. */ 48 51 49 52 register char *return_dst __asm__ ("r10") = pdst; 50 53 register int n __asm__ ("r12") = plen; 51 54 register int lc __asm__ ("r11") = c; 52 55 53 - /* Most apps use memset sanely. Only those memsetting about 3..4 54 - bytes or less get penalized compared to the generic implementation 55 - - and that's not really sane use. */ 56 + /* Most apps use memset sanely. Memsetting about 3..4 bytes or less get 57 + penalized here compared to the generic implementation. */ 56 58 57 - /* Ugh. This is fragile at best. Check with newer GCC releases, if 58 - they compile cascaded "x |= x << 8" sanely! */ 59 - __asm__("movu.b %0,$r13 \n\ 60 - lslq 8,$r13 \n\ 61 - move.b %0,$r13 \n\ 62 - move.d $r13,%0 \n\ 63 - lslq 16,$r13 \n\ 64 - or.d $r13,%0" 65 - : "=r" (lc) : "0" (lc) : "r13"); 59 + /* This is fragile performancewise at best. Check with newer GCC 60 + releases, if they compile cascaded "x |= x << 8" to sane code. */ 61 + __asm__("movu.b %0,r13 \n\ 62 + lslq 8,r13 \n\ 63 + move.b %0,r13 \n\ 64 + move.d r13,%0 \n\ 65 + lslq 16,r13 \n\ 66 + or.d r13,%0" 67 + : "=r" (lc) /* Inputs. */ 68 + : "0" (lc) /* Outputs. */ 69 + : "r13"); /* Trash. */ 66 70 67 71 { 68 72 register char *dst __asm__ ("r13") = pdst; 69 73 70 - /* This is NONPORTABLE, but since this whole routine is */ 71 - /* grossly nonportable that doesn't matter. */ 74 + if (((unsigned long) pdst & 3) != 0 75 + /* Oops! n = 0 must be a valid call, regardless of alignment. 
*/ 76 + && n >= 3) 77 + { 78 + if ((unsigned long) dst & 1) 79 + { 80 + *dst = (char) lc; 81 + n--; 82 + dst++; 83 + } 72 84 73 - if (((unsigned long) pdst & 3) != 0 74 - /* Oops! n=0 must be a legal call, regardless of alignment. */ 75 - && n >= 3) 76 - { 77 - if ((unsigned long)dst & 1) 78 - { 79 - *dst = (char) lc; 80 - n--; 81 - dst++; 82 - } 85 + if ((unsigned long) dst & 2) 86 + { 87 + *(short *) dst = lc; 88 + n -= 2; 89 + dst += 2; 90 + } 91 + } 83 92 84 - if ((unsigned long)dst & 2) 85 - { 86 - *(short *)dst = lc; 87 - n -= 2; 88 - dst += 2; 89 - } 90 - } 91 - 92 - /* Now the fun part. For the threshold value of this, check the equation 93 - above. */ 94 - /* Decide which copying method to use. */ 95 - if (n >= ZERO_BLOCK_SIZE) 96 - { 97 - /* For large copies we use 'movem' */ 98 - 99 - /* It is not optimal to tell the compiler about clobbering any 100 - registers; that will move the saving/restoring of those registers 101 - to the function prologue/epilogue, and make non-movem sizes 102 - suboptimal. 103 - 104 - This method is not foolproof; it assumes that the "asm reg" 105 - declarations at the beginning of the function really are used 106 - here (beware: they may be moved to temporary registers). 107 - This way, we do not have to save/move the registers around into 108 - temporaries; we can safely use them straight away. 109 - 110 - If you want to check that the allocation was right; then 111 - check the equalities in the first comment. It should say 112 - "r13=r13, r12=r12, r11=r11" */ 113 - __asm__ volatile (" \n\ 114 - ;; Check that the register asm declaration got right. \n\ 115 - ;; The GCC manual says it will work, but there *has* been bugs. \n\ 116 - .ifnc %0-%1-%4,$r13-$r12-$r11 \n\ 117 - .err \n\ 118 - .endif \n\ 93 + /* Decide which setting method to use. 
*/ 94 + if (n >= MEMSET_BY_BLOCK_THRESHOLD) 95 + { 96 + /* It is not optimal to tell the compiler about clobbering any 97 + registers; that will move the saving/restoring of those registers 98 + to the function prologue/epilogue, and make non-block sizes 99 + suboptimal. */ 100 + __asm__ volatile 101 + ("\ 102 + ;; GCC does promise correct register allocations, but let's \n\ 103 + ;; make sure it keeps its promises. \n\ 104 + .ifnc %0-%1-%4,$r13-$r12-$r11 \n\ 105 + .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\ 106 + .endif \n\ 119 107 \n\ 120 - ;; Save the registers we'll clobber in the movem process \n\ 121 - ;; on the stack. Don't mention them to gcc, it will only be \n\ 122 - ;; upset. \n\ 123 - subq 11*4,$sp \n\ 124 - movem $r10,[$sp] \n\ 108 + ;; Save the registers we'll clobber in the movem process \n\ 109 + ;; on the stack. Don't mention them to gcc, it will only be \n\ 110 + ;; upset. \n\ 111 + subq 11*4,sp \n\ 112 + movem r10,[sp] \n\ 125 113 \n\ 126 - move.d $r11,$r0 \n\ 127 - move.d $r11,$r1 \n\ 128 - move.d $r11,$r2 \n\ 129 - move.d $r11,$r3 \n\ 130 - move.d $r11,$r4 \n\ 131 - move.d $r11,$r5 \n\ 132 - move.d $r11,$r6 \n\ 133 - move.d $r11,$r7 \n\ 134 - move.d $r11,$r8 \n\ 135 - move.d $r11,$r9 \n\ 136 - move.d $r11,$r10 \n\ 114 + move.d r11,r0 \n\ 115 + move.d r11,r1 \n\ 116 + move.d r11,r2 \n\ 117 + move.d r11,r3 \n\ 118 + move.d r11,r4 \n\ 119 + move.d r11,r5 \n\ 120 + move.d r11,r6 \n\ 121 + move.d r11,r7 \n\ 122 + move.d r11,r8 \n\ 123 + move.d r11,r9 \n\ 124 + move.d r11,r10 \n\ 137 125 \n\ 138 - ;; Now we've got this: \n\ 139 - ;; r13 - dst \n\ 140 - ;; r12 - n \n\ 126 + ;; Now we've got this: \n\ 127 + ;; r13 - dst \n\ 128 + ;; r12 - n \n\ 141 129 \n\ 142 - ;; Update n for the first loop \n\ 143 - subq 12*4,$r12 \n\ 130 + ;; Update n for the first loop \n\ 131 + subq 12*4,r12 \n\ 144 132 0: \n\ 145 - subq 12*4,$r12 \n\ 146 - bge 0b \n\ 147 - movem $r11,[$r13+] \n\ 133 + " 134 + #ifdef __arch_common_v10_v32 135 + /* Cater to branch 
offset difference between v32 and v10. We 136 + assume the branch below has an 8-bit offset. */ 137 + " setf\n" 138 + #endif 139 + " subq 12*4,r12 \n\ 140 + bge 0b \n\ 141 + movem r11,[r13+] \n\ 148 142 \n\ 149 - addq 12*4,$r12 ;; compensate for last loop underflowing n \n\ 143 + ;; Compensate for last loop underflowing n. \n\ 144 + addq 12*4,r12 \n\ 150 145 \n\ 151 - ;; Restore registers from stack \n\ 152 - movem [$sp+],$r10" 146 + ;; Restore registers from stack. \n\ 147 + movem [sp+],r10" 153 148 154 - /* Outputs */ : "=r" (dst), "=r" (n) 155 - /* Inputs */ : "0" (dst), "1" (n), "r" (lc)); 156 - } 149 + /* Outputs. */ 150 + : "=r" (dst), "=r" (n) 157 151 158 - /* Either we directly starts copying, using dword copying 159 - in a loop, or we copy as much as possible with 'movem' 160 - and then the last block (<44 bytes) is copied here. 161 - This will work since 'movem' will have updated src,dst,n. */ 152 + /* Inputs. */ 153 + : "0" (dst), "1" (n), "r" (lc)); 154 + } 162 155 163 - while ( n >= 16 ) 164 - { 165 - *((long*)dst)++ = lc; 166 - *((long*)dst)++ = lc; 167 - *((long*)dst)++ = lc; 168 - *((long*)dst)++ = lc; 169 - n -= 16; 170 - } 156 + /* An ad-hoc unroll, used for 4*12-1..16 bytes. */ 157 + while (n >= 16) 158 + { 159 + *(long *) dst = lc; dst += 4; 160 + *(long *) dst = lc; dst += 4; 161 + *(long *) dst = lc; dst += 4; 162 + *(long *) dst = lc; dst += 4; 163 + n -= 16; 164 + } 171 165 172 - /* A switch() is definitely the fastest although it takes a LOT of code. 173 - * Particularly if you inline code this. 
174 - */ 175 166 switch (n) 176 - { 167 + { 177 168 case 0: 178 169 break; 170 + 179 171 case 1: 180 - *(char*)dst = (char) lc; 172 + *dst = (char) lc; 181 173 break; 174 + 182 175 case 2: 183 - *(short*)dst = (short) lc; 176 + *(short *) dst = (short) lc; 184 177 break; 178 + 185 179 case 3: 186 - *((short*)dst)++ = (short) lc; 187 - *(char*)dst = (char) lc; 180 + *(short *) dst = (short) lc; dst += 2; 181 + *dst = (char) lc; 188 182 break; 183 + 189 184 case 4: 190 - *((long*)dst)++ = lc; 185 + *(long *) dst = lc; 191 186 break; 187 + 192 188 case 5: 193 - *((long*)dst)++ = lc; 194 - *(char*)dst = (char) lc; 189 + *(long *) dst = lc; dst += 4; 190 + *dst = (char) lc; 195 191 break; 192 + 196 193 case 6: 197 - *((long*)dst)++ = lc; 198 - *(short*)dst = (short) lc; 194 + *(long *) dst = lc; dst += 4; 195 + *(short *) dst = (short) lc; 199 196 break; 197 + 200 198 case 7: 201 - *((long*)dst)++ = lc; 202 - *((short*)dst)++ = (short) lc; 203 - *(char*)dst = (char) lc; 199 + *(long *) dst = lc; dst += 4; 200 + *(short *) dst = (short) lc; dst += 2; 201 + *dst = (char) lc; 204 202 break; 203 + 205 204 case 8: 206 - *((long*)dst)++ = lc; 207 - *((long*)dst)++ = lc; 205 + *(long *) dst = lc; dst += 4; 206 + *(long *) dst = lc; 208 207 break; 208 + 209 209 case 9: 210 - *((long*)dst)++ = lc; 211 - *((long*)dst)++ = lc; 212 - *(char*)dst = (char) lc; 210 + *(long *) dst = lc; dst += 4; 211 + *(long *) dst = lc; dst += 4; 212 + *dst = (char) lc; 213 213 break; 214 + 214 215 case 10: 215 - *((long*)dst)++ = lc; 216 - *((long*)dst)++ = lc; 217 - *(short*)dst = (short) lc; 216 + *(long *) dst = lc; dst += 4; 217 + *(long *) dst = lc; dst += 4; 218 + *(short *) dst = (short) lc; 218 219 break; 220 + 219 221 case 11: 220 - *((long*)dst)++ = lc; 221 - *((long*)dst)++ = lc; 222 - *((short*)dst)++ = (short) lc; 223 - *(char*)dst = (char) lc; 222 + *(long *) dst = lc; dst += 4; 223 + *(long *) dst = lc; dst += 4; 224 + *(short *) dst = (short) lc; dst += 2; 225 + *dst = (char) lc; 
224 226 break; 227 + 225 228 case 12: 226 - *((long*)dst)++ = lc; 227 - *((long*)dst)++ = lc; 228 - *((long*)dst)++ = lc; 229 + *(long *) dst = lc; dst += 4; 230 + *(long *) dst = lc; dst += 4; 231 + *(long *) dst = lc; 229 232 break; 233 + 230 234 case 13: 231 - *((long*)dst)++ = lc; 232 - *((long*)dst)++ = lc; 233 - *((long*)dst)++ = lc; 234 - *(char*)dst = (char) lc; 235 + *(long *) dst = lc; dst += 4; 236 + *(long *) dst = lc; dst += 4; 237 + *(long *) dst = lc; dst += 4; 238 + *dst = (char) lc; 235 239 break; 240 + 236 241 case 14: 237 - *((long*)dst)++ = lc; 238 - *((long*)dst)++ = lc; 239 - *((long*)dst)++ = lc; 240 - *(short*)dst = (short) lc; 242 + *(long *) dst = lc; dst += 4; 243 + *(long *) dst = lc; dst += 4; 244 + *(long *) dst = lc; dst += 4; 245 + *(short *) dst = (short) lc; 241 246 break; 247 + 242 248 case 15: 243 - *((long*)dst)++ = lc; 244 - *((long*)dst)++ = lc; 245 - *((long*)dst)++ = lc; 246 - *((short*)dst)++ = (short) lc; 247 - *(char*)dst = (char) lc; 249 + *(long *) dst = lc; dst += 4; 250 + *(long *) dst = lc; dst += 4; 251 + *(long *) dst = lc; dst += 4; 252 + *(short *) dst = (short) lc; dst += 2; 253 + *dst = (char) lc; 248 254 break; 249 - } 255 + } 250 256 } 251 257 252 - return return_dst; /* destination pointer. */ 253 - } /* memset() */ 258 + return return_dst; 259 + }