#ifndef _METAG_CHECKSUM_H
#define _METAG_CHECKSUM_H

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * returns a 32-bit number suitable for feeding into itself
 * or csum_tcpudp_magic
 *
 * this function must be called with even lengths, except
 * for the last fragment, which may be odd
 *
 * it's best to have buff aligned on a 32-bit boundary
 */
extern __wsum csum_partial(const void *buff, int len, __wsum sum);

/*
 * the same as csum_partial, but copies from src while it
 * checksums
 *
 * here even more important to align src and dst on a 32-bit (or even
 * better 64-bit) boundary
 */
extern __wsum csum_partial_copy(const void *src, void *dst, int len,
				__wsum sum);

/*
 * the same as csum_partial_copy, but copies from user space.
 *
 * here even more important to align src and dst on a 32-bit (or even
 * better 64-bit) boundary
 *
 * On a fault *csum_err is set to -EFAULT (see the implementation in
 * lib/checksum.c).
 */
extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
					  int len, __wsum sum, int *csum_err);

#define csum_partial_copy_nocheck(src, dst, len, sum)	\
	csum_partial_copy((src), (dst), (len), (sum))

/*
 * Fold a partial checksum
 *
 * Adds the high 16 bits into the low 16 bits twice (the second add
 * absorbs any carry produced by the first), then complements, giving
 * a 16-bit ones-complement checksum.
 */
static inline __sum16 csum_fold(__wsum csum)
{
	u32 sum = (__force u32)csum;
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (__force __sum16)~sum;
}

/*
 * This is a version of ip_compute_csum() optimized for IP headers,
 * which always checksum on 4 octet boundaries.
 */
extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);

/*
 * computes the 32-bit partial checksum of the TCP/UDP pseudo-header;
 * the result is NOT folded or complemented - feed it to csum_fold()
 * (csum_tcpudp_magic below does exactly that)
 */
static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
					unsigned short len,
					unsigned short proto,
					__wsum sum)
{
	/*
	 * Pack protocol and length into a single word; the << 8 puts
	 * them into the byte lanes of the pseudo-header.
	 * NOTE(review): assumes the byte packing matches what the
	 * network stack expects on this (little-endian) target -
	 * confirm against other arches' csum_tcpudp_nofold.
	 */
	unsigned long len_proto = (proto + len) << 8;
	/*
	 * Ones-complement accumulate: each ADDS/ADDCS pair adds a term
	 * and folds the carry-out straight back in (end-around carry).
	 */
	asm ("ADD    %0, %0, %1\n"
	     "ADDS   %0, %0, %2\n"
	     "ADDCS  %0, %0, #1\n"
	     "ADDS   %0, %0, %3\n"
	     "ADDCS  %0, %0, #1\n"
	     : "=d" (sum)
	     : "d" (daddr), "d" (saddr), "d" (len_proto),
	       "0" (sum)
	     : "cc");
	return sum;
}

/*
 * computes the checksum of the TCP/UDP pseudo-header
 * returns a 16-bit checksum, already complemented
 */
static inline __sum16
csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len,
		  unsigned short proto, __wsum sum)
{
	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}

/*
 * this routine is used for miscellaneous IP-like checksums, mainly
 * in icmp.c
 */
extern __sum16 ip_compute_csum(const void *buff, int len);

#endif /* _METAG_CHECKSUM_H */
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit arithmetic shift left routine.
!

	.text
	.global ___ashldi3
	.type   ___ashldi3,function

! Register usage (as evidenced below): the 64-bit operand arrives with
! its high word in D1Ar1 and low word in D0Ar2; the shift count is in
! D1Ar3.  The result is returned in D1Re0 (high) / D0Re0 (low).
___ashldi3:
	MOV     D0Re0,D0Ar2
	MOV     D1Re0,D1Ar1
	CMP     D1Ar3,#0		! COUNT == 0
	MOVEQ   PC,D1RtP		! Yes, return

	SUBS    D0Ar4,D1Ar3,#32		! N = COUNT - 32
	BGE     $L10

!! Shift < 32
	NEG     D0Ar4,D0Ar4		! N = - N
	LSL     D1Re0,D1Re0,D1Ar3	! HI = HI << COUNT
	LSR     D0Ar6,D0Re0,D0Ar4	! TMP= LO >> -(COUNT - 32)
	OR      D1Re0,D1Re0,D0Ar6	! HI = HI | TMP
	! Swap COUNT into the D0 bank so it can shift D0Re0
	! (presumably a same-data-unit operand requirement - see the
	! matching SWAP in ashrdi3/lshrdi3)
	SWAP    D0Ar4,D1Ar3
	LSL     D0Re0,D0Re0,D0Ar4	! LO = LO << COUNT
	MOV     PC,D1RtP

$L10:
!! Shift >= 32: low word supplies the whole high result
	LSL     D1Re0,D0Re0,D0Ar4	! HI = LO << N
	MOV     D0Re0,#0		! LO = 0
	MOV     PC,D1RtP
	.size ___ashldi3,.-___ashldi3
+33
arch/metag/lib/ashrdi3.S
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit arithmetic shift right routine.
!

	.text
	.global ___ashrdi3
	.type   ___ashrdi3,function

! Register usage (as evidenced below): 64-bit operand in D1Ar1 (high) /
! D0Ar2 (low), shift count in D1Ar3; result in D1Re0 (high) / D0Re0 (low).
___ashrdi3:
	MOV     D0Re0,D0Ar2
	MOV     D1Re0,D1Ar1
	CMP     D1Ar3,#0		! COUNT == 0
	MOVEQ   PC,D1RtP		! Yes, return

	MOV     D0Ar4,D1Ar3		! Copy of COUNT for the D0 bank
	SUBS    D1Ar3,D1Ar3,#32		! N = COUNT - 32
	BGE     $L20

!! Shift < 32
	NEG     D1Ar3,D1Ar3		! N = - N
	LSR     D0Re0,D0Re0,D0Ar4	! LO = LO >> COUNT
	LSL     D0Ar6,D1Re0,D1Ar3	! TMP= HI << -(COUNT - 32)
	OR      D0Re0,D0Re0,D0Ar6	! LO = LO | TMP
	SWAP    D1Ar3,D0Ar4		! Move COUNT back to the D1 bank
	ASR     D1Re0,D1Re0,D1Ar3	! HI = HI >> COUNT (arithmetic)
	MOV     PC,D1RtP
$L20:
!! Shift >= 32: high word supplies the low result; high result is the
!! sign extension of the original high word
	ASR     D0Re0,D1Re0,D1Ar3	! LO = HI >> N
	ASR     D1Re0,D1Re0,#31		! HI = HI >> 31
	MOV     PC,D1RtP
	.size ___ashrdi3,.-___ashrdi3
+168
arch/metag/lib/checksum.c
/*
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Tom May, <ftom@netcom.com>
 *		Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de>
 *		Lots of code moved from tcp.c and ip.c; see those files
 *		for more names.
 *
 * 03/02/96	Jes Sorensen, Andreas Schwab, Roman Hodek:
 *		Fixed some nasty bugs, causing some horrible crashes.
 *		A: At some points, the sum (%0) was used as
 *		length-counter instead of the length counter
 *		(%1). Thanks to Roman Hodek for pointing this out.
 *		B: GCC seems to mess up if one uses too many
 *		data-registers to hold input values and one tries to
 *		specify d0 and d1 as scratch registers. Letting gcc
 *		choose these registers itself solves the problem.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

/* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access
   kills, so most of the assembly has to go. */

#include <linux/module.h>
#include <net/checksum.h>

#include <asm/byteorder.h>

/* Fold a 32-bit partial sum down to 16 bits; the second add absorbs
   any carry produced by the first. */
static inline unsigned short from32to16(unsigned int x)
{
	/* add up 16-bit and 16-bit for 16+c bit */
	x = (x & 0xffff) + (x >> 16);
	/* add up carry.. */
	x = (x & 0xffff) + (x >> 16);
	return x;
}

/*
 * Core ones-complement sum over an arbitrarily aligned buffer.
 *
 * Returns the folded 16-bit sum in a 32-bit value.  If buff starts on
 * an odd address, the bytes of the result are swapped at the end so
 * the value is as if summation had started on an even boundary.
 */
static unsigned int do_csum(const unsigned char *buff, int len)
{
	int odd;
	unsigned int result = 0;

	if (len <= 0)
		goto out;
	odd = 1 & (unsigned long) buff;
	if (odd) {
		/* Leading byte at an odd address: account for the byte
		   lane it occupies within a 16-bit word. */
#ifdef __LITTLE_ENDIAN
		result += (*buff << 8);
#else
		result = *buff;
#endif
		len--;
		buff++;
	}
	if (len >= 2) {
		/* One 16-bit add brings us to 32-bit alignment. */
		if (2 & (unsigned long) buff) {
			result += *(unsigned short *) buff;
			len -= 2;
			buff += 2;
		}
		if (len >= 4) {
			const unsigned char *end = buff + ((unsigned)len & ~3);
			unsigned int carry = 0;
			/* 32-bit word loop.  A carry-out of the add shows
			   up as unsigned wrap-around (w > result) and is
			   re-added on the following iteration. */
			do {
				unsigned int w = *(unsigned int *) buff;
				buff += 4;
				result += carry;
				result += w;
				carry = (w > result);
			} while (buff < end);
			result += carry;
			result = (result & 0xffff) + (result >> 16);
		}
		if (len & 2) {
			result += *(unsigned short *) buff;
			buff += 2;
		}
	}
	/* Trailing odd byte, mirrored with the leading-byte handling. */
	if (len & 1)
#ifdef __LITTLE_ENDIAN
		result += *buff;
#else
		result += (*buff << 8);
#endif
	result = from32to16(result);
	/* Undo the byte rotation caused by the odd start address. */
	if (odd)
		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
out:
	return result;
}
/* ip_fast_csum itself is implemented in assembly (ip_fast_csum.S);
   it is only exported from here. */
EXPORT_SYMBOL(ip_fast_csum);

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * returns a 32-bit number suitable for feeding into itself
 * or csum_tcpudp_magic
 *
 * this function must be called with even lengths, except
 * for the last fragment, which may be odd
 *
 * it's best to have buff aligned on a 32-bit boundary
 */
__wsum csum_partial(const void *buff, int len, __wsum wsum)
{
	unsigned int sum = (__force unsigned int)wsum;
	unsigned int result = do_csum(buff, len);

	/* add in old sum, and carry.. */
	result += sum;
	if (sum > result)
		result += 1;
	return (__force __wsum)result;
}
EXPORT_SYMBOL(csum_partial);

/*
 * this routine is used for miscellaneous IP-like checksums, mainly
 * in icmp.c
 */
__sum16 ip_compute_csum(const void *buff, int len)
{
	return (__force __sum16)~do_csum(buff, len);
}
EXPORT_SYMBOL(ip_compute_csum);

/*
 * copy from user space while checksumming, otherwise like csum_partial
 *
 * If part of the copy faults, the uncopied tail of dst is zero-filled
 * and *csum_err is set to -EFAULT; the checksum is still computed over
 * the full (partially zeroed) destination buffer.
 */
__wsum
csum_partial_copy_from_user(const void __user *src, void *dst, int len,
			    __wsum sum, int *csum_err)
{
	int missing;

	missing = __copy_from_user(dst, src, len);
	if (missing) {
		memset(dst + len - missing, 0, missing);
		*csum_err = -EFAULT;
	} else
		*csum_err = 0;

	return csum_partial(dst, len, sum);
}
EXPORT_SYMBOL(csum_partial_copy_from_user);

/*
 * copy from kernel space while checksumming, otherwise like csum_partial
 */
__wsum
csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
{
	memcpy(dst, src, len);
	return csum_partial(dst, len, sum);
}
EXPORT_SYMBOL(csum_partial_copy);
/*
 * Precise Delay Loops for Meta
 *
 * Copyright (C) 1993 Linus Torvalds
 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
 * Copyright (C) 2007,2009 Imagination Technologies Ltd.
 *
 */

#include <linux/export.h>
#include <linux/sched.h>
#include <linux/delay.h>

#include <asm/core_reg.h>
#include <asm/processor.h>

/*
 * TXTACTCYC is only 24 bits, so on chips with fast clocks it will wrap
 * many times per-second. If it does wrap __delay will return prematurely,
 * but this is only likely with large delay values.
 *
 * We also can't implement read_current_timer() with TXTACTCYC due to
 * this wrapping behaviour.
 */
#define rdtimer(t) t = __core_reg_get(TXTACTCYC)

/* Busy-wait until 'loops' TXTACTCYC ticks have elapsed. */
void __delay(unsigned long loops)
{
	unsigned long bclock, now;

	rdtimer(bclock);
	do {
		asm("NOP");
		rdtimer(now);
		/* Unsigned subtraction tolerates a single counter wrap;
		   multiple wraps cut the delay short (see note above). */
	} while ((now-bclock) < loops);
}
EXPORT_SYMBOL(__delay);

/*
 * Delay for 'xloops' units of 2^-32 jiffy-loops each: the 64-bit
 * multiply keeps full precision and the >> 32 converts the product
 * back into __delay() loop counts.
 */
inline void __const_udelay(unsigned long xloops)
{
	u64 loops = (u64)xloops * (u64)loops_per_jiffy * HZ;
	__delay(loops >> 32);
}
EXPORT_SYMBOL(__const_udelay);

void __udelay(unsigned long usecs)
{
	__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
	__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);
! Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
!               Imagination Technologies Ltd
!
! Integer divide routines.
!

	.text
	.global ___udivsi3
	.type   ___udivsi3,function
	.align  2
___udivsi3:
!!
!! Since core is signed divide case, just set control variable
!!
	MOV     D1Re0,D0Ar2		! Au already in D1Ar1, Bu -> D1Re0
	MOV     D0Re0,#0		! Result is 0
	MOV     D0Ar4,#0		! Return positive result
	B       $LIDMCUStart
	.size   ___udivsi3,.-___udivsi3

!!
!! 32-bit division signed i/p - passed signed 32-bit numbers
!!
	.global ___divsi3
	.type   ___divsi3,function
	.align  2
___divsi3:
!!
!! A already in D1Ar1, B already in D0Ar2 -> make B abs(B)
!!
	MOV     D1Re0,D0Ar2		! A already in D1Ar1, B -> D1Re0
	MOV     D0Re0,#0		! Result is 0
	XOR     D0Ar4,D1Ar1,D1Re0	! D0Ar4 -ive if result is -ive
	ABS     D1Ar1,D1Ar1		! abs(A) -> Au
	ABS     D1Re0,D1Re0		! abs(B) -> Bu
$LIDMCUStart:
	CMP     D1Ar1,D1Re0		! Is ( Au > Bu )?
	LSR     D1Ar3,D1Ar1,#2		! Calculate (Au & (~3)) >> 2
	CMPHI   D1Re0,D1Ar3		! OR ( (Au & (~3)) <= (Bu << 2) )?
	LSLSHI  D1Ar3,D1Re0,#1		! Buq = Bu << 1
	BLS     $LIDMCUSetup		! Yes: Do normal divide
!!
!! Quick divide setup can assume that CurBit only needs to start at 2
!!
$LIDMCQuick:
	CMP     D1Ar1,D1Ar3		! ( A >= Buq )?
	ADDCC   D0Re0,D0Re0,#2		! If yes result += 2
	SUBCC   D1Ar1,D1Ar1,D1Ar3	! and A -= Buq
	CMP     D1Ar1,D1Re0		! ( A >= Bu )?
	ADDCC   D0Re0,D0Re0,#1		! If yes result += 1
	SUBCC   D1Ar1,D1Ar1,D1Re0	! and A -= Bu
	ORS     D0Ar4,D0Ar4,D0Ar4	! Return neg result?
	NEG     D0Ar2,D0Re0		! Calculate neg result
	MOVMI   D0Re0,D0Ar2		! Yes: Take neg result
$LIDMCRet:
	MOV     PC,D1RtP
!!
!! Setup for general unsigned divide code
!!
!! D0Re0 is used to form the result, already set to Zero
!! D1Re0 is the input Bu value, this gets trashed
!! D0Ar6 is curbit which is set to 1 at the start and shifted up
!! D0Ar4 is negative if we should return a negative result
!! D1Ar1 is the input Au value, eventually this holds the remainder
!!
$LIDMCUSetup:
	CMP     D1Ar1,D1Re0		! Is ( Au < Bu )?
	MOV     D0Ar6,#1		! Set curbit to 1
	BCS     $LIDMCRet		! Yes: Return 0 remainder Au
!!
!! Calculate alignment using FFB instruction
!!
	FFB     D1Ar5,D1Ar1		! Find first bit of Au
	ANDN    D1Ar5,D1Ar5,#31		! Handle exceptional case.
	ORN     D1Ar5,D1Ar5,#31		! if N bit set, set to 31
	FFB     D1Ar3,D1Re0		! Find first bit of Bu
	ANDN    D1Ar3,D1Ar3,#31		! Handle exceptional case.
	ORN     D1Ar3,D1Ar3,#31		! if N bit set, set to 31
	SUBS    D1Ar3,D1Ar5,D1Ar3	! calculate diff, ffbA - ffbB
	MOV     D0Ar2,D1Ar3		! copy into bank 0
	LSLGT   D1Re0,D1Re0,D1Ar3	! ( > 0) ? left shift B
	LSLGT   D0Ar6,D0Ar6,D0Ar2	! ( > 0) ? left shift curbit
!!
!! Now we start the divide proper, logic is
!!
!!       if ( A >= B ) add curbit to result and subtract B from A
!!       shift curbit and B down by 1 in either case
!!
$LIDMCLoop:
	CMP     D1Ar1, D1Re0		! ( A >= B )?
	ADDCC   D0Re0, D0Re0, D0Ar6	! If yes result += curbit
	SUBCC   D1Ar1, D1Ar1, D1Re0	! and A -= B
	LSRS    D0Ar6, D0Ar6, #1	! Shift down curbit, is it zero?
	LSR     D1Re0, D1Re0, #1	! Shift down B
	BNZ     $LIDMCLoop		! Was single bit in curbit lost?
	ORS     D0Ar4,D0Ar4,D0Ar4	! Return neg result?
	NEG     D0Ar2,D0Re0		! Calculate neg result
	MOVMI   D0Re0,D0Ar2		! Yes: Take neg result
	MOV     PC,D1RtP
	.size   ___divsi3,.-___divsi3
+32
arch/metag/lib/ip_fast_csum.S

	.text
/*
 * This is a version of ip_compute_csum() optimized for IP headers,
 * which always checksum on 4 octet boundaries.
 *
 * extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
 *
 * D1Ar1 = iph (pointer to the header), D0Ar2 = ihl (header length in
 * 32-bit words) - per the argument usage below.
 */
	.global _ip_fast_csum
	.type   _ip_fast_csum,function
_ip_fast_csum:
	!! TXRPT needs loops - 1
	SUBS    TXRPT,D0Ar2,#1
	MOV     D0Re0,#0
	BLO     $Lfast_csum_exit	! ihl == 0: return 0
$Lfast_csum_loop:
	! Sum the header words with end-around carry: ADDCS folds the
	! carry-out of each ADDS straight back into the accumulator.
	GETD    D1Ar3,[D1Ar1++]
	ADDS    D0Re0,D0Re0,D1Ar3
	ADDCS   D0Re0,D0Re0,#1
	BR      $Lfast_csum_loop	! hardware (TXRPT) loop branch
	! Fold 32 -> 16 bits (twice, to absorb the carry), then
	! complement and mask to produce the final 16-bit checksum.
	LSR     D0Ar4,D0Re0,#16
	AND     D0Re0,D0Re0,#0xffff
	AND     D0Ar4,D0Ar4,#0xffff
	ADD     D0Re0,D0Re0,D0Ar4
	LSR     D0Ar4,D0Re0,#16
	ADD     D0Re0,D0Re0,D0Ar4
	XOR     D0Re0,D0Re0,#-1
	AND     D0Re0,D0Re0,#0xffff
$Lfast_csum_exit:
	MOV     PC,D1RtP
	.size _ip_fast_csum,.-_ip_fast_csum
+33
arch/metag/lib/lshrdi3.S
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit logical shift right routine.
!

	.text
	.global ___lshrdi3
	.type   ___lshrdi3,function

! Register usage (as evidenced below): 64-bit operand in D1Ar1 (high) /
! D0Ar2 (low), shift count in D1Ar3; result in D1Re0 (high) / D0Re0 (low).
! Identical to ___ashrdi3 except the high word is zero-filled, not
! sign-extended.
___lshrdi3:
	MOV     D0Re0,D0Ar2
	MOV     D1Re0,D1Ar1
	CMP     D1Ar3,#0		! COUNT == 0
	MOVEQ   PC,D1RtP		! Yes, return

	MOV     D0Ar4,D1Ar3		! Copy of COUNT for the D0 bank
	SUBS    D1Ar3,D1Ar3,#32		! N = COUNT - 32
	BGE     $L30

!! Shift < 32
	NEG     D1Ar3,D1Ar3		! N = - N
	LSR     D0Re0,D0Re0,D0Ar4	! LO = LO >> COUNT
	LSL     D0Ar6,D1Re0,D1Ar3	! TMP= HI << -(COUNT - 32)
	OR      D0Re0,D0Re0,D0Ar6	! LO = LO | TMP
	SWAP    D1Ar3,D0Ar4		! Move COUNT back to the D1 bank
	LSR     D1Re0,D1Re0,D1Ar3	! HI = HI >> COUNT
	MOV     PC,D1RtP
$L30:
!! Shift >= 32: high word supplies the low result; high result is zero
	LSR     D0Re0,D1Re0,D1Ar3	! LO = HI >> N
	MOV     D1Re0,#0		! HI = 0
	MOV     PC,D1RtP
	.size ___lshrdi3,.-___lshrdi3
+185
arch/metag/lib/memcpy.S
! Copyright (C) 2008-2012 Imagination Technologies Ltd.

	.text
	.global _memcpy
	.type   _memcpy,function
! D1Ar1 dst
! D0Ar2 src
! D1Ar3 cnt
! D0Re0 dst
_memcpy:
	CMP     D1Ar3, #16
	MOV     A1.2, D0Ar2		! source pointer
	MOV     A0.2, D1Ar1		! destination pointer
	MOV     A0.3, D1Ar1		! for return value
! If there are less than 16 bytes to copy use the byte copy loop
	BGE     $Llong_copy

$Lbyte_copy:
! Simply copy a byte at a time
	SUBS    TXRPT, D1Ar3, #1	! hardware loop count = cnt - 1
	BLT     $Lend
$Lloop_byte:
	GETB    D1Re0, [A1.2++]
	SETB    [A0.2++], D1Re0
	BR      $Lloop_byte		! TXRPT hardware loop branch

$Lend:
! Finally set return value and return
	MOV     D0Re0, A0.3
	MOV     PC, D1RtP

$Llong_copy:
	ANDS    D1Ar5, D1Ar1, #7	! test destination alignment
	BZ      $Laligned_dst

! The destination address is not 8 byte aligned. We will copy bytes from
! the source to the destination until the remaining data has an 8 byte
! destination address alignment (i.e we should never copy more than 7
! bytes here).
$Lalign_dst:
	GETB    D0Re0, [A1.2++]
	ADD     D1Ar5, D1Ar5, #1	! dest is aligned when D1Ar5 reaches #8
	SUB     D1Ar3, D1Ar3, #1	! decrement count of remaining bytes
	SETB    [A0.2++], D0Re0
	CMP     D1Ar5, #8
	BNE     $Lalign_dst

! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte
! blocks, then jump to the unaligned copy loop or fall through to the aligned
! copy loop as appropriate.
$Laligned_dst:
	MOV     D0Ar4, A1.2
	LSR     D1Ar5, D1Ar3, #3	! D1Ar5 = number of 8 byte blocks
	ANDS    D0Ar4, D0Ar4, #7	! test source alignment
	BNZ     $Lunaligned_copy	! if unaligned, use unaligned copy loop

! Both source and destination are 8 byte aligned - the easy case.
$Laligned_copy:
	LSRS    D1Ar5, D1Ar3, #5	! D1Ar5 = number of 32 byte blocks
	BZ      $Lbyte_copy
	SUB     TXRPT, D1Ar5, #1

$Laligned_32:
	GETL    D0Re0, D1Re0, [A1.2++]
	GETL    D0Ar6, D1Ar5, [A1.2++]
	SETL    [A0.2++], D0Re0, D1Re0
	SETL    [A0.2++], D0Ar6, D1Ar5
	GETL    D0Re0, D1Re0, [A1.2++]
	GETL    D0Ar6, D1Ar5, [A1.2++]
	SETL    [A0.2++], D0Re0, D1Re0
	SETL    [A0.2++], D0Ar6, D1Ar5
	BR      $Laligned_32		! hardware loop; falls through when done

! If there are any remaining bytes use the byte copy loop, otherwise we are done
	ANDS    D1Ar3, D1Ar3, #0x1f
	BNZ     $Lbyte_copy
	B       $Lend

! The destination is 8 byte aligned but the source is not, and there are 8
! or more bytes to be copied.
$Lunaligned_copy:
! Adjust the source pointer (A1.2) to the 8 byte boundary before its
! current value
	MOV     D0Ar4, A1.2
	MOV     D0Ar6, A1.2
	ANDMB   D0Ar4, D0Ar4, #0xfff8
	MOV     A1.2, D0Ar4
! Save the number of bytes of mis-alignment in D0Ar4 for use later
	SUBS    D0Ar6, D0Ar6, D0Ar4
	MOV     D0Ar4, D0Ar6
! if there is no mis-alignment after all, use the aligned copy loop
	BZ      $Laligned_copy

! prefetch 8 bytes
	GETL    D0Re0, D1Re0, [A1.2]

	SUB     TXRPT, D1Ar5, #1

! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly
! 4 bytes, and more than 4 bytes.
	CMP     D0Ar6, #4
	BLT     $Lunaligned_1_2_3	! use 1-3 byte mis-alignment loop
	BZ      $Lunaligned_4		! use 4 byte mis-alignment loop

! The mis-alignment is more than 4 bytes
$Lunaligned_5_6_7:
	SUB     D0Ar6, D0Ar6, #4
! Calculate the bit offsets required for the shift operations necesssary
! to align the data.
! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
	MULW    D0Ar6, D0Ar6, #8
	MOV     D1Ar5, #32
	SUB     D1Ar5, D1Ar5, D0Ar6
! Move data 4 bytes before we enter the main loop
	MOV     D0Re0, D1Re0

$Lloop_5_6_7:
	GETL    D0Ar2, D1Ar1, [++A1.2]
! form 64-bit data in D0Re0, D1Re0
	LSR     D0Re0, D0Re0, D0Ar6
	MOV     D1Re0, D0Ar2
	LSL     D1Re0, D1Re0, D1Ar5
	ADD     D0Re0, D0Re0, D1Re0

	LSR     D0Ar2, D0Ar2, D0Ar6
	LSL     D1Re0, D1Ar1, D1Ar5
	ADD     D1Re0, D1Re0, D0Ar2

	SETL    [A0.2++], D0Re0, D1Re0
	MOV     D0Re0, D1Ar1
	BR      $Lloop_5_6_7		! hardware loop; falls through when done

	B       $Lunaligned_end

$Lunaligned_1_2_3:
! Calculate the bit offsets required for the shift operations necesssary
! to align the data.
! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
	MULW    D0Ar6, D0Ar6, #8
	MOV     D1Ar5, #32
	SUB     D1Ar5, D1Ar5, D0Ar6

$Lloop_1_2_3:
! form 64-bit data in D0Re0,D1Re0
	LSR     D0Re0, D0Re0, D0Ar6
	LSL     D1Ar1, D1Re0, D1Ar5
	ADD     D0Re0, D0Re0, D1Ar1
	MOV     D0Ar2, D1Re0
	LSR     D0FrT, D0Ar2, D0Ar6
	GETL    D0Ar2, D1Ar1, [++A1.2]

	MOV     D1Re0, D0Ar2
	LSL     D1Re0, D1Re0, D1Ar5
	ADD     D1Re0, D1Re0, D0FrT

	SETL    [A0.2++], D0Re0, D1Re0
	MOV     D0Re0, D0Ar2
	MOV     D1Re0, D1Ar1
	BR      $Lloop_1_2_3		! hardware loop; falls through when done

	B       $Lunaligned_end

! The 4 byte mis-alignment case - this does not require any shifting, just a
! shuffling of registers.
$Lunaligned_4:
	MOV     D0Re0, D1Re0
$Lloop_4:
	GETL    D0Ar2, D1Ar1, [++A1.2]
	MOV     D1Re0, D0Ar2
	SETL    [A0.2++], D0Re0, D1Re0
	MOV     D0Re0, D1Ar1
	BR      $Lloop_4		! hardware loop; falls through when done

$Lunaligned_end:
! If there are no remaining bytes to copy, we are done.
	ANDS    D1Ar3, D1Ar3, #7
	BZ      $Lend
! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte
! address of the remaining bytes, and fall through to the byte copy loop.
	MOV     D0Ar6, A1.2
	ADD     D1Ar5, D0Ar4, D0Ar6
	MOV     A1.2, D1Ar5
	B       $Lbyte_copy

	.size _memcpy,.-_memcpy
! Copyright (C) 2008-2012 Imagination Technologies Ltd.

	.text
	.global _memset
	.type   _memset,function
! D1Ar1 dst
! D0Ar2 c
! D1Ar3 cnt
! D0Re0 dst
_memset:
	AND     D0Ar2,D0Ar2,#0xFF	! Ensure a byte input value
	MULW    D0Ar2,D0Ar2,#0x0101	! Duplicate byte value into  0-15
	ANDS    D0Ar4,D1Ar1,#7		! Extract bottom LSBs of dst
	LSL     D0Re0,D0Ar2,#16		! Duplicate byte value into 16-31
	ADD     A0.2,D0Ar2,D0Re0	! Duplicate byte value into 4 (A0.2)
	MOV     D0Re0,D1Ar1		! Return dst
	BZ      $LLongStub		! if start address is aligned
	! start address is not aligned on an 8 byte boundary, so we
	! need the number of bytes up to the next 8 byte address
	! boundary, or the length of the string if less than 8, in D1Ar5
	MOV     D0Ar2,#8		! Need 8 - N in D1Ar5 ...
	SUB     D1Ar5,D0Ar2,D0Ar4	!            ... subtract N
	CMP     D1Ar3,D1Ar5
	MOVMI   D1Ar5,D1Ar3
	B       $LByteStub		! dst is mis-aligned, do $LByteStub

!
! Preamble to LongLoop which generates 4*8 bytes per iteration (5 cycles)
!
$LLongStub:
	LSRS    D0Ar2,D1Ar3,#5		! Number of 32 byte blocks
	AND     D1Ar3,D1Ar3,#0x1F	! Remainder after 32 byte blocks
	MOV     A1.2,A0.2
	BEQ     $LLongishStub
	SUB     TXRPT,D0Ar2,#1		! Hardware loop count = blocks - 1
	CMP     D1Ar3,#0		! Z set here survives the loop below
$LLongLoop:
	SETL    [D1Ar1++],A0.2,A1.2
	SETL    [D1Ar1++],A0.2,A1.2
	SETL    [D1Ar1++],A0.2,A1.2
	SETL    [D1Ar1++],A0.2,A1.2
	BR      $LLongLoop		! TXRPT hardware loop branch
	BZ      $Lexit			! Done if no remainder
!
! Preamble to LongishLoop which generates 1*8 bytes per iteration (2 cycles)
!
$LLongishStub:
	LSRS    D0Ar2,D1Ar3,#3		! Number of 8 byte blocks
	AND     D1Ar3,D1Ar3,#0x7	! Remainder after 8 byte blocks
	MOV     D1Ar5,D1Ar3
	BEQ     $LByteStub
	SUB     TXRPT,D0Ar2,#1
	CMP     D1Ar3,#0
$LLongishLoop:
	SETL    [D1Ar1++],A0.2,A1.2
	BR      $LLongishLoop		! TXRPT hardware loop branch
	BZ      $Lexit			! Done if no remainder
!
! This does a byte structured burst of up to 7 bytes
!
!	D1Ar1 should point to the location required
!	D1Ar3 should be the remaining total byte count
!	D1Ar5 should be burst size (<= D1Ar3)
!
$LByteStub:
	SUBS    D1Ar3,D1Ar3,D1Ar5	! Reduce count
	ADD     D1Ar1,D1Ar1,D1Ar5	! Advance pointer to end of area
	MULW    D1Ar5,D1Ar5,#4		! Scale to (1*4), (2*4), (3*4)
	SUB     D1Ar5,D1Ar5,#(8*4)	! Rebase to -(7*4), -(6*4), -(5*4), ...
	MOV     A1.2,D1Ar5
	! Computed branch into the SETB table: each table entry is
	! (presumably) 4 bytes, hence the *4 scaling above.
	SUB     PC,CPC1,A1.2		! Jump into table below
	SETB    [D1Ar1+#(-7)],A0.2
	SETB    [D1Ar1+#(-6)],A0.2
	SETB    [D1Ar1+#(-5)],A0.2
	SETB    [D1Ar1+#(-4)],A0.2
	SETB    [D1Ar1+#(-3)],A0.2
	SETB    [D1Ar1+#(-2)],A0.2
	SETB    [D1Ar1+#(-1)],A0.2
!
! Return if all data has been output, otherwise do $LLongStub
!
	BNZ     $LLongStub		! Z flag from the SUBS in $LByteStub
$Lexit:
	MOV     PC,D1RtP
	.size _memset,.-_memset
