#ifndef _METAG_CHECKSUM_H
#define _METAG_CHECKSUM_H

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * returns a 32-bit number suitable for feeding into itself
 * or csum_tcpudp_magic
 *
 * this function must be called with even lengths, except
 * for the last fragment, which may be odd
 *
 * it's best to have buff aligned on a 32-bit boundary
 */
extern __wsum csum_partial(const void *buff, int len, __wsum sum);

/*
 * the same as csum_partial, but copies from src while it
 * checksums
 *
 * here even more important to align src and dst on a 32-bit (or even
 * better 64-bit) boundary
 */
extern __wsum csum_partial_copy(const void *src, void *dst, int len,
				__wsum sum);

/*
 * the same as csum_partial_copy, but copies from user space.
 *
 * here even more important to align src and dst on a 32-bit (or even
 * better 64-bit) boundary
 *
 * On a fault *csum_err is set to -EFAULT (see the implementation in
 * lib/checksum.c).
 */
extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
					  int len, __wsum sum, int *csum_err);

#define csum_partial_copy_nocheck(src, dst, len, sum)	\
	csum_partial_copy((src), (dst), (len), (sum))

/*
 * Fold a partial checksum
 *
 * Adds the high 16 bits into the low 16 bits twice (the second add
 * absorbs any carry produced by the first), then complements, giving
 * a 16-bit ones-complement checksum.
 */
static inline __sum16 csum_fold(__wsum csum)
{
	u32 sum = (__force u32)csum;
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (__force __sum16)~sum;
}

/*
 * This is a version of ip_compute_csum() optimized for IP headers,
 * which always checksum on 4 octet boundaries.
 */
extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);

/*
 * computes the 32-bit partial checksum of the TCP/UDP pseudo-header;
 * the result is NOT folded or complemented - feed it to csum_fold()
 * (csum_tcpudp_magic below does exactly that)
 */
static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
					unsigned short len,
					unsigned short proto,
					__wsum sum)
{
	/*
	 * Pack protocol and length into a single word; the << 8 puts
	 * them into the byte lanes of the pseudo-header.
	 * NOTE(review): assumes the byte packing matches what the
	 * network stack expects on this (little-endian) target -
	 * confirm against other arches' csum_tcpudp_nofold.
	 */
	unsigned long len_proto = (proto + len) << 8;
	/*
	 * Ones-complement accumulate: each ADDS/ADDCS pair adds a term
	 * and folds the carry-out straight back in (end-around carry).
	 */
	asm ("ADD    %0, %0, %1\n"
	     "ADDS   %0, %0, %2\n"
	     "ADDCS  %0, %0, #1\n"
	     "ADDS   %0, %0, %3\n"
	     "ADDCS  %0, %0, #1\n"
	     : "=d" (sum)
	     : "d" (daddr), "d" (saddr), "d" (len_proto),
	       "0" (sum)
	     : "cc");
	return sum;
}

/*
 * computes the checksum of the TCP/UDP pseudo-header
 * returns a 16-bit checksum, already complemented
 */
static inline __sum16
csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len,
		  unsigned short proto, __wsum sum)
{
	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}

/*
 * this routine is used for miscellaneous IP-like checksums, mainly
 * in icmp.c
 */
extern __sum16 ip_compute_csum(const void *buff, int len);

#endif /* _METAG_CHECKSUM_H */
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit arithmetic shift left routine.
!

	.text
	.global ___ashldi3
	.type   ___ashldi3,function

! Register usage (as evidenced below): the 64-bit operand arrives with
! its high word in D1Ar1 and low word in D0Ar2; the shift count is in
! D1Ar3.  The result is returned in D1Re0 (high) / D0Re0 (low).
___ashldi3:
	MOV     D0Re0,D0Ar2
	MOV     D1Re0,D1Ar1
	CMP     D1Ar3,#0		! COUNT == 0
	MOVEQ   PC,D1RtP		! Yes, return

	SUBS    D0Ar4,D1Ar3,#32		! N = COUNT - 32
	BGE     $L10

!! Shift < 32
	NEG     D0Ar4,D0Ar4		! N = - N
	LSL     D1Re0,D1Re0,D1Ar3	! HI = HI << COUNT
	LSR     D0Ar6,D0Re0,D0Ar4	! TMP= LO >> -(COUNT - 32)
	OR      D1Re0,D1Re0,D0Ar6	! HI = HI | TMP
	! Swap COUNT into the D0 bank so it can shift D0Re0
	! (presumably a same-data-unit operand requirement - see the
	! matching SWAP in ashrdi3/lshrdi3)
	SWAP    D0Ar4,D1Ar3
	LSL     D0Re0,D0Re0,D0Ar4	! LO = LO << COUNT
	MOV     PC,D1RtP

$L10:
!! Shift >= 32: low word supplies the whole high result
	LSL     D1Re0,D0Re0,D0Ar4	! HI = LO << N
	MOV     D0Re0,#0		! LO = 0
	MOV     PC,D1RtP
	.size ___ashldi3,.-___ashldi3
+33
arch/metag/lib/ashrdi3.S
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit arithmetic shift right routine.
!

	.text
	.global ___ashrdi3
	.type   ___ashrdi3,function

! Register usage (as evidenced below): 64-bit operand in D1Ar1 (high) /
! D0Ar2 (low), shift count in D1Ar3; result in D1Re0 (high) / D0Re0 (low).
___ashrdi3:
	MOV     D0Re0,D0Ar2
	MOV     D1Re0,D1Ar1
	CMP     D1Ar3,#0		! COUNT == 0
	MOVEQ   PC,D1RtP		! Yes, return

	MOV     D0Ar4,D1Ar3		! Copy of COUNT for the D0 bank
	SUBS    D1Ar3,D1Ar3,#32		! N = COUNT - 32
	BGE     $L20

!! Shift < 32
	NEG     D1Ar3,D1Ar3		! N = - N
	LSR     D0Re0,D0Re0,D0Ar4	! LO = LO >> COUNT
	LSL     D0Ar6,D1Re0,D1Ar3	! TMP= HI << -(COUNT - 32)
	OR      D0Re0,D0Re0,D0Ar6	! LO = LO | TMP
	SWAP    D1Ar3,D0Ar4		! Move COUNT back to the D1 bank
	ASR     D1Re0,D1Re0,D1Ar3	! HI = HI >> COUNT (arithmetic)
	MOV     PC,D1RtP
$L20:
!! Shift >= 32: high word supplies the low result; high result is the
!! sign extension of the original high word
	ASR     D0Re0,D1Re0,D1Ar3	! LO = HI >> N
	ASR     D1Re0,D1Re0,#31		! HI = HI >> 31
	MOV     PC,D1RtP
	.size ___ashrdi3,.-___ashrdi3
+168
arch/metag/lib/checksum.c
/*
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Tom May, <ftom@netcom.com>
 *		Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de>
 *		Lots of code moved from tcp.c and ip.c; see those files
 *		for more names.
 *
 * 03/02/96	Jes Sorensen, Andreas Schwab, Roman Hodek:
 *		Fixed some nasty bugs, causing some horrible crashes.
 *		A: At some points, the sum (%0) was used as
 *		length-counter instead of the length counter
 *		(%1). Thanks to Roman Hodek for pointing this out.
 *		B: GCC seems to mess up if one uses too many
 *		data-registers to hold input values and one tries to
 *		specify d0 and d1 as scratch registers. Letting gcc
 *		choose these registers itself solves the problem.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

/* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access
   kills, so most of the assembly has to go. */

#include <linux/module.h>
#include <net/checksum.h>

#include <asm/byteorder.h>

/* Fold a 32-bit partial sum down to 16 bits; the second add absorbs
   any carry produced by the first. */
static inline unsigned short from32to16(unsigned int x)
{
	/* add up 16-bit and 16-bit for 16+c bit */
	x = (x & 0xffff) + (x >> 16);
	/* add up carry.. */
	x = (x & 0xffff) + (x >> 16);
	return x;
}

/*
 * Core ones-complement sum over an arbitrarily aligned buffer.
 *
 * Returns the folded 16-bit sum in a 32-bit value.  If buff starts on
 * an odd address, the bytes of the result are swapped at the end so
 * the value is as if summation had started on an even boundary.
 */
static unsigned int do_csum(const unsigned char *buff, int len)
{
	int odd;
	unsigned int result = 0;

	if (len <= 0)
		goto out;
	odd = 1 & (unsigned long) buff;
	if (odd) {
		/* Leading byte at an odd address: account for the byte
		   lane it occupies within a 16-bit word. */
#ifdef __LITTLE_ENDIAN
		result += (*buff << 8);
#else
		result = *buff;
#endif
		len--;
		buff++;
	}
	if (len >= 2) {
		/* One 16-bit add brings us to 32-bit alignment. */
		if (2 & (unsigned long) buff) {
			result += *(unsigned short *) buff;
			len -= 2;
			buff += 2;
		}
		if (len >= 4) {
			const unsigned char *end = buff + ((unsigned)len & ~3);
			unsigned int carry = 0;
			/* 32-bit word loop.  A carry-out of the add shows
			   up as unsigned wrap-around (w > result) and is
			   re-added on the following iteration. */
			do {
				unsigned int w = *(unsigned int *) buff;
				buff += 4;
				result += carry;
				result += w;
				carry = (w > result);
			} while (buff < end);
			result += carry;
			result = (result & 0xffff) + (result >> 16);
		}
		if (len & 2) {
			result += *(unsigned short *) buff;
			buff += 2;
		}
	}
	/* Trailing odd byte, mirrored with the leading-byte handling. */
	if (len & 1)
#ifdef __LITTLE_ENDIAN
		result += *buff;
#else
		result += (*buff << 8);
#endif
	result = from32to16(result);
	/* Undo the byte rotation caused by the odd start address. */
	if (odd)
		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
out:
	return result;
}
/* ip_fast_csum itself is implemented in assembly (ip_fast_csum.S);
   it is only exported from here. */
EXPORT_SYMBOL(ip_fast_csum);

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * returns a 32-bit number suitable for feeding into itself
 * or csum_tcpudp_magic
 *
 * this function must be called with even lengths, except
 * for the last fragment, which may be odd
 *
 * it's best to have buff aligned on a 32-bit boundary
 */
__wsum csum_partial(const void *buff, int len, __wsum wsum)
{
	unsigned int sum = (__force unsigned int)wsum;
	unsigned int result = do_csum(buff, len);

	/* add in old sum, and carry.. */
	result += sum;
	if (sum > result)
		result += 1;
	return (__force __wsum)result;
}
EXPORT_SYMBOL(csum_partial);

/*
 * this routine is used for miscellaneous IP-like checksums, mainly
 * in icmp.c
 */
__sum16 ip_compute_csum(const void *buff, int len)
{
	return (__force __sum16)~do_csum(buff, len);
}
EXPORT_SYMBOL(ip_compute_csum);

/*
 * copy from user space while checksumming, otherwise like csum_partial
 *
 * If part of the copy faults, the uncopied tail of dst is zero-filled
 * and *csum_err is set to -EFAULT; the checksum is still computed over
 * the full (partially zeroed) destination buffer.
 */
__wsum
csum_partial_copy_from_user(const void __user *src, void *dst, int len,
			    __wsum sum, int *csum_err)
{
	int missing;

	missing = __copy_from_user(dst, src, len);
	if (missing) {
		memset(dst + len - missing, 0, missing);
		*csum_err = -EFAULT;
	} else
		*csum_err = 0;

	return csum_partial(dst, len, sum);
}
EXPORT_SYMBOL(csum_partial_copy_from_user);

/*
 * copy from kernel space while checksumming, otherwise like csum_partial
 */
__wsum
csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
{
	memcpy(dst, src, len);
	return csum_partial(dst, len, sum);
}
EXPORT_SYMBOL(csum_partial_copy);
/*
 * Precise Delay Loops for Meta
 *
 * Copyright (C) 1993 Linus Torvalds
 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
 * Copyright (C) 2007,2009 Imagination Technologies Ltd.
 *
 */

#include <linux/export.h>
#include <linux/sched.h>
#include <linux/delay.h>

#include <asm/core_reg.h>
#include <asm/processor.h>

/*
 * TXTACTCYC is only 24 bits, so on chips with fast clocks it will wrap
 * many times per-second. If it does wrap __delay will return prematurely,
 * but this is only likely with large delay values.
 *
 * We also can't implement read_current_timer() with TXTACTCYC due to
 * this wrapping behaviour.
 */
#define rdtimer(t) t = __core_reg_get(TXTACTCYC)

/* Busy-wait until 'loops' TXTACTCYC ticks have elapsed. */
void __delay(unsigned long loops)
{
	unsigned long bclock, now;

	rdtimer(bclock);
	do {
		asm("NOP");
		rdtimer(now);
		/* Unsigned subtraction tolerates a single counter wrap;
		   multiple wraps cut the delay short (see note above). */
	} while ((now-bclock) < loops);
}
EXPORT_SYMBOL(__delay);

/*
 * Delay for 'xloops' units of 2^-32 jiffy-loops each: the 64-bit
 * multiply keeps full precision and the >> 32 converts the product
 * back into __delay() loop counts.
 */
inline void __const_udelay(unsigned long xloops)
{
	u64 loops = (u64)xloops * (u64)loops_per_jiffy * HZ;
	__delay(loops >> 32);
}
EXPORT_SYMBOL(__const_udelay);

void __udelay(unsigned long usecs)
{
	__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
	__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);
! Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
!               Imagination Technologies Ltd
!
! Integer divide routines.
!

	.text
	.global ___udivsi3
	.type   ___udivsi3,function
	.align  2
___udivsi3:
!!
!! Since core is signed divide case, just set control variable
!!
	MOV     D1Re0,D0Ar2		! Au already in D1Ar1, Bu -> D1Re0
	MOV     D0Re0,#0		! Result is 0
	MOV     D0Ar4,#0		! Return positive result
	B       $LIDMCUStart
	.size   ___udivsi3,.-___udivsi3

!!
!! 32-bit division signed i/p - passed signed 32-bit numbers
!!
	.global ___divsi3
	.type   ___divsi3,function
	.align  2
___divsi3:
!!
!! A already in D1Ar1, B already in D0Ar2 -> make B abs(B)
!!
	MOV     D1Re0,D0Ar2		! A already in D1Ar1, B -> D1Re0
	MOV     D0Re0,#0		! Result is 0
	XOR     D0Ar4,D1Ar1,D1Re0	! D0Ar4 -ive if result is -ive
	ABS     D1Ar1,D1Ar1		! abs(A) -> Au
	ABS     D1Re0,D1Re0		! abs(B) -> Bu
$LIDMCUStart:
	CMP     D1Ar1,D1Re0		! Is ( Au > Bu )?
	LSR     D1Ar3,D1Ar1,#2		! Calculate (Au & (~3)) >> 2
	CMPHI   D1Re0,D1Ar3		! OR ( (Au & (~3)) <= (Bu << 2) )?
	LSLSHI  D1Ar3,D1Re0,#1		! Buq = Bu << 1
	BLS     $LIDMCUSetup		! Yes: Do normal divide
!!
!! Quick divide setup can assume that CurBit only needs to start at 2
!!
$LIDMCQuick:
	CMP     D1Ar1,D1Ar3		! ( A >= Buq )?
	ADDCC   D0Re0,D0Re0,#2		! If yes result += 2
	SUBCC   D1Ar1,D1Ar1,D1Ar3	! and A -= Buq
	CMP     D1Ar1,D1Re0		! ( A >= Bu )?
	ADDCC   D0Re0,D0Re0,#1		! If yes result += 1
	SUBCC   D1Ar1,D1Ar1,D1Re0	! and A -= Bu
	ORS     D0Ar4,D0Ar4,D0Ar4	! Return neg result?
	NEG     D0Ar2,D0Re0		! Calculate neg result
	MOVMI   D0Re0,D0Ar2		! Yes: Take neg result
$LIDMCRet:
	MOV     PC,D1RtP
!!
!! Setup for general unsigned divide code
!!
!! D0Re0 is used to form the result, already set to Zero
!! D1Re0 is the input Bu value, this gets trashed
!! D0Ar6 is curbit which is set to 1 at the start and shifted up
!! D0Ar4 is negative if we should return a negative result
!! D1Ar1 is the input Au value, eventually this holds the remainder
!!
$LIDMCUSetup:
	CMP     D1Ar1,D1Re0		! Is ( Au < Bu )?
	MOV     D0Ar6,#1		! Set curbit to 1
	BCS     $LIDMCRet		! Yes: Return 0 remainder Au
!!
!! Calculate alignment using FFB instruction
!!
	FFB     D1Ar5,D1Ar1		! Find first bit of Au
	ANDN    D1Ar5,D1Ar5,#31		! Handle exceptional case.
	ORN     D1Ar5,D1Ar5,#31		! if N bit set, set to 31
	FFB     D1Ar3,D1Re0		! Find first bit of Bu
	ANDN    D1Ar3,D1Ar3,#31		! Handle exceptional case.
	ORN     D1Ar3,D1Ar3,#31		! if N bit set, set to 31
	SUBS    D1Ar3,D1Ar5,D1Ar3	! calculate diff, ffbA - ffbB
	MOV     D0Ar2,D1Ar3		! copy into bank 0
	LSLGT   D1Re0,D1Re0,D1Ar3	! ( > 0) ? left shift B
	LSLGT   D0Ar6,D0Ar6,D0Ar2	! ( > 0) ? left shift curbit
!!
!! Now we start the divide proper, logic is
!!
!!       if ( A >= B ) add curbit to result and subtract B from A
!!       shift curbit and B down by 1 in either case
!!
$LIDMCLoop:
	CMP     D1Ar1, D1Re0		! ( A >= B )?
	ADDCC   D0Re0, D0Re0, D0Ar6	! If yes result += curbit
	SUBCC   D1Ar1, D1Ar1, D1Re0	! and A -= B
	LSRS    D0Ar6, D0Ar6, #1	! Shift down curbit, is it zero?
	LSR     D1Re0, D1Re0, #1	! Shift down B
	BNZ     $LIDMCLoop		! Was single bit in curbit lost?
	ORS     D0Ar4,D0Ar4,D0Ar4	! Return neg result?
	NEG     D0Ar2,D0Re0		! Calculate neg result
	MOVMI   D0Re0,D0Ar2		! Yes: Take neg result
	MOV     PC,D1RtP
	.size   ___divsi3,.-___divsi3
+32
arch/metag/lib/ip_fast_csum.S

	.text
/*
 * This is a version of ip_compute_csum() optimized for IP headers,
 * which always checksum on 4 octet boundaries.
 *
 * extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
 *
 * D1Ar1 = iph (pointer to the header), D0Ar2 = ihl (header length in
 * 32-bit words) - per the argument usage below.
 */
	.global _ip_fast_csum
	.type   _ip_fast_csum,function
_ip_fast_csum:
	!! TXRPT needs loops - 1
	SUBS    TXRPT,D0Ar2,#1
	MOV     D0Re0,#0
	BLO     $Lfast_csum_exit	! ihl == 0: return 0
$Lfast_csum_loop:
	! Sum the header words with end-around carry: ADDCS folds the
	! carry-out of each ADDS straight back into the accumulator.
	GETD    D1Ar3,[D1Ar1++]
	ADDS    D0Re0,D0Re0,D1Ar3
	ADDCS   D0Re0,D0Re0,#1
	BR      $Lfast_csum_loop	! hardware (TXRPT) loop branch
	! Fold 32 -> 16 bits (twice, to absorb the carry), then
	! complement and mask to produce the final 16-bit checksum.
	LSR     D0Ar4,D0Re0,#16
	AND     D0Re0,D0Re0,#0xffff
	AND     D0Ar4,D0Ar4,#0xffff
	ADD     D0Re0,D0Re0,D0Ar4
	LSR     D0Ar4,D0Re0,#16
	ADD     D0Re0,D0Re0,D0Ar4
	XOR     D0Re0,D0Re0,#-1
	AND     D0Re0,D0Re0,#0xffff
$Lfast_csum_exit:
	MOV     PC,D1RtP
	.size _ip_fast_csum,.-_ip_fast_csum
+33
arch/metag/lib/lshrdi3.S
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit logical shift right routine.
!

	.text
	.global ___lshrdi3
	.type   ___lshrdi3,function

! Register usage (as evidenced below): 64-bit operand in D1Ar1 (high) /
! D0Ar2 (low), shift count in D1Ar3; result in D1Re0 (high) / D0Re0 (low).
! Identical to ___ashrdi3 except the high word is zero-filled, not
! sign-extended.
___lshrdi3:
	MOV     D0Re0,D0Ar2
	MOV     D1Re0,D1Ar1
	CMP     D1Ar3,#0		! COUNT == 0
	MOVEQ   PC,D1RtP		! Yes, return

	MOV     D0Ar4,D1Ar3		! Copy of COUNT for the D0 bank
	SUBS    D1Ar3,D1Ar3,#32		! N = COUNT - 32
	BGE     $L30

!! Shift < 32
	NEG     D1Ar3,D1Ar3		! N = - N
	LSR     D0Re0,D0Re0,D0Ar4	! LO = LO >> COUNT
	LSL     D0Ar6,D1Re0,D1Ar3	! TMP= HI << -(COUNT - 32)
	OR      D0Re0,D0Re0,D0Ar6	! LO = LO | TMP
	SWAP    D1Ar3,D0Ar4		! Move COUNT back to the D1 bank
	LSR     D1Re0,D1Re0,D1Ar3	! HI = HI >> COUNT
	MOV     PC,D1RtP
$L30:
!! Shift >= 32: high word supplies the low result; high result is zero
	LSR     D0Re0,D1Re0,D1Ar3	! LO = HI >> N
	MOV     D1Re0,#0		! HI = 0
	MOV     PC,D1RtP
	.size ___lshrdi3,.-___lshrdi3
+185
arch/metag/lib/memcpy.S
! Copyright (C) 2008-2012 Imagination Technologies Ltd.

	.text
	.global _memcpy
	.type   _memcpy,function
! D1Ar1 dst
! D0Ar2 src
! D1Ar3 cnt
! D0Re0 dst
_memcpy:
	CMP     D1Ar3, #16
	MOV     A1.2, D0Ar2		! source pointer
	MOV     A0.2, D1Ar1		! destination pointer
	MOV     A0.3, D1Ar1		! for return value
! If there are less than 16 bytes to copy use the byte copy loop
	BGE     $Llong_copy

$Lbyte_copy:
! Simply copy a byte at a time
	SUBS    TXRPT, D1Ar3, #1	! hardware loop count = cnt - 1
	BLT     $Lend
$Lloop_byte:
	GETB    D1Re0, [A1.2++]
	SETB    [A0.2++], D1Re0
	BR      $Lloop_byte		! TXRPT hardware loop branch

$Lend:
! Finally set return value and return
	MOV     D0Re0, A0.3
	MOV     PC, D1RtP

$Llong_copy:
	ANDS    D1Ar5, D1Ar1, #7	! test destination alignment
	BZ      $Laligned_dst

! The destination address is not 8 byte aligned. We will copy bytes from
! the source to the destination until the remaining data has an 8 byte
! destination address alignment (i.e we should never copy more than 7
! bytes here).
$Lalign_dst:
	GETB    D0Re0, [A1.2++]
	ADD     D1Ar5, D1Ar5, #1	! dest is aligned when D1Ar5 reaches #8
	SUB     D1Ar3, D1Ar3, #1	! decrement count of remaining bytes
	SETB    [A0.2++], D0Re0
	CMP     D1Ar5, #8
	BNE     $Lalign_dst

! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte
! blocks, then jump to the unaligned copy loop or fall through to the aligned
! copy loop as appropriate.
$Laligned_dst:
	MOV     D0Ar4, A1.2
	LSR     D1Ar5, D1Ar3, #3	! D1Ar5 = number of 8 byte blocks
	ANDS    D0Ar4, D0Ar4, #7	! test source alignment
	BNZ     $Lunaligned_copy	! if unaligned, use unaligned copy loop

! Both source and destination are 8 byte aligned - the easy case.
$Laligned_copy:
	LSRS    D1Ar5, D1Ar3, #5	! D1Ar5 = number of 32 byte blocks
	BZ      $Lbyte_copy
	SUB     TXRPT, D1Ar5, #1

$Laligned_32:
	GETL    D0Re0, D1Re0, [A1.2++]
	GETL    D0Ar6, D1Ar5, [A1.2++]
	SETL    [A0.2++], D0Re0, D1Re0
	SETL    [A0.2++], D0Ar6, D1Ar5
	GETL    D0Re0, D1Re0, [A1.2++]
	GETL    D0Ar6, D1Ar5, [A1.2++]
	SETL    [A0.2++], D0Re0, D1Re0
	SETL    [A0.2++], D0Ar6, D1Ar5
	BR      $Laligned_32		! hardware loop; falls through when done

! If there are any remaining bytes use the byte copy loop, otherwise we are done
	ANDS    D1Ar3, D1Ar3, #0x1f
	BNZ     $Lbyte_copy
	B       $Lend

! The destination is 8 byte aligned but the source is not, and there are 8
! or more bytes to be copied.
$Lunaligned_copy:
! Adjust the source pointer (A1.2) to the 8 byte boundary before its
! current value
	MOV     D0Ar4, A1.2
	MOV     D0Ar6, A1.2
	ANDMB   D0Ar4, D0Ar4, #0xfff8
	MOV     A1.2, D0Ar4
! Save the number of bytes of mis-alignment in D0Ar4 for use later
	SUBS    D0Ar6, D0Ar6, D0Ar4
	MOV     D0Ar4, D0Ar6
! if there is no mis-alignment after all, use the aligned copy loop
	BZ      $Laligned_copy

! prefetch 8 bytes
	GETL    D0Re0, D1Re0, [A1.2]

	SUB     TXRPT, D1Ar5, #1

! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly
! 4 bytes, and more than 4 bytes.
	CMP     D0Ar6, #4
	BLT     $Lunaligned_1_2_3	! use 1-3 byte mis-alignment loop
	BZ      $Lunaligned_4		! use 4 byte mis-alignment loop

! The mis-alignment is more than 4 bytes
$Lunaligned_5_6_7:
	SUB     D0Ar6, D0Ar6, #4
! Calculate the bit offsets required for the shift operations necesssary
! to align the data.
! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
	MULW    D0Ar6, D0Ar6, #8
	MOV     D1Ar5, #32
	SUB     D1Ar5, D1Ar5, D0Ar6
! Move data 4 bytes before we enter the main loop
	MOV     D0Re0, D1Re0

$Lloop_5_6_7:
	GETL    D0Ar2, D1Ar1, [++A1.2]
! form 64-bit data in D0Re0, D1Re0
	LSR     D0Re0, D0Re0, D0Ar6
	MOV     D1Re0, D0Ar2
	LSL     D1Re0, D1Re0, D1Ar5
	ADD     D0Re0, D0Re0, D1Re0

	LSR     D0Ar2, D0Ar2, D0Ar6
	LSL     D1Re0, D1Ar1, D1Ar5
	ADD     D1Re0, D1Re0, D0Ar2

	SETL    [A0.2++], D0Re0, D1Re0
	MOV     D0Re0, D1Ar1
	BR      $Lloop_5_6_7		! hardware loop; falls through when done

	B       $Lunaligned_end

$Lunaligned_1_2_3:
! Calculate the bit offsets required for the shift operations necesssary
! to align the data.
! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
	MULW    D0Ar6, D0Ar6, #8
	MOV     D1Ar5, #32
	SUB     D1Ar5, D1Ar5, D0Ar6

$Lloop_1_2_3:
! form 64-bit data in D0Re0,D1Re0
	LSR     D0Re0, D0Re0, D0Ar6
	LSL     D1Ar1, D1Re0, D1Ar5
	ADD     D0Re0, D0Re0, D1Ar1
	MOV     D0Ar2, D1Re0
	LSR     D0FrT, D0Ar2, D0Ar6
	GETL    D0Ar2, D1Ar1, [++A1.2]

	MOV     D1Re0, D0Ar2
	LSL     D1Re0, D1Re0, D1Ar5
	ADD     D1Re0, D1Re0, D0FrT

	SETL    [A0.2++], D0Re0, D1Re0
	MOV     D0Re0, D0Ar2
	MOV     D1Re0, D1Ar1
	BR      $Lloop_1_2_3		! hardware loop; falls through when done

	B       $Lunaligned_end

! The 4 byte mis-alignment case - this does not require any shifting, just a
! shuffling of registers.
$Lunaligned_4:
	MOV     D0Re0, D1Re0
$Lloop_4:
	GETL    D0Ar2, D1Ar1, [++A1.2]
	MOV     D1Re0, D0Ar2
	SETL    [A0.2++], D0Re0, D1Re0
	MOV     D0Re0, D1Ar1
	BR      $Lloop_4		! hardware loop; falls through when done

$Lunaligned_end:
! If there are no remaining bytes to copy, we are done.
	ANDS    D1Ar3, D1Ar3, #7
	BZ      $Lend
! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte
! address of the remaining bytes, and fall through to the byte copy loop.
	MOV     D0Ar6, A1.2
	ADD     D1Ar5, D0Ar4, D0Ar6
	MOV     A1.2, D1Ar5
	B       $Lbyte_copy

	.size _memcpy,.-_memcpy
! Copyright (C) 2008-2012 Imagination Technologies Ltd.

	.text
	.global _memset
	.type   _memset,function
! D1Ar1 dst
! D0Ar2 c
! D1Ar3 cnt
! D0Re0 dst
_memset:
	AND     D0Ar2,D0Ar2,#0xFF	! Ensure a byte input value
	MULW    D0Ar2,D0Ar2,#0x0101	! Duplicate byte value into  0-15
	ANDS    D0Ar4,D1Ar1,#7		! Extract bottom LSBs of dst
	LSL     D0Re0,D0Ar2,#16		! Duplicate byte value into 16-31
	ADD     A0.2,D0Ar2,D0Re0	! Duplicate byte value into 4 (A0.2)
	MOV     D0Re0,D1Ar1		! Return dst
	BZ      $LLongStub		! if start address is aligned
	! start address is not aligned on an 8 byte boundary, so we
	! need the number of bytes up to the next 8 byte address
	! boundary, or the length of the string if less than 8, in D1Ar5
	MOV     D0Ar2,#8		! Need 8 - N in D1Ar5 ...
	SUB     D1Ar5,D0Ar2,D0Ar4	!            ... subtract N
	CMP     D1Ar3,D1Ar5
	MOVMI   D1Ar5,D1Ar3
	B       $LByteStub		! dst is mis-aligned, do $LByteStub

!
! Preamble to LongLoop which generates 4*8 bytes per iteration (5 cycles)
!
$LLongStub:
	LSRS    D0Ar2,D1Ar3,#5		! Number of 32 byte blocks
	AND     D1Ar3,D1Ar3,#0x1F	! Remainder after 32 byte blocks
	MOV     A1.2,A0.2
	BEQ     $LLongishStub
	SUB     TXRPT,D0Ar2,#1		! Hardware loop count = blocks - 1
	CMP     D1Ar3,#0		! Z set here survives the loop below
$LLongLoop:
	SETL    [D1Ar1++],A0.2,A1.2
	SETL    [D1Ar1++],A0.2,A1.2
	SETL    [D1Ar1++],A0.2,A1.2
	SETL    [D1Ar1++],A0.2,A1.2
	BR      $LLongLoop		! TXRPT hardware loop branch
	BZ      $Lexit			! Done if no remainder
!
! Preamble to LongishLoop which generates 1*8 bytes per iteration (2 cycles)
!
$LLongishStub:
	LSRS    D0Ar2,D1Ar3,#3		! Number of 8 byte blocks
	AND     D1Ar3,D1Ar3,#0x7	! Remainder after 8 byte blocks
	MOV     D1Ar5,D1Ar3
	BEQ     $LByteStub
	SUB     TXRPT,D0Ar2,#1
	CMP     D1Ar3,#0
$LLongishLoop:
	SETL    [D1Ar1++],A0.2,A1.2
	BR      $LLongishLoop		! TXRPT hardware loop branch
	BZ      $Lexit			! Done if no remainder
!
! This does a byte structured burst of up to 7 bytes
!
!	D1Ar1 should point to the location required
!	D1Ar3 should be the remaining total byte count
!	D1Ar5 should be burst size (<= D1Ar3)
!
$LByteStub:
	SUBS    D1Ar3,D1Ar3,D1Ar5	! Reduce count
	ADD     D1Ar1,D1Ar1,D1Ar5	! Advance pointer to end of area
	MULW    D1Ar5,D1Ar5,#4		! Scale to (1*4), (2*4), (3*4)
	SUB     D1Ar5,D1Ar5,#(8*4)	! Rebase to -(7*4), -(6*4), -(5*4), ...
	MOV     A1.2,D1Ar5
	! Computed branch into the SETB table: each table entry is
	! (presumably) 4 bytes, hence the *4 scaling above.
	SUB     PC,CPC1,A1.2		! Jump into table below
	SETB    [D1Ar1+#(-7)],A0.2
	SETB    [D1Ar1+#(-6)],A0.2
	SETB    [D1Ar1+#(-5)],A0.2
	SETB    [D1Ar1+#(-4)],A0.2
	SETB    [D1Ar1+#(-3)],A0.2
	SETB    [D1Ar1+#(-2)],A0.2
	SETB    [D1Ar1+#(-1)],A0.2
!
! Return if all data has been output, otherwise do $LLongStub
!
	BNZ     $LLongStub		! Z flag from the SUBS in $LByteStub
$Lexit:
	MOV     PC,D1RtP
	.size _memset,.-_memset
