/*
 * __udivdi3 — unsigned 64-bit / 64-bit division for SH-5 (SHmedia).
 *
 * In:   r2 = dividend, r3 = divisor (SHmedia ABI argument registers)
 * Out:  r2 = quotient
 * Link: r18 holds the return address; we return via ptabs/blink.
 * Uses: r0-r9, r19-r22, r25, tr0 as scratch.
 *
 * Method: compute a ~31-bit fixed-point reciprocal of the (normalized)
 * divisor with Newton-style multiply/correct steps, then perform the
 * division in two (small divisor) or three (large divisor) multiply/
 * subtract stages, reducing the remainder at each stage.
 */
	.section	.text..SHmedia32,"ax"
	.align	2
	.global	__udivdi3
__udivdi3:
	shlri	r3,1,r4
	nsb	r4,r22			/* r22 = normalization shift for divisor */
	shlld	r3,r22,r6		/* r6 = divisor, normalized */
	shlri	r6,49,r5
	movi	0xffffffffffffbaf1,r21	/* .l shift count 17.  */
	sub	r21,r5,r1
	mmulfx.w r1,r1,r4
	mshflo.w r1,r63,r1
	sub	r63,r22,r20		// r63 == 64 % 64
	mmulfx.w r5,r4,r4
	pta	large_divisor,tr0
	addi	r20,32,r9
	msub.w	r1,r4,r1
	madd.w	r1,r1,r1
	mmulfx.w r1,r1,r4
	shlri	r6,32,r7
	bgt/u	r9,r63,tr0		// large_divisor
	mmulfx.w r5,r4,r4
	shlri	r2,32+14,r19
	addi	r22,-31,r0
	msub.w	r1,r4,r1

	mulu.l	r1,r7,r4
	addi	r1,-3,r5
	mulu.l	r5,r19,r5
	sub	r63,r4,r4	// Negate to make sure r1 ends up <= 1/r2
	shlri	r4,2,r4		/* chop off leading %0000000000000000 001.00000000000
				   - or, as the case may be,
				   %0000000000000000 000.11111111111, still */
	muls.l	r1,r4,r4	/* leaving at least one sign bit.  */
	mulu.l	r5,r3,r8
	mshalds.l r1,r21,r1
	shari	r4,26,r4
	shlld	r8,r0,r8
	add	r1,r4,r1	// 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	sub	r2,r8,r2
	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */

	shlri	r2,22,r21
	mulu.l	r21,r1,r21
	shlld	r5,r0,r8
	addi	r20,30-22,r0
	shlrd	r21,r0,r21
	mulu.l	r21,r3,r5
	add	r8,r21,r8
	mcmpgt.l r21,r63,r21	// See Note 1
	addi	r20,30,r0
	mshfhi.l r63,r21,r21
	sub	r2,r5,r2
	andc	r2,r21,r2

	/* small divisor: need a third divide step */
	mulu.l	r2,r1,r7
	ptabs	r18,tr0
	addi	r2,1,r2
	shlrd	r7,r0,r7
	mulu.l	r7,r3,r5
	add	r8,r7,r8
	sub	r2,r3,r2
	cmpgt	r2,r5,r5	/* final quotient adjustment: +1 if rest >= divisor */
	add	r8,r5,r2
	/* could test r3 here to check for divide by zero.  */
	blink	tr0,r63

large_divisor:
	mmulfx.w r5,r4,r4
	shlrd	r2,r9,r25
	shlri	r25,32,r8
	msub.w	r1,r4,r1

	mulu.l	r1,r7,r4
	addi	r1,-3,r5
	mulu.l	r5,r8,r5
	sub	r63,r4,r4	// Negate to make sure r1 ends up <= 1/r2
	shlri	r4,2,r4		/* chop off leading %0000000000000000 001.00000000000
				   - or, as the case may be,
				   %0000000000000000 000.11111111111, still */
	muls.l	r1,r4,r4	/* leaving at least one sign bit.  */
	shlri	r5,14-1,r8
	mulu.l	r8,r7,r5
	mshalds.l r1,r21,r1
	shari	r4,26,r4
	add	r1,r4,r1	// 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	sub	r25,r5,r25
	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */

	shlri	r25,22,r21
	mulu.l	r21,r1,r21
	pta	no_lo_adj,tr0
	addi	r22,32,r0
	shlri	r21,40,r21
	mulu.l	r21,r7,r5
	add	r8,r21,r8
	shlld	r2,r0,r2
	sub	r25,r5,r25
	bgtu/u	r7,r25,tr0	// no_lo_adj
	addi	r8,1,r8		/* low-part adjustment: bump quotient, reduce rest */
	sub	r25,r7,r25
no_lo_adj:
	mextr4	r2,r25,r2

	/* large_divisor: only needs a few adjustments.  */
	mulu.l	r8,r6,r5
	ptabs	r18,tr0
	/* bubble */
	cmpgtu	r5,r2,r5
	sub	r8,r5,r2
	blink	tr0,r63

/* Note 1: To shift the result of the second divide stage so that the result
   always fits into 32 bits, yet we still reduce the rest sufficiently
   would require a lot of instructions to do the shifts just right.  Using
   the full 64 bit shift result to multiply with the divisor would require
   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
   Fortunately, if the upper 32 bits of the shift result are nonzero, we
   know that the rest after taking this partial result into account will
   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
   upper 32 bits of the partial result are nonzero.  */
+59
arch/sh/lib64/udivsi3.S
/*
 * __udivsi3 — unsigned 32-bit / 32-bit division for SH-5 (SHmedia).
 *
 * Strategy (visible below): zero-extend the divisor, normalize it with
 * nsb/shlld, derive a fixed-point reciprocal via mmulfx.w/msub.w
 * correction steps, then multiply-and-reduce twice with a final
 * compare-based quotient adjustment.
 */
	.global	__udivsi3
	.section	.text..SHmedia32,"ax"
	.align	2

/*
   inputs: r4,r5
   clobbered: r18,r19,r20,r21,r22,r25,tr0
   result in r0.
 */
__udivsi3:
	addz.l	r5,r63,r22	/* r22 = divisor, zero-extended to 64 bits */
	nsb	r22,r0		/* r0 = normalization shift */
	shlld	r22,r0,r25
	shlri	r25,48,r25	/* r25 = top bits of normalized divisor */
	movi	0xffffffffffffbb0c,r20	/* shift count eqiv 76 */
	sub	r20,r25,r21	/* first reciprocal estimate */
	mmulfx.w r21,r21,r19
	mshflo.w r21,r63,r21
	ptabs	r18,tr0		/* prepare return target early */
	mmulfx.w r25,r19,r19
	sub	r20,r0,r0
	/* bubble */
	msub.w	r21,r19,r19	/* Newton correction of the estimate */

	/*
	 * It would be nice for scheduling to do this add to r21 before
	 * the msub.w, but we need a different value for r19 to keep
	 * errors under control.
	 */
	addi	r19,-2,r21
	mulu.l	r4,r21,r18	/* first quotient digit */
	mmulfx.w r19,r19,r19
	shlli	r21,15,r21
	shlrd	r18,r0,r18
	mulu.l	r18,r22,r20
	mmacnfx.wl r25,r19,r21	/* refine reciprocal in r21 */
	/* bubble */
	sub	r4,r20,r25	/* r25 = remainder after first step */

	mulu.l	r25,r21,r19
	addi	r0,14,r0
	/* bubble */
	shlrd	r19,r0,r19	/* second quotient digit */
	mulu.l	r19,r22,r20
	add	r18,r19,r18	/* accumulate quotient */
	/* bubble */
	sub.l	r25,r20,r25	/* reduce remainder again */

	mulu.l	r25,r21,r19
	addz.l	r25,r63,r25
	sub	r25,r22,r25
	shlrd	r19,r0,r19	/* third quotient digit */
	mulu.l	r19,r22,r20
	addi	r25,1,r25
	add	r18,r19,r18

	cmpgt	r25,r20,r25	/* final +1 adjustment if rest >= divisor */
	add.l	r18,r25,r0	/* r0 = quotient */
	blink	tr0,r63