Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[S390] convert/optimize csum_fold() to C

In the meantime gcc generates better code than the old inline
assemblies do. Original inline assembly results in:

lr %r1,%r2
sr %r3,%r3
lr %r2,%r1
srdl %r2,16
alr %r2,%r3
alr %r1,%r2
srl %r1,16
xilf %r1,65535
llghr %r2,%r1
br %r14

Out of the C code gcc generates this:

rll %r1,%r2,16
ar %r1,%r2
srl %r1,16
xilf %r1,65535
llghr %r2,%r1
br %r14

In addition we don't have any static register allocations anymore and
gcc is free to shuffle instructions around for better pipeline usage.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

Authored by Heiko Carstens, committed by Martin Schwidefsky
04efc3be 05e7ff7d

+4 -21
arch/s390/include/asm/checksum.h
···
 78  78   */
 79  79  static inline __sum16 csum_fold(__wsum sum)
 80  80  {
 81     -#ifndef __s390x__
 82     -	register_pair rp;
     81 +	u32 csum = (__force u32) sum;
 83  82  
 84     -	asm volatile(
 85     -		"	slr	%N1,%N1\n"	/* %0 = H L */
 86     -		"	lr	%1,%0\n"	/* %0 = H L, %1 = H L 0 0 */
 87     -		"	srdl	%1,16\n"	/* %0 = H L, %1 = 0 H L 0 */
 88     -		"	alr	%1,%N1\n"	/* %0 = H L, %1 = L H L 0 */
 89     -		"	alr	%0,%1\n"	/* %0 = H+L+C L+H */
 90     -		"	srl	%0,16\n"	/* %0 = H+L+C */
 91     -		: "+&d" (sum), "=d" (rp) : : "cc");
 92     -#else /* __s390x__ */
 93     -	asm volatile(
 94     -		"	sr	3,3\n"	/* %0 = H*65536 + L */
 95     -		"	lr	2,%0\n"	/* %0 = H L, 2/3 = H L / 0 0 */
 96     -		"	srdl	2,16\n"	/* %0 = H L, 2/3 = 0 H / L 0 */
 97     -		"	alr	2,3\n"	/* %0 = H L, 2/3 = L H / L 0 */
 98     -		"	alr	%0,2\n"	/* %0 = H+L+C L+H */
 99     -		"	srl	%0,16\n"	/* %0 = H+L+C */
100     -		: "+&d" (sum) : : "cc", "2", "3");
101     -#endif /* __s390x__ */
102     -	return (__force __sum16) ~sum;
     83 +	csum += (csum >> 16) + (csum << 16);
     84 +	csum >>= 16;
     85 +	return (__force __sum16) ~csum;
103  86  }
104  87  
105  88  /*