Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sh: Optimise memcpy_to/fromio for SH4

Optimise memcpy_to/fromio. This is used extensively by MTD, so is a
worthwhile performance gain. The main savings come from not repeatedly
calling readl/writel, and doing word instead of byte at a time
transfers. Also using "movca.l" on SH4 gives a small performance win.

Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

authored by

Stuart Menefy and committed by
Paul Mundt
5e9377ec 8af57f8b

+72 -21
+72 -21
arch/sh/kernel/io.c
··· 1 1 /* 2 - * linux/arch/sh/kernel/io.c 2 + * arch/sh/kernel/io.c - Machine independent I/O functions. 3 3 * 4 - * Copyright (C) 2000 Stuart Menefy 4 + * Copyright (C) 2000 - 2009 Stuart Menefy 5 5 * Copyright (C) 2005 Paul Mundt 6 - * 7 - * Provide real functions which expand to whatever the header file defined. 8 - * Also definitions of machine independent IO functions. 9 6 * 10 7 * This file is subject to the terms and conditions of the GNU General Public 11 8 * License. See the file "COPYING" in the main directory of this archive ··· 15 18 16 19 /* 17 20 * Copy data from IO memory space to "real" memory space. 18 - * This needs to be optimized. 19 21 */ 20 22 void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned long count) 21 23 { 22 - unsigned char *p = to; 23 - while (count) { 24 - count--; 25 - *p = readb(from); 26 - p++; 27 - from++; 28 - } 24 + /* 25 + * Would it be worthwhile doing byte and long transfers first 26 + * to try and get aligned? 27 + */ 28 + #ifdef CONFIG_CPU_SH4 29 + if ((count >= 0x20) && 30 + (((u32)to & 0x1f) == 0) && (((u32)from & 0x3) == 0)) { 31 + int tmp2, tmp3, tmp4, tmp5, tmp6; 32 + 33 + __asm__ __volatile__( 34 + "1: \n\t" 35 + "mov.l @%7+, r0 \n\t" 36 + "mov.l @%7+, %2 \n\t" 37 + "movca.l r0, @%0 \n\t" 38 + "mov.l @%7+, %3 \n\t" 39 + "mov.l @%7+, %4 \n\t" 40 + "mov.l @%7+, %5 \n\t" 41 + "mov.l @%7+, %6 \n\t" 42 + "mov.l @%7+, r7 \n\t" 43 + "mov.l @%7+, r0 \n\t" 44 + "mov.l %2, @(0x04,%0) \n\t" 45 + "mov #0x20, %2 \n\t" 46 + "mov.l %3, @(0x08,%0) \n\t" 47 + "sub %2, %1 \n\t" 48 + "mov.l %4, @(0x0c,%0) \n\t" 49 + "cmp/hi %1, %2 ! 
T if 32 > count \n\t" 50 + "mov.l %5, @(0x10,%0) \n\t" 51 + "mov.l %6, @(0x14,%0) \n\t" 52 + "mov.l r7, @(0x18,%0) \n\t" 53 + "mov.l r0, @(0x1c,%0) \n\t" 54 + "bf.s 1b \n\t" 55 + " add #0x20, %0 \n\t" 56 + : "=&r" (to), "=&r" (count), 57 + "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4), 58 + "=&r" (tmp5), "=&r" (tmp6), "=&r" (from) 59 + : "7"(from), "0" (to), "1" (count) 60 + : "r0", "r7", "t", "memory"); 61 + } 62 + #endif 63 + 64 + if ((((u32)to | (u32)from) & 0x3) == 0) { 65 + for (; count > 3; count -= 4) { 66 + *(u32 *)to = *(volatile u32 *)from; 67 + to += 4; 68 + from += 4; 69 + } 70 + } 71 + 72 + for (; count > 0; count--) { 73 + *(u8 *)to = *(volatile u8 *)from; 74 + to++; 75 + from++; 76 + } 77 + 78 + mb(); 29 79 } 30 80 EXPORT_SYMBOL(memcpy_fromio); 31 81 32 82 /* 33 83 * Copy data from "real" memory space to IO memory space. 34 - * This needs to be optimized. 35 84 */ 36 85 void memcpy_toio(volatile void __iomem *to, const void *from, unsigned long count) 37 86 { 38 - const unsigned char *p = from; 39 - while (count) { 40 - count--; 41 - writeb(*p, to); 42 - p++; 43 - to++; 44 - } 87 + if ((((u32)to | (u32)from) & 0x3) == 0) { 88 + for ( ; count > 3; count -= 4) { 89 + *(volatile u32 *)to = *(u32 *)from; 90 + to += 4; 91 + from += 4; 92 + } 93 + } 94 + 95 + for (; count > 0; count--) { 96 + *(volatile u8 *)to = *(u8 *)from; 97 + to++; 98 + from++; 99 + } 100 + 101 + mb(); 45 102 } 46 103 EXPORT_SYMBOL(memcpy_toio); 47 104