From dfc349402de8e95f6a42e8341e9ea193b718eee3 Mon Sep 17 00:00:00 2001 From: Stuart Menefy Date: Tue, 27 Oct 2009 15:14:06 +0000 Subject: sh: Optimised memset for SH4 Optimised version of memset for the SH4 which uses movca.l. Signed-off-by: Stuart Menefy Signed-off-by: Paul Mundt --- arch/sh/lib/Makefile | 7 +++- arch/sh/lib/memset-sh4.S | 107 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 arch/sh/lib/memset-sh4.S diff --git a/arch/sh/lib/Makefile b/arch/sh/lib/Makefile index a969b47c5463..dab4d2129812 100644 --- a/arch/sh/lib/Makefile +++ b/arch/sh/lib/Makefile @@ -2,7 +2,7 @@ # Makefile for SuperH-specific library files.. # -lib-y = delay.o memset.o memmove.o memchr.o \ +lib-y = delay.o memmove.o memchr.o \ checksum.o strlen.o div64.o div64-generic.o # Extracted from libgcc @@ -23,8 +23,11 @@ obj-y += io.o memcpy-y := memcpy.o memcpy-$(CONFIG_CPU_SH4) := memcpy-sh4.o +memset-y := memset.o +memset-$(CONFIG_CPU_SH4) := memset-sh4.o + lib-$(CONFIG_MMU) += copy_page.o __clear_user.o lib-$(CONFIG_MCOUNT) += mcount.o -lib-y += $(memcpy-y) $(udivsi3-y) +lib-y += $(memcpy-y) $(memset-y) $(udivsi3-y) EXTRA_CFLAGS += -Werror diff --git a/arch/sh/lib/memset-sh4.S b/arch/sh/lib/memset-sh4.S new file mode 100644 index 000000000000..1a6e32cc4e4d --- /dev/null +++ b/arch/sh/lib/memset-sh4.S @@ -0,0 +1,107 @@ +/* + * "memset" implementation for SH4 + * + * Copyright (C) 1999 Niibe Yutaka + * Copyright (c) 2009 STMicroelectronics Limited + * Author: Stuart Menefy + */ + +/* + * void *memset(void *s, int c, size_t n); + */ + +#include + +ENTRY(memset) + mov #12,r0 + add r6,r4 + cmp/gt r6,r0 + bt/s 40f ! if it's too small, set a byte at once + mov r4,r0 + and #3,r0 + cmp/eq #0,r0 + bt/s 2f ! It's aligned + sub r0,r6 +1: + dt r0 + bf/s 1b + mov.b r5,@-r4 +2: ! make VVVV + extu.b r5,r5 + swap.b r5,r0 ! V0 + or r0,r5 ! VV + swap.w r5,r0 ! VV00 + or r0,r5 ! VVVV + + ! Check if enough bytes need to be copied to be worth the big loop + mov #0x40, r0 ! (MT) + cmp/gt r6,r0 ! (MT) 64 > len => slow loop + + bt/s 22f + mov r6,r0 + + ! align the dst to the cache block size if necessary + mov r4, r3 + mov #~(0x1f), r1 + + and r3, r1 + cmp/eq r3, r1 + + bt/s 11f ! dst is already aligned + sub r1, r3 ! r3-r1 -> r3 + shlr2 r3 ! number of loops + +10: mov.l r5,@-r4 + dt r3 + bf/s 10b + add #-4, r6 + +11: ! dst is 32byte aligned + mov r6,r2 + mov #-5,r0 + shld r0,r2 ! number of loops + + add #-32, r4 + mov r5, r0 +12: + movca.l r0,@r4 + mov.l r5,@(4, r4) + mov.l r5,@(8, r4) + mov.l r5,@(12,r4) + mov.l r5,@(16,r4) + mov.l r5,@(20,r4) + add #-0x20, r6 + mov.l r5,@(24,r4) + dt r2 + mov.l r5,@(28,r4) + bf/s 12b + add #-32, r4 + + add #32, r4 + mov #8, r0 + cmp/ge r0, r6 + bf 40f + + mov r6,r0 +22: + shlr2 r0 + shlr r0 ! r0 = r6 >> 3 +3: + dt r0 + mov.l r5,@-r4 ! set 8-byte at once + bf/s 3b + mov.l r5,@-r4 + ! + mov #7,r0 + and r0,r6 + + ! fill bytes (length may be zero) +40: tst r6,r6 + bt 5f +4: + dt r6 + bf/s 4b + mov.b r5,@-r4 +5: + rts + mov r4,r0 -- cgit v1.2.3