1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2004 by Jens Arnold
*
* All files in this archive are subject to the GNU General Public License.
* See the file COPYING in the source tree root for full license agreement.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
.section .icode,"ax",@progbits
.align 2
.global _memset
.type _memset,@function

/* Fills a memory region with a specified byte value.
 * This version is optimized for speed.
 *
 * arguments:
 *  r4 - start address
 *  r5 - data (only the low byte is used)
 *  r6 - length in bytes
 *
 * return value:
 *  r0 - start address (like ANSI version)
 *
 * register usage:
 *  r0 - temporary
 *  r1 - bit mask for rounding to long bounds
 *  r2 - last / first long bound (only if >= 12 bytes)
 *  r4 - start address
 *  r5 - data (spread to all 4 bytes if >= 12 bytes)
 *  r6 - current address (runs down from end to start)
 *
 * The instruction order below is devised in a way to utilize the pipelining
 * of the SH1 to the max. The routine fills memory from end to start in
 * order to utilize the auto-decrementing store instructions.
 *
 * Regions shorter than 12 bytes are filled by the trailing byte loop only.
 * With 12 bytes or more there are always at least 8 bytes between the first
 * and the last long bound, which the main loop relies on: its first pass
 * stores two longs unconditionally.
 */
_memset:
        mov     #12,r0
        cmp/hs  r0,r6       /* T = (length >= 12), unsigned. Testing the
                             * length itself (instead of end_address - 12)
                             * cannot wrap around for regions that end
                             * below address 12. */
        add     r4,r6       /* r6 = end_address; add leaves T untouched */
        bf      .start_b2   /* < 12 bytes: jump directly to byte loop */

        extu.b  r5,r5       /* start: spread data to all 4 bytes */
        swap.b  r5,r0
        or      r0,r5       /* data now in 2 lower bytes of r5 */
        swap.w  r5,r0
        or      r0,r5       /* data now in all 4 bytes of r5 */

        mov     #-4,r1      /* r1 = 0xFFFFFFFC */

        mov     r6,r2
        bra     .start_b1
        and     r1,r2       /* (delay slot) r2 = last long bound */

        /* leading byte loop: sets 0..3 bytes */
.loop_b1:
        mov.b   r5,@-r6     /* store byte */
.start_b1:
        cmp/hi  r2,r6       /* runs r6 down to last long bound */
        bt      .loop_b1

        mov     r4,r2
        add     #11,r2      /* combined for rounding and offset */
        and     r1,r2       /* r2 = first long bound + 8 */

        /* main loop: set 2 longs per pass */
.loop2_l:
        mov.l   r5,@-r6     /* store first long */
        cmp/hi  r2,r6       /* runs r6 down to first or second long bound */
        mov.l   r5,@-r6     /* store second long (does not alter T) */
        bt      .loop2_l

        add     #-8,r2      /* correct offset: r2 = first long bound */
        cmp/hi  r2,r6       /* 1 long left? */
        bf      .start_b2   /* no, jump to trailing byte loop */

        bra     .start_b2   /* jump to trailing byte loop */
        mov.l   r5,@-r6     /* (delay slot) store last long */

        /* trailing byte loop */
        .align  2
.loop_b2:
        mov.b   r5,@-r6     /* store byte */
.start_b2:
        cmp/hi  r4,r6       /* runs r6 down to the start address */
        bt      .loop_b2

        rts
        mov     r4,r0       /* (delay slot) return start address */
.end:
.size _memset,.end-_memset
|