/* firmware/common/memset.S */

/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2004 by Jens Arnold
 *
 * All files in this archive are subject to the GNU General Public License.
 * See the file COPYING in the source tree root for full license agreement.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/

    .section    .icode,"ax",@progbits

    .align      2
    .global     _memset
    .type       _memset,@function

/* Fills a memory region with the specified byte value.
 * This version is optimized for speed.
 *
 * arguments:
 *  r4 - start address
 *  r5 - data (only the low byte is used)
 *  r6 - length in bytes
 *
 * return value:
 *  r0 - start address (like ANSI version)
 *
 * register usage:
 *  r0 - temporary
 *  r1 - bit mask for rounding to long bounds
 *  r2 - last / first long bound (only if >= 12 bytes)
 *  r4 - start address (never modified)
 *  r5 - data (spread to all 4 bytes if >= 12 bytes)
 *  r6 - current address (runs down from end to start)
 *
 * Overview:
 *  - Fewer than 12 bytes: everything is written by the byte loop at
 *    .loop_b2.
 *  - 12 bytes or more: the 0..3 bytes above the last long bound are written
 *    bytewise, then the aligned middle part is written two longs per pass
 *    (plus one single long if needed), and finally the 0..3 bytes between
 *    the start address and the first long bound are written bytewise.
 *
 * The instruction order below is devised in a way to utilize the pipelining
 * of the SH1 to the max. The routine fills memory from end to start in
 * order to utilize the auto-decrementing store instructions.
 */

_memset:
    add     r4,r6       /* r6 = end_address (one past the last byte) */

    mov     r6,r0
    add     #-12,r0     /* r0 = r6 - 12; don't go below 12 here! */
                        /* (if the end address were below 12, r0 would wrap
                         * around and the unsigned test below would pass
                         * regardless of the length) */
    cmp/hs  r4,r0       /* >= 12 bytes to fill? (unsigned r0 >= r4) */
    bf      .start_b2   /* no, jump directly to byte loop */

    extu.b  r5,r5       /* start: spread data to all 4 bytes */
    swap.b  r5,r0
    or      r0,r5       /* data now in 2 lower bytes of r5 */
    swap.w  r5,r0
    or      r0,r5       /* data now in all 4 bytes of r5 */

    mov     #-4,r1      /* r1 = 0xFFFFFFFC */

    mov     r6,r2
    bra     .start_b1   /* enter the loop at its condition check */
    and     r1,r2       /* r2 = last long bound (runs in the bra delay slot) */

    /* leading byte loop: sets the 0..3 bytes between the last long bound
     * and the end address */
.loop_b1:
    mov.b   r5,@-r6     /* store byte */
.start_b1:
    cmp/hi  r2,r6       /* runs r6 down to last long bound */
    bt      .loop_b1

    mov     r4,r2
    add     #11,r2      /* combined for rounding and offset */
    and     r1,r2       /* r2 = first long bound + 8 */

    /* main loop: set 2 longs per pass
     * With >= 12 bytes to fill and at most 3 unaligned bytes on either
     * side, at least 2 whole longs lie between the two long bounds, so the
     * first pass is always valid. The compare sits between the two stores;
     * the second store leaves the T bit alone, so bt still sees the result
     * of cmp/hi. The loop repeats while at least 2 more longs remain. */
.loop2_l:
    mov.l   r5,@-r6     /* store first long */
    cmp/hi  r2,r6       /* runs r6 down to first or second long bound */
    mov.l   r5,@-r6     /* store second long */
    bt      .loop2_l

    add     #-8,r2      /* correct offset: r2 = first long bound */
    cmp/hi  r2,r6       /* 1 long left? */
    bf      .start_b2   /* no, jump to trailing byte loop */

    bra     .start_b2   /* jump to trailing byte loop */
    mov.l   r5,@-r6     /* store last long (runs in the bra delay slot) */

    /* trailing byte loop: sets the remaining bytes down to the start
     * address (the whole region if fewer than 12 bytes were requested) */
    .align  2
.loop_b2:
    mov.b   r5,@-r6     /* store byte */
.start_b2:
    cmp/hi  r4,r6       /* runs r6 down to the start address */
    bt      .loop_b2

    rts
    mov     r4,r0       /* return start address (runs in the rts delay slot) */

.end:
    .size   _memset,.end-_memset