/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2004 by Jens Arnold
*
* All files in this archive are subject to the GNU General Public License.
* See the file COPYING in the source tree root for full license agreement.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
#ifdef CPU_ARM
.section .icode,"ax",%progbits
#else
.section .icode,"ax",@progbits
#endif
.align 2
#if CONFIG_CPU == SH7034
.global _memset
.type _memset,@function
/* Fills a memory region with specified byte value
* This version is optimized for speed
*
* arguments:
* r4 - start address
* r5 - data
* r6 - length
*
* return value:
* r0 - start address (like ANSI version)
*
* register usage:
* r0 - temporary
* r1 - start address +11 for main loop
* r4 - start address
* r5 - data (spread to all 4 bytes when using long stores)
* r6 - current address (runs down from end to start)
*
* The instruction order below is devised in a way to utilize the pipelining
* of the SH1 to the max. The routine fills memory from end to start in
* order to utilize the auto-decrementing store instructions.
*/
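/* For reference, a rough C sketch of the strategy above (an illustrative
 * comment only, not part of the build; memset_sketch is a hypothetical
 * name, and a 32-bit unsigned long is assumed, as on SH-1):
 *
 *   // needs <stddef.h> for size_t
 *   void *memset_sketch(void *dst, int c, size_t len)
 *   {
 *       unsigned char *start = dst;
 *       unsigned char *p = start + len;       // fill runs downwards
 *       unsigned long data = (unsigned char)c;
 *       data |= data << 8;                    // spread to 2 bytes
 *       data |= data << 16;                   // spread to all 4 bytes
 *       if (len >= 4 + ((0 - (unsigned long)start) & 3)) {
 *           while ((unsigned long)p & 3)      // leading byte loop
 *               *--p = (unsigned char)data;
 *           while (p - start >= 4) {          // longword loop
 *               p -= 4;
 *               *(unsigned long *)p = data;
 *           }
 *       }
 *       while (p > start)                     // trailing byte loop
 *           *--p = (unsigned char)data;
 *       return start;
 *   }
 */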
_memset:
neg r4,r0
and #3,r0 /* r0 = (4 - align_offset) % 4 */
add #4,r0
cmp/hs r0,r6 /* at least one aligned longword to fill? */
add r4,r6 /* r6 = end_address */
bf .no_longs /* no, jump directly to byte loop */
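/* example: for start = 0x1003, r0 = ((4 - 3) & 3) + 4 = 5, so the long
 * path is taken only if at least 5 bytes are to be filled: 1 leading
 * byte up to the bound at 0x1004, plus one full longword */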
extu.b r5,r5 /* start: spread data to all 4 bytes */
swap.b r5,r0
or r0,r5 /* data now in 2 lower bytes of r5 */
swap.w r5,r0
or r0,r5 /* data now in all 4 bytes of r5 */
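/* example: r5 = 0x000000AB becomes 0x0000ABAB after the first or,
 * then 0xABABABAB after the swap.w/or pair */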
mov r6,r0
tst #3,r0 /* r0 already long aligned? */
bt .end_b1 /* yes: skip loop */
/* leading byte loop: sets 0..3 bytes */
.loop_b1:
mov.b r5,@-r0 /* store byte */
tst #3,r0 /* r0 long aligned? */
bf .loop_b1 /* runs r0 down until long aligned */
mov r0,r6 /* r6 = last long bound */
nop /* keep alignment */
.end_b1:
mov r4,r1 /* r1 = start_address... */
add #11,r1 /* ... + 11, combined for rounding and offset */
xor r1,r0
tst #4,r0 /* bit 2 tells whether an even or odd number of */
bf .loop_odd /* longwords to set */
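/* example: start = 0x1000, length = 12: r6 = 0x100C, r1 = 0x100B,
 * r6 ^ r1 = 0x0007 has bit 2 set -> 3 longwords (odd), so entry at
 * .loop_odd skips the first store of the pair on the initial pass */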
/* main loop: set 2 longs per pass */
.loop_2l:
mov.l r5,@-r6 /* store first long */
.loop_odd:
cmp/hi r1,r6 /* runs r6 down to first long bound */
mov.l r5,@-r6 /* store second long */
bt .loop_2l
.no_longs:
cmp/hi r4,r6 /* any bytes left? */
bf .end_b2 /* no: skip loop */
/* trailing byte loop */
.loop_b2:
mov.b r5,@-r6 /* store byte */
cmp/hi r4,r6 /* runs r6 down to the start address */
bt .loop_b2
.end_b2:
rts
mov r4,r0 /* return start address */
.end:
.size _memset,.end-_memset
#elif defined(CPU_COLDFIRE)
.global memset
.type memset,@function
/* Fills a memory region with specified byte value
* This version is optimized for speed
*
* arguments:
* (4,%sp) - start address
* (8,%sp) - data
* (12,%sp) - length
*
* return value:
* %d0 - start address (like ANSI version)
*
* register usage:
* %d0 - data (spread to all 4 bytes when using long stores)
* %d1 - temporary / data (for burst transfer)
* %d2 - data (for burst transfer)
* %d3 - data (for burst transfer)
* %a0 - start address
* %a1 - current address (runs down from end to start)
*
* For maximum speed this routine uses both long stores and burst mode,
* storing whole lines with movem.l. The routine fills memory from end
* to start in order to ease returning the start address.
*/
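/* For reference, a rough C sketch of the line-fill strategy above (an
 * illustrative comment only, not part of the build; memset_sketch is a
 * hypothetical name, a 32-bit unsigned long is assumed, and the movem.l
 * burst is modelled as four plain longword stores per pass):
 *
 *   // needs <stddef.h> for size_t
 *   void *memset_sketch(void *dst, int c, size_t len)
 *   {
 *       unsigned char *start = dst;
 *       unsigned char *p = start + len;       // fill runs downwards
 *       unsigned long data = (unsigned char)c;
 *       data |= data << 8;
 *       data |= data << 16;                   // spread to all 4 bytes
 *       while (p > start && ((unsigned long)p & 3))
 *           *--p = (unsigned char)data;       // leading bytes
 *       while (((unsigned long)p & 15) && p - start >= 4) {
 *           p -= 4;                           // leading longwords
 *           *(unsigned long *)p = data;
 *       }
 *       while (p - start >= 16) {             // whole lines (movem.l)
 *           p -= 16;
 *           ((unsigned long *)p)[0] = data;
 *           ((unsigned long *)p)[1] = data;
 *           ((unsigned long *)p)[2] = data;
 *           ((unsigned long *)p)[3] = data;
 *       }
 *       while (p - start >= 4) {              // trailing longwords
 *           p -= 4;
 *           *(unsigned long *)p = data;
 *       }
 *       while (p > start)                     // trailing bytes
 *           *--p = (unsigned char)data;
 *       return start;
 *   }
 */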
memset:
move.l (4,%sp),%a0 /* start address */
move.l (8,%sp),%d0 /* data */
move.l (12,%sp),%a1 /* length */
add.l %a0,%a1 /* %a1 = end address */
move.l %a0,%d1
addq.l #7,%d1
and.l #0xFFFFFFFC,%d1 /* %d1 = first long bound + 4 */
cmp.l %d1,%a1 /* at least one aligned longword to fill? */
blo.b .no_longs /* no, jump directly to byte loop */
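/* example: %a0 = 0x1001: %d1 = (0x1001 + 7) & ~3 = 0x1008, i.e. the
 * first long bound (0x1004) plus 4, so the long path is taken only
 * when at least one whole longword fits above that bound */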
and.l #0xFF,%d0 /* start: spread data to all 4 bytes */
move.l %d0,%d1
lsl.l #8,%d1
or.l %d1,%d0 /* data now in 2 lower bytes of %d0 */
move.l %d0,%d1
swap %d0
or.l %d1,%d0 /* data now in all 4 bytes of %d0 */
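/* example: %d0 = 0x000000AB -> 0x0000ABAB -> 0xABABABAB */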
move.l %a1,%d1
and.l #0xFFFFFFFC,%d1 /* %d1 = last long bound */
cmp.l %d1,%a1 /* any bytes to set? */
bls.b .end_b1 /* no: skip byte loop */
/* leading byte loop: sets 0..3 bytes */
.loop_b1:
move.b %d0,-(%a1) /* store byte */
cmp.l %d1,%a1 /* runs %a1 down to last long bound */
bhi.b .loop_b1
.end_b1:
moveq.l #31,%d1
add.l %a0,%d1
and.l #0xFFFFFFF0,%d1 /* %d1 = first line bound + 16 */
cmp.l %d1,%a1 /* at least one full line to fill? */
blo.b .no_lines /* no, jump to longword loop */
    move.l  %a1,%d1
and.l #0xFFFFFFF0,%d1 /* %d1 = last line bound */
cmp.l %d1,%a1 /* any longwords to set? */
bls.b .end_l1 /* no: skip longword loop */
/* leading longword loop: sets 0..3 longwords */
.loop_l1:
move.l %d0,-(%a1) /* store longword */
cmp.l %d1,%a1 /* runs %a1 down to last line bound */
bhi.b .loop_l1
.end_l1:
move.l %d2,-(%sp) /* free some registers */
move.l %d3,-(%sp)
move.l %d0,%d1 /* spread data to 4 data registers */
move.l %d0,%d2
move.l %d0,%d3
    lea.l   (15,%a0),%a0 /* start address += 15, account for trailing data */
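/* with %a0 = start + 15, the bhi below stops the line loop as soon as
 * %a1 comes within 15 bytes of the start, leaving at most 3 longwords
 * plus up to 3 bytes for the trailing loops */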
/* main loop: set whole lines utilising burst mode */
.loop_line:
lea.l (-16,%a1),%a1 /* pre-decrement */
movem.l %d0-%d3,(%a1) /* store line */
cmp.l %a0,%a1 /* runs %a1 down to first line bound */
bhi.b .loop_line
lea.l (-15,%a0),%a0 /* correct start address */
move.l (%sp)+,%d3 /* restore registers */
move.l (%sp)+,%d2
move.l %a0,%d1 /* %d1 = start address ... */
addq.l #3,%d1 /* ... +3, account for possible trailing bytes */
    cmp.l   %d1,%a1      /* any longwords left? */
bhi.b .loop_l2 /* yes: jump to longword loop */
bra.b .no_longs /* no: skip loop */
.no_lines:
move.l %a0,%d1 /* %d1 = start address ... */
addq.l #3,%d1 /* ... +3, account for possible trailing bytes */
/* trailing longword loop */
.loop_l2:
move.l %d0,-(%a1) /* store longword */
cmp.l %d1,%a1 /* runs %a1 down to first long bound */
bhi.b .loop_l2
.no_longs:
cmp.l %a0,%a1 /* any bytes left? */
bls.b .end_b2 /* no: skip loop */
/* trailing byte loop */
.loop_b2:
move.b %d0,-(%a1) /* store byte */
cmp.l %a0,%a1 /* runs %a1 down to start address */
bhi.b .loop_b2
.end_b2:
move.l %a0,%d0 /* return start address */
rts
.end:
.size memset,.end-memset
#elif defined(CPU_ARM)
/* The following code is taken from the Linux kernel version 2.6.15.3
* linux/arch/arm/lib/memset.S
*
* Copyright (C) 1995-2000 Russell King
*/
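/* Strategy of the code below: the byte loop at 1: first aligns r0 to a
 * word boundary, the two orr instructions spread the fill byte across
 * r1, stm stores then write 64/32/16 bytes per pass from four pattern
 * registers (r1, r3, ip, lr), and conditional str/strb instructions
 * handle the remaining tail. Note that, unlike the SH1 and ColdFire
 * versions above, this one fills upwards and advances r0, so the value
 * returned in r0 is the end of the region, not the ANSI start address. */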
@ .word 0
1: subs r2, r2, #4 @ 1 do we have enough
blt 5f @ 1 bytes to align with?
cmp r3, #2 @ 1
strltb r1, [r0], #1 @ 1
strleb r1, [r0], #1 @ 1
strb r1, [r0], #1 @ 1
add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3))
/*
 * The pointer is now aligned and the length is adjusted. Try doing the
 * memset again.
*/
.global memset
.type memset,%function
memset:
ands r3, r0, #3 @ 1 unaligned?
bne 1b @ 1
/*
* we know that the pointer in r0 is aligned to a word boundary.
*/
orr r1, r1, r1, lsl #8
orr r1, r1, r1, lsl #16
mov r3, r1
cmp r2, #16
blt 4f
/*
* We need an extra register for this loop - save the return address and
* use the LR
*/
str lr, [sp, #-4]!
mov ip, r1
mov lr, r1
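@ r1, r3, ip and lr now all hold the fill pattern, so each stmia below
@ writes 16 bytes, four stores per 64-byte pass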
2: subs r2, r2, #64
stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time.
stmgeia r0!, {r1, r3, ip, lr}
stmgeia r0!, {r1, r3, ip, lr}
stmgeia r0!, {r1, r3, ip, lr}
bgt 2b
ldmeqfd sp!, {pc} @ Now <64 bytes to go.
/*
* No need to correct the count; we're only testing bits from now on
*/
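@ example: for 100 bytes the loop above stores 64 and leaves r2 = -28;
@ its low bits (-28 & 63 = 36 = 32 + 4) still select the right tail:
@ 32 bytes here, then 4 bytes at the strne below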
tst r2, #32
stmneia r0!, {r1, r3, ip, lr}
stmneia r0!, {r1, r3, ip, lr}
tst r2, #16
stmneia r0!, {r1, r3, ip, lr}
ldr lr, [sp], #4
4: tst r2, #8
stmneia r0!, {r1, r3}
tst r2, #4
strne r1, [r0], #4
/*
 * When we get here, we have fewer than 4 bytes to set. We
* may have an unaligned pointer as well.
*/
5: tst r2, #2
strneb r1, [r0], #1
strneb r1, [r0], #1
tst r2, #1
strneb r1, [r0], #1
mov pc, lr
.end:
.size memset,.end-memset
#endif