/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2004 by Jens Arnold
*
* All files in this archive are subject to the GNU General Public License.
* See the file COPYING in the source tree root for full license agreement.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
#ifdef CPU_ARM
.section .icode,"ax",%progbits
#else
.section .icode,"ax",@progbits
#endif
.align 2
#if CONFIG_CPU == SH7034
.global _memset
.type _memset,@function
/* Fills a memory region with specified byte value
* This version is optimized for speed
*
* arguments:
* r4 - start address
* r5 - data
* r6 - length
*
* return value:
* r0 - start address (like ANSI version)
*
* register usage:
* r0 - temporary
* r1 - start address +11 for main loop
* r4 - start address
* r5 - data (spread to all 4 bytes when using long stores)
* r6 - current address (runs down from end to start)
*
* The instruction order below is devised in a way to utilize the pipelining
* of the SH1 to the max. The routine fills memory from end to start in
* order to utilize the auto-decrementing store instructions.
*/
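/* For reference, a rough C sketch of the strategy above (an illustrative
 * comment only, not part of the build; memset_sketch is a hypothetical
 * name, and a 32-bit unsigned long is assumed, as on SH-1):
 *
 *   // needs <stddef.h> for size_t
 *   void *memset_sketch(void *dst, int c, size_t len)
 *   {
 *       unsigned char *start = dst;
 *       unsigned char *p = start + len;       // fill runs downwards
 *       unsigned long data = (unsigned char)c;
 *       data |= data << 8;                    // spread to 2 bytes
 *       data |= data << 16;                   // spread to all 4 bytes
 *       if (len >= 4 + ((0 - (unsigned long)start) & 3)) {
 *           while ((unsigned long)p & 3)      // leading byte loop
 *               *--p = (unsigned char)data;
 *           while (p - start >= 4) {          // longword loop
 *               p -= 4;
 *               *(unsigned long *)p = data;
 *           }
 *       }
 *       while (p > start)                     // trailing byte loop
 *           *--p = (unsigned char)data;
 *       return start;
 *   }
 */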
_memset:
neg r4,r0
and #3,r0 /* r0 = (4 - align_offset) % 4 */
add #4,r0
cmp/hs r0,r6 /* at least one aligned longword to fill? */
add r4,r6 /* r6 = end_address */
bf .no_longs /* no, jump directly to byte loop */
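/* example: for start = 0x1003, r0 = ((4 - 3) & 3) + 4 = 5, so the long
 * path is taken only if at least 5 bytes are to be filled: 1 leading
 * byte up to the bound at 0x1004, plus one full longword */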
extu.b r5,r5 /* start: spread data to all 4 bytes */
swap.b r5,r0
or r0,r5 /* data now in 2 lower bytes of r5 */
swap.w r5,r0
or r0,r5 /* data now in all 4 bytes of r5 */
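/* example: r5 = 0x000000AB becomes 0x0000ABAB after the first or,
 * then 0xABABABAB after the swap.w/or pair */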
mov r6,r0
tst #3,r0 /* r0 already long aligned? */
bt .end_b1 /* yes: skip loop */
/* leading byte loop: sets 0..3 bytes */
.loop_b1:
mov.b r5,@-r0 /* store byte */
tst #3,r0 /* r0 long aligned? */
bf .loop_b1 /* runs r0 down until long aligned */
mov r0,r6 /* r6 = last long bound */
nop /* keep alignment */
.end_b1:
mov r4,r1 /* r1 = start_address... */
add #11,r1 /* ... + 11, combined for rounding and offset */
xor r1,r0
tst #4,r0 /* bit 2 tells whether an even or odd number of */
bf .loop_odd /* longwords to set */
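/* example: start = 0x1000, length = 12: r6 = 0x100C, r1 = 0x100B,
 * r6 ^ r1 = 0x0007 has bit 2 set -> 3 longwords (odd), so entry at
 * .loop_odd skips the first store of the pair on the initial pass */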
/* main loop: set 2 longs per pass */
.loop_2l:
mov.l r5,@-r6 /* store first long */
.loop_odd:
cmp/hi r1,r6 /* runs r6 down to first long bound */
mov.l r5,@-r6 /* store second long */
bt .loop_2l
.no_longs:
cmp/hi r4,r6 /* any bytes left? */
bf .end_b2 /* no: skip loop */
/* trailing byte loop */
.loop_b2:
mov.b r5,@-r6 /* store byte */
cmp/hi r4,r6 /* runs r6 down to the start address */
bt .loop_b2
.end_b2:
rts
mov r4,r0 /* return start address */
.end:
.size _memset,.end-_memset
#elif defined(CPU_COLDFIRE)
.global memset
.type memset,@function
/* Fills a memory region with specified byte value
* This version is optimized for speed
*
* arguments:
* (4,%sp) - start address
* (8,%sp) - data
* (12,%sp) - length
*
* return value:
* %d0 - start address (like ANSI version)
*
* register usage:
* %d0 - data (spread to all 4 bytes when using long stores)
* %d1 - temporary / data (for burst transfer)
* %d2 - data (for burst transfer)
* %d3 - data (for burst transfer)
* %a0 - start address
* %a1 - current address (runs down from end to start)
*
* For maximum speed this routine uses both long stores and burst mode,
* storing whole lines with movem.l. The routine fills memory from end
* to start in order to ease returning the start address.
*/
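/* For reference, a rough C sketch of the line-fill strategy above (an
 * illustrative comment only, not part of the build; memset_sketch is a
 * hypothetical name, a 32-bit unsigned long is assumed, and the movem.l
 * burst is modelled as four plain longword stores per pass):
 *
 *   // needs <stddef.h> for size_t
 *   void *memset_sketch(void *dst, int c, size_t len)
 *   {
 *       unsigned char *start = dst;
 *       unsigned char *p = start + len;       // fill runs downwards
 *       unsigned long data = (unsigned char)c;
 *       data |= data << 8;
 *       data |= data << 16;                   // spread to all 4 bytes
 *       while (p > start && ((unsigned long)p & 3))
 *           *--p = (unsigned char)data;       // leading bytes
 *       while (((unsigned long)p & 15) && p - start >= 4) {
 *           p -= 4;                           // leading longwords
 *           *(unsigned long *)p = data;
 *       }
 *       while (p - start >= 16) {             // whole lines (movem.l)
 *           p -= 16;
 *           ((unsigned long *)p)[0] = data;
 *           ((unsigned long *)p)[1] = data;
 *           ((unsigned long *)p)[2] = data;
 *           ((unsigned long *)p)[3] = data;
 *       }
 *       while (p - start >= 4) {              // trailing longwords
 *           p -= 4;
 *           *(unsigned long *)p = data;
 *       }
 *       while (p > start)                     // trailing bytes
 *           *--p = (unsigned char)data;
 *       return start;
 *   }
 */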
memset:
move.l (4,%sp),%a0 /* start address */
move.l (8,%sp),%d0 /* data */
move.l (12,%sp),%a1 /* length */
add.l %a0,%a1 /* %a1 = end address */
move.l %a0,%d1
addq.l #7,%d1
and.l #0xFFFFFFFC,%d1 /* %d1 = first long bound + 4 */
cmp.l %d1,%a1 /* at least one aligned longword to fill? */
blo.b .no_longs /* no, jump directly to byte loop */
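/* example: %a0 = 0x1001: %d1 = (0x1001 + 7) & ~3 = 0x1008, i.e. the
 * first long bound (0x1004) plus 4, so the long path is taken only
 * when at least one whole longword fits above that bound */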
and.l #0xFF,%d0 /* start: spread data to all 4 bytes */
move.l %d0,%d1
lsl.l #8,%d1
or.l %d1,%d0 /* data now in 2 lower bytes of %d0 */
move.l %d0,%d1
swap %d0
or.l %d1,%d0 /* data now in all 4 bytes of %d0 */
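/* example: %d0 = 0x000000AB -> 0x0000ABAB -> 0xABABABAB */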
move.l %a1,%d1
and.l #0xFFFFFFFC,%d1 /* %d1 = last long bound */
cmp.l %d1,%a1 /* any bytes to set? */
bls.b .end_b1 /* no: skip byte loop */
/* leading byte loop: sets 0..3 bytes */
.loop_b1:
move.b %d0,-(%a1) /* store byte */
cmp.l %d1,%a1 /* runs %a1 down to last long bound */
bhi.b .loop_b1
.end_b1:
moveq.l #31,%d1
add.l %a0,%d1
and.l #0xFFFFFFF0,%d1 /* %d1 = first line bound + 16 */
cmp.l %d1,%a1 /* at least one full line to fill? */
blo.b .no_lines /* no, jump to longword loop */
    move.l  %a1,%d1
and.l #0xFFFFFFF0,%d1 /* %d1 = last line bound */
cmp.l %d1,%a1 /* any longwords to set? */
bls.b .end_l1 /* no: skip longword loop */
/* leading longword loop: sets 0..3 longwords */
.loop_l1:
move.l %d0,-(%a1) /* store longword */
cmp.l %d1,%a1 /* runs %a1 down to last line bound */
bhi.b .loop_l1
.end_l1:
move.l %d2,-(%sp) /* free some registers */
move.l %d3,-(%sp)
move.l %d0,%d1 /* spread data to 4 data registers */
move.l %d0,%d2
move.l %d0,%d3
    lea.l   (15,%a0),%a0 /* start address += 15, account for trailing data */
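/* with %a0 = start + 15, the bhi below stops the line loop as soon as
 * %a1 comes within 15 bytes of the start, leaving at most 3 longwords
 * plus up to 3 bytes for the trailing loops */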
/* main loop: set whole lines utilising burst mode */
.loop_line:
lea.l (-16,%a1),%a1 /* pre-decrement */
movem.l %d0-%d3,(%a1) /* store line */
cmp.l %a0,%a1 /* runs %a1 down to first line bound */
bhi.b .loop_line
lea.l (-15,%a0),%a0 /* correct start address */
move.l (%sp)+,%d3 /* restore registers */
move.l (%sp)+,%d2
move.l %a0,%d1 /* %d1 = start address ... */
addq.l #3,%d1 /* ... +3, account for possible trailing bytes */
    cmp.l   %d1,%a1      /* any longwords left? */
bhi.b .loop_l2 /* yes: jump to longword loop */
bra.b .no_longs /* no: skip loop */
.no_lines:
move.l %a0,%d1 /* %d1 = start address ... */
addq.l #3,%d1 /* ... +3, account for possible trailing bytes */
/* trailing longword loop */
.loop_l2:
move.l %d0,-(%a1) /* store longword */
cmp.l %d1,%a1 /* runs %a1 down to first long bound */
bhi.b .loop_l2
.no_longs:
cmp.l %a0,%a1 /* any bytes left? */
bls.b .end_b2 /* no: skip loop */
/* trailing byte loop */
.loop_b2:
move.b %d0,-(%a1) /* store byte */
cmp.l %a0,%a1 /* runs %a1 down to start address */
bhi.b .loop_b2
.end_b2:
move.l %a0,%d0 /* return start address */
rts
.end:
.size memset,.end-memset
#elif defined(CPU_ARM)
/* The following code is taken from the Linux kernel version 2.6.15.3
* linux/arch/arm/lib/memset.S
*
* Copyright (C) 1995-2000 Russell King
*/
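/* Strategy of the code below: the byte loop at 1: first aligns r0 to a
 * word boundary, the two orr instructions spread the fill byte across
 * r1, stm stores then write 64/32/16 bytes per pass from four pattern
 * registers (r1, r3, ip, lr), and conditional str/strb instructions
 * handle the remaining tail. Note that, unlike the SH1 and ColdFire
 * versions above, this one fills upwards and advances r0, so the value
 * returned in r0 is the end of the region, not the ANSI start address. */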
@ .word 0
1: subs r2, r2, #4 @ 1 do we have enough
blt 5f @ 1 bytes to align with?
cmp r3, #2 @ 1
strltb r1, [r0], #1 @ 1
strleb r1, [r0], #1 @ 1
strb r1, [r0], #1 @ 1
add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3))
/*
 * The pointer is now aligned and the length is adjusted. Try doing the
 * memset again.
*/
.global memset
.type memset,%function
memset:
ands r3, r0, #3 @ 1 unaligned?
bne 1b @ 1
/*
* we know that the pointer in r0 is aligned to a word boundary.
*/
orr r1, r1, r1, lsl #8
orr r1, r1, r1, lsl #16
mov r3, r1
cmp r2, #16
blt 4f
/*
* We need an extra register for this loop - save the return address and
* use the LR
*/
str lr, [sp, #-4]!
mov ip, r1
mov lr, r1
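@ r1, r3, ip and lr now all hold the fill pattern, so each stmia below
@ writes 16 bytes, four stores per 64-byte pass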
2: subs r2, r2, #64
stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time.
stmgeia r0!, {r1, r3, ip, lr}
stmgeia r0!, {r1, r3, ip, lr}
stmgeia r0!, {r1, r3, ip, lr}
bgt 2b
ldmeqfd sp!, {pc} @ Now <64 bytes to go.
/*
* No need to correct the count; we're only testing bits from now on
*/
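@ example: for 100 bytes the loop above stores 64 and leaves r2 = -28;
@ its low bits (-28 & 63 = 36 = 32 + 4) still select the right tail:
@ 32 bytes here, then 4 bytes at the strne below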
tst r2, #32
stmneia r0!, {r1, r3, ip, lr}
stmneia r0!, {r1, r3, ip, lr}
tst r2, #16
stmneia r0!, {r1, r3, ip, lr}
ldr lr, [sp], #4
4: tst r2, #8
stmneia r0!, {r1, r3}
tst r2, #4
strne r1, [r0], #4
/*
 * When we get here, we have fewer than 4 bytes to set. We
* may have an unaligned pointer as well.
*/
5: tst r2, #2
strneb r1, [r0], #1
strneb r1, [r0], #1
tst r2, #1
strneb r1, [r0], #1
mov pc, lr
.end:
.size memset,.end-memset
#endif