1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
|
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2008 by Jens Arnold
* Copyright (C) 2009 by Andrew Mahone
*
* Optimised unsigned integer division for ARMv4
*
* Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
* Developer's Guide
* Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
* Free Software Foundation, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
/* Codecs should not normally do this, but we need to check a macro, and
* codecs.h would confuse the assembler. */
/* Section selection: when USE_IRAM is defined the routine is emitted into the
   .icode section (presumably the in-RAM code section -- confirm against the
   linker script) and DIV_RECIP is defined, which enables the reciprocal
   table fast path further down. Otherwise the code goes into .text and
   DIV_RECIP stays undefined. */
#ifdef USE_IRAM
#define DIV_RECIP
.section .icode,"ax",%progbits
#else
.text
#endif
.align
/* Export udiv32_arm and mark the symbol as a function. */
.global udiv32_arm
.type udiv32_arm,%function
#if ARM_ARCH < 5
/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
for dividing a 30-bit value by a 15-bit value, with two operations per
iteration by storing quotient and remainder together and adding the previous
quotient bit during trial subtraction. Modified to work with any dividend
and divisor both less than 1 << 30, and skipping trials by calculating bits
in output.

Macro arguments (all are registers):
  dividend   in:      value to divide; the code in this file only expands the
                      macro on a path where bit 31 of it is clear
  divisor    in:      the NEGATED divisor (the caller in this file passes
                      0 - divisor); overwritten with its shifted form
  result     scratch: packed remainder (upper part) and quotient (lower part)
  bits       scratch: number of quotient bits generated, 1 + alignment shift
  curbit     scratch: number of unrolled steps to skip
  quotient   out:     quotient
  remainder  out:     remainder
quotient and remainder are written only after dividend and divisor have been
consumed, so they may name the same registers as the inputs (the invocation
in this file does exactly that). The macro has no return instruction; control
falls out of its end. */
.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
/* At least one quotient bit is always produced. */
mov \bits, #1
/* Shift the divisor left until it aligns with the numerator. If it already
has the high bit set, this is fine, everything inside .rept will be
skipped, and the add before and adcs after will set the one-bit result
to zero. */
/* Binary search over shift amounts 16, 8, 4, 2, 1. Each cmn adds the negated
divisor to the dividend shifted right by the step; a carry out means the
divisor, shifted left by that step, still does not exceed the dividend, so
the shift is applied and counted in bits. */
cmn \divisor, \dividend, lsr #16
movcs \divisor, \divisor, lsl #16
addcs \bits, \bits, #16
cmn \divisor, \dividend, lsr #8
movcs \divisor, \divisor, lsl #8
addcs \bits, \bits, #8
cmn \divisor, \dividend, lsr #4
movcs \divisor, \divisor, lsl #4
addcs \bits, \bits, #4
cmn \divisor, \dividend, lsr #2
movcs \divisor, \divisor, lsl #2
addcs \bits, \bits, #2
cmn \divisor, \dividend, lsr #1
movcs \divisor, \divisor, lsl #1
addcs \bits, \bits, #1
/* First trial subtraction (an add, because the divisor is negated). The
carry flag is the first quotient bit; if it is clear the subtraction is
undone. */
adds \result, \dividend, \divisor
subcc \result, \result, \divisor
/* Computed jump into the unrolled block below. pc reads as the address of
the add plus 8, which is the first unrolled step, and every step is two
4-byte instructions, so curbit = 31 - bits steps are skipped and the
remaining bits - 1 steps run. Neither the rsb nor the add touches the
flags, so the carry from the trial above is still live at the landing
point. The nop only fills the slot between the add and the first step; for
any curbit >= 0 it is jumped over. */
rsb \curbit, \bits, #31
add pc, pc, \curbit, lsl #3
nop
.rept 30
/* result = 2 * result + negated divisor + previous quotient bit; the carry
out is the new quotient bit. */
adcs \result, \divisor, \result, lsl #1
/* Fix the remainder portion of the result. This must be done because the
handler for 32-bit numerators needs the remainder. */
subcc \result, \result, \divisor
.endr
/* Shift remainder/quotient left one, add final quotient bit */
adc \result, \result, \result
/* result now holds the quotient in its low bits positions and the remainder
above them: split the two halves apart. */
mov \remainder, \result, lsr \bits
eor \quotient, \result, \remainder, lsl \bits
.endm
/* recip_max is the largest divisor handled by the reciprocal-table fast path
   of udiv32_arm; it is used there as a cmp immediate and it also determines
   how many table words are emitted. The value is selected per target CPU.
   On a target that matches none of the cases below the symbol is left
   undefined. */
#if defined(CPU_PP) && (CONFIG_CPU == PP5020)
.equ recip_max, 8384
#elif defined(CPU_PP) && (CONFIG_CPU == PP5002)
.equ recip_max, 4608
#elif defined(CPU_PP)
/* Any other PortalPlayer-class target (CPU_PP defined). */
.equ recip_max, 16384
#elif CONFIG_CPU == AS3525
.equ recip_max, 42752
#elif CONFIG_CPU == S5L8701
.equ recip_max, 12800
#elif CONFIG_CPU == S5L8700
.equ recip_max, 9088
#endif
/* udiv32_arm (variant assembled when ARM_ARCH < 5): unsigned 32-bit divide.
   In:  r0 = numerator, r1 = divisor
   Out: r0 = quotient
   Registers written besides r0: r1, r2, r3, ip (and flags).
   A zero divisor is routed to .L_div0.
   Paths:
     - DIV_RECIP only: divisors 0..2 are handled at .L_udiv_tiny, divisors
       3..recip_max by a table reciprocal multiply;
     - everything else by shift-and-subtract long division at .L_udiv, with
       a wrapper at .L_udiv32 for numerators that have bit 31 set. */
udiv32_arm:
#ifdef DIV_RECIP
/* Divisors 0, 1 and 2 get their own tiny path. */
cmp r1, #3
bcc .L_udiv_tiny
/* Divisors above recip_max have no table entry: use long division. */
cmp r1, #recip_max
bhi .L_udiv
/* The first table word belongs to divisor 3, so the base is biased back by
   3 words (12 bytes) and r1 * 4 indexes it directly. */
adr r3, .L_udiv_recip_table-12
ldr r2, [r3, r1, lsl #2]
/* Keep the numerator for the check below; r0 receives the high word of
   reciprocal * numerator, which is the quotient estimate. */
mov r3, r0
umull ip, r0, r2, r0
/* If estimate * divisor does not exceed the numerator the estimate is
   returned as is; otherwise it is reduced by one. */
mul r2, r0, r1
cmp r3, r2
bxcs lr
sub r0, r0, #1
bx lr
.L_udiv_tiny:
/* Compare against 1: HI means divisor == 2 (halve the numerator), CS means
   divisor >= 1 (return; for divisor == 1 r0 is already the quotient). The
   conditional mov does not alter the flags. What is left is divisor == 0. */
cmp r1, #1
movhi r0, r0, lsr #1
bxcs lr
b .L_div0
#endif
.L_udiv:
/* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
and add the next bit of the result. The correction code at .L_udiv32
does not need the divisor inverted, but can be modified to work with it,
and this allows the zero divisor test to be done early and without an
explicit comparison. */
rsbs r1, r1, #0
/* With DIV_RECIP this label is only reached for divisors above recip_max,
   so the zero test is needed only in the build without it. */
#ifndef DIV_RECIP
beq .L_div0
#endif
tst r0, r0
/* High bit must be unset, otherwise shift numerator right, calculate,
and correct results. As this case is very uncommon we want to avoid
any other delays on the main path in handling it, so the long divide
calls the short divide as a function. */
bmi .L_udiv32
.L_udiv31:
/* dividend = r0, negated divisor = r1, scratch = r2, r3, ip;
   quotient comes back in r0 and remainder in r1. */
ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
bx lr
.L_udiv32:
/* store original numerator and divisor, we'll need them to correct the
result, */
/* No writeback: the three words are stored just below sp and sp itself is
   not moved. r1 has already been negated at this point, so the saved
   "divisor" is the negated one.
   NOTE(review): this relies on nothing else writing below sp until the
   ldmdb; the division body itself uses no stack. */
stmdb sp, { r0, r1, lr }
/* Call __div0 here if divisor is zero, otherwise it would report the wrong
address. */
/* NOTE(review): the comment above appears stale -- there is no zero check
   or __div0 call at this point; a zero divisor has already been diverted
   before .L_udiv32 can be reached. */
/* Halve the numerator so that bit 31 is clear, then run the 31-bit divide as
   a subroutine (it returns through its own bx lr). */
mov r0, r0, lsr #1
bl .L_udiv31
/* r2 = original numerator, r3 = negated divisor, lr = our return address. */
ldmdb sp, { r2, r3, lr }
/* Move the low bit of the original numerator to the carry bit */
movs r2, r2, lsr #1
/* Shift the remainder left one and add in the carry bit */
adc r1, r1, r1
/* Subtract the original divisor from the remainder, setting carry if the
result is non-negative */
/* (Done as an add of the negated divisor in r3. r1 is not restored when the
   result is negative, so r1 is not a valid remainder on return from this
   path.) */
adds r1, r1, r3
/* Shift quotient left one and add carry bit */
adc r0, r0, r0
bx lr
/* Division-by-zero exit: pushes the caller's return address, sets r0 to 0
   and transfers control to __div0. */
.L_div0:
/* __div0 expects the calling address on the top of the stack */
stmdb sp!, { lr }
mov r0, #0
#if defined(__ARM_EABI__) || !defined(USE_IRAM)
/* NOTE(review): no return sequence follows this bl, so __div0 is presumably
   not expected to return -- confirm. */
bl __div0
#else
/* Jump through a literal: pc reads as this instruction + 8, so [pc, #-4] is
   the .word on the next line, which holds the absolute address of __div0.
   Unlike the bl above this does not change lr. Presumably used because
   __div0 can be out of bl range from this section -- confirm. */
ldr pc, [pc, #-4]
.word __div0
#endif
#ifdef DIV_RECIP
/* Reciprocal table for the fast path of udiv32_arm, generated at assembly
   time: one 32-bit word for every divisor from 3 up to recip_max inclusive
   (recip_max - 2 words).
   For a divisor that is not a power of two the word is
   floor(2^32 / div) + 1. It is built by starting from 2^30 / div and doing
   two restoring long-division steps by hand (q and r track quotient and
   remainder), presumably so that no intermediate value needs more than 32
   bits -- confirm against the assembler's expression width.
   For a power of two the word is (2^30 / div) * 4, i.e. exactly 2^32 / div,
   without the + 1. */
.L_udiv_recip_table:
.set div, 3
.rept recip_max - 2
/* (div - 1) & div is non-zero exactly when div is not a power of two. */
.if (div - 1) & div
.set q, 0x40000000 / div
.set r, (0x40000000 - (q * div))<<1
.set q, q << 1
.if r >= div
.set q, q + 1
.set r, r - div
.endif
.set r, r << 1
.set q, q << 1
.if r >= div
.set q, q + 1
.set r, r - div
.endif
.set q, q + 1
.else
.set q, 0x40000000 / div * 4
.endif
.word q
.set div, div+1
.endr
#endif
/* Symbol size covers the code and, when present, the table above.
   NOTE(review): an identical .size directive follows the #endif that closes
   the ARM_ARCH conditional, so in this variant the size is set twice to the
   same value. */
.size udiv32_arm, . - udiv32_arm
#else
/* Unsigned 32-bit division body for ARMv5 and later (uses clz and the
   DSP/media multiplies).
   Arguments (registers unless noted):
     numerator  in:      value to divide; clobbered
     divisor    in:      divisor; clobbered (reused for the reciprocal
                         estimate and then for the remainder check)
     quotient   out:     quotient; it is only written after the last read of
                         numerator, so it may name the same register (the
                         invocation in this file does that)
     bits       scratch
     inv        scratch
     neg        scratch
     div0label  optional label that is branched to when the divisor is zero.
                When it is omitted no branch is emitted and a zero divisor
                falls into the power-of-two shift path (which, given ARM
                register-shift semantics for counts above 32, should yield a
                quotient of 0 -- confirm before relying on it).
   Requires the 64-entry byte table .L_udiv_est_table to be defined in the
   same section within byte-load offset range.
   Every path ends in bx lr or in the branch to div0label; control never
   falls out of the end of the macro. Local numeric labels used: 10, 20, 30
   and, for ARM_ARCH >= 6, 40. */
.macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label
cmp \numerator, \divisor
clz \bits, \divisor
/* numerator < divisor: the quotient is zero. */
bcc 30f
/* Normalise the divisor so that its highest set bit lands in bit 31. A zero
   divisor has clz = 32 and normalises to zero. */
mov \inv, \divisor, lsl \bits
/* neg = pc + top seven bits of the normalised divisor. It is only the base
   of the byte load two instructions below: pc reads as the address of that
   load, which the "-." in its offset cancels, and the "-64" removes the
   contribution of the always-set top bit, leaving an index of 0..63 into
   .L_udiv_est_table. */
add \neg, pc, \inv, lsr #25
/* EQ: the normalised divisor is exactly 1 << 31, so the divisor is a power
   of two. LO: it is zero, so the divisor is zero. Both cases go to 20 and
   skip the table load. */
cmp \inv, #1<<31
ldrhib \inv, [\neg, #.L_udiv_est_table-.-64]
bls 20f
/* bits = clz - 7. A negative result means the divisor has fewer than seven
   leading zeros; that case is handled at 10. The flags set here stay live
   across the next two instructions. */
subs \bits, \bits, #7
/* neg = -divisor, used to form remainders with multiply-accumulate. */
rsb \neg, \divisor, #0
/* Scale the 8-bit table estimate into the initial reciprocal estimate. */
movpl \divisor, \inv, lsl \bits
bmi 10f
/* Refine the estimate (this looks like Newton-Raphson style reciprocal
   refinement): inv = estimate * -divisor is the error term, which is then
   folded back into the estimate. Done twice. */
mul \inv, \divisor, \neg
smlawt \divisor, \divisor, \inv, \divisor
mul \inv, \divisor, \neg
/* This will save a cycle on ARMv6, but does not produce a correct result
if numerator sign bit is set. This case accounts for about 1 in 10^7 of
divisions, done by the APE decoder, so we specialize for the more common
case and handle the uncommon large-numerator separately */
#if ARM_ARCH >= 6
tst \numerator, \numerator
smmla \divisor, \divisor, \inv, \divisor
/* Numerator has bit 31 set: use the unsigned multiply at 40 instead. */
bmi 40f
/* inv = high word of numerator * reciprocal = quotient estimate. */
smmul \inv, \numerator, \divisor
#else
/* Second refinement with a 64-bit accumulate whose low word starts at zero,
   then inv = high word of numerator * reciprocal = quotient estimate. */
mov \bits, #0
smlal \bits, \divisor, \inv, \divisor
umull \bits, \inv, \numerator, \divisor
#endif
/* divisor = numerator - (estimate + 1) * original divisor. The cmn then
   subtracts the divisor once more; the flags tell whether the estimate has
   to be raised by 0, 1 or 2. */
add \numerator, \numerator, \neg
mla \divisor, \inv, \neg, \numerator
mov \quotient, \inv
cmn \divisor, \neg
addcc \quotient, \quotient, #1
addpl \quotient, \quotient, #2
bx lr
10:
/* Divisor with fewer than seven leading zeros. bits becomes 7 - clz; the
   table estimate is lowered by 4 and shifted right instead of left, and no
   refinement rounds are run. */
rsb \bits, \bits, #0
sub \inv, \inv, #4
mov \divisor, \inv, lsr \bits
/* inv = quotient estimate, divisor = numerator - estimate * divisor. */
umull \bits, \inv, \numerator, \divisor
mla \divisor, \inv, \neg, \numerator
mov \quotient, \inv
/* Remainder at least twice the divisor: take two more off. */
cmn \neg, \divisor, lsr #1
addcs \divisor, \divisor, \neg, lsl #1
addcs \quotient, \quotient, #2
/* Remainder still at least the divisor: take one more off. */
cmn \neg, \divisor
addcs \quotient, \quotient, #1
bx lr
20:
/* Reached with the flags of the compare against 1 << 31 intact: EQ for a
   power-of-two divisor, NE for a zero divisor.
   bits = 31 - clz(divisor), which is log2 of a power-of-two divisor. The
   shift below needs this value whether or not a div0label was supplied, so
   the conversion is done unconditionally; only the branch depends on the
   label. (It used to sit inside the conditional, which made the shift count
   wrong whenever the macro was expanded without a div0label.) rsb does not
   alter the flags. */
rsb \bits, \bits, #31
.ifnc "", "\div0label"
bne \div0label
.endif
mov \quotient, \numerator, lsr \bits
bx lr
30:
/* numerator < divisor */
mov \quotient, #0
bx lr
#if ARM_ARCH >= 6
40:
/* Numerator with bit 31 set: same as the common path, but the quotient
   estimate is formed with an unsigned multiply. */
umull \bits, \inv, \numerator, \divisor
add \numerator, \numerator, \neg
mla \divisor, \inv, \neg, \numerator
mov \quotient, \inv
cmn \divisor, \neg
addcc \quotient, \quotient, #1
addpl \quotient, \quotient, #2
bx lr
#endif
.endm
/* udiv32_arm (variant assembled when ARM_ARCH >= 5): unsigned 32-bit divide.
   In:  r0 = numerator, r1 = divisor
   Out: r0 = quotient
   Registers written besides r0: r1, r2, r3, ip (and flags).
   The macro returns with bx lr on every path or branches to .L_div0 for a
   zero divisor, so execution never falls from the macro into .L_div0. */
udiv32_arm:
ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0
/* Division-by-zero exit: pushes the caller's return address, sets r0 to 0
   and transfers control to __div0. */
.L_div0:
/* __div0 expects the calling address on the top of the stack */
stmdb sp!, { lr }
mov r0, #0
#if defined(__ARM_EABI__) || !defined(USE_IRAM)
/* NOTE(review): only table data follows this bl, so __div0 is presumably
   not expected to return -- confirm. */
bl __div0
#else
/* Jump through a literal: pc reads as this instruction + 8, so [pc, #-4] is
   the .word on the next line, which holds the absolute address of __div0.
   Unlike the bl above this does not change lr. */
ldr pc, [pc, #-4]
.word __div0
#endif
/* Initial reciprocal estimates for ARMV5_UDIV32_BODY: 64 one-byte entries,
   indexed by the top seven bits of the normalised divisor minus 64 (the top
   bit is always set, so the index runs 0..63). The values fall from 0xff to
   0x81 and appear to approximate 2^14 / (64 + index) -- confirm before
   regenerating. The table must stay within byte-load offset range of the
   macro expansion that reads it. */
.L_udiv_est_table:
.byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6
.byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
.byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
.byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
.byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
.byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
.byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
.byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
#endif
/* Record the size of udiv32_arm: everything from its label up to this point,
   including any table emitted after the code. This directive sits outside
   the ARM_ARCH conditional, so it applies to whichever variant was
   assembled. */
.size udiv32_arm, . - udiv32_arm
|