1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
|
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* JPEG assembly IDCT
*
* Copyright (C) 2009 Andrew Mahone asm versions of the C IDCT algorithms used
* jpeg_load.c with
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
/* All routines in this file are placed in the code section. On ARM targets
   GNU as treats the .align argument as a power of two, so ".align 2" requests
   4-byte alignment. */
.section .text
.align 2
/* Each entry point is exported (.global) and marked as a function symbol
   (.type ..., %function); the matching .size directive follows each routine
   body further down. */
.global jpeg_idct1h
.type jpeg_idct1h, %function
.global jpeg_idct2v
.type jpeg_idct2v, %function
.global jpeg_idct2h
.type jpeg_idct2h, %function
.global jpeg_idct4v
.type jpeg_idct4v, %function
.global jpeg_idct4h
.type jpeg_idct4h, %function
jpeg_idct1h:
/* 1-wide horizontal pass: every coefficient row yields exactly one output
   byte, computed as clamp((row[0] + 4112) >> 5) with clamp() limiting the
   value to 0..255.

   Register use, as read from the code below:
     r0  = pointer to the current coefficient row (signed halfwords);
           advanced by 16 bytes per iteration
     r1  = pointer to the current output byte; advanced by r3 per iteration
     r2  = end pointer for r0; the loop repeats while r0 < r2 (unsigned)
     r3  = byte step between successive output positions
     r12 = scratch
   The body always runs at least once and no stack is used.

   The bias 4112 is added as 16 + 4096 because each part fits in an ARM
   immediate. In the common case of one pass through the loop, the extra add
   should be cheaper than saving registers to the stack and loading the value
   4112. */
.Lidct1h_next_row:
    ldrsh   r12, [r0], #16          /* r12 = row[0]; r0 -> next row */
    add     r12, r12, #16
    add     r12, r12, #4096         /* r12 += 4112 in two encodable steps */
#if ARM_ARCH < 6
    mov     r12, r12, asr #5
    cmp     r12, #255
    /* Unsigned "higher" is true for negative values as well as for values
       above 255. ~(x >> 31) is 0 for negative x and all-ones otherwise, whose
       low byte (the part strb stores) is 255. */
    mvnhi   r12, r12, asr #31
#else
    usat    r12, #8,  r12, asr #5   /* shift and saturate to 0..255 at once */
#endif
    strb    r12, [r1], r3           /* store the byte; r1 += r3 */
    cmp     r0,  r2
    bcc     .Lidct1h_next_row
    bx      lr
.size jpeg_idct1h, .-jpeg_idct1h
jpeg_idct2v:
/* 2-point vertical pass, done in place on 16-bit coefficients. Two rows that
   lie 16 bytes apart are replaced by their butterfly, column by column:
       new_row0[x] = row0[x] + row1[x]
       new_row1[x] = row0[x] - row1[x]
   Each iteration processes one 32-bit word, i.e. two adjacent columns.

   Register use, as read from the code below:
     r0 = pointer to the current column pair in row 0; advanced by 4 bytes
     r1 = end pointer for r0; the loop repeats while r0 < r1 (unsigned)
   The body always runs at least once. */
#if ARM_ARCH < 6
/* No partitioned add/subtract exists before ARMv6, so it is emulated with
   SWAR arithmetic on the packed word. Bit 15 is cleared (add) or forced
   (subtract) so that no carry or borrow crosses from the low halfword into
   the high one, and the true bit 15 of each result is then patched back in
   from bit 15 of (a ^ b). This is slightly faster than loading two values in
   each register and using shifts and strh, and needs fewer fixup operations
   than splitting the values, calculating, and merging. */
    stmdb   sp!, { r4, lr }
.Lidct2v_swar_pair:
    ldr     r2,  [r0]               /* a = two columns of row 0 */
    ldr     r3,  [r0, #16]          /* b = two columns of row 1 */
    eor     r12, r2,  r3
    bic     r4,  r2,  #0x8000       /* a with bit 15 cleared */
    and     r12, r12, #0x8000       /* r12 = bit 15 of (a ^ b) */
    bic     r3,  r3,  #0x8000       /* b with bit 15 cleared */
    orr     r2,  r2,  #0x8000       /* a with bit 15 set: absorbs any borrow */
    add     r4,  r4,  r3
    sub     r2,  r2,  r3
    eor     r4,  r4,  r12           /* r4 = packed a + b */
    eor     r2,  r2,  r12
    eor     r2,  r2,  #0x8000       /* r2 = packed a - b */
    str     r2,  [r0, #16]
    str     r4,  [r0], #4           /* store the sums; r0 -> next column pair */
    cmp     r0,  r1
    bcc     .Lidct2v_swar_pair
    ldmia   sp!, { r4, pc }
#else
/* ARMv6 offers partitioned adds and subtracts, so both columns of a word are
   handled by a single sadd16 / ssub16 pair. */
.Lidct2v_simd_pair:
    ldr     r3,  [r0, #16]          /* b = two columns of row 1 */
    ldr     r2,  [r0]               /* a = two columns of row 0 */
    sadd16  r12, r2,  r3            /* r12 = packed a + b */
    ssub16  r2,  r2,  r3            /* r2  = packed a - b */
    str     r2,  [r0, #16]
    str     r12, [r0], #4           /* store the sums; r0 -> next column pair */
    cmp     r0,  r1
    bcc     .Lidct2v_simd_pair
    bx      lr
#endif
.size jpeg_idct2v, .-jpeg_idct2v
jpeg_idct2h:
/* 2-point horizontal pass with final descaling: each coefficient row (d0, d1)
   yields two output bytes
       o0 = clamp((d0 + 4112 + d1) >> 5)
       o1 = clamp((d0 + 4112 - d1) >> 5)
   with clamp() limiting the value to 0..255.

   Register use, as read from the code below:
     r0 = pointer to the current coefficient row (signed halfwords);
          advanced by 16 bytes per iteration
     r1 = pointer to o0 of the current row; advanced by r3 per iteration
     r2 = end pointer for r0; the loop repeats while r0 < r2 (unsigned)
     r3 = byte step between output rows
   o1 is written 4 bytes after o0 when HAVE_LCD_COLOR is defined and 1 byte
   after it otherwise. The body always runs at least once. */
#if ARM_ARCH < 6
/* Using LDR and shifts here would cost two more ops, and is no faster since
   the results cannot be stored merged. */
    stmdb   sp!, { r4-r5, lr }
    ldr     r14, =4112
.Lidct2h_row:
    ldrsh   r4,  [r0, #2]           /* r4  = d1 */
    ldrsh   r12, [r0], #16          /* r12 = d0; r0 -> next row */
    add     r12, r12, r14           /* bias d0 */
    add     r5,  r12, r4            /* r5 = d0 + d1 */
    sub     r4,  r12, r4            /* r4 = d0 - d1 */
    /* Descale and clamp each result in turn. "hi" is an unsigned test, so it
       fires for negative values too; ~(x >> 31) then gives 0 for negative x
       and a low byte of 255 otherwise. */
    mov     r5,  r5,  asr #5
    cmp     r5,  #255
    mvnhi   r5,  r5,  asr #31
    mov     r4,  r4,  asr #5
    cmp     r4,  #255
    mvnhi   r4,  r4,  asr #31
    strb    r5,  [r1]               /* o0 */
#ifdef HAVE_LCD_COLOR
    strb    r4,  [r1, #4]           /* o1 */
#else
    strb    r4,  [r1, #1]           /* o1 */
#endif
    add     r1,  r1,  r3
    cmp     r0,  r2
    bcc     .Lidct2h_row
    ldmia   sp!, { r4-r5, pc }
#else
    stmdb   sp!, { r4, lr }
    ldr     r14, =4112
.Lidct2h_simd_row:
    ldr     r12, [r0], #16          /* r12 = d1:d0 packed; r0 -> next row */
    sadd16  r12, r12, r14           /* bias d0 (the top half of r14 is zero) */
    /* Exchange-and-add/subtract of the word with itself:
       top halfword = d1 + d0, bottom halfword = d0 - d1 */
    saddsubx    r12, r12, r12
    usat    r4,  #8,  r12, asr #21  /* o0: top halfword >> 5, saturated */
    sxth    r12, r12
    usat    r12, #8,  r12, asr #5   /* o1: bottom halfword >> 5, saturated */
    strb    r4,  [r1]               /* o0 */
#ifdef HAVE_LCD_COLOR
    strb    r12, [r1, #4]           /* o1 */
#else
    strb    r12, [r1, #1]           /* o1 */
#endif
    add     r1,  r1,  r3
    cmp     r0,  r2
    bcc     .Lidct2h_simd_row
    ldmia   sp!, { r4, pc }
#endif
.size jpeg_idct2h, .-jpeg_idct2h
jpeg_idct4v:
/* 4-point vertical IDCT pass, performed in place on signed 16-bit
   coefficients. The four inputs of a column, d0..d3, are read from byte
   offsets 0, 16, 32 and 48 relative to r0 and overwritten with o0..o3:
       z1    = (d1 + d3) * 4433 + 1024      (1024 rounds the later >> 11)
       tmp2  = z1 + d1 * 6270
       tmp0  = z1 - d3 * 15137
       tmp10 = (d0 + d2) << 2
       tmp12 = (d0 - d2) << 2
       o0 = tmp10 + (tmp2 >> 11)      o3 = tmp10 - (tmp2 >> 11)
       o1 = tmp12 + (tmp0 >> 11)      o2 = tmp12 - (tmp0 >> 11)

   Register use on entry, as read from the code below:
     r0 = pointer to the current column in the first row
     r1 = end pointer for r0; the loop repeats while r0 < r1 (unsigned)
   The body always runs at least once. Three variants are selected by
   ARM_ARCH: plain ARMv4 (one column per iteration), ARMv5 with the 16x16
   multiply-accumulate instructions (one column per iteration) and ARMv6
   with partitioned arithmetic (two columns per iteration). */
#if ARM_ARCH < 5
stmdb sp!, { r4-r7, lr }
ldr r14, =-15137
ldr r12, =6270
1:
ldrsh r4, [r0, #32]
ldrsh r2, [r0]
ldrsh r5, [r0, #48]
ldrsh r3, [r0, #16]
add r6, r2, r4 /* r6 = tmp10 >> 2 = d0 + d2 */
sub r2, r2, r4 /* r2 = tmp12 >> 2 = d0 - d2 */
add r4, r3, r5 /* r4 = z1 = d1 + d3 */
/* multiply by 4433 with shifts only: x*9, then x*(9*16 - 1) = x*143, then
   x*143*(32 - 1) = x*4433 */
add r7, r4, r4, lsl #3
rsb r4, r4, r7, lsl #4
rsb r4, r4, r4, lsl #5 /* z1 *= 4433 */
add r4, r4, #1024 /* rounding for the asr #11 below */
mla r3, r12, r3, r4 /* r3 = tmp2 = z1 + z2 * 6270 */
mla r5, r14, r5, r4 /* r5 = tmp0 = z1 - z3 * 15137 */
mov r6, r6, lsl #2 /* r6 <<= 2 */
mov r2, r2, lsl #2 /* r2 <<= 2 */
add r7, r6, r3, asr #11 /* r7 = o0 */
sub r3, r6, r3, asr #11 /* r3 = o3 */
add r6, r2, r5, asr #11 /* r6 = o1 */
sub r2, r2, r5, asr #11 /* r2 = o2 */
strh r7, [r0]
strh r3, [r0, #48]
strh r6, [r0, #16]
strh r2, [r0, #32]
add r0, r0, #2 /* next column */
cmp r0, r1
bcc 1b
ldmia sp!, { r4-r7, pc }
#elif ARM_ARCH < 6
stmdb sp!, { r4-r8, lr }
ldr r8, =1024
ldr r14, =4433
/* 3302955134 = 0xC4DF187E packs both remaining multipliers into one register:
   top halfword 0xC4DF = -15137, bottom halfword 0x187E = 6270 */
ldr r12, =3302955134
1:
ldrsh r5, [r0, #48]
ldrsh r3, [r0, #16]
ldrsh r4, [r0, #32]
ldrsh r2, [r0]
add r6, r3, r5 /* r6 = z1 = d1 + d3 */
add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */
smlabb r6, r14, r6, r8 /* z1 = z1 * 4433 + 1024 */
sub r2, r2, r4 /* r2 = tmp12 >> 2 = d0 - d2 */
smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - z3 * 15137 */
mov r7, r7, lsl #2
mov r2, r2, lsl #2
add r4, r7, r3, asr #11 /* r4 = o0 */
sub r7, r7, r3, asr #11 /* r7 = o3 */
add r3, r2, r5, asr #11 /* r3 = o1 */
sub r2, r2, r5, asr #11 /* r2 = o2 */
strh r4, [r0]
strh r7, [r0, #48]
strh r3, [r0, #16]
strh r2, [r0, #32]
add r0, r0, #2 /* next column */
cmp r0, r1
bcc 1b
ldmia sp!, { r4-r8, pc }
#else
stmdb sp!, { r4-r10, lr }
ldr r2, =1024
ldr r3, =4433
/* 3302955134 = 0xC4DF187E: top halfword -15137, bottom halfword 6270 */
ldr r12, =3302955134
1:
/* each word holds the coefficients of two adjacent columns */
ldr r6, [r0, #32]
ldr r4, [r0]
ldr r7, [r0, #48]
ldr r5, [r0, #16]
/* this part is being done in parallel on two columns */
sadd16 r8, r4, r6 /* r8 = d0 + d2 */
ssub16 r4, r4, r6 /* r4 = d0 - d2 */
sadd16 r6, r5, r7 /* r6 = d1 + d3 */
/* there is no parallel shift operation, but we can fake it with bic
and lsl: clearing the top two bits of the low halfword keeps them from
spilling into the high halfword when the word is shifted left by 2 */
bic r8, r8, #0xc000
bic r4, r4, #0xc000
/* multiplication expands values beyond 16 bits, so this part needs to be
split. the values will be merged below so that the rest of the addition
can be done in parallel */
smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */
smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */
mov r8, r8, lsl #2 /* complete the parallel shift started */
mov r4, r4, lsl #2 /* with the earlier bic instructions */
/* tmp2 are in r10, r5; tmp0 are in r14, r6 */
/* tmp10, tmp12 are in r8, r4 */
mov r10, r10, asr #11
mov r14, r14, asr #11
/* lsl #5 places (value >> 11) of the second column in the top halfword,
   which pkhbt combines with the already shifted first column */
pkhbt r5, r10, r5, lsl #5 /* parallel tmp2 */
pkhbt r6, r14, r6, lsl #5 /* parallel tmp0 */
sadd16 r10, r8, r5 /* o0 */
ssub16 r5, r8, r5 /* o3 */
sadd16 r14, r4, r6 /* o1 */
ssub16 r6, r4, r6 /* o2 */
str r10, [r0]
str r5, [r0, #48]
str r14, [r0, #16]
str r6, [r0, #32]
add r0, r0, #4 /* next pair of columns */
cmp r0, r1
bcc 1b
ldmia sp!, { r4-r10, pc }
#endif
.size jpeg_idct4v, .-jpeg_idct4v
jpeg_idct4h:
/* 4-point horizontal IDCT pass with final descaling and clamping: each row of
   four signed 16-bit coefficients d0..d3 produces four output bytes o0..o3.

   Register use on entry, as read from the code below:
     r0 = pointer to the current coefficient row; advanced by 16 bytes per row
     r1 = pointer to o0 of the current row; advanced by r3 per row
     r2 = end pointer for r0; the loop repeats while r0 < r2 (unsigned)
     r3 = byte step between output rows
   The output bytes of one row are 4 bytes apart when HAVE_LCD_COLOR is
   defined and 1 byte apart otherwise. The body always runs at least once.

   Per row:
       z1    = (d1 + d3) * 4433
       tmp2  = z1 + d1 * 6270
       tmp0  = z1 - d3 * 15137
       tmp10 = (d0 + 4112 + d2) << 13
       tmp12 = (d0 + 4112 - d2) << 13
       o0 = clamp((tmp10 + tmp2) >> 18)    o3 = clamp((tmp10 - tmp2) >> 18)
       o1 = clamp((tmp12 + tmp0) >> 18)    o2 = clamp((tmp12 - tmp0) >> 18)
   where clamp() limits the value to 0..255. The bias 4112 is (128 << 5) + 16;
   since d0 is effectively shifted right by 5 overall, it contributes +128
   plus rounding (presumably the JPEG level shift -- confirm against the C
   reference implementation).

   Three variants are selected by ARM_ARCH: plain ARMv4, ARMv5 with the 16x16
   multiply-accumulate instructions, and ARMv6 with partitioned arithmetic
   and saturation. */
#if ARM_ARCH < 5
    stmdb   sp!, { r4-r10, lr }
    ldr     r10, =-15137
    ldr     r14, =4112
    ldr     r12, =6270
1:
    ldrsh   r4,  [r0]
    ldrsh   r6,  [r0, #4]
    ldrsh   r7,  [r0, #6]
    ldrsh   r5,  [r0, #2]
    add     r4,  r4,  r14           /* bias d0 */
    add     r8,  r4,  r6            /* r8 = tmp10 >> 13 = d0 + d2 */
    sub     r4,  r4,  r6            /* r4 = tmp12 >> 13 = d0 - d2 */
    add     r6,  r5,  r7            /* r6 = z1 = d1 + d3 */
    /* multiply by 4433 with shifts only: x*9, then x*(9*16 - 1) = x*143,
       then x*143*(32 - 1) = x*4433 */
    add     r9,  r6,  r6,  lsl #3
    rsb     r6,  r6,  r9,  lsl #4
    rsb     r6,  r6,  r6,  lsl #5   /* z1 *= 4433 */
    mla     r7,  r10, r7,  r6       /* r7 = tmp0 = z1 - z3 * 15137 */
    mla     r5,  r12, r5,  r6       /* r5 = tmp2 = z1 + z2 * 6270 */
    add     r9,  r5,  r8,  lsl #13  /* r9 = o0 */
    rsb     r5,  r5,  r8,  lsl #13  /* r5 = o3 */
    add     r8,  r7,  r4,  lsl #13  /* r8 = o1 */
    rsb     r4,  r7,  r4,  lsl #13  /* r4 = o2 */
    mov     r9,  r9,  asr #18
    mov     r8,  r8,  asr #18
    mov     r4,  r4,  asr #18
    mov     r5,  r5,  asr #18
    /* Clamp to 0..255: the unsigned "hi" test fires for negative values and
       for values above 255; ~(x >> 31) is 0 for negative x and has a low byte
       of 255 otherwise. */
    cmp     r9,  #255
    mvnhi   r9,  r9,  asr #31
    cmp     r8,  #255
    mvnhi   r8,  r8,  asr #31
    cmp     r4,  #255
    mvnhi   r4,  r4,  asr #31
    cmp     r5,  #255
    mvnhi   r5,  r5,  asr #31
#ifdef HAVE_LCD_COLOR
    strb    r9,  [r1]
    strb    r8,  [r1, #4]
    strb    r4,  [r1, #8]
    strb    r5,  [r1, #12]
#else
    strb    r9,  [r1]
    strb    r8,  [r1, #1]
    strb    r4,  [r1, #2]
    strb    r5,  [r1, #3]
#endif
    add     r0,  r0,  #16
    add     r1,  r1,  r3
    cmp     r0,  r2
    bcc     1b
    ldmia   sp!, { r4-r10, pc }
#elif ARM_ARCH < 6
    stmdb   sp!, { r4-r10, lr }
    ldr     r10, =4433
    ldr     r14, =4112
    /* 3302955134 = 0xC4DF187E packs both remaining multipliers into one
       register: top halfword 0xC4DF = -15137, bottom halfword 0x187E = 6270 */
    ldr     r12, =3302955134
1:
    ldrsh   r7,  [r0, #6]
    ldrsh   r5,  [r0, #2]
    ldrsh   r4,  [r0]
    ldrsh   r6,  [r0, #4]
    add     r8,  r5,  r7            /* r8 = z1 = d1 + d3 */
    add     r4,  r4,  r14           /* bias d0 */
    smulbb  r8,  r10, r8            /* z1 *= 4433 */
    add     r9,  r4,  r6            /* r9 = tmp10 >> 13 = d0 + d2 */
    smlabb  r5,  r12, r5,  r8       /* r5 = tmp2 = z1 + z2 * 6270 */
    smlatb  r7,  r12, r7,  r8       /* r7 = tmp0 = z1 - z3 * 15137 */
    /* d0 lives in r4; r5 already holds tmp2 at this point and must not be
       used as the minuend here. */
    sub     r4,  r4,  r6            /* r4 = tmp12 >> 13 = d0 - d2 */
    add     r6,  r5,  r9,  lsl #13  /* r6 = o0 */
    rsb     r9,  r5,  r9,  lsl #13  /* r9 = o3 */
    add     r5,  r7,  r4,  lsl #13  /* r5 = o1 */
    rsb     r4,  r7,  r4,  lsl #13  /* r4 = o2 */
    mov     r6,  r6,  asr #18
    mov     r5,  r5,  asr #18
    mov     r4,  r4,  asr #18
    mov     r9,  r9,  asr #18
    /* Clamp to 0..255, same idiom as in the ARMv4 variant. */
    cmp     r6,  #255
    mvnhi   r6,  r6,  asr #31
    cmp     r5,  #255
    mvnhi   r5,  r5,  asr #31
    cmp     r4,  #255
    mvnhi   r4,  r4,  asr #31
    cmp     r9,  #255
    mvnhi   r9,  r9,  asr #31
#ifdef HAVE_LCD_COLOR
    strb    r6,  [r1]
    strb    r5,  [r1, #4]
    strb    r4,  [r1, #8]
    strb    r9,  [r1, #12]
#else
    strb    r6,  [r1]
    strb    r5,  [r1, #1]
    strb    r4,  [r1, #2]
    strb    r9,  [r1, #3]
#endif
    add     r0,  r0,  #16
    add     r1,  r1,  r3
    cmp     r0,  r2
    bcc     1b
    ldmia   sp!, { r4-r10, pc }
#else
    stmdb   sp!, { r4-r9, lr }
    ldr     r9,  =4433
    ldr     r14, =4112
    /* 3302955134 = 0xC4DF187E: top halfword -15137, bottom halfword 6270 */
    ldr     r12, =3302955134
1:
    ldmia   r0,  { r4-r5 }          /* r4 = d1:d0, r5 = d3:d2 (hi:lo) */
    sadd16  r4,  r4,  r14           /* bias d0 (the top half of r14 is zero) */
    sadd16  r6,  r4,  r5            /* r6lo = d0 + d2, r6hi = d1 + d3 */
    ssub16  r7,  r4,  r5            /* r7lo = d0 - d2 */
    smulbt  r8,  r9,  r6            /* r8 = z1 = (d1 + d3) * 4433 */
    sxth    r6,  r6                 /* r6 = tmp10 >> 13 */
    smlabt  r4,  r12, r4,  r8       /* r4 = tmp2 = z1 + z2 * 6270 */
    smlatt  r5,  r12, r5,  r8       /* r5 = tmp0 = z1 - z3 * 15137 */
    sxth    r7,  r7                 /* r7 = tmp12 >> 13 */
    add     r8,  r4,  r6,  lsl #13  /* r8 = o0 */
    rsb     r6,  r4,  r6,  lsl #13  /* r6 = o3 */
    add     r4,  r5,  r7,  lsl #13  /* r4 = o1 */
    rsb     r5,  r5,  r7,  lsl #13  /* r5 = o2 */
    /* descale and saturate to 0..255 in one instruction each */
    usat    r8,  #8,  r8,  asr #18
    usat    r6,  #8,  r6,  asr #18
    usat    r4,  #8,  r4,  asr #18
    usat    r5,  #8,  r5,  asr #18
#ifdef HAVE_LCD_COLOR
    strb    r8,  [r1]
    strb    r6,  [r1, #12]
    strb    r4,  [r1, #4]
    strb    r5,  [r1, #8]
#else
    strb    r8,  [r1]
    strb    r6,  [r1, #3]
    strb    r4,  [r1, #1]
    strb    r5,  [r1, #2]
#endif
    add     r0,  r0,  #16
    add     r1,  r1,  r3
    cmp     r0,  r2
    bcc     1b
    ldmia   sp!, { r4-r9, pc }
#endif
.size jpeg_idct4h, .-jpeg_idct4h
|