summary | refs | log | tree | commit | diff
path: root/apps/codecs/libmusepack
diff options
context:
space:
mode:
author Nils Wallménius <nils@rockbox.org> 2010-07-20 23:35:07 +0000
committer Nils Wallménius <nils@rockbox.org> 2010-07-20 23:35:07 +0000
commit 4f5b390a6df9733b46e254a7e367e066a80ccb9b (patch)
tree b9e8696d7cb431ca739c9c3017189241eca39a84 /apps/codecs/libmusepack
parent f32294d6abff7c5952b3a0c079a54b53eb42eb40 (diff)
Convert inline coldfire assembler to a 'real' assembler function, with tweaks by Buschel. Speeds up mpc decoding by ~1% on h300.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27504 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/libmusepack')
-rw-r--r-- apps/codecs/libmusepack/SOURCES | 3
-rw-r--r-- apps/codecs/libmusepack/synth_filter.c | 53
-rw-r--r-- apps/codecs/libmusepack/synth_filter_coldfire.S | 78
3 files changed, 90 insertions, 44 deletions
diff --git a/apps/codecs/libmusepack/SOURCES b/apps/codecs/libmusepack/SOURCES
index 31848214e0..60d762afd2 100644
--- a/apps/codecs/libmusepack/SOURCES
+++ b/apps/codecs/libmusepack/SOURCES
@@ -9,3 +9,6 @@ synth_filter.c
#if defined(CPU_ARM)
synth_filter_arm.S
#endif
+#if defined(CPU_COLDFIRE)
+synth_filter_coldfire.S
+#endif
diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c
index 0f415a4838..9a79328106 100644
--- a/apps/codecs/libmusepack/synth_filter.c
+++ b/apps/codecs/libmusepack/synth_filter.c
@@ -472,7 +472,7 @@ mpc_dct32(const MPC_SAMPLE_FORMAT *in, MPC_SAMPLE_FORMAT *v)
/* 31 */ v[17] = -(v[15] = MPC_DCT32_SHIFT((((((((MPC_DCT32_MUL(t171 - t172, costab16) * 2) - t173) * 2) - t174) * 2) - t175) * 2) - t176));
}
-#if defined(CPU_ARM)
+#if defined(CPU_ARM) || defined(CPU_COLDFIRE)
extern void
mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data,
const MPC_SAMPLE_FORMAT * V,
@@ -485,57 +485,22 @@ mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data,
{
mpc_int32_t k;
-#if defined(CPU_COLDFIRE)
- // 64=32x32-multiply assembler for Coldfire
- for ( k = 0; k < 32; k++, D += 16, V++ )
- {
- asm volatile (
- "movem.l (%[D]), %%d0-%%d3 \n\t"
- "move.l (%[V]), %%a5 \n\t"
- "mac.l %%d0, %%a5, (96*4, %[V]), %%a5, %%acc0 \n\t"
- "mac.l %%d1, %%a5, (128*4, %[V]), %%a5, %%acc0\n\t"
- "mac.l %%d2, %%a5, (224*4, %[V]), %%a5, %%acc0\n\t"
- "mac.l %%d3, %%a5, (256*4, %[V]), %%a5, %%acc0\n\t"
- "movem.l (4*4, %[D]), %%d0-%%d3 \n\t"
- "mac.l %%d0, %%a5, (352*4, %[V]), %%a5, %%acc0\n\t"
- "mac.l %%d1, %%a5, (384*4, %[V]), %%a5, %%acc0\n\t"
- "mac.l %%d2, %%a5, (480*4, %[V]), %%a5, %%acc0\n\t"
- "mac.l %%d3, %%a5, (512*4, %[V]), %%a5, %%acc0\n\t"
- "movem.l (8*4, %[D]), %%d0-%%d3 \n\t"
- "mac.l %%d0, %%a5, (608*4, %[V]), %%a5, %%acc0\n\t"
- "mac.l %%d1, %%a5, (640*4, %[V]), %%a5, %%acc0\n\t"
- "mac.l %%d2, %%a5, (736*4, %[V]), %%a5, %%acc0\n\t"
- "mac.l %%d3, %%a5, (768*4, %[V]), %%a5, %%acc0\n\t"
- "movem.l (12*4, %[D]), %%d0-%%d3 \n\t"
- "mac.l %%d0, %%a5, (864*4, %[V]), %%a5, %%acc0\n\t"
- "mac.l %%d1, %%a5, (896*4, %[V]), %%a5, %%acc0\n\t"
- "mac.l %%d2, %%a5, (992*4, %[V]), %%a5, %%acc0\n\t"
- "mac.l %%d3, %%a5, %%acc0 \n\t"
- "movclr.l %%acc0, %%d0 \n\t"
- "lsl.l #1, %%d0 \n\t"
- "move.l %%d0, (%[Data])+ \n"
- : [Data] "+a" (Data)
- : [V] "a" (V), [D] "a" (D)
- : "d0", "d1", "d2", "d3", "a5");
- }
-#else
// 64=64x64-multiply (FIXED_POINT) or float=float*float (!FIXED_POINT) in C
for ( k = 0; k < 32; k++, D += 16, V++ )
{
*Data = MPC_MULTIPLY_EX(V[ 0],D[ 0],30) + MPC_MULTIPLY_EX(V[ 96],D[ 1],30)
- + MPC_MULTIPLY_EX(V[128],D[ 2],30) + MPC_MULTIPLY_EX(V[224],D[ 3],30)
- + MPC_MULTIPLY_EX(V[256],D[ 4],30) + MPC_MULTIPLY_EX(V[352],D[ 5],30)
- + MPC_MULTIPLY_EX(V[384],D[ 6],30) + MPC_MULTIPLY_EX(V[480],D[ 7],30)
- + MPC_MULTIPLY_EX(V[512],D[ 8],30) + MPC_MULTIPLY_EX(V[608],D[ 9],30)
- + MPC_MULTIPLY_EX(V[640],D[10],30) + MPC_MULTIPLY_EX(V[736],D[11],30)
- + MPC_MULTIPLY_EX(V[768],D[12],30) + MPC_MULTIPLY_EX(V[864],D[13],30)
- + MPC_MULTIPLY_EX(V[896],D[14],30) + MPC_MULTIPLY_EX(V[992],D[15],30);
+ + MPC_MULTIPLY_EX(V[128],D[ 2],30) + MPC_MULTIPLY_EX(V[224],D[ 3],30)
+ + MPC_MULTIPLY_EX(V[256],D[ 4],30) + MPC_MULTIPLY_EX(V[352],D[ 5],30)
+ + MPC_MULTIPLY_EX(V[384],D[ 6],30) + MPC_MULTIPLY_EX(V[480],D[ 7],30)
+ + MPC_MULTIPLY_EX(V[512],D[ 8],30) + MPC_MULTIPLY_EX(V[608],D[ 9],30)
+ + MPC_MULTIPLY_EX(V[640],D[10],30) + MPC_MULTIPLY_EX(V[736],D[11],30)
+ + MPC_MULTIPLY_EX(V[768],D[12],30) + MPC_MULTIPLY_EX(V[864],D[13],30)
+ + MPC_MULTIPLY_EX(V[896],D[14],30) + MPC_MULTIPLY_EX(V[992],D[15],30);
Data += 1;
// total: 16 muls, 15 adds, 16 shifts
}
-#endif /* COLDFIRE */
}
-#endif /* CPU_ARM */
+#endif /* CPU_ARM || CPU_COLDFIRE */
static void
mpc_full_synthesis_filter(MPC_SAMPLE_FORMAT *OutData, MPC_SAMPLE_FORMAT *V, const MPC_SAMPLE_FORMAT *Y)
diff --git a/apps/codecs/libmusepack/synth_filter_coldfire.S b/apps/codecs/libmusepack/synth_filter_coldfire.S
new file mode 100644
index 0000000000..758ab3d496
--- /dev/null
+++ b/apps/codecs/libmusepack/synth_filter_coldfire.S
@@ -0,0 +1,78 @@
+/***************************************************************************
+ * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___
+ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+ * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+ * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+ * \/ \/ \/ \/ \/
+ * $Id$
+ *
+ * Copyright (C) 2005 by Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+/*
+ * void
+ * mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data,
+ * const MPC_SAMPLE_FORMAT * V,
+ * const MPC_SAMPLE_FORMAT * D)
+ */
+
+#if defined(USE_IRAM)
+ .section .icode
+#else
+ .text
+#endif
+ .align 2
+ .global mpc_decoder_windowing_D
+ .type mpc_decoder_windowing_D, @function
+
+mpc_decoder_windowing_D: | 32 passes; each stores one 16-term sum of V*D products to Data
+ lea.l (-9*4, %sp), %sp | reserve stack room for 9 longwords
+ movem.l %d2-%d7/%a2-%a4, (%sp) | save the 9 registers d2-d7/a2-a4 that are restored before rts
+ movem.l (9*4+4, %sp), %a0-%a2 | skip 9 saved longwords + return address: a0 = Data, a1 = V, a2 = D
+ moveq.l #32, %d0 | loop counter: 32 output samples
+
+ move.l (%a1), %a4 | preload V[0] for the first pass; later passes get it from the last mac.l
+ 0: | loop (V[n] / D[n] below are relative to the current a1 / a2)
+ movem.l (%a2), %d1-%d7/%a3 | d1-d7, a3 = D[0..7]
+
+ mac.l %d1, %a4, ( 96*4, %a1), %a4, %acc0 | acc0 += D[0]*V[0],   then a4 = V[96]
+ mac.l %d2, %a4, (128*4, %a1), %a4, %acc0 | acc0 += D[1]*V[96],  then a4 = V[128]
+ mac.l %d3, %a4, (224*4, %a1), %a4, %acc0 | acc0 += D[2]*V[128], then a4 = V[224]
+ mac.l %d4, %a4, (256*4, %a1), %a4, %acc0 | acc0 += D[3]*V[224], then a4 = V[256]
+ mac.l %d5, %a4, (352*4, %a1), %a4, %acc0 | acc0 += D[4]*V[256], then a4 = V[352]
+ mac.l %d6, %a4, (384*4, %a1), %a4, %acc0 | acc0 += D[5]*V[352], then a4 = V[384]
+ mac.l %d7, %a4, (480*4, %a1), %a4, %acc0 | acc0 += D[6]*V[384], then a4 = V[480]
+ mac.l %a3, %a4, (512*4, %a1), %a4, %acc0 | acc0 += D[7]*V[480], then a4 = V[512]
+ movem.l (8*4, %a2), %d1-%d7/%a3 | d1-d7, a3 = D[8..15]
+ mac.l %d1, %a4, (608*4, %a1), %a4, %acc0 | acc0 += D[8]*V[512],  then a4 = V[608]
+ mac.l %d2, %a4, (640*4, %a1), %a4, %acc0 | acc0 += D[9]*V[608],  then a4 = V[640]
+ mac.l %d3, %a4, (736*4, %a1), %a4, %acc0 | acc0 += D[10]*V[640], then a4 = V[736]
+ mac.l %d4, %a4, (768*4, %a1), %a4, %acc0 | acc0 += D[11]*V[736], then a4 = V[768]
+ mac.l %d5, %a4, (864*4, %a1), %a4, %acc0 | acc0 += D[12]*V[768], then a4 = V[864]
+ mac.l %d6, %a4, (896*4, %a1), %a4, %acc0 | acc0 += D[13]*V[864], then a4 = V[896]
+ mac.l %d7, %a4, (992*4, %a1), %a4, %acc0 | acc0 += D[14]*V[896], then a4 = V[992]
+ mac.l %a3, %a4, ( 4, %a1), %a4, %acc0 | acc0 += D[15]*V[992], then a4 = V[1], which is V[0] of the next pass
+
+ lea.l (16*4, %a2), %a2 | D += 16
+ addq.l #4, %a1 | V += 1
+ movclr.l %acc0, %d1 | fetch the sum into d1 and clear acc0 for the next pass
+ lsl.l #1, %d1 | double the sum; NOTE(review): presumably rescales the EMAC result to match the 30-bit shift of the C version -- confirm against the EMAC mode in use
+ move.l %d1, (%a0)+ | *Data++ = result
+ subq.l #1, %d0 | one sample done
+ bne 0b | repeat until the counter reaches zero
+
+ movem.l (%sp), %d2-%d7/%a2-%a4 | restore stacked regs
+ lea.l (9*4, %sp), %sp | release the 9-longword save area
+ rts
+