diff options
author | Thom Johansen <thomj@rockbox.org> | 2007-10-18 10:09:21 +0000 |
---|---|---|
committer | Thom Johansen <thomj@rockbox.org> | 2007-10-18 10:09:21 +0000 |
commit | 6e4aa260d019d94e23fbc8c26b95253ae752f697 (patch) | |
tree | 3f2043f69865a800125246993bf4971df629dffc /apps/codecs/libwma/wmadeci.c | |
parent | 31245682b3736bb450c72065f7ac0dbebc21416d (diff) |
Add Coldfire and ARM assembler for "reverse multiply and copy" function too. Gives big speedup on Coldfire, small on ARM.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15183 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/libwma/wmadeci.c')
-rw-r--r-- | apps/codecs/libwma/wmadeci.c | 65 |
1 files changed, 61 insertions, 4 deletions
```diff
diff --git a/apps/codecs/libwma/wmadeci.c b/apps/codecs/libwma/wmadeci.c
index 34a0f9f229..33894e1cf2 100644
--- a/apps/codecs/libwma/wmadeci.c
+++ b/apps/codecs/libwma/wmadeci.c
@@ -96,12 +96,33 @@ void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
         "smull r8, r9, r1, r5;"
         "add r1, r4, r9, lsl #1;"
         "stmia %[dst]!, {r0, r1};"
-        "subs %[n], %[n], #2;"
-        "bne 0b;"
+        "subs %[n], %[n], #2;"
+        "bne 0b;"
         : [d] "+r" (data), [w] "+r" (window), [dst] "+r" (dst), [n] "+r" (n)
         : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
 }
 
+static inline
+void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
+                         int len)
+{
+    /* Block sizes are always power of two */
+    asm volatile (
+        "add %[s1], %[s1], %[n], lsl #2;"
+        "0:"
+        "ldmia %[s0]!, {r0, r1};"
+        "ldmdb %[s1]!, {r4, r5};"
+        "smull r8, r9, r0, r5;"
+        "mov r0, r9, lsl #1;"
+        "smull r8, r9, r1, r4;"
+        "mov r1, r9, lsl #1;"
+        "stmia %[dst]!, {r0, r1};"
+        "subs %[n], %[n], #2;"
+        "bne 0b;"
+        : [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len)
+        : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
+}
+
 #elif defined(CPU_COLDFIRE)
 
 static inline
@@ -118,8 +139,8 @@ void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
         "mac.l %%d1, %%d5, %%acc1;"
         "mac.l %%d2, %%a0, %%acc2;"
         "mac.l %%d3, %%a1, %%acc3;"
-        "lea.l (%[d], 16), %[d];"
-        "lea.l (%[w], 16), %[w];"
+        "lea.l (16, %[d]), %[d];"
+        "lea.l (16, %[w]), %[w];"
         "movclr.l %%acc0, %%d0;"
         "movclr.l %%acc1, %%d1;"
         "movclr.l %%acc2, %%d2;"
@@ -134,6 +155,35 @@ void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
         : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
 }
 
+static inline
+void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
+                         int len)
+{
+    /* Block sizes are always power of two. Smallest block is always way bigger
+     * than four too.*/
+    asm volatile (
+        "lea.l (-16, %[s1], %[n]*4), %[s1];"
+        "0:"
+        "movem.l (%[s0]), %%d0-%%d3;"
+        "movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;"
+        "mac.l %%d0, %%a1, %%acc0;"
+        "mac.l %%d1, %%a0, %%acc1;"
+        "mac.l %%d2, %%d5, %%acc2;"
+        "mac.l %%d3, %%d4, %%acc3;"
+        "lea.l (16, %[s0]), %[s0];"
+        "lea.l (-16, %[s1]), %[s1];"
+        "movclr.l %%acc0, %%d0;"
+        "movclr.l %%acc1, %%d1;"
+        "movclr.l %%acc2, %%d2;"
+        "movclr.l %%acc3, %%d3;"
+        "movem.l %%d0-%%d3, (%[dst]);"
+        "lea.l (16, %[dst]), %[dst];"
+        "subq.l #4, %[n];"
+        "jne 0b;"
+        : [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len)
+        : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
+}
+
 #else
 
 static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
@@ -142,6 +192,13 @@ static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const
         dst[i] = fixmul32b(src0[i], src1[i]) + dst[i];
 }
 
+static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
+    int i;
+    src1 += len-1;
+    for(i=0; i<len; i++)
+        dst[i] = fixmul32b(src0[i], src1[-i]);
+}
+
 #endif
 
 /* TODO: Adapt the above to work with this */
```