diff options
author | Michael Giacomelli <giac2000@hotmail.com> | 2007-08-16 03:33:15 +0000 |
---|---|---|
committer | Michael Giacomelli <giac2000@hotmail.com> | 2007-08-16 03:33:15 +0000 |
commit | 8159b9ee9ae0c0a81819a9ebd1e33b9308ed1e68 (patch) | |
tree | 22f33b66239a30f4b1e51041a182ce5c443bcdca | |
parent | 1d1d9a8491ff478d3e3809df9366c87813993a73 (diff) |
Faster Q1.31 multiply for ARM. Add some ARM asm windowing code. Simply replacing c code with loop unrolled + load/store multiple instructions makes a surprising difference on PP. Also, add comments to the windowing code.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@14365 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- | apps/codecs/libwma/wmadeci.c | 49 | ||||
-rw-r--r-- | apps/codecs/libwma/wmafixed.h | 6 |
2 files changed, 49 insertions, 6 deletions
diff --git a/apps/codecs/libwma/wmadeci.c b/apps/codecs/libwma/wmadeci.c index dfce09aa01..2796815d72 100644 --- a/apps/codecs/libwma/wmadeci.c +++ b/apps/codecs/libwma/wmadeci.c @@ -378,12 +378,47 @@ void ff_imdct_calc(MDCTContext *s, * */ +#ifdef CPU_ARM +static inline +void vector_fmul_add_add(fixed32 *dst, const fixed32 *data, const fixed32 *window, int n) +{ + while (n>=2) { + asm volatile ("ldmia %[d]!, {r0, r1};" + "ldmia %[w]!, {r4, r5};" + + /*consume the first data and window value so we can use those registers again */ + "smull r8, r9, r0, r4;" + + "ldmia %[dst], {r0, r4};" + "add r0, r0, r9, lsl #1;" /* *dst=*dst+(r9<<1)*/ + "smull r8, r9, r1, r5;" + "add r1, r4, r9, lsl #1;" + "stmia %[dst]!, {r0, r1};" + : [d] "+r" (data), [w] "+r" (window), [dst] "+r" (dst) + : : "r0", "r1", + "r4", "r5", "r8", "r9", + "memory", "cc"); + n -= 2; + } + while(n>0) { + *dst = fixmul32b(*data, *window); + data++; + window++; + n--; + } +} + +#else + static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){ int i; for(i=0; i<len; i++) dst[i] = fixmul32b(src0[i], src1[i]) + dst[i]; } +#endif + +/* TODO: Adapt the above to work with this */ static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){ int i; src1 += len-1; @@ -391,11 +426,15 @@ static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const dst[i] = fixmul32b(src0[i], src1[-i]); } + /** * Apply MDCT window and add into output. * * We ensure that when the windows overlap their squared sum * is always 1 (MDCT reconstruction rule). + * + * The Vorbis I spec has a great diagram explaining this process. + * See section 1.3.2.3 of http://xiph.org/vorbis/doc/Vorbis_I_spec.html */ static void wma_window(WMADecodeContext *s, fixed32 *in, fixed32 *out) { @@ -403,6 +442,7 @@ static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const int block_len, bsize, n; /* left part */ + /*previous block was larger, so we'll use the size of the current block to set the window size*/ if (s->block_len_bits <= s->prev_block_len_bits) { block_len = s->block_len; bsize = s->frame_len_bits - s->block_len_bits; @@ -410,7 +450,9 @@ static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const vector_fmul_add_add(out, in, s->windows[bsize], block_len); } else { + /*previous block was smaller or the same size, so use it's size to set the window length*/ block_len = 1 << s->prev_block_len_bits; + /*find the middle of the two overlapped blocks, this will be the first overlapped sample*/ n = (s->block_len - block_len) / 2; bsize = s->frame_len_bits - s->prev_block_len_bits; @@ -418,7 +460,10 @@ static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const memcpy(out+n+block_len, in+n+block_len, n*sizeof(fixed32)); } - + /* Advance to the end of the current block and prepare to window it for the next block. + * Since the window function needs to be reversed, we do it backwards starting with the + * last sample and moving towards the first + */ out += s->block_len; in += s->block_len; @@ -1124,7 +1169,7 @@ static int wma_decode_block(WMADecodeContext *s) int nb_coefs[MAX_CHANNELS]; fixed32 mdct_norm; -// printf("***decode_block: %d:%d (%d)\n", s->frame_count - 1, s->block_num, s->block_len); + DEBUGF("***decode_block: %d of (%d samples) (%d)\n", s->block_num, s->frame_len, s->block_len); /* compute current block length */ if (s->use_variable_block_len) diff --git a/apps/codecs/libwma/wmafixed.h b/apps/codecs/libwma/wmafixed.h index da0637fb68..db7529f681 100644 --- a/apps/codecs/libwma/wmafixed.h +++ b/apps/codecs/libwma/wmafixed.h @@ -67,11 +67,9 @@ long fsincos(unsigned long phase, fixed32 *cos); uint32_t __lo; \ int32_t __result; \ asm ("smull %0, %1, %3, %4\n\t" \ - "movs %0, %0, lsr %5\n\t" \ - "adc %2, %0, %1, lsl %6" \ + "movs %2, %1, lsl #1" \ : "=&r" (__lo), "=&r" (__hi), "=r" (__result) \ - : "%r" (x), "r" (y), \ - "M" (31), "M" (32 - 31) \ + : "%r" (x), "r" (y) \ : "cc"); \ __result; \ }) |