Submit FS#11461. Major speedup for aac he profile (PP5002 +20%, PP5020 +15%, PP5022 +19%, MCF5249 +35%, MCF5250 +80%), still not realtime on most targets though. This change does a lot of refactoring in the sbr filters and the dct, switching to our optimized codeclib fft and tweaking IRAM usage.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27358 a1c6a512-1295-4272-9138-f99709370657
author: Andree Buschmann <AndreeBuschmann@t-online.de> 2010-07-09 18:32:37 +0000
committer: Andree Buschmann <AndreeBuschmann@t-online.de> 2010-07-09 18:32:37 +0000
commit: 811af5968ae3015af04804aa774728f90a897c05 (patch)
tree: 18aadf8c4a535fef2de069e968fcb23824fdd20a /apps/codecs/libfaad/sbr_dct.c
parent: f3e0207384af483d31e43e3d1f326a138007faf8 (diff)
1 files changed, 49 insertions, 296 deletions
diff --git a/apps/codecs/libfaad/sbr_dct.c b/apps/codecs/libfaad/sbr_dct.c
index c916a82a61..123514f226 100644
--- a/apps/codecs/libfaad/sbr_dct.c
+++ b/apps/codecs/libfaad/sbr_dct.c
@@ -26,6 +26,9 @@
 **/
 
 #include "common.h"
+#include "../lib/fft.h"
+#include "../lib/mdct_lookup.h"
+
 
 #ifdef SBR_DEC
 
@@ -1447,267 +1450,9 @@ void DCT2_32_unscaled(real_t *y, real_t *x)
     y[17] = f286 - f285;
 }
 
-#else
-
-
-#define n 32
-#define log2n 5
-
-// w_array_real[i] = cos(2*M_PI*i/32)
-static const real_t w_array_real[] = {
-    FRAC_CONST(1.000000000000000), FRAC_CONST(0.980785279337272),
-    FRAC_CONST(0.923879528329380), FRAC_CONST(0.831469603195765),
-    FRAC_CONST(0.707106765732237), FRAC_CONST(0.555570210304169),
-    FRAC_CONST(0.382683402077046), FRAC_CONST(0.195090284503576),
-    FRAC_CONST(0.000000000000000), FRAC_CONST(-0.195090370246552),
-    FRAC_CONST(-0.382683482845162), FRAC_CONST(-0.555570282993553),
-    FRAC_CONST(-0.707106827549476), FRAC_CONST(-0.831469651765257),
-    FRAC_CONST(-0.923879561784627), FRAC_CONST(-0.980785296392607)
-};
-
-// w_array_imag[i] = sin(-2*M_PI*i/32)
-static const real_t w_array_imag[] = {
-    FRAC_CONST(0.000000000000000), FRAC_CONST(-0.195090327375064),
-    FRAC_CONST(-0.382683442461104), FRAC_CONST(-0.555570246648862),
-    FRAC_CONST(-0.707106796640858), FRAC_CONST(-0.831469627480512),
-    FRAC_CONST(-0.923879545057005), FRAC_CONST(-0.980785287864940),
-    FRAC_CONST(-1.000000000000000), FRAC_CONST(-0.980785270809601),
-    FRAC_CONST(-0.923879511601754), FRAC_CONST(-0.831469578911016),
-    FRAC_CONST(-0.707106734823616), FRAC_CONST(-0.555570173959476),
-    FRAC_CONST(-0.382683361692986), FRAC_CONST(-0.195090241632088)
-};
-
-// FFT decimation in frequency
-// 4*16*2+16=128+16=144 multiplications
-// 6*16*2+10*8+4*16*2=192+80+128=400 additions
-static void fft_dif(real_t * Real, real_t * Imag)
-{
-    real_t w_real, w_imag; // For faster access
-    real_t point1_real, point1_imag, point2_real, point2_imag; // For faster access
-    uint32_t j, i, i2, w_index; // Counters
-
-    // First 2 stages of 32 point FFT decimation in frequency
-    // 4*16*2=64*2=128 multiplications
-    // 6*16*2=96*2=192 additions
-    // Stage 1 of 32 point FFT decimation in frequency
-    for (i = 0; i < 16; i++)
-    {
-        point1_real = Real[i];
-        point1_imag = Imag[i];
-        i2 = i+16;
-        point2_real = Real[i2];
-        point2_imag = Imag[i2];
-
-        w_real = w_array_real[i];
-        w_imag = w_array_imag[i];
-
-        // temp1 = x[i] - x[i2]
-        point1_real -= point2_real;
-        point1_imag -= point2_imag;
-
-        // x[i1] = x[i] + x[i2]
-        Real[i] += point2_real;
-        Imag[i] += point2_imag;
-
-        // x[i2] = (x[i] - x[i2]) * w
-        Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag));
-        Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real));
-     }
-    // Stage 2 of 32 point FFT decimation in frequency
-    for (j = 0, w_index = 0; j < 8; j++, w_index += 2)
-    {
-        w_real = w_array_real[w_index];
-        w_imag = w_array_imag[w_index];
-
-        i = j;
-        point1_real = Real[i];
-        point1_imag = Imag[i];
-        i2 = i+8;
-        point2_real = Real[i2];
-        point2_imag = Imag[i2];
-
-        // temp1 = x[i] - x[i2]
-        point1_real -= point2_real;
-        point1_imag -= point2_imag;
-
-        // x[i1] = x[i] + x[i2]
-        Real[i] += point2_real;
-        Imag[i] += point2_imag;
+#else /* #ifdef SBR_LOW_POWER */
 
-        // x[i2] = (x[i] - x[i2]) * w
-        Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag));
-        Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real));
-
-        i = j+16;
-        point1_real = Real[i];
-        point1_imag = Imag[i];
-        i2 = i+8;
-        point2_real = Real[i2];
-        point2_imag = Imag[i2];
-
-        // temp1 = x[i] - x[i2]
-        point1_real -= point2_real;
-        point1_imag -= point2_imag;
-
-        // x[i1] = x[i] + x[i2]
-        Real[i] += point2_real;
-        Imag[i] += point2_imag;
-
-        // x[i2] = (x[i] - x[i2]) * w
-        Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag));
-        Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real));
-    }
-
-    // Stage 3 of 32 point FFT decimation in frequency
-    // 2*4*2=16 multiplications
-    // 4*4*2+6*4*2=10*8=80 additions
-    for (i = 0; i < n; i += 8)
-    {
-        i2 = i+4;
-        point1_real = Real[i];
-        point1_imag = Imag[i];
-
-        point2_real = Real[i2];
-        point2_imag = Imag[i2];
-
-        // out[i1] = point1 + point2
-        Real[i] += point2_real;
-        Imag[i] += point2_imag;
-
-        // out[i2] = point1 - point2
-        Real[i2] = point1_real - point2_real;
-        Imag[i2] = point1_imag - point2_imag;
-    }
-    w_real = w_array_real[4]; // = sqrt(2)/2
-    // w_imag = -w_real; // = w_array_imag[4]; // = -sqrt(2)/2
-    for (i = 1; i < n; i += 8)
-    {
-        i2 = i+4;
-        point1_real = Real[i];
-        point1_imag = Imag[i];
-
-        point2_real = Real[i2];
-        point2_imag = Imag[i2];
-
-        // temp1 = x[i] - x[i2]
-        point1_real -= point2_real;
-        point1_imag -= point2_imag;
-
-        // x[i1] = x[i] + x[i2]
-        Real[i] += point2_real;
-        Imag[i] += point2_imag;
-
-        // x[i2] = (x[i] - x[i2]) * w
-        Real[i2] = MUL_F(point1_real+point1_imag, w_real);
-        Imag[i2] = MUL_F(point1_imag-point1_real, w_real);
-    }
-    for (i = 2; i < n; i += 8)
-    {
-        i2 = i+4;
-        point1_real = Real[i];
-        point1_imag = Imag[i];
-
-        point2_real = Real[i2];
-        point2_imag = Imag[i2];
-
-        // x[i] = x[i] + x[i2]
-        Real[i] += point2_real;
-        Imag[i] += point2_imag;
-
-        // x[i2] = (x[i] - x[i2]) * (-i)
-        Real[i2] = point1_imag - point2_imag;
-        Imag[i2] = point2_real - point1_real;
-    }
-    w_real = w_array_real[12]; // = -sqrt(2)/2
-    // w_imag = w_real; // = w_array_imag[12]; // = -sqrt(2)/2
-    for (i = 3; i < n; i += 8)
-    {
-        i2 = i+4;
-        point1_real = Real[i];
-        point1_imag = Imag[i];
-
-        point2_real = Real[i2];
-        point2_imag = Imag[i2];
-
-        // temp1 = x[i] - x[i2]
-        point1_real -= point2_real;
-        point1_imag -= point2_imag;
-
-        // x[i1] = x[i] + x[i2]
-        Real[i] += point2_real;
-        Imag[i] += point2_imag;
-
-        // x[i2] = (x[i] - x[i2]) * w
-        Real[i2] = MUL_F(point1_real-point1_imag, w_real);
-        Imag[i2] = MUL_F(point1_real+point1_imag, w_real);
-    }
-
-
-    // Stage 4 of 32 point FFT decimation in frequency (no multiplications)
-    // 16*4=64 additions
-    for (i = 0; i < n; i += 4)
-    {
-        i2 = i+2;
-        point1_real = Real[i];
-        point1_imag = Imag[i];
-
-        point2_real = Real[i2];
-        point2_imag = Imag[i2];
-
-        // x[i1] = x[i] + x[i2]
-        Real[i] += point2_real;
-        Imag[i] += point2_imag;
-
-        // x[i2] = x[i] - x[i2]
-        Real[i2] = point1_real - point2_real;
-        Imag[i2] = point1_imag - point2_imag;
-    }
-    for (i = 1; i < n; i += 4)
-    {
-        i2 = i+2;
-        point1_real = Real[i];
-        point1_imag = Imag[i];
-
-        point2_real = Real[i2];
-        point2_imag = Imag[i2];
-
-        // x[i] = x[i] + x[i2]
-        Real[i] += point2_real;
-        Imag[i] += point2_imag;
-
-        // x[i2] = (x[i] - x[i2]) * (-i)
-        Real[i2] = point1_imag - point2_imag;
-        Imag[i2] = point2_real - point1_real;
-    }
-
-    // Stage 5 of 32 point FFT decimation in frequency (no multiplications)
-    // 16*4=64 additions
-    for (i = 0; i < n; i += 2)
-    {
-        i2 = i+1;
-        point1_real = Real[i];
-        point1_imag = Imag[i];
-
-        point2_real = Real[i2];
-        point2_imag = Imag[i2];
-
-        // out[i1] = point1 + point2
-        Real[i] += point2_real;
-        Imag[i] += point2_imag;
-
-        // out[i2] = point1 - point2
-        Real[i2] = point1_real - point2_real;
-        Imag[i2] = point1_imag - point2_imag;
-    }
-
-#ifdef REORDER_IN_FFT
-    FFTReorder(Real, Imag);
-#endif // #ifdef REORDER_IN_FFT
-}
-#undef n
-#undef log2n
-
-static const real_t dct4_64_tab[] = {
+static const real_t dct4_64_tab[] ICONST_ATTR = {
     COEF_CONST(0.999924719333649), COEF_CONST(0.998118102550507),
     COEF_CONST(0.993906974792480), COEF_CONST(0.987301409244537),
     COEF_CONST(0.978317379951477), COEF_CONST(0.966976463794708),
@@ -1806,57 +1551,65 @@ static const real_t dct4_64_tab[] = {
     COEF_CONST(0.897167563438416), COEF_CONST(0.949727773666382)
 };
 
+// Table adapted from codeclib to fit into IRAM
+const uint32_t dct4_revtab[32] ICONST_ATTR = {
+  0, 24, 12, 22, 6, 30, 11, 19, 3, 27, 15, 21, 5, 29, 9, 17,
+  1, 25, 13, 23, 7, 31, 10, 18, 2, 26, 14, 20, 4, 28, 8, 16};
+
 /* size 64 only! */
-void dct4_kernel(real_t * in_real, real_t * in_imag, real_t * out_real, real_t * out_imag)
+void dct4_kernel(real_t *real, real_t *imag)
 {
-    // Tables with bit reverse values for 5 bits, bit reverse of i at i-th position
-    const uint8_t bit_rev_tab[32] = { 0,16,8,24,4,20,12,28,2,18,10,26,6,22,14,30,1,17,9,25,5,21,13,29,3,19,11,27,7,23,15,31 };
-    uint16_t i, i_rev;
+    uint32_t i, idx;
+    real_t x_re, x_im, tmp;
+    FFTComplex xc[32]; /* used for calling codeclib's fft implementation */
 
-    /* Step 2: modulate */
+    /* Step 2: modulate and pre-rotate for codeclib's fft implementation */
     // 3*32=96 multiplications
     // 3*32=96 additions
     for (i = 0; i < 32; i++)
     {
-        real_t x_re, x_im, tmp;
-        x_re = in_real[i];
-        x_im = in_imag[i];
-        tmp =        MUL_C(x_re + x_im, dct4_64_tab[i]);
-        in_real[i] = MUL_C(x_im, dct4_64_tab[i + 64]) + tmp;
-        in_imag[i] = MUL_C(x_re, dct4_64_tab[i + 32]) + tmp;
+        idx  = dct4_revtab[i];
+        x_re = real[i];
+        x_im = imag[i];
+        tmp        = MUL_C(x_re + x_im, dct4_64_tab[i     ]);
+        xc[idx].re = MUL_C(x_im       , dct4_64_tab[i + 64]) + tmp;
+        xc[idx].im = MUL_C(x_re       , dct4_64_tab[i + 32]) + tmp;
     }
 
-    /* Step 3: FFT, but with output in bit reverse order */
-    fft_dif(in_real, in_imag);
+    /* Step 3: FFT (codeclib's implementation) */
+    ff_fft_calc_c(5, xc);
 
-    /* Step 4: modulate + bitreverse reordering */
+    /* Step 4: modulate + reordering */
     // 3*31+2=95 multiplications
     // 3*31+2=95 additions
-    for (i = 0; i < 16; i++)
+    x_re  = xc[0].re;
+    x_im  = xc[0].im;
+    tmp     = MUL_C(x_re + x_im, dct4_64_tab[0 + 3*32]);
+    real[0] = MUL_C(x_im       , dct4_64_tab[0 + 5*32]) + tmp;
+    imag[0] = MUL_C(x_re       , dct4_64_tab[0 + 4*32]) + tmp;
+    for (i = 1; i < 16; i++)
     {
-        real_t x_re, x_im, tmp;
-        i_rev = bit_rev_tab[i];
-        x_re = in_real[i_rev];
-        x_im = in_imag[i_rev];
-
-        tmp =         MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]);
-        out_real[i] = MUL_C(x_im, dct4_64_tab[i + 5*32]) + tmp;
-        out_imag[i] = MUL_C(x_re, dct4_64_tab[i + 4*32]) + tmp;
+        idx  = 32-i;
+        x_re = xc[idx].re;
+        x_im = xc[idx].im;
+        tmp     = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]);
+        real[i] = MUL_C(x_im       , dct4_64_tab[i + 5*32]) + tmp;
+        imag[i] = MUL_C(x_re       , dct4_64_tab[i + 4*32]) + tmp;
     }
-    // i = 16, i_rev = 1 = rev(16);
-    out_imag[16] = MUL_C(in_imag[1] - in_real[1], dct4_64_tab[16 + 3*32]);
-    out_real[16] = MUL_C(in_real[1] + in_imag[1], dct4_64_tab[16 + 3*32]);
+    // i = 16, idx = 16 = reorder_tab[16];
+    x_re  = xc[16].re;
+    x_im  = xc[16].im;
+    imag[16] = MUL_C(x_im - x_re, dct4_64_tab[16 + 3*32]);
+    real[16] = MUL_C(x_re + x_im, dct4_64_tab[16 + 3*32]);
     for (i = 17; i < 32; i++)
     {
-        real_t x_re, x_im, tmp;
-        i_rev = bit_rev_tab[i];
-        x_re = in_real[i_rev];
-        x_im = in_imag[i_rev];
-        tmp =         MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]);
-        out_real[i] = MUL_C(x_im, dct4_64_tab[i + 5*32]) + tmp;
-        out_imag[i] = MUL_C(x_re, dct4_64_tab[i + 4*32]) + tmp;
+        idx  = 32-i;
+        x_re = xc[idx].re;
+        x_im = xc[idx].im;
+        tmp     = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]);
+        real[i] = MUL_C(x_im       , dct4_64_tab[i + 5*32]) + tmp;
+        imag[i] = MUL_C(x_re       , dct4_64_tab[i + 4*32]) + tmp;
     }
-
 }
 
 void DST4_32(real_t *y, real_t *x)
@@ -2266,6 +2019,6 @@ void DST4_32(real_t *y, real_t *x)
     y[0] = MUL_R(REAL_CONST(20.3738781672314530), f304);
 }
 
-#endif
+#endif /* #ifdef SBR_LOW_POWER */
 
-#endif
+#endif /* #ifdef SBR_DEC */
author	Andree Buschmann <AndreeBuschmann@t-online.de>	2010-07-09 18:32:37 +0000
committer	Andree Buschmann <AndreeBuschmann@t-online.de>	2010-07-09 18:32:37 +0000
commit	811af5968ae3015af04804aa774728f90a897c05 (patch)
tree	18aadf8c4a535fef2de069e968fcb23824fdd20a /apps/codecs/libfaad/sbr_dct.c
parent	f3e0207384af483d31e43e3d1f326a138007faf8 (diff)