summaryrefslogtreecommitdiff
path: root/apps/codecs/libfaad/sbr_dct.c
diff options
context:
space:
mode:
authorAndree Buschmann <AndreeBuschmann@t-online.de>2010-07-09 18:32:37 +0000
committerAndree Buschmann <AndreeBuschmann@t-online.de>2010-07-09 18:32:37 +0000
commit811af5968ae3015af04804aa774728f90a897c05 (patch)
tree18aadf8c4a535fef2de069e968fcb23824fdd20a /apps/codecs/libfaad/sbr_dct.c
parentf3e0207384af483d31e43e3d1f326a138007faf8 (diff)
Submit FS#11461. Major speedup for aac he profile (PP5002 +20%, PP5020 +15%, PP5022 +19%, MCF5249 +35%, MCF5250 +80%), still not realtime on most targets though. This change does a lot of refactoring in the sbr filters and the dct, switching to our optimized codeclib fft and tweaking IRAM usage.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27358 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/libfaad/sbr_dct.c')
-rw-r--r--apps/codecs/libfaad/sbr_dct.c345
1 files changed, 49 insertions, 296 deletions
diff --git a/apps/codecs/libfaad/sbr_dct.c b/apps/codecs/libfaad/sbr_dct.c
index c916a82a61..123514f226 100644
--- a/apps/codecs/libfaad/sbr_dct.c
+++ b/apps/codecs/libfaad/sbr_dct.c
@@ -26,6 +26,9 @@
**/
#include "common.h"
+#include "../lib/fft.h"
+#include "../lib/mdct_lookup.h"
+
#ifdef SBR_DEC
@@ -1447,267 +1450,9 @@ void DCT2_32_unscaled(real_t *y, real_t *x)
y[17] = f286 - f285;
}
-#else
-
-
-#define n 32
-#define log2n 5
-
-// w_array_real[i] = cos(2*M_PI*i/32)
-static const real_t w_array_real[] = {
- FRAC_CONST(1.000000000000000), FRAC_CONST(0.980785279337272),
- FRAC_CONST(0.923879528329380), FRAC_CONST(0.831469603195765),
- FRAC_CONST(0.707106765732237), FRAC_CONST(0.555570210304169),
- FRAC_CONST(0.382683402077046), FRAC_CONST(0.195090284503576),
- FRAC_CONST(0.000000000000000), FRAC_CONST(-0.195090370246552),
- FRAC_CONST(-0.382683482845162), FRAC_CONST(-0.555570282993553),
- FRAC_CONST(-0.707106827549476), FRAC_CONST(-0.831469651765257),
- FRAC_CONST(-0.923879561784627), FRAC_CONST(-0.980785296392607)
-};
-
-// w_array_imag[i] = sin(-2*M_PI*i/32)
-static const real_t w_array_imag[] = {
- FRAC_CONST(0.000000000000000), FRAC_CONST(-0.195090327375064),
- FRAC_CONST(-0.382683442461104), FRAC_CONST(-0.555570246648862),
- FRAC_CONST(-0.707106796640858), FRAC_CONST(-0.831469627480512),
- FRAC_CONST(-0.923879545057005), FRAC_CONST(-0.980785287864940),
- FRAC_CONST(-1.000000000000000), FRAC_CONST(-0.980785270809601),
- FRAC_CONST(-0.923879511601754), FRAC_CONST(-0.831469578911016),
- FRAC_CONST(-0.707106734823616), FRAC_CONST(-0.555570173959476),
- FRAC_CONST(-0.382683361692986), FRAC_CONST(-0.195090241632088)
-};
-
-// FFT decimation in frequency
-// 4*16*2+16=128+16=144 multiplications
-// 6*16*2+10*8+4*16*2=192+80+128=400 additions
-static void fft_dif(real_t * Real, real_t * Imag)
-{
- real_t w_real, w_imag; // For faster access
- real_t point1_real, point1_imag, point2_real, point2_imag; // For faster access
- uint32_t j, i, i2, w_index; // Counters
-
- // First 2 stages of 32 point FFT decimation in frequency
- // 4*16*2=64*2=128 multiplications
- // 6*16*2=96*2=192 additions
- // Stage 1 of 32 point FFT decimation in frequency
- for (i = 0; i < 16; i++)
- {
- point1_real = Real[i];
- point1_imag = Imag[i];
- i2 = i+16;
- point2_real = Real[i2];
- point2_imag = Imag[i2];
-
- w_real = w_array_real[i];
- w_imag = w_array_imag[i];
-
- // temp1 = x[i] - x[i2]
- point1_real -= point2_real;
- point1_imag -= point2_imag;
-
- // x[i1] = x[i] + x[i2]
- Real[i] += point2_real;
- Imag[i] += point2_imag;
-
- // x[i2] = (x[i] - x[i2]) * w
- Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag));
- Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real));
- }
- // Stage 2 of 32 point FFT decimation in frequency
- for (j = 0, w_index = 0; j < 8; j++, w_index += 2)
- {
- w_real = w_array_real[w_index];
- w_imag = w_array_imag[w_index];
-
- i = j;
- point1_real = Real[i];
- point1_imag = Imag[i];
- i2 = i+8;
- point2_real = Real[i2];
- point2_imag = Imag[i2];
-
- // temp1 = x[i] - x[i2]
- point1_real -= point2_real;
- point1_imag -= point2_imag;
-
- // x[i1] = x[i] + x[i2]
- Real[i] += point2_real;
- Imag[i] += point2_imag;
+#else /* #ifdef SBR_LOW_POWER */
- // x[i2] = (x[i] - x[i2]) * w
- Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag));
- Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real));
-
- i = j+16;
- point1_real = Real[i];
- point1_imag = Imag[i];
- i2 = i+8;
- point2_real = Real[i2];
- point2_imag = Imag[i2];
-
- // temp1 = x[i] - x[i2]
- point1_real -= point2_real;
- point1_imag -= point2_imag;
-
- // x[i1] = x[i] + x[i2]
- Real[i] += point2_real;
- Imag[i] += point2_imag;
-
- // x[i2] = (x[i] - x[i2]) * w
- Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag));
- Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real));
- }
-
- // Stage 3 of 32 point FFT decimation in frequency
- // 2*4*2=16 multiplications
- // 4*4*2+6*4*2=10*8=80 additions
- for (i = 0; i < n; i += 8)
- {
- i2 = i+4;
- point1_real = Real[i];
- point1_imag = Imag[i];
-
- point2_real = Real[i2];
- point2_imag = Imag[i2];
-
- // out[i1] = point1 + point2
- Real[i] += point2_real;
- Imag[i] += point2_imag;
-
- // out[i2] = point1 - point2
- Real[i2] = point1_real - point2_real;
- Imag[i2] = point1_imag - point2_imag;
- }
- w_real = w_array_real[4]; // = sqrt(2)/2
- // w_imag = -w_real; // = w_array_imag[4]; // = -sqrt(2)/2
- for (i = 1; i < n; i += 8)
- {
- i2 = i+4;
- point1_real = Real[i];
- point1_imag = Imag[i];
-
- point2_real = Real[i2];
- point2_imag = Imag[i2];
-
- // temp1 = x[i] - x[i2]
- point1_real -= point2_real;
- point1_imag -= point2_imag;
-
- // x[i1] = x[i] + x[i2]
- Real[i] += point2_real;
- Imag[i] += point2_imag;
-
- // x[i2] = (x[i] - x[i2]) * w
- Real[i2] = MUL_F(point1_real+point1_imag, w_real);
- Imag[i2] = MUL_F(point1_imag-point1_real, w_real);
- }
- for (i = 2; i < n; i += 8)
- {
- i2 = i+4;
- point1_real = Real[i];
- point1_imag = Imag[i];
-
- point2_real = Real[i2];
- point2_imag = Imag[i2];
-
- // x[i] = x[i] + x[i2]
- Real[i] += point2_real;
- Imag[i] += point2_imag;
-
- // x[i2] = (x[i] - x[i2]) * (-i)
- Real[i2] = point1_imag - point2_imag;
- Imag[i2] = point2_real - point1_real;
- }
- w_real = w_array_real[12]; // = -sqrt(2)/2
- // w_imag = w_real; // = w_array_imag[12]; // = -sqrt(2)/2
- for (i = 3; i < n; i += 8)
- {
- i2 = i+4;
- point1_real = Real[i];
- point1_imag = Imag[i];
-
- point2_real = Real[i2];
- point2_imag = Imag[i2];
-
- // temp1 = x[i] - x[i2]
- point1_real -= point2_real;
- point1_imag -= point2_imag;
-
- // x[i1] = x[i] + x[i2]
- Real[i] += point2_real;
- Imag[i] += point2_imag;
-
- // x[i2] = (x[i] - x[i2]) * w
- Real[i2] = MUL_F(point1_real-point1_imag, w_real);
- Imag[i2] = MUL_F(point1_real+point1_imag, w_real);
- }
-
-
- // Stage 4 of 32 point FFT decimation in frequency (no multiplications)
- // 16*4=64 additions
- for (i = 0; i < n; i += 4)
- {
- i2 = i+2;
- point1_real = Real[i];
- point1_imag = Imag[i];
-
- point2_real = Real[i2];
- point2_imag = Imag[i2];
-
- // x[i1] = x[i] + x[i2]
- Real[i] += point2_real;
- Imag[i] += point2_imag;
-
- // x[i2] = x[i] - x[i2]
- Real[i2] = point1_real - point2_real;
- Imag[i2] = point1_imag - point2_imag;
- }
- for (i = 1; i < n; i += 4)
- {
- i2 = i+2;
- point1_real = Real[i];
- point1_imag = Imag[i];
-
- point2_real = Real[i2];
- point2_imag = Imag[i2];
-
- // x[i] = x[i] + x[i2]
- Real[i] += point2_real;
- Imag[i] += point2_imag;
-
- // x[i2] = (x[i] - x[i2]) * (-i)
- Real[i2] = point1_imag - point2_imag;
- Imag[i2] = point2_real - point1_real;
- }
-
- // Stage 5 of 32 point FFT decimation in frequency (no multiplications)
- // 16*4=64 additions
- for (i = 0; i < n; i += 2)
- {
- i2 = i+1;
- point1_real = Real[i];
- point1_imag = Imag[i];
-
- point2_real = Real[i2];
- point2_imag = Imag[i2];
-
- // out[i1] = point1 + point2
- Real[i] += point2_real;
- Imag[i] += point2_imag;
-
- // out[i2] = point1 - point2
- Real[i2] = point1_real - point2_real;
- Imag[i2] = point1_imag - point2_imag;
- }
-
-#ifdef REORDER_IN_FFT
- FFTReorder(Real, Imag);
-#endif // #ifdef REORDER_IN_FFT
-}
-#undef n
-#undef log2n
-
-static const real_t dct4_64_tab[] = {
+static const real_t dct4_64_tab[] ICONST_ATTR = {
COEF_CONST(0.999924719333649), COEF_CONST(0.998118102550507),
COEF_CONST(0.993906974792480), COEF_CONST(0.987301409244537),
COEF_CONST(0.978317379951477), COEF_CONST(0.966976463794708),
@@ -1806,57 +1551,65 @@ static const real_t dct4_64_tab[] = {
COEF_CONST(0.897167563438416), COEF_CONST(0.949727773666382)
};
+// Table adapted from codeclib to fit into IRAM
+const uint32_t dct4_revtab[32] ICONST_ATTR = {
+ 0, 24, 12, 22, 6, 30, 11, 19, 3, 27, 15, 21, 5, 29, 9, 17,
+ 1, 25, 13, 23, 7, 31, 10, 18, 2, 26, 14, 20, 4, 28, 8, 16};
+
/* size 64 only! */
-void dct4_kernel(real_t * in_real, real_t * in_imag, real_t * out_real, real_t * out_imag)
+void dct4_kernel(real_t *real, real_t *imag)
{
- // Tables with bit reverse values for 5 bits, bit reverse of i at i-th position
- const uint8_t bit_rev_tab[32] = { 0,16,8,24,4,20,12,28,2,18,10,26,6,22,14,30,1,17,9,25,5,21,13,29,3,19,11,27,7,23,15,31 };
- uint16_t i, i_rev;
+ uint32_t i, idx;
+ real_t x_re, x_im, tmp;
+ FFTComplex xc[32]; /* used for calling codeclib's fft implementation */
- /* Step 2: modulate */
+ /* Step 2: modulate and pre-rotate for codeclib's fft implementation */
// 3*32=96 multiplications
// 3*32=96 additions
for (i = 0; i < 32; i++)
{
- real_t x_re, x_im, tmp;
- x_re = in_real[i];
- x_im = in_imag[i];
- tmp = MUL_C(x_re + x_im, dct4_64_tab[i]);
- in_real[i] = MUL_C(x_im, dct4_64_tab[i + 64]) + tmp;
- in_imag[i] = MUL_C(x_re, dct4_64_tab[i + 32]) + tmp;
+ idx = dct4_revtab[i];
+ x_re = real[i];
+ x_im = imag[i];
+ tmp = MUL_C(x_re + x_im, dct4_64_tab[i ]);
+ xc[idx].re = MUL_C(x_im , dct4_64_tab[i + 64]) + tmp;
+ xc[idx].im = MUL_C(x_re , dct4_64_tab[i + 32]) + tmp;
}
- /* Step 3: FFT, but with output in bit reverse order */
- fft_dif(in_real, in_imag);
+ /* Step 3: FFT (codeclib's implementation) */
+ ff_fft_calc_c(5, xc);
- /* Step 4: modulate + bitreverse reordering */
+ /* Step 4: modulate + reordering */
// 3*31+2=95 multiplications
// 3*31+2=95 additions
- for (i = 0; i < 16; i++)
+ x_re = xc[0].re;
+ x_im = xc[0].im;
+ tmp = MUL_C(x_re + x_im, dct4_64_tab[0 + 3*32]);
+ real[0] = MUL_C(x_im , dct4_64_tab[0 + 5*32]) + tmp;
+ imag[0] = MUL_C(x_re , dct4_64_tab[0 + 4*32]) + tmp;
+ for (i = 1; i < 16; i++)
{
- real_t x_re, x_im, tmp;
- i_rev = bit_rev_tab[i];
- x_re = in_real[i_rev];
- x_im = in_imag[i_rev];
-
- tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]);
- out_real[i] = MUL_C(x_im, dct4_64_tab[i + 5*32]) + tmp;
- out_imag[i] = MUL_C(x_re, dct4_64_tab[i + 4*32]) + tmp;
+ idx = 32-i;
+ x_re = xc[idx].re;
+ x_im = xc[idx].im;
+ tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]);
+ real[i] = MUL_C(x_im , dct4_64_tab[i + 5*32]) + tmp;
+ imag[i] = MUL_C(x_re , dct4_64_tab[i + 4*32]) + tmp;
}
- // i = 16, i_rev = 1 = rev(16);
- out_imag[16] = MUL_C(in_imag[1] - in_real[1], dct4_64_tab[16 + 3*32]);
- out_real[16] = MUL_C(in_real[1] + in_imag[1], dct4_64_tab[16 + 3*32]);
+ // i = 16, idx = 16 = reorder_tab[16];
+ x_re = xc[16].re;
+ x_im = xc[16].im;
+ imag[16] = MUL_C(x_im - x_re, dct4_64_tab[16 + 3*32]);
+ real[16] = MUL_C(x_re + x_im, dct4_64_tab[16 + 3*32]);
for (i = 17; i < 32; i++)
{
- real_t x_re, x_im, tmp;
- i_rev = bit_rev_tab[i];
- x_re = in_real[i_rev];
- x_im = in_imag[i_rev];
- tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]);
- out_real[i] = MUL_C(x_im, dct4_64_tab[i + 5*32]) + tmp;
- out_imag[i] = MUL_C(x_re, dct4_64_tab[i + 4*32]) + tmp;
+ idx = 32-i;
+ x_re = xc[idx].re;
+ x_im = xc[idx].im;
+ tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]);
+ real[i] = MUL_C(x_im , dct4_64_tab[i + 5*32]) + tmp;
+ imag[i] = MUL_C(x_re , dct4_64_tab[i + 4*32]) + tmp;
}
-
}
void DST4_32(real_t *y, real_t *x)
@@ -2266,6 +2019,6 @@ void DST4_32(real_t *y, real_t *x)
y[0] = MUL_R(REAL_CONST(20.3738781672314530), f304);
}
-#endif
+#endif /* #ifdef SBR_LOW_POWER */
-#endif
+#endif /* #ifdef SBR_DEC */