summaryrefslogtreecommitdiff
path: root/apps
diff options
context:
space:
mode:
Diffstat (limited to 'apps')
-rw-r--r--apps/codecs/libmusepack/synth_filter_arm.S210
1 files changed, 208 insertions, 2 deletions
diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S
index 731a21ce21..5bdae93561 100644
--- a/apps/codecs/libmusepack/synth_filter_arm.S
+++ b/apps/codecs/libmusepack/synth_filter_arm.S
@@ -92,7 +92,7 @@ mpc_decoder_windowing_D:
bgt .loop32
ldmpc regs=r4-r8
-#else
+#elif defined(CPU_ARM7TDMI) /* arm7 only */
mpc_decoder_windowing_D:
/* r0 = Data[] */
/* r1 = V[] */
@@ -106,6 +106,7 @@ mpc_decoder_windowing_D:
* saved at the cost of 15 x 4 + 1 add's.
* The row V[16] can be extracted as it has symmetries within this single
* row. 8 smull/mlal and 8 ldr's can be saved.
+ * Used for arm7 only. For arm9 and above see implementation below.
***********************************************************************/
stmfd sp!, {r4-r11, lr}
@@ -152,7 +153,7 @@ mpc_decoder_windowing_D:
add r2, r2, #7*4 /* D+=7, r2 = D[16] */
/******************************************
- * rows 01..15 are symmetrc to rows 31..17
+ * rows 01..15 are symmetric to rows 31..17
* r8 = lo, r9 = hi of 01..15
* r1 = V[01..15]
* r10 = lo, r11 = hi of 31..17
@@ -290,6 +291,211 @@ mpc_decoder_windowing_D:
add r1, r1, #4 /* V++ */
ldmpc regs=r4-r11
+#else /* arm9 and above */
+ mpc_decoder_windowing_D:
+ /* r0 = Data[] */
+ /* r1 = V[] */
+ /* r2 = D[] */
+ /* lr = loop counter for rows 01..15 */
+ /************************************************************************
+ * Further speed up through making use of symmetries within D[]-window.
+ * The row V[00] can be extracted as it has symmetries within this single
+ * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's.
+ * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
+ * saved at the cost of 15 x 4 + 1 add's.
+ * The row V[16] can be extracted as it has symmetries within this single
+ * row. 8 smull/mlal and 8 ldr's can be saved.
+ * On arm9 (still armv4 architecture) reducing stalls after ldr/ldm speeds
+ * up decoding even though several ldm instructions are replaced with ldr
+ * to free 2 registers.
+ ***********************************************************************/
+ stmfd sp!, {r4-r11, lr} /* lr is saved too: it is reused as the loop counter */
+
+ /******************************************
+ * row 0 with internal symmetry
+ *****************************************/
+ add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
+ ldmia r2!, { r3-r6 } /* load D[01..04] */
+ ldr r7 , [r1, #96*4] /* 1 */
+ ldr r10, [r1, #992*4] /* 15 */
+ ldr r11, [r1, #128*4] /* 2 */
+ ldr r12, [r1, #896*4] /* 14 */
+ rsb r10, r10, r7 /* V[01] - V[15] */
+ smull r8, r9, r10, r3
+ ldr r7 , [r1, #224*4] /* 3 */
+ ldr r10, [r1, #864*4] /* 13 */
+ add r12, r12, r11 /* V[02] + V[14] */
+ smlal r8, r9, r12, r4
+ ldr r11, [r1, #256*4] /* 4 */
+ ldr r12, [r1, #768*4] /* 12 */
+ rsb r10, r10, r7 /* V[03] - V[13] */
+ smlal r8, r9, r10, r5
+ ldr r7 , [r1, #352*4] /* 5 */
+ ldr r10, [r1, #736*4] /* 11 */
+ add r12, r12, r11 /* V[04] + V[12] */
+ smlal r8, r9, r12, r6
+ ldmia r2!, { r3-r6 } /* load D[05..08] */
+ ldr r11, [r1, #384*4] /* 6 */
+ ldr r12, [r1, #640*4] /* 10 */
+ rsb r10, r10, r7 /* V[05] - V[11] */
+ smlal r8, r9, r10, r3
+ ldr r7 , [r1, #480*4] /* 7 */
+ ldr r10, [r1, #608*4] /* 9 */
+ add r12, r12, r11 /* V[06] + V[10] */
+ smlal r8, r9, r12, r4
+ ldr r11, [r1, #512*4] /* 8 */
+ rsb r10, r10, r7 /* V[07] - V[09] */
+ smlal r8, r9, r10, r5
+ smlal r8, r9, r11, r6
+ mov r8, r8, lsr #16
+ orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */
+ str r8, [r0], #4 /* store Data[00], Data++ */
+ add r1, r1, #4 /* V+=1, r1 = V[01] */
+ add r2, r2, #7*4 /* D+=7, r2 = D[16] */
+
+ /******************************************
+ * rows 01..15 are symmetric to rows 31..17
+ * r8 = lo, r9 = hi of 01..15
+ * r1 = V[01..15]
+ * r10 = lo, r11 = hi of 31..17
+ * r12 = V[31..17]
+ *****************************************/
+ mov lr, #15 /* 15 iterations, one per row pair */
+ add r12, r1, #30*4 /* r12 = V[31] */
+.loop15:
+ ldmia r2!, { r3-r4 } /* load D[00..01] */
+ ldr r7, [r12, #896*4] /* 14 */
+ ldr r5, [r12, #992*4] /* 15 */
+ smull r10, r11, r7, r4
+ ldr r7, [r1] /* 0 */
+ smlal r10, r11, r5, r3
+ ldr r5, [r1, #96*4] /* 1 */
+ smull r8, r9, r7, r3
+ ldr r7, [r12, #768*4] /* 12 */
+ smlal r8, r9, r5, r4
+ ldmia r2!, { r3-r4 } /* load D[02..03] */
+ ldr r5, [r12, #864*4] /* 13 */
+ smlal r10, r11, r7, r4
+ ldr r7, [r1, #128*4] /* 2 */
+ smlal r10, r11, r5, r3
+ ldr r5, [r1, #224*4] /* 3 */
+ smlal r8, r9, r7, r3
+ ldr r7, [r1, #256*4] /* 4 */
+ smlal r8, r9, r5, r4
+ ldmia r2!, { r3-r4 } /* load D[04..05] */
+ ldr r5, [r1, #352*4] /* 5 */
+ smlal r8, r9, r7, r3
+ ldr r7, [r12, #640*4] /* 10 */
+ smlal r8, r9, r5, r4
+ ldr r5, [r12, #736*4] /* 11 */
+ smlal r10, r11, r7, r4
+ ldr r7, [r1, #384*4] /* 6 */
+ smlal r10, r11, r5, r3
+ ldmia r2!, { r3-r4 } /* load D[06..07] */
+ ldr r5, [r1, #480*4] /* 7 */
+ smlal r8, r9, r7, r3
+ ldr r7, [r12, #512*4] /* 8 */
+ smlal r8, r9, r5, r4
+ ldr r5, [r12, #608*4] /* 9 */
+ smlal r10, r11, r7, r4
+ ldr r7, [r12, #384*4] /* 6 */
+ smlal r10, r11, r5, r3
+ ldmia r2!, { r3-r4 } /* load D[08..09] */
+ ldr r5, [r12, #480*4] /* 7 */
+ smlal r10, r11, r7, r4
+ ldr r7, [r1, #512*4] /* 8 */
+ smlal r10, r11, r5, r3
+ ldr r5, [r1, #608*4] /* 9 */
+ smlal r8, r9, r7, r3
+ ldr r7, [r1, #640*4] /* 10 */
+ smlal r8, r9, r5, r4
+ ldmia r2!, { r3-r4 } /* load D[10..11] */
+ ldr r5, [r1, #736*4] /* 11 */
+ smlal r8, r9, r7, r3
+ ldr r7, [r12, #256*4] /* 4 */
+ smlal r8, r9, r5, r4
+ ldr r5, [r12, #352*4] /* 5 */
+ smlal r10, r11, r7, r4
+ ldr r7, [r1, #768*4] /* 12 */
+ smlal r10, r11, r5, r3
+ ldmia r2!, { r3-r4 } /* load D[12..13] */
+ ldr r5, [r1, #864*4] /* 13 */
+ smlal r8, r9, r7, r3
+ ldr r7, [r12, #128*4] /* 2 */
+ smlal r8, r9, r5, r4
+ ldr r5, [r12, #224*4] /* 3 */
+ smlal r10, r11, r7, r4
+ ldr r7, [r12] /* 0 */
+ smlal r10, r11, r5, r3
+ ldmia r2!, { r3-r4 } /* load D[14..15] */
+ ldr r5, [r12, #96*4] /* 1 */
+ smlal r10, r11, r7, r4
+ ldr r7, [r1, #896*4] /* 14 */
+ smlal r10, r11, r5, r3
+ ldr r5, [r1, #992*4] /* 15 */
+ smlal r8, r9, r7, r3
+ smlal r8, r9, r5, r4
+ /* store Data[01..15] */
+ mov r8, r8, lsr #16
+ orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */
+ str r8, [r0] /* store Data */
+ /* store Data[31..17] */
+ add r0, r0, lr, asl #3 /* r0 = r0 + 2*lr [words] */
+ mov r10, r10, lsr #16
+ orr r10, r10, r11, lsl #16 /* (lo>>16) | (hi<<16) */
+ rsb r10, r10, #0 /* r10 = -r10 */
+ str r10, [r0], #4 /* store Data, Data++ */
+ sub r0, r0, lr, asl #3 /* r0 = r0 - 2*lr [words] */
+ /* correct addresses for next loop */
+ sub r12, r12, #4 /* r12 = V-- */
+ add r1, r1, #4 /* r1 = V++ */
+ /* next loop */
+ subs lr, lr, #1
+ bgt .loop15
+
+ /******************************************
+ * V[16] with internal symmetry
+ *****************************************/
+ ldmia r2!, { r3-r6 } /* load D[00..03] */
+ ldr r7 , [r1] /* 0 */
+ ldr r10, [r1, #992*4] /* 15 */
+ ldr r11, [r1, #96*4] /* 1 */
+ ldr r12, [r1, #896*4] /* 14 */
+ rsb r10, r10, r7 /* V[00] - V[15] */
+ smull r8, r9, r10, r3
+ ldr r7 , [r1, #128*4] /* 2 */
+ ldr r10, [r1, #864*4] /* 13 */
+ rsb r12, r12, r11 /* V[01] - V[14] */
+ smlal r8, r9, r12, r4
+ ldr r11, [r1, #224*4] /* 3 */
+ ldr r12, [r1, #768*4] /* 12 */
+ rsb r10, r10, r7 /* V[02] - V[13] */
+ smlal r8, r9, r10, r5
+ ldr r7 , [r1, #256*4] /* 4 */
+ ldr r10, [r1, #736*4] /* 11 */
+ rsb r12, r12, r11 /* V[03] - V[12] */
+ smlal r8, r9, r12, r6
+ ldmia r2!, { r3-r6 } /* load D[04..07] */
+ ldr r11, [r1, #352*4] /* 5 */
+ ldr r12, [r1, #640*4] /* 10 */
+ rsb r10, r10, r7 /* V[04] - V[11] */
+ smlal r8, r9, r10, r3
+ ldr r7 , [r1, #384*4] /* 6 */
+ ldr r10, [r1, #608*4] /* 9 */
+ rsb r12, r12, r11 /* V[05] - V[10] */
+ smlal r8, r9, r12, r4
+ ldr r11, [r1, #480*4] /* 7 */
+ ldr r12, [r1, #512*4] /* 8 */
+ rsb r10, r10, r7 /* V[06] - V[09] */
+ smlal r8, r9, r10, r5
+ rsb r12, r12, r11 /* V[07] - V[08] */
+ smlal r8, r9, r12, r6
+ mov r8, r8, lsr #16
+ orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */
+ str r8, [r0], #4 /* store Data[16], Data++ */
+ add r1, r1, #4 /* V++ */
+
+ ldmpc regs=r4-r11
#endif
.mpc_dewindowing_end:
.size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D