summaryrefslogtreecommitdiff
path: root/apps/codecs
diff options
context:
space:
mode:
authorAndrew Mahone <andrew.mahone@gmail.com>2010-01-28 02:28:52 +0000
committerAndrew Mahone <andrew.mahone@gmail.com>2010-01-28 02:28:52 +0000
commite76f30a57c25a3ae762fc48218e57bc46dff4410 (patch)
treeb3ca05f49dab3bd6eb4f35af8714653515771cb0 /apps/codecs
parente18e8069304eefca5439d9b4e573429e2f600a2c (diff)
Improvements to specialized dividers for APE codec:
* Use Newton-Raphson divider on ARMv5e and ARMv6, about 7% speedup on Gigabeat S. * On ARMv4 targets using IRAM, remove insane filter buffer from IRAM, fill available IRAM with LUT of reciprocals for small divisors - speedup varies according to target and available IRAM, APE normal sample is approx. 109% RT on e200. * Rename apps/codecs/lib/udiv32_armv4.S to apps/codecs/lib/udiv32_arm.S, which includes dividers for all ARM targets specialized for APE. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24354 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs')
-rw-r--r--apps/codecs/demac/libdemac/demac_config.h4
-rw-r--r--apps/codecs/lib/SOURCES4
-rw-r--r--apps/codecs/lib/codeclib.h2
-rw-r--r--apps/codecs/lib/udiv32_arm.S319
-rw-r--r--apps/codecs/lib/udiv32_armv4.S134
5 files changed, 323 insertions, 140 deletions
diff --git a/apps/codecs/demac/libdemac/demac_config.h b/apps/codecs/demac/libdemac/demac_config.h
index 1bbdef3d56..1beda2b9cd 100644
--- a/apps/codecs/demac/libdemac/demac_config.h
+++ b/apps/codecs/demac/libdemac/demac_config.h
@@ -57,11 +57,11 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
#elif defined(CPU_S5L870X)
#define ICODE_SECTION_DEMAC_ARM .icode
#define ICODE_ATTR_DEMAC ICODE_ATTR
-#define IBSS_ATTR_DEMAC_INSANEBUF IBSS_ATTR
+#define IBSS_ATTR_DEMAC_INSANEBUF
#else
#define ICODE_SECTION_DEMAC_ARM .text
#define ICODE_ATTR_DEMAC
-#define IBSS_ATTR_DEMAC_INSANEBUF IBSS_ATTR
+#define IBSS_ATTR_DEMAC_INSANEBUF
#endif
#else /* !ROCKBOX */
diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES
index 3a741a5c81..ffbe1af92e 100644
--- a/apps/codecs/lib/SOURCES
+++ b/apps/codecs/lib/SOURCES
@@ -7,9 +7,7 @@ mdct_lookup.c
#ifdef CPU_ARM
mdct_arm.S
setjmp_arm.S
-#if ARM_ARCH == 4
-udiv32_armv4.S
-#endif
+udiv32_arm.S
#endif
#ifdef CPU_COLDFIRE
diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h
index 517264f3a5..926035f05e 100644
--- a/apps/codecs/lib/codeclib.h
+++ b/apps/codecs/lib/codeclib.h
@@ -65,7 +65,7 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con
extern void mdct_backward(int n, int32_t *in, int32_t *out);
-#if defined(CPU_ARM) && (ARM_ARCH == 4)
+#ifdef CPU_ARM
/* optimised unsigned integer division for ARM, placed in IRAM where available */
unsigned udiv32_arm(unsigned a, unsigned b);
#define UDIV32(a, b) udiv32_arm(a, b)
diff --git a/apps/codecs/lib/udiv32_arm.S b/apps/codecs/lib/udiv32_arm.S
new file mode 100644
index 0000000000..c46a09be5c
--- /dev/null
+++ b/apps/codecs/lib/udiv32_arm.S
@@ -0,0 +1,319 @@
+/***************************************************************************
+ * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___
+ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+ * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+ * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+ * \/ \/ \/ \/ \/
+ * $Id$
+ *
+ * Copyright (C) 2008 by Jens Arnold
+ * Copyright (C) 2009 by Andrew Mahone
+ *
+ * Optimised unsigned integer division for ARM (ARMv4, ARMv5E and ARMv6)
+ *
+ * Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
+ * Developer's Guide
+ * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
+ * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+ * Free Software Foundation, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+/* Codecs should not normally do this, but we need to check a macro, and
+ * codecs.h would confuse the assembler. */
+
+#ifdef USE_IRAM
+#define DIV_RECIP
+ .section .icode,"ax",%progbits
+#else
+ .text
+#endif
+ .align
+ .global udiv32_arm
+ .type udiv32_arm,%function
+
+#if ARM_ARCH < 5
+/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
+ for dividing a 30-bit value by a 15-bit value, with two operations per
+ iteration by storing quotient and remainder together and adding the previous
+ quotient bit during trial subtraction. Modified to work with any dividend
+ and divisor both less than 1 << 30, and skipping trials by calculating bits
+ in output.
+
+ Operands:
+ dividend   in:  numerator.
+ divisor    in:  the NEGATED divisor (udiv32_arm does rsbs on it before
+                 using this macro). It is shifted left during alignment, so
+                 the register is clobbered.
+ result     scratch: remainder in the upper part, quotient bits in the
+                 lower part while the steps run.
+ bits       scratch: number of quotient bits produced, 1 plus the total
+                 alignment shift.
+ curbit     scratch: 31 - bits, the number of unrolled steps to skip.
+ quotient   out: the low "bits" bits of result.
+ remainder  out: result shifted right by "bits".
+ The condition flags are clobbered. quotient/dividend and remainder/divisor
+ may name the same register, as they do in udiv32_arm. */
+.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
+
+ mov \bits, #1
+ /* Shift the divisor left until it aligns with the numerator. If it already
+ has the high bit set, this is fine, everything inside .rept will be
+ skipped, and the add before and adcs after will set the one-bit result
+ to zero.
+ Because the divisor is negated, each cmn adds it to the shifted
+ numerator: carry comes out set exactly when the shifted numerator is at
+ least as large as the (positive) divisor, i.e. when the divisor can be
+ moved up by that many bits. */
+ cmn \divisor, \dividend, lsr #16
+ movcs \divisor, \divisor, lsl #16
+ addcs \bits, \bits, #16
+ cmn \divisor, \dividend, lsr #8
+ movcs \divisor, \divisor, lsl #8
+ addcs \bits, \bits, #8
+ cmn \divisor, \dividend, lsr #4
+ movcs \divisor, \divisor, lsl #4
+ addcs \bits, \bits, #4
+ cmn \divisor, \dividend, lsr #2
+ movcs \divisor, \divisor, lsl #2
+ addcs \bits, \bits, #2
+ cmn \divisor, \dividend, lsr #1
+ movcs \divisor, \divisor, lsl #1
+ addcs \bits, \bits, #1
+ adds \result, \dividend, \divisor /* first trial subtraction; carry = first quotient bit */
+ subcc \result, \result, \divisor /* it borrowed: put the divisor back */
+ rsb \curbit, \bits, #31
+ add pc, pc, \curbit, lsl #3 /* computed jump: each unrolled step below is 2 instructions = 8 bytes */
+ nop /* padding: pc reads 8 bytes ahead, so curbit == 0 lands on the first step */
+ .rept 30
+ adcs \result, \divisor, \result, lsl #1
+ /* Fix the remainder portion of the result. This must be done because the
+ handler for 32-bit numerators needs the remainder. */
+ subcc \result, \result, \divisor
+ .endr
+ /* Shift remainder/quotient left one, add final quotient bit */
+ adc \result, \result, \result
+ mov \remainder, \result, lsr \bits
+ eor \quotient, \result, \remainder, lsl \bits /* clear the remainder part, leaving the quotient bits */
+.endm
+
+/* recip_max is the largest divisor served by the reciprocal lookup table
+ that follows udiv32_arm. The table holds one 32-bit word for every divisor
+ from 3 to recip_max, so the value is chosen per CPU.
+ DIV_RECIP is switched on for every USE_IRAM build, but a table size is only
+ known for the CPUs listed here. For any other CPU the reciprocal path is
+ switched off again, so that udiv32_arm falls back to the plain
+ shift-and-subtract divider instead of failing to assemble on an undefined
+ recip_max. */
+#ifdef CPU_PP
+#if CONFIG_CPU == PP5020
+.set recip_max, 5952
+#elif CONFIG_CPU == PP5002
+.set recip_max, 1472
+#else
+.set recip_max, 14208
+#endif
+#elif CONFIG_CPU == AS3525
+.set recip_max, 42752
+#elif CONFIG_CPU == S5L8701
+.set recip_max, 9600
+#elif CONFIG_CPU == S5L8700
+.set recip_max, 5504
+#else
+#undef DIV_RECIP
+#endif
+
+udiv32_arm: /* unsigned udiv32_arm(unsigned a, unsigned b)
+ In:  r0 = numerator, r1 = divisor.
+ Out: r0 = quotient.
+ r1, r2, r3, ip and the condition flags are used as scratch.
+ A zero divisor ends up at .L_div0. */
+#ifdef DIV_RECIP
+ cmp r1, #3
+ bcc .L_udiv_tiny /* divisors 0, 1 and 2 have no table entry */
+ cmp r1, #recip_max
+ bhi .L_udiv /* divisor beyond the table: use the shift-and-subtract divider */
+ adr r3, .L_udiv_recip_table-12 /* biased by 3 words: the first entry belongs to divisor 3 */
+ ldr r2, [r3, r1, lsl #2] /* r2 = table word for this divisor */
+ mov r3, r0 /* keep the numerator for the check below */
+ umull ip, r0, r2, r0 /* r0 = high word of (table word * numerator) = quotient estimate */
+ mul r2, r0, r1 /* r2 = estimate * divisor */
+ cmp r3, r2
+ bxcs lr /* numerator >= estimate * divisor: return the estimate */
+ sub r0, r0, #1 /* otherwise step the estimate down by one */
+ bx lr
+.L_udiv_tiny:
+ cmp r1, #1
+ movhi r0, r0, lsr #1 /* divisor 2: halve the numerator */
+ bxcs lr /* divisor 1 or 2: done (r0 is untouched for divisor 1) */
+ b .L_div0 /* divisor 0 */
+#endif
+.L_udiv:
+ /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
+ and add the next bit of the result. The correction code at .L_udiv32
+ does not need the divisor inverted, but can be modified to work with it,
+ and this allows the zero divisor test to be done early and without an
+ explicit comparison. */
+ rsbs r1, r1, #0
+#ifndef DIV_RECIP
+ beq .L_div0
+#endif
+ tst r0, r0 /* N flag = top bit of the numerator */
+ /* High bit must be unset, otherwise shift numerator right, calculate,
+ and correct results. As this case is very uncommon we want to avoid
+ any other delays on the main path in handling it, so the long divide
+ calls the short divide as a function. */
+ bmi .L_udiv32
+.L_udiv31:
+ ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
+ bx lr
+.L_udiv32:
+ /* Store the original numerator, the negated divisor and lr; they are
+ needed to correct the result. The store has no writeback, so the three
+ words sit just below sp and sp itself is unchanged.
+ NOTE(review): this relies on nothing writing below sp while .L_udiv31
+ runs - confirm for the target's interrupt handling. */
+ stmdb sp, { r0, r1, lr }
+ /* A zero divisor never gets here: it has already been diverted to
+ .L_div0, either by the beq above or by .L_udiv_tiny. Halve the numerator
+ so that its high bit is clear, then run the 31-bit divider as a
+ subroutine. */
+ mov r0, r0, lsr #1
+ bl .L_udiv31
+ ldmdb sp, { r2, r3, lr } /* r2 = original numerator, r3 = negated divisor */
+ /* Move the low bit of the original numerator to the carry bit */
+ movs r2, r2, lsr #1
+ /* Shift the remainder left one and add in the carry bit */
+ adc r1, r1, r1
+ /* Subtract the original divisor from the remainder, setting carry if the
+ result is non-negative */
+ adds r1, r1, r3
+ /* Shift quotient left one and add carry bit */
+ adc r0, r0, r0
+ bx lr
+.L_div0:
+ /* Division by zero: hand over to __div0.
+ __div0 expects the calling address on the top of the stack */
+ stmdb sp!, { lr }
+ mov r0, #0 /* r0 is zero when __div0 is entered */
+#if defined(__ARM_EABI__) || !defined(USE_IRAM)
+ bl __div0 /* NOTE(review): nothing sensible follows this call, so __div0 presumably does not return - confirm */
+#else
+ ldr pc, [pc, #-4] /* pc reads 8 bytes ahead, so this loads the word below and jumps to it */
+ .word __div0 /* absolute address of __div0 */
+#endif
+#ifdef DIV_RECIP
+.L_udiv_recip_table: /* Reciprocal table, generated at assembly time: one
+ 32-bit word for each divisor from 3 to recip_max.
+ A power-of-two divisor gets exactly 2^32 / div. Every other divisor gets
+ floor(2^32 / div) + 1. That value is built from 2^30 / div followed by two
+ hand-written long-division steps (double quotient and remainder, then move
+ one divisor's worth from remainder to quotient if it fits), presumably so
+ that no intermediate value needs more than 32 bits. */
+ .set div, 3
+ .rept recip_max - 2
+ .if (div - 1) & div /* non-zero unless div is a power of two */
+ .set q, 0x40000000 / div /* q = floor(2^30 / div) */
+ .set r, (0x40000000 - (q * div))<<1 /* remainder of that division, doubled */
+ .set q, q << 1
+ .if r >= div
+ .set q, q + 1
+ .set r, r - div
+ .endif /* here q = floor(2^31 / div) */
+ .set r, r << 1
+ .set q, q << 1
+ .if r >= div
+ .set q, q + 1
+ .set r, r - div
+ .endif /* here q = floor(2^32 / div) */
+ .set q, q + 1 /* round the reciprocal up by one */
+ .else
+ .set q, 0x40000000 / div * 4 /* power of two: 2^32 / div is exact */
+ .endif
+ .word q
+ .set div, div+1
+ .endr
+#endif
+ .size udiv32_arm, . - udiv32_arm
+
+#else
+/* Unsigned 32-bit division for ARMv5E and later, using a Newton-Raphson
+ refined reciprocal (needs clz and the smlawt / smlal multiplies, plus
+ smmla / smmul on ARMv6).
+ numerator  in:  dividend (clobbered).
+ divisor    in:  divisor (clobbered).
+ quotient   out: numerator / divisor; may be the same register as numerator.
+ bits, inv, neg   scratch registers.
+ div0label  optional label that is branched to when the divisor is zero.
+ Every path other than the branch to div0label ends in bx lr, so the macro
+ has to be the tail of a function, and .L_udiv_est_table must be defined in
+ the same section.
+ Outline: return 0 when numerator < divisor; use a plain shift for
+ power-of-two divisors (label 20); otherwise fetch an 8-bit reciprocal
+ estimate from .L_udiv_est_table, refine it, multiply by the numerator and
+ correct the resulting quotient. Label 10 handles divisors with fewer than
+ 7 leading zeros; labels 40 and 50 are the ARMv6 variants for numerators
+ with the top bit set. */
+.macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label
+ cmp \numerator, \divisor
+ clz \bits, \divisor
+ bcc 30f /* numerator < divisor: the quotient is 0 */
+ mov \inv, \divisor, lsl \bits /* normalise: move the top set bit of the divisor to bit 31 */
+ add \neg, pc, \inv, lsr #25 /* pc + top 7 bits of the normalised divisor (64..127 when bit 31 is set) */
+ cmp \inv, #1<<31
+ ldrhib \inv, [\neg, #.L_udiv_est_table-.-64] /* not a power of two: load the estimate; -64 rebases the index to 0..63 */
+ bls 20f /* eq: power of two, lo: divisor is zero */
+ subs \bits, \bits, #7
+ rsb \neg, \divisor, #0
+ movpl \divisor, \inv, lsl \bits
+ bmi 10f /* divisor has fewer than 7 leading zeros */
+ mul \inv, \divisor, \neg
+ smlawt \divisor, \divisor, \inv, \divisor
+ mul \inv, \divisor, \neg
+ /* This will save a cycle on ARMv6, but does not produce a correct result
+ if numerator sign bit is set. This case accounts for about 1 in 10^7 of
+ divisions done by the APE decoder, so we specialize for the more common
+ case and handle the uncommon large numerator separately */
+#if ARM_ARCH >= 6
+ tst \numerator, \numerator
+ smmla \divisor, \divisor, \inv, \divisor
+ bmi 40f
+ smmul \inv, \numerator, \divisor
+#else
+ mov \bits, #0
+ smlal \bits, \divisor, \divisor, \inv
+ umull \bits, \inv, \numerator, \divisor
+#endif
+ add \numerator, \numerator, \neg
+ mla \divisor, \inv, \neg, \numerator
+ mov \quotient, \inv
+ cmn \divisor, \neg
+ addcc \quotient, \quotient, #1
+ addpl \quotient, \quotient, #2
+ bx lr
+10:
+ rsb \bits, \bits, #0
+ sub \inv, \inv, #4
+ mov \divisor, \inv, lsr \bits
+#if ARM_ARCH >= 6
+ tst \numerator, \numerator
+ smmla \divisor, \divisor, \inv, \divisor
+ bmi 50f
+ smmul \inv, \numerator, \divisor
+#else
+ mov \bits, #0
+ smlal \bits, \divisor, \divisor, \inv
+ umull \bits, \inv, \numerator, \divisor
+#endif
+ mla \divisor, \inv, \neg, \numerator
+ mov \quotient, \inv
+ cmn \neg, \divisor, lsr #1
+ addcs \divisor, \divisor, \neg, lsl #1
+ addcs \quotient, \quotient, #2
+ cmn \neg, \divisor
+ addcs \quotient, \quotient, #1
+ bx lr
+20:
+ /* Power-of-two or zero divisor. Convert the leading-zero count into the
+ shift amount (31 - clz = log2 of the divisor). This has to happen whether
+ or not a div0label was supplied, so it is kept outside the .ifnc; rsb
+ does not touch the flags, so the result of the compare against 1<<31 is
+ still available to the bne. */
+ rsb \bits, \bits, #31
+.ifnc "", "\div0label"
+ bne \div0label
+.endif
+ mov \quotient, \numerator, lsr \bits
+ bx lr
+30:
+ mov \quotient, #0
+ bx lr
+#if ARM_ARCH >= 6
+40:
+ umull \bits, \inv, \numerator, \divisor
+ add \numerator, \numerator, \neg
+ mla \divisor, \inv, \neg, \numerator
+ mov \quotient, \inv
+ cmn \divisor, \neg
+ addcc \quotient, \quotient, #1
+ addpl \quotient, \quotient, #2
+ bx lr
+50:
+ umull \bits, \inv, \numerator, \divisor
+ mla \divisor, \inv, \neg, \numerator
+ mov \quotient, \inv
+ cmn \neg, \divisor, lsr #1
+ addcs \divisor, \divisor, \neg, lsl #1
+ addcs \quotient, \quotient, #2
+ cmn \neg, \divisor
+ addcs \quotient, \quotient, #1
+ bx lr
+#endif
+.endm
+
+udiv32_arm: /* unsigned udiv32_arm(unsigned a, unsigned b)
+ In:  r0 = numerator, r1 = divisor.
+ Out: r0 = quotient.
+ r1, r2, r3, ip and the condition flags are used as scratch. The macro
+ returns with bx lr on every path except a zero divisor, which branches to
+ .L_div0 below. */
+ ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0
+.L_div0:
+ /* Division by zero: hand over to __div0.
+ __div0 expects the calling address on the top of the stack */
+ stmdb sp!, { lr }
+ mov r0, #0 /* r0 is zero when __div0 is entered */
+#if defined(__ARM_EABI__) || !defined(USE_IRAM)
+ bl __div0 /* NOTE(review): only table data follows, so __div0 presumably does not return - confirm */
+#else
+ ldr pc, [pc, #-4] /* pc reads 8 bytes ahead, so this loads the word below and jumps to it */
+ .word __div0 /* absolute address of __div0 */
+#endif
+.L_udiv_est_table: /* 64 one-byte reciprocal estimates, read by the ldrhib
+ in ARMV5_UDIV32_BODY and indexed by the top 7 bits of the normalised
+ divisor minus 64. */
+ .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6
+ .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
+ .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
+ .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
+ .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
+ .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
+ .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
+ .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
+#endif
+ .size udiv32_arm, . - udiv32_arm
diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S
deleted file mode 100644
index c4aea14093..0000000000
--- a/apps/codecs/lib/udiv32_armv4.S
+++ /dev/null
@@ -1,134 +0,0 @@
-/***************************************************************************
- * __________ __ ___.
- * Open \______ \ ____ ____ | | _\_ |__ _______ ___
- * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
- * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
- * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
- * \/ \/ \/ \/ \/
- * $Id$
- *
- * Copyright (C) 2008 by Jens Arnold
- * Copyright (C) 2009 by Andrew Mahone
- *
- * Optimised unsigned integer division for ARMv4
- *
- * Based on: libgcc routines for ARM cpu.
- * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
- * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
- * Free Software Foundation, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
- * KIND, either express or implied.
- *
- ****************************************************************************/
-
-#include "config.h"
-/* Codecs should not normally do this, but we need to check a macro, and
- * codecs.h would confuse the assembler. */
-
-/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
- for dividing a 30-bit value by a 15-bit value, with two operations per
- iteration by storing quotient and remainder together and adding the previous
- quotient bit during trial subtraction. Modified to work with any dividend
- and divisor both less than 1 << 30, and skipping trials by calculating bits
- in output. */
-.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
-
- mov \bits, #1
- /* Shift the divisor left until it aligns with the numerator. If it already
- has the high bit set, this is fine, everything inside .rept will be
- skipped, and the add before and adcs after will set the one-bit result
- to zero. */
- cmn \divisor, \dividend, lsr #16
- movcs \divisor, \divisor, lsl #16
- addcs \bits, \bits, #16
- cmn \divisor, \dividend, lsr #8
- movcs \divisor, \divisor, lsl #8
- addcs \bits, \bits, #8
- cmn \divisor, \dividend, lsr #4
- movcs \divisor, \divisor, lsl #4
- addcs \bits, \bits, #4
- cmn \divisor, \dividend, lsr #2
- movcs \divisor, \divisor, lsl #2
- addcs \bits, \bits, #2
- cmn \divisor, \dividend, lsr #1
- movcs \divisor, \divisor, lsl #1
- addcs \bits, \bits, #1
- adds \result, \dividend, \divisor
- subcc \result, \result, \divisor
- rsb \curbit, \bits, #31
- add pc, pc, \curbit, lsl #3
- nop
- .rept 30
- adcs \result, \divisor, \result, lsl #1
- /* Fix the remainder portion of the result. This must be done because the
- handler for 32-bit numerators needs the remainder. */
- subcc \result, \result, \divisor
- .endr
- /* Shift remainder/quotient left one, add final quotient bit */
- adc \result, \result, \result
- mov \remainder, \result, lsr \bits
- eor \quotient, \result, \remainder, lsl \bits
-.endm
-
-#ifdef USE_IRAM
- .section .icode,"ax",%progbits
-#else
- .text
-#endif
- .align
- .global udiv32_arm
- .type udiv32_arm,%function
-
-udiv32_arm:
- /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
- and add the next bit of the result. The correction code at .L_udiv32
- does not need the divisor inverted, but can be modified to work with it,
- and this allows the zero divisor test to be done early and without an
- explicit comparison. */
- rsbs r1, r1, #0
- beq .L_div0
- tst r0, r0
- /* High bit must be unset, otherwise shift numerator right, calculate,
- and correct results. As this case is very uncommon we want to avoid
- any other delays on the main path in handling it, so the long divide
- calls the short divide as a function. */
- bmi .L_udiv32
-.L_udiv31:
- ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
- bx lr
-
-.L_udiv32:
- /* store original numerator and divisor, we'll need them to correct the
- result, */
- stmdb sp, { r0, r1, lr }
- /* Call __div0 here if divisor is zero, otherwise it would report the wrong
- address. */
- mov r0, r0, lsr #1
- bl .L_udiv31
- ldmdb sp, { r2, r3, lr }
- /* Move the low bit of the original numerator to the carry bit */
- movs r2, r2, lsr #1
- /* Shift the remainder left one and add in the carry bit */
- adc r1, r1, r1
- /* Subtract the original divisor from the remainder, setting carry if the
- result is non-negative */
- adds r1, r1, r3
- /* Shift quotient left one and add carry bit */
- adc r0, r0, r0
- bx lr
-.L_div0:
- /* __div0 expects the calling address on the top of the stack */
- stmdb sp!, { lr }
-#if defined(__ARM_EABI__) || !defined(USE_IRAM)
- bl __div0
-#else
- mov lr, pc
- bx r3
-#endif
- .size udiv32_arm, . - udiv32_arm