diff options
author | Andrew Mahone <andrew.mahone@gmail.com> | 2010-01-28 02:28:52 +0000 |
---|---|---|
committer | Andrew Mahone <andrew.mahone@gmail.com> | 2010-01-28 02:28:52 +0000 |
commit | e76f30a57c25a3ae762fc48218e57bc46dff4410 (patch) | |
tree | b3ca05f49dab3bd6eb4f35af8714653515771cb0 /apps/codecs | |
parent | e18e8069304eefca5439d9b4e573429e2f600a2c (diff) |
Improvements to specialized dividers for APE codec:
* Use a Newton-Raphson divider on ARMv5e and ARMv6, giving about a 7% speedup on Gigabeat S.
* On ARMv4 targets using IRAM, remove the "insane" filter buffer (IBSS_ATTR_DEMAC_INSANEBUF) from IRAM and fill the available IRAM with a LUT of reciprocals for small divisors. The speedup varies according to target and available IRAM; the APE normal sample decodes at approx. 109% realtime (RT) on e200.
* Rename apps/codecs/lib/udiv32_armv4.S to apps/codecs/lib/udiv32_arm.S, which includes dividers for all ARM targets specialized for APE.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24354 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs')
-rw-r--r-- | apps/codecs/demac/libdemac/demac_config.h | 4 | ||||
-rw-r--r-- | apps/codecs/lib/SOURCES | 4 | ||||
-rw-r--r-- | apps/codecs/lib/codeclib.h | 2 | ||||
-rw-r--r-- | apps/codecs/lib/udiv32_arm.S | 319 | ||||
-rw-r--r-- | apps/codecs/lib/udiv32_armv4.S | 134 |
5 files changed, 323 insertions, 140 deletions
diff --git a/apps/codecs/demac/libdemac/demac_config.h b/apps/codecs/demac/libdemac/demac_config.h index 1bbdef3d56..1beda2b9cd 100644 --- a/apps/codecs/demac/libdemac/demac_config.h +++ b/apps/codecs/demac/libdemac/demac_config.h @@ -57,11 +57,11 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA #elif defined(CPU_S5L870X) #define ICODE_SECTION_DEMAC_ARM .icode #define ICODE_ATTR_DEMAC ICODE_ATTR -#define IBSS_ATTR_DEMAC_INSANEBUF IBSS_ATTR +#define IBSS_ATTR_DEMAC_INSANEBUF #else #define ICODE_SECTION_DEMAC_ARM .text #define ICODE_ATTR_DEMAC -#define IBSS_ATTR_DEMAC_INSANEBUF IBSS_ATTR +#define IBSS_ATTR_DEMAC_INSANEBUF #endif #else /* !ROCKBOX */ diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES index 3a741a5c81..ffbe1af92e 100644 --- a/apps/codecs/lib/SOURCES +++ b/apps/codecs/lib/SOURCES @@ -7,9 +7,7 @@ mdct_lookup.c #ifdef CPU_ARM mdct_arm.S setjmp_arm.S -#if ARM_ARCH == 4 -udiv32_armv4.S -#endif +udiv32_arm.S #endif #ifdef CPU_COLDFIRE diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h index 517264f3a5..926035f05e 100644 --- a/apps/codecs/lib/codeclib.h +++ b/apps/codecs/lib/codeclib.h @@ -65,7 +65,7 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con extern void mdct_backward(int n, int32_t *in, int32_t *out); -#if defined(CPU_ARM) && (ARM_ARCH == 4) +#ifdef CPU_ARM /* optimised unsigned integer division for ARMv4, in IRAM */ unsigned udiv32_arm(unsigned a, unsigned b); #define UDIV32(a, b) udiv32_arm(a, b) diff --git a/apps/codecs/lib/udiv32_arm.S b/apps/codecs/lib/udiv32_arm.S new file mode 100644 index 0000000000..c46a09be5c --- /dev/null +++ b/apps/codecs/lib/udiv32_arm.S @@ -0,0 +1,319 @@ +/*************************************************************************** + * __________ __ ___. 
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2008 by Jens Arnold + * Copyright (C) 2009 by Andrew Mahone + * + * Optimised unsigned integer division for ARMv4 + * + * Based on: libgcc routines for ARM cpu, additional algorithms from ARM System + * Developer's Guide + * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk) + * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005 + * Free Software Foundation, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ****************************************************************************/ + +#include "config.h" +/* Codecs should not normally do this, but we need to check a macro, and + * codecs.h would confuse the assembler. */ + +#ifdef USE_IRAM +#define DIV_RECIP + .section .icode,"ax",%progbits +#else + .text +#endif + .align + .global udiv32_arm + .type udiv32_arm,%function + +#if ARM_ARCH < 5 +/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2) + for dividing a 30-bit value by a 15-bit value, with two operations per + iteration by storing quotient and remainder together and adding the previous + quotient bit during trial subtraction. Modified to work with any dividend + and divisor both less than 1 << 30, and skipping trials by calculating bits + in output. 
*/ +.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder + + mov \bits, #1 + /* Shift the divisor left until it aligns with the numerator. If it already + has the high bit set, this is fine, everything inside .rept will be + skipped, and the add before and adcs after will set the one-bit result + to zero. */ + cmn \divisor, \dividend, lsr #16 + movcs \divisor, \divisor, lsl #16 + addcs \bits, \bits, #16 + cmn \divisor, \dividend, lsr #8 + movcs \divisor, \divisor, lsl #8 + addcs \bits, \bits, #8 + cmn \divisor, \dividend, lsr #4 + movcs \divisor, \divisor, lsl #4 + addcs \bits, \bits, #4 + cmn \divisor, \dividend, lsr #2 + movcs \divisor, \divisor, lsl #2 + addcs \bits, \bits, #2 + cmn \divisor, \dividend, lsr #1 + movcs \divisor, \divisor, lsl #1 + addcs \bits, \bits, #1 + adds \result, \dividend, \divisor + subcc \result, \result, \divisor + rsb \curbit, \bits, #31 + add pc, pc, \curbit, lsl #3 + nop + .rept 30 + adcs \result, \divisor, \result, lsl #1 + /* Fix the remainder portion of the result. This must be done because the + handler for 32-bit numerators needs the remainder. 
*/ + subcc \result, \result, \divisor + .endr + /* Shift remainder/quotient left one, add final quotient bit */ + adc \result, \result, \result + mov \remainder, \result, lsr \bits + eor \quotient, \result, \remainder, lsl \bits +.endm + +#ifdef CPU_PP +#if CONFIG_CPU == PP5020 +.set recip_max, 5952 +#elif CONFIG_CPU == PP5002 +.set recip_max, 1472 +#else +.set recip_max, 14208 +#endif +#elif CONFIG_CPU == AS3525 +.set recip_max, 42752 +#elif CONFIG_CPU == S5L8701 +.set recip_max, 9600 +#elif CONFIG_CPU == S5L8700 +.set recip_max, 5504 +#endif + +udiv32_arm: +#ifdef DIV_RECIP + cmp r1, #3 + bcc .L_udiv_tiny + cmp r1, #recip_max + bhi .L_udiv + adr r3, .L_udiv_recip_table-12 + ldr r2, [r3, r1, lsl #2] + mov r3, r0 + umull ip, r0, r2, r0 + mul r2, r0, r1 + cmp r3, r2 + bxcs lr + sub r0, r0, #1 + bx lr +.L_udiv_tiny: + cmp r1, #1 + movhi r0, r0, lsr #1 + bxcs lr + b .L_div0 +#endif +.L_udiv: + /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor + and add the next bit of the result. The correction code at .L_udiv32 + does not need the divisor inverted, but can be modified to work with it, + and this allows the zero divisor test to be done early and without an + explicit comparison. */ + rsbs r1, r1, #0 +#ifndef DIV_RECIP + beq .L_div0 +#endif + tst r0, r0 + /* High bit must be unset, otherwise shift numerator right, calculate, + and correct results. As this case is very uncommon we want to avoid + any other delays on the main path in handling it, so the long divide + calls the short divide as a function. */ + bmi .L_udiv32 +.L_udiv31: + ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1 + bx lr +.L_udiv32: + /* store original numerator and divisor, we'll need them to correct the + result, */ + stmdb sp, { r0, r1, lr } + /* Call __div0 here if divisor is zero, otherwise it would report the wrong + address. 
*/ + mov r0, r0, lsr #1 + bl .L_udiv31 + ldmdb sp, { r2, r3, lr } + /* Move the low bit of the original numerator to the carry bit */ + movs r2, r2, lsr #1 + /* Shift the remainder left one and add in the carry bit */ + adc r1, r1, r1 + /* Subtract the original divisor from the remainder, setting carry if the + result is non-negative */ + adds r1, r1, r3 + /* Shift quotient left one and add carry bit */ + adc r0, r0, r0 + bx lr +.L_div0: + /* __div0 expects the calling address on the top of the stack */ + stmdb sp!, { lr } + mov r0, #0 +#if defined(__ARM_EABI__) || !defined(USE_IRAM) + bl __div0 +#else + ldr pc, [pc, #-4] + .word __div0 +#endif +#ifdef DIV_RECIP +.L_udiv_recip_table: + .set div, 3 + .rept recip_max - 2 + .if (div - 1) & div + .set q, 0x40000000 / div + .set r, (0x40000000 - (q * div))<<1 + .set q, q << 1 + .if r >= div + .set q, q + 1 + .set r, r - div + .endif + .set r, r << 1 + .set q, q << 1 + .if r >= div + .set q, q + 1 + .set r, r - div + .endif + .set q, q + 1 + .else + .set q, 0x40000000 / div * 4 + .endif + .word q + .set div, div+1 + .endr +#endif + .size udiv32_arm, . - udiv32_arm + +#else +.macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label + cmp \numerator, \divisor + clz \bits, \divisor + bcc 30f + mov \inv, \divisor, lsl \bits + add \neg, pc, \inv, lsr #25 + cmp \inv, #1<<31 + ldrhib \inv, [\neg, #.L_udiv_est_table-.-64] + bls 20f + subs \bits, \bits, #7 + rsb \neg, \divisor, #0 + movpl \divisor, \inv, lsl \bits + bmi 10f + mul \inv, \divisor, \neg + smlawt \divisor, \divisor, \inv, \divisor + mul \inv, \divisor, \neg + /* This will save a cycle on ARMv6, but does not produce a correct result + if numerator sign bit is set. 
This case accounts for about 1 in 10^7 of + divisions, done by the APE decoder, so we specialize for the more common + case and handle the uncommon large-numerator separately */ +#if ARM_ARCH >= 6 + tst \numerator, \numerator + smmla \divisor, \divisor, \inv, \divisor + bmi 40f + smmul \inv, \numerator, \divisor +#else + mov \bits, #0 + smlal \bits, \divisor, \divisor, \inv + umull \bits, \inv, \numerator, \divisor +#endif + add \numerator, \numerator, \neg + mla \divisor, \inv, \neg, \numerator + mov \quotient, \inv + cmn \divisor, \neg + addcc \quotient, \quotient, #1 + addpl \quotient, \quotient, #2 + bx lr +10: + rsb \bits, \bits, #0 + sub \inv, \inv, #4 + mov \divisor, \inv, lsr \bits +#if ARM_ARCH >= 6 + tst \numerator, \numerator + smmla \divisor, \divisor, \inv, \divisor + bmi 50f + smmul \inv, \numerator, \divisor +#else + mov \bits, #0 + smlal \bits, \divisor, \divisor, \inv + umull \bits, \inv, \numerator, \divisor +#endif + mla \divisor, \inv, \neg, \numerator + mov \quotient, \inv + cmn \neg, \divisor, lsr #1 + addcs \divisor, \divisor, \neg, lsl #1 + addcs \quotient, \quotient, #2 + cmn \neg, \divisor + addcs \quotient, \quotient, #1 + bx lr +20: +.ifnc "", "\div0label" + rsb \bits, \bits, #31 + bne \div0label +.endif + mov \quotient, \numerator, lsr \bits + bx lr +30: + mov \quotient, #0 + bx lr +#if ARM_ARCH >= 6 +40: + umull \bits, \inv, \numerator, \divisor + add \numerator, \numerator, \neg + mla \divisor, \inv, \neg, \numerator + mov \quotient, \inv + cmn \divisor, \neg + addcc \quotient, \quotient, #1 + addpl \quotient, \quotient, #2 + bx lr +50: + umull \bits, \inv, \numerator, \divisor + mla \divisor, \inv, \neg, \numerator + mov \quotient, \inv + cmn \neg, \divisor, lsr #1 + addcs \divisor, \divisor, \neg, lsl #1 + addcs \quotient, \quotient, #2 + cmn \neg, \divisor + addcs \quotient, \quotient, #1 + bx lr +#endif +.endm + +udiv32_arm: + ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0 +.L_div0: + /* __div0 expects the calling address on 
the top of the stack */ + stmdb sp!, { lr } + mov r0, #0 +#if defined(__ARM_EABI__) || !defined(USE_IRAM) + bl __div0 +#else + ldr pc, [pc, #-4] + .word __div0 +#endif +.L_udiv_est_table: + .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6 + .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf + .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc + .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac + .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f + .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93 + .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89 + .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81 +#endif + .size udiv32_arm, . - udiv32_arm diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S deleted file mode 100644 index c4aea14093..0000000000 --- a/apps/codecs/lib/udiv32_armv4.S +++ /dev/null @@ -1,134 +0,0 @@ -/*************************************************************************** - * __________ __ ___. - * Open \______ \ ____ ____ | | _\_ |__ _______ ___ - * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / - * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < - * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ - * \/ \/ \/ \/ \/ - * $Id$ - * - * Copyright (C) 2008 by Jens Arnold - * Copyright (C) 2009 by Andrew Mahone - * - * Optimised unsigned integer division for ARMv4 - * - * Based on: libgcc routines for ARM cpu. - * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk) - * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005 - * Free Software Foundation, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. 
- * - ****************************************************************************/ - -#include "config.h" -/* Codecs should not normally do this, but we need to check a macro, and - * codecs.h would confuse the assembler. */ - -/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2) - for dividing a 30-bit value by a 15-bit value, with two operations per - iteration by storing quotient and remainder together and adding the previous - quotient bit during trial subtraction. Modified to work with any dividend - and divisor both less than 1 << 30, and skipping trials by calculating bits - in output. */ -.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder - - mov \bits, #1 - /* Shift the divisor left until it aligns with the numerator. If it already - has the high bit set, this is fine, everything inside .rept will be - skipped, and the add before and adcs after will set the one-bit result - to zero. */ - cmn \divisor, \dividend, lsr #16 - movcs \divisor, \divisor, lsl #16 - addcs \bits, \bits, #16 - cmn \divisor, \dividend, lsr #8 - movcs \divisor, \divisor, lsl #8 - addcs \bits, \bits, #8 - cmn \divisor, \dividend, lsr #4 - movcs \divisor, \divisor, lsl #4 - addcs \bits, \bits, #4 - cmn \divisor, \dividend, lsr #2 - movcs \divisor, \divisor, lsl #2 - addcs \bits, \bits, #2 - cmn \divisor, \dividend, lsr #1 - movcs \divisor, \divisor, lsl #1 - addcs \bits, \bits, #1 - adds \result, \dividend, \divisor - subcc \result, \result, \divisor - rsb \curbit, \bits, #31 - add pc, pc, \curbit, lsl #3 - nop - .rept 30 - adcs \result, \divisor, \result, lsl #1 - /* Fix the remainder portion of the result. This must be done because the - handler for 32-bit numerators needs the remainder. 
*/ - subcc \result, \result, \divisor - .endr - /* Shift remainder/quotient left one, add final quotient bit */ - adc \result, \result, \result - mov \remainder, \result, lsr \bits - eor \quotient, \result, \remainder, lsl \bits -.endm - -#ifdef USE_IRAM - .section .icode,"ax",%progbits -#else - .text -#endif - .align - .global udiv32_arm - .type udiv32_arm,%function - -udiv32_arm: - /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor - and add the next bit of the result. The correction code at .L_udiv32 - does not need the divisor inverted, but can be modified to work with it, - and this allows the zero divisor test to be done early and without an - explicit comparison. */ - rsbs r1, r1, #0 - beq .L_div0 - tst r0, r0 - /* High bit must be unset, otherwise shift numerator right, calculate, - and correct results. As this case is very uncommon we want to avoid - any other delays on the main path in handling it, so the long divide - calls the short divide as a function. */ - bmi .L_udiv32 -.L_udiv31: - ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1 - bx lr - -.L_udiv32: - /* store original numerator and divisor, we'll need them to correct the - result, */ - stmdb sp, { r0, r1, lr } - /* Call __div0 here if divisor is zero, otherwise it would report the wrong - address. */ - mov r0, r0, lsr #1 - bl .L_udiv31 - ldmdb sp, { r2, r3, lr } - /* Move the low bit of the original numerator to the carry bit */ - movs r2, r2, lsr #1 - /* Shift the remainder left one and add in the carry bit */ - adc r1, r1, r1 - /* Subtract the original divisor from the remainder, setting carry if the - result is non-negative */ - adds r1, r1, r3 - /* Shift quotient left one and add carry bit */ - adc r0, r0, r0 - bx lr -.L_div0: - /* __div0 expects the calling address on the top of the stack */ - stmdb sp!, { lr } -#if defined(__ARM_EABI__) || !defined(USE_IRAM) - bl __div0 -#else - mov lr, pc - bx r3 -#endif - .size udiv32_arm, . - udiv32_arm |