3 files changed, 468 insertions, 369 deletions
diff --git a/apps/dsp.c b/apps/dsp.c
index be851e2305..3b95145b39 100644
--- a/apps/dsp.c
+++ b/apps/dsp.c
@@ -38,9 +38,14 @@
 #define WORD_FRACBITS       27
 
 #define NATIVE_DEPTH        16
+/* If the buffer sizes change, check the assembly code! */
 #define SAMPLE_BUF_COUNT    256
 #define RESAMPLE_BUF_COUNT  (256 * 4)   /* Enough for 11,025 Hz -> 44,100 Hz*/
 #define DEFAULT_GAIN        0x01000000
+#define SAMPLE_BUF_LEFT_CHANNEL 0
+#define SAMPLE_BUF_RIGHT_CHANNEL (SAMPLE_BUF_COUNT/2)
+#define RESAMPLE_BUF_LEFT_CHANNEL 0
+#define RESAMPLE_BUF_RIGHT_CHANNEL (RESAMPLE_BUF_COUNT/2)
 
 /* enums to index conversion properly with stereo mode and other settings */
 enum
@@ -66,11 +71,10 @@ enum
  * NOTE: Any assembly routines that use these structures must be updated
  * if current data members are moved or changed.
  */
-                                        /* 32-bit achitecture offset */
 struct resample_data
 {
-    long delta;                         /* 00h */
-    long phase;                         /* 04h */
+    uint32_t delta;                     /* 00h */
+    uint32_t phase;                     /* 04h */
     int32_t last_sample[2];             /* 08h */
                                         /* 10h */
 };
@@ -93,9 +97,10 @@ struct dsp_data
     int output_scale;                   /* 00h */
     int num_channels;                   /* 04h */
     struct resample_data resample_data; /* 08h */
-    int clip_min;                       /* 18h */
-    int clip_max;                       /* 2ch */
-                                        /* 30h */
+    int32_t clip_min;                   /* 18h */
+    int32_t clip_max;                   /* 1ch */
+    int32_t gain;                       /* 20h - Note that this is in S8.23 format. */ 
+                                        /* 24h */
 };
 
 /* No asm...yet */
@@ -132,13 +137,18 @@ struct eq_state
 #include <dsp_asm.h>
 
 /* Typedefs keep things much neater in this case */
-typedef int (*sample_input_fn_type)(int count, const char *src[],
-                                    int32_t *dst[]);    
+typedef void (*sample_input_fn_type)(int count, const char *src[],
+                                     int32_t *dst[]);    
 typedef int (*resample_fn_type)(int count, struct dsp_data *data,
                                 int32_t *src[], int32_t *dst[]);
 typedef void (*sample_output_fn_type)(int count, struct dsp_data *data,
                                       int32_t *src[], int16_t *dst);
+/* Single-DSP channel processing in place */
 typedef void (*channels_process_fn_type)(int count, int32_t *buf[]);
+/* DSP local channel processing in place */
+typedef void (*channels_process_dsp_fn_type)(int count, struct dsp_data *data,
+                                             int32_t *buf[]);
+
 
 /*
  ***************************************************************************/
@@ -152,16 +162,16 @@ struct dsp_config
     int  sample_bytes;
     int  stereo_mode;
     int  frac_bits;
-    long gain;          /* Note that this is in S8.23 format. */
     /* Functions that change depending upon settings - NULL if stage is
        disabled */
-    sample_input_fn_type        input_samples;
-    resample_fn_type            resample;
-    sample_output_fn_type       output_samples;
+    sample_input_fn_type         input_samples;
+    resample_fn_type             resample;
+    sample_output_fn_type        output_samples;
     /* These will be NULL for the voice codec and is more economical that
        way */
-    channels_process_fn_type    apply_crossfeed;
-    channels_process_fn_type    channels_process;
+    channels_process_dsp_fn_type apply_gain;
+    channels_process_fn_type     apply_crossfeed;
+    channels_process_fn_type     channels_process;
 };
 
 /* General DSP config */
@@ -211,7 +221,7 @@ static struct dsp_config *dsp IDATA_ATTR = audio_dsp;
  * of copying needed is minimized for that case.
  */
 
-static int32_t sample_buf[SAMPLE_BUF_COUNT] IBSS_ATTR;
+int32_t sample_buf[SAMPLE_BUF_COUNT] IBSS_ATTR;
 static int32_t resample_buf[RESAMPLE_BUF_COUNT] IBSS_ATTR;
 
 /* set a new dsp and return old one */
@@ -258,23 +268,20 @@ void sound_set_pitch(int permille)
     dsp_configure(DSP_SWITCH_FREQUENCY, dsp->codec_frequency);
 }
 
-/* Convert at most count samples to the internal format, if needed. Returns
- * number of samples ready for further processing. Updates src to point
- * past the samples "consumed" and dst is set to point to the samples to
- * consume. Note that for mono, dst[0] equals dst[1], as there is no point
- * in processing the same data twice.
+/* Convert count samples to the internal format, if needed.  Updates src
+ * to point past the samples "consumed" and dst is set to point to the
+ * samples to consume. Note that for mono, dst[0] equals dst[1], as there
+ * is no point in processing the same data twice.
  */
 
 /* convert count 16-bit mono to 32-bit mono */
-static int sample_input_lte_native_mono(
+static void sample_input_lte_native_mono(
     int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
-
     const int16_t *s = (int16_t *) src[0];
     const int16_t * const send = s + count;
-    int32_t *d = dst[0] = dst[1] = sample_buf;
-    const int scale = WORD_SHIFT;
+    int32_t *d = dst[0] = dst[1] = &sample_buf[SAMPLE_BUF_LEFT_CHANNEL];
+    int scale = WORD_SHIFT;
 
     do
     {
@@ -283,21 +290,17 @@ static int sample_input_lte_native_mono(
     while (s < send);
 
     src[0] = (char *)s;
-
-    return count;
 }
 
 /* convert count 16-bit interleaved stereo to 32-bit noninterleaved */
-static int sample_input_lte_native_i_stereo(
+static void sample_input_lte_native_i_stereo(
     int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
-
     const int32_t *s = (int32_t *) src[0];
     const int32_t * const send = s + count;
-    int32_t *dl = dst[0] = sample_buf;
-    int32_t *dr = dst[1] = sample_buf + SAMPLE_BUF_COUNT/2;
-    const int scale = WORD_SHIFT;
+    int32_t *dl = dst[0] = &sample_buf[SAMPLE_BUF_LEFT_CHANNEL];
+    int32_t *dr = dst[1] = &sample_buf[SAMPLE_BUF_RIGHT_CHANNEL];
+    int scale = WORD_SHIFT;
 
     do
     {
@@ -313,22 +316,18 @@ static int sample_input_lte_native_i_stereo(
     while (s < send);
 
     src[0] = (char *)s;
-
-    return count;
 }
 
 /* convert count 16-bit noninterleaved stereo to 32-bit noninterleaved */
-static int sample_input_lte_native_ni_stereo(
+static void sample_input_lte_native_ni_stereo(
     int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
-
     const int16_t *sl = (int16_t *) src[0];
     const int16_t *sr = (int16_t *) src[1];
     const int16_t * const slend = sl + count;
-    int32_t *dl = dst[0] = sample_buf;
-    int32_t *dr = dst[1] = sample_buf + SAMPLE_BUF_COUNT/2;
-    const int scale = WORD_SHIFT;
+    int32_t *dl = dst[0] = &sample_buf[SAMPLE_BUF_LEFT_CHANNEL];
+    int32_t *dr = dst[1] = &sample_buf[SAMPLE_BUF_RIGHT_CHANNEL];
+    int scale = WORD_SHIFT;
 
     do
     {
@@ -339,35 +338,24 @@ static int sample_input_lte_native_ni_stereo(
 
     src[0] = (char *)sl;
     src[1] = (char *)sr;
-
-    return count;
 }
 
 /* convert count 32-bit mono to 32-bit mono */
-static int sample_input_gt_native_mono(
+static void sample_input_gt_native_mono(
     int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
-
     dst[0] = dst[1] = (int32_t *)src[0];
     src[0] = (char *)(dst[0] + count);
-
-    return count;
 }
 
 /* convert count 32-bit interleaved stereo to 32-bit noninterleaved stereo */
-static int sample_input_gt_native_i_stereo(
+static void sample_input_gt_native_i_stereo(
     int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
-
     const int32_t *s = (int32_t *)src[0];
     const int32_t * const send = s + 2*count;
-    int32_t *dl = sample_buf;
-    int32_t *dr = sample_buf + SAMPLE_BUF_COUNT/2;
-
-    dst[0] = dl;
-    dst[1] = dr;
+    int32_t *dl = dst[0] = &sample_buf[SAMPLE_BUF_LEFT_CHANNEL];
+    int32_t *dr = dst[1] = &sample_buf[SAMPLE_BUF_RIGHT_CHANNEL];
 
     do
     {
@@ -377,22 +365,16 @@ static int sample_input_gt_native_i_stereo(
     while (s < send);
 
     src[0] = (char *)send;
-
-    return count;
 }
 
 /* convert 32 bit-noninterleaved stereo to 32-bit noninterleaved stereo */
-static int sample_input_gt_native_ni_stereo(
+static void sample_input_gt_native_ni_stereo(
     int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
-
     dst[0] = (int32_t *)src[0];
     dst[1] = (int32_t *)src[1];
     src[0] = (char *)(dst[0] + count);
     src[1] = (char *)(dst[1] + count);
-
-    return count;
 }
 
 /**
@@ -573,12 +555,6 @@ static void sample_output_new_format(void)
     dsp->output_samples = sample_output_functions[out];
 }
 
-static void resampler_set_delta(int frequency)
-{
-    dsp->data.resample_data.delta = (unsigned long) 
-        frequency * 65536LL / NATIVE_FREQUENCY;
-}
-
 /**
  * Linear interpolation resampling that introduces a one sample delay because
  * of our inability to look into the future at the end of a frame.
@@ -587,9 +563,9 @@ static void resampler_set_delta(int frequency)
 static int dsp_downsample(int count, struct dsp_data *data,
                           int32_t *src[], int32_t *dst[])
 {
-    int  ch = data->num_channels - 1;
-    long delta = data->resample_data.delta;
-    long phase, pos;
+    int ch = data->num_channels - 1;
+    uint32_t delta = data->resample_data.delta;
+    uint32_t phase, pos;
     int32_t *d;
 
     /* Rolled channel loop actually showed slightly faster. */
@@ -610,7 +586,7 @@ static int dsp_downsample(int count, struct dsp_data *data,
         if (pos > 0)
             last = s[pos - 1];
 
-        while (pos < count)
+        while (pos < (uint32_t)count)
         {
             *d++ = last + FRACMUL((phase & 0xffff) << 15, s[pos] - last);
             phase += delta;
@@ -625,12 +601,12 @@ static int dsp_downsample(int count, struct dsp_data *data,
     return d - dst[0];
 }
 
-static int dsp_upsample(int count,  struct dsp_data *data,
+static int dsp_upsample(int count, struct dsp_data *data,
                         int32_t *src[], int32_t *dst[])
 {
     int  ch = data->num_channels - 1;
-    long delta = data->resample_data.delta;
-    long phase, pos;
+    uint32_t delta = data->resample_data.delta;
+    uint32_t phase, pos;
     int32_t *d;
 
     /* Rolled channel loop actually showed slightly faster. */
@@ -653,7 +629,7 @@ static int dsp_upsample(int count,  struct dsp_data *data,
             pos = phase >> 16;
         }
 
-        while (pos < count)
+        while (pos < (uint32_t)count)
         {
             last = s[pos - 1];
             *d++ = last + FRACMUL((phase & 0xffff) << 15, s[pos] - last);
@@ -669,24 +645,43 @@ static int dsp_upsample(int count,  struct dsp_data *data,
 }
 #endif /* DSP_HAVE_ASM_RESAMPLING */
 
+static void resampler_new_delta(void)
+{
+    dsp->data.resample_data.delta = (unsigned long) 
+        dsp->frequency * 65536LL / NATIVE_FREQUENCY;
+
+    if (dsp->frequency == NATIVE_FREQUENCY)
+    {
+        /* NOTE: If fully glitch-free transitions from no resampling to
+           resampling are desired, last_sample history should be maintained
+           even when not resampling. */
+        dsp->resample = NULL;
+        dsp->data.resample_data.phase = 0;
+        dsp->data.resample_data.last_sample[0] = 0;
+        dsp->data.resample_data.last_sample[1] = 0;
+    }
+    else if (dsp->frequency < NATIVE_FREQUENCY)
+        dsp->resample = dsp_upsample;
+    else
+        dsp->resample = dsp_downsample;
+}
+
 /* Resample count stereo samples. Updates the src array, if resampling is
  * done, to refer to the resampled data. Returns number of stereo samples
  * for further processing.
  */
 static inline int resample(int count, int32_t *src[])
 {
-    if (dsp->resample)
+    int32_t *dst[2] =
     {
-        int32_t *dst[2] =
-        {
-            resample_buf,
-            resample_buf + RESAMPLE_BUF_COUNT/2,
-        };
+        &resample_buf[RESAMPLE_BUF_LEFT_CHANNEL],
+        &resample_buf[RESAMPLE_BUF_RIGHT_CHANNEL],
+    };
 
-        count = dsp->resample(count, &dsp->data, src, dst);
-        src[0] = dst[0];
-        src[1] = dst[dsp->data.num_channels - 1];
-    }
+    count = dsp->resample(count, &dsp->data, src, dst);
+
+    src[0] = dst[0];
+    src[1] = dst[dsp->data.num_channels - 1];
 
     return count;
 }
@@ -810,30 +805,59 @@ void dsp_set_crossfeed_cross_params(long lf_gain, long hf_gain, long cutoff)
     c[2] <<= 4;
 }
 
+/* Apply a constant gain to the samples (e.g., for ReplayGain).
+ * Note that this must be called before the resampler.
+ */
+#ifndef DSP_HAVE_ASM_APPLY_GAIN
+static void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
+{
+    const int32_t gain = data->gain;
+    int ch = data->num_channels - 1;
+
+    do
+    {
+        int32_t *s = buf[ch];
+        int32_t *d = buf[ch];
+        int32_t  samp = *s++;
+        int i = 0;
+
+        do
+        {
+            FRACMUL_8_LOOP(samp, gain, s, d);
+        }
+        while (++i < count);
+    }
+    while (--ch >= 0);
+}
+#endif /* DSP_HAVE_ASM_APPLY_GAIN */
+
 /* Combine all gains to a global gain. */
 static void set_gain(struct dsp_config *dsp)
 {
-    dsp->gain = DEFAULT_GAIN;
+    dsp->data.gain = DEFAULT_GAIN;
 
     /* Replay gain not relevant to voice */
     if (dsp == audio_dsp && replaygain)
     {
-        dsp->gain = replaygain;
+        dsp->data.gain = replaygain;
     }
     
     if (eq_enabled && eq_precut)
     {
-        dsp->gain = (long) (((int64_t) dsp->gain * eq_precut) >> 24);
+        dsp->data.gain =
+            (long) (((int64_t) dsp->data.gain * eq_precut) >> 24);
     }
     
-    if (dsp->gain == DEFAULT_GAIN)
+    if (dsp->data.gain == DEFAULT_GAIN)
     {
-        dsp->gain = 0;
+        dsp->data.gain = 0;
     }
     else
     {
-        dsp->gain >>= 1;
+        dsp->data.gain >>= 1;
     }
+
+    dsp->apply_gain = dsp->data.gain != 0 ? dsp_apply_gain : NULL;
 }
 
 /**
@@ -927,50 +951,6 @@ static void eq_process(int count, int32_t *buf[])
     }
 }
 
-/* Apply a constant gain to the samples (e.g., for ReplayGain). May update
- * the src array if gain was applied.
- * Note that this must be called before the resampler.
- */
-static void apply_gain(int count, int32_t *buf[])
-{
-    int32_t *sl, *sr;
-    int32_t s, *d;
-    long gain;
-    int i;
-
-    if (new_gain)
-    {
-        /* Gain has changed */
-        dsp_set_replaygain();
-        if (dsp->gain == 0)
-            return; /* No gain to apply now */
-    }
-
-    sl = buf[0], sr = buf[1];
-    gain = dsp->gain;
-
-    if (sl != sr)
-    {
-        d = &sample_buf[SAMPLE_BUF_COUNT / 2];
-        buf[1] = d;
-        s = *sr++;
-
-        for (i = 0; i < count; i++)
-            FRACMUL_8_LOOP(s, gain, sr, d);
-    }
-    else
-    {
-        buf[1] = &sample_buf[0];
-    }
-
-    d = &sample_buf[0];
-    buf[0] = d;
-    s = *sl++;
-
-    for (i = 0; i < count; i++)
-        FRACMUL_8_LOOP(s, gain, sl, d);
-}
-
 void dsp_set_stereo_width(int value)
 {
     long width, straight, cross;
@@ -993,35 +973,6 @@ void dsp_set_stereo_width(int value)
     dsp_sw_cross = cross << 8;
 }
 
-/**
- * Implements the different channel configurations and stereo width.
- */
-
-/* SOUND_CHAN_STEREO mode is a noop so has no function - just outline one for
- * completeness. */
-#if 0
-static void channels_process_sound_chan_stereo(int count, int32_t *buf[])
-{
-    /* The channels are each just themselves */
-    (void)count; (void)buf;
-}
-#endif
-
-#ifndef DSP_HAVE_ASM_SOUND_CHAN_MONO
-static void channels_process_sound_chan_mono(int count, int32_t *buf[])
-{
-    int32_t *sl = buf[0], *sr = buf[1];
-
-    do
-    {
-        int32_t lr = *sl/2 + *sr/2;
-        *sl++ = lr;
-        *sr++ = lr;
-    }
-    while (--count > 0);
-}
-#endif /* DSP_HAVE_ASM_SOUND_CHAN_MONO */
-
 #if CONFIG_CODEC == SWCODEC
 
 #ifdef HAVE_SW_TONE_CONTROLS
@@ -1063,6 +1014,35 @@ int dsp_callback(int msg, intptr_t param)
 }
 #endif
 
+/**
+ * Implements the different channel configurations and stereo width.
+ */
+
+/* SOUND_CHAN_STEREO mode is a noop so has no function - just outline one for
+ * completeness. */
+#if 0
+static void channels_process_sound_chan_stereo(int count, int32_t *buf[])
+{
+    /* The channels are each just themselves */
+    (void)count; (void)buf;
+}
+#endif
+
+#ifndef DSP_HAVE_ASM_SOUND_CHAN_MONO
+static void channels_process_sound_chan_mono(int count, int32_t *buf[])
+{
+    int32_t *sl = buf[0], *sr = buf[1];
+
+    do
+    {
+        int32_t lr = *sl/2 + *sr/2;
+        *sl++ = lr;
+        *sr++ = lr;
+    }
+    while (--count > 0);
+}
+#endif /* DSP_HAVE_ASM_SOUND_CHAN_MONO */
+
 #ifndef DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
 static void channels_process_sound_chan_custom(int count, int32_t *buf[])
 {
@@ -1151,30 +1131,47 @@ int dsp_process(char *dst, const char *src[], int count)
     coldfire_set_macsr(EMAC_FRACTIONAL | EMAC_SATURATE);
 #endif
 
+    if (new_gain)
+        dsp_set_replaygain(); /* Gain has changed */
+
+    /* Testing function pointers for NULL is preferred since the pointer
+       will be preloaded to be used for the call if not. */
     while (count > 0)
     {
-        samples = dsp->input_samples(count, src, tmp);
+        samples = MIN(SAMPLE_BUF_COUNT/2, count);
         count -= samples;
-        if (dsp->gain != 0)
-            apply_gain(samples, tmp);
-        if ((samples = resample(samples, tmp)) <= 0)
+
+        dsp->input_samples(samples, src, tmp);
+
+        if (dsp->apply_gain)
+            dsp->apply_gain(samples, &dsp->data, tmp);
+
+        if (dsp->resample && (samples = resample(samples, tmp)) <= 0)
             break; /* I'm pretty sure we're downsampling here */
+
         if (dsp->apply_crossfeed)
             dsp->apply_crossfeed(samples, tmp);
+
         /* TODO: EQ and tone controls need separate structs for audio and voice
          * DSP processing thanks to filter history. isn't really audible now, but
-         * might be the day we start handling voice more delicately.
+         * might be the day we start handling voice more delicately. Planned
+         * changes may well run all relevant channels through the same EQ so
+         * perhaps not.
          */
         if (eq_enabled)
             eq_process(samples, tmp);
+
 #ifdef HAVE_SW_TONE_CONTROLS
         if ((bass | treble) != 0)
             eq_filter(tmp, &tone_filter, samples, dsp->data.num_channels,
                       FILTER_BISHELF_SHIFT);
 #endif
+
         if (dsp->channels_process)
             dsp->channels_process(samples, tmp);
+
         dsp->output_samples(samples, &dsp->data, tmp, (int16_t *)dst);
+
         written += samples;
         dst += samples * sizeof (int16_t) * 2;
         yield();
@@ -1245,9 +1242,6 @@ bool dsp_configure(int setting, intptr_t value)
         if (dsp == audio_dsp)
         {
             *var = value;
-            /* In case current gain is zero, force at least one call
-               to apply_gain or apply_gain won't pick up on new_gain */
-            audio_dsp->gain = -1;
             new_gain = true;
         }
     }
@@ -1282,15 +1276,7 @@ bool dsp_configure(int setting, intptr_t value)
         else
             dsp->frequency = dsp->codec_frequency;
 
-        resampler_set_delta(dsp->frequency);
-
-        if (dsp->frequency == NATIVE_FREQUENCY)
-            dsp->resample = NULL;
-        else if (dsp->frequency < NATIVE_FREQUENCY)
-            dsp->resample = dsp_upsample;
-        else
-            dsp->resample = dsp_downsample;
-
+        resampler_new_delta();
         break;
 
     case DSP_SET_SAMPLE_DEPTH:
@@ -1348,7 +1334,7 @@ bool dsp_configure(int setting, intptr_t value)
     case DSP_FLUSH:
         memset(&dsp->data.resample_data, 0,
                sizeof (dsp->data.resample_data));
-        resampler_set_delta(dsp->frequency);
+        resampler_new_delta();
         dither_init();
         break;
 
diff --git a/apps/dsp_asm.h b/apps/dsp_asm.h
index f8df337b37..14875d21d8 100644
--- a/apps/dsp_asm.h
+++ b/apps/dsp_asm.h
@@ -22,32 +22,61 @@
 #ifndef _DSP_ASM_H
 #define _DSP_ASM_H
 
+/* Set the appropriate #defines based on CPU or whatever matters */
 #ifndef SIMULATOR
 
-#if defined(CPU_COLDFIRE) || defined(CPU_ARM)
+#if defined(CPU_ARM)
+#define DSP_HAVE_ASM_RESAMPLING
 #define DSP_HAVE_ASM_CROSSFEED
-void apply_crossfeed(int count, int32_t *buf[]);
+#elif defined (CPU_COLDFIRE)
+#define DSP_HAVE_ASM_APPLY_GAIN
 #define DSP_HAVE_ASM_RESAMPLING
-int dsp_downsample(int count, struct dsp_data *data, int32_t *src[], int32_t *dst[]);
-int dsp_upsample(int count, struct dsp_data *data, int32_t *src[], int32_t *dst[]);
-#endif /* defined(CPU_COLDFIRE) || defined(CPU_ARM) */
-
-#if defined (CPU_COLDFIRE)
+#define DSP_HAVE_ASM_CROSSFEED
 #define DSP_HAVE_ASM_SOUND_CHAN_MONO
-void channels_process_sound_chan_mono(int count, int32_t *buf[]);
 #define DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
-void channels_process_sound_chan_custom(int count, int32_t *buf[]);
 #define DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
-void channels_process_sound_chan_karaoke(int count, int32_t *buf[]);
-
 #define DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
-void sample_output_mono(int count, struct dsp_data *data,
-                        int32_t *src[], int16_t *dst);
 #define DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
-void sample_output_stereo(int count, struct dsp_data *data,
-                          int32_t *src[], int16_t *dst);
 #endif /* CPU_COLDFIRE */
 
 #endif /* SIMULATOR */
 
+/* Declare prototypes based upon what's #defined above */
+#ifdef DSP_HAVE_ASM_CROSSFEED
+void apply_crossfeed(int count, int32_t *buf[]);
+#endif
+
+#ifdef DSP_HAVE_ASM_APPLY_GAIN
+void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_APPLY_GAIN */
+
+#ifdef DSP_HAVE_ASM_RESAMPLING
+int dsp_upsample(int count, struct dsp_data *data,
+                 int32_t *src[], int32_t *dst[]);
+int dsp_downsample(int count, struct dsp_data *data,
+                   int32_t *src[], int32_t *dst[]);
+#endif /* DSP_HAVE_ASM_RESAMPLING */
+
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_MONO
+void channels_process_sound_chan_mono(int count, int32_t *buf[]);
+#endif
+
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
+void channels_process_sound_chan_custom(int count, int32_t *buf[]);
+#endif
+
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
+void channels_process_sound_chan_karaoke(int count, int32_t *buf[]);
+#endif
+
+#ifdef DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
+void sample_output_stereo(int count, struct dsp_data *data,
+                          int32_t *src[], int16_t *dst);
+#endif
+
+#ifdef DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
+void sample_output_mono(int count, struct dsp_data *data,
+                        int32_t *src[], int16_t *dst);
+#endif
+
 #endif /* _DSP_ASM_H */
diff --git a/apps/dsp_cf.S b/apps/dsp_cf.S
index af9ac1fa4b..e5d3ee8c55 100644
--- a/apps/dsp_cf.S
+++ b/apps/dsp_cf.S
@@ -19,68 +19,117 @@
  ****************************************************************************/
 
 /****************************************************************************
- * void apply_crossfeed(int count, int32_t *src[])
+ * void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
  */
     .section    .text
+	.align      2
+    .global     dsp_apply_gain
+dsp_apply_gain:
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
+    movem.l     28(%sp), %a0-%a1        | %a0 = data,
+                                        | %a1 = buf
+	move.l      4(%a0), %d1             | %d1 = data->num_channels
+    move.l      32(%a0), %a0            | %a0 = data->gain (in s8.23)
+10: | channel loop                      |
+	move.l      24(%sp), %d0            | %d0 = count
+    move.l      -4(%a1, %d1.l*4), %a2   | %a2 = s = buf[ch-1]
+    move.l      %a2, %a3                | %a3 = d = s
+    move.l      (%a2)+, %d2             | %d2 = *s++,
+    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
+    subq.l      #1, %d0                 | --count > 0 ? : effectively n++
+    ble.b       30f | loop done         | no? finish up
+20: | loop                              |
+    move.l      %accext01, %d4          | fetch S(n-1)[7:0]
+    movclr.l    %acc0, %d3              | fetch S(n-1)[40:8] in %d3[31:0]
+    asl.l       #8, %d3                 | *d++ = (S(n-1)[40:8] << 8) | S(n-1)[7:0]
+    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
+    move.b      %d4, %d3                |
+    move.l      %d3, (%a3)+             |
+    subq.l      #1, %d0                 | --count > 0 ? : effectively n++
+    bgt.b       20b | loop              | yes? do more samples
+30: | loop done                         |
+    move.l      %accext01, %d4          | fetch S(n-1)[7:0]
+    movclr.l    %acc0, %d3              | fetch S(n-1)[40:8] in %d3[31:0]
+    asl.l       #8, %d3                 | *d = (S(n-1)[40:8] << 8) | S(n-1)[7:0]
+    move.b      %d4, %d3                |
+    move.l      %d3, (%a3)              |
+	subq.l      #1, %d1                 | next channel
+	bgt.b       10b | channel loop      |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup stack
+    rts                                 |
+    .size       dsp_apply_gain,.-dsp_apply_gain
+
+/****************************************************************************
+ * void apply_crossfeed(int count, int32_t *buf[])
+ */
+    .section    .text
+        .align      2
     .global     apply_crossfeed 
 apply_crossfeed:
-    lea.l       -44(%sp), %sp
+    lea.l       -44(%sp), %sp           |
     movem.l     %d2-%d7/%a2-%a6, (%sp)  | save all regs
     movem.l     48(%sp), %d7/%a4        | %d7 = count, %a4 = src
     movem.l     (%a4), %a4-%a5          | %a4 = src[0], %a5 = src[1]
-    lea.l       crossfeed_data, %a1
-    move.l      (%a1)+, %a6             | a6 = direct gain
+    lea.l       crossfeed_data, %a1     | %a1 = &crossfeed_data
+    move.l      (%a1)+, %d6             | %d6 = direct gain
     movem.l     12(%a1), %d0-%d3        | fetch filter history samples
     move.l      132(%a1), %a0           | fetch delay line address
     movem.l     (%a1), %a1-%a3          | load filter coefs
+    lea.l       crossfeed_data+136, %a6 | %a6 = delay line wrap limit
+    bra.b       20f | loop start        | go to loop start point
     /* Register usage in loop:
      * %a0 = delay_p, %a1..%a3 = b0, b1, a1 (filter coefs),
-     * %a4 = src[0], %a5 = src[1], %a6 = direct gain,
+     * %a4 = buf[0], %a5 = buf[1],
+     * %a6 = delay line pointer wrap limit,
      * %d0..%d3 = history
-     * %d4..%d6 = temp.
+     * %d4..%d5 = temp.
+     * %d6 = direct gain,
      * %d7 = count
      */
-.cfloop:
-    mac.l       %a2, %d0, 4(%a0), %d0, %acc0 | acc  = b1*dr[n - 1] d0 = dr[n]
-    mac.l       %a1, %d0             , %acc0 | acc += b0*dr[n]
-    mac.l       %a3, %d1,  (%a4), %d4, %acc0 | acc += a1*y_l[n - 1], load L
-    move.l      %acc0, %d1              | get filtered delayed sample
-    mac.l       %a6, %d4, %acc0         | acc += gain*x_l[n]
-    movclr.l    %acc0, %d6              |
-    move.l      %d6, (%a4)+             | write result
-
-    mac.l       %a2, %d2, (%a0), %d2, %acc0 | acc  = b1*dl[n - 1], d2 = dl[n]
-    mac.l       %a1, %d2            , %acc0 | acc += b0*dl[n]
-    mac.l       %a3, %d3, (%a5), %d5, %acc0 | acc += a1*y_r[n - 1], load R
-    movem.l     %d4-%d5, (%a0)          | save left & right inputs to delay line
-    move.l      %acc0, %d3              | get filtered delayed sample
-    mac.l       %a6, %d5, %acc0         | acc += gain*x_r[n]
-    lea.l       8(%a0), %a0             | increment delay pointer
-    movclr.l    %acc0, %d6              |
-    move.l      %d6, (%a5)+             | write result
-
-    cmpa.l      #crossfeed_data+136, %a0| wrap a0 if passed end
-    bge.b       .cfwrap                 |
-    .word       0x51fb                  | tpf.l - trap the buffer wrap
-.cfwrap:
-    lea.l       -104(%a0), %a0          | wrap
-    subq.l      #1, %d7                 | --count < 0 ?
-    bgt.b       .cfloop                 |
+10: | loop                              |
+    movclr.l    %acc0, %d4              | write outputs
+    move.l      %d4, (%a4)+             | .
+    movclr.l    %acc1, %d5              | .
+    move.l      %d5, (%a5)+             | .
+20: | loop start                        |
+    mac.l       %a2, %d0, (%a0)+, %d0, %acc0 | %acc0  = b1*dl[n - 1], %d0 = dl[n]
+    mac.l       %a1, %d0             , %acc0 | %acc0 += b0*dl[n]
+    mac.l       %a3, %d1, (%a5),  %d5, %acc0 | %acc0 += a1*y_r[n - 1], load R
+    mac.l       %a2, %d2, (%a0)+, %d2, %acc1 | %acc1  = b1*dr[n - 1], %d2 = dr[n]
+    mac.l       %a1, %d2             , %acc1 | %acc1 += b0*dr[n]
+    mac.l       %a3, %d3, (%a4),  %d4, %acc1 | %acc1 += a1*y_l[n - 1], load L
+    movem.l     %d4-%d5, -8(%a0)        | save left & right inputs to delay line
+    move.l      %acc0, %d3              | get filtered delayed left sample (y_l[n])
+    move.l      %acc1, %d1              | get filtered delayed right sample (y_r[n])
+    mac.l       %d6, %d4, %acc0         | %acc0 += gain*x_l[n]
+    mac.l       %d6, %d5, %acc1         | %acc1 += gain*x_r[n]
+    cmp.l       %a6, %a0                | wrap %a0 if passed end
+    bhs.b       30f | wrap buffer       |
+    .word       0x51fb | tpf.l          | trap the buffer wrap
+30: | wrap buffer                       | ...fwd taken branches more costly
+    lea.l       -104(%a0), %a0          | wrap it up
+    subq.l      #1, %d7                 | --count > 0 ?
+    bgt.b       10b | loop              | yes? do more
+    movclr.l    %acc0, %d4              | write last outputs
+    move.l      %d4, (%a4)              | .
+    movclr.l    %acc1, %d5              | .
+    move.l      %d5, (%a5)              | .
     lea.l       crossfeed_data+16, %a1  | save data back to struct
     movem.l     %d0-%d3, (%a1)          | ...history
     move.l      %a0, 120(%a1)           | ...delay_p
     movem.l     (%sp), %d2-%d7/%a2-%a6  | restore all regs
-    lea.l       44(%sp), %sp
-    rts
-.cfend:
-    .size       apply_crossfeed,.cfend-apply_crossfeed
-
+    lea.l       44(%sp), %sp            |
+    rts                                 |
+    .size       apply_crossfeed,.-apply_crossfeed 
 
 /****************************************************************************
  * int dsp_downsample(int count, struct dsp_data *data,
  *                    in32_t *src[], int32_t *dst[])
  */
     .section    .text
+	.align      2
     .global     dsp_downsample
 dsp_downsample:
     lea.l       -40(%sp), %sp           | save non-clobberables
@@ -92,7 +141,7 @@ dsp_downsample:
     movem.l     4(%a0), %d3-%d4         | %d3 = ch = data->num_channels
                                         | %d4 = delta = data->resample_data.delta
     moveq.l     #16, %d7                | %d7 = shift
-.dschannel_loop:
+10: | channel loop                      |
     move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
     move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
     move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
@@ -102,15 +151,15 @@ dsp_downsample:
     move.l      %d5, %d6                | %d6 = pos = phase >> 16
     lsr.l       %d7, %d6                |
     cmp.l       %d2, %d6                | past end of samples?
-    bge.b       .dsloop_skip            | yes? skip loop
+    bge.b       40f | skip resample loop| yes? skip loop
     tst.l       %d6                     | need last sample of prev. frame?
-    bne.b       .dsloop                 | no? start main loop
+    bne.b       20f | resample loop     | no? start main loop
     move.l      (%a3, %d6.l*4), %d1     | %d1 = s[pos]
-    bra.b       .dsuse_last_start       | start with last (last in %d0)
-.dsloop:
+    bra.b       30f | resample start last | start with last (last in %d0)
+20: | resample loop                     |
     lea.l       -4(%a3, %d6.l*4), %a5   | load s[pos-1] and s[pos]
     movem.l     (%a5), %d0-%d1          |
-.dsuse_last_start:
+30: | resample start last               |
     sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
     move.l      %d0, %acc0              | %acc0 = previous sample
     move.l      %d5, %d0                | frac = (phase << 16) >> 1
@@ -123,11 +172,11 @@ dsp_downsample:
     movclr.l    %acc0, %d0              |
     move.l      %d0, (%a4)+             | *d++ = %d0
     cmp.l       %d2, %d6                | pos < count?
-    blt.b       .dsloop                 | yes? continue resampling
-.dsloop_skip:
+    blt.b       20b | resample loop     | yes? continue resampling
+40: | skip resample loop                |
     subq.l      #1, %d3                 | ch > 0?
-    bgt.b       .dschannel_loop         | yes? process next channel
-    asl.l       %d7, %d2                | wrap phase to start of next frame
+    bgt.b       10b | channel loop      | yes? process next channel
+    lsl.l       %d7, %d2                | wrap phase to start of next frame
     sub.l       %d2, %d5                | data->resample_data.phase =
     move.l      %d5, 12(%a0)            | ... phase - (count << 16)
     move.l      %a4, %d0                | return d - d[0]
@@ -136,14 +185,14 @@ dsp_downsample:
     movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
     lea.l       40(%sp), %sp            | cleanup stack
     rts                                 | buh-bye
-.dsend:
-    .size       dsp_downsample,.dsend-dsp_downsample
+    .size       dsp_downsample,.-dsp_downsample
 
 /****************************************************************************
  * int dsp_upsample(int count, struct dsp_data *dsp,
- *                  in32_t *src[], int32_t *dst[])
+ *                  int32_t *src[], int32_t *dst[])
  */
     .section    .text
+	.align      2
     .global     dsp_upsample
 dsp_upsample:
     lea.l       -40(%sp), %sp           | save non-clobberables
@@ -154,47 +203,55 @@ dsp_upsample:
                                         | %a2 = dst
     movem.l      4(%a0), %d3-%d4        | %d3 = ch = channels
                                         | %d4 = delta = data->resample_data.delta
-    swap        %d4                     | swap delta to high word to use
-                                        | carries to increment position
-.uschannel_loop:
+    swap        %d4                     | swap delta to high word to use...
+                                        | ...carries to increment position
+10: | channel loop                      |
     move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
     move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
     lea.l       12(%a0, %d3.l*4), %a4   | %a4 = &data->resample_data.last_sample[ch-1]
-    lea.l       (%a3, %d2.l*4), %a5     | %a5 = src_end = &src[count]
+    lea.l       -4(%a3, %d2.l*4), %a5   | %a5 = src_end = &src[count-1]
     move.l      (%a4), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
-    move.l      -(%a5), (%a4)           | data->resample_data.last_sample[ch-1] = s[count-1]
+    move.l      (%a5), (%a4)            | data->resample_data.last_sample[ch-1] = s[count-1]
     move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
+    move.l      (%a3)+, %d1             | fetch first sample - might throw this...
+                                        | ...away later but we'll be preincremented
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[0] - last
     swap        %d5                     | swap phase to high word to use
                                         | carries to increment position
-    move.l      %d5, %d6                | %d6 = pos = phase >> 16
+    move.l      %d5, %d7                | %d7 = pos = phase >> 16
     clr.w       %d5                     |
-    eor.l       %d5, %d6                | pos == 0?
-    beq.b       .usstart_0              | no? transistion from down
-    cmp.l       %d2, %d6                | past end of samples?
-    bge.b       .usloop_skip            | yes? skip loop
-    lea.l       -4(%a3, %d6.l*4), %a3   | %a3 = s = &s[pos-1] (previous)
-    move.l      (%a3)+, %d0             | %d0 = *s++
-    .word       0x51fa                  | tpf.w - trap next instruction
-.usloop_1:
+    eor.l       %d5, %d7                | pos == 0?
+    beq.b       40f | loop start        | yes? start loop
+    cmp.l       %d2, %d7                | past end of samples?
+    bge.b       50f | skip resample loop| yes? go to next channel and collect info
+    lea.l       (%a3, %d7.l*4), %a3     | %a3 = s = &s[pos+1]
+	movem.l     -8(%a3), %d0-%d1        | %d0 = s[pos-1], %d1 = s[pos]
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
+	bra.b       40f | loop start        |
+20: | next sample loop                  |
     move.l      %d6, %d0                | move previous sample to %d0
-.usstart_0:
     move.l      (%a3)+, %d1             | fetch next sample
     move.l      %d1, %d6                | save sample value
     sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
-.usloop_0:
+30: | same sample loop                  |
+    movclr.l    %acc0, %d7              | %d7 = result
+    move.l      %d7, (%a4)+             | *d++ = %d7
+40: | loop start                        |
     lsr.l       #1, %d5                 | make phase into frac
+    move.l      %d0, %acc0              | %acc0 = s[pos-1]
     mac.l       %d1, %d5, %acc0         | %acc0 = diff * frac
     lsl.l       #1, %d5                 | restore frac to phase
-    movclr.l    %acc0, %d7              | %d7 = product
-    add.l       %d0, %d7                | %d7 = last + product
-    move.l      %d7, (%a4)+             | *d++ = %d7
     add.l       %d4, %d5                | phase += delta
-    bcc.b       .usloop_0               | load next values?
+    bcc.b       30b | same sample loop  | load next values?
     cmp.l       %a5, %a3                | src <= src_end?
-    ble.b       .usloop_1               | yes? continue resampling
-.usloop_skip:
+    bls.b       20b | next sample loop  | yes? continue resampling
+    movclr.l    %acc0, %d7              | %d7 = result
+    move.l      %d7, (%a4)+             | *d++ = %d7
+50: | skip resample loop                |
     subq.l      #1, %d3                 | ch > 0?
-    bgt.b       .uschannel_loop         | yes? process next channel
+    bgt.b       10b | channel loop      | yes? process next channel
     swap        %d5                     | wrap phase to start of next frame
     move.l      %d5, 12(%a0)            | ...and save in data->resample_data.phase
     move.l      %a4, %d0                | return d - d[0]
@@ -203,12 +260,7 @@ dsp_upsample:
     asr.l       #2, %d0                 | convert bytes->samples
     lea.l       40(%sp), %sp            | cleanup stack
     rts                                 | buh-bye
-.usend:
-    .size       dsp_upsample,.usend-dsp_upsample
-
-/* These routines might benefit from burst transfers but we'll keep them
- * small for now since they're rather light weight
- */
+    .size       dsp_upsample,.-dsp_upsample
 
 /****************************************************************************
  * void channels_process_sound_chan_mono(int count, int32_t *buf[])
@@ -216,31 +268,39 @@ dsp_upsample:
  * Mix left and right channels 50/50 into a center channel.
  */
     .section    .text
+	.align      2
     .global     channels_process_sound_chan_mono
 channels_process_sound_chan_mono:
     movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
-    lea.l       -12(%sp), %sp           | save registers
-    move.l      %macsr, %d1             |
-    movem.l     %d1-%d3, (%sp)          |
-    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  | %macsr is no longer saved or set here
     movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
     move.l      #0x40000000, %d3        | %d3 = 0.5
-1:
-    move.l     (%a0), %d1               | L = R = l/2 + r/2
-    mac.l      %d1, %d3, (%a1), %d2, %acc0 |
-    mac.l      %d2, %d3, %acc0          |
-    movclr.l   %acc0, %d1               |
-    move.l     %d1, (%a0)+              | output to original buffer
-    move.l     %d1, (%a1)+              |
-    subq.l     #1, %d0                  |
-    bgt.s      1b                       |
-    movem.l    (%sp), %d1-%d3           | restore registers
-    move.l     %d1, %macsr              |
-    lea.l      12(%sp), %sp             | cleanup
-    rts
-.cpmono_end:
-    .size       channels_process_sound_chan_mono, .cpmono_end-channels_process_sound_chan_mono
-
+    move.l      (%a0)+, %d1             | prime the input registers: %d1 = first l
+    move.l      (%a1)+, %d2             | ...and %d2 = first r
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | %acc0 += l/2, %d1 = next l
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc0 | %acc0 += r/2, %d2 = next r
+    subq.l      #1, %d0                 | --count > 0 ?
+    ble.s       20f | loop done         | no? only the primed sample is left
+10: | loop                              |
+    movclr.l    %acc0, %d4              | L = R = l/2 + r/2
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | start next sum, %d1 = following l
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc0 | finish next sum, %d2 = following r
+    move.l      %d4, (%a2)+             | output to original buffer
+    move.l      %d4, (%a3)+             | ...same value to both channels
+    subq.l      #1, %d0                 | --count > 0 ?
+    bgt.s       10b | loop              | yes? do more
+20: | loop done                         |
+    movclr.l    %acc0, %d4              | output last sample
+    move.l      %d4, (%a2)              |
+    move.l      %d4, (%a3)              |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_mono, \
+                .-channels_process_sound_chan_mono
 
 /****************************************************************************
  * void channels_process_sound_chan_custom(int count, int32_t *buf[])
@@ -248,34 +308,47 @@ channels_process_sound_chan_mono:
  * Apply stereo width (narrowing/expanding) effect.
  */
     .section    .text
+	.align      2
     .global     channels_process_sound_chan_custom
 channels_process_sound_chan_custom:
     movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
-    lea.l       -16(%sp), %sp           | save registers
-    move.l      %macsr, %d1             |
-    movem.l     %d1-%d4, (%sp)          |
-    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    lea.l       -28(%sp), %sp           | save registers
+    movem.l     %d2-%d6/%a2-%a3, (%sp)  | %macsr is no longer saved or set here
     movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
     move.l      dsp_sw_gain, %d3        | load straight (mid) gain
     move.l      dsp_sw_cross, %d4       | load cross (side) gain
-1:
-    move.l      (%a0), %d1              |
-    mac.l       %d1, %d3, (%a1), %d2, %acc0 |  L = l*gain + r*cross
-    mac.l       %d1, %d4            , %acc1 |  R = r*gain + l*cross
-    mac.l       %d2, %d4            , %acc0 |
-    mac.l       %d2, %d3            , %acc1 |
-    movclr.l    %acc0, %d1              |
-    movclr.l    %acc1, %d2              |
-    move.l      %d1, (%a0)+             |
-    move.l      %d2, (%a1)+             |
+    move.l      (%a0)+, %d1             | prime the input registers: %d1 = first l
+    move.l      (%a1)+, %d2             | ...and %d2 = first r
+    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d4, (%a0)+, %d1, %acc1 |  R = r*gain + l*cross
+    mac.l       %d2, %d4             , %acc0 |
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc1 |
     subq.l      #1, %d0                 |
-    bgt.s       1b                      |
-    movem.l     (%sp), %d1-%d4          | restore registers
-    move.l      %d1, %macsr             |
-    lea.l       16(%sp), %sp            | cleanup
-    rts
-.cpcustom_end:
-    .size       channels_process_sound_chan_custom, .cpcustom_end-channels_process_sound_chan_custom
+    ble.b       20f | loop done         | count <= 1? only the primed sample is left
+10: | loop                              |
+    movclr.l    %acc0, %d5              | %d5 = completed L
+    movclr.l    %acc1, %d6              | %d6 = completed R
+15: | loop start                        | (no branch in this routine targets 15)
+    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d4, (%a0)+, %d1, %acc1 |  R = r*gain + l*cross
+    mac.l       %d2, %d4             , %acc0 |
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc1 |
+    move.l      %d5, (%a2)+             | store completed L
+    move.l      %d6, (%a3)+             | store completed R
+    subq.l      #1, %d0                 | --count > 0 ?
+    bgt.s       10b | loop              | yes? do more
+20: | loop done                         |
+    movclr.l    %acc0, %d5              | output last sample
+    movclr.l    %acc1, %d6              |
+    move.l      %d5, (%a2)              |
+    move.l      %d6, (%a3)              |
+    movem.l     (%sp), %d2-%d6/%a2-%a3  | restore registers
+    lea.l       28(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_custom, \
+                .-channels_process_sound_chan_custom
 
 /****************************************************************************
  *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
@@ -283,31 +356,42 @@ channels_process_sound_chan_custom:
  *  Separate channels into side channels.
  */
     .section    .text
+	.align      2
     .global     channels_process_sound_chan_karaoke
 channels_process_sound_chan_karaoke:
     movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
-    lea.l       -16(%sp), %sp           | save registers
-    move.l      %macsr, %d1             |
-    movem.l     %d1-%d4, (%sp)          |
-    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
-    movem.l     (%a0), %a0-%a1          | get channel pointers
-    move.l      #0x40000000, %d4        | %d3 = 0.5
-1:
-    move.l     (%a0), %d1               |
-    msac.l     %d1, %d4, (%a1), %d2, %acc0 | R = r/2 - l/2
-    mac.l      %d2, %d4            , %acc0 |
-    movclr.l   %acc0, %d1               |
-    move.l     %d1, (%a1)+              |
-    neg.l      %d1                      | L = -R = -(r/2 - l/2) = l/2 - r/2
-    move.l     %d1, (%a0)+              |
-    subq.l     #1, %d0                  |
-    bgt.s      1b                       |
-    movem.l    (%sp), %d1-%d4           | restore registers
-    move.l     %d1, %macsr              |
-    lea.l      16(%sp), %sp             | cleanup
-    rts
-.cpkaraoke_end:
-    .size       channels_process_sound_chan_karaoke, .cpkaraoke_end-channels_process_sound_chan_karaoke
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  | %macsr is no longer saved or set here
+    movem.l     (%a0), %a0-%a1          | get channel src pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
+    move.l      #0x40000000, %d3        | %d3 = 0.5
+    move.l      (%a0)+, %d1             | prime the input registers: %d1 = first l
+    move.l      (%a1)+, %d2             | ...and %d2 = first r
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2
+    msac.l      %d2, %d3, (%a1)+, %d2, %acc0 |
+    subq.l      #1, %d0                 | --count > 0 ?
+    ble.b       20f | loop done         | no? only the primed sample is left
+10: | loop                              |
+    movclr.l    %acc0, %d4              | %d4 = completed L
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2
+    msac.l      %d2, %d3, (%a1)+, %d2, %acc0 |
+    move.l      %d4, (%a2)+             | store L
+    neg.l       %d4                     | R = -L = -(l/2 - r/2) = r/2 - l/2
+    move.l      %d4, (%a3)+             | store R
+    subq.l      #1, %d0                 | --count > 0 ?
+    bgt.s       10b | loop              | yes? do more
+20: | loop done                         |
+    movclr.l    %acc0, %d4              | output last sample
+    move.l      %d4, (%a2)              | store L
+    neg.l       %d4                     | R = -L = -(l/2 - r/2) = r/2 - l/2
+    move.l      %d4, (%a3)              | store R
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_karaoke, \
+                .-channels_process_sound_chan_karaoke
+
 /****************************************************************************
  * void sample_output_stereo(int count, struct dsp_data *data,
  *                               int32_t *src[], int16_t *dst)
@@ -329,6 +413,7 @@ channels_process_sound_chan_karaoke:
  *
  */
     .section   .text
+	.align      2
     .global    sample_output_stereo
 sample_output_stereo:
     lea.l       -44(%sp), %sp             | save registers
@@ -348,11 +433,11 @@ sample_output_stereo:
     add.l       %a4, %d0                  |
     and.l       #0xfffffff0, %d0          |
     cmp.l       %a0, %d0                  | at least a full line?
-    bhi.w       .sos_longloop_1_start     | no? jump to trailing longword
+    bhi.w       40f | long loop 1 start   | no? do as trailing longwords
     sub.l       #16, %d0                  | %d1 = first line bound
     cmp.l       %a4, %d0                  | any leading longwords?
-    bls.b       .sos_lineloop_start       | no? jump to line loop
-.sos_longloop_0:
+    bls.b       20f | line loop start     | no? start line loop
+10: | long loop 0                         |
     move.l      (%a2)+, %d1               | read longword from L and R
     mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word
     mac.l       %d2, %a1, %acc1           | shift R to high word
@@ -362,10 +447,10 @@ sample_output_stereo:
     move.w      %d2, %d1                  | interleave MS 16 bits of each 
     move.l      %d1, (%a4)+               | ...and write both
     cmp.l       %a4, %d0                  |
-    bhi.b       .sos_longloop_0           |
-.sos_lineloop_start:
+    bhi.b       10b | long loop 0         |
+20: | line loop start                     |
     lea.l       -12(%a0), %a5             | %a5 = at or just before last line bound
-.sos_lineloop:
+30: | line loop                           |
     move.l      (%a3)+, %d4               | get next 4 R samples and scale
     mac.l       %d4, %a1, (%a3)+, %d5, %acc0 | with saturation
     mac.l       %d5, %a1, (%a3)+, %d6, %acc1 |
@@ -394,11 +479,11 @@ sample_output_stereo:
     move.w      %d7, %d3                  |
     movem.l     %d0-%d3, -16(%a4)         | write four stereo samples
     cmp.l       %a4, %a5                  |
-    bhi.b       .sos_lineloop             |
-.sos_longloop_1_start:
+    bhi.b       30b | line loop           |
+40: | long loop 1 start                   |
     cmp.l       %a4, %a0                  | any longwords left?
-    bls.b       .sos_done                 | no? finished.
-.sos_longloop_1:
+    bls.b       60f | output end          | no? stop
+50: | long loop 1                         |
     move.l      (%a2)+, %d1               | handle trailing longwords
     mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones
     mac.l       %d2, %a1, %acc1           |
@@ -408,14 +493,13 @@ sample_output_stereo:
     move.w      %d2, %d1                  |
     move.l      %d1, (%a4)+               |
     cmp.l       %a4, %a0                  |
-    bhi.b       .sos_longloop_1           |
-.sos_done:
+    bhi.b       50b | long loop 1         |
+60: | output end                          |
     movem.l     (%sp), %d1-%d7/%a2-%a5    | restore registers
     move.l      %d1, %macsr               |
     lea.l       44(%sp), %sp              | cleanup
     rts                                   |
-.sos_end:
-    .size      sample_output_stereo, .sos_end-sample_output_stereo
+    .size      sample_output_stereo, .-sample_output_stereo
 
 /****************************************************************************
  * void sample_output_mono(int count, struct dsp_data *data,
@@ -424,6 +508,7 @@ sample_output_stereo:
  * Same treatment as sample_output_stereo but for one channel.
  */
     .section   .text
+	.align      2
     .global    sample_output_mono
 sample_output_mono:
     lea.l       -28(%sp), %sp             | save registers
@@ -442,11 +527,11 @@ sample_output_mono:
     add.l       %a3, %d0                  |
     and.l       #0xfffffff0, %d0          |
     cmp.l       %a0, %d0                  | at least a full line?
-    bhi.w       .som_longloop_1_start     | no? jump to trailing longword
+    bhi.w       40f | long loop 1 start   | no? do as trailing longwords
     sub.l       #16, %d0                  | %d1 = first line bound
     cmp.l       %a3, %d0                  | any leading longwords?
-    bls.b       .som_lineloop_start       | no? jump to line loop
-.som_longloop_0:
+    bls.b       20f | line loop start     | no? start line loop
+10: | long loop 0                         |
     move.l      (%a2)+, %d1               | read longword from L and R
     mac.l       %d1, %d5, %acc0           | shift L to high word
     movclr.l    %acc0, %d1                | get possibly saturated results
@@ -455,10 +540,10 @@ sample_output_mono:
     move.w      %d2, %d1                  | duplicate single channel into
     move.l      %d1, (%a3)+               | L and R
     cmp.l       %a3, %d0                  |
-    bhi.b       .som_longloop_0           |
-.som_lineloop_start:
+    bhi.b       10b | long loop 0         |
+20: | line loop start                     |
     lea.l       -12(%a0), %a1             | %a1 = at or just before last line bound
-.som_lineloop:
+30: | line loop                           |
     move.l      (%a2)+, %d0               | get next 4 L samples and scale
     mac.l       %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
     mac.l       %d1, %d5, (%a2)+, %d2, %acc1 |
@@ -483,11 +568,11 @@ sample_output_mono:
     move.w      %d4, %d3                  |
     movem.l     %d0-%d3, -16(%a3)         | write four stereo samples
     cmp.l       %a3, %a1                  |
-    bhi.b       .som_lineloop             |
-.som_longloop_1_start:
+    bhi.b       30b | line loop           |
+40: | long loop 1 start                   |
     cmp.l       %a3, %a0                  | any longwords left?
-    bls.b       .som_done                 | no? finished.
-.som_longloop_1:
+    bls.b       60f | output end          | no? stop
+50: | long loop 1                         |
     move.l      (%a2)+, %d1               | handle trailing longwords
     mac.l       %d1, %d5, %acc0           | the same way as leading ones
     movclr.l    %acc0, %d1                |
@@ -496,11 +581,10 @@ sample_output_mono:
     move.w      %d2, %d1                  |
     move.l      %d1, (%a3)+               |
     cmp.l       %a3, %a0                  |
-    bhi.b       .som_longloop_1           |
-.som_done:
+    bhi.b       50b | long loop 1         |
+60: | output end                          |
     movem.l     (%sp), %d1-%d5/%a2-%a3    | restore registers
     move.l      %d1, %macsr               |
     lea.l       28(%sp), %sp              | cleanup
     rts                                   |
-.som_end:
-    .size      sample_output_mono, .som_end-sample_output_mono
+    .size      sample_output_mono, .-sample_output_mono