/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2006 Thom Johansen
 *
 * All files in this archive are subject to the GNU General Public License.
 * See the file COPYING in the source tree root for full license agreement.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/

/****************************************************************************
 * void apply_crossfeed(int32_t *src[], int count)
 *
 * Crossfeed filter working in place on src[0] (left) and src[1] (right).
 * Every output sample is the direct-gain scaled input of its own channel
 * plus a first order filtered (b0, b1, a1) copy of the opposite channel
 * taken from a 13 entry stereo delay line.
 *
 * State is kept in crossfeed_data, addressed here as 32-bit words:
 *   word  0      direct gain
 *   words 1..3   filter coefficients b0, b1, a1
 *   words 4..7   filter history (written back on exit)
 *   words 8..33  delay line, 13 entries of { left, right }
 *   word  34     delay line index (written back on exit)
 *
 * Notes:
 *  - The sample loop is tested at the bottom, so count must be non-zero.
 *  - MACSR is not set up here and %acc0 is accumulated into without being
 *    cleared first; both are presumably prepared by the caller.
 */
    .section    .text
    .global     apply_crossfeed
apply_crossfeed:
    lea.l       -44(%sp), %sp               | frame for 11 saved registers
    movem.l     %d2-%d7/%a2-%a6, (%sp)      |
    move.l      48(%sp), %a4                | %a4 = src (first argument)
    movem.l     (%a4), %a4-%a5              | %a4 = src[0], %a5 = src[1]
    move.l      52(%sp), %d7                | %d7 = count (second argument)

    lea.l       crossfeed_data, %a1         | %a1 = state block
    lea.l       32(%a1), %a0                | %a0 = &delay[0][0] (word 8)
    move.l      (%a1)+, %a6                 | %a6 = direct gain, %a1 -> word 1
    movem.l     12(%a1), %d0-%d3            | %d0..%d3 = history (words 4..7)
    move.l      132(%a1), %d4               | %d4 = delay index (word 34)
    movem.l     (%a1), %a1-%a3              | %a1..%a3 = b0, b1, a1
    move.l      %d4, %d5                    |
    lsl.l       #3, %d5                     | one delay entry = 8 bytes
    add.l       %d5, %a0                    | %a0 = &delay[index][0]

    /* Registers while looping:
     *   %a0      current delay line entry
     *   %a1-%a3  b0, b1, a1
     *   %a4/%a5  left/right sample pointers
     *   %a6      direct gain
     *   %d0-%d3  filter history
     *   %d4      delay line index
     *   %d5/%d6  scratch
     *   %d7      samples remaining
     */
.cf_sample_loop:
    | left output = filtered delayed right + gain * left
    mac.l       %a2, %d0, (4, %a0), %d0, %acc0 | acc = b1*dr[n-1]; %d0 = dr[n]
    mac.l       %a1, %d0, %acc0             | acc += b0*dr[n]
    mac.l       %a3, %d1, (%a4), %d5, %acc0 | acc += a1*y_l[n-1]; %d5 = x_l[n]
    move.l      %acc0, %d1                  | keep filter output as y_l[n]
    mac.l       %a6, %d5, %acc0             | acc += gain*x_l[n]
    movclr.l    %acc0, %d6                  | take result, zero accumulator
    move.l      %d6, (%a4)+                 | store left output

    | right output = filtered delayed left + gain * right
    mac.l       %a2, %d2, (%a0), %d2, %acc0 | acc = b1*dl[n-1]; %d2 = dl[n]
    move.l      %d5, (%a0)+                 | left input replaces delay slot
    mac.l       %a1, %d2, %acc0             | acc += b0*dl[n]
    mac.l       %a3, %d3, (%a5), %d5, %acc0 | acc += a1*y_r[n-1]; %d5 = x_r[n]
    move.l      %acc0, %d3                  | keep filter output as y_r[n]
    mac.l       %a6, %d5, %acc0             | acc += gain*x_r[n]
    move.l      %d5, (%a0)+                 | right input replaces delay slot
    movclr.l    %acc0, %d6                  | take result, zero accumulator
    move.l      %d6, (%a5)+                 | store right output

    addq.l      #1, %d4                     | advance delay index
    moveq.l     #13, %d6                    | delay line length
    cmp.l       %d6, %d4                    | still inside the line?
    jlt         .cf_index_ok                | yes? nothing to wrap
    moveq.l     #13*8, %d4                  | no? step pointer back by the
    sub.l       %d4, %a0                    | whole line (13 entries * 8 bytes)
    clr.l       %d4                         | ...and restart index at 0
.cf_index_ok:
    subq.l      #1, %d7                     | one sample pair done
    jne         .cf_sample_loop             |

    | write history and delay index back to crossfeed_data
    lea.l       crossfeed_data+16, %a1      | %a1 = &history (word 4)
    movem.l     %d0-%d3, (%a1)              |
    move.l      %d4, 120(%a1)               | word 4 + 30 = word 34 = index
    movem.l     (%sp), %d2-%d7/%a2-%a6      | restore registers
    lea.l       44(%sp), %sp                |
    rts
.cf_end:
    .size       apply_crossfeed,.cf_end-apply_crossfeed

/****************************************************************************
 * int dsp_downsample(int count, struct dsp_data *data,
 *                    int32_t *src[], int32_t *dst[])
 *
 * Linear interpolating resampler for delta >= 1.0 (phase and delta are
 * 16.16 fixed point: sample position in the high word, fraction in the low
 * word). Channels are processed from the highest index down to 0.
 *
 * Fields of *data used here (byte offsets):
 *    4  num_channels
 *    8  resample_data.delta
 *   12  resample_data.phase
 *   16  resample_data.last_sample[] (one longword per channel)
 *
 * Returns the number of samples written to dst[0].
 *
 * MACSR is not changed here; the emac mode in effect is the caller's.
 */
    .section    .text
    .global     dsp_downsample
dsp_downsample:
    lea.l       (-40, %sp), %sp         | frame for 10 saved registers
    movem.l     %d2-%d7/%a2-%a5, (%sp)  |
    movem.l     (44, %sp), %d2/%a0-%a2  | %d2 = count, %a0 = data,
                                        | %a1 = src,   %a2 = dst
    movem.l     (4, %a0), %d3-%d4       | %d3 = ch = data->num_channels
                                        | %d4 = delta
    moveq.l     #16, %d7                | %d7 = 16 = fixed point shift
.ds_next_channel:
    move.l      (12, %a0), %d5          | %d5 = phase (same start per channel)
    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
    lea.l       12(%a0, %d3.l*4), %a5   | %a5 = &last_sample[ch-1]
    move.l      (%a5), %d0              | %d0 = last sample of previous frame
    move.l      -4(%a3, %d2.l*4), (%a5) | last_sample[ch-1] = s[count-1]
    move.l      %d5, %d6                |
    lsr.l       %d7, %d6                | %d6 = pos = phase >> 16
    cmp.l       %d2, %d6                | pos already beyond this frame?
    bge.b       .ds_channel_done        | yes? nothing to output
    tst.l       %d6                     | pos == 0 needs s[-1] = last (%d0)
    bne.b       .ds_interp              | pos != 0? both samples are in s[]
    move.l      (%a3, %d6.l*4), %d1     | %d1 = s[pos]
    bra.b       .ds_have_pair           |
.ds_interp:
    lea.l       -4(%a3, %d6.l*4), %a5   | %a5 = &s[pos-1] (%a5 now scratch)
    movem.l     (%a5), %d0-%d1          | %d0 = s[pos-1], %d1 = s[pos]
.ds_have_pair:
    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
    move.l      %d0, %acc0              | accumulator starts at s[pos-1]
    move.l      %d5, %d0                |
    lsl.l       %d7, %d0                | move fraction to the top...
    lsr.l       #1, %d0                 | ...and clear the sign bit
    mac.l       %d0, %d1, %acc0         | %acc0 += frac * diff
    move.l      %acc0, %d0              | %d0 = interpolated sample
    add.l       %d4, %d5                | phase += delta
    move.l      %d5, %d6                |
    lsr.l       %d7, %d6                | pos = phase >> 16
    move.l      %d0, (%a4)+             | *d++ = interpolated sample
    cmp.l       %d2, %d6                | pos still inside the frame?
    blt.b       .ds_interp              | yes? next output sample
.ds_channel_done:
    subq.l      #1, %d3                 | any channels left?
    bgt.b       .ds_next_channel        | yes? do the next one
    asl.l       %d7, %d2                | %d2 = count << 16
    sub.l       %d2, %d5                | rebase phase onto the next frame
    move.l      %d5, 12(%a0)            | ...and store it back
    move.l      %a4, %d0                | %d0 = d - dst[0], in bytes
    sub.l       (%a2), %d0              |
    asr.l       #2, %d0                 | bytes -> samples (return value)
    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore registers
    move.l      %acc1, %acc0            | overwrite %acc0 with %acc1 (clears
                                        | it provided %acc1 holds zero)
    lea.l       40(%sp), %sp            | drop frame
    rts
.ds_end:
    .size       dsp_downsample,.ds_end-dsp_downsample

/****************************************************************************
 * int dsp_upsample(int count, struct dsp_data *dsp,
 *                  int32_t *src[], int32_t *dst[])
 *
 * Linear interpolating resampler for delta < 1.0. Phase and delta are 16.16
 * fixed point; both are word-swapped so that the fraction sits in the high
 * word and adding delta sets the carry flag exactly when the source
 * position advances. Channels are processed from the highest index down
 * to 0.
 *
 * Fields of *dsp used here (byte offsets):
 *    4  num_channels
 *    8  resample_data.delta
 *   12  resample_data.phase
 *   16  resample_data.last_sample[] (one longword per channel)
 *
 * Returns the number of samples written to dst[0].
 *
 * MACSR is not changed here; the emac mode in effect is the caller's.
 */
    .section    .text
    .global     dsp_upsample
dsp_upsample:
    lea.l       -40(%sp), %sp           | save non-clobberables
    movem.l     %d2-%d7/%a2-%a5, (%sp)  |
    movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
                                        | %a0 = data
                                        | %a1 = src
                                        | %a2 = dst
    movem.l      4(%a0), %d3-%d4        | %d3 = ch = channels
                                        | %d4 = delta = data->resample_data.delta
    swap        %d4                     | swap delta to high word to use
                                        | carries to increment position
.uschannel_loop:
    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
    lea.l       12(%a0, %d3.l*4), %a4   | %a4 = &data->resample_data.last_sample[ch-1]
    lea.l       (%a3, %d2.l*4), %a5     | %a5 = &s[count]
    move.l      (%a4), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
    move.l      -(%a5), (%a4)           | data->resample_data.last_sample[ch-1] = s[count-1]
                                        | %a5 = src_end = &s[count-1] from here on
    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
    swap        %d5                     | swap phase to high word to use
                                        | carries to increment position
    move.l      %d5, %d6                | %d6 = pos = phase >> 16
    clr.w       %d5                     | %d5 = fraction in high word only
    eor.l       %d5, %d6                | strip fraction from %d6; pos == 0?
    beq.b       .usstart_0              | yes? start from last (already in %d0)
    cmp.l       %d2, %d6                | past end of samples? (pos >= count)
    bge.b       .usloop_skip            | yes? skip loop
    lea.l       -4(%a3, %d6.l*4), %a3   | %a3 = s = &s[pos-1] (previous)
    move.l      (%a3)+, %d0             | %d0 = *s++
    .word       0x51fa                  | tpf.w - swallows the following one-word
                                        | instruction so %d0 is not overwritten
.usloop_1:
    move.l      %d6, %d0                | move previous sample to %d0
.usstart_0:
    move.l      (%a3)+, %d1             | fetch next sample
    move.l      %d1, %d6                | save sample value
    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
.usloop_0:
    lsr.l       #1, %d5                 | make phase into frac
    mac.l       %d1, %d5, %acc0         | %acc0 = diff * frac
    movclr.l    %acc0, %d7              | %d7 = product
    lsl.l       #1, %d5                 | restore frac to phase
    add.l       %d0, %d7                | %d7 = last + product
    move.l      %d7, (%a4)+             | *d++ = %d7
    add.l       %d4, %d5                | phase += delta
    bcc.b       .usloop_0               | no carry? same source pair again
    cmp.l       %a5, %a3                | src <= src_end?
    ble.b       .usloop_1               | yes? continue resampling
.usloop_skip:
    subq.l      #1, %d3                 | ch > 0?
    bgt.b       .uschannel_loop         | yes? process next channel
    swap        %d5                     | wrap phase to start of next frame
    move.l      %d5, 12(%a0)            | ...and save in data->resample_data.phase
    move.l      %a4, %d0                | return d - d[0]
    sub.l       (%a2), %d0              |
    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
    asr.l       #2, %d0                 | convert bytes->samples
    lea.l       40(%sp), %sp            | cleanup stack
    rts                                 | buh-bye
.usend:
    .size       dsp_upsample,.usend-dsp_upsample

/* These routines might benefit from burst transfers but we'll keep them
 * small for now since they're rather light weight
 */

/****************************************************************************
 * void channels_process_sound_chan_mono(int count, int32_t *buf[])
 *
 * Replace both buf[0] and buf[1], sample by sample, with the average of
 * the two: out = 0.5*l + 0.5*r.
 *
 * MACSR is saved, switched to 0xb0 for the duration of the loop and put
 * back before returning. The loop is tested at the bottom, so one sample
 * is processed even when count is not positive.
 */
    .section    .text
    .global     channels_process_sound_chan_mono
channels_process_sound_chan_mono:
    movem.l     (4, %sp), %d0/%a0           | %d0 = count, %a0 = buf
    lea.l       (-12, %sp), %sp             | frame: macsr, %d2, %d3
    move.l      %macsr, %d1                 | %d1 = caller's macsr
    movem.l     %d1-%d3, (%sp)              |
    move.l      #0xb0, %macsr               | emac: rounding fractional mode
    movem.l     (%a0), %a0-%a1              | %a0 = buf[0], %a1 = buf[1]
    move.l      #0x40000000, %d3            | %d3 = 0.5
.cpmono_loop:
    move.l      (%a0), %d1                  | %d1 = l
    mac.l       %d1, %d3, (%a1), %d2, %acc0 | acc  = l/2, %d2 = r
    mac.l       %d2, %d3, %acc0             | acc += r/2
    movclr.l    %acc0, %d1                  | %d1 = mixed sample, acc cleared
    move.l      %d1, (%a0)+                 | same value to left...
    move.l      %d1, (%a1)+                 | ...and right
    subq.l      #1, %d0                     | count down
    bgt.s       .cpmono_loop                |
    movem.l     (%sp), %d1-%d3              | %d1 = saved macsr, restore %d2/%d3
    move.l      %d1, %macsr                 |
    lea.l       (12, %sp), %sp              | drop frame
    rts
.cpmono_end:
    .size       channels_process_sound_chan_mono, .cpmono_end-channels_process_sound_chan_mono


/****************************************************************************
 * void channels_process_sound_chan_custom(int count, int32_t *buf[])
 *
 * Stereo width effect, in place on buf[0] (l) and buf[1] (r):
 *     L = l*dsp_sw_gain + r*dsp_sw_cross
 *     R = r*dsp_sw_gain + l*dsp_sw_cross
 *
 * MACSR is saved, switched to 0xb0 for the duration of the loop and put
 * back before returning. The loop is tested at the bottom, so one sample
 * pair is processed even when count is not positive.
 */
    .section    .text
    .global     channels_process_sound_chan_custom
channels_process_sound_chan_custom:
    movem.l     (4, %sp), %d0/%a0           | %d0 = count, %a0 = buf
    lea.l       (-16, %sp), %sp             | frame: macsr, %d2, %d3, %d4
    move.l      %macsr, %d1                 | %d1 = caller's macsr
    movem.l     %d1-%d4, (%sp)              |
    move.l      #0xb0, %macsr               | emac: rounding fractional mode
    movem.l     (%a0), %a0-%a1              | %a0 = buf[0], %a1 = buf[1]
    move.l      dsp_sw_gain, %d3            | %d3 = straight (mid) gain
    move.l      dsp_sw_cross, %d4           | %d4 = cross (side) gain
.cpcustom_loop:
    move.l      (%a0), %d1                  | %d1 = l
    mac.l       %d1, %d3 , (%a1), %d2, %acc0 | acc0  = l*gain, %d2 = r
    mac.l       %d1, %d4 , %acc1            | acc1  = l*cross
    mac.l       %d2, %d4 , %acc0            | acc0 += r*cross
    mac.l       %d2, %d3 , %acc1            | acc1 += r*gain
    movclr.l    %acc0, %d1                  | %d1 = L, acc0 cleared
    movclr.l    %acc1, %d2                  | %d2 = R, acc1 cleared
    move.l      %d1, (%a0)+                 | write L
    move.l      %d2, (%a1)+                 | write R
    subq.l      #1, %d0                     | count down
    bgt.s       .cpcustom_loop              |
    movem.l     (%sp), %d1-%d4              | %d1 = saved macsr, restore %d2-%d4
    move.l      %d1, %macsr                 |
    lea.l       (16, %sp), %sp              | drop frame
    rts
.cpcustom_end:
    .size       channels_process_sound_chan_custom, .cpcustom_end-channels_process_sound_chan_custom

/****************************************************************************
 *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
 *
 *  Turn both channels into side signals, in place on buf[0] (l) and
 *  buf[1] (r):
 *      L = l/2 - r/2
 *      R = r/2 - l/2
 *
 *  MACSR is saved, switched to 0xb0 for the duration of the loop and put
 *  back before returning. The loop is tested at the bottom, so one sample
 *  pair is processed even when count is not positive.
 */
    .section    .text
    .global     channels_process_sound_chan_karaoke
channels_process_sound_chan_karaoke:
    movem.l     (4, %sp), %d0/%a0           | %d0 = count, %a0 = buf
    lea.l       (-16, %sp), %sp             | frame: macsr, %d2, %d3, %d4
    move.l      %macsr, %d1                 | %d1 = caller's macsr
    movem.l     %d1-%d4, (%sp)              |
    move.l      #0xb0, %macsr               | emac: rounding fractional mode
    movem.l     (%a0), %a0-%a1              | %a0 = buf[0], %a1 = buf[1]
    move.l      #0x40000000, %d4            | %d4 = 0.5
.cpkaraoke_loop:
    move.l      (%a0), %d1                  | %d1 = l
    mac.l       %d1, %d4, (%a1), %d2, %acc0 | acc0 = l/2, %d2 = r
    mac.l       %d2, %d4, %acc1             | acc1 = r/2
    movclr.l    %acc0, %d1                  | %d1 = l/2, acc0 cleared
    movclr.l    %acc1, %d2                  | %d2 = r/2, acc1 cleared
    move.l      %d1, %d3                    | keep l/2 for the second subtract
    sub.l       %d2, %d1                    | %d1 = L = l/2 - r/2
    sub.l       %d3, %d2                    | %d2 = R = r/2 - l/2
    move.l      %d1, (%a0)+                 | write L
    move.l      %d2, (%a1)+                 | write R
    subq.l      #1, %d0                     | count down
    bgt.s       .cpkaraoke_loop             |
    movem.l     (%sp), %d1-%d4              | %d1 = saved macsr, restore %d2-%d4
    move.l      %d1, %macsr                 |
    lea.l       (16, %sp), %sp              | drop frame
    rts
.cpkaraoke_end:
    .size       channels_process_sound_chan_karaoke, .cpkaraoke_end-channels_process_sound_chan_karaoke

/****************************************************************************
 * void sample_output_stereo(int count, struct dsp_data *data,
 *                               int32_t *src[], int16_t *dst)
 *
 * Scale src[0] (L) and src[1] (R) by 1 << (16 - scale), where scale is the
 * longword at offset 0 of *data, and write the most significant 16 bits of
 * each result to dst as interleaved L/R pairs (one longword per stereo
 * sample, count longwords in total).
 *
 * Framework based on the ubiquitous Rockbox line transfer logic for
 * Coldfire CPUs:
 *   1. leading longwords until dst reaches a 16-byte line boundary
 *   2. whole lines, four stereo samples per iteration
 *   3. trailing longwords
 * Steps 1 and 2 are only entered when the buffer holds at least one
 * complete line; shorter buffers are handled entirely by step 3.
 *
 * Does emac clamping and scaling (which proved faster than the usual
 * checks and branches - even single test clamping) and writes using
 * line burst transfers. Also better than writing a single L-R pair per
 * loop but a good deal more code.
 *
 * Attempting bursting during reads is rather futile since the source and
 * destination alignments rarely agree and too much complication will
 * slow us up. The parallel loads seem to do a bit better at least until
 * a pcm buffer can always give line aligned chunk and then aligning the
 * dest can then imply the source is aligned if the source buffers are.
 * For now longword alignment is assumed of both the source and dest.
 *
 * MACSR is saved on entry and restored on exit.
 */
    .section   .text
    .global    sample_output_stereo
sample_output_stereo:
    lea.l       -44(%sp), %sp             | save registers
    move.l      %macsr, %d1               | save everything up front; buffers
    movem.l     %d1-%d7/%a2-%a5, (%sp)    | of many lines are the common case
    move.l      #0x80, %macsr             | put emac unit in signed int mode
    movem.l     48(%sp), %a0-%a2/%a4      | %a0 = count, %a1 = data,
                                          | %a2 = src,   %a4 = dst
    lea.l       (%a4, %a0.l*4), %a0       | %a0 = end address
    move.l      (%a1), %d1                | %a1 = multiplier: (1 << (16 - scale))
    sub.l       #16, %d1                  |
    neg.l       %d1                       |
    moveq.l     #1, %d0                   |
    asl.l       %d1, %d0                  |
    move.l      %d0, %a1                  |
    movem.l     (%a2), %a2-%a3            | get L/R channel pointers
    moveq.l     #28, %d0                  | %d0 = second line bound
    add.l       %a4, %d0                  |   = end of first complete line
    and.l       #0xfffffff0, %d0          |
    cmp.l       %a0, %d0                  | at least a full line before end?
    bhi.w       .sos_longloop_1_start     | no? do it all as trailing longwords
    sub.l       #16, %d0                  | %d0 = first line bound
    cmp.l       %a4, %d0                  | any leading longwords?
    bls.b       .sos_lineloop_start       | no? jump to line loop
.sos_longloop_0:
    move.l      (%a2)+, %d1               | read longword from L and R
    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word
    mac.l       %d2, %a1, %acc1           | shift R to high word
    movclr.l    %acc0, %d1                | get possibly saturated results
    movclr.l    %acc1, %d2                |
    swap        %d2                       | move R to low word
    move.w      %d2, %d1                  | interleave MS 16 bits of each
    move.l      %d1, (%a4)+               | ...and write both
    cmp.l       %a4, %d0                  | reached the line boundary?
    bhi.b       .sos_longloop_0           | no? another leading longword
.sos_lineloop_start:
    lea.l       -12(%a0), %a5             | %a5 = at or just before last line bound
.sos_lineloop:
    move.l      (%a2)+, %d0               | get next 4 L samples and scale
    mac.l       %d0, %a1, (%a2)+, %d1, %acc0 | with saturation
    mac.l       %d1, %a1, (%a2)+, %d2, %acc1 |
    mac.l       %d2, %a1, (%a2)+, %d3, %acc2 |
    mac.l       %d3, %a1, %acc3           |
    movclr.l    %acc0, %d0                | obtain results
    movclr.l    %acc1, %d1                |
    movclr.l    %acc2, %d2                |
    movclr.l    %acc3, %d3                |
    move.l      (%a3)+, %d4               | get next 4 R samples and scale
    mac.l       %d4, %a1, (%a3)+, %d5,  %acc0 | with saturation
    mac.l       %d5, %a1, (%a3)+, %d6,  %acc1 |
    mac.l       %d6, %a1, (%a3)+, %d7,  %acc2 |
    mac.l       %d7, %a1, %acc3           |
    movclr.l    %acc0, %d4                | obtain results
    movclr.l    %acc1, %d5                |
    movclr.l    %acc2, %d6                |
    movclr.l    %acc3, %d7                |
    swap        %d4                       | interleave most significant
    move.w      %d4, %d0                  | 16 bits of L and R
    swap        %d5                       |
    move.w      %d5, %d1                  |
    swap        %d6                       |
    move.w      %d6, %d2                  |
    swap        %d7                       |
    move.w      %d7, %d3                  |
    movem.l     %d0-%d3, (%a4)            | write four stereo samples
    lea.l       16(%a4), %a4              |
    cmp.l       %a4, %a5                  | room for another whole line?
    bhi.b       .sos_lineloop             | yes? keep going
.sos_longloop_1_start:
    cmp.l       %a4, %a0                  | any longwords left?
    bls.b       .sos_done                 | no? finished.
.sos_longloop_1:
    move.l      (%a2)+, %d1               | handle trailing longwords
    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones
    mac.l       %d2, %a1, %acc1           |
    movclr.l    %acc0, %d1                |
    movclr.l    %acc1, %d2                |
    swap        %d2                       |
    move.w      %d2, %d1                  |
    move.l      %d1, (%a4)+               |
    cmp.l       %a4, %a0                  |
    bhi.b       .sos_longloop_1           |
.sos_done:
    movem.l     (%sp), %d1-%d7/%a2-%a5    | restore registers
    move.l      %d1, %macsr               |
    lea.l       44(%sp), %sp              | cleanup
    rts                                   |
.sos_end:
    .size      sample_output_stereo, .sos_end-sample_output_stereo

/****************************************************************************
 * void sample_output_mono(int count, struct dsp_data *data,
 *                         int32_t *src[], int16_t *dst)
 *
 * Same treatment as sample_output_stereo but for one channel: src[0] is
 * scaled by 1 << (16 - scale), where scale is the longword at offset 0 of
 * *data, and the most significant 16 bits of each result are written twice
 * (as L and as R) into one longword of dst per sample.
 *
 * The buffer is handled as leading longwords up to a 16-byte line
 * boundary, then whole lines of four samples, then trailing longwords.
 * The first two stages are only entered when the buffer holds at least
 * one complete line; shorter buffers are handled entirely as trailing
 * longwords.
 *
 * MACSR is saved on entry and restored on exit.
 */
    .section   .text
    .global    sample_output_mono
sample_output_mono:
    lea.l       -28(%sp), %sp             | save registers
    move.l      %macsr, %d1               | save everything up front; buffers
    movem.l     %d1-%d5/%a2-%a3, (%sp)    | of many lines are the common case
    move.l      #0x80, %macsr             | put emac unit in signed int mode
    movem.l     32(%sp), %a0-%a3          | %a0 = count, %a1 = data,
                                          | %a2 = src,   %a3 = dst
    lea.l       (%a3, %a0.l*4), %a0       | %a0 = end address
    move.l      (%a1), %d1                | %d5 = multiplier: (1 << (16 - scale))
    sub.l       #16, %d1                  |
    neg.l       %d1                       |
    moveq.l     #1, %d5                   |
    asl.l       %d1, %d5                  |
    movem.l     (%a2), %a2                | get source channel pointer
    moveq.l     #28, %d0                  | %d0 = second line bound
    add.l       %a3, %d0                  |   = end of first complete line
    and.l       #0xfffffff0, %d0          |
    cmp.l       %a0, %d0                  | at least a full line before end?
    bhi.w       .som_longloop_1_start     | no? do it all as trailing longwords
    sub.l       #16, %d0                  | %d0 = first line bound
    cmp.l       %a3, %d0                  | any leading longwords?
    bls.b       .som_lineloop_start       | no? jump to line loop
.som_longloop_0:
    move.l      (%a2)+, %d1               | read one source sample
    mac.l       %d1, %d5, %acc0           | shift it to the high word
    movclr.l    %acc0, %d1                | get possibly saturated result
    move.l      %d1, %d2                  |
    swap        %d2                       | copy of high word in low word
    move.w      %d2, %d1                  | duplicate single channel into
    move.l      %d1, (%a3)+               | L and R
    cmp.l       %a3, %d0                  | reached the line boundary?
    bhi.b       .som_longloop_0           | no? another leading longword
.som_lineloop_start:
    lea.l       -12(%a0), %a1             | %a1 = at or just before last line bound
.som_lineloop:
    move.l      (%a2)+, %d0               | get next 4 samples and scale
    mac.l       %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
    mac.l       %d1, %d5, (%a2)+, %d2, %acc1 |
    mac.l       %d2, %d5, (%a2)+, %d3, %acc2 |
    mac.l       %d3, %d5, %acc3           |
    movclr.l    %acc0, %d0                | obtain results
    movclr.l    %acc1, %d1                |
    movclr.l    %acc2, %d2                |
    movclr.l    %acc3, %d3                |
    move.l      %d0, %d4                  | duplicate single channel
    swap        %d4                       | into L and R
    move.w      %d4, %d0                  |
    move.l      %d1, %d4                  |
    swap        %d4                       |
    move.w      %d4, %d1                  |
    move.l      %d2, %d4                  |
    swap        %d4                       |
    move.w      %d4, %d2                  |
    move.l      %d3, %d4                  |
    swap        %d4                       |
    move.w      %d4, %d3                  |
    movem.l     %d0-%d3, (%a3)            | write four stereo samples
    lea.l       16(%a3), %a3              |
    cmp.l       %a3, %a1                  | room for another whole line?
    bhi.b       .som_lineloop             | yes? keep going
.som_longloop_1_start:
    cmp.l       %a3, %a0                  | any longwords left?
    bls.b       .som_done                 | no? finished.
.som_longloop_1:
    move.l      (%a2)+, %d1               | handle trailing longwords
    mac.l       %d1, %d5, %acc0           | the same way as leading ones
    movclr.l    %acc0, %d1                |
    move.l      %d1, %d2                  |
    swap        %d2                       |
    move.w      %d2, %d1                  |
    move.l      %d1, (%a3)+               |
    cmp.l       %a3, %a0                  |
    bhi.b       .som_longloop_1           |
.som_done:
    movem.l     (%sp), %d1-%d5/%a2-%a3    | restore registers
    move.l      %d1, %macsr               |
    lea.l       28(%sp), %sp              | cleanup
    rts                                   |
.som_end:
    .size      sample_output_mono, .som_end-sample_output_mono