/***************************************************************************
*             __________               __   ___.
*   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
*   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
*   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
*   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
*                     \/            \/     \/    \/            \/
* $Id$
*
* Greyscale framework
* Core & miscellaneous functions
*
* This is a generic framework to display up to 33 shades of grey
* on low-depth bitmap LCDs (Archos b&w, Iriver 4-grey, iPod 4-grey)
* within plugins.
*
* Copyright (C) 2004-2006 Jens Arnold
*
* All files in this archive are subject to the GNU General Public License.
* See the file COPYING in the source tree root for full license agreement.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/

#include "plugin.h"

#ifdef HAVE_LCD_BITMAP
#include "gray.h"

#if defined(CPU_PP) && defined(HAVE_ADJUSTABLE_CPU_FREQ)
#define NEED_BOOST
#endif

/* Global variables (non-static, so they have external linkage) */
struct plugin_api *_gray_rb = NULL; /* plugin api struct pointer; set by
                                       gray_init() from its newrb argument */
struct _gray_info _gray_info;       /* overlay state: position, size, depth,
                                       flags, buffers and lookup tables;
                                       filled in by gray_init() */

#ifndef SIMULATOR
short _gray_random_buffer;          /* state of the pseudo-random generator;
                                       gray_update_rect() advances it
                                       (x * 75 + 74) and uses bits 8..15 to
                                       pick a rotation for each bit pattern */

#if CONFIG_LCD == LCD_SSD1815
/* LCD linearisation table for the SSD1815 controller: measured and
 * interpolated curve. gray_init() indexes it with the gamma-corrected
 * brightness (0..255) and scales the looked-up value (0..255) to a bit
 * pattern index. Only used when gamma > 0. */
static const unsigned char lcdlinear[256] = {
      0,   3,   5,   8,  11,  13,  16,  18,
     21,  23,  26,  28,  31,  33,  36,  38,
     40,  42,  45,  47,  49,  51,  53,  55,
     57,  59,  60,  62,  64,  66,  67,  69,
     70,  72,  73,  74,  76,  77,  78,  79,
     81,  82,  83,  84,  85,  86,  87,  88,
     88,  89,  90,  91,  92,  92,  93,  94,
     95,  95,  96,  97,  97,  98,  99,  99,
    100, 101, 102, 102, 103, 104, 104, 105,
    106, 106, 107, 107, 108, 109, 109, 110,
    111, 111, 112, 113, 113, 114, 114, 115,
    116, 116, 117, 117, 118, 119, 119, 120,
    120, 121, 121, 122, 122, 123, 123, 124,
    124, 125, 125, 126, 126, 127, 127, 128,
    128, 128, 129, 129, 130, 130, 131, 131,
    132, 132, 133, 133, 133, 134, 134, 135,
    135, 136, 136, 137, 137, 138, 138, 138,
    139, 139, 140, 140, 141, 141, 142, 142,
    143, 143, 144, 144, 145, 145, 146, 146,
    147, 147, 148, 148, 148, 149, 149, 150,
    150, 151, 151, 152, 152, 153, 153, 153,
    154, 154, 155, 155, 156, 156, 157, 157,
    158, 158, 158, 159, 159, 160, 160, 161,
    161, 162, 162, 163, 163, 164, 164, 165,
    165, 166, 167, 167, 168, 168, 169, 169,
    170, 171, 171, 172, 173, 173, 174, 175,
    176, 176, 177, 178, 179, 180, 181, 181,
    182, 183, 184, 185, 186, 188, 189, 190,
    191, 192, 194, 195, 196, 198, 199, 201,
    202, 204, 205, 207, 209, 211, 213, 215,
    217, 219, 222, 224, 226, 229, 231, 234,
    236, 239, 242, 244, 247, 250, 252, 255
};
#elif CONFIG_LCD == LCD_S1D15E06
/* LCD linearisation table for the S1D15E06 controller: measured and
 * interpolated curve. gray_init() indexes it with the gamma-corrected
 * brightness (0..255) and scales the looked-up value (0..255) to a bit
 * pattern index. Only used when gamma > 0. */
static const unsigned char lcdlinear[256] = {
      0,   5,  11,  16,  21,  27,  32,  37,
     42,  47,  51,  56,  60,  64,  68,  72,
     75,  78,  81,  84,  87,  89,  91,  93,
     95,  96,  98,  99, 101, 102, 103, 104,
    105, 106, 107, 108, 109, 110, 111, 111,
    112, 113, 113, 114, 115, 115, 116, 117,
    117, 118, 118, 119, 119, 120, 120, 121,
    121, 122, 122, 123, 123, 124, 124, 125,
    125, 126, 126, 127, 127, 127, 128, 128,
    129, 129, 130, 130, 131, 131, 132, 132,
    133, 133, 134, 134, 135, 135, 136, 136,
    137, 137, 138, 138, 138, 139, 139, 140,
    140, 141, 141, 141, 142, 142, 143, 143,
    143, 144, 144, 145, 145, 145, 146, 146,
    146, 147, 147, 147, 148, 148, 149, 149,
    149, 150, 150, 150, 151, 151, 151, 152,
    152, 153, 153, 153, 154, 154, 155, 155,
    155, 156, 156, 157, 157, 157, 158, 158,
    159, 159, 159, 160, 160, 161, 161, 162,
    162, 162, 163, 163, 164, 164, 164, 165,
    165, 166, 166, 167, 167, 167, 168, 168,
    169, 169, 170, 170, 170, 171, 171, 172,
    172, 173, 173, 174, 174, 175, 175, 176,
    176, 177, 177, 178, 178, 179, 179, 180,
    180, 181, 182, 182, 183, 184, 184, 185,
    186, 186, 187, 188, 188, 189, 190, 191,
    191, 192, 193, 194, 195, 196, 196, 197,
    198, 199, 200, 201, 202, 203, 204, 205,
    206, 207, 208, 209, 210, 211, 213, 214,
    215, 216, 218, 219, 220, 222, 223, 225,
    227, 228, 230, 232, 233, 235, 237, 239,
    241, 243, 245, 247, 249, 251, 253, 255
};
#elif (CONFIG_LCD == LCD_IPOD2BPP) || (CONFIG_LCD == LCD_IPODMINI)
/* LCD linearisation table shared by the iPod 2bpp and iPod mini LCDs:
 * measured and interpolated curve for the mini LCD. gray_init() indexes it
 * with the gamma-corrected brightness (0..255) and scales the looked-up
 * value (0..255) to a bit pattern index. Only used when gamma > 0. */
/* TODO: verify this curve on the fullsize greyscale LCD */
static const unsigned char lcdlinear[256] = {
      0,   3,   6,   8,  11,  14,  17,  19,
     22,  24,  27,  29,  32,  34,  36,  38,
     40,  42,  44,  45,  47,  48,  50,  51,
     52,  54,  55,  56,  57,  58,  58,  59,
     60,  61,  62,  62,  63,  64,  64,  65,
     66,  66,  67,  67,  68,  68,  69,  69,
     70,  70,  70,  71,  71,  71,  72,  72,
     73,  73,  73,  74,  74,  74,  74,  75,
     75,  75,  76,  76,  76,  77,  77,  77,
     78,  78,  78,  79,  79,  79,  80,  80,
     80,  80,  81,  81,  81,  82,  82,  82,
     83,  83,  83,  84,  84,  84,  85,  85,
     85,  85,  86,  86,  86,  87,  87,  87,
     87,  88,  88,  88,  89,  89,  89,  89,
     90,  90,  90,  91,  91,  91,  92,  92,
     92,  93,  93,  93,  94,  94,  94,  95,
     95,  96,  96,  96,  97,  97,  98,  98,
     99,  99,  99, 100, 100, 101, 101, 102,
    102, 103, 103, 104, 104, 105, 105, 106,
    106, 107, 107, 108, 108, 109, 109, 110,
    110, 111, 111, 112, 113, 113, 114, 114,
    115, 115, 116, 117, 117, 118, 118, 119,
    120, 120, 121, 122, 122, 123, 124, 124,
    125, 126, 126, 127, 128, 128, 129, 130,
    131, 131, 132, 133, 134, 134, 135, 136,
    137, 138, 139, 140, 141, 142, 143, 144,
    145, 146, 147, 148, 149, 150, 152, 153,
    154, 156, 157, 159, 160, 162, 163, 165,
    167, 168, 170, 172, 174, 176, 178, 180,
    182, 184, 187, 189, 192, 194, 197, 200,
    203, 206, 209, 212, 215, 219, 222, 226,
    229, 233, 236, 240, 244, 248, 251, 255
};
#endif
#else /* SIMULATOR */
/* Simulator stand-in for the per-LCD linearisation tables: undo a (generic)
 * PC display gamma of 2.0 to simulate target behaviour. The entries follow
 * a square-root curve (about 255 * sqrt(i / 255)). gray_init() indexes it
 * with the gamma-corrected brightness (0..255) when gamma > 0. */
static const unsigned char lcdlinear[256] = {
      0,  16,  23,  28,  32,  36,  39,  42,
     45,  48,  50,  53,  55,  58,  60,  62,
     64,  66,  68,  70,  71,  73,  75,  77,
     78,  80,  81,  83,  84,  86,  87,  89,
     90,  92,  93,  94,  96,  97,  98, 100,
    101, 102, 103, 105, 106, 107, 108, 109,
    111, 112, 113, 114, 115, 116, 117, 118,
    119, 121, 122, 123, 124, 125, 126, 127,
    128, 129, 130, 131, 132, 133, 134, 135,
    135, 136, 137, 138, 139, 140, 141, 142,
    143, 144, 145, 145, 146, 147, 148, 149,
    150, 151, 151, 152, 153, 154, 155, 156,
    156, 157, 158, 159, 160, 160, 161, 162,
    163, 164, 164, 165, 166, 167, 167, 168,
    169, 170, 170, 171, 172, 173, 173, 174,
    175, 176, 176, 177, 178, 179, 179, 180,
    181, 181, 182, 183, 183, 184, 185, 186,
    186, 187, 188, 188, 189, 190, 190, 191,
    192, 192, 193, 194, 194, 195, 196, 196,
    197, 198, 198, 199, 199, 200, 201, 201,
    202, 203, 203, 204, 204, 205, 206, 206,
    207, 208, 208, 209, 209, 210, 211, 211,
    212, 212, 213, 214, 214, 215, 215, 216,
    217, 217, 218, 218, 219, 220, 220, 221,
    221, 222, 222, 223, 224, 224, 225, 225,
    226, 226, 227, 228, 228, 229, 229, 230,
    230, 231, 231, 232, 233, 233, 234, 234,
    235, 235, 236, 236, 237, 237, 238, 238,
    239, 240, 240, 241, 241, 242, 242, 243,
    243, 244, 244, 245, 245, 246, 246, 247,
    247, 248, 248, 249, 249, 250, 250, 251,
    251, 252, 252, 253, 253, 254, 254, 255
};
#endif /* SIMULATOR */

/* Prototypes of the file-local helpers */
/* refreshes the LCD around the overlay; forced inline into its caller */
static inline void _deferred_update(void) __attribute__ ((always_inline));
/* fixed point exp() and log(), used by gray_init() for gamma correction */
static int exp_s16p16(int x);
static int log_s16p16(int x);
/* installed through screen_dump_set_hook() while the overlay is shown */
static void gray_screendump_hook(int fd);
#ifdef SIMULATOR
/* pixel read callback handed to sim_lcd_ex_init() */
static unsigned long _gray_get_pixel(int x, int y);
#else
/* timer interrupt handler registered by gray_show() */
static void _timer_isr(void);
#endif

/* Refresh the parts of the LCD that surround the greyscale overlay.
   The overlay rectangle is clipped to the LCD first; then up to four
   regions are updated, in this order: the full-width band above, the
   full-width band below, the strip to the left and the strip to the right.
   The overlay rectangle itself is never touched. */
static inline void _deferred_update(void)
{
    /* overlay rectangle clipped to the LCD (right/bottom are exclusive) */
    int left   = MAX(_gray_info.x, 0);
    int right  = MIN(_gray_info.x + _gray_info.width, LCD_WIDTH);
    int top    = MAX(_gray_info.y, 0);
    int bottom = MIN(_gray_info.y + _gray_info.height, LCD_HEIGHT);
    int side_height = bottom - top;  /* height of the left/right strips */

    if (top > 0)
    {   /* band above the overlay, full width */
        _gray_rb->lcd_update_rect(0, 0, LCD_WIDTH, top);
    }

    if (bottom < LCD_HEIGHT)
    {   /* band below the overlay, full width */
        _gray_rb->lcd_update_rect(0, bottom, LCD_WIDTH, LCD_HEIGHT - bottom);
    }

    if (left > 0)
    {   /* strip to the left of the overlay */
        _gray_rb->lcd_update_rect(0, top, left, side_height);
    }

    if (right < LCD_WIDTH)
    {   /* strip to the right of the overlay */
        _gray_rb->lcd_update_rect(right, top, LCD_WIDTH - right, side_height);
    }
}

#ifndef SIMULATOR
/* Timer interrupt handler: blit the current bitplane to the LCD, advance to
   the next plane (wrapping after the last one), and carry out a pending
   deferred LCD update if one was requested. */
static void _timer_isr(void)
{
    /* start of the bitplane to show in this tick */
    unsigned char *plane = _gray_info.plane_data
                         + MULU16(_gray_info.plane_size, _gray_info.cur_plane);

#if LCD_PIXELFORMAT == HORIZONTAL_PACKING
    /* horizontal packing: x and width are given in byte blocks */
    _gray_rb->lcd_blit(plane, _gray_info.bx, _gray_info.y,
                       _gray_info.bwidth, _gray_info.height,
                       _gray_info.bwidth);
#else
    /* vertical packing: y and height are given in byte blocks */
    _gray_rb->lcd_blit(plane, _gray_info.x, _gray_info.by,
                       _gray_info.width, _gray_info.bheight,
                       _gray_info.width);
#endif

    /* step to the next plane, starting over after the last one */
    _gray_info.cur_plane++;
    if (_gray_info.cur_plane >= _gray_info.depth)
        _gray_info.cur_plane = 0;

    if (!(_gray_info.flags & _GRAY_DEFERRED_UPDATE))
        return;  /* no lcd_update() requested */

    _deferred_update();
    _gray_info.flags &= ~_GRAY_DEFERRED_UPDATE;  /* request served */
}
#endif /* !SIMULATOR */

/* fixed point exp()

   x and the result are s16p16 fixed point numbers (0x00010000 == 1.0).
   The argument is reduced step by step: each threshold that still fits into
   the remainder is subtracted and the result is multiplied by the matching
   factor (a power of two, then 1 + 2^-k). The small remainder left at the
   end is applied as a linear correction.
   Negative arguments are biased by 16 * ln(2) first, with the result
   pre-scaled by 2^-16. */
static int exp_s16p16(int x)
{
    /* ln(2^n) thresholds and the corresponding left shift */
    static const struct { int ln; int shift; } pow2_steps[4] = {
        { 0x58b91, 8 },
        { 0x2c5c8, 4 },
        { 0x162e4, 2 },
        { 0x0b172, 1 }
    };
    /* ln(1 + 2^-k) thresholds for k = 1..7 */
    static const int frac_steps[7] = {
        0x067cd, 0x03920, 0x01e27, 0x00f85, 0x007e1, 0x003f8, 0x001fe
    };
    int result = 0x00010000;  /* 1.0 */
    int rest;
    int k;

    if (x < 0)
    {
        x += 0xb1721;         /* 16 * ln(2) */
        result >>= 16;
    }

    for (k = 0; k < 4; k++)
    {
        rest = x - pow2_steps[k].ln;
        if (rest >= 0)
        {
            x = rest;
            result <<= pow2_steps[k].shift;
        }
    }

    for (k = 0; k < 7; k++)
    {
        rest = x - frac_steps[k];
        if (rest >= 0)
        {
            x = rest;
            result += result >> (k + 1);  /* result *= 1 + 2^-(k+1) */
        }
    }

    /* linear correction for what is left of x */
    result += ((result >> 8) * x) >> 8;

    return result;
}

/* fixed point log()

   x and the result are s16p16 fixed point numbers (0x00010000 == 1.0).
   The result is only meaningful for x > 0.

   The argument is first normalised into 0x40000000..0x7fffffff by left
   shifts, then multiplied by factors 1 + 2^-k for as long as bit 31 stays
   clear; the logarithm of every factor applied is subtracted from the
   running result. The distance left to 0x80000000 gives a final linear
   correction.

   All mantissa arithmetic is done in unsigned int: the trial products
   routinely exceed INT_MAX, which would be undefined behaviour with a
   signed type. Assumes a 32 bit int, like the bit 31 test itself. */
static int log_s16p16(int x)
{
    /* normalisation steps: shift left if below limit, subtract ln(2^shift) */
    static const struct { unsigned int limit; int shift; int ln; }
    norm_steps[5] = {
        { 0x00008000u, 16, 0xb1721 },
        { 0x00800000u,  8, 0x58b91 },
        { 0x08000000u,  4, 0x2c5c8 },
        { 0x20000000u,  2, 0x162e4 },
        { 0x40000000u,  1, 0x0b172 }
    };
    /* ln(1 + 2^-k) for k = 1..7 */
    static const int frac_steps[7] = {
        0x067cd, 0x03920, 0x01e27, 0x00f85, 0x007e1, 0x003f8, 0x001fe
    };
    unsigned int m = (unsigned int)x;  /* mantissa being normalised */
    unsigned int t;
    int y = 0xa65af;                   /* 15 * ln(2) */
    int k;

    for (k = 0; k < 5; k++)
    {
        if (m < norm_steps[k].limit)
        {
            m <<= norm_steps[k].shift;
            y -= norm_steps[k].ln;
        }
    }

    for (k = 0; k < 7; k++)
    {
        t = m + (m >> (k + 1));        /* m * (1 + 2^-(k+1)) */
        if ((t & 0x80000000u) == 0)    /* still below 2^31: accept */
        {
            m = t;
            y -= frac_steps[k];
        }
    }

    /* linear correction from the remaining distance to 2^31 */
    y -= (int)(0x80000000u - m) >> 15;

    return y;
}

/* Initialise the framework and prepare the greyscale display buffer

 arguments:
   newrb     = pointer to plugin api
   gbuf      = pointer to the memory area to use (e.g. plugin buffer)
   gbuf_size = max usable size of the buffer
   buffered  = use chunky pixel buffering with delta buffer?
               This allows the use of all drawing functions, but needs more
               memory. Unbuffered operation provides only a subset of
               drawing functions. (only gray_bitmap drawing and scrolling)
   width     = width in pixels  (1..LCD_WIDTH)
   height    = height in pixels (1..LCD_HEIGHT)
               Note that depending on the target LCD, either height or
               width are rounded up to a multiple of 8.
   depth     = number of bitplanes to use (1..32).
   gamma     = gamma value as s8p8 fixed point. gamma <= 0 means no
               correction at all, i.e. no LCD linearisation as well.
   buf_taken = if not NULL, receives the number of bytes of gbuf that are
               used (only written when the function succeeds)

 result:
   = depth  if there was enough memory
   < depth  if there wasn't enough memory. The number of displayable
            shades is smaller than desired, but it still works
   = 0      if there wasn't even enough memory for 1 bitplane, or if
            width, height or depth are out of range

   You can request any depth in the allowed range, not just powers of 2. The
   routine performs "graceful degradation" if the memory is not sufficient for
   the desired depth. As long as there is at least enough memory for 1 bitplane,
   it creates as many bitplanes as fit into memory, although 1 bitplane won't
   deliver an enhancement over the native display.

   The number of displayable shades is calculated as follows:
   shades = depth + 1

   If you need info about the memory taken by the greyscale buffer, supply a
   long* as the last parameter. This long will then contain the number of bytes
   used. The total memory needed can be calculated as follows:
 total_mem =
     shades * sizeof(long)               (bitpatterns)
   + [horizontal_packing] ?              (bitplane data)
       ((width + 7) / 8) * height * depth : width * ((height + 7) / 8) * depth
   + buffered ?                          (chunky front- & backbuffer)
       (width * height * 2) : 0
   + 0..3                                (longword alignment)

   The function tries to be as authentic as possible regarding memory usage on
   the simulator, even if it doesn't use all of the allocated memory. There's
   one situation where it will consume more memory on the sim than on the
   target: if you're allocating a low depth (< 8) without buffering. */
int gray_init(struct plugin_api* newrb, unsigned char *gbuf, long gbuf_size,
              bool buffered, int width, int height, int depth, int gamma,
              long *buf_taken)
{
    int possible_depth, bdim, i;
    long plane_size, buftaken;
    unsigned data;
#ifndef SIMULATOR
    int j, bitfill;
#endif

    _gray_rb = newrb;

    /* the unsigned casts reject negative sizes as well */
    if ((unsigned) width > LCD_WIDTH
        || (unsigned) height > LCD_HEIGHT
        || depth < 1)
        return 0;

    /* round the packed dimension up to whole bytes (blocks of 8 pixels) */
#if LCD_PIXELFORMAT == HORIZONTAL_PACKING
    bdim = (width + 7) >> 3;
    width = bdim << 3;
#else
    bdim = (height + 7) >> 3;
    height = bdim << 3;
#endif

    /* the buffer has to be long aligned */
    buftaken = (-(long)gbuf) & 3;
    gbuf += buftaken;

    /* chunky front- & backbuffer */
    if (buffered)
    {
        plane_size = MULU16(width, height);
        buftaken += 2 * plane_size;
        if (buftaken > gbuf_size)
            return 0;

        _gray_info.cur_buffer = gbuf;
        gbuf += plane_size;
        /* set backbuffer to 0xFF to guarantee the initial full update */
        _gray_rb->memset(gbuf, 0xFF, plane_size);
        _gray_info.back_buffer = gbuf;
        gbuf += plane_size;
    }

#if LCD_PIXELFORMAT == HORIZONTAL_PACKING
    plane_size = MULU16(bdim, height);
#else
    plane_size = MULU16(width, bdim);
#endif
    /* Each bitplane needs plane_size bytes plus one bitpattern entry, and
       there is one more bitpattern than bitplanes.
       The calculation must stay signed: sizeof yields an unsigned type, and
       with it a buffer with less than sizeof(long) bytes left would wrap
       around to a huge "possible" depth instead of a value below 1. */
    possible_depth = (gbuf_size - buftaken - (long)sizeof(long))
                     / (plane_size + (long)sizeof(long));

    if (possible_depth < 1)
        return 0;

    depth = MIN(depth, 32);
    depth = MIN(depth, possible_depth);

#ifdef SIMULATOR
    if (!buffered)
    {
        /* what the target would take for this configuration */
        long orig_size = MULU16(depth, plane_size) + (depth + 1) * sizeof(long);

        /* the simulator needs one chunky buffer even if unbuffered */
        plane_size = MULU16(width, height);
        if (plane_size > orig_size)
        {
            buftaken += plane_size;
            if (buftaken > gbuf_size)
                return 0;
        }
        else
        {
            buftaken += orig_size;
        }
        _gray_info.cur_buffer = gbuf;
    }
    else
#endif
        buftaken += MULU16(depth, plane_size) + (depth + 1) * sizeof(long);

    _gray_info.x = 0;
    _gray_info.y = 0;
    _gray_info.width = width;
    _gray_info.height = height;
#if LCD_PIXELFORMAT == HORIZONTAL_PACKING
    _gray_info.bx = 0;
    _gray_info.bwidth = bdim;
#else
    _gray_info.by = 0;
    _gray_info.bheight = bdim;
#endif
    _gray_info.depth = depth;
    _gray_info.flags = 0;
#ifndef SIMULATOR
    _gray_info.cur_plane = 0;
    _gray_info.plane_size = plane_size;
    _gray_info.plane_data = gbuf;
    _gray_rb->memset(gbuf, 0, MULU16(depth, plane_size));
    gbuf += MULU16(depth, plane_size);
    _gray_info.bitpattern = (unsigned long *)gbuf;

    /* randmask: just enough low bits set to hold the value depth - 1 */
    i = depth - 1;
    j = 8;
    while (i != 0)
    {
        i >>= 1;
        j--;
    }
    _gray_info.randmask = 0xFFu >> j;
    /* number of bits needed to pad depth up to a multiple of 8 */
    bitfill = (-depth) & 7;

    /* Precalculate the bit patterns for all possible pixel values */
    for (i = 0; i <= depth; i++)
    {
        unsigned long pattern = 0;
        int value = 0;

        /* spread (depth - i) "black" bits evenly over depth bits */
        for (j = 0; j < depth; j++)
        {
            pattern <<= 1;
            value += i;

            if (value >= depth)
                value -= depth;   /* "white" bit */
            else
                pattern |= 1;     /* "black" bit */
        }
        /* now the lower <depth> bits contain the pattern */

        _gray_info.bitpattern[i] = pattern << bitfill;
    }
#endif

    /* precalculate the value -> pattern index conversion table, taking
       linearisation and gamma correction into account */
    if (gamma <= 0)
    {
        /* no correction: plain linear scaling of 0..255 to 0..depth */
        for (i = 0; i < 256; i++)
        {
            data = MULU16(depth, i) + 127;
            _gray_info.idxtable[i] = (data + (data >> 8)) >> 8;
                                      /* approx. data / 255 */
        }
    }
    else
    {
        for (i = 0; i < 256; i++)
        {
            /* (i / 255) ^ gamma, calculated as exp(gamma * log(i / 255))
               with i scaled to 1..65536, i.e. up to 1.0 in s16p16 */
            data = exp_s16p16((gamma * log_s16p16(i * 257 + 1)) >> 8) + 128;
            data = (data - (data >> 8)) >> 8; /* approx. data /= 257 */
            data = MULU16(depth, lcdlinear[data]) + 127;
            _gray_info.idxtable[i] = (data + (data >> 8)) >> 8;
                                      /* approx. data / 255 */
        }
    }

    /* default drawing state: black on white, solid mode, system font */
    _gray_info.fg_index = 0;
    _gray_info.bg_index = depth;
    _gray_info.fg_brightness = 0;
    _gray_info.bg_brightness = 255;
    _gray_info.drawmode = DRMODE_SOLID;
    _gray_info.curfont = FONT_SYSFIXED;

    if (buf_taken)  /* caller requested info about space taken */
        *buf_taken = buftaken;

    return depth;
}

/* Release the greyscale display buffer and the library
   DO CALL either this function or at least gray_show(false)
   before you exit, otherwise nasty things may happen.
   Currently this only switches the overlay off via gray_show(false). */
void gray_release(void)
{
    gray_show(false);
}

/* Switch the greyscale overlay on or off

 arguments:
   enable = true to start displaying the overlay, false to stop it.
            Requesting the state the overlay is already in does nothing.

   While the overlay is running, DO NOT call lcd_update() or any other api
   function that directly accesses the lcd! If you need to do lcd_update()
   to update something outside the greyscale overlay area, use
   gray_deferred_update() instead.

 Other functions to avoid are:
   lcd_blit() (obviously), lcd_update_rect(), lcd_set_contrast(),
   lcd_set_invert_display(), lcd_set_flip(), lcd_roll() */
void gray_show(bool enable)
{
    if (enable)
    {
        if (_gray_info.flags & _GRAY_RUNNING)
            return;  /* already switched on */

        _gray_info.flags |= _GRAY_RUNNING;
#ifdef SIMULATOR
        _gray_rb->sim_lcd_ex_init(_gray_info.depth + 1, _gray_get_pixel);
        gray_update();
#else /* !SIMULATOR */
#ifdef NEED_BOOST
        _gray_rb->cpu_boost(true);
#endif
        /* start the bitplane timer at the rate chosen for this LCD */
#if CONFIG_LCD == LCD_SSD1815
        _gray_rb->timer_register(1, NULL, TIMER_FREQ / 67, 1, _timer_isr);
#elif CONFIG_LCD == LCD_S1D15E06
        _gray_rb->timer_register(1, NULL, TIMER_FREQ / 70, 1, _timer_isr);
#elif CONFIG_LCD == LCD_IPOD2BPP
        /* FIXME: verify value */
        _gray_rb->timer_register(1, NULL, TIMER_FREQ / 80, 1, _timer_isr);
#elif CONFIG_LCD == LCD_IPODMINI
        _gray_rb->timer_register(1, NULL, TIMER_FREQ / 88, 1, _timer_isr);
#elif CONFIG_LCD == LCD_IFP7XX
        (void)_timer_isr;   /* TODO: implement for iFP */
#endif /* CONFIG_LCD */
#endif /* !SIMULATOR */
        /* screen dumps go through the greyscale hook while running */
        _gray_rb->screen_dump_set_hook(gray_screendump_hook);
        return;
    }

    /* switching off */
    if (!(_gray_info.flags & _GRAY_RUNNING))
        return;  /* not running, nothing to stop */

#ifdef SIMULATOR
    _gray_rb->sim_lcd_ex_init(0, NULL);
#else
    _gray_rb->timer_unregister();
#ifdef NEED_BOOST
    _gray_rb->cpu_boost(false);
#endif
#endif
    _gray_info.flags &= ~_GRAY_RUNNING;
    _gray_rb->screen_dump_set_hook(NULL);
    _gray_rb->lcd_update(); /* restore whatever there was before */
}

#ifdef SIMULATOR
/* Pixel read callback for the simulator, registered through
   sim_lcd_ex_init() in gray_show(). Returns the value stored in the current
   greyscale buffer for one pixel, offset by (1 << LCD_DEPTH).
   Note that x and y are in LCD coordinates, not graybuffer coordinates! */
static unsigned long _gray_get_pixel(int x, int y)
{
    /* convert LCD coordinates to graybuffer coordinates */
    int gx = x - _gray_info.x;
    int gy = y - _gray_info.y;

#if LCD_PIXELFORMAT == HORIZONTAL_PACKING
    /* buffer is stored row by row */
    return _gray_info.cur_buffer[MULU16(gy, _gray_info.width) + gx]
           + (1 << LCD_DEPTH);
#else
    /* buffer is stored column by column */
    return _gray_info.cur_buffer[MULU16(gx, _gray_info.height) + gy]
           + (1 << LCD_DEPTH);
#endif
}

/* Update a rectangular area of the greyscale overlay (simulator version)

 arguments:
   x, y          = top left corner of the area, in overlay coordinates
   width, height = size of the area in pixels

   The area is clipped against the right and bottom edges of the overlay and
   of the LCD. If nothing is left after clipping (or the size was not
   positive to begin with), the function returns without doing anything,
   like the target version does for empty rectangles. */
void gray_update_rect(int x, int y, int width, int height)
{
    /* clip to the overlay */
    if (x + width > _gray_info.width)
        width = _gray_info.width - x;
    if (y + height > _gray_info.height)
        height = _gray_info.height - y;

    /* overlay coordinates -> LCD coordinates */
    x += _gray_info.x;
    y += _gray_info.y;

    /* clip to the LCD */
    if (x + width > LCD_WIDTH)
        width = LCD_WIDTH - x;
    if (y + height > LCD_HEIGHT)
        height = LCD_HEIGHT - y;

    /* Clipping only ever shrinks the rectangle, so this single check also
       catches sizes that were not positive on entry. */
    if ((width <= 0) || (height <= 0))
        return; /* nothing to do */

    _gray_rb->sim_lcd_ex_update_rect(x, y, width, height);
}

#else /* !SIMULATOR */

#if LCD_PIXELFORMAT == HORIZONTAL_PACKING

/* Update a rectangular area of the greyscale overlay */
void gray_update_rect(int x, int y, int width, int height)
{
    int xmax, bwidth;
    long srcofs;
    unsigned char *dst;

    if ((width <= 0) || (height <= 0))
        return; /* nothing to do */

    /* The X coordinates have to work on whole pixel block columns */
    xmax = (x + width - 1) >> 3;
    x >>= 3;

    if (y + height > _gray_info.height)
        height = _gray_info.height - y;
    if (xmax >= _gray_info.bwidth)
        xmax = _gray_info.bwidth - 1;
    bwidth = xmax - x + 1;
        
    srcofs = MULU16(_gray_info.width, y) + (x << 3);
    dst = _gray_info.plane_data + MULU16(_gray_info.bwidth, y) + x;
    
    /* Copy specified rectangle bitmap to hardware */
    for (; height > 0; height--)
    {
        long srcofs_row = srcofs;
        unsigned char *dst_row = dst;
        unsigned char *dst_end = dst_row + bwidth;

        do
        {
            unsigned long pat_stack[8];
            unsigned long *pat_ptr;
            unsigned char *cbuf, *bbuf;
            unsigned change;

            cbuf = _gray_info.cur_buffer + srcofs_row;
            bbuf = _gray_info.back_buffer + srcofs_row;

#ifdef CPU_ARM
            asm volatile 
            (
                "ldr     r0, [%[cbuf]]           \n"
                "ldr     r1, [%[bbuf]]           \n"
                "eor     r1, r0, r1              \n"
                "ldr     r0, [%[cbuf], #4]       \n"
                "ldr     %[chg], [%[bbuf], #4]   \n"
                "eor     %[chg], r0, %[chg]      \n"
                "orr     %[chg], %[chg], r1      \n"
                : /* outputs */
                [chg] "=&r"(change)
                : /* inputs */
                [cbuf]"r"(cbuf),
                [bbuf]"r"(bbuf)
                : /* clobbers */
                "r0", "r1"
            );

            if (change != 0)
            {
                unsigned char *addr;
                unsigned mask, depth, trash;
                
                pat_ptr = &pat_stack[8];
                
                /* precalculate the bit patterns with random shifts
                 * for all 8 pixels and put them on an extra "stack" */
                asm volatile 
                (
        "mov     r3, #8                      \n"  /* loop count */
        "mov     %[mask], #0                 \n"

    ".ur_pre_loop:                           \n"
        "mov     %[mask], %[mask], lsl #1    \n"  /* shift mask */
        "ldrb    r0, [%[cbuf]], #1           \n"  /* read current buffer */
        "ldrb    r1, [%[bbuf]]               \n"  /* read back buffer */
        "strb    r0, [%[bbuf]], #1           \n"  /* update back buffer */
        "mov     r2, #0                      \n"  /* preset for skipped pixel */
        "cmp     r0, r1                      \n"  /* no change? */
        "beq     .ur_skip                    \n"  /* -> skip */

        "ldr     r2, [%[bpat], r0, lsl #2]   \n"  /* r2 = bitpattern[byte]; */

        "add     %[rnd], %[rnd], %[rnd], lsl #2  \n"  /* multiply by 75 */
        "rsb     %[rnd], %[rnd], %[rnd], lsl #4  \n"
        "add     %[rnd], %[rnd], #74         \n"  /* add another 74 */
        /* Since the lower bits are not very random:   get bits 8..15 (need max. 5) */
        "and     r1, %[rmsk], %[rnd], lsr #8 \n"  /* ..and mask out unneeded bits */

        "cmp     r1, %[dpth]                 \n"  /* random >= depth ? */
        "subhs   r1, r1, %[dpth]             \n"  /* yes: random -= depth */

        "mov     r0, r2, lsl r1              \n"  /** rotate pattern **/
        "sub     r1, %[dpth], r1             \n"
        "orr     r2, r0, r2, lsr r1          \n"

        "orr     %[mask], %[mask], #1        \n"  /* set mask bit */
                    
    ".ur_skip:                               \n"
        "str     r2, [%[patp], #-4]!         \n"  /* push on pattern stack */

        "subs    r3, r3, #1                  \n"  /* loop 8 times (pixel block) */
        "bne     .ur_pre_loop                \n"
        : /* outputs */
        [cbuf]"+r"(cbuf),
        [bbuf]"+r"(bbuf),
        [patp]"+r"(pat_ptr),
        [rnd] "+r"(_gray_random_buffer),
        [mask]"=&r"(mask)
        : /* inputs */
        [bpat]"r"(_gray_info.bitpattern),
        [dpth]"r"(_gray_info.depth),
        [rmsk]"r"(_gray_info.randmask)
        : /* clobbers */
        "r0", "r1", "r2", "r3"
                );

                addr = dst_row;
                depth = _gray_info.depth;

                /* set the bits for all 8 pixels in all bytes according to the
                 * precalculated patterns on the pattern stack */
                asm volatile
                (
        "ldmia   %[patp], {r1 - r8}          \n"  /* pop all 8 patterns */

        /** Rotate the four 8x8 bit "blocks" within r1..r8 **/

        "mov     %[rx], #0xF0                \n"  /** Stage 1: 4 bit "comb" **/
        "orr     %[rx], %[rx], %[rx], lsl #8 \n"
        "orr     %[rx], %[rx], %[rx], lsl #16\n"  /* bitmask = ...11110000 */
        "eor     r0, r1, r5, lsl #4          \n"
        "and     r0, r0, %[rx]               \n"
        "eor     r1, r1, r0                  \n"  /* r1 = ...e3e2e1e0a3a2a1a0 */
        "eor     r5, r5, r0, lsr #4          \n"  /* r5 = ...e7e6e5e4a7a6a5a4 */
        "eor     r0, r2, r6, lsl #4          \n"
        "and     r0, r0, %[rx]               \n"
        "eor     r2, r2, r0                  \n"  /* r2 = ...f3f2f1f0b3b2b1b0 */
        "eor     r6, r6, r0, lsr #4          \n"  /* r6 = ...f7f6f5f4f7f6f5f4 */
        "eor     r0, r3, r7, lsl #4          \n"
        "and     r0, r0, %[rx]               \n"
        "eor     r3, r3, r0                  \n"  /* r3 = ...g3g2g1g0c3c2c1c0 */
        "eor     r7, r7, r0, lsr #4          \n"  /* r7 = ...g7g6g5g4c7c6c5c4 */
        "eor     r0, r4, r8, lsl #4          \n"
        "and     r0, r0, %[rx]               \n"
        "eor     r4, r4, r0                  \n"  /* r4 = ...h3h2h1h0d3d2d1d0 */
        "eor     r8, r8, r0, lsr #4          \n"  /* r8 = ...h7h6h5h4d7d6d5d4 */

        "mov     %[rx], #0xCC                \n"  /** Stage 2: 2 bit "comb" **/
        "orr     %[rx], %[rx], %[rx], lsl #8 \n"
        "orr     %[rx], %[rx], %[rx], lsl #16\n"  /* bitmask = ...11001100 */
        "eor     r0, r1, r3, lsl #2          \n"
        "and     r0, r0, %[rx]               \n"
        "eor     r1, r1, r0                  \n"  /* r1 = ...g1g0e1e0c1c0a1a0 */
        "eor     r3, r3, r0, lsr #2          \n"  /* r3 = ...g3g2e3e2c3c2a3a2 */
        "eor     r0, r2, r4, lsl #2          \n"
        "and     r0, r0, %[rx]               \n"
        "eor     r2, r2, r0                  \n"  /* r2 = ...h1h0f1f0d1d0b1b0 */
        "eor     r4, r4, r0, lsr #2          \n"  /* r4 = ...h3h2f3f2d3d2b3b2 */
        "eor     r0, r5, r7, lsl #2          \n"
        "and     r0, r0, %[rx]               \n"
        "eor     r5, r5, r0                  \n"  /* r5 = ...g5g4e5e4c5c4a5a4 */
        "eor     r7, r7, r0, lsr #2          \n"  /* r7 = ...g7g6e7e6c7c6a7a6 */
        "eor     r0, r6, r8, lsl #2          \n"
        "and     r0, r0, %[rx]               \n"
        "eor     r6, r6, r0                  \n"  /* r6 = ...h5h4f5f4d5d4b5b4 */
        "eor     r8, r8, r0, lsr #2          \n"  /* r8 = ...h7h6f7f6d7d6b7b6 */

        "mov     %[rx], #0xAA                \n"  /** Stage 3: 1 bit "comb" **/
        "orr     %[rx], %[rx], %[rx], lsl #8 \n"
        "orr     %[rx], %[rx], %[rx], lsl #16\n"  /* bitmask = ...10101010 */
        "eor     r0, r1, r2, lsl #1          \n"
        "and     r0, r0, %[rx]               \n"
        "eor     r1, r1, r0                  \n"  /* r1 = ...h0g0f0e0d0c0b0a0 */
        "eor     r2, r2, r0, lsr #1          \n"  /* r2 = ...h1g1f1e1d1c1b1a1 */
        "eor     r0, r3, r4, lsl #1          \n"
        "and     r0, r0, %[rx]               \n"
        "eor     r3, r3, r0                  \n"  /* r3 = ...h2g2f2e2d2c2b2a2 */
        "eor     r4, r4, r0, lsr #1          \n"  /* r4 = ...h3g3f3e3d3c3b3a3 */
        "eor     r0, r5, r6, lsl #1          \n"
        "and     r0, r0, %[rx]               \n"
        "eor     r5, r5, r0                  \n"  /* r5 = ...h4g4f4e4d4c4b4a4 */
        "eor     r6, r6, r0, lsr #1          \n"  /* r6 = ...h5g5f5e5d5c5b5a5 */
        "eor     r0, r7, r8, lsl #1          \n"
        "and     r0, r0, %[rx]               \n"
        "eor     r7, r7, r0                  \n"  /* r7 = ...h6g6f6e6d6c6b6a6 */
        "eor     r8, r8, r0, lsr #1          \n"  /* r8 = ...h7g7f7e7d7c7b7a7 */

        "sub     r0, %[dpth], #1             \n"  /** shift out unused low bytes **/
        "and     r0, r0, #7                  \n"
        "add     pc, pc, r0, lsl #2          \n"  /* jump into shift streak */
        "mov     r8, r8, lsr #8              \n"  /* r8: never reached */
        "mov     r7, r7, lsr #8              \n"
        "mov     r6, r6, lsr #8              \n"
        "mov     r5, r5, lsr #8              \n"
        "mov     r4, r4, lsr #8              \n"
        "mov     r3, r3, lsr #8              \n"
        "mov     r2, r2, lsr #8              \n"
        "mov     r1, r1, lsr #8              \n"

        "mvn     %[mask], %[mask]            \n"  /* "set" mask -> "keep" mask */
        "ands    %[mask], %[mask], #0xff     \n"
        "beq     .ur_sstart                  \n"  /* short loop if no bits to keep */

        "ldrb    r0, [pc, r0]                \n"  /* jump into full loop */
        "add     pc, pc, r0                  \n"
    ".ur_ftable:                             \n"
        ".byte   .ur_f1 - .ur_ftable - 4     \n"  /* [jump tables are tricky] */
        ".byte   .ur_f2 - .ur_ftable - 4     \n"
        ".byte   .ur_f3 - .ur_ftable - 4     \n"
        ".byte   .ur_f4 - .ur_ftable - 4     \n"
        ".byte   .ur_f5 - .ur_ftable - 4     \n"
        ".byte   .ur_f6 - .ur_ftable - 4     \n"
        ".byte   .ur_f7 - .ur_ftable - 4     \n"
        ".byte   .ur_f8 - .ur_ftable - 4     \n"

    ".ur_floop:                              \n"  /** full loop (bits to keep)**/
    ".ur_f8:                                 \n"
        "ldrb    r0, [%[addr]]               \n"  /* load old byte */
        "and     r0, r0, %[mask]             \n"  /* mask out replaced bits */
        "orr     r0, r0, r1                  \n"  /* set new bits */
        "strb    r0, [%[addr]], %[psiz]      \n"  /* store byte */
        "mov     r1, r1, lsr #8              \n"  /* shift out used-up byte */
    ".ur_f7:                                 \n"
        "ldrb    r0, [%[addr]]               \n"
        "and     r0, r0, %[mask]             \n"
        "orr     r0, r0, r2                  \n"
        "strb    r0, [%[addr]], %[psiz]      \n"
        "mov     r2, r2, lsr #8              \n"
    ".ur_f6:                                 \n"
        "ldrb    r0, [%[addr]]               \n"
        "and     r0, r0, %[mask]             \n"
        "orr     r0, r0, r3                  \n"
        "strb    r0, [%[addr]], %[psiz]      \n"
        "mov     r3, r3, lsr #8              \n"
    ".ur_f5:                                 \n"
        "ldrb    r0, [%[addr]]               \n"
        "and     r0, r0, %[mask]             \n"
        "orr     r0, r0, r4                  \n"
        "strb    r0, [%[addr]], %[psiz]      \n"
        "mov     r4, r4, lsr #8              \n"
    ".ur_f4:                                 \n"
        "ldrb    r0, [%[addr]]               \n"
        "and     r0, r0, %[mask]             \n"
        "orr     r0, r0, r5                  \n"
        "strb    r0, [%[addr]], %[psiz]      \n"
        "mov     r5, r5, lsr #8              \n"
    ".ur_f3:                                 \n"
        "ldrb    r0, [%[addr]]               \n"
        "and     r0, r0, %[mask]             \n"
        "orr     r0, r0, r6                  \n"
        "strb    r0, [%[addr]], %[psiz]      \n"
        "mov     r6, r6, lsr #8              \n"
    ".ur_f2:                                 \n"
        "ldrb    r0, [%[addr]]               \n"
        "and     r0, r0, %[mask]             \n"
        "orr     r0, r0, r7                  \n"
        "strb    r0, [%[addr]], %[psiz]      \n"
        "mov     r7, r7, lsr #8              \n"
    ".ur_f1:                                 \n"
        "ldrb    r0, [%[addr]]               \n"
        "and     r0, r0, %[mask]             \n"
        "orr     r0, r0, r8                  \n"
        "strb    r0, [%[addr]], %[psiz]      \n"
        "mov     r8, r8, lsr #8              \n"

        "subs    %[dpth], %[dpth], #8        \n"  /* next round if anything left */
        "bhi     .ur_floop                   \n"

        "b       .ur_end                     \n"

    ".ur_sstart:                             \n"
        "ldrb    r0, [pc, r0]                \n"  /* jump into short loop*/
        "add     pc, pc, r0                  \n"
    ".ur_stable:                             \n"
        ".byte   .ur_s1 - .ur_stable - 4     \n"
        ".byte   .ur_s2 - .ur_stable - 4     \n"
        ".byte   .ur_s3 - .ur_stable - 4     \n"
        ".byte   .ur_s4 - .ur_stable - 4     \n"
        ".byte   .ur_s5 - .ur_stable - 4     \n"
        ".byte   .ur_s6 - .ur_stable - 4     \n"
        ".byte   .ur_s7 - .ur_stable - 4     \n"
        ".byte   .ur_s8 - .ur_stable - 4     \n"

    ".ur_sloop:                              \n"  /** short loop (nothing to keep) **/
    ".ur_s8:                                 \n"
        "strb    r1, [%[addr]], %[psiz]      \n"  /* store byte */
        "mov     r1, r1, lsr #8              \n"  /* shift out used-up byte */
    ".ur_s7:                                 \n"
        "strb    r2, [%[addr]], %[psiz]      \n"
        "mov     r2, r2, lsr #8              \n"
    ".ur_s6:                                 \n"
        "strb    r3, [%[addr]], %[psiz]      \n"
        "mov     r3, r3, lsr #8              \n"
    ".ur_s5:                                 \n"
        "strb    r4, [%[addr]], %[psiz]      \n"
        "mov     r4, r4, lsr #8              \n"
    ".ur_s4:                                 \n"
        "strb    r5, [%[addr]], %[psiz]      \n"
        "mov     r5, r5, lsr #8              \n"
    ".ur_s3:                                 \n"
        "strb    r6, [%[addr]], %[psiz]      \n"
        "mov     r6, r6, lsr #8              \n"
    ".ur_s2:                                 \n"
        "strb    r7, [%[addr]], %[psiz]      \n"
        "mov     r7, r7, lsr #8              \n"
    ".ur_s1:                                 \n"
        "strb    r8, [%[addr]], %[psiz]      \n"
        "mov     r8, r8, lsr #8              \n"

        "subs    %[dpth], %[dpth], #8        \n"  /* next round if anything left */
        "bhi     .ur_sloop                   \n"

    ".ur_end:                                \n"
        : /* outputs */
        [addr]"+r"(addr),
        [mask]"+r"(mask),
        [dpth]"+r"(depth),
        [rx]  "=&r"(trash)
        : /* inputs */
        [psiz]"r"(_gray_info.plane_size),
        [patp]"[rx]"(pat_ptr)
        : /* clobbers */
        "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
                 );
             }
#else /* C version, for reference*/
#warning C version of gray_update_rect() used
            (void)pat_ptr;
            /* check whether anything changed in the 8-pixel block */
            change  = *(uint32_t *)cbuf ^ *(uint32_t *)bbuf;
            change |= *(uint32_t *)(cbuf + 4) ^ *(uint32_t *)(bbuf + 4);

            if (change != 0)
            {
                unsigned char *addr, *end;
                unsigned mask = 0;
                unsigned test = 1 << ((-_gray_info.depth) & 7);
                int i;

                /* precalculate the bit patterns with random shifts
                 * for all 8 pixels and put them on an extra "stack" */
                for (i = 7; i >= 0; i--)
                {
                    unsigned pat = 0;
                    unsigned char cur = *cbuf++;
                    unsigned char back = *bbuf;
                    
                    *bbuf++ = cur;

                    mask <<= 1;
                    if (cur != back)
                    {
                        int shift;

                        pat = _gray_info.bitpattern[cur];

                        /* shift pattern pseudo-random, simple & fast PRNG */
                        _gray_random_buffer = 75 * _gray_random_buffer + 74;
                        shift = (_gray_random_buffer >> 8) & _gray_info.randmask;
                        if (shift >= _gray_info.depth)
                            shift -= _gray_info.depth;
                            
                        pat = (pat << shift) | (pat >> (_gray_info.depth - shift));
                        
                        mask |= 1;
                    }
                    pat_stack[i] = pat;
                }

                addr = dst_row;
                end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);

                /* set the bits for all 8 pixels in all bytes according to the
                 * precalculated patterns on the pattern stack */
                mask = (~mask & 0xff);
                if (mask == 0)
                {
                    do
                    {
                        unsigned data = 0;

                        for (i = 7; i >= 0; i--)
                            data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0);

                        *addr = data;
                        addr += _gray_info.plane_size;
                        test <<= 1;
                    }
                    while (addr < end);
                }
                else
                {
                    do
                    {
                        unsigned data = 0;

                        for (i = 7; i >= 0; i--)
                            data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0);

                        *addr = (*addr & mask) | data;
                        addr += _gray_info.plane_size;
                        test <<= 1;
                    }
                    while (addr < end);
                }

            }
#endif /* CONFIG_CPU */
            srcofs_row += 8;
            dst_row++;
        }
        while (dst_row < dst_end);
        
        srcofs += _gray_info.width;
        dst += _gray_info.bwidth;
    }
}
#else /* LCD_PIXELFORMAT == VERTICAL_PACKING */

/* Update a rectangular area of the greyscale overlay */
void gray_update_rect(int x, int y, int width, int height)
{
    int ymax;
    long srcofs;
    unsigned char *dst;

    if ((width <= 0) || (height <= 0))
        return; /* nothing to do */

    /* The Y coordinates have to work on whole pixel block rows */
    ymax = (y + height - 1) >> 3;
    y >>= 3;

    if (x + width > _gray_info.width)
        width = _gray_info.width - x;
    if (ymax >= _gray_info.bheight)
        ymax = _gray_info.bheight - 1;
        
    srcofs = (y << 3) + MULU16(_gray_info.height, x);
    dst = _gray_info.plane_data + MULU16(_gray_info.width, y) + x;
    
    /* Copy specified rectangle bitmap to hardware */
    for (; y <= ymax; y++)
    {
        long srcofs_row = srcofs;
        unsigned char *dst_row = dst;
        unsigned char *dst_end = dst_row + width;

        do
        {
            unsigned long pat_stack[8];
            unsigned long *pat_ptr;
            unsigned char *cbuf, *bbuf;
            unsigned change;

            cbuf = _gray_info.cur_buffer + srcofs_row;
            bbuf = _gray_info.back_buffer + srcofs_row;

#if CONFIG_CPU == SH7034
            asm volatile (
                "mov.l   @%[bbuf], r2        \n"
                "mov.l   @%[cbuf], r1        \n"
                "mov.l   @(4,%[bbuf]), %[chg]\n"
                "xor     r1, r2              \n"
                "mov.l   @(4,%[cbuf]), r1    \n"
                "xor     r1, %[chg]          \n"
                "or      r2, %[chg]          \n"
                : /* outputs */
                [chg] "=r"(change)
                : /* inputs */
                [cbuf]"r"(cbuf),
                [bbuf]"r"(bbuf)
                : /* clobbers */
                 "r1", "r2"
            );

            if (change != 0)
            {
                unsigned char *addr;
                unsigned mask, depth, trash;

                pat_ptr = &pat_stack[8];

                /* precalculate the bit patterns with random shifts
                 * for all 8 pixels and put them on an extra "stack" */
                asm volatile 
                (
        "mov     #8, r3              \n"  /* loop count */

    ".ur_pre_loop:                   \n"
        "mov.b   @%[cbuf]+, r0       \n"  /* read current buffer */
        "mov.b   @%[bbuf], r1        \n"  /* read back buffer */
        "mov     #0, r2              \n"  /* preset for skipped pixel */
        "mov.b   r0, @%[bbuf]        \n"  /* update back buffer */
        "add     #1, %[bbuf]         \n"
        "cmp/eq  r0, r1              \n"  /* no change? */
        "bt      .ur_skip            \n"  /* -> skip */

        "mov     #75, r1             \n"
        "mulu    r1, %[rnd]          \n"  /* multiply by 75 */
        "shll2   r0                  \n"  /* pixel value -> pattern offset */
        "mov.l   @(r0,%[bpat]), r4   \n"  /* r4 = bitpattern[byte]; */
        "sts     macl, %[rnd]        \n"
        "add     #74, %[rnd]         \n"  /* add another 74 */
        /* Since the lower bits are not very random: */
        "swap.b  %[rnd], r1          \n"  /* get bits 8..15 (need max. 5) */
        "and     %[rmsk], r1         \n"  /* mask out unneeded bits */

        "cmp/hs  %[dpth], r1         \n"  /* random >= depth ? */
        "bf      .ur_ntrim           \n"
        "sub     %[dpth], r1         \n"  /* yes: random -= depth; */
    ".ur_ntrim:                      \n"
                
        "mov.l   .ashlsi3, r0        \n"  /** rotate pattern **/
        "jsr     @r0                 \n"  /* r4 -> r0, shift left by r5 */
        "mov     r1, r5              \n"

        "mov     %[dpth], r5         \n"
        "sub     r1, r5              \n"  /* r5 = depth - r1 */
        "mov.l   .lshrsi3, r1        \n"
        "jsr     @r1                 \n"  /* r4 -> r0, shift right by r5 */
        "mov     r0, r2              \n"  /* store previous result in r2 */
                                         
        "or      r0, r2              \n"  /* rotated_pattern = r2 | r0 */
        "clrt                        \n"  /* mask bit = 0 (replace) */

    ".ur_skip:                       \n"  /* T == 1 if skipped */
        "rotcr   %[mask]             \n"  /* get mask bit */
        "mov.l   r2, @-%[patp]       \n"  /* push on pattern stack */

        "add     #-1, r3             \n"  /* loop 8 times (pixel block) */
        "cmp/pl  r3                  \n"
        "bt      .ur_pre_loop        \n"

        "shlr8   %[mask]             \n"  /* shift mask to low byte */
        "shlr16  %[mask]             \n"
        : /* outputs */
        [cbuf]"+r"(cbuf),
        [bbuf]"+r"(bbuf),
        [rnd] "+r"(_gray_random_buffer),
        [patp]"+r"(pat_ptr),
        [mask]"=&r"(mask)
        : /* inputs */
        [dpth]"r"(_gray_info.depth),
        [bpat]"r"(_gray_info.bitpattern),
        [rmsk]"r"(_gray_info.randmask)
        : /* clobbers */
        "r0", "r1", "r2", "r3", "r4", "r5", "macl", "pr"
                );

                addr = dst_row;
                depth = _gray_info.depth;

                /* set the bits for all 8 pixels in all bytes according to the
                 * precalculated patterns on the pattern stack */
                asm volatile
                (
        "mov.l   @%[patp]+, r8       \n"  /* pop all 8 patterns */
        "mov.l   @%[patp]+, r7       \n"
        "mov.l   @%[patp]+, r6       \n"
        "mov.l   @%[patp]+, r5       \n"
        "mov.l   @%[patp]+, r4       \n"
        "mov.l   @%[patp]+, r3       \n"
        "mov.l   @%[patp]+, r2       \n"
        "mov.l   @%[patp], r1        \n"

        /** Rotate the four 8x8 bit "blocks" within r1..r8 **/
                                          
        "mov.l   .ur_mask4, %[rx]    \n"  /* bitmask = ...11110000 */
        "mov     r5, r0              \n"  /** Stage 1: 4 bit "comb" **/
        "shll2   r0                  \n"
        "shll2   r0                  \n"
        "xor     r1, r0              \n"
        "and     %[rx], r0           \n"
        "xor     r0, r1              \n"  /* r1 = ...e3e2e1e0a3a2a1a0 */
        "shlr2   r0                  \n"
        "shlr2   r0                  \n"
        "xor     r0, r5              \n"  /* r5 = ...e7e6e5e4a7a6a5a4 */
        "mov     r6, r0              \n"
        "shll2   r0                  \n"
        "shll2   r0                  \n"
        "xor     r2, r0              \n"
        "and     %[rx], r0           \n"
        "xor     r0, r2              \n"  /* r2 = ...f3f2f1f0b3b2b1b0 */
        "shlr2   r0                  \n"
        "shlr2   r0                  \n"
        "xor     r0, r6              \n"  /* r6 = ...f7f6f5f4f7f6f5f4 */
        "mov     r7, r0              \n"
        "shll2   r0                  \n"
        "shll2   r0                  \n"
        "xor     r3, r0              \n"
        "and     %[rx], r0           \n"
        "xor     r0, r3              \n"  /* r3 = ...g3g2g1g0c3c2c1c0 */
        "shlr2   r0                  \n"
        "shlr2   r0                  \n"
        "xor     r0, r7              \n"  /* r7 = ...g7g6g5g4c7c6c5c4 */
        "mov     r8, r0              \n"
        "shll2   r0                  \n"
        "shll2   r0                  \n"
        "xor     r4, r0              \n"
        "and     %[rx], r0           \n"
        "xor     r0, r4              \n"  /* r4 = ...h3h2h1h0d3d2d1d0 */
        "shlr2   r0                  \n"
        "shlr2   r0                  \n"
        "xor     r0, r8              \n"  /* r8 = ...h7h6h5h4d7d6d5d4 */

        "mov.l   .ur_mask2, %[rx]    \n"  /* bitmask = ...11001100 */
        "mov     r3, r0              \n"  /** Stage 2: 2 bit "comb" **/
        "shll2   r0                  \n"
        "xor     r1, r0              \n"
        "and     %[rx], r0           \n"
        "xor     r0, r1              \n"  /* r1 = ...g1g0e1e0c1c0a1a0 */
        "shlr2   r0                  \n"
        "xor     r0, r3              \n"  /* r3 = ...g3g2e3e2c3c2a3a2 */
        "mov     r4, r0              \n"
        "shll2   r0                  \n"
        "xor     r2, r0              \n"
        "and     %[rx], r0           \n"
        "xor     r0, r2              \n"  /* r2 = ...h1h0f1f0d1d0b1b0 */
        "shlr2   r0                  \n"
        "xor     r0, r4              \n"  /* r4 = ...h3h2f3f2d3d2b3b2 */
        "mov     r7, r0              \n"
        "shll2   r0                  \n"
        "xor     r5, r0              \n"
        "and     %[rx], r0           \n"
        "xor     r0, r5              \n"  /* r5 = ...g5g4e5e4c5c4a5a4 */
        "shlr2   r0                  \n"
        "xor     r0, r7              \n"  /* r7 = ...g7g6e7e6c7c6a7a6 */
        "mov     r8, r0              \n"
        "shll2   r0                  \n"
        "xor     r6, r0              \n"
        "and     %[rx], r0           \n"
        "xor     r0, r6              \n"  /* r6 = ...h5h4f5f4d5d4b5b4 */
        "shlr2   r0                  \n"
        "xor     r0, r8              \n"  /* r8 = ...h7h6f7f6d7d6b7b6 */

        "mov.l   .ur_mask1, %[rx]    \n"  /* bitmask = ...10101010 */
        "mov     r2, r0              \n"  /** Stage 3: 1 bit "comb" **/
        "shll    r0                  \n"
        "xor     r1, r0              \n"
        "and     %[rx], r0           \n"
        "xor     r0, r1              \n"  /* r1 = ...h0g0f0e0d0c0b0a0 */
        "shlr    r0                  \n"
        "xor     r0, r2              \n"  /* r2 = ...h1g1f1e1d1c1b1a1 */
        "mov     r4, r0              \n"
        "shll    r0                  \n"
        "xor     r3, r0              \n"
        "and     %[rx], r0           \n"
        "xor     r0, r3              \n"  /* r3 = ...h2g2f2e2d2c2b2a2 */
        "shlr    r0                  \n"
        "xor     r0, r4              \n"  /* r4 = ...h3g3f3e3d3c3b3a3 */
        "mov     r6, r0              \n"
        "shll    r0                  \n"
        "xor     r5, r0              \n"
        "and     %[rx], r0           \n"
        "xor     r0, r5              \n"  /* r5 = ...h4g4f4e4d4c4b4a4 */
        "shlr    r0                  \n"
        "xor     r0, r6              \n"  /* r6 = ...h5g5f5e5d5c5b5a5 */
        "mov     r8, r0              \n"
        "shll    r0                  \n"
        "xor     r7, r0              \n"
        "and     %[rx], r0           \n"
        "xor     r0, r7              \n"  /* r7 = ...h6g6f6e6d6c6b6a6 */
        "shlr    r0                  \n"
        "xor     r0, r8              \n"  /* r8 = ...h7g7f7e7d7c7b7a7 */

        "mov     %[dpth], %[rx]      \n"  /** shift out unused low bytes **/
        "add     #-1, %[rx]          \n"
        "mov     #7, r0              \n"
        "and     r0, %[rx]           \n"
        "mova    .ur_pshift, r0      \n"
        "add     %[rx], r0           \n"
        "add     %[rx], r0           \n"
        "jmp     @r0                 \n"  /* jump into shift streak */
        "nop                         \n"

        ".align  2                   \n"
    ".ur_pshift:                     \n"
        "shlr8   r7                  \n"
        "shlr8   r6                  \n"
        "shlr8   r5                  \n"
        "shlr8   r4                  \n"
        "shlr8   r3                  \n"
        "shlr8   r2                  \n"
        "shlr8   r1                  \n"

        "tst     %[mask], %[mask]    \n"
        "bt      .ur_sstart          \n"  /* short loop if nothing to keep */
        
        "mova    .ur_ftable, r0      \n"  /* jump into full loop */
        "mov.b   @(r0, %[rx]), %[rx] \n"
        "add     %[rx], r0           \n"
        "jmp     @r0                 \n"
        "nop                         \n"

        ".align  2                   \n"
    ".ur_ftable:                     \n"
        ".byte   .ur_f1 - .ur_ftable \n"
        ".byte   .ur_f2 - .ur_ftable \n"
        ".byte   .ur_f3 - .ur_ftable \n"
        ".byte   .ur_f4 - .ur_ftable \n"
        ".byte   .ur_f5 - .ur_ftable \n"
        ".byte   .ur_f6 - .ur_ftable \n"
        ".byte   .ur_f7 - .ur_ftable \n"
        ".byte   .ur_f8 - .ur_ftable \n"

    ".ur_floop:                      \n"  /** full loop (there are bits to keep)**/
    ".ur_f8:                         \n"
        "mov.b   @%[addr], r0        \n"  /* load old byte */
        "and     %[mask], r0         \n"  /* mask out replaced bits */
        "or      r1, r0              \n"  /* set new bits */
        "mov.b   r0, @%[addr]        \n"  /* store byte */
        "add     %[psiz], %[addr]    \n"
        "shlr8   r1                  \n"  /* shift out used-up byte */
    ".ur_f7:                         \n"
        "mov.b   @%[addr], r0        \n"
        "and     %[mask], r0         \n"
        "or      r2, r0              \n"
        "mov.b   r0, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r2                  \n"
    ".ur_f6:                         \n"
        "mov.b   @%[addr], r0        \n"
        "and     %[mask], r0         \n"
        "or      r3, r0              \n"
        "mov.b   r0, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r3                  \n"
    ".ur_f5:                         \n"
        "mov.b   @%[addr], r0        \n"
        "and     %[mask], r0         \n"
        "or      r4, r0              \n"
        "mov.b   r0, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r4                  \n"
    ".ur_f4:                         \n"
        "mov.b   @%[addr], r0        \n"
        "and     %[mask], r0         \n"
        "or      r5, r0              \n"
        "mov.b   r0, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r5                  \n"
    ".ur_f3:                         \n"
        "mov.b   @%[addr], r0        \n"
        "and     %[mask], r0         \n"
        "or      r6, r0              \n"
        "mov.b   r0, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r6                  \n"
    ".ur_f2:                         \n"
        "mov.b   @%[addr], r0        \n"
        "and     %[mask], r0         \n"
        "or      r7, r0              \n"
        "mov.b   r0, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r7                  \n"
    ".ur_f1:                         \n"
        "mov.b   @%[addr], r0        \n"
        "and     %[mask], r0         \n"
        "or      r8, r0              \n"
        "mov.b   r0, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r8                  \n"

        "add     #-8, %[dpth]        \n"
        "cmp/pl  %[dpth]             \n"  /* next round if anything left */
        "bt      .ur_floop           \n"

        "bra     .ur_end             \n"
        "nop                         \n"

        /* References to C library routines used in the precalc block */
        ".align  2                   \n"
    ".ashlsi3:                       \n"  /* C library routine: */
        ".long   ___ashlsi3          \n"  /* shift r4 left by r5, result in r0 */
    ".lshrsi3:                       \n"  /* C library routine: */
        ".long   ___lshrsi3          \n"  /* shift r4 right by r5, result in r0 */
        /* both routines preserve r4, destroy r5 and take ~16 cycles */

        /* Bitmasks for the bit block rotation */
    ".ur_mask4:                      \n"
        ".long   0xF0F0F0F0          \n"
    ".ur_mask2:                      \n"
        ".long   0xCCCCCCCC          \n"
    ".ur_mask1:                      \n"
        ".long   0xAAAAAAAA          \n"

    ".ur_sstart:                     \n"
        "mova    .ur_stable, r0      \n"  /* jump into short loop */
        "mov.b   @(r0, %[rx]), %[rx] \n"
        "add     %[rx], r0           \n"
        "jmp     @r0                 \n"
        "nop                         \n"

        ".align  2                   \n"
    ".ur_stable:                     \n"
        ".byte   .ur_s1 - .ur_stable \n"
        ".byte   .ur_s2 - .ur_stable \n"
        ".byte   .ur_s3 - .ur_stable \n"
        ".byte   .ur_s4 - .ur_stable \n"
        ".byte   .ur_s5 - .ur_stable \n"
        ".byte   .ur_s6 - .ur_stable \n"
        ".byte   .ur_s7 - .ur_stable \n"
        ".byte   .ur_s8 - .ur_stable \n"

    ".ur_sloop:                      \n"  /** short loop (nothing to keep) **/
    ".ur_s8:                         \n"
        "mov.b   r1, @%[addr]        \n"  /* store byte */
        "add     %[psiz], %[addr]    \n"
        "shlr8   r1                  \n"  /* shift out used-up byte */
    ".ur_s7:                         \n"
        "mov.b   r2, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r2                  \n"
    ".ur_s6:                         \n"
        "mov.b   r3, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r3                  \n"
    ".ur_s5:                         \n"
        "mov.b   r4, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r4                  \n"
    ".ur_s4:                         \n"
        "mov.b   r5, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r5                  \n"
    ".ur_s3:                         \n"
        "mov.b   r6, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r6                  \n"
    ".ur_s2:                         \n"
        "mov.b   r7, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r7                  \n"
    ".ur_s1:                         \n"
        "mov.b   r8, @%[addr]        \n"
        "add     %[psiz], %[addr]    \n"
        "shlr8   r8                  \n"

        "add     #-8, %[dpth]        \n"
        "cmp/pl  %[dpth]             \n"  /* next round if anything left */
        "bt      .ur_sloop           \n"

    ".ur_end:                        \n"
        : /* outputs */
        [addr]"+r"(addr),
        [dpth]"+r"(depth),
        [rx]  "=&r"(trash)
        : /* inputs */
        [mask]"r"(mask),
        [psiz]"r"(_gray_info.plane_size),
        [patp]"[rx]"(pat_ptr)
        : /* clobbers */
        "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "macl"
                );
            }
#elif defined(CPU_COLDFIRE)
            asm volatile (
                "move.l  (%[cbuf]), %%d0     \n"
                "move.l  (%[bbuf]), %%d1     \n"
                "eor.l   %%d0, %%d1          \n"
                "move.l  (4,%[cbuf]), %%d0   \n"
                "move.l  (4,%[bbuf]), %[chg] \n"
                "eor.l   %%d0, %[chg]        \n"
                "or.l    %%d1, %[chg]        \n"
                : /* outputs */
                [chg] "=&d"(change)
                : /* inputs */
                [cbuf]"a"(cbuf),
                [bbuf]"a"(bbuf)
                : /* clobbers */
                "d0", "d1"
            );

            if (change != 0)
            {
                unsigned char *addr;
                unsigned mask, depth, trash;

                pat_ptr = &pat_stack[8];

                /* precalculate the bit patterns with random shifts
                 * for all 8 pixels and put them on an extra "stack" */
                asm volatile 
                (
        "moveq.l #8, %%d3            \n"  /* loop count */
        "clr.l   %[mask]             \n"

    ".ur_pre_loop:                   \n"
        "clr.l   %%d0                \n"
        "move.b  (%[cbuf])+, %%d0    \n"  /* read current buffer */
        "clr.l   %%d1                \n"
        "move.b  (%[bbuf]), %%d1     \n"  /* read back buffer */
        "move.b  %%d0, (%[bbuf])+    \n"  /* update back buffer */
        "clr.l   %%d2                \n"  /* preset for skipped pixel */
        "cmp.l   %%d0, %%d1          \n"  /* no change? */
        "beq.b   .ur_skip            \n"  /* -> skip */

        "move.l  (%%d0:l:4, %[bpat]), %%d2   \n"  /* d2 = bitpattern[byte]; */

        "mulu.w  #75, %[rnd]         \n"  /* multiply by 75 */
        "add.l   #74, %[rnd]         \n"  /* add another 74 */
        /* Since the lower bits are not very random: */
        "move.l  %[rnd], %%d1        \n"
        "lsr.l   #8, %%d1            \n"  /* get bits 8..15 (need max. 5) */
        "and.l   %[rmsk], %%d1       \n"  /* mask out unneeded bits */

        "cmp.l   %[dpth], %%d1       \n"  /* random >= depth ? */
        "blo.b   .ur_ntrim           \n"
        "sub.l   %[dpth], %%d1       \n"  /* yes: random -= depth; */
    ".ur_ntrim:                      \n"

        "move.l  %%d2, %%d0          \n"  /** rotate pattern **/
        "lsl.l   %%d1, %%d0          \n"
        "sub.l   %[dpth], %%d1       \n"
        "neg.l   %%d1                \n"  /* d1 = depth - d1 */
        "lsr.l   %%d1, %%d2          \n"
        "or.l    %%d0, %%d2          \n"  /* rotated_pattern = d2 | d0 */

        "or.l    #0x0100, %[mask]    \n"  /* set mask bit */

    ".ur_skip:                       \n"
        "lsr.l   #1, %[mask]         \n"  /* shift mask */
        "move.l  %%d2, -(%[patp])    \n"  /* push on pattern stack */

        "subq.l  #1, %%d3            \n"  /* loop 8 times (pixel block) */
        "bne.b   .ur_pre_loop        \n"
        : /* outputs */
        [cbuf]"+a"(cbuf),
        [bbuf]"+a"(bbuf),
        [patp]"+a"(pat_ptr),
        [rnd] "+d"(_gray_random_buffer),
        [mask]"=&d"(mask)
        : /* inputs */
        [bpat]"a"(_gray_info.bitpattern),
        [dpth]"d"(_gray_info.depth),
        [rmsk]"d"(_gray_info.randmask)
        : /* clobbers */
        "d0", "d1", "d2", "d3"
                );

                addr = dst_row;
                mask = ~mask & 0xff;
                depth = _gray_info.depth;

                /* set the bits for all 8 pixels in all bytes according to the
                 * precalculated patterns on the pattern stack */
                asm volatile
                (
        "movem.l (%[patp]), %%d1-%%d7/%%a0   \n"  /* pop all 8 patterns */
        /* move.l  %%d5, %[ax]        */  /* need %%d5 as workspace, but not yet */

        /** Rotate the four 8x8 bit "blocks" within r1..r8 **/

        "move.l  %%d1, %%d0          \n"  /** Stage 1: 4 bit "comb" **/
        "lsl.l   #4, %%d0            \n"
        /* move.l  %[ax], %%d5        */  /* already in d5 */
        "eor.l   %%d5, %%d0          \n"
        "and.l   #0xF0F0F0F0, %%d0   \n"  /* bitmask = ...11110000 */
        "eor.l   %%d0, %%d5          \n"
        "move.l  %%d5, %[ax]         \n"  /* ax = ...h3h2h1h0d3d2d1d0 */
        "lsr.l   #4, %%d0            \n"
        "eor.l   %%d0, %%d1          \n"  /* d1 = ...h7h6h5h4d7d6d5d4 */
        "move.l  %%d2, %%d0          \n"
        "lsl.l   #4, %%d0            \n"
        "eor.l   %%d6, %%d0          \n"
        "and.l   #0xF0F0F0F0, %%d0   \n"
        "eor.l   %%d0, %%d6          \n"  /* d6 = ...g3g2g1g0c3c2c1c0 */
        "lsr.l   #4, %%d0            \n"
        "eor.l   %%d0, %%d2          \n"  /* d2 = ...g7g6g5g4c7c6c5c4 */
        "move.l  %%d3, %%d0          \n"
        "lsl.l   #4, %%d0            \n"
        "eor.l   %%d7, %%d0          \n"
        "and.l   #0xF0F0F0F0, %%d0   \n"
        "eor.l   %%d0, %%d7          \n"  /* d7 = ...f3f2f1f0b3b2b1b0 */
        "lsr.l   #4, %%d0            \n"
        "eor.l   %%d0, %%d3          \n"  /* d3 = ...f7f6f5f4f7f6f5f4 */
        "move.l  %%d4, %%d0          \n"
        "lsl.l   #4, %%d0            \n"
        "move.l  %%a0, %%d5          \n"
        "eor.l   %%d5, %%d0          \n"
        "and.l   #0xF0F0F0F0, %%d0   \n"
        "eor.l   %%d0, %%d5          \n"  /* (a0 = ...e3e2e1e0a3a2a1a0) */
        /* move.l  %%d5, %%a0         */  /* but d5 is kept until next usage */
        "lsr.l   #4, %%d0            \n"
        "eor.l   %%d0, %%d4          \n"  /* d4 = ...e7e6e5e4a7a6a5a4 */
        
        "move.l  %%d6, %%d0          \n"  /** Stage 2: 2 bit "comb" **/
        "lsl.l   #2, %%d0            \n"
        /* move.l  %%a0, %%d5         */  /* still in d5 */
        "eor.l   %%d5, %%d0          \n"
        "and.l   #0xCCCCCCCC, %%d0   \n"  /* bitmask = ...11001100 */
        "eor.l   %%d0, %%d5          \n"
        "move.l  %%d5, %%a0          \n"  /* a0 = ...g1g0e1e0c1c0a1a0 */
        "lsr.l   #2, %%d0            \n"
        "eor.l   %%d0, %%d6          \n"  /* d6 = ...g3g2e3e2c3c2a3a2 */
        "move.l  %[ax], %%d5         \n"
        "move.l  %%d5, %%d0          \n"
        "lsl.l   #2, %%d0            \n"
        "eor.l   %%d7, %%d0          \n"
        "and.l   #0xCCCCCCCC, %%d0   \n"
        "eor.l   %%d0, %%d7          \n"  /* r2 = ...h1h0f1f0d1d0b1b0 */
        "lsr.l   #2, %%d0            \n"
        "eor.l   %%d0, %%d5          \n"  /* (ax = ...h3h2f3f2d3d2b3b2) */
        /* move.l  %%d5, %[ax]        */  /* but d5 is kept until next usage */
        "move.l  %%d2, %%d0          \n"
        "lsl.l   #2, %%d0            \n"
        "eor.l   %%d4, %%d0          \n"
        "and.l   #0xCCCCCCCC, %%d0   \n"
        "eor.l   %%d0, %%d4          \n"  /* d4 = ...g5g4e5e4c5c4a5a4 */
        "lsr.l   #2, %%d0            \n"
        "eor.l   %%d0, %%d2          \n"  /* d2 = ...g7g6e7e6c7c6a7a6 */
        "move.l  %%d1, %%d0          \n"
        "lsl.l   #2, %%d0            \n"
        "eor.l   %%d3, %%d0          \n"
        "and.l   #0xCCCCCCCC, %%d0   \n"
        "eor.l   %%d0, %%d3          \n"  /* d3 = ...h5h4f5f4d5d4b5b4 */
        "lsr.l   #2, %%d0            \n"
        "eor.l   %%d0, %%d1          \n"  /* d1 = ...h7h6f7f6d7d6b7b6 */
        
        "move.l  %%d1, %%d0          \n"  /** Stage 3: 1 bit "comb" **/
        "lsl.l   #1, %%d0            \n"
        "eor.l   %%d2, %%d0          \n"
        "and.l   #0xAAAAAAAA, %%d0   \n"  /* bitmask = ...10101010 */
        "eor.l   %%d0, %%d2          \n"  /* d2 = ...h6g6f6e6d6c6b6a6 */
        "lsr.l   #1, %%d0            \n"
        "eor.l   %%d0, %%d1          \n"  /* d1 = ...h7g7f7e7d7c7b7a7 */
        "move.l  %%d3, %%d0          \n"
        "lsl.l   #1, %%d0            \n"
        "eor.l   %%d4, %%d0          \n"
        "and.l   #0xAAAAAAAA, %%d0   \n"
        "eor.l   %%d0, %%d4          \n"  /* d4 = ...h4g4f4e4d4c4b4a4 */
        "lsr.l   #1, %%d0            \n"
        "eor.l   %%d0, %%d3          \n"  /* d3 = ...h5g5f5e5d5c5b5a5 */
        /* move.l  %[ax], %%d5        */  /* still in d5 */
        "move.l  %%d5, %%d0          \n"
        "lsl.l   #1, %%d0            \n"
        "eor.l   %%d6, %%d0          \n"
        "and.l   #0xAAAAAAAA, %%d0   \n"
        "eor.l   %%d0, %%d6          \n"  /* d6 = ...h2g2f2e2d2c2b2a2 */
        "lsr.l   #1, %%d0            \n"
        "eor.l   %%d0, %%d5          \n"
        "move.l  %%d5, %[ax]         \n"  /* ax = ...h3g3f3e3d3c3b3a3 */
        "move.l  %%d7, %%d0          \n"
        "lsl.l   #1, %%d0            \n"
        "move.l  %%a0, %%d5          \n"
        "eor.l   %%d5, %%d0          \n"
        "and.l   #0xAAAAAAAA, %%d0   \n"
        "eor.l   %%d0, %%d5          \n"  /* (a0 = ...h0g0f0e0d0c0b0a0) */
        /* move.l  %%d5, %%a0         */  /*  but keep in d5 for shift streak */
        "lsr.l   #1, %%d0            \n"
        "eor.l   %%d0, %%d7          \n"  /* d7 = ...h1g1f1e1d1c1b1a1 */
        
        "move.l  %[dpth], %%d0       \n"  /** shift out unused low bytes **/
        "subq.l  #1, %%d0            \n"
        "and.l   #7, %%d0            \n"
        "move.l  %%d0, %%a0          \n"
        "move.l  %[ax], %%d0         \n"  /* all data in D registers */
        "jmp     (2, %%pc, %%a0:l:2) \n"  /* jump into shift streak */
        "lsr.l   #8, %%d2            \n"
        "lsr.l   #8, %%d3            \n"
        "lsr.l   #8, %%d4            \n"
        "lsr.l   #8, %%d0            \n"
        "lsr.l   #8, %%d6            \n"
        "lsr.l   #8, %%d7            \n"
        "lsr.l   #8, %%d5            \n"
        "move.l  %%d0, %[ax]         \n"  /* put the 2 extra words back.. */
        "move.l  %%a0, %%d0          \n"  /* keep the value for later */
        "move.l  %%d5, %%a0          \n"  /* ..into their A registers */

        "tst.l   %[mask]             \n"
        "jeq     .ur_sstart          \n"  /* short loop if nothing to keep */

        "move.l  %[mask], %%d5       \n"  /* need mask in data reg. */
        "move.l  %%d1, %[mask]       \n"  /* free d1 as working reg. */

        "jmp     (2, %%pc, %%d0:l:2) \n"  /* jump into full loop */
        "bra.s   .ur_f1              \n"
        "bra.s   .ur_f2              \n"
        "bra.s   .ur_f3              \n"
        "bra.s   .ur_f4              \n"
        "bra.s   .ur_f5              \n"
        "bra.s   .ur_f6              \n"
        "bra.s   .ur_f7              \n"
        /* bra.s   .ur_f8             */  /* identical with target */

    ".ur_floop:                      \n"  /** full loop (there are bits to keep)**/
    ".ur_f8:                         \n"
        "move.b  (%[addr]), %%d0     \n"  /* load old byte */
        "and.l   %%d5, %%d0          \n"  /* mask out replaced bits */
        "move.l  %%a0, %%d1          \n"
        "or.l    %%d1, %%d0          \n"  /* set new bits */
        "move.b  %%d0, (%[addr])     \n"  /* store byte */
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d1            \n"  /* shift out used-up byte */
        "move.l  %%d1, %%a0          \n"
    ".ur_f7:                         \n"
        "move.b  (%[addr]), %%d0     \n"
        "and.l   %%d5, %%d0          \n"
        "or.l    %%d7, %%d0          \n"
        "move.b  %%d0, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d7            \n"
    ".ur_f6:                         \n"
        "move.b  (%[addr]), %%d0     \n"
        "and.l   %%d5, %%d0          \n"
        "or.l    %%d6, %%d0          \n"
        "move.b  %%d0, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d6            \n"
    ".ur_f5:                         \n"
        "move.b  (%[addr]), %%d0     \n"
        "and.l   %%d5, %%d0          \n"
        "move.l  %[ax], %%d1         \n"
        "or.l    %%d1, %%d0          \n"
        "move.b  %%d0, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d1            \n"
        "move.l  %%d1, %[ax]         \n"
    ".ur_f4:                         \n"
        "move.b  (%[addr]), %%d0     \n"
        "and.l   %%d5, %%d0          \n"
        "or.l    %%d4, %%d0          \n"
        "move.b  %%d0, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d4            \n"
    ".ur_f3:                         \n"
        "move.b  (%[addr]), %%d0     \n"
        "and.l   %%d5, %%d0          \n"
        "or.l    %%d3, %%d0          \n"
        "move.b  %%d0, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d3            \n"
    ".ur_f2:                         \n"
        "move.b  (%[addr]), %%d0     \n"
        "and.l   %%d5, %%d0          \n"
        "or.l    %%d2, %%d0          \n"
        "move.b  %%d0, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d2            \n"
    ".ur_f1:                         \n"
        "move.b  (%[addr]), %%d0     \n"
        "and.l   %%d5, %%d0          \n"
        "move.l  %[mask], %%d1       \n"
        "or.l    %%d1, %%d0          \n"
        "move.b  %%d0, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d1            \n"
        "move.l  %%d1, %[mask]       \n"

        "subq.l  #8, %[dpth]         \n"
        "tst.l   %[dpth]             \n"  /* subq doesn't set flags for A reg */
        "jgt     .ur_floop           \n"  /* next round if anything left */

        "jra     .ur_end             \n"

    ".ur_sstart:                     \n"
        "jmp     (2, %%pc, %%d0:l:2) \n"  /* jump into short loop */
        "bra.s   .ur_s1              \n"
        "bra.s   .ur_s2              \n"
        "bra.s   .ur_s3              \n"
        "bra.s   .ur_s4              \n"
        "bra.s   .ur_s5              \n"
        "bra.s   .ur_s6              \n"
        "bra.s   .ur_s7              \n"
        /* bra.s   .ur_s8             */  /* identical with target */

    ".ur_sloop:                      \n"  /** short loop (nothing to keep) **/
    ".ur_s8:                         \n"
        "move.l  %%a0, %%d5          \n"
        "move.b  %%d5, (%[addr])     \n"  /* store byte */
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d5            \n"  /* shift out used-up byte */
        "move.l  %%d5, %%a0          \n"
    ".ur_s7:                         \n"
        "move.b  %%d7, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d7            \n"
    ".ur_s6:                         \n"
        "move.b  %%d6, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d6            \n"
    ".ur_s5:                         \n"
        "move.l  %[ax], %%d5         \n"
        "move.b  %%d5, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d5            \n"
        "move.l  %%d5, %[ax]         \n"
    ".ur_s4:                         \n"
        "move.b  %%d4, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d4            \n"
    ".ur_s3:                         \n"
        "move.b  %%d3, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d3            \n"
    ".ur_s2:                         \n"
        "move.b  %%d2, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d2            \n"
    ".ur_s1:                         \n"
        "move.b  %%d1, (%[addr])     \n"
        "add.l   %[psiz], %[addr]    \n"
        "lsr.l   #8, %%d1            \n"

        "subq.l  #8, %[dpth]         \n"
        "tst.l   %[dpth]             \n"  /* subq doesn't set flags for A reg */
        "jgt     .ur_sloop           \n"  /* next round if anything left */

    ".ur_end:                        \n"
        : /* outputs */
        [addr]"+a"(addr),
        [dpth]"+a"(depth),
        [mask]"+a"(mask),
        [ax]  "=&a"(trash)
        : /* inputs */
        [psiz]"a"(_gray_info.plane_size),
        [patp]"[ax]"(pat_ptr)
        : /* clobbers */
        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0"
                );
            }
#else /* C version, for reference*/
#warning C version of gray_update_rect() used
            (void)pat_ptr;
            /* check whether anything changed in the 8-pixel block */
            change  = *(uint32_t *)cbuf ^ *(uint32_t *)bbuf;
            change |= *(uint32_t *)(cbuf + 4) ^ *(uint32_t *)(bbuf + 4);
            
            if (change != 0)
            {
                unsigned char *addr, *end;
                unsigned mask = 0;
                unsigned test = 1 << ((-_gray_info.depth) & 7);
                int i;

                /* precalculate the bit patterns with random shifts
                 * for all 8 pixels and put them on an extra "stack" */
                for (i = 0; i < 8; i++)
                {
                    unsigned pat = 0;
                    unsigned char cur = *cbuf++;
                    unsigned char back = *bbuf;
                    
                    *bbuf++ = cur;

                    if (cur != back)
                    {
                        int shift;

                        pat = _gray_info.bitpattern[cur];

                        /* shift pattern pseudo-random, simple & fast PRNG */
                        _gray_random_buffer = 75 * _gray_random_buffer + 74;
                        shift = (_gray_random_buffer >> 8) & _gray_info.randmask;
                        if (shift >= _gray_info.depth)
                            shift -= _gray_info.depth;
                            
                        pat = (pat << shift) | (pat >> (_gray_info.depth - shift));
                        
                        mask |= 0x100;
                    }
                    mask >>= 1;
                    pat_stack[i] = pat;
                }

                addr = dst_row;
                end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);

                /* set the bits for all 8 pixels in all bytes according to the
                 * precalculated patterns on the pattern stack */
                mask = (~mask & 0xff);
                if (mask == 0)
                {
                    do
                    {
                        unsigned data = 0;

                        for (i = 7; i >= 0; i--)
                            data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0);
                        
                        *addr = data;
                        addr += _gray_info.plane_size;
                        test <<= 1;
                    }
                    while (addr < end);
                }
                else
                {
                    do
                    {
                        unsigned data = 0;

                        for (i = 7; i >= 0; i--)
                            data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0);
                        
                        *addr = (*addr & mask) | data;
                        addr += _gray_info.plane_size;
                        test <<= 1;
                    }
                    while (addr < end);
                }

            }
#endif /* CONFIG_CPU */
            srcofs_row += _gray_info.height;
            dst_row++;
        }
        while (dst_row < dst_end);
        
        srcofs += 8;
        dst += _gray_info.width;
    }
}
#endif /* LCD_PIXELFORMAT */

#endif /* !SIMULATOR */

/* Update the whole greyscale overlay.
   Thin wrapper: calls gray_update_rect() with the origin (0, 0) and the
   width and height currently stored in _gray_info. */
void gray_update(void)
{
    gray_update_rect(0, 0, _gray_info.width, _gray_info.height);
}

/* Make changes drawn with the rb->lcd_xxx() functions visible (i.e. in the
   screen areas that the greyscale overlay does not cover).

   If the _GRAY_RUNNING flag is not set, the LCD is updated immediately.
   Otherwise the update is not done here directly: on target only the
   _GRAY_DEFERRED_UPDATE flag is raised in _gray_info.flags, while the
   simulator build calls _deferred_update(). */
void gray_deferred_lcd_update(void)
{
    if (!(_gray_info.flags & _GRAY_RUNNING))
    {
        /* overlay not running: plain update, nothing to defer */
        _gray_rb->lcd_update();
        return;
    }

#ifdef SIMULATOR
    _deferred_update();
#else
    /* only leave a request behind; it is acted upon outside this function */
    _gray_info.flags |= _GRAY_DEFERRED_UPDATE;
#endif
}

/*** Screenshot ***/

/* Palette of the screenshot BMP: one entry for each native LCD shade,
 * followed by a fixed number of entries for the greyscale overlay. */
#define BMP_FIXEDCOLORS (1 << LCD_DEPTH)
#define BMP_VARCOLORS   33
#define BMP_NUMCOLORS   (BMP_FIXEDCOLORS + BMP_VARCOLORS)

/* Pixel data: 8 bits per pixel, each line padded to a multiple of 4 bytes */
#define BMP_BPP         8
#define BMP_LINESIZE    ((LCD_WIDTH + 3) & ~3)
#define BMP_DATASIZE    (BMP_LINESIZE * LCD_HEIGHT)

/* 54 bytes of headers, then 4 bytes per palette entry, then the pixels */
#define BMP_HEADERSIZE  (54 + 4 * BMP_NUMCOLORS)
#define BMP_TOTALSIZE   (BMP_HEADERSIZE + BMP_DATASIZE)

/* Expand a constant into its bytes, least significant byte first, as a
 * comma separated list suitable for an array initialiser. */
#define LE_BYTE_CONST(x, n) (((x) >> (8 * (n))) & 0xff)
#define LE16_CONST(x) LE_BYTE_CONST(x, 0), LE_BYTE_CONST(x, 1)
#define LE32_CONST(x) LE_BYTE_CONST(x, 0), LE_BYTE_CONST(x, 1), \
                      LE_BYTE_CONST(x, 2), LE_BYTE_CONST(x, 3)

static const unsigned char bmpheader[] =
{
    0x42, 0x4d,                 /* 'BM' */
    LE32_CONST(BMP_TOTALSIZE),  /* Total file size */
    0x00, 0x00, 0x00, 0x00,     /* Reserved */
    LE32_CONST(BMP_HEADERSIZE), /* Offset to start of pixel data */

    0x28, 0x00, 0x00, 0x00,     /* Size of (2nd) header */
    LE32_CONST(LCD_WIDTH),      /* Width in pixels */
    LE32_CONST(LCD_HEIGHT),     /* Height in pixels */
    0x01, 0x00,                 /* Number of planes (always 1) */
    LE16_CONST(BMP_BPP),        /* Bits per pixel 1/4/8/16/24 */
    0x00, 0x00, 0x00, 0x00,     /* Compression mode, 0 = none */
    LE32_CONST(BMP_DATASIZE),   /* Size of bitmap data */
    0xc4, 0x0e, 0x00, 0x00,     /* Horizontal resolution (pixels/meter) */
    0xc4, 0x0e, 0x00, 0x00,     /* Vertical resolution (pixels/meter) */
    LE32_CONST(BMP_NUMCOLORS),  /* Number of used colours */
    LE32_CONST(BMP_NUMCOLORS),  /* Number of important colours */

    /* Fixed colours */
#if LCD_DEPTH == 1
    0x90, 0xee, 0x90, 0x00,     /* Colour #0 */
    0x00, 0x00, 0x00, 0x00      /* Colour #1 */
#elif LCD_DEPTH == 2
    0xe6, 0xd8, 0xad, 0x00,     /* Colour #0 */
    0x99, 0x90, 0x73, 0x00,     /* Colour #1 */
    0x4c, 0x48, 0x39, 0x00,     /* Colour #2 */
    0x00, 0x00, 0x00, 0x00      /* Colour #3 */
#endif
};

/* Base colour from which gray_screendump_hook() scales the palette entries
   of the greyscale overlay. For each LCD_DEPTH the components equal those
   of fixed colour #0 in bmpheader[] (stored there in blue, green, red
   order). Nothing is defined for any other LCD_DEPTH. */
#if LCD_DEPTH == 1
#define BMP_RED   0x90
#define BMP_GREEN 0xee
#define BMP_BLUE  0x90
#elif LCD_DEPTH == 2
#define BMP_RED   0xad
#define BMP_GREEN 0xd8
#define BMP_BLUE  0xe6
#endif

/* Hook function for core screen_dump() to save the current display
   content (b&w and greyscale overlay) to an 8-bit BMP file.

   fd is handed unchanged to _gray_rb->write(); presumably it is a file
   already opened for writing by the caller (not visible here).

   Written in this order: bmpheader[] (headers + fixed palette entries),
   BMP_VARCOLORS palette entries for the overlay, then LCD_HEIGHT pixel
   lines of BMP_LINESIZE bytes each, bottom line first.

   Palette indices used for the pixels:
     0 .. BMP_FIXEDCOLORS-1    native LCD pixel value, outside the overlay
     BMP_FIXEDCOLORS + n       overlay pixel; on target n is the number of
                               bitplanes in which the pixel's bit is set,
                               in the simulator n is depth minus the value
                               in _gray_info.cur_buffer.

   The results of the write() calls are not checked. */
static void gray_screendump_hook(int fd)
{
    int i;
    int x, y;       /* LCD coordinates */
    int gx, gy;     /* same pixel relative to (_gray_info.x, _gray_info.y) */
#if (LCD_DEPTH == 1) || !defined(SIMULATOR)
    int mask;       /* bit selecting the pixel within a 1 bpp byte */
#endif
#if LCD_PIXELFORMAT == HORIZONTAL_PACKING
    unsigned data;
#else
    int by;         /* framebuffer byte row containing line y */
#if LCD_DEPTH == 2
    int shift;      /* bit position of line y within a framebuffer byte */
#endif
#endif
    unsigned char *clut_entry;
    unsigned char *lcdptr;
    /* one buffer serves both for the overlay palette and for a pixel line,
       hence it is sized for the larger of the two */
    unsigned char linebuf[MAX(4*BMP_VARCOLORS,BMP_LINESIZE)];

    _gray_rb->write(fd, bmpheader, sizeof(bmpheader));  /* write header */

    /* build clut */
    /* Entry k (k = 0 .. depth-1) receives the base colour scaled by
       (depth - k) / depth. The 4th byte of each entry is skipped and all
       entries from index depth onwards are not written, so they keep the
       zero from the memset. */
    _gray_rb->memset(linebuf, 0, 4*BMP_VARCOLORS);
    clut_entry = linebuf;

    for (i = _gray_info.depth; i > 0; i--)
    {
        *clut_entry++ = MULU16(BMP_BLUE,  i) / _gray_info.depth;
        *clut_entry++ = MULU16(BMP_GREEN, i) / _gray_info.depth;
        *clut_entry++ = MULU16(BMP_RED,   i) / _gray_info.depth;
        clut_entry++;
    }
    _gray_rb->write(fd, linebuf, 4*BMP_VARCOLORS);

    /* BMP image goes bottom -> top */
    for (y = LCD_HEIGHT - 1; y >= 0; y--)
    {
        /* clears the padding bytes as well */
        _gray_rb->memset(linebuf, 0, BMP_LINESIZE);

        gy = y - _gray_info.y;
#if LCD_PIXELFORMAT == HORIZONTAL_PACKING
        /* Only LCD_DEPTH == 2 is handled for this pixel format; for any
           other depth the line is written as all zeros. */
#if LCD_DEPTH == 2
        lcdptr = _gray_rb->lcd_framebuffer + MULU16(LCD_FBWIDTH, y);

        /* the unsigned compare rejects negative gy and gy >= height with
           a single test */
        if ((unsigned) gy < (unsigned) _gray_info.height)
        {
            /* line contains greyscale (and maybe b&w) graphics */
#ifndef SIMULATOR
            /* start of line gy in the first bitplane, _gray_info.bwidth
               bytes per line */
            unsigned char *grayptr = _gray_info.plane_data
                                   + MULU16(_gray_info.bwidth, gy);
#endif

            /* one framebuffer byte, i.e. 4 pixels, per iteration */
            for (x = 0; x < LCD_WIDTH; x += 4)
            {
                gx = x - _gray_info.x;

                /* NOTE(review): the overlay test is done once for the
                   whole group of 4 pixels, using the first pixel only -
                   looks like this expects the overlay's x position and
                   width to be multiples of 4; confirm. */
                if ((unsigned)gx < (unsigned)_gray_info.width)
                {
#ifdef SIMULATOR
                    /* cur_buffer is indexed line by line here */
                    data = MULU16(gy, _gray_info.width) + gx;

                    for (i = 0; i < 4; i++)
                        linebuf[x + i] = BMP_FIXEDCOLORS + _gray_info.depth
                                       - _gray_info.cur_buffer[data + i];
#else
                    /* leftmost pixel is in the most significant bit */
                    mask = 0x80 >> (gx & 7);

                    for (i = 0; i < 4; i++)
                    {
                        int j;
                        int idx = BMP_FIXEDCOLORS;
                        unsigned char *grayptr2 = grayptr + (gx >> 3);

                        /* count the bitplanes that have this pixel set;
                           consecutive planes are plane_size bytes apart */
                        for (j = _gray_info.depth; j > 0; j--)
                        {
                            if (*grayptr2 & mask)
                                idx++;
                            grayptr2 += _gray_info.plane_size;
                        }
                        linebuf[x + i] = idx;
                        mask >>= 1;
                    }
#endif
                }
                else
                {
                    /* native pixels: 2 bits each, leftmost pixel in the
                       top two bits; the value is the palette index */
                    data = *lcdptr;
                    linebuf[x]     = (data >> 6) & 3;
                    linebuf[x + 1] = (data >> 4) & 3;
                    linebuf[x + 2] = (data >> 2) & 3;
                    linebuf[x + 3] = data & 3;
                }
                lcdptr++;
            }
        }
        else
        {
            /* line contains only b&w graphics */
            for (x = 0; x < LCD_WIDTH; x += 4)
            {
                data = *lcdptr++;
                linebuf[x] = (data >> 6) & 3;
                linebuf[x + 1] = (data >> 4) & 3;
                linebuf[x + 2] = (data >> 2) & 3;
                linebuf[x + 3] = data & 3;
            }
        }
#endif /* LCD_DEPTH */
#else /* LCD_PIXELFORMAT == VERTICAL_PACKING */
#if LCD_DEPTH == 1
        /* 8 lines per framebuffer byte, line y is bit (y & 7) */
        mask = 1 << (y & 7);
        by = y >> 3;
        lcdptr = _gray_rb->lcd_framebuffer + MULU16(LCD_WIDTH, by);

        if ((unsigned) gy < (unsigned) _gray_info.height)
        {
            /* line contains greyscale (and maybe b&w) graphics */
#ifndef SIMULATOR
            /* NOTE(review): the bitplanes are read below with the mask
               derived from y, not from gy - this is only the right bit if
               _gray_info.y is a multiple of 8; confirm. */
            unsigned char *grayptr = _gray_info.plane_data 
                                   + MULU16(_gray_info.width, gy >> 3);
#endif

            for (x = 0; x < LCD_WIDTH; x++)
            {
                gx = x - _gray_info.x;
                
                if ((unsigned)gx < (unsigned)_gray_info.width)
                {
#ifdef SIMULATOR
                    /* cur_buffer is indexed column by column here */
                    linebuf[x] = BMP_FIXEDCOLORS + _gray_info.depth
                               - _gray_info.cur_buffer[MULU16(gx, _gray_info.height) + gy];
#else
                    int idx = BMP_FIXEDCOLORS;
                    unsigned char *grayptr2 = grayptr + gx;

                    /* count the bitplanes that have this pixel set;
                       consecutive planes are plane_size bytes apart */
                    for (i = _gray_info.depth; i > 0; i--)
                    {
                        if (*grayptr2 & mask)
                            idx++;
                        grayptr2 += _gray_info.plane_size;
                    }
                    linebuf[x] = idx;
#endif
                }
                else
                {
                    linebuf[x] = (*lcdptr & mask) ? 1 : 0;
                }
                lcdptr++;
            }
        }
        else
        {
            /* line contains only b&w graphics */
            for (x = 0; x < LCD_WIDTH; x++)
                linebuf[x] = (*lcdptr++ & mask) ? 1 : 0;
        }
#elif LCD_DEPTH == 2
        /* 4 lines per framebuffer byte, 2 bits per pixel */
        shift = 2 * (y & 3);
        by = y >> 2;
        lcdptr = _gray_rb->lcd_framebuffer + MULU16(LCD_WIDTH, by);

        if ((unsigned)gy < (unsigned)_gray_info.height)
        {
            /* line contains greyscale (and maybe b&w) graphics */
#ifndef SIMULATOR
            /* the bitplanes hold 8 lines per byte, addressed by gy */
            unsigned char *grayptr = _gray_info.plane_data
                                   + MULU16(_gray_info.width, gy >> 3);
            mask = 1 << (gy & 7);
#endif

            for (x = 0; x < LCD_WIDTH; x++)
            {
                gx = x - _gray_info.x;
                
                if ((unsigned)gx < (unsigned)_gray_info.width)
                {
#ifdef SIMULATOR
                    /* cur_buffer is indexed column by column here */
                    linebuf[x] = BMP_FIXEDCOLORS + _gray_info.depth
                               - _gray_info.cur_buffer[MULU16(gx, _gray_info.height) + gy];
#else
                    int idx = BMP_FIXEDCOLORS;
                    unsigned char *grayptr2 = grayptr + gx;

                    /* count the bitplanes that have this pixel set;
                       consecutive planes are plane_size bytes apart */
                    for (i = _gray_info.depth; i > 0; i--)
                    {
                        if (*grayptr2 & mask)
                            idx++;
                        grayptr2 += _gray_info.plane_size;
                    }
                    linebuf[x] = idx;
#endif
                }
                else
                {
                    linebuf[x] = (*lcdptr >> shift) & 3;
                }
                lcdptr++;
            }
        }
        else
        {
            /* line contains only b&w graphics */
            for (x = 0; x < LCD_WIDTH; x++)
                linebuf[x] = (*lcdptr++ >> shift) & 3;
        }
#endif /* LCD_DEPTH */
#endif /* LCD_PIXELFORMAT */

        _gray_rb->write(fd, linebuf, BMP_LINESIZE);
    }
}

#endif /* HAVE_LCD_BITMAP */