author     Jens Arnold <amiconn@rockbox.org>    2006-08-11 23:40:05 +0000
committer  Jens Arnold <amiconn@rockbox.org>    2006-08-11 23:40:05 +0000
commit     e6ed58f6c545719ed804c9ad513496f8d0fa0286 (patch)
tree       db096eb64de3ec7c33682c5bb50579784c5ca633
parent     22860a910a7ddbb9b811fce4cf971e097c8e6eac (diff)
Better implementation of the new greyscale algorithm. A bit faster (+2% on iPod and Archos, +7% on Iriver H1x0), and more straightforward. * Fixed garbage display when a plugin switches on the overlay before drawing anything.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10534 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--  apps/plugins/lib/gray_core.c | 501
-rw-r--r--  apps/plugins/lib/gray_draw.c | 494
2 files changed, 471 insertions, 524 deletions
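One half of the speedup is visible in the `gray_init` hunk below: the precalculated bit patterns are now stored pre-shifted, discarding the `bitfill = (-depth) & 7` unused low bits once at init time (`bitpattern[i] = pattern << bitfill`) instead of skipping them on every update. This matches the changed start bit `test = 1 << ((-_gray_info.depth) & 7)` in the C fallback of `gray_update_rect`. A minimal C model of the resulting forward plane walk, reduced to a single pixel (`write_pixel` and its parameters are illustrative names, not the plugin API):

```c
/* Sketch only, not the Rockbox source: with the pattern pre-shifted by
 * bitfill = (-depth) & 7, plane 0's bit sits at position bitfill, so the
 * writer can walk the bitplanes forward with one moving test bit. */
static void write_pixel(unsigned char *plane, long plane_size, int depth,
                        unsigned long pattern, unsigned char pixel_bit)
{
    unsigned long test = 1ul << ((unsigned)(-depth) & 7);
    int i;

    for (i = 0; i < depth; i++)        /* one byte per bitplane, forward */
    {
        if (pattern & test)
            *plane |= pixel_bit;       /* set this pixel's bit ... */
        else
            *plane &= ~pixel_bit;      /* ... or clear it in this plane */
        plane += plane_size;           /* advance to the next plane */
        test <<= 1;                    /* advance to the next pattern bit */
    }
}
```

The assembly paths below do the same walk eight pixels at a time, writing one byte per plane.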
diff --git a/apps/plugins/lib/gray_core.c b/apps/plugins/lib/gray_core.c index 809e88dba1..413b66c65d 100644 --- a/apps/plugins/lib/gray_core.c +++ b/apps/plugins/lib/gray_core.c @@ -348,7 +348,7 @@ int gray_init(struct plugin_api* newrb, unsigned char *gbuf, long gbuf_size, long plane_size, buftaken; unsigned data; #ifndef SIMULATOR - int j; + int j, bitfill; #endif _gray_rb = newrb; @@ -439,6 +439,7 @@ int gray_init(struct plugin_api* newrb, unsigned char *gbuf, long gbuf_size, _gray_info.cur_plane = 0; _gray_info.plane_size = plane_size; _gray_info.plane_data = gbuf; + _gray_rb->memset(gbuf, 0, depth * plane_size); gbuf += depth * plane_size; _gray_info.bitpattern = (unsigned long *)gbuf; @@ -449,7 +450,8 @@ int gray_init(struct plugin_api* newrb, unsigned char *gbuf, long gbuf_size, i >>= 1; j--; } - _gray_info.randmask = 0xFFu >> j; + _gray_info.randmask = 0xFFu >> j; + bitfill = (-depth) & 7; /* Precalculate the bit patterns for all possible pixel values */ for (i = 0; i <= depth; i++) @@ -469,7 +471,7 @@ int gray_init(struct plugin_api* newrb, unsigned char *gbuf, long gbuf_size, } /* now the lower <depth> bits contain the pattern */ - _gray_info.bitpattern[i] = pattern; + _gray_info.bitpattern[i] = pattern << bitfill; } #endif @@ -797,101 +799,93 @@ void gray_update_rect(int x, int y, int width, int height) "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ + "sub r0, %[dpth], #1 \n" /** shift out unused low bytes **/ + "and r0, r0, #7 \n" + "add pc, pc, r0, lsl #2 \n" /* jump into shift streak */ + "mov r8, r8, lsr #8 \n" /* r8: never reached */ + "mov r7, r7, lsr #8 \n" + "mov r6, r6, lsr #8 \n" + "mov r5, r5, lsr #8 \n" + "mov r4, r4, lsr #8 \n" + "mov r3, r3, lsr #8 \n" + "mov r2, r2, lsr #8 \n" + "mov r1, r1, lsr #8 \n" + "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ "ands %[mask], %[mask], #0xff \n" - "beq .ur_sloop \n" /* short loop if no bits to keep */ - - ".ur_floop: \n" /** full loop (bits to keep)**/ - "cmp %[dpth], #8 \n" /* 8 planes or more left? */ - "bhs .ur_f8 \n" + "beq .ur_sstart \n" /* short loop if no bits to keep */ - "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ - "add %[addr], %[addr], r0 \n" /* for this round */ - - "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ + "ldrb r0, [pc, r0] \n" /* jump into full loop */ "add pc, pc, r0 \n" ".ur_ftable: \n" - ".byte .ur_f0 - .ur_ftable - 4 \n" /* [jump tables are tricky] */ - ".byte .ur_f1 - .ur_ftable - 4 \n" + ".byte .ur_f1 - .ur_ftable - 4 \n" /* [jump tables are tricky] */ ".byte .ur_f2 - .ur_ftable - 4 \n" ".byte .ur_f3 - .ur_ftable - 4 \n" ".byte .ur_f4 - .ur_ftable - 4 \n" ".byte .ur_f5 - .ur_ftable - 4 \n" ".byte .ur_f6 - .ur_ftable - 4 \n" ".byte .ur_f7 - .ur_ftable - 4 \n" + ".byte .ur_f8 - .ur_ftable - 4 \n" + ".ur_floop: \n" /** full loop (bits to keep)**/ ".ur_f8: \n" - "add %[addr], %[addr], %[psiz], lsl #3 \n" - /* Point behind the last plane for this round. Note: We're using the - * registers backwards in order to reuse the streak for the last round. - * Therefore we need to go thru the bitplanes backwards too, otherwise - * the bit order would be destroyed which results in more flicker. */ - "ldrb r0, [%[addr], -%[psiz]]! 
\n" /* load old byte */ + "ldrb r0, [%[addr]] \n" /* load old byte */ "and r0, r0, %[mask] \n" /* mask out replaced bits */ - "orr r0, r0, r8 \n" /* set new bits */ - "strb r0, [%[addr]] \n" /* store byte */ - "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ + "orr r0, r0, r1 \n" /* set new bits */ + "strb r0, [%[addr]], %[psiz] \n" /* store byte */ + "mov r1, r1, lsr #8 \n" /* shift out used-up byte */ ".ur_f7: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" + "ldrb r0, [%[addr]] \n" "and r0, r0, %[mask] \n" - "orr r0, r0, r7 \n" - "strb r0, [%[addr]] \n" - "mov r7, r7, lsr #8 \n" + "orr r0, r0, r2 \n" + "strb r0, [%[addr]], %[psiz] \n" + "mov r2, r2, lsr #8 \n" ".ur_f6: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" + "ldrb r0, [%[addr]] \n" "and r0, r0, %[mask] \n" - "orr r0, r0, r6 \n" - "strb r0, [%[addr]] \n" - "mov r6, r6, lsr #8 \n" + "orr r0, r0, r3 \n" + "strb r0, [%[addr]], %[psiz] \n" + "mov r3, r3, lsr #8 \n" ".ur_f5: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" - "and r0, r0, %[mask] \n" - "orr r0, r0, r5 \n" - "strb r0, [%[addr]] \n" - "mov r5, r5, lsr #8 \n" - ".ur_f4: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" + "ldrb r0, [%[addr]] \n" "and r0, r0, %[mask] \n" "orr r0, r0, r4 \n" - "strb r0, [%[addr]] \n" + "strb r0, [%[addr]], %[psiz] \n" "mov r4, r4, lsr #8 \n" + ".ur_f4: \n" + "ldrb r0, [%[addr]] \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r5 \n" + "strb r0, [%[addr]], %[psiz] \n" + "mov r5, r5, lsr #8 \n" ".ur_f3: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" + "ldrb r0, [%[addr]] \n" "and r0, r0, %[mask] \n" - "orr r0, r0, r3 \n" - "strb r0, [%[addr]] \n" - "mov r3, r3, lsr #8 \n" + "orr r0, r0, r6 \n" + "strb r0, [%[addr]], %[psiz] \n" + "mov r6, r6, lsr #8 \n" ".ur_f2: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" + "ldrb r0, [%[addr]] \n" "and r0, r0, %[mask] \n" - "orr r0, r0, r2 \n" - "strb r0, [%[addr]] \n" - "mov r2, r2, lsr #8 \n" + "orr r0, r0, r7 \n" + "strb r0, [%[addr]], %[psiz] \n" + "mov r7, r7, lsr #8 \n" ".ur_f1: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" + "ldrb r0, [%[addr]] \n" "and r0, r0, %[mask] \n" - "orr r0, r0, r1 \n" - "strb r0, [%[addr]] \n" - "mov r1, r1, lsr #8 \n" - ".ur_f0: \n" + "orr r0, r0, r8 \n" + "strb r0, [%[addr]], %[psiz] \n" + "mov r8, r8, lsr #8 \n" - "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ "bhi .ur_floop \n" "b .ur_end \n" - ".ur_sloop: \n" /** short loop (nothing to keep) **/ - "cmp %[dpth], #8 \n" /* 8 planes or more left? */ - "bhs .ur_s8 \n" - - "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ - "add %[addr], %[addr], r0 \n" /* for this round */ - - "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ + ".ur_sstart: \n" + "ldrb r0, [pc, r0] \n" /* jump into short loop*/ "add pc, pc, r0 \n" ".ur_stable: \n" - ".byte .ur_s0 - .ur_stable - 4 \n" ".byte .ur_s1 - .ur_stable - 4 \n" ".byte .ur_s2 - .ur_stable - 4 \n" ".byte .ur_s3 - .ur_stable - 4 \n" @@ -899,36 +893,34 @@ void gray_update_rect(int x, int y, int width, int height) ".byte .ur_s5 - .ur_stable - 4 \n" ".byte .ur_s6 - .ur_stable - 4 \n" ".byte .ur_s7 - .ur_stable - 4 \n" + ".byte .ur_s8 - .ur_stable - 4 \n" + ".ur_sloop: \n" /** short loop (nothing to keep) **/ ".ur_s8: \n" - "add %[addr], %[addr], %[psiz], lsl #3 \n" - /* Point behind the last plane for this round. See above. */ - "strb r8, [%[addr], -%[psiz]]! 
\n" /* store byte */ - "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ + "strb r1, [%[addr]], %[psiz] \n" /* store byte */ + "mov r1, r1, lsr #8 \n" /* shift out used-up byte */ ".ur_s7: \n" - "strb r7, [%[addr], -%[psiz]]! \n" - "mov r7, r7, lsr #8 \n" + "strb r2, [%[addr]], %[psiz] \n" + "mov r2, r2, lsr #8 \n" ".ur_s6: \n" - "strb r6, [%[addr], -%[psiz]]! \n" - "mov r6, r6, lsr #8 \n" + "strb r3, [%[addr]], %[psiz] \n" + "mov r3, r3, lsr #8 \n" ".ur_s5: \n" - "strb r5, [%[addr], -%[psiz]]! \n" - "mov r5, r5, lsr #8 \n" - ".ur_s4: \n" - "strb r4, [%[addr], -%[psiz]]! \n" + "strb r4, [%[addr]], %[psiz] \n" "mov r4, r4, lsr #8 \n" + ".ur_s4: \n" + "strb r5, [%[addr]], %[psiz] \n" + "mov r5, r5, lsr #8 \n" ".ur_s3: \n" - "strb r3, [%[addr], -%[psiz]]! \n" - "mov r3, r3, lsr #8 \n" + "strb r6, [%[addr]], %[psiz] \n" + "mov r6, r6, lsr #8 \n" ".ur_s2: \n" - "strb r2, [%[addr], -%[psiz]]! \n" - "mov r2, r2, lsr #8 \n" + "strb r7, [%[addr]], %[psiz] \n" + "mov r7, r7, lsr #8 \n" ".ur_s1: \n" - "strb r1, [%[addr], -%[psiz]]! \n" - "mov r1, r1, lsr #8 \n" - ".ur_s0: \n" + "strb r8, [%[addr]], %[psiz] \n" + "mov r8, r8, lsr #8 \n" - "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ "bhi .ur_sloop \n" @@ -956,7 +948,7 @@ void gray_update_rect(int x, int y, int width, int height) { unsigned char *addr, *end; unsigned mask = 0; - unsigned test = 1; + unsigned test = 1 << ((-_gray_info.depth) & 7); int i; /* precalculate the bit patterns with random shifts @@ -1287,25 +1279,37 @@ void gray_update_rect(int x, int y, int width, int height) "shlr r0 \n" "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ - "tst %[mask], %[mask] \n" - "bt .ur_sloop \n" /* short loop if nothing to keep */ + "mov %[dpth], %[rx] \n" /** shift out unused low bytes **/ + "add #-1, %[rx] \n" + "mov #7, r0 \n" + "and r0, %[rx] \n" + "mova .ur_pshift, r0 \n" + "add %[rx], r0 \n" + "add %[rx], r0 \n" + "jmp @r0 \n" /* jump into shift streak */ + "nop \n" - ".ur_floop: \n" /** full loop (there are bits to keep)**/ - "mov #8, r0 \n" - "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ - "bt .ur_f8 \n" + ".align 2 \n" + ".ur_pshift: \n" + "shlr8 r7 \n" + "shlr8 r6 \n" + "shlr8 r5 \n" + "shlr8 r4 \n" + "shlr8 r3 \n" + "shlr8 r2 \n" + "shlr8 r1 \n" + + "tst %[mask], %[mask] \n" + "bt .ur_sstart \n" /* short loop if nothing to keep */ - "mulu %[psiz], %[dpth] \n" - "mova .ur_ftable, r0 \n" - "mov.b @(r0, %[dpth]), %[rx] \n" + "mova .ur_ftable, r0 \n" /* jump into full loop */ + "mov.b @(r0, %[rx]), %[rx] \n" "add %[rx], r0 \n" - "sts macl, %[rx] \n" /* point behind the last plane.. */ - "jmp @r0 \n" /* jump into streak */ - "add %[rx], %[addr] \n" /* ..for this round */ - + "jmp @r0 \n" + "nop \n" + ".align 2 \n" ".ur_ftable: \n" - ".byte .ur_f0 - .ur_ftable \n" ".byte .ur_f1 - .ur_ftable \n" ".byte .ur_f2 - .ur_ftable \n" ".byte .ur_f3 - .ur_ftable \n" @@ -1313,74 +1317,66 @@ void gray_update_rect(int x, int y, int width, int height) ".byte .ur_f5 - .ur_ftable \n" ".byte .ur_f6 - .ur_ftable \n" ".byte .ur_f7 - .ur_ftable \n" + ".byte .ur_f8 - .ur_ftable \n" + ".ur_floop: \n" /** full loop (there are bits to keep)**/ ".ur_f8: \n" - "mov %[psiz], %[rx] \n" - "shll2 %[rx] \n" - "add %[rx], %[rx] \n" - "add %[rx], %[addr] \n" - /* Point behind the last plane for this round. Note: We're using the - * registers backwards in order to reuse the streak for the last round. 
- * Therefore we need to go thru the bitplanes backwards too, otherwise - * the bit order would be destroyed which results in more flicker. */ - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" /* load old byte */ "and %[mask], r0 \n" /* mask out replaced bits */ - "or r8, r0 \n" /* set new bits */ + "or r1, r0 \n" /* set new bits */ "mov.b r0, @%[addr] \n" /* store byte */ - "shlr8 r8 \n" /* shift out used-up byte */ + "add %[psiz], %[addr] \n" + "shlr8 r1 \n" /* shift out used-up byte */ ".ur_f7: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r7, r0 \n" + "or r2, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r7 \n" + "add %[psiz], %[addr] \n" + "shlr8 r2 \n" ".ur_f6: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r6, r0 \n" + "or r3, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r6 \n" + "add %[psiz], %[addr] \n" + "shlr8 r3 \n" ".ur_f5: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r5, r0 \n" + "or r4, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r5 \n" + "add %[psiz], %[addr] \n" + "shlr8 r4 \n" ".ur_f4: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r4, r0 \n" + "or r5, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r4 \n" + "add %[psiz], %[addr] \n" + "shlr8 r5 \n" ".ur_f3: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r3, r0 \n" + "or r6, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r3 \n" + "add %[psiz], %[addr] \n" + "shlr8 r6 \n" ".ur_f2: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r2, r0 \n" + "or r7, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r2 \n" + "add %[psiz], %[addr] \n" + "shlr8 r7 \n" ".ur_f1: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r1, r0 \n" + "or r8, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r1 \n" - ".ur_f0: \n" + "add %[psiz], %[addr] \n" + "shlr8 r8 \n" - "add %[rx], %[addr] \n" /* correct address */ "add #-8, %[dpth] \n" "cmp/pl %[dpth] \n" /* next round if anything left */ "bt .ur_floop \n" @@ -1404,22 +1400,15 @@ void gray_update_rect(int x, int y, int width, int height) ".ur_mask1: \n" ".long 0xAAAAAAAA \n" - ".ur_sloop: \n" /** short loop (nothing to keep) **/ - "mov #8, r0 \n" - "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ - "bt .ur_s8 \n" - - "mulu %[psiz], %[dpth] \n" - "mova .ur_stable, r0 \n" - "mov.b @(r0, %[dpth]), %[rx] \n" + ".ur_sstart: \n" + "mova .ur_stable, r0 \n" /* jump into short loop */ + "mov.b @(r0, %[rx]), %[rx] \n" "add %[rx], r0 \n" - "sts macl, %[rx] \n" /* point behind the last plane.. */ - "jmp @r0 \n" /* jump into streak */ - "add %[rx], %[addr] \n" /* ..for this round */ + "jmp @r0 \n" + "nop \n" ".align 2 \n" ".ur_stable: \n" - ".byte .ur_s0 - .ur_stable \n" ".byte .ur_s1 - .ur_stable \n" ".byte .ur_s2 - .ur_stable \n" ".byte .ur_s3 - .ur_stable \n" @@ -1427,47 +1416,42 @@ void gray_update_rect(int x, int y, int width, int height) ".byte .ur_s5 - .ur_stable \n" ".byte .ur_s6 - .ur_stable \n" ".byte .ur_s7 - .ur_stable \n" + ".byte .ur_s8 - .ur_stable \n" + ".ur_sloop: \n" /** short loop (nothing to keep) **/ ".ur_s8: \n" - "mov %[psiz], %[rx] \n" /* Point behind the last plane */ - "shll2 %[rx] \n" /* for this round. */ - "add %[rx], %[rx] \n" /* See above. 
*/ - "add %[rx], %[addr] \n" - - "sub %[psiz], %[addr] \n" - "mov.b r8, @%[addr] \n" /* store byte */ - "shlr8 r8 \n" /* shift out used-up byte */ + "mov.b r1, @%[addr] \n" /* store byte */ + "add %[psiz], %[addr] \n" + "shlr8 r1 \n" /* shift out used-up byte */ ".ur_s7: \n" - "sub %[psiz], %[addr] \n" - "mov.b r7, @%[addr] \n" - "shlr8 r7 \n" + "mov.b r2, @%[addr] \n" + "add %[psiz], %[addr] \n" + "shlr8 r2 \n" ".ur_s6: \n" - "sub %[psiz], %[addr] \n" - "mov.b r6, @%[addr] \n" - "shlr8 r6 \n" + "mov.b r3, @%[addr] \n" + "add %[psiz], %[addr] \n" + "shlr8 r3 \n" ".ur_s5: \n" - "sub %[psiz], %[addr] \n" - "mov.b r5, @%[addr] \n" - "shlr8 r5 \n" - ".ur_s4: \n" - "sub %[psiz], %[addr] \n" "mov.b r4, @%[addr] \n" + "add %[psiz], %[addr] \n" "shlr8 r4 \n" + ".ur_s4: \n" + "mov.b r5, @%[addr] \n" + "add %[psiz], %[addr] \n" + "shlr8 r5 \n" ".ur_s3: \n" - "sub %[psiz], %[addr] \n" - "mov.b r3, @%[addr] \n" - "shlr8 r3 \n" + "mov.b r6, @%[addr] \n" + "add %[psiz], %[addr] \n" + "shlr8 r6 \n" ".ur_s2: \n" - "sub %[psiz], %[addr] \n" - "mov.b r2, @%[addr] \n" - "shlr8 r2 \n" + "mov.b r7, @%[addr] \n" + "add %[psiz], %[addr] \n" + "shlr8 r7 \n" ".ur_s1: \n" - "sub %[psiz], %[addr] \n" - "mov.b r1, @%[addr] \n" - "shlr8 r1 \n" - ".ur_s0: \n" + "mov.b r8, @%[addr] \n" + "add %[psiz], %[addr] \n" + "shlr8 r8 \n" - "add %[rx], %[addr] \n" /* correct address */ "add #-8, %[dpth] \n" "cmp/pl %[dpth] \n" /* next round if anything left */ "bt .ur_sloop \n" @@ -1677,172 +1661,163 @@ void gray_update_rect(int x, int y, int width, int height) "move.l %%a0, %%d5 \n" "eor.l %%d5, %%d0 \n" "and.l #0xAAAAAAAA, %%d0 \n" - "eor.l %%d0, %%d5 \n" - "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */ + "eor.l %%d0, %%d5 \n" /* (a0 = ...h0g0f0e0d0c0b0a0) */ + /* move.l %%d5, %%a0 */ /* but keep in d5 for shift streak */ "lsr.l #1, %%d0 \n" "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */ + + "move.l %[dpth], %%d0 \n" /** shift out unused low bytes **/ + "subq.l #1, %%d0 \n" + "and.l #7, %%d0 \n" + "move.l %%d0, %%a0 \n" + "move.l %[ax], %%d0 \n" /* all data in D registers */ + "jmp (2, %%pc, %%a0:l:2) \n" /* jump into shift streak */ + "lsr.l #8, %%d2 \n" + "lsr.l #8, %%d3 \n" + "lsr.l #8, %%d4 \n" + "lsr.l #8, %%d0 \n" + "lsr.l #8, %%d6 \n" + "lsr.l #8, %%d7 \n" + "lsr.l #8, %%d5 \n" + "move.l %%d0, %[ax] \n" /* put the 2 extra words back.. */ + "move.l %%a0, %%d0 \n" /* keep the value for later */ + "move.l %%d5, %%a0 \n" /* ..into their A registers */ "tst.l %[mask] \n" - "jeq .ur_sloop \n" /* short loop if nothing to keep */ + "jeq .ur_sstart \n" /* short loop if nothing to keep */ "move.l %[mask], %%d5 \n" /* need mask in data reg. */ "move.l %%d1, %[mask] \n" /* free d1 as working reg. */ - ".ur_floop: \n" /** full loop (there are bits to keep)**/ - "cmp.l #8, %[dpth] \n" /* 8 planes or more left? 
*/ - "bhs.s .ur_f8 \n" - - "move.l %[psiz], %%d0 \n" - "move.l %[dpth], %%d1 \n" - "mulu.w %%d1, %%d0 \n" /* point behind the last plane */ - "add.l %%d0, %[addr] \n" /* for this round */ - "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ - "bra.s .ur_f1 \n" /* dpth == 0 should never happen */ + "jmp (2, %%pc, %%d0:l:2) \n" /* jump into full loop */ + "bra.s .ur_f1 \n" "bra.s .ur_f2 \n" "bra.s .ur_f3 \n" "bra.s .ur_f4 \n" "bra.s .ur_f5 \n" "bra.s .ur_f6 \n" "bra.s .ur_f7 \n" + /* bra.s .ur_f8 */ /* identical with target */ + ".ur_floop: \n" /** full loop (there are bits to keep)**/ ".ur_f8: \n" - "move.l %[psiz], %%d0 \n" - "lsl.l #3, %%d0 \n" - "add.l %%d0, %[addr] \n" - /* Point behind the last plane for this round. Note: We're using the - * registers backwards in order to reuse the streak for the last round. - * Therefore we need to go thru the bitplanes backwards too, otherwise - * the bit order would be destroyed which results in more flicker. */ - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" /* load old byte */ "and.l %%d5, %%d0 \n" /* mask out replaced bits */ - "move.l %[mask], %%d1 \n" + "move.l %%a0, %%d1 \n" "or.l %%d1, %%d0 \n" /* set new bits */ "move.b %%d0, (%[addr]) \n" /* store byte */ + "add.l %[psiz], %[addr] \n" "lsr.l #8, %%d1 \n" /* shift out used-up byte */ - "move.l %%d1, %[mask] \n" + "move.l %%d1, %%a0 \n" ".ur_f7: \n" - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" "and.l %%d5, %%d0 \n" - "or.l %%d2, %%d0 \n" + "or.l %%d7, %%d0 \n" "move.b %%d0, (%[addr]) \n" - "lsr.l #8, %%d2 \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d7 \n" ".ur_f6: \n" - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" "and.l %%d5, %%d0 \n" - "or.l %%d3, %%d0 \n" + "or.l %%d6, %%d0 \n" "move.b %%d0, (%[addr]) \n" - "lsr.l #8, %%d3 \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d6 \n" ".ur_f5: \n" - "sub.l %[psiz], %[addr] \n" - "move.b (%[addr]), %%d0 \n" - "and.l %%d5, %%d0 \n" - "or.l %%d4, %%d0 \n" - "move.b %%d0, (%[addr]) \n" - "lsr.l #8, %%d4 \n" - ".ur_f4: \n" - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" "and.l %%d5, %%d0 \n" "move.l %[ax], %%d1 \n" "or.l %%d1, %%d0 \n" "move.b %%d0, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" "lsr.l #8, %%d1 \n" "move.l %%d1, %[ax] \n" + ".ur_f4: \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "or.l %%d4, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d4 \n" ".ur_f3: \n" - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" "and.l %%d5, %%d0 \n" - "or.l %%d6, %%d0 \n" + "or.l %%d3, %%d0 \n" "move.b %%d0, (%[addr]) \n" - "lsr.l #8, %%d6 \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d3 \n" ".ur_f2: \n" - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" "and.l %%d5, %%d0 \n" - "or.l %%d7, %%d0 \n" + "or.l %%d2, %%d0 \n" "move.b %%d0, (%[addr]) \n" - "lsr.l #8, %%d7 \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d2 \n" ".ur_f1: \n" - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" "and.l %%d5, %%d0 \n" - "move.l %%a0, %%d1 \n" + "move.l %[mask], %%d1 \n" "or.l %%d1, %%d0 \n" "move.b %%d0, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" "lsr.l #8, %%d1 \n" - "move.l %%d1, %%a0 \n" + "move.l %%d1, %[mask] \n" - "move.l %[psiz], %%d0 \n" - "lsl.l #3, %%d0 \n" - "add.l %%d0, %[addr] \n" /* correct address */ "subq.l #8, %[dpth] \n" "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ "jgt .ur_floop \n" /* next round if anything left */ "jra .ur_end \n" - ".ur_sloop: \n" /** short loop (nothing to keep) **/ - "cmp.l #8, %[dpth] \n" /* 8 
planes or more left? */ - "bhs.s .ur_s8 \n" - - "move.l %[psiz], %%d0 \n" - "move.l %[dpth], %%d5 \n" - "mulu.w %%d5, %%d0 \n" /* point behind the last plane */ - "add.l %%d0, %[addr] \n" /* for this round */ - "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ - "bra.s .ur_s1 \n" /* dpth == 0 should never happen */ + ".ur_sstart: \n" + "jmp (2, %%pc, %%d0:l:2) \n" /* jump into short loop */ + "bra.s .ur_s1 \n" "bra.s .ur_s2 \n" "bra.s .ur_s3 \n" "bra.s .ur_s4 \n" "bra.s .ur_s5 \n" "bra.s .ur_s6 \n" "bra.s .ur_s7 \n" + /* bra.s .ur_s8 */ /* identical with target */ + ".ur_sloop: \n" /** short loop (nothing to keep) **/ ".ur_s8: \n" - "move.l %[psiz], %%d0 \n" /* Point behind the last plane */ - "lsl.l #3, %%d0 \n" /* for this round. */ - "add.l %%d0, %[addr] \n" /* See above. */ - - "sub.l %[psiz], %[addr] \n" - "move.b %%d1, (%[addr]) \n" /* store byte */ - "lsr.l #8, %%d1 \n" /* shift out used-up byte */ + "move.l %%a0, %%d5 \n" + "move.b %%d5, (%[addr]) \n" /* store byte */ + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d5 \n" /* shift out used-up byte */ + "move.l %%d5, %%a0 \n" ".ur_s7: \n" - "sub.l %[psiz], %[addr] \n" - "move.b %%d2, (%[addr]) \n" - "lsr.l #8, %%d2 \n" + "move.b %%d7, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d7 \n" ".ur_s6: \n" - "sub.l %[psiz], %[addr] \n" - "move.b %%d3, (%[addr]) \n" - "lsr.l #8, %%d3 \n" + "move.b %%d6, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d6 \n" ".ur_s5: \n" - "sub.l %[psiz], %[addr] \n" - "move.b %%d4, (%[addr]) \n" - "lsr.l #8, %%d4 \n" - ".ur_s4: \n" - "sub.l %[psiz], %[addr] \n" "move.l %[ax], %%d5 \n" "move.b %%d5, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" "lsr.l #8, %%d5 \n" "move.l %%d5, %[ax] \n" + ".ur_s4: \n" + "move.b %%d4, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d4 \n" ".ur_s3: \n" - "sub.l %[psiz], %[addr] \n" - "move.b %%d6, (%[addr]) \n" - "lsr.l #8, %%d6 \n" + "move.b %%d3, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d3 \n" ".ur_s2: \n" - "sub.l %[psiz], %[addr] \n" - "move.b %%d7, (%[addr]) \n" - "lsr.l #8, %%d7 \n" + "move.b %%d2, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d2 \n" ".ur_s1: \n" - "sub.l %[psiz], %[addr] \n" - "move.l %%a0, %%d5 \n" - "move.b %%d5, (%[addr]) \n" - "lsr.l #8, %%d5 \n" - "move.l %%d5, %%a0 \n" + "move.b %%d1, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d1 \n" - "add.l %%d0, %[addr] \n" /* correct address */ "subq.l #8, %[dpth] \n" "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ "jgt .ur_sloop \n" /* next round if anything left */ @@ -1871,7 +1846,7 @@ void gray_update_rect(int x, int y, int width, int height) { unsigned char *addr, *end; unsigned mask = 0; - unsigned test = 1; + unsigned test = 1 << ((-_gray_info.depth) & 7); int i; /* precalculate the bit patterns with random shifts diff --git a/apps/plugins/lib/gray_draw.c b/apps/plugins/lib/gray_draw.c index dcc65bdd09..bd8ea4f1ce 100644 --- a/apps/plugins/lib/gray_draw.c +++ b/apps/plugins/lib/gray_draw.c @@ -1002,103 +1002,94 @@ static void _writearray(unsigned char *address, const unsigned char *src, "and r0, r0, %[rx] \n" "eor r7, r7, r0 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ "eor r8, r8, r0, lsr #1 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ + + "sub r0, %[dpth], #1 \n" /** shift out unused low bytes **/ + "and r0, r0, #7 \n" + "add pc, pc, r0, lsl #2 \n" /* jump into shift streak */ + "mov r8, r8, lsr #8 \n" /* r8: never reached */ + "mov r7, r7, lsr #8 \n" + "mov r6, r6, lsr #8 \n" + "mov r5, r5, lsr #8 \n" + "mov r4, r4, lsr #8 \n" + 
"mov r3, r3, lsr #8 \n" + "mov r2, r2, lsr #8 \n" + "mov r1, r1, lsr #8 \n" "mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ "ands %[mask], %[mask], #0xff \n" - "beq .wa_sloop \n" /* short loop if no bits to keep */ - - ".wa_floop: \n" /** full loop (bits to keep)**/ - "cmp %[dpth], #8 \n" /* 8 planes or more left? */ - "bhs .wa_f8 \n" + "beq .wa_sstart \n" /* short loop if no bits to keep */ - "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ - "add %[addr], %[addr], r0 \n" /* for this round */ - - - "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ + "ldrb r0, [pc, r0] \n" /* jump into full loop */ "add pc, pc, r0 \n" ".wa_ftable: \n" - ".byte .wa_f0 - .wa_ftable - 4 \n" /* [jump tables are tricky] */ - ".byte .wa_f1 - .wa_ftable - 4 \n" + ".byte .wa_f1 - .wa_ftable - 4 \n" /* [jump tables are tricky] */ ".byte .wa_f2 - .wa_ftable - 4 \n" ".byte .wa_f3 - .wa_ftable - 4 \n" ".byte .wa_f4 - .wa_ftable - 4 \n" ".byte .wa_f5 - .wa_ftable - 4 \n" ".byte .wa_f6 - .wa_ftable - 4 \n" ".byte .wa_f7 - .wa_ftable - 4 \n" + ".byte .wa_f8 - .wa_ftable - 4 \n" + ".wa_floop: \n" /** full loop (bits to keep)**/ ".wa_f8: \n" - "add %[addr], %[addr], %[psiz], lsl #3 \n" - /* Point behind the last plane for this round. Note: We're using the - * registers backwards in order to reuse the streak for the last round. - * Therefore we need to go thru the bitplanes backwards too, otherwise - * the bit order would be destroyed which results in more flicker. */ - "ldrb r0, [%[addr], -%[psiz]]! \n" /* load old byte */ + "ldrb r0, [%[addr]] \n" /* load old byte */ "and r0, r0, %[mask] \n" /* mask out replaced bits */ - "orr r0, r0, r8 \n" /* set new bits */ - "strb r0, [%[addr]] \n" /* store byte */ - "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ + "orr r0, r0, r1 \n" /* set new bits */ + "strb r0, [%[addr]], %[psiz] \n" /* store byte */ + "mov r1, r1, lsr #8 \n" /* shift out used-up byte */ ".wa_f7: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" + "ldrb r0, [%[addr]] \n" "and r0, r0, %[mask] \n" - "orr r0, r0, r7 \n" - "strb r0, [%[addr]] \n" - "mov r7, r7, lsr #8 \n" + "orr r0, r0, r2 \n" + "strb r0, [%[addr]], %[psiz] \n" + "mov r2, r2, lsr #8 \n" ".wa_f6: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" + "ldrb r0, [%[addr]] \n" "and r0, r0, %[mask] \n" - "orr r0, r0, r6 \n" - "strb r0, [%[addr]] \n" - "mov r6, r6, lsr #8 \n" + "orr r0, r0, r3 \n" + "strb r0, [%[addr]], %[psiz] \n" + "mov r3, r3, lsr #8 \n" ".wa_f5: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" - "and r0, r0, %[mask] \n" - "orr r0, r0, r5 \n" - "strb r0, [%[addr]] \n" - "mov r5, r5, lsr #8 \n" - ".wa_f4: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" + "ldrb r0, [%[addr]] \n" "and r0, r0, %[mask] \n" "orr r0, r0, r4 \n" - "strb r0, [%[addr]] \n" + "strb r0, [%[addr]], %[psiz] \n" "mov r4, r4, lsr #8 \n" + ".wa_f4: \n" + "ldrb r0, [%[addr]] \n" + "and r0, r0, %[mask] \n" + "orr r0, r0, r5 \n" + "strb r0, [%[addr]], %[psiz] \n" + "mov r5, r5, lsr #8 \n" ".wa_f3: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" + "ldrb r0, [%[addr]] \n" "and r0, r0, %[mask] \n" - "orr r0, r0, r3 \n" - "strb r0, [%[addr]] \n" - "mov r3, r3, lsr #8 \n" + "orr r0, r0, r6 \n" + "strb r0, [%[addr]], %[psiz] \n" + "mov r6, r6, lsr #8 \n" ".wa_f2: \n" - "ldrb r0, [%[addr], -%[psiz]]! \n" + "ldrb r0, [%[addr]] \n" "and r0, r0, %[mask] \n" - "orr r0, r0, r2 \n" - "strb r0, [%[addr]] \n" - "mov r2, r2, lsr #8 \n" + "orr r0, r0, r7 \n" + "strb r0, [%[addr]], %[psiz] \n" + "mov r7, r7, lsr #8 \n" ".wa_f1: \n" - "ldrb r0, [%[addr], -%[psiz]]! 
\n" + "ldrb r0, [%[addr]] \n" "and r0, r0, %[mask] \n" - "orr r0, r0, r1 \n" - "strb r0, [%[addr]] \n" - "mov r1, r1, lsr #8 \n" - ".wa_f0: \n" + "orr r0, r0, r8 \n" + "strb r0, [%[addr]], %[psiz] \n" + "mov r8, r8, lsr #8 \n" - "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ "bhi .wa_floop \n" "b .wa_end \n" - ".wa_sloop: \n" /** short loop (nothing to keep) **/ - "cmp %[dpth], #8 \n" /* 8 planes or more left? */ - "bhs .wa_s8 \n" - - "mul r0, %[psiz], %[dpth] \n" /* point behind the last plane */ - "add %[addr], %[addr], r0 \n" /* for this round */ - - "ldrb r0, [pc, %[dpth]] \n" /* jump into streak */ + ".wa_sstart: \n" + "ldrb r0, [pc, r0] \n" /* jump into short loop*/ "add pc, pc, r0 \n" ".wa_stable: \n" - ".byte .wa_s0 - .wa_stable - 4 \n" ".byte .wa_s1 - .wa_stable - 4 \n" ".byte .wa_s2 - .wa_stable - 4 \n" ".byte .wa_s3 - .wa_stable - 4 \n" @@ -1106,36 +1097,34 @@ static void _writearray(unsigned char *address, const unsigned char *src, ".byte .wa_s5 - .wa_stable - 4 \n" ".byte .wa_s6 - .wa_stable - 4 \n" ".byte .wa_s7 - .wa_stable - 4 \n" + ".byte .wa_s8 - .wa_stable - 4 \n" + ".wa_sloop: \n" /** short loop (nothing to keep) **/ ".wa_s8: \n" - "add %[addr], %[addr], %[psiz], lsl #3 \n" - /* Point behind the last plane for this round. See above. */ - "strb r8, [%[addr], -%[psiz]]! \n" /* store byte */ - "mov r8, r8, lsr #8 \n" /* shift out used-up byte */ + "strb r1, [%[addr]], %[psiz] \n" /* store byte */ + "mov r1, r1, lsr #8 \n" /* shift out used-up byte */ ".wa_s7: \n" - "strb r7, [%[addr], -%[psiz]]! \n" - "mov r7, r7, lsr #8 \n" + "strb r2, [%[addr]], %[psiz] \n" + "mov r2, r2, lsr #8 \n" ".wa_s6: \n" - "strb r6, [%[addr], -%[psiz]]! \n" - "mov r6, r6, lsr #8 \n" + "strb r3, [%[addr]], %[psiz] \n" + "mov r3, r3, lsr #8 \n" ".wa_s5: \n" - "strb r5, [%[addr], -%[psiz]]! \n" - "mov r5, r5, lsr #8 \n" - ".wa_s4: \n" - "strb r4, [%[addr], -%[psiz]]! \n" + "strb r4, [%[addr]], %[psiz] \n" "mov r4, r4, lsr #8 \n" + ".wa_s4: \n" + "strb r5, [%[addr]], %[psiz] \n" + "mov r5, r5, lsr #8 \n" ".wa_s3: \n" - "strb r3, [%[addr], -%[psiz]]! \n" - "mov r3, r3, lsr #8 \n" + "strb r6, [%[addr]], %[psiz] \n" + "mov r6, r6, lsr #8 \n" ".wa_s2: \n" - "strb r2, [%[addr], -%[psiz]]! \n" - "mov r2, r2, lsr #8 \n" + "strb r7, [%[addr]], %[psiz] \n" + "mov r7, r7, lsr #8 \n" ".wa_s1: \n" - "strb r1, [%[addr], -%[psiz]]! 
\n" - "mov r1, r1, lsr #8 \n" - ".wa_s0: \n" + "strb r8, [%[addr]], %[psiz] \n" + "mov r8, r8, lsr #8 \n" - "add %[addr], %[addr], %[psiz], lsl #3 \n" /* correct address */ "subs %[dpth], %[dpth], #8 \n" /* next round if anything left */ "bhi .wa_sloop \n" @@ -1187,7 +1176,7 @@ static void _writearray(unsigned char *address, const unsigned char *src, /* set the bits for all 8 pixels in all bytes according to the * precalculated patterns on the pattern stack */ - test = 1; + test = 1 << ((-_gray_info.depth) & 7); mask = (~mask & 0xff); if (mask == 0) { @@ -1483,28 +1472,40 @@ static void _writearray(unsigned char *address, const unsigned char *src, "xor r0, r7 \n" /* r7 = ...h6g6f6e6d6c6b6a6 */ "shlr r0 \n" "xor r0, r8 \n" /* r8 = ...h7g7f7e7d7c7b7a7 */ + + "mov %[dpth], %[rx] \n" /** shift out unused low bytes **/ + "add #-1, %[rx] \n" + "mov #7, r0 \n" + "and r0, %[rx] \n" + "mova .wa_pshift, r0 \n" + "add %[rx], r0 \n" + "add %[rx], r0 \n" + "jmp @r0 \n" /* jump into shift streak */ + "nop \n" + + ".align 2 \n" + ".wa_pshift: \n" + "shlr8 r7 \n" + "shlr8 r6 \n" + "shlr8 r5 \n" + "shlr8 r4 \n" + "shlr8 r3 \n" + "shlr8 r2 \n" + "shlr8 r1 \n" "not %[mask], %[mask] \n" /* "set" mask -> "keep" mask */ "extu.b %[mask], %[mask] \n" /* mask out high bits */ "tst %[mask], %[mask] \n" - "bt .wa_sloop \n" /* short loop if nothing to keep */ - - ".wa_floop: \n" /** full loop (there are bits to keep)**/ - "mov #8, r0 \n" - "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ - "bt .wa_f8 \n" + "bt .wa_sstart \n" /* short loop if nothing to keep */ - "mulu %[psiz], %[dpth] \n" - "mova .wa_ftable, r0 \n" - "mov.b @(r0, %[dpth]), %[rx] \n" + "mova .wa_ftable, r0 \n" /* jump into full loop */ + "mov.b @(r0, %[rx]), %[rx] \n" "add %[rx], r0 \n" - "sts macl, %[rx] \n" /* point behind the last plane.. */ - "jmp @r0 \n" /* jump into streak */ - "add %[rx], %[addr] \n" /* ..for this round */ - + "jmp @r0 \n" + "nop \n" + ".align 2 \n" ".wa_ftable: \n" - ".byte .wa_f0 - .wa_ftable \n" ".byte .wa_f1 - .wa_ftable \n" ".byte .wa_f2 - .wa_ftable \n" ".byte .wa_f3 - .wa_ftable \n" @@ -1512,74 +1513,66 @@ static void _writearray(unsigned char *address, const unsigned char *src, ".byte .wa_f5 - .wa_ftable \n" ".byte .wa_f6 - .wa_ftable \n" ".byte .wa_f7 - .wa_ftable \n" + ".byte .wa_f8 - .wa_ftable \n" + ".wa_floop: \n" /** full loop (there are bits to keep)**/ ".wa_f8: \n" - "mov %[psiz], %[rx] \n" - "shll2 %[rx] \n" - "add %[rx], %[rx] \n" - "add %[rx], %[addr] \n" - /* Point behind the last plane for this round. Note: We're using the - * registers backwards in order to reuse the streak for the last round. - * Therefore we need to go thru the bitplanes backwards too, otherwise - * the bit order would be destroyed which results in more flicker. 
*/ - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" /* load old byte */ "and %[mask], r0 \n" /* mask out replaced bits */ - "or r8, r0 \n" /* set new bits */ + "or r1, r0 \n" /* set new bits */ "mov.b r0, @%[addr] \n" /* store byte */ - "shlr8 r8 \n" /* shift out used-up byte */ + "add %[psiz], %[addr] \n" + "shlr8 r1 \n" /* shift out used-up byte */ ".wa_f7: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r7, r0 \n" + "or r2, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r7 \n" + "add %[psiz], %[addr] \n" + "shlr8 r2 \n" ".wa_f6: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r6, r0 \n" + "or r3, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r6 \n" + "add %[psiz], %[addr] \n" + "shlr8 r3 \n" ".wa_f5: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r5, r0 \n" + "or r4, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r5 \n" + "add %[psiz], %[addr] \n" + "shlr8 r4 \n" ".wa_f4: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r4, r0 \n" + "or r5, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r4 \n" + "add %[psiz], %[addr] \n" + "shlr8 r5 \n" ".wa_f3: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r3, r0 \n" + "or r6, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r3 \n" + "add %[psiz], %[addr] \n" + "shlr8 r6 \n" ".wa_f2: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r2, r0 \n" + "or r7, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r2 \n" + "add %[psiz], %[addr] \n" + "shlr8 r7 \n" ".wa_f1: \n" - "sub %[psiz], %[addr] \n" "mov.b @%[addr], r0 \n" "and %[mask], r0 \n" - "or r1, r0 \n" + "or r8, r0 \n" "mov.b r0, @%[addr] \n" - "shlr8 r1 \n" - ".wa_f0: \n" + "add %[psiz], %[addr] \n" + "shlr8 r8 \n" - "add %[rx], %[addr] \n" /* correct address */ "add #-8, %[dpth] \n" "cmp/pl %[dpth] \n" /* next round if anything left */ "bt .wa_floop \n" @@ -1603,22 +1596,15 @@ static void _writearray(unsigned char *address, const unsigned char *src, ".wa_mask1: \n" ".long 0xAAAAAAAA \n" - ".wa_sloop: \n" /** short loop (nothing to keep) **/ - "mov #8, r0 \n" - "cmp/hs r0, %[dpth] \n" /* 8 planes or more left? */ - "bt .wa_s8 \n" - - "mulu %[psiz], %[dpth] \n" - "mova .wa_stable, r0 \n" - "mov.b @(r0, %[dpth]), %[rx] \n" + ".wa_sstart: \n" + "mova .wa_stable, r0 \n" /* jump into short loop */ + "mov.b @(r0, %[rx]), %[rx] \n" "add %[rx], r0 \n" - "sts macl, %[rx] \n" /* point behind the last plane.. */ - "jmp @r0 \n" /* jump into streak */ - "add %[rx], %[addr] \n" /* ..for this round */ + "jmp @r0 \n" + "nop \n" ".align 2 \n" ".wa_stable: \n" - ".byte .wa_s0 - .wa_stable \n" ".byte .wa_s1 - .wa_stable \n" ".byte .wa_s2 - .wa_stable \n" ".byte .wa_s3 - .wa_stable \n" @@ -1626,47 +1612,42 @@ static void _writearray(unsigned char *address, const unsigned char *src, ".byte .wa_s5 - .wa_stable \n" ".byte .wa_s6 - .wa_stable \n" ".byte .wa_s7 - .wa_stable \n" + ".byte .wa_s8 - .wa_stable \n" + ".wa_sloop: \n" /** short loop (nothing to keep) **/ ".wa_s8: \n" - "mov %[psiz], %[rx] \n" /* Point behind the last plane */ - "shll2 %[rx] \n" /* for this round. */ - "add %[rx], %[rx] \n" /* See above. 
*/ - "add %[rx], %[addr] \n" - - "sub %[psiz], %[addr] \n" - "mov.b r8, @%[addr] \n" /* store byte */ - "shlr8 r8 \n" /* shift out used-up byte */ + "mov.b r1, @%[addr] \n" /* store byte */ + "add %[psiz], %[addr] \n" + "shlr8 r1 \n" /* shift out used-up byte */ ".wa_s7: \n" - "sub %[psiz], %[addr] \n" - "mov.b r7, @%[addr] \n" - "shlr8 r7 \n" + "mov.b r2, @%[addr] \n" + "add %[psiz], %[addr] \n" + "shlr8 r2 \n" ".wa_s6: \n" - "sub %[psiz], %[addr] \n" - "mov.b r6, @%[addr] \n" - "shlr8 r6 \n" + "mov.b r3, @%[addr] \n" + "add %[psiz], %[addr] \n" + "shlr8 r3 \n" ".wa_s5: \n" - "sub %[psiz], %[addr] \n" - "mov.b r5, @%[addr] \n" - "shlr8 r5 \n" - ".wa_s4: \n" - "sub %[psiz], %[addr] \n" "mov.b r4, @%[addr] \n" + "add %[psiz], %[addr] \n" "shlr8 r4 \n" + ".wa_s4: \n" + "mov.b r5, @%[addr] \n" + "add %[psiz], %[addr] \n" + "shlr8 r5 \n" ".wa_s3: \n" - "sub %[psiz], %[addr] \n" - "mov.b r3, @%[addr] \n" - "shlr8 r3 \n" + "mov.b r6, @%[addr] \n" + "add %[psiz], %[addr] \n" + "shlr8 r6 \n" ".wa_s2: \n" - "sub %[psiz], %[addr] \n" - "mov.b r2, @%[addr] \n" - "shlr8 r2 \n" + "mov.b r7, @%[addr] \n" + "add %[psiz], %[addr] \n" + "shlr8 r7 \n" ".wa_s1: \n" - "sub %[psiz], %[addr] \n" - "mov.b r1, @%[addr] \n" - "shlr8 r1 \n" - ".wa_s0: \n" + "mov.b r8, @%[addr] \n" + "add %[psiz], %[addr] \n" + "shlr8 r8 \n" - "add %[rx], %[addr] \n" /* correct address */ "add #-8, %[dpth] \n" "cmp/pl %[dpth] \n" /* next round if anything left */ "bt .wa_sloop \n" @@ -1853,172 +1834,163 @@ static void _writearray(unsigned char *address, const unsigned char *src, "move.l %%a0, %%d5 \n" "eor.l %%d5, %%d0 \n" "and.l #0xAAAAAAAA, %%d0 \n" - "eor.l %%d0, %%d5 \n" - "move.l %%d5, %%a0 \n" /* a0 = ...h0g0f0e0d0c0b0a0 */ + "eor.l %%d0, %%d5 \n" /* (a0 = ...h0g0f0e0d0c0b0a0) */ + /* move.l %%d5, %%a0 */ /* but keep in d5 for shift streak */ "lsr.l #1, %%d0 \n" "eor.l %%d0, %%d7 \n" /* d7 = ...h1g1f1e1d1c1b1a1 */ + + "move.l %[dpth], %%d0 \n" /** shift out unused low bytes **/ + "subq.l #1, %%d0 \n" + "and.l #7, %%d0 \n" + "move.l %%d0, %%a0 \n" + "move.l %[ax], %%d0 \n" /* all data in D registers */ + "jmp (2, %%pc, %%a0:l:2) \n" /* jump into shift streak */ + "lsr.l #8, %%d2 \n" + "lsr.l #8, %%d3 \n" + "lsr.l #8, %%d4 \n" + "lsr.l #8, %%d0 \n" + "lsr.l #8, %%d6 \n" + "lsr.l #8, %%d7 \n" + "lsr.l #8, %%d5 \n" + "move.l %%d0, %[ax] \n" /* put the 2 extra words back.. */ + "move.l %%a0, %%d0 \n" /* keep the value for later */ + "move.l %%d5, %%a0 \n" /* ..into their A registers */ "tst.l %[mask] \n" - "jeq .wa_sloop \n" /* short loop if nothing to keep */ + "jeq .wa_sstart \n" /* short loop if nothing to keep */ "move.l %[mask], %%d5 \n" /* need mask in data reg. */ "move.l %%d1, %[mask] \n" /* free d1 as working reg. */ - ".wa_floop: \n" /** full loop (there are bits to keep)**/ - "cmp.l #8, %[dpth] \n" /* 8 planes or more left? 
*/ - "bhs.s .wa_f8 \n" - - "move.l %[psiz], %%d0 \n" - "move.l %[dpth], %%d1 \n" - "mulu.w %%d1, %%d0 \n" /* point behind the last plane */ - "add.l %%d0, %[addr] \n" /* for this round */ - "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ - "bra.s .wa_f1 \n" /* dpth == 0 should never happen */ + "jmp (2, %%pc, %%d0:l:2) \n" /* jump into full loop */ + "bra.s .wa_f1 \n" "bra.s .wa_f2 \n" "bra.s .wa_f3 \n" "bra.s .wa_f4 \n" "bra.s .wa_f5 \n" "bra.s .wa_f6 \n" "bra.s .wa_f7 \n" + /* bra.s .wa_f8 */ /* identical with target */ + ".wa_floop: \n" /** full loop (there are bits to keep)**/ ".wa_f8: \n" - "move.l %[psiz], %%d0 \n" - "lsl.l #3, %%d0 \n" - "add.l %%d0, %[addr] \n" - /* Point behind the last plane for this round. Note: We're using the - * registers backwards in order to reuse the streak for the last round. - * Therefore we need to go thru the bitplanes backwards too, otherwise - * the bit order would be destroyed which results in more flicker. */ - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" /* load old byte */ "and.l %%d5, %%d0 \n" /* mask out replaced bits */ - "move.l %[mask], %%d1 \n" + "move.l %%a0, %%d1 \n" "or.l %%d1, %%d0 \n" /* set new bits */ "move.b %%d0, (%[addr]) \n" /* store byte */ + "add.l %[psiz], %[addr] \n" "lsr.l #8, %%d1 \n" /* shift out used-up byte */ - "move.l %%d1, %[mask] \n" + "move.l %%d1, %%a0 \n" ".wa_f7: \n" - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" "and.l %%d5, %%d0 \n" - "or.l %%d2, %%d0 \n" + "or.l %%d7, %%d0 \n" "move.b %%d0, (%[addr]) \n" - "lsr.l #8, %%d2 \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d7 \n" ".wa_f6: \n" - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" "and.l %%d5, %%d0 \n" - "or.l %%d3, %%d0 \n" + "or.l %%d6, %%d0 \n" "move.b %%d0, (%[addr]) \n" - "lsr.l #8, %%d3 \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d6 \n" ".wa_f5: \n" - "sub.l %[psiz], %[addr] \n" - "move.b (%[addr]), %%d0 \n" - "and.l %%d5, %%d0 \n" - "or.l %%d4, %%d0 \n" - "move.b %%d0, (%[addr]) \n" - "lsr.l #8, %%d4 \n" - ".wa_f4: \n" - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" "and.l %%d5, %%d0 \n" "move.l %[ax], %%d1 \n" "or.l %%d1, %%d0 \n" "move.b %%d0, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" "lsr.l #8, %%d1 \n" "move.l %%d1, %[ax] \n" + ".wa_f4: \n" + "move.b (%[addr]), %%d0 \n" + "and.l %%d5, %%d0 \n" + "or.l %%d4, %%d0 \n" + "move.b %%d0, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d4 \n" ".wa_f3: \n" - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" "and.l %%d5, %%d0 \n" - "or.l %%d6, %%d0 \n" + "or.l %%d3, %%d0 \n" "move.b %%d0, (%[addr]) \n" - "lsr.l #8, %%d6 \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d3 \n" ".wa_f2: \n" - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" "and.l %%d5, %%d0 \n" - "or.l %%d7, %%d0 \n" + "or.l %%d2, %%d0 \n" "move.b %%d0, (%[addr]) \n" - "lsr.l #8, %%d7 \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d2 \n" ".wa_f1: \n" - "sub.l %[psiz], %[addr] \n" "move.b (%[addr]), %%d0 \n" "and.l %%d5, %%d0 \n" - "move.l %%a0, %%d1 \n" + "move.l %[mask], %%d1 \n" "or.l %%d1, %%d0 \n" "move.b %%d0, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" "lsr.l #8, %%d1 \n" - "move.l %%d1, %%a0 \n" + "move.l %%d1, %[mask] \n" - "move.l %[psiz], %%d0 \n" - "lsl.l #3, %%d0 \n" - "add.l %%d0, %[addr] \n" /* correct address */ "subq.l #8, %[dpth] \n" "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ "jgt .wa_floop \n" /* next round if anything left */ "jra .wa_end \n" - ".wa_sloop: \n" /** short loop (nothing to keep) **/ - "cmp.l #8, %[dpth] \n" /* 8 
planes or more left? */ - "bhs.s .wa_s8 \n" - - "move.l %[psiz], %%d0 \n" - "move.l %[dpth], %%d5 \n" - "mulu.w %%d5, %%d0 \n" /* point behind the last plane */ - "add.l %%d0, %[addr] \n" /* for this round */ - "jmp (%%pc, %[dpth]:l:2) \n" /* jump into streak */ - "bra.s .wa_s1 \n" /* dpth == 0 should never happen */ + ".wa_sstart: \n" + "jmp (2, %%pc, %%d0:l:2) \n" /* jump into short loop */ + "bra.s .wa_s1 \n" "bra.s .wa_s2 \n" "bra.s .wa_s3 \n" "bra.s .wa_s4 \n" "bra.s .wa_s5 \n" "bra.s .wa_s6 \n" "bra.s .wa_s7 \n" + /* bra.s .wa_s8 */ /* identical with target */ + ".wa_sloop: \n" /** short loop (nothing to keep) **/ ".wa_s8: \n" - "move.l %[psiz], %%d0 \n" /* Point behind the last plane */ - "lsl.l #3, %%d0 \n" /* for this round. */ - "add.l %%d0, %[addr] \n" /* See above. */ - - "sub.l %[psiz], %[addr] \n" - "move.b %%d1, (%[addr]) \n" /* store byte */ - "lsr.l #8, %%d1 \n" /* shift out used-up byte */ + "move.l %%a0, %%d5 \n" + "move.b %%d5, (%[addr]) \n" /* store byte */ + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d5 \n" /* shift out used-up byte */ + "move.l %%d5, %%a0 \n" ".wa_s7: \n" - "sub.l %[psiz], %[addr] \n" - "move.b %%d2, (%[addr]) \n" - "lsr.l #8, %%d2 \n" + "move.b %%d7, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d7 \n" ".wa_s6: \n" - "sub.l %[psiz], %[addr] \n" - "move.b %%d3, (%[addr]) \n" - "lsr.l #8, %%d3 \n" + "move.b %%d6, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d6 \n" ".wa_s5: \n" - "sub.l %[psiz], %[addr] \n" - "move.b %%d4, (%[addr]) \n" - "lsr.l #8, %%d4 \n" - ".wa_s4: \n" - "sub.l %[psiz], %[addr] \n" "move.l %[ax], %%d5 \n" "move.b %%d5, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" "lsr.l #8, %%d5 \n" "move.l %%d5, %[ax] \n" + ".wa_s4: \n" + "move.b %%d4, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d4 \n" ".wa_s3: \n" - "sub.l %[psiz], %[addr] \n" - "move.b %%d6, (%[addr]) \n" - "lsr.l #8, %%d6 \n" + "move.b %%d3, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d3 \n" ".wa_s2: \n" - "sub.l %[psiz], %[addr] \n" - "move.b %%d7, (%[addr]) \n" - "lsr.l #8, %%d7 \n" + "move.b %%d2, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d2 \n" ".wa_s1: \n" - "sub.l %[psiz], %[addr] \n" - "move.l %%a0, %%d5 \n" - "move.b %%d5, (%[addr]) \n" - "lsr.l #8, %%d5 \n" - "move.l %%d5, %%a0 \n" + "move.b %%d1, (%[addr]) \n" + "add.l %[psiz], %[addr] \n" + "lsr.l #8, %%d1 \n" - "add.l %%d0, %[addr] \n" /* correct address */ "subq.l #8, %[dpth] \n" "tst.l %[dpth] \n" /* subq doesn't set flags for A reg */ "jgt .wa_sloop \n" /* next round if anything left */ @@ -2071,7 +2043,7 @@ static void _writearray(unsigned char *address, const unsigned char *src, /* set the bits for all 8 pixels in all bytes according to the * precalculated patterns on the pattern stack */ - test = 1; + test = 1 << ((-_gray_info.depth) & 7); mask = (~mask & 0xff); if (mask == 0) { |