Tweak types and the order in which things are computed

author: Christopher Wellons <wellons@nullprogram.com> 2017-10-06 11:28:26 -0400
committer: Christopher Wellons <wellons@nullprogram.com> 2017-10-06 11:28:26 -0400
commit: 15924e1250595989805c5994383b58224cce0b8b (patch)
tree: fd41590c0fcc0e4d58078fc326ea7de154cc4379
parent: 0b4b374e733d3eb28051ced0e034e850a4e61834 (diff)
3 files changed, 39 insertions, 24 deletions
diff --git a/test/benchmark.c b/test/benchmark.c
index b637e7b..4eb51da 100644
--- a/test/benchmark.c
+++ b/test/benchmark.c
@@ -87,7 +87,7 @@ main(void)
     do {
         unsigned char *p = buffer;
         int e = 0;
-        long c;
+        uint32_t c;
         long count = 0;
         while (p < end) {
             p = utf8_decode(p, &c, &e);
diff --git a/test/tests.c b/test/tests.c
index bb0b0d3..be94789 100644
--- a/test/tests.c
+++ b/test/tests.c
@@ -23,10 +23,10 @@ main(void)
     /* Make sure it can decode every character */
     {
         long failures = 0;
-        for (long i = 0; i < 0x1ffffL; i++) {
+        for (unsigned long i = 0; i < 0x1ffff; i++) {
             if (!IS_SURROGATE(i)) {
                 int e;
-                long c;
+                uint32_t c;
                 unsigned char buf[8] = {0};
                 unsigned char *end = utf8_encode(buf, i);
                 unsigned char *res = utf8_decode(buf, &c, &e);
@@ -39,9 +39,9 @@ main(void)
     /* Does it reject all surrogate halves? */
     {
         long failures = 0;
-        for (long i = 0xd800; i <= 0xdfff; i++) {
+        for (unsigned long i = 0xd800; i <= 0xdfff; i++) {
             int e;
-            long c;
+            uint32_t c;
             unsigned char buf[8] = {0};
             utf8_encode(buf, i);
             utf8_decode(buf, &c, &e);
@@ -53,46 +53,49 @@ main(void)
     /* How about non-canonical encodings? */
     {
         int e;
-        long c;
+        uint32_t c;
         unsigned char *end;
 
         unsigned char buf2[8] = {0xc0, 0xA4};
         end = utf8_decode(buf2, &c, &e);
         TEST(e, "non-canonical len 2, 0x%02x", e);
-        TEST(end == buf2 + 2, "non-canonical recover 2, U+%04lx", c);
+        TEST(end == buf2 + 2, "non-canonical recover 2, U+%04lx",
+             (unsigned long)c);
 
         unsigned char buf3[8] = {0xe0, 0x80, 0xA4};
         end = utf8_decode(buf3, &c, &e);
         TEST(e, "non-canonical len 3, 0x%02x", e);
-        TEST(end == buf3 + 3, "non-canonical recover 3, U+%04lx", c);
+        TEST(end == buf3 + 3, "non-canonical recover 3, U+%04lx",
+             (unsigned long)c);
 
         unsigned char buf4[8] = {0xf0, 0x80, 0x80, 0xA4};
         end = utf8_decode(buf4, &c, &e);
         TEST(e, "non-canonical encoding len 4, 0x%02x", e);
-        TEST(end == buf4 + 4, "non-canonical recover 4, U+%04lx", c);
+        TEST(end == buf4 + 4, "non-canonical recover 4, U+%04lx",
+             (unsigned long)c);
     }
 
     /* Let's try some bogus byte sequences */
     {
         int len, e;
-        long c;
+        uint32_t c;
 
         /* Invalid first byte */
         unsigned char buf0[4] = {0xff};
         len = (unsigned char *)utf8_decode(buf0, &c, &e) - buf0;
-        TEST(e, "bogus [ff] 0x%02x U+%04lx", e, c);
+        TEST(e, "bogus [ff] 0x%02x U+%04lx", e, (unsigned long)c);
         TEST(len == 1, "bogus [ff] recovery %d", len);
 
         /* Invalid first byte */
         unsigned char buf1[4] = {0x80};
         len = (unsigned char *)utf8_decode(buf1, &c, &e) - buf1;
-        TEST(e, "bogus [80] 0x%02x U+%04lx", e, c);
+        TEST(e, "bogus [80] 0x%02x U+%04lx", e, (unsigned long)c);
         TEST(len == 1, "bogus [80] recovery %d", len);
 
         /* Looks like a two-byte sequence but second byte is wrong */
         unsigned char buf2[4] = {0xc0, 0x0a};
         len = (unsigned char *)utf8_decode(buf2, &c, &e) - buf2;
-        TEST(e, "bogus [c0 0a] 0x%02x U+%04lx", e, c);
+        TEST(e, "bogus [c0 0a] 0x%02x U+%04lx", e, (unsigned long)c);
         TEST(len == 2, "bogus [c0 0a] recovery %d", len);
     }
 
diff --git a/utf8.h b/utf8.h
index 1942cb4..19eef17 100644
--- a/utf8.h
+++ b/utf8.h
@@ -5,6 +5,8 @@
 #ifndef UTF8_H
 #define UTF8_H
 
+#include <stdint.h>
+
 /* Decode the next character, C, from BUF, reporting errors in E.
  *
  * Since this is a branchless decoder, four bytes will be read from the
@@ -17,7 +19,7 @@
  * encoding, or a surrogate half.
  */
 static void *
-utf8_decode(void *buf, long *c, int *e) {
+utf8_decode(void *buf, uint32_t *c, int *e) {
     static const char utf8_lengths[] = {
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
@@ -30,21 +32,31 @@ utf8_decode(void *buf, long *c, int *e) {
     unsigned char *s = buf;
     int len = utf8_lengths[s[0] >> 3];
 
-    *c  = (s[0] & masks[len]) << 18;
-    *c |= (s[1] & 0x3fU) << 12;
-    *c |= (s[2] & 0x3fU) <<  6;
-    *c |= (s[3] & 0x3fU) <<  0;
+    /* Compute the pointer to the next character early so that, when
+     * unrolled, the next iteration can start working on the next
+     * character. Neither Clang nor GCC figures this out on its own.
+     */
+    unsigned char *next = s + len + !len;
+
+    /* Assume this is a four-byte character and load all four bytes.
+     * Unused bits will be shifted out later.
+     */
+    *c  = (uint32_t)(s[0] & masks[len]) << 18;
+    *c |= (uint32_t)(s[1] & 0x3f) << 12;
+    *c |= (uint32_t)(s[2] & 0x3f) <<  6;
+    *c |= (uint32_t)(s[3] & 0x3f) <<  0;
     *c >>= shiftc[len];
 
-    *e  = (*c < (1L << thresh[len]) - 1) << 6;
+    /* Accumulate the various error conditions. */
+    *e  = (*c < (UINT32_C(1) << thresh[len]) - 1) << 6;
     *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
-    *e |= (s[1] & 0xc0U) >> 2;
-    *e |= (s[2] & 0xc0U) >> 4;
-    *e |= (s[3]        ) >> 6;
-    *e ^= 0x2aU; // top two bits of each tail byte correct?
+    *e |= (s[1] & 0xc0) >> 2;
+    *e |= (s[2] & 0xc0) >> 4;
+    *e |= (s[3]       ) >> 6;
+    *e ^= 0x2a; // top two bits of each tail byte correct?
     *e >>= shifte[len];
 
-    return s + len + !len;
+    return next;
 }
 
 #endif
author	Christopher Wellons <wellons@nullprogram.com>	2017-10-06 11:28:26 -0400
committer	Christopher Wellons <wellons@nullprogram.com>	2017-10-06 11:28:26 -0400
commit	15924e1250595989805c5994383b58224cce0b8b (patch)
tree	fd41590c0fcc0e4d58078fc326ea7de154cc4379
parent	0b4b374e733d3eb28051ced0e034e850a4e61834 (diff)