Initial working code

author: Christopher Wellons <wellons@nullprogram.com> 2017-10-05 23:07:01 -0400
committer: Christopher Wellons <wellons@nullprogram.com> 2017-10-05 23:07:01 -0400
commit: a4db22221bbdbd1fb58cc6f6cc67bca1221aba6b (patch)
tree: 33b36651ab71a6bc141665a7ff78913d13153f81
9 files changed, 378 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5d2ecae
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+tests
+benchmark
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..2f2928e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,18 @@
+CC = c99
+CFLAGS = -Wall -Wextra -O3 -g3
+all: benchmark tests
+
+benchmark: test/benchmark.c utf8.h test/utf8-encode.h test/bj-utf8.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ test/benchmark.c $(LDLIBS)
+
+tests: test/tests.c utf8.h test/utf8-encode.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ test/tests.c $(LDLIBS)
+
+bench: benchmark
+	./benchmark
+
+check: tests
+	./tests
+
+clean:
+	rm -f benchmark tests
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b755bbf
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+# Branchless UTF-8 Decoder
diff --git a/UNLICENSE b/UNLICENSE
new file mode 100644
index 0000000..68a49da
--- /dev/null
+++ b/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/test/benchmark.c b/test/benchmark.c
new file mode 100644
index 0000000..ddc1d7d
--- /dev/null
+++ b/test/benchmark.c
@@ -0,0 +1,122 @@
+#define _POSIX_C_SOURCE 200112L
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#include <unistd.h> // alarm()
+
+#include "../utf8.h"
+#include "utf8-encode.h"
+#include "bj-utf8.h"
+
+#define SECONDS 5
+#define BUFLEN  64 // MB
+
+static uint32_t
+pcg32(uint64_t *s)
+{
+    uint64_t m = 0x9b60933458e17d7d;
+    uint64_t a = 0xd737232eeccdf7ed;
+    *s = *s * m + a;
+    int shift = 29 - (*s >> 61);
+    return *s >> shift;
+}
+
+static long
+randchar(uint64_t *s)
+{
+    uint32_t r = pcg32(s);
+    int len = 1 + (r & 0x3);
+    r >>= 2;
+    switch (len) {
+        case 1:
+            return r % 128;
+        case 2:
+            return 128 + r % (2048 - 128);
+        case 3:
+            return 2048 + r % (65536 - 2048);
+        case 4:
+            return 65536 + r % (131072 - 65536);
+    }
+    abort();
+}
+
+static volatile sig_atomic_t running;
+
+static void
+alarm_handler(int signum)
+{
+    (void)signum;
+    running = 0;
+}
+
+static void *
+buffer_fill(void *buf, size_t z)
+{
+    uint64_t s = 0;
+    char *p = buf;
+    char *end = p + z;
+    while (p < end) {
+        long c;
+        do
+            c = randchar(&s);
+        while (IS_SURROGATE(c));
+        p = utf8_encode(p, c);
+    }
+    return p;
+}
+
+int
+main(void)
+{
+    long errors, n;
+    size_t z = BUFLEN * 1024 * 1024;
+    unsigned char *buffer = malloc(z);
+    unsigned char *end = buffer_fill(buffer, z);
+
+    /* Benchmark the branchless decoder */
+    running = 1;
+    signal(SIGALRM, alarm_handler);
+    alarm(SECONDS);
+    errors = n = 0;
+    do {
+        unsigned char *p = buffer;
+        int e = 0;
+        long c;
+        long count = 0;
+        while (p < end) {
+            p = utf8_decode(p, &c, &e);
+            count++;
+        }
+        errors += !!e;
+        if (p == end)
+            n++;
+    } while (running);
+
+    double rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
+    printf("branchless: %f MB/s, %ld rounds, %ld errors\n", rate, n, errors);
+
+    /* Benchmark Bjoern Hoehrmann's decoder */
+    running = 1;
+    signal(SIGALRM, alarm_handler);
+    alarm(SECONDS);
+    errors = n = 0;
+    do {
+        unsigned char *p = buffer;
+        uint32_t c;
+        uint32_t state = 0;
+        long count = 0;
+        for (; p < end; p++)
+            if (!bj_utf8_decode(&state, &c, *p))
+                count++;
+        errors += state != UTF8_ACCEPT;
+        if (p == end)
+            n++;
+    } while (running);
+
+    rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
+    printf("Hoehrmann:  %f MB/s, %ld rounds, %ld errors\n", rate, n, errors);
+
+    free(buffer);
+}
diff --git a/test/bj-utf8.h b/test/bj-utf8.h
new file mode 100644
index 0000000..e8b262b
--- /dev/null
+++ b/test/bj-utf8.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 1
+
+static const uint8_t utf8d[] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+static uint32_t
+bj_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
+    uint32_t type = utf8d[byte];
+
+    *codep = (*state != UTF8_ACCEPT) ?
+        (byte & 0x3fu) | (*codep << 6) :
+        (0xff >> type) & (byte);
+
+    *state = utf8d[256 + *state*16 + type];
+    return *state;
+}
diff --git a/test/tests.c b/test/tests.c
new file mode 100644
index 0000000..3befc6d
--- /dev/null
+++ b/test/tests.c
@@ -0,0 +1,87 @@
+#include <stdio.h>
+
+#include "../utf8.h"
+#include "utf8-encode.h"
+
+static int count_pass;
+static int count_fail;
+
+#define TEST(x, s, ...) \
+    do { \
+        if (x) { \
+            printf("\033[32;1mPASS\033[0m " s "\n", __VA_ARGS__); \
+            count_pass++; \
+        } else { \
+            printf("\033[31;1mFAIL\033[0m " s "\n", __VA_ARGS__); \
+            count_fail++; \
+        } \
+    } while (0)
+
+int
+main(void)
+{
+    /* Make sure it can decode every character */
+    {
+        long failures = 0;
+        for (long i = 0; i < 0x10000L; i++) {
+            if (!IS_SURROGATE(i)) {
+                int e;
+                long c;
+                unsigned char buf[8] = {0};
+                unsigned char *end = utf8_encode(buf, i);
+                unsigned char *res = utf8_decode(buf, &c, &e);
+                failures += end != res || c != i || e;
+            }
+        }
+        TEST(failures == 0, "decode all, errors: %ld", failures);
+    }
+
+    /* Does it reject all surrogate halves? */
+    {
+        long failures = 0;
+        for (long i = 0xD800; i <= 0xDFFFU; i++) {
+            int e;
+            long c;
+            unsigned char buf[8] = {0};
+            utf8_encode(buf, i);
+            utf8_decode(buf, &c, &e);
+            failures += !e;
+        }
+        TEST(failures == 0, "surrogate halves, errors: %ld", failures);
+    }
+
+    /* How about non-canonical encodings? */
+    {
+        int e;
+        long c;
+        unsigned char *end;
+
+        unsigned char buf2[8] = {0xc0, 0xA4};
+        end = utf8_decode(buf2, &c, &e);
+        TEST(e, "non-canonical len 2, 0x%02x", e);
+        TEST(end == buf2 + 2, "non-canonical recover 2, U+%04lx", c);
+
+        unsigned char buf3[8] = {0xe0, 0x80, 0xA4};
+        end = utf8_decode(buf3, &c, &e);
+        TEST(e, "non-canonical len 3, 0x%02x", e);
+        TEST(end == buf3 + 3, "non-canonical recover 3, U+%04lx", c);
+
+        unsigned char buf4[8] = {0xf0, 0x80, 0x80, 0xA4};
+        end = utf8_decode(buf4, &c, &e);
+        TEST(e, "non-canonical encoding len 4, 0x%02x", e);
+        TEST(end == buf4 + 4, "non-canonical recover 4, U+%04lx", c);
+    }
+
+    /* Let's try some bogus byte sequences */
+    {
+        int e;
+        long c;
+
+        unsigned char buf[4] = {0xff};
+        int len = (unsigned char *)utf8_decode(buf, &c, &e) - buf;
+        TEST(e, "bogus ff 00 00 00, 0x%02x, U+%04lx, len=%d", e, c, len);
+    }
+
+    printf("%d fail, %d pass\n", count_fail, count_pass);
+    return count_fail != 0;
+}
diff --git a/test/utf8-encode.h b/test/utf8-encode.h
new file mode 100644
index 0000000..73113e7
--- /dev/null
+++ b/test/utf8-encode.h
@@ -0,0 +1,31 @@
+#ifndef UTF8_ENCODE
+#define UTF8_ENCODE
+
+#define IS_SURROGATE(c) ((c) >= 0xD800U && (c) <= 0xDFFFU)
+
+static void *
+utf8_encode(void *buf, long c)
+{
+    unsigned char *s = buf;
+    if (c >= (1L << 16)) {
+        s[0] = 0xf0 |  (c >> 18);
+        s[1] = 0x80 | ((c >> 12) & 0x3f);
+        s[2] = 0x80 | ((c >>  6) & 0x3f);
+        s[3] = 0x80 | ((c >>  0) & 0x3f);
+        return s + 4;
+    } else if (c >= (1L << 11)) {
+        s[0] = 0xe0 |  (c >> 12);
+        s[1] = 0x80 | ((c >>  6) & 0x3f);
+        s[2] = 0x80 | ((c >>  0) & 0x3f);
+        return s + 3;
+    } else if (c >= (1L << 7)) {
+        s[0] = 0xc0 |  (c >>  6);
+        s[1] = 0x80 | ((c >>  0) & 0x3f);
+        return s + 2;
+    } else {
+        s[0] = c;
+        return s + 1;
+    }
+}
+
+#endif
diff --git a/utf8.h b/utf8.h
new file mode 100644
index 0000000..66d7840
--- /dev/null
+++ b/utf8.h
@@ -0,0 +1,59 @@
+/* Branchless UTF-8 decoder
+ * Chris Wellons
+ *
+ * This is free and unencumbered software released into the public domain.
+ */
+#ifndef UTF8_H
+#define UTF8_H
+
+/* Decode the next character, C, from BUF, reporting errors in E.
+ *
+ * Since this is a branchless decoder, four bytes will be read from the
+ * buffer regardless of the actual length of the next character. This
+ * means the buffer _must_ have at least three zero-padding bytes
+ * following the end of the data stream.
+ *
+ * Errors are reported in E, which will be non-zero if the parsed
+ * character was somehow invalid: invalid byte sequence, non-canonical
+ * encoding, or a surrogate half.
+ */
+static void *
+utf8_decode(void *buf, long *c, int *e) {
+    static const char utf8_lengths[] = {
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
+    };
+    static const unsigned char masks[] = {
+        0x00, 0x7f, 0x1f, 0x0f, 0x07
+    };
+    static const char thresholds[] = {
+        22, 0, 7, 11, 16
+    };
+    static const char shiftc[] = {
+        0, 18, 12, 6, 0
+    };
+    static const char shifte[] = {
+        0, 6, 4, 2, 0
+    };
+
+    unsigned char *s = buf;
+    int len = utf8_lengths[s[0] >> 3];
+
+    *c  = (s[0] & masks[len]) << 18;
+    *c |= (s[1] & 0x3fU) << 12;
+    *c |= (s[2] & 0x3fU) <<  6;
+    *c |= (s[3] & 0x3fU) <<  0;
+    *c >>= shiftc[len];
+
+    *e  = (*c < (1L << thresholds[len]) - 1) << 6;
+    *e |= ((*c >> 11) == 0x1b) << 7;
+    *e |= (s[1] & 0xc0U) >> 2;
+    *e |= (s[2] & 0xc0U) >> 4;
+    *e |= (s[3]        ) >> 6;
+    *e ^= 0x2aU;
+    *e >>= shifte[len];
+
+    return s + len;
+}
+
+#endif
author	Christopher Wellons <wellons@nullprogram.com>	2017-10-05 23:07:01 -0400
committer	Christopher Wellons <wellons@nullprogram.com>	2017-10-05 23:07:01 -0400
commit	a4db22221bbdbd1fb58cc6f6cc67bca1221aba6b (patch)
tree	33b36651ab71a6bc141665a7ff78913d13153f81