summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristopher Wellons <wellons@nullprogram.com>2017-10-05 23:07:01 -0400
committerChristopher Wellons <wellons@nullprogram.com>2017-10-05 23:07:01 -0400
commita4db22221bbdbd1fb58cc6f6cc67bca1221aba6b (patch)
tree33b36651ab71a6bc141665a7ff78913d13153f81
Initial working code
-rw-r--r--.gitignore2
-rw-r--r--Makefile18
-rw-r--r--README.md1
-rw-r--r--UNLICENSE24
-rw-r--r--test/benchmark.c122
-rw-r--r--test/bj-utf8.h34
-rw-r--r--test/tests.c87
-rw-r--r--test/utf8-encode.h31
-rw-r--r--utf8.h59
9 files changed, 378 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5d2ecae
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+tests
+benchmark
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..2f2928e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,18 @@
+CC = c99
+CFLAGS = -Wall -Wextra -O3 -g3
+all: benchmark tests
+
+benchmark: test/benchmark.c utf8.h test/utf8-encode.h test/bj-utf8.h
+ $(CC) $(CFLAGS) $(LDFLAGS) -o $@ test/benchmark.c $(LDLIBS)
+
+tests: test/tests.c utf8.h test/utf8-encode.h
+ $(CC) $(CFLAGS) $(LDFLAGS) -o $@ test/tests.c $(LDLIBS)
+
+bench: benchmark
+ ./benchmark
+
+check: tests
+ ./tests
+
+clean:
+ rm -f benchmark tests
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b755bbf
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+# Branchless UTF-8 Decoder
diff --git a/UNLICENSE b/UNLICENSE
new file mode 100644
index 0000000..68a49da
--- /dev/null
+++ b/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/test/benchmark.c b/test/benchmark.c
new file mode 100644
index 0000000..ddc1d7d
--- /dev/null
+++ b/test/benchmark.c
@@ -0,0 +1,122 @@
+#define _POSIX_C_SOURCE 200112L
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#include <unistd.h> // alarm()
+
+#include "../utf8.h"
+#include "utf8-encode.h"
+#include "bj-utf8.h"
+
+#define SECONDS 5
+#define BUFLEN 64 // MB
+
+static uint32_t
+pcg32(uint64_t *s)
+{
+ uint64_t m = 0x9b60933458e17d7d;
+ uint64_t a = 0xd737232eeccdf7ed;
+ *s = *s * m + a;
+ int shift = 29 - (*s >> 61);
+ return *s >> shift;
+}
+
+static long
+randchar(uint64_t *s)
+{
+ uint32_t r = pcg32(s);
+ int len = 1 + (r & 0x3);
+ r >>= 2;
+ switch (len) {
+ case 1:
+ return r % 128;
+ case 2:
+ return 128 + r % (2048 - 128);
+ case 3:
+ return 2048 + r % (65536 - 2048);
+ case 4:
+ return 65536 + r % (131072 - 65536);
+ }
+ abort();
+}
+
+static volatile sig_atomic_t running;
+
+static void
+alarm_handler(int signum)
+{
+ (void)signum;
+ running = 0;
+}
+
+static void *
+buffer_fill(void *buf, size_t z)
+{
+ uint64_t s = 0;
+ char *p = buf;
+ char *end = p + z;
+ while (p < end) {
+ long c;
+ do
+ c = randchar(&s);
+ while (IS_SURROGATE(c));
+ p = utf8_encode(p, c);
+ }
+ return p;
+}
+
+int
+main(void)
+{
+ long errors, n;
+ size_t z = BUFLEN * 1024 * 1024;
+ unsigned char *buffer = malloc(z);
+ unsigned char *end = buffer_fill(buffer, z);
+
+ /* Benchmark the branchless decoder */
+ running = 1;
+ signal(SIGALRM, alarm_handler);
+ alarm(SECONDS);
+ errors = n = 0;
+ do {
+ unsigned char *p = buffer;
+ int e = 0;
+ long c;
+ long count = 0;
+ while (p < end) {
+ p = utf8_decode(p, &c, &e);
+ count++;
+ }
+ errors += !!e;
+ if (p == end)
+ n++;
+ } while (running);
+
+ double rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
+ printf("branchless: %f MB/s, %ld rounds, %ld errors\n", rate, n, errors);
+
+ /* Benchmark Bjoern Hoehrmann's decoder */
+ running = 1;
+ signal(SIGALRM, alarm_handler);
+ alarm(SECONDS);
+ errors = n = 0;
+ do {
+ unsigned char *p = buffer;
+ uint32_t c;
+ uint32_t state = 0;
+ long count = 0;
+ for (; p < end; p++)
+ if (!bj_utf8_decode(&state, &c, *p))
+ count++;
+ errors += state != UTF8_ACCEPT;
+ if (p == end)
+ n++;
+ } while (running);
+
+ rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024;
+ printf("Hoehrmann: %f MB/s, %ld rounds, %ld errors\n", rate, n, errors);
+
+ free(buffer);
+}
diff --git a/test/bj-utf8.h b/test/bj-utf8.h
new file mode 100644
index 0000000..e8b262b
--- /dev/null
+++ b/test/bj-utf8.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 1
+
+static const uint8_t utf8d[] = {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+ 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+ 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+ 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+ 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+ 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+ 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+static uint32_t
+bj_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
+ uint32_t type = utf8d[byte];
+
+ *codep = (*state != UTF8_ACCEPT) ?
+ (byte & 0x3fu) | (*codep << 6) :
+ (0xff >> type) & (byte);
+
+ *state = utf8d[256 + *state*16 + type];
+ return *state;
+}
diff --git a/test/tests.c b/test/tests.c
new file mode 100644
index 0000000..3befc6d
--- /dev/null
+++ b/test/tests.c
@@ -0,0 +1,87 @@
+#include <stdio.h>
+
+#include "../utf8.h"
+#include "utf8-encode.h"
+
+static int count_pass;
+static int count_fail;
+
+#define TEST(x, s, ...) \
+ do { \
+ if (x) { \
+ printf("\033[32;1mPASS\033[0m " s "\n", __VA_ARGS__); \
+ count_pass++; \
+ } else { \
+ printf("\033[31;1mFAIL\033[0m " s "\n", __VA_ARGS__); \
+ count_fail++; \
+ } \
+ } while (0)
+
+int
+main(void)
+{
+ /* Make sure it can decode every character */
+ {
+ long failures = 0;
+ for (long i = 0; i < 0x10000L; i++) {
+ if (!IS_SURROGATE(i)) {
+ int e;
+ long c;
+ unsigned char buf[8] = {0};
+ unsigned char *end = utf8_encode(buf, i);
+ unsigned char *res = utf8_decode(buf, &c, &e);
+ failures += end != res || c != i || e;
+ }
+ }
+ TEST(failures == 0, "decode all, errors: %ld", failures);
+ }
+
+ /* Does it reject all surrogate halves? */
+ {
+ long failures = 0;
+ for (long i = 0xD800; i <= 0xDFFFU; i++) {
+ int e;
+ long c;
+ unsigned char buf[8] = {0};
+ utf8_encode(buf, i);
+ utf8_decode(buf, &c, &e);
+ failures += !e;
+ }
+ TEST(failures == 0, "surrogate halves, errors: %ld", failures);
+ }
+
+ /* How about non-canonical encodings? */
+ {
+ int e;
+ long c;
+ unsigned char *end;
+
+ unsigned char buf2[8] = {0xc0, 0xA4};
+ end = utf8_decode(buf2, &c, &e);
+ TEST(e, "non-canonical len 2, 0x%02x", e);
+ TEST(end == buf2 + 2, "non-canonical recover 2, U+%04lx", c);
+
+ unsigned char buf3[8] = {0xe0, 0x80, 0xA4};
+ end = utf8_decode(buf3, &c, &e);
+ TEST(e, "non-canonical len 3, 0x%02x", e);
+ TEST(end == buf3 + 3, "non-canonical recover 3, U+%04lx", c);
+
+ unsigned char buf4[8] = {0xf0, 0x80, 0x80, 0xA4};
+ end = utf8_decode(buf4, &c, &e);
+ TEST(e, "non-canonical encoding len 4, 0x%02x", e);
+ TEST(end == buf4 + 4, "non-canonical recover 4, U+%04lx", c);
+ }
+
+ /* Let's try some bogus byte sequences */
+ {
+ int e;
+ long c;
+
+ unsigned char buf[4] = {0xff};
+ int len = (unsigned char *)utf8_decode(buf, &c, &e) - buf;
+ TEST(e, "bogus ff 00 00 00, 0x%02x, U+%04lx, len=%d", e, c, len);
+ }
+
+ printf("%d fail, %d pass\n", count_fail, count_pass);
+ return count_fail != 0;
+}
diff --git a/test/utf8-encode.h b/test/utf8-encode.h
new file mode 100644
index 0000000..73113e7
--- /dev/null
+++ b/test/utf8-encode.h
@@ -0,0 +1,31 @@
+#ifndef UTF8_ENCODE
+#define UTF8_ENCODE
+
+#define IS_SURROGATE(c) ((c) >= 0xD800U && (c) <= 0xDFFFU)
+
+static void *
+utf8_encode(void *buf, long c)
+{
+ unsigned char *s = buf;
+ if (c >= (1L << 16)) {
+ s[0] = 0xf0 | (c >> 18);
+ s[1] = 0x80 | ((c >> 12) & 0x3f);
+ s[2] = 0x80 | ((c >> 6) & 0x3f);
+ s[3] = 0x80 | ((c >> 0) & 0x3f);
+ return s + 4;
+ } else if (c >= (1L << 11)) {
+ s[0] = 0xe0 | (c >> 12);
+ s[1] = 0x80 | ((c >> 6) & 0x3f);
+ s[2] = 0x80 | ((c >> 0) & 0x3f);
+ return s + 3;
+ } else if (c >= (1L << 7)) {
+ s[0] = 0xc0 | (c >> 6);
+ s[1] = 0x80 | ((c >> 0) & 0x3f);
+ return s + 2;
+ } else {
+ s[0] = c;
+ return s + 1;
+ }
+}
+
+#endif
diff --git a/utf8.h b/utf8.h
new file mode 100644
index 0000000..66d7840
--- /dev/null
+++ b/utf8.h
@@ -0,0 +1,59 @@
+/* Branchless UTF-8 decoder
+ * Chris Wellons
+ *
+ * This is free and unencumbered software released into the public domain.
+ */
+#ifndef UTF8_H
+#define UTF8_H
+
+/* Decode the next character, C, from BUF, reporting errors in E.
+ *
+ * Since this is a branchless decoder, four bytes will be read from the
+ * buffer regardless of the actual length of the next character. This
+ * means the buffer _must_ have at least three zero-padding bytes
+ * following the end of the data stream.
+ *
+ * Errors are reported in E, which will be non-zero if the parsed
+ * character was somehow invalid: invalid byte sequence, non-canonical
+ * encoding, or a surrogate half.
+ */
+static void *
+utf8_decode(void *buf, long *c, int *e) {
+ static const char utf8_lengths[] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
+ };
+ static const unsigned char masks[] = {
+ 0x00, 0x7f, 0x1f, 0x0f, 0x07
+ };
+ static const char thresholds[] = {
+ 22, 0, 7, 11, 16
+ };
+ static const char shiftc[] = {
+ 0, 18, 12, 6, 0
+ };
+ static const char shifte[] = {
+ 0, 6, 4, 2, 0
+ };
+
+ unsigned char *s = buf;
+ int len = utf8_lengths[s[0] >> 3];
+
+ *c = (s[0] & masks[len]) << 18;
+ *c |= (s[1] & 0x3fU) << 12;
+ *c |= (s[2] & 0x3fU) << 6;
+ *c |= (s[3] & 0x3fU) << 0;
+ *c >>= shiftc[len];
+
+ *e = (*c < (1L << thresholds[len]) - 1) << 6;
+ *e |= ((*c >> 11) == 0x1b) << 7;
+ *e |= (s[1] & 0xc0U) >> 2;
+ *e |= (s[2] & 0xc0U) >> 4;
+ *e |= (s[3] ) >> 6;
+ *e ^= 0x2aU;
+ *e >>= shifte[len];
+
+ return s + len;
+}
+
+#endif