diff options
author | Christopher Wellons <wellons@nullprogram.com> | 2017-10-05 23:07:01 -0400 |
---|---|---|
committer | Christopher Wellons <wellons@nullprogram.com> | 2017-10-05 23:07:01 -0400 |
commit | a4db22221bbdbd1fb58cc6f6cc67bca1221aba6b (patch) | |
tree | 33b36651ab71a6bc141665a7ff78913d13153f81 |
Initial working code
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | Makefile | 18 | ||||
-rw-r--r-- | README.md | 1 | ||||
-rw-r--r-- | UNLICENSE | 24 | ||||
-rw-r--r-- | test/benchmark.c | 122 | ||||
-rw-r--r-- | test/bj-utf8.h | 34 | ||||
-rw-r--r-- | test/tests.c | 87 | ||||
-rw-r--r-- | test/utf8-encode.h | 31 | ||||
-rw-r--r-- | utf8.h | 59 |
9 files changed, 378 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5d2ecae --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +tests +benchmark diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..2f2928e --- /dev/null +++ b/Makefile @@ -0,0 +1,18 @@ +CC = c99 +CFLAGS = -Wall -Wextra -O3 -g3 +all: benchmark tests + +benchmark: test/benchmark.c utf8.h test/utf8-encode.h test/bj-utf8.h + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ test/benchmark.c $(LDLIBS) + +tests: test/tests.c utf8.h test/utf8-encode.h + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ test/tests.c $(LDLIBS) + +bench: benchmark + ./benchmark + +check: tests + ./tests + +clean: + rm -f benchmark tests diff --git a/README.md b/README.md new file mode 100644 index 0000000..b755bbf --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# Branchless UTF-8 Decoder diff --git a/UNLICENSE b/UNLICENSE new file mode 100644 index 0000000..68a49da --- /dev/null +++ b/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <http://unlicense.org/> diff --git a/test/benchmark.c b/test/benchmark.c new file mode 100644 index 0000000..ddc1d7d --- /dev/null +++ b/test/benchmark.c @@ -0,0 +1,122 @@ +#define _POSIX_C_SOURCE 200112L +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <signal.h> + +#include <unistd.h> // alarm() + +#include "../utf8.h" +#include "utf8-encode.h" +#include "bj-utf8.h" + +#define SECONDS 5 +#define BUFLEN 64 // MB + +static uint32_t +pcg32(uint64_t *s) +{ + uint64_t m = 0x9b60933458e17d7d; + uint64_t a = 0xd737232eeccdf7ed; + *s = *s * m + a; + int shift = 29 - (*s >> 61); + return *s >> shift; +} + +static long +randchar(uint64_t *s) +{ + uint32_t r = pcg32(s); + int len = 1 + (r & 0x3); + r >>= 2; + switch (len) { + case 1: + return r % 128; + case 2: + return 128 + r % (2048 - 128); + case 3: + return 2048 + r % (65536 - 2048); + case 4: + return 65536 + r % (131072 - 65536); + } + abort(); +} + +static volatile sig_atomic_t running; + +static void +alarm_handler(int signum) +{ + (void)signum; + running = 0; +} + +static void * +buffer_fill(void *buf, size_t z) +{ + uint64_t s = 0; + char *p = buf; + char *end = p + z; + while (p < end) { + long c; + do + c = randchar(&s); + while (IS_SURROGATE(c)); + p = utf8_encode(p, c); + } + return p; +} + +int +main(void) +{ + long errors, n; + size_t z = BUFLEN * 1024 * 1024; + unsigned char *buffer = malloc(z); + unsigned char *end = buffer_fill(buffer, z); + + /* Benchmark the branchless decoder */ + running = 1; + signal(SIGALRM, alarm_handler); + alarm(SECONDS); + errors = n = 0; + do { + unsigned char *p = buffer; + int e = 0; + long c; + long count = 0; + while (p < end) { + p = utf8_decode(p, &c, &e); + count++; + } + errors += !!e; + if (p == end) + n++; + } while (running); + + double rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024; + printf("branchless: %f MB/s, %ld rounds, %ld errors\n", rate, n, errors); + + /* Benchmark Bjoern Hoehrmann's decoder */ + running = 1; + signal(SIGALRM, alarm_handler); + alarm(SECONDS); + errors = n = 0; + do { + unsigned char *p = buffer; + uint32_t c; + uint32_t state = 0; + long count = 0; + for (; p < end; p++) + if (!bj_utf8_decode(&state, &c, *p)) + count++; + errors += state != UTF8_ACCEPT; + if (p == end) + n++; + } while (running); + + rate = n * (end - buffer) / (double)SECONDS / 1024 / 1024; + printf("Hoehrmann: %f MB/s, %ld rounds, %ld errors\n", rate, n, errors); + + free(buffer); +} diff --git a/test/bj-utf8.h b/test/bj-utf8.h new file mode 100644 index 0000000..e8b262b --- /dev/null +++ b/test/bj-utf8.h @@ -0,0 +1,34 @@ +// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 1 + +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +static uint32_t +bj_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} diff --git a/test/tests.c b/test/tests.c new file mode 100644 index 0000000..3befc6d --- /dev/null +++ b/test/tests.c @@ -0,0 +1,87 @@ +#include <stdio.h> + +#include "../utf8.h" +#include "utf8-encode.h" + +static int count_pass; +static int count_fail; + +#define TEST(x, s, ...) \ + do { \ + if (x) { \ + printf("\033[32;1mPASS\033[0m " s "\n", __VA_ARGS__); \ + count_pass++; \ + } else { \ + printf("\033[31;1mFAIL\033[0m " s "\n", __VA_ARGS__); \ + count_fail++; \ + } \ + } while (0) + +int +main(void) +{ + /* Make sure it can decode every character */ + { + long failures = 0; + for (long i = 0; i < 0x10000L; i++) { + if (!IS_SURROGATE(i)) { + int e; + long c; + unsigned char buf[8] = {0}; + unsigned char *end = utf8_encode(buf, i); + unsigned char *res = utf8_decode(buf, &c, &e); + failures += end != res || c != i || e; + } + } + TEST(failures == 0, "decode all, errors: %ld", failures); + } + + /* Does it reject all surrogate halves? */ + { + long failures = 0; + for (long i = 0xD800; i <= 0xDFFFU; i++) { + int e; + long c; + unsigned char buf[8] = {0}; + utf8_encode(buf, i); + utf8_decode(buf, &c, &e); + failures += !e; + } + TEST(failures == 0, "surrogate halves, errors: %ld", failures); + } + + /* How about non-canonical encodings? */ + { + int e; + long c; + unsigned char *end; + + unsigned char buf2[8] = {0xc0, 0xA4}; + end = utf8_decode(buf2, &c, &e); + TEST(e, "non-canonical len 2, 0x%02x", e); + TEST(end == buf2 + 2, "non-canonical recover 2, U+%04lx", c); + + unsigned char buf3[8] = {0xe0, 0x80, 0xA4}; + end = utf8_decode(buf3, &c, &e); + TEST(e, "non-canonical len 3, 0x%02x", e); + TEST(end == buf3 + 3, "non-canonical recover 3, U+%04lx", c); + + unsigned char buf4[8] = {0xf0, 0x80, 0x80, 0xA4}; + end = utf8_decode(buf4, &c, &e); + TEST(e, "non-canonical encoding len 4, 0x%02x", e); + TEST(end == buf4 + 4, "non-canonical recover 4, U+%04lx", c); + } + + /* Let's try some bogus byte sequences */ + { + int e; + long c; + + unsigned char buf[4] = {0xff}; + int len = (unsigned char *)utf8_decode(buf, &c, &e) - buf; + TEST(e, "bogus ff 00 00 00, 0x%02x, U+%04lx, len=%d", e, c, len); + } + + printf("%d fail, %d pass\n", count_fail, count_pass); + return count_fail != 0; +} diff --git a/test/utf8-encode.h b/test/utf8-encode.h new file mode 100644 index 0000000..73113e7 --- /dev/null +++ b/test/utf8-encode.h @@ -0,0 +1,31 @@ +#ifndef UTF8_ENCODE +#define UTF8_ENCODE + +#define IS_SURROGATE(c) ((c) >= 0xD800U && (c) <= 0xDFFFU) + +static void * +utf8_encode(void *buf, long c) +{ + unsigned char *s = buf; + if (c >= (1L << 16)) { + s[0] = 0xf0 | (c >> 18); + s[1] = 0x80 | ((c >> 12) & 0x3f); + s[2] = 0x80 | ((c >> 6) & 0x3f); + s[3] = 0x80 | ((c >> 0) & 0x3f); + return s + 4; + } else if (c >= (1L << 11)) { + s[0] = 0xe0 | (c >> 12); + s[1] = 0x80 | ((c >> 6) & 0x3f); + s[2] = 0x80 | ((c >> 0) & 0x3f); + return s + 3; + } else if (c >= (1L << 7)) { + s[0] = 0xc0 | (c >> 6); + s[1] = 0x80 | ((c >> 0) & 0x3f); + return s + 2; + } else { + s[0] = c; + return s + 1; + } +} + +#endif @@ -0,0 +1,59 @@ +/* Branchless UTF-8 decoder + * Chris Wellons + * + * This is free and unencumbered software released into the public domain. + */ +#ifndef UTF8_H +#define UTF8_H + +/* Decode the next character, C, from BUF, reporting errors in E. + * + * Since this is a branchless decoder, four bytes will be read from the + * buffer regardless of the actual length of the next character. This + * means the buffer _must_ have at least three zero-padding bytes + * following the end of the data stream. + * + * Errors are reported in E, which will be non-zero if the parsed + * character was somehow invalid: invalid byte sequence, non-canonical + * encoding, or a surrogate half. + */ +static void * +utf8_decode(void *buf, long *c, int *e) { + static const char utf8_lengths[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0 + }; + static const unsigned char masks[] = { + 0x00, 0x7f, 0x1f, 0x0f, 0x07 + }; + static const char thresholds[] = { + 22, 0, 7, 11, 16 + }; + static const char shiftc[] = { + 0, 18, 12, 6, 0 + }; + static const char shifte[] = { + 0, 6, 4, 2, 0 + }; + + unsigned char *s = buf; + int len = utf8_lengths[s[0] >> 3]; + + *c = (s[0] & masks[len]) << 18; + *c |= (s[1] & 0x3fU) << 12; + *c |= (s[2] & 0x3fU) << 6; + *c |= (s[3] & 0x3fU) << 0; + *c >>= shiftc[len]; + + *e = (*c < (1L << thresholds[len]) - 1) << 6; + *e |= ((*c >> 11) == 0x1b) << 7; + *e |= (s[1] & 0xc0U) >> 2; + *e |= (s[2] & 0xc0U) >> 4; + *e |= (s[3] ) >> 6; + *e ^= 0x2aU; + *e >>= shifte[len]; + + return s + len; +} + +#endif |