summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristopher Wellons <wellons@nullprogram.com>2017-10-08 12:43:48 -0400
committerChristopher Wellons <wellons@nullprogram.com>2017-10-08 12:44:18 -0400
commit2198501b0c79aa68cda3f47d902524c015147bf5 (patch)
tree3697d93cff662e2410ebc9ecdf9f164d9c33b26c
parent3c74988876ee04145e83fd79e916b5a76c946e66 (diff)
Prefer a faster variant of Hoehrmann's DFA decoder (#1)
-rw-r--r--test/bh-utf8.h42
1 files changed, 42 insertions, 0 deletions
diff --git a/test/bh-utf8.h b/test/bh-utf8.h
index e0f4efb..d93cf06 100644
--- a/test/bh-utf8.h
+++ b/test/bh-utf8.h
@@ -1,6 +1,8 @@
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+#ifdef BH_ORIGINAL
+
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1
@@ -32,3 +34,43 @@ bh_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
*state = utf8d[256 + *state*16 + type];
return *state;
}
+
+#else
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+static const uint8_t utf8d[] = {
+ // The first part of the table maps bytes to character classes that
+ // to reduce the size of the transition table and create bitmasks.
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+ // The second part is a transition table that maps a combination
+ // of a state of the automaton and a character class to a state.
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+ 12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+static uint32_t
+bh_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
+ uint32_t type = utf8d[byte];
+
+ *codep = (*state != UTF8_ACCEPT) ?
+ (byte & 0x3fu) | (*codep << 6) :
+ (0xff >> type) & (byte);
+
+ *state = utf8d[256 + *state + type];
+ return *state;
+}
+
+#endif