summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h34
1 files changed, 23 insertions, 11 deletions
diff --git a/utf8.h b/utf8.h
index 1942cb4..19eef17 100644
--- a/utf8.h
+++ b/utf8.h
@@ -5,6 +5,8 @@
#ifndef UTF8_H
#define UTF8_H
+#include <stdint.h>
+
/* Decode the next character, C, from BUF, reporting errors in E.
*
* Since this is a branchless decoder, four bytes will be read from the
@@ -17,7 +19,7 @@
* encoding, or a surrogate half.
*/
static void *
-utf8_decode(void *buf, long *c, int *e) {
+utf8_decode(void *buf, uint32_t *c, int *e) {
static const char utf8_lengths[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
@@ -30,21 +32,31 @@ utf8_decode(void *buf, long *c, int *e) {
unsigned char *s = buf;
int len = utf8_lengths[s[0] >> 3];
- *c = (s[0] & masks[len]) << 18;
- *c |= (s[1] & 0x3fU) << 12;
- *c |= (s[2] & 0x3fU) << 6;
- *c |= (s[3] & 0x3fU) << 0;
+ /* Compute the pointer to the next character early so that, when
+ * unrolled, the next iteration can start working on the next
+ * character. Neither Clang nor GCC figure this out on their own.
+ */
+ unsigned char *next = s + len + !len;
+
+ /* Assume this is a four-byte character and load all four bytes.
+ * Unused bits will be shifted out later.
+ */
+ *c = (uint32_t)(s[0] & masks[len]) << 18;
+ *c |= (uint32_t)(s[1] & 0x3f) << 12;
+ *c |= (uint32_t)(s[2] & 0x3f) << 6;
+ *c |= (uint32_t)(s[3] & 0x3f) << 0;
*c >>= shiftc[len];
- *e = (*c < (1L << thresh[len]) - 1) << 6;
+ /* Accumulate the various error conditions. */
+ *e = (*c < (UINT32_C(1) << thresh[len]) - 1) << 6;
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
- *e |= (s[1] & 0xc0U) >> 2;
- *e |= (s[2] & 0xc0U) >> 4;
- *e |= (s[3] ) >> 6;
- *e ^= 0x2aU; // top two bits of each tail byte correct?
+ *e |= (s[1] & 0xc0) >> 2;
+ *e |= (s[2] & 0xc0) >> 4;
+ *e |= (s[3] ) >> 6;
+ *e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];
- return s + len + !len;
+ return next;
}
#endif