diff options
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 34 |
1 files changed, 23 insertions, 11 deletions
@@ -5,6 +5,8 @@ #ifndef UTF8_H #define UTF8_H +#include <stdint.h> + /* Decode the next character, C, from BUF, reporting errors in E. * * Since this is a branchless decoder, four bytes will be read from the @@ -17,7 +19,7 @@ * encoding, or a surrogate half. */ static void * -utf8_decode(void *buf, long *c, int *e) { +utf8_decode(void *buf, uint32_t *c, int *e) { static const char utf8_lengths[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0 @@ -30,21 +32,31 @@ utf8_decode(void *buf, long *c, int *e) { unsigned char *s = buf; int len = utf8_lengths[s[0] >> 3]; - *c = (s[0] & masks[len]) << 18; - *c |= (s[1] & 0x3fU) << 12; - *c |= (s[2] & 0x3fU) << 6; - *c |= (s[3] & 0x3fU) << 0; + /* Compute the pointer to the next character early so that, when + * unrolled, the next iteration can start working on the next + * character. Neither Clang nor GCC figure this out on their own. + */ + unsigned char *next = s + len + !len; + + /* Assume this is a four-byte character and load all four bytes. + * Unused bits will be shifted out later. + */ + *c = (uint32_t)(s[0] & masks[len]) << 18; + *c |= (uint32_t)(s[1] & 0x3f) << 12; + *c |= (uint32_t)(s[2] & 0x3f) << 6; + *c |= (uint32_t)(s[3] & 0x3f) << 0; *c >>= shiftc[len]; - *e = (*c < (1L << thresh[len]) - 1) << 6; + /* Accumulate the various error conditions. */ + *e = (*c < (UINT32_C(1) << thresh[len]) - 1) << 6; *e |= ((*c >> 11) == 0x1b) << 7; // surrogate half? - *e |= (s[1] & 0xc0U) >> 2; - *e |= (s[2] & 0xc0U) >> 4; - *e |= (s[3] ) >> 6; - *e ^= 0x2aU; // top two bits of each tail byte correct? + *e |= (s[1] & 0xc0) >> 2; + *e |= (s[2] & 0xc0) >> 4; + *e |= (s[3] ) >> 6; + *e ^= 0x2a; // top two bits of each tail byte correct? *e >>= shifte[len]; - return s + len + !len; + return next; } #endif |