From 49caeee5ab4736f69c339340b889a1f6fb74560d Mon Sep 17 00:00:00 2001
From: Christopher Wellons
Date: Fri, 6 Oct 2017 13:58:15 -0400
Subject: Adjust some comments again

---
 utf8.h | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/utf8.h b/utf8.h
index d4444db..6e26ebd 100644
--- a/utf8.h
+++ b/utf8.h
@@ -11,35 +11,39 @@
  *
  * Since this is a branchless decoder, four bytes will be read from the
  * buffer regardless of the actual length of the next character. This
- * means the buffer _must_ have at least three zero-padding bytes
+ * means the buffer _must_ have at least three bytes of zero padding
  * following the end of the data stream.
  *
  * Errors are reported in E, which will be non-zero if the parsed
  * character was somehow invalid: invalid byte sequence, non-canonical
  * encoding, or a surrogate half.
+ *
+ * The function returns a pointer to the next character. When an error
+ * occurs, this pointer will be a guess that depends on the particular
+ * error, but it will always advance at least one byte.
  */
 static void *
 utf8_decode(void *buf, uint32_t *c, int *e) {
-    static const char utf8_lengths[] = {
+    static const char lengths[] = {
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
     };
     static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
-    static const uint32_t thresh[] = {4194304, 0, 128, 2048, 65536};
+    static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
     static const int shiftc[] = {0, 18, 12, 6, 0};
     static const int shifte[] = {0, 6, 4, 2, 0};
 
     unsigned char *s = buf;
-    int len = utf8_lengths[s[0] >> 3];
+    int len = lengths[s[0] >> 3];
 
-    /* Compute the pointer to the next character early so that, when
-     * unrolled, the next iteration can start working on the next
-     * character. Neither Clang nor GCC figure this out on their own.
+    /* Compute the pointer to the next character early so that the next
+     * iteration can start working on the next character. Neither Clang
+     * nor GCC figure out this reordering on their own.
      */
     unsigned char *next = s + len + !len;
 
-    /* Assume this is a four-byte character and load all four bytes.
-     * Unused bits will be shifted out later.
+    /* Assume a four-byte character and load four bytes. Unused bits are
+     * shifted out.
      */
     *c = (uint32_t)(s[0] & masks[len]) << 18;
     *c |= (uint32_t)(s[1] & 0x3f) << 12;
@@ -48,7 +52,7 @@ utf8_decode(void *buf, uint32_t *c, int *e) {
     *c >>= shiftc[len];
 
     /* Accumulate the various error conditions. */
-    *e = (*c < thresh[len]) << 6;
+    *e = (*c < mins[len]) << 6;
     *e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
     *e |= (s[1] & 0xc0) >> 2;
     *e |= (s[2] & 0xc0) >> 4;
-- 
cgit v1.2.3