From 15924e1250595989805c5994383b58224cce0b8b Mon Sep 17 00:00:00 2001 From: Christopher Wellons Date: Fri, 6 Oct 2017 11:28:26 -0400 Subject: Tweak types and the order that things are computed --- test/benchmark.c | 2 +- test/tests.c | 27 +++++++++++++++------------ utf8.h | 34 +++++++++++++++++++++++----------- 3 files changed, 39 insertions(+), 24 deletions(-) diff --git a/test/benchmark.c b/test/benchmark.c index b637e7b..4eb51da 100644 --- a/test/benchmark.c +++ b/test/benchmark.c @@ -87,7 +87,7 @@ main(void) do { unsigned char *p = buffer; int e = 0; - long c; + uint32_t c; long count = 0; while (p < end) { p = utf8_decode(p, &c, &e); diff --git a/test/tests.c b/test/tests.c index bb0b0d3..be94789 100644 --- a/test/tests.c +++ b/test/tests.c @@ -23,10 +23,10 @@ main(void) /* Make sure it can decode every character */ { long failures = 0; - for (long i = 0; i < 0x1ffffL; i++) { + for (unsigned long i = 0; i < 0x1ffff; i++) { if (!IS_SURROGATE(i)) { int e; - long c; + uint32_t c; unsigned char buf[8] = {0}; unsigned char *end = utf8_encode(buf, i); unsigned char *res = utf8_decode(buf, &c, &e); @@ -39,9 +39,9 @@ main(void) /* Does it reject all surrogate halves? */ { long failures = 0; - for (long i = 0xd800; i <= 0xdfff; i++) { + for (unsigned long i = 0xd800; i <= 0xdfff; i++) { int e; - long c; + uint32_t c; unsigned char buf[8] = {0}; utf8_encode(buf, i); utf8_decode(buf, &c, &e); @@ -53,46 +53,49 @@ main(void) /* How about non-canonical encodings? 
*/ { int e; - long c; + uint32_t c; unsigned char *end; unsigned char buf2[8] = {0xc0, 0xA4}; end = utf8_decode(buf2, &c, &e); TEST(e, "non-canonical len 2, 0x%02x", e); - TEST(end == buf2 + 2, "non-canonical recover 2, U+%04lx", c); + TEST(end == buf2 + 2, "non-canonical recover 2, U+%04lx", + (unsigned long)c); unsigned char buf3[8] = {0xe0, 0x80, 0xA4}; end = utf8_decode(buf3, &c, &e); TEST(e, "non-canonical len 3, 0x%02x", e); - TEST(end == buf3 + 3, "non-canonical recover 3, U+%04lx", c); + TEST(end == buf3 + 3, "non-canonical recover 3, U+%04lx", + (unsigned long)c); unsigned char buf4[8] = {0xf0, 0x80, 0x80, 0xA4}; end = utf8_decode(buf4, &c, &e); TEST(e, "non-canonical encoding len 4, 0x%02x", e); - TEST(end == buf4 + 4, "non-canonical recover 4, U+%04lx", c); + TEST(end == buf4 + 4, "non-canonical recover 4, U+%04lx", + (unsigned long)c); } /* Let's try some bogus byte sequences */ { int len, e; - long c; + uint32_t c; /* Invalid first byte */ unsigned char buf0[4] = {0xff}; len = (unsigned char *)utf8_decode(buf0, &c, &e) - buf0; - TEST(e, "bogus [ff] 0x%02x U+%04lx", e, c); + TEST(e, "bogus [ff] 0x%02x U+%04lx", e, (unsigned long)c); TEST(len == 1, "bogus [ff] recovery %d", len); /* Invalid first byte */ unsigned char buf1[4] = {0x80}; len = (unsigned char *)utf8_decode(buf1, &c, &e) - buf1; - TEST(e, "bogus [80] 0x%02x U+%04lx", e, c); + TEST(e, "bogus [80] 0x%02x U+%04lx", e, (unsigned long)c); TEST(len == 1, "bogus [80] recovery %d", len); /* Looks like a two-byte sequence but second byte is wrong */ unsigned char buf2[4] = {0xc0, 0x0a}; len = (unsigned char *)utf8_decode(buf2, &c, &e) - buf2; - TEST(e, "bogus [c0 0a] 0x%02x U+%04lx", e, c); + TEST(e, "bogus [c0 0a] 0x%02x U+%04lx", e, (unsigned long)c); TEST(len == 2, "bogus [c0 0a] recovery %d", len); } diff --git a/utf8.h b/utf8.h index 1942cb4..19eef17 100644 --- a/utf8.h +++ b/utf8.h @@ -5,6 +5,8 @@ #ifndef UTF8_H #define UTF8_H +#include <stdint.h> + /* Decode the next character, C, from BUF, reporting 
errors in E. * * Since this is a branchless decoder, four bytes will be read from the @@ -17,7 +19,7 @@ * encoding, or a surrogate half. */ static void * -utf8_decode(void *buf, long *c, int *e) { +utf8_decode(void *buf, uint32_t *c, int *e) { static const char utf8_lengths[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0 @@ -30,21 +32,31 @@ utf8_decode(void *buf, long *c, int *e) { unsigned char *s = buf; int len = utf8_lengths[s[0] >> 3]; - *c = (s[0] & masks[len]) << 18; - *c |= (s[1] & 0x3fU) << 12; - *c |= (s[2] & 0x3fU) << 6; - *c |= (s[3] & 0x3fU) << 0; + /* Compute the pointer to the next character early so that, when + * unrolled, the next iteration can start working on the next + * character. Neither Clang nor GCC figure this out on their own. + */ + unsigned char *next = s + len + !len; + + /* Assume this is a four-byte character and load all four bytes. + * Unused bits will be shifted out later. + */ + *c = (uint32_t)(s[0] & masks[len]) << 18; + *c |= (uint32_t)(s[1] & 0x3f) << 12; + *c |= (uint32_t)(s[2] & 0x3f) << 6; + *c |= (uint32_t)(s[3] & 0x3f) << 0; *c >>= shiftc[len]; - *e = (*c < (1L << thresh[len]) - 1) << 6; + /* Accumulate the various error conditions. */ + *e = (*c < (UINT32_C(1) << thresh[len]) - 1) << 6; *e |= ((*c >> 11) == 0x1b) << 7; // surrogate half? - *e |= (s[1] & 0xc0U) >> 2; - *e |= (s[2] & 0xc0U) >> 4; - *e |= (s[3] ) >> 6; - *e ^= 0x2aU; // top two bits of each tail byte correct? + *e |= (s[1] & 0xc0) >> 2; + *e |= (s[2] & 0xc0) >> 4; + *e |= (s[3] ) >> 6; + *e ^= 0x2a; // top two bits of each tail byte correct? *e >>= shifte[len]; - return s + len + !len; + return next; } #endif -- cgit v1.2.3