summary refs log tree commit diff
diff options
context:
space:
mode:
author Christopher Wellons <wellons@nullprogram.com> 2017-10-06 11:28:26 -0400
committer Christopher Wellons <wellons@nullprogram.com> 2017-10-06 11:28:26 -0400
commit 15924e1250595989805c5994383b58224cce0b8b (patch)
tree fd41590c0fcc0e4d58078fc326ea7de154cc4379
parent 0b4b374e733d3eb28051ced0e034e850a4e61834 (diff)
Tweak types and the order that things are computed
-rw-r--r-- test/benchmark.c 2
-rw-r--r-- test/tests.c 27
-rw-r--r-- utf8.h 34
3 files changed, 39 insertions, 24 deletions
diff --git a/test/benchmark.c b/test/benchmark.c
index b637e7b..4eb51da 100644
--- a/test/benchmark.c
+++ b/test/benchmark.c
@@ -87,7 +87,7 @@ main(void)
do {
unsigned char *p = buffer;
int e = 0;
- long c;
+ uint32_t c;
long count = 0;
while (p < end) {
p = utf8_decode(p, &c, &e);
diff --git a/test/tests.c b/test/tests.c
index bb0b0d3..be94789 100644
--- a/test/tests.c
+++ b/test/tests.c
@@ -23,10 +23,10 @@ main(void)
/* Make sure it can decode every character */
{
long failures = 0;
- for (long i = 0; i < 0x1ffffL; i++) {
+ for (unsigned long i = 0; i < 0x1ffff; i++) {
if (!IS_SURROGATE(i)) {
int e;
- long c;
+ uint32_t c;
unsigned char buf[8] = {0};
unsigned char *end = utf8_encode(buf, i);
unsigned char *res = utf8_decode(buf, &c, &e);
@@ -39,9 +39,9 @@ main(void)
/* Does it reject all surrogate halves? */
{
long failures = 0;
- for (long i = 0xd800; i <= 0xdfff; i++) {
+ for (unsigned long i = 0xd800; i <= 0xdfff; i++) {
int e;
- long c;
+ uint32_t c;
unsigned char buf[8] = {0};
utf8_encode(buf, i);
utf8_decode(buf, &c, &e);
@@ -53,46 +53,49 @@ main(void)
/* How about non-canonical encodings? */
{
int e;
- long c;
+ uint32_t c;
unsigned char *end;
unsigned char buf2[8] = {0xc0, 0xA4};
end = utf8_decode(buf2, &c, &e);
TEST(e, "non-canonical len 2, 0x%02x", e);
- TEST(end == buf2 + 2, "non-canonical recover 2, U+%04lx", c);
+ TEST(end == buf2 + 2, "non-canonical recover 2, U+%04lx",
+ (unsigned long)c);
unsigned char buf3[8] = {0xe0, 0x80, 0xA4};
end = utf8_decode(buf3, &c, &e);
TEST(e, "non-canonical len 3, 0x%02x", e);
- TEST(end == buf3 + 3, "non-canonical recover 3, U+%04lx", c);
+ TEST(end == buf3 + 3, "non-canonical recover 3, U+%04lx",
+ (unsigned long)c);
unsigned char buf4[8] = {0xf0, 0x80, 0x80, 0xA4};
end = utf8_decode(buf4, &c, &e);
TEST(e, "non-canonical encoding len 4, 0x%02x", e);
- TEST(end == buf4 + 4, "non-canonical recover 4, U+%04lx", c);
+ TEST(end == buf4 + 4, "non-canonical recover 4, U+%04lx",
+ (unsigned long)c);
}
/* Let's try some bogus byte sequences */
{
int len, e;
- long c;
+ uint32_t c;
/* Invalid first byte */
unsigned char buf0[4] = {0xff};
len = (unsigned char *)utf8_decode(buf0, &c, &e) - buf0;
- TEST(e, "bogus [ff] 0x%02x U+%04lx", e, c);
+ TEST(e, "bogus [ff] 0x%02x U+%04lx", e, (unsigned long)c);
TEST(len == 1, "bogus [ff] recovery %d", len);
/* Invalid first byte */
unsigned char buf1[4] = {0x80};
len = (unsigned char *)utf8_decode(buf1, &c, &e) - buf1;
- TEST(e, "bogus [80] 0x%02x U+%04lx", e, c);
+ TEST(e, "bogus [80] 0x%02x U+%04lx", e, (unsigned long)c);
TEST(len == 1, "bogus [80] recovery %d", len);
/* Looks like a two-byte sequence but second byte is wrong */
unsigned char buf2[4] = {0xc0, 0x0a};
len = (unsigned char *)utf8_decode(buf2, &c, &e) - buf2;
- TEST(e, "bogus [c0 0a] 0x%02x U+%04lx", e, c);
+ TEST(e, "bogus [c0 0a] 0x%02x U+%04lx", e, (unsigned long)c);
TEST(len == 2, "bogus [c0 0a] recovery %d", len);
}
diff --git a/utf8.h b/utf8.h
index 1942cb4..19eef17 100644
--- a/utf8.h
+++ b/utf8.h
@@ -5,6 +5,8 @@
#ifndef UTF8_H
#define UTF8_H
+#include <stdint.h>
+
/* Decode the next character, C, from BUF, reporting errors in E.
*
* Since this is a branchless decoder, four bytes will be read from the
@@ -17,7 +19,7 @@
* encoding, or a surrogate half.
*/
static void *
-utf8_decode(void *buf, long *c, int *e) {
+utf8_decode(void *buf, uint32_t *c, int *e) {
static const char utf8_lengths[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
@@ -30,21 +32,31 @@ utf8_decode(void *buf, long *c, int *e) {
unsigned char *s = buf;
int len = utf8_lengths[s[0] >> 3];
- *c = (s[0] & masks[len]) << 18;
- *c |= (s[1] & 0x3fU) << 12;
- *c |= (s[2] & 0x3fU) << 6;
- *c |= (s[3] & 0x3fU) << 0;
+ /* Compute the pointer to the next character early so that, when
+ * unrolled, the next iteration can start working on the next
+ * character. Neither Clang nor GCC figure this out on their own.
+ */
+ unsigned char *next = s + len + !len;
+
+ /* Assume this is a four-byte character and load all four bytes.
+ * Unused bits will be shifted out later.
+ */
+ *c = (uint32_t)(s[0] & masks[len]) << 18;
+ *c |= (uint32_t)(s[1] & 0x3f) << 12;
+ *c |= (uint32_t)(s[2] & 0x3f) << 6;
+ *c |= (uint32_t)(s[3] & 0x3f) << 0;
*c >>= shiftc[len];
- *e = (*c < (1L << thresh[len]) - 1) << 6;
+ /* Accumulate the various error conditions. */
+ *e = (*c < (UINT32_C(1) << thresh[len]) - 1) << 6;
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
- *e |= (s[1] & 0xc0U) >> 2;
- *e |= (s[2] & 0xc0U) >> 4;
- *e |= (s[3] ) >> 6;
- *e ^= 0x2aU; // top two bits of each tail byte correct?
+ *e |= (s[1] & 0xc0) >> 2;
+ *e |= (s[2] & 0xc0) >> 4;
+ *e |= (s[3] ) >> 6;
+ *e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];
- return s + len + !len;
+ return next;
}
#endif