From f2d0e24c3864d726cd009901726df4778ad3e0d5 Mon Sep 17 00:00:00 2001
From: Christopher Wellons
Date: Mon, 9 Oct 2017 18:37:41 -0400
Subject: Reject out of range code points (fixes #3)

---
 test/tests.c | 18 +++++++++++++++++-
 utf8.h       |  3 ++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/test/tests.c b/test/tests.c
index be94789..c16576c 100644
--- a/test/tests.c
+++ b/test/tests.c
@@ -23,7 +23,7 @@ main(void)
     /* Make sure it can decode every character */
     {
         long failures = 0;
-        for (unsigned long i = 0; i < 0x1ffff; i++) {
+        for (unsigned long i = 0; i < 0x10ffff; i++) {
             if (!IS_SURROGATE(i)) {
                 int e;
                 uint32_t c;
@@ -36,6 +36,22 @@ main(void)
         TEST(failures == 0, "decode all, errors: %ld", failures);
     }
 
+    /* Reject everything outside of U+0000..U+10FFFF */
+    {
+        long failures = 0;
+        for (unsigned long i = 0x110000; i < 0x1fffff; i++) {
+            int e;
+            uint32_t c;
+            unsigned char buf[8] = {0};
+            utf8_encode(buf, i);
+            unsigned char *end = utf8_decode(buf, &c, &e);
+            failures += !e;
+            failures += end - buf != 4;
+        }
+        TEST(failures == 0, "out of range, errors: %ld", failures);
+    }
+
+
     /* Does it reject all surrogate halves? */
     {
         long failures = 0;
diff --git a/utf8.h b/utf8.h
index 419977d..8c6a7a0 100644
--- a/utf8.h
+++ b/utf8.h
@@ -53,8 +53,9 @@ utf8_decode(void *buf, uint32_t *c, int *e)
     *c >>= shiftc[len];
 
     /* Accumulate the various error conditions. */
-    *e = (*c < mins[len]) << 6;
+    *e = (*c < mins[len]) << 6; // non-canonical encoding
     *e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
+    *e |= (*c > 0x10FFFF) << 8; // out of range?
     *e |= (s[1] & 0xc0) >> 2;
     *e |= (s[2] & 0xc0) >> 4;
     *e |= (s[3] ) >> 6;
-- 
cgit v1.2.3