summary | refs | log | tree | commit | diff
path: root/src/util/UTF8.cxx
diff options
context:
space:
mode:
author: Max Kellermann <max@duempel.org> 2014-10-10 21:17:40 +0200
committer: Max Kellermann <max@duempel.org> 2014-10-10 22:11:38 +0200
commit: b70bf938c230bb5e8aafe95094975a8e10109876 (patch)
tree: 5513db2f04a57a7f51dedc43bfbbc702a10a17f6 /src/util/UTF8.cxx
parent: d5cf41e0431db4fd7d28848899015d8b9225eb45 (diff)
util/UTF8: add SequenceLengthUTF8()
Diffstat (limited to 'src/util/UTF8.cxx')
-rw-r--r-- src/util/UTF8.cxx 80
1 files changed, 80 insertions, 0 deletions
diff --git a/src/util/UTF8.cxx b/src/util/UTF8.cxx
index 273cbac1e..50ff19e88 100644
--- a/src/util/UTF8.cxx
+++ b/src/util/UTF8.cxx
@@ -166,6 +166,86 @@ ValidateUTF8(const char *p)
return true;
}
+/**
+ * Map the first byte of a UTF-8 sequence to the total length of that
+ * sequence in bytes, i.e. the start byte plus the continuation bytes
+ * it announces.  Only the start byte is inspected; the continuation
+ * bytes themselves are not looked at.
+ *
+ * @param ch the first byte of the sequence
+ * @return a length between 1 and 6, or 0 if the byte is a
+ * continuation without a prefix or some other illegal start byte
+ */
+size_t
+SequenceLengthUTF8(char ch)
+{
+	/* a plain ASCII character stands for itself */
+	if (IsASCII(ch)) {
+		return 1;
+	}
+
+	/* IsLeadingN() matches a start byte that is followed by N
+	   continuations, which makes the whole sequence N + 1 bytes
+	   long */
+	if (IsLeading1(ch)) {
+		return 2;
+	}
+
+	if (IsLeading2(ch)) {
+		return 3;
+	}
+
+	if (IsLeading3(ch)) {
+		return 4;
+	}
+
+	if (IsLeading4(ch)) {
+		return 5;
+	}
+
+	if (IsLeading5(ch)) {
+		return 6;
+	}
+
+	/* continuation without a prefix or some other illegal
+	   start byte */
+	return 0;
+}
+
+/**
+ * Function object which tests whether the L bytes starting at the
+ * given pointer are all continuation bytes.  The bytes are tested in
+ * order, and the first one rejected by IsContinuation() ends the
+ * check, so nothing behind it is read.
+ */
+template<size_t L>
+struct CheckSequenceUTF8 {
+	gcc_pure
+	bool operator()(const char *p) const {
+		if (!IsContinuation(*p))
+			return false;
+
+		/* this byte is fine; the remaining L-1 bytes are
+		   handled by the next smaller instantiation */
+		return CheckSequenceUTF8<L - 1>()(p + 1);
+	}
+};
+
+/**
+ * Specialization for a length of zero: there is no byte left to
+ * check, so the check succeeds without reading from the pointer.
+ */
+template<>
+struct CheckSequenceUTF8<0u> {
+	/* the pointer is never used, therefore it is left unnamed */
+	constexpr bool operator()(const char *) const {
+		return true;
+	}
+};
+
+/**
+ * Check the L bytes at the given position with CheckSequenceUTF8 and
+ * translate the result into a sequence length.
+ *
+ * @param p the first of the L bytes to be checked
+ * @return L + 1 if all L bytes are continuation bytes, 0 otherwise
+ */
+template<size_t L>
+gcc_pure
+static size_t
+InnerSequenceLengthUTF8(const char *p)
+{
+	const bool complete = CheckSequenceUTF8<L>()(p);
+	if (!complete)
+		return 0u;
+
+	/* L checked bytes plus one more */
+	return L + 1;
+}
+
+/**
+ * Determine the length in bytes of the UTF-8 sequence which starts at
+ * the given pointer.  Unlike the overload taking a single byte, this
+ * one also verifies that the start byte is followed by as many
+ * continuation bytes as it announces.
+ *
+ * @param p pointer to the first byte of the sequence
+ * @return 1 for an ASCII character, the full sequence length (2 to 6)
+ * for a multi-byte sequence whose continuation bytes are all present,
+ * or 0 if the start byte is illegal or an expected continuation byte
+ * is missing
+ */
+size_t
+SequenceLengthUTF8(const char *p)
+{
+	const unsigned char first = *p;
+
+	/* the continuation bytes, if any, begin right behind the
+	   start byte */
+	const char *const tail = p + 1;
+
+	if (IsASCII(first)) {
+		return 1;
+	}
+
+	/* IsLeadingN() matches a start byte that must be followed by
+	   N continuations */
+	if (IsLeading1(first)) {
+		return InnerSequenceLengthUTF8<1>(tail);
+	}
+
+	if (IsLeading2(first)) {
+		return InnerSequenceLengthUTF8<2>(tail);
+	}
+
+	if (IsLeading3(first)) {
+		return InnerSequenceLengthUTF8<3>(tail);
+	}
+
+	if (IsLeading4(first)) {
+		return InnerSequenceLengthUTF8<4>(tail);
+	}
+
+	if (IsLeading5(first)) {
+		return InnerSequenceLengthUTF8<5>(tail);
+	}
+
+	/* continuation without a prefix or some other illegal
+	   start byte */
+	return 0;
+}
+
static const char *
FindNonASCIIOrZero(const char *p)
{