From b3efa631d9e4870cfa3c67ef784967b2c4d033f5 Mon Sep 17 00:00:00 2001
From: Andrew Tribick <ajtribick@googlemail.com>
Date: Sat, 13 Nov 2021 13:11:51 +0100
Subject: [PATCH] Enforce maximum token length, error on invalid UTF-8
 sequences

---
 src/celutil/tokenizer.cpp | 60 +++++++++++++++++++++++++++---------
 src/celutil/utf8.cpp      | 64 ++++++++++++++++++++++++++++++++++++---
 src/celutil/utf8.h        | 29 ++++++++++++++++--
 3 files changed, 131 insertions(+), 22 deletions(-)

diff --git a/src/celutil/tokenizer.cpp b/src/celutil/tokenizer.cpp
index 885e7a7f5..47d2fc12f 100644
--- a/src/celutil/tokenizer.cpp
+++ b/src/celutil/tokenizer.cpp
@@ -18,6 +18,8 @@
 
 namespace
 {
+constexpr std::string::size_type maxTokenLength = 1024; // cap on textToken.size(), in chars; enforced by tryPushBack
+
 enum class State
 {
     Start,
@@ -41,12 +43,25 @@ bool isSeparator(char c)
 {
     return !std::isdigit(c) && !std::isalpha(c) && c != '.';
 }
+
+bool tryPushBack(std::string& s, char c) // append c to s only while s holds fewer than maxTokenLength chars
+{
+    if (s.size() < maxTokenLength)
+    {
+        s.push_back(c);
+        return true; // appended
+    }
+
+    reportError("Token too long"); // s is already at the limit: it is left unchanged and the problem is reported
+    return false; // callers in nextToken() react by setting newToken = TokenError
+}
 }
 
 
 Tokenizer::Tokenizer(std::istream* _in) :
     in(_in)
 {
+    textToken.reserve(maxTokenLength); // pre-size the buffer to the longest token tryPushBack will allow
 }
 
 
@@ -68,6 +83,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
         }
     }
 
+    UTF8Validator validator;
     textToken.clear();
     tokenValue = std::nan("");
     State state = State::Start;
@@ -95,6 +111,12 @@ Tokenizer::TokenType Tokenizer::nextToken()
                 newToken = TokenError;
                 break;
             }
+            else if (!validator.check(nextChar))
+            {
+                reportError("Invalid UTF-8 sequence detected");
+                newToken = TokenError;
+                break;
+            }
             else if (nextChar == '\n')
             {
                 ++lineNumber;
@@ -185,16 +207,16 @@ Tokenizer::TokenType Tokenizer::nextToken()
             }
             else if (std::isdigit(nextChar))
             {
-                textToken.push_back(nextChar);
+                if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
             }
             else if (nextChar == '.')
             {
-                textToken.push_back(nextChar);
+                if (!tryPushBack(textToken, '.')) { newToken = TokenError; }
                 state = State::Fraction;
             }
             else if (nextChar == 'e' || nextChar == 'E')
             {
-                textToken.push_back('e');
+                if (!tryPushBack(textToken, 'e')) { newToken = TokenError; }
                 state = State::ExponentStart;
             }
             else if (isSeparator(nextChar))
@@ -216,11 +238,11 @@ Tokenizer::TokenType Tokenizer::nextToken()
             }
             else if (std::isdigit(nextChar))
             {
-                textToken.push_back(nextChar);
+                if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
             }
             else if (nextChar == 'e' || nextChar == 'E')
             {
-                textToken.push_back('e');
+                if (!tryPushBack(textToken, 'e')) { newToken = TokenError; }
                 state = State::ExponentStart;
             }
             else if (isSeparator(nextChar))
@@ -243,7 +265,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
             }
             else if (std::isdigit(nextChar) || nextChar == '+' || nextChar == '-')
             {
-                textToken.push_back(nextChar);
+                if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
                 state = State::Exponent;
             }
             else
@@ -260,7 +282,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
             }
             else if (std::isdigit(nextChar))
             {
-                textToken.push_back(nextChar);
+                if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
             }
             else if (isSeparator(nextChar))
             {
@@ -281,7 +303,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
             }
             else if (std::isalpha(nextChar) || std::isdigit(nextChar) || nextChar == '_')
             {
-                textToken.push_back(nextChar);
+                if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
             }
             else
             {
@@ -306,7 +328,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
             }
             else
             {
-                textToken.push_back(nextChar);
+                if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
             }
             break;
 
@@ -318,17 +340,17 @@ Tokenizer::TokenType Tokenizer::nextToken()
             }
             else if (nextChar == '\\')
             {
-                textToken.push_back('\\');
+                if (!tryPushBack(textToken, '\\')) { newToken = TokenError; }
                 state = State::String;
             }
             else if (nextChar == 'n')
             {
-                textToken.push_back('\n');
+                if (!tryPushBack(textToken, '\n')) { newToken = TokenError; }
                 state = State::String;
             }
             else if (nextChar == '"')
             {
-                textToken.push_back('"');
+                if (!tryPushBack(textToken, '"')) { newToken = TokenError; }
                 state = State::String;
             }
             else if (nextChar == 'u')
@@ -350,8 +372,16 @@ Tokenizer::TokenType Tokenizer::nextToken()
                 if (unicodeDigits == 4)
                 {
                     auto unicodeValue = static_cast<std::uint32_t>(std::strtoul(unicode, nullptr, 16));
-                    UTF8Encode(unicodeValue, textToken);
-                    state = State::String;
+                    if (textToken.size() + UTF8EncodedSize(unicodeValue) <= maxTokenLength)
+                    {
+                        UTF8Encode(unicodeValue, textToken);
+                        state = State::String;
+                    }
+                    else
+                    {
+                        reportError("Token too long");
+                        newToken = TokenError;
+                    }
                 }
             }
             else
@@ -416,7 +446,7 @@ void Tokenizer::pushBack()
 
 double Tokenizer::getNumberValue() const
 {
-    return tokenValue;
+    return tokenType == TokenNumber ? tokenValue : std::nan(""); // NaN unless the current token is a number
 }
 
 
diff --git a/src/celutil/utf8.cpp b/src/celutil/utf8.cpp
index 478db3f65..2b1a0ff83 100644
--- a/src/celutil/utf8.cpp
+++ b/src/celutil/utf8.cpp
@@ -440,9 +440,17 @@ void UTF8Encode(std::uint32_t ch, std::string& dest)
     }
     else if (ch < 0x10000)
     {
-        dest.push_back(static_cast<char>(0xe0 | (ch >> 12)));
-        dest.push_back(static_cast<char>(0x80 | ((ch & 0xfff) >> 6)));
-        dest.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
+        if (ch < 0xd800 || ch >= 0xe000)
+        {
+            dest.push_back(static_cast<char>(0xe0 | (ch >> 12)));
+            dest.push_back(static_cast<char>(0x80 | ((ch & 0xfff) >> 6)));
+            dest.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
+        }
+        else
+        {
+            // disallow surrogates
+            dest.append(UTF8_REPLACEMENT_CHAR);
+        }
     }
 #if WCHAR_MAX > 0xFFFFu
     else if (ch < 0x110000)
@@ -457,7 +465,7 @@ void UTF8Encode(std::uint32_t ch, std::string& dest)
     {
         // not a valid Unicode code point, or we only support BMP characters,
         // so fall back to U+FFFD REPLACEMENT CHARACTER
-        dest.append("\357\277\275");
+        dest.append(UTF8_REPLACEMENT_CHAR);
     }
 }
 
@@ -999,3 +1007,51 @@ std::vector<std::string> getGreekCompletion(const std::string &s)
 
     return ret;
 }
+
+bool
+UTF8Validator::check(char c) // overload for plain char input
+{
+    return check(static_cast<unsigned char>(c)); // convert to a byte value so the range tests below compare 0x00-0xff
+}
+
+bool
+UTF8Validator::check(unsigned char c) // feed one byte; returns true if it is acceptable in the current state
+{
+    switch (state)
+    {
+    case State::Initial: // expecting the first byte of a sequence; every path in this case returns directly
+        if (c < 0x80) { return true; } // single-byte (ASCII) sequence, state stays Initial
+        if (c < 0xc2) { return false; } // 0x80-0xc1 is never accepted as a first byte
+        if (c < 0xe0) { state = State::Continuation1; return true; } // 0xc2-0xdf: one continuation byte follows
+        if (c == 0xe0) { state = State::E0Continuation; return true; } // next byte is range-restricted
+        if (c < 0xed) { state = State::Continuation2; return true; } // 0xe1-0xec: two continuation bytes follow
+        if (c== 0xed) { state = State::EDContinuation; return true; } // next byte is range-restricted
+        if (c < 0xf0) { state = State::Continuation2; return true; } // 0xee-0xef: two continuation bytes follow
+        if (c == 0xf0) { state = State::F0Continuation; return true; } // next byte is range-restricted
+        if (c < 0xf4) { state = State::Continuation3; return true; } // 0xf1-0xf3: three continuation bytes follow
+        if (c == 0xf4) { state = State::F4Continuation; return true; } // next byte is range-restricted
+        return false; // 0xf5-0xff is never accepted
+    case State::Continuation1: // final byte of the sequence
+        if (c >= 0x80 && c < 0xc0) { state = State::Initial; return true; } // sequence complete
+        break;
+    case State::Continuation2: // two bytes still expected
+        if (c >= 0x80 && c < 0xc0) { state = State::Continuation1; return true; }
+        break;
+    case State::Continuation3: // three bytes still expected
+        if (c >= 0x80 && c < 0xc0) { state = State::Continuation2; return true; }
+        break;
+    case State::E0Continuation: // disallow overlong sequences: after 0xe0 only 0xa0-0xbf is accepted
+        if (c >= 0xa0 && c < 0xc0) { state = State::Continuation1; return true; }
+        break;
+    case State::EDContinuation: // disallow surrogate pairs: after 0xed only 0x80-0x9f is accepted
+        if (c >= 0x80 && c < 0xa0) { state = State::Continuation1; return true; }
+        break;
+    case State::F0Continuation: // disallow overlong sequences: after 0xf0 only 0x90-0xbf is accepted
+        if (c >= 0x90 && c < 0xc0) { state = State::Continuation2; return true; }
+        break;
+    case State::F4Continuation: // disallow out-of-range: after 0xf4 only 0x80-0x8f is accepted
+        if (c >= 0x80 && c < 0x90) { state = State::Continuation2; return true; } // last case: no break, falls out of the switch
+    }
+    state = State::Initial; // reached only when a non-Initial state rejected c; reset so the next byte starts a fresh sequence
+    return false;
+}
diff --git a/src/celutil/utf8.h b/src/celutil/utf8.h
index bb9125fde..27c110d8b 100644
--- a/src/celutil/utf8.h
+++ b/src/celutil/utf8.h
@@ -7,8 +7,7 @@
 // as published by the Free Software Foundation; either version 2
 // of the License, or (at your option) any later version.
 
-#ifndef _CELUTIL_UTF8_
-#define _CELUTIL_UTF8_
+#pragma once
 
 #include <cstdint>
 #include <string>
@@ -26,6 +25,7 @@
 #define UTF8_SUPERSCRIPT_7       "\342\201\267"
 #define UTF8_SUPERSCRIPT_8       "\342\201\270"
 #define UTF8_SUPERSCRIPT_9       "\342\201\271"
+#define UTF8_REPLACEMENT_CHAR    "\357\277\275" // UTF-8 bytes of U+FFFD REPLACEMENT CHARACTER
 
 
 bool UTF8Decode(const std::string& str, int pos, wchar_t& ch);
@@ -136,4 +136,27 @@ class Greek
 
 std::vector<std::string> getGreekCompletion(const std::string &);
 
-#endif // _CELUTIL_UTF8_
+class UTF8Validator // byte-at-a-time UTF-8 checker: feed bytes in order through check()
+{
+public:
+    UTF8Validator() = default;
+    ~UTF8Validator() = default;
+
+    bool check(char c); // forwards to the unsigned char overload
+    bool check(unsigned char c); // false if c is not acceptable at this point; the validator then restarts from Initial
+
+private:
+    enum class State // what check() expects the next byte to be
+    {
+        Initial, // first byte of a sequence
+        Continuation1, // one more byte in 0x80-0xbf
+        Continuation2, // two more bytes in 0x80-0xbf
+        Continuation3, // three more bytes in 0x80-0xbf
+        E0Continuation, // byte after lead 0xe0: 0xa0-0xbf only
+        EDContinuation, // byte after lead 0xed: 0x80-0x9f only
+        F0Continuation, // byte after lead 0xf0: 0x90-0xbf only
+        F4Continuation, // byte after lead 0xf4: 0x80-0x8f only
+    };
+
+    State state{ State::Initial }; // a new validator starts at a sequence boundary
+};