From b3efa631d9e4870cfa3c67ef784967b2c4d033f5 Mon Sep 17 00:00:00 2001 From: Andrew Tribick Date: Sat, 13 Nov 2021 13:11:51 +0100 Subject: [PATCH] Enforce maximum token length, error on invalid UTF-8 sequences --- src/celutil/tokenizer.cpp | 60 +++++++++++++++++++++++++++--------- src/celutil/utf8.cpp | 64 ++++++++++++++++++++++++++++++++++++--- src/celutil/utf8.h | 29 ++++++++++++++++-- 3 files changed, 131 insertions(+), 22 deletions(-) diff --git a/src/celutil/tokenizer.cpp b/src/celutil/tokenizer.cpp index 885e7a7f5..47d2fc12f 100644 --- a/src/celutil/tokenizer.cpp +++ b/src/celutil/tokenizer.cpp @@ -18,6 +18,8 @@ namespace { +constexpr std::string::size_type maxTokenLength = 1024; + enum class State { Start, @@ -41,12 +43,25 @@ bool isSeparator(char c) { return !std::isdigit(c) && !std::isalpha(c) && c != '.'; } + +bool tryPushBack(std::string& s, char c) +{ + if (s.size() < maxTokenLength) + { + s.push_back(c); + return true; + } + + reportError("Token too long"); + return false; +} } Tokenizer::Tokenizer(std::istream* _in) : in(_in) { + textToken.reserve(maxTokenLength); } @@ -68,6 +83,7 @@ Tokenizer::TokenType Tokenizer::nextToken() } } + UTF8Validator validator; textToken.clear(); tokenValue = std::nan(""); State state = State::Start; @@ -95,6 +111,12 @@ Tokenizer::TokenType Tokenizer::nextToken() newToken = TokenError; break; } + else if (!validator.check(nextChar)) + { + reportError("Invalid UTF-8 sequence detected"); + newToken = TokenError; + break; + } else if (nextChar == '\n') { ++lineNumber; @@ -185,16 +207,16 @@ Tokenizer::TokenType Tokenizer::nextToken() } else if (std::isdigit(nextChar)) { - textToken.push_back(nextChar); + if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; } } else if (nextChar == '.') { - textToken.push_back(nextChar); + if (!tryPushBack(textToken, '.')) { newToken = TokenError; } state = State::Fraction; } else if (nextChar == 'e' || nextChar == 'E') { - textToken.push_back('e'); + if (!tryPushBack(textToken, 'e')) { newToken = TokenError; } state = State::ExponentStart; } else if (isSeparator(nextChar)) @@ -216,11 +238,11 @@ Tokenizer::TokenType Tokenizer::nextToken() } else if (std::isdigit(nextChar)) { - textToken.push_back(nextChar); + if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; } } else if (nextChar == 'e' || nextChar == 'E') { - textToken.push_back('e'); + if (!tryPushBack(textToken, 'e')) { newToken = TokenError; } state = State::ExponentStart; } else if (isSeparator(nextChar)) @@ -243,7 +265,7 @@ Tokenizer::TokenType Tokenizer::nextToken() } else if (std::isdigit(nextChar) || nextChar == '+' || nextChar == '-') { - textToken.push_back(nextChar); + if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; } state = State::Exponent; } else @@ -260,7 +282,7 @@ Tokenizer::TokenType Tokenizer::nextToken() } else if (std::isdigit(nextChar)) { - textToken.push_back(nextChar); + if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; } } else if (isSeparator(nextChar)) { @@ -281,7 +303,7 @@ Tokenizer::TokenType Tokenizer::nextToken() } else if (std::isalpha(nextChar) || std::isdigit(nextChar) || nextChar == '_') { - textToken.push_back(nextChar); + if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; } } else { @@ -306,7 +328,7 @@ Tokenizer::TokenType Tokenizer::nextToken() } else { - textToken.push_back(nextChar); + if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; } } break; @@ -318,17 +340,17 @@ Tokenizer::TokenType Tokenizer::nextToken() } else if (nextChar == '\\') { - textToken.push_back('\\'); + if (!tryPushBack(textToken, '\\')) { newToken = TokenError; } state = State::String; } else if (nextChar == 'n') { - textToken.push_back('\n'); + if (!tryPushBack(textToken, '\n')) { newToken = TokenError; } state = State::String; } else if (nextChar == '"') { - textToken.push_back('"'); + if (!tryPushBack(textToken, '"')) { newToken = TokenError; } state = State::String; } else if (nextChar == 'u') @@ -350,8 +372,16 @@ Tokenizer::TokenType Tokenizer::nextToken() if (unicodeDigits == 4) { auto unicodeValue = static_cast(std::strtoul(unicode, nullptr, 16)); - UTF8Encode(unicodeValue, textToken); - state = State::String; + if (textToken.size() + UTF8EncodedSize(unicodeValue) <= maxTokenLength) + { + UTF8Encode(unicodeValue, textToken); + state = State::String; + } + else + { + reportError("Token too long"); + newToken = TokenError; + } } } else @@ -416,7 +446,7 @@ void Tokenizer::pushBack() double Tokenizer::getNumberValue() const { - return tokenValue; + return tokenType == TokenNumber ? tokenValue : std::nan(""); } diff --git a/src/celutil/utf8.cpp b/src/celutil/utf8.cpp index 478db3f65..2b1a0ff83 100644 --- a/src/celutil/utf8.cpp +++ b/src/celutil/utf8.cpp @@ -440,9 +440,17 @@ void UTF8Encode(std::uint32_t ch, std::string& dest) } else if (ch < 0x10000) { - dest.push_back(static_cast(0xe0 | (ch >> 12))); - dest.push_back(static_cast(0x80 | ((ch & 0xfff) >> 6))); - dest.push_back(static_cast(0x80 | (ch & 0x3f))); + if (ch < 0xd800 || ch >= 0xe000) + { + dest.push_back(static_cast(0xe0 | (ch >> 12))); + dest.push_back(static_cast(0x80 | ((ch & 0xfff) >> 6))); + dest.push_back(static_cast(0x80 | (ch & 0x3f))); + } + else + { + // disallow surrogates + dest.append(UTF8_REPLACEMENT_CHAR); + } } #if WCHAR_MAX > 0xFFFFu else if (ch < 0x110000) @@ -457,7 +465,7 @@ void UTF8Encode(std::uint32_t ch, std::string& dest) { // not a valid Unicode code point, or we only support BMP characters, // so fall back to U+FFFD REPLACEMENT CHARACTER - dest.append("\357\277\275"); + dest.append(UTF8_REPLACEMENT_CHAR); } } @@ -999,3 +1007,51 @@ std::vector getGreekCompletion(const std::string &s) return ret; } + +bool +UTF8Validator::check(char c) +{ + return check(static_cast(c)); +} + +bool +UTF8Validator::check(unsigned char c) +{ + switch (state) + { + case State::Initial: + if (c < 0x80) { return true; } + if (c < 0xc2) { return false; } + if (c < 0xe0) { state = State::Continuation1; return true; } + if (c == 0xe0) { state = State::E0Continuation; return true; } + if (c < 0xed) { state = State::Continuation2; return true; } + if (c== 0xed) { state = State::EDContinuation; return true; } + if (c < 0xf0) { state = State::Continuation2; return true; } + if (c == 0xf0) { state = State::F0Continuation; return true; } + if (c < 0xf4) { state = State::Continuation3; return true; } + if (c == 0xf4) { state = State::F4Continuation; return true; } + return false; + case State::Continuation1: + if (c >= 0x80 && c < 0xc0) { state = State::Initial; return true; } + break; + case State::Continuation2: + if (c >= 0x80 && c < 0xc0) { state = State::Continuation1; return true; } + break; + case State::Continuation3: + if (c >= 0x80 && c < 0xc0) { state = State::Continuation2; return true; } + break; + case State::E0Continuation: // disallow overlong sequences + if (c >= 0xa0 && c < 0xc0) { state = State::Continuation1; return true; } + break; + case State::EDContinuation: // disallow surrogate pairs + if (c >= 0x80 && c < 0xa0) { state = State::Continuation1; return true; } + break; + case State::F0Continuation: // disallow overlong sequences + if (c >= 0x90 && c < 0xc0) { state = State::Continuation2; return true; } + break; + case State::F4Continuation: // disallow out-of-range + if (c >= 0x80 && c < 0x90) { state = State::Continuation2; return true; } + } + state = State::Initial; + return false; +} diff --git a/src/celutil/utf8.h b/src/celutil/utf8.h index bb9125fde..27c110d8b 100644 --- a/src/celutil/utf8.h +++ b/src/celutil/utf8.h @@ -7,8 +7,7 @@ // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. -#ifndef _CELUTIL_UTF8_ -#define _CELUTIL_UTF8_ +#pragma once #include #include @@ -26,6 +25,7 @@ #define UTF8_SUPERSCRIPT_7 "\342\201\267" #define UTF8_SUPERSCRIPT_8 "\342\201\270" #define UTF8_SUPERSCRIPT_9 "\342\201\271" +#define UTF8_REPLACEMENT_CHAR "\357\277\275" bool UTF8Decode(const std::string& str, int pos, wchar_t& ch); @@ -136,4 +136,27 @@ class Greek std::vector getGreekCompletion(const std::string &); -#endif // _CELUTIL_UTF8_ +class UTF8Validator +{ +public: + UTF8Validator() = default; + ~UTF8Validator() = default; + + bool check(char c); + bool check(unsigned char c); + +private: + enum class State + { + Initial, + Continuation1, + Continuation2, + Continuation3, + E0Continuation, + EDContinuation, + F0Continuation, + F4Continuation, + }; + + State state{ State::Initial }; +};