Enforce maximum token length, error on invalid UTF-8 sequences
parent
0f1ed583ae
commit
b3efa631d9
|
@ -18,6 +18,8 @@
|
||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
constexpr std::string::size_type maxTokenLength = 1024;
|
||||||
|
|
||||||
enum class State
|
enum class State
|
||||||
{
|
{
|
||||||
Start,
|
Start,
|
||||||
|
@ -41,12 +43,25 @@ bool isSeparator(char c)
|
||||||
{
|
{
|
||||||
return !std::isdigit(c) && !std::isalpha(c) && c != '.';
|
return !std::isdigit(c) && !std::isalpha(c) && c != '.';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool tryPushBack(std::string& s, char c)
|
||||||
|
{
|
||||||
|
if (s.size() < maxTokenLength)
|
||||||
|
{
|
||||||
|
s.push_back(c);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
reportError("Token too long");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
Tokenizer::Tokenizer(std::istream* _in) :
|
Tokenizer::Tokenizer(std::istream* _in) :
|
||||||
in(_in)
|
in(_in)
|
||||||
{
|
{
|
||||||
|
textToken.reserve(maxTokenLength);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -68,6 +83,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
UTF8Validator validator;
|
||||||
textToken.clear();
|
textToken.clear();
|
||||||
tokenValue = std::nan("");
|
tokenValue = std::nan("");
|
||||||
State state = State::Start;
|
State state = State::Start;
|
||||||
|
@ -95,6 +111,12 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
newToken = TokenError;
|
newToken = TokenError;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
else if (!validator.check(nextChar))
|
||||||
|
{
|
||||||
|
reportError("Invalid UTF-8 sequence detected");
|
||||||
|
newToken = TokenError;
|
||||||
|
break;
|
||||||
|
}
|
||||||
else if (nextChar == '\n')
|
else if (nextChar == '\n')
|
||||||
{
|
{
|
||||||
++lineNumber;
|
++lineNumber;
|
||||||
|
@ -185,16 +207,16 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
}
|
}
|
||||||
else if (std::isdigit(nextChar))
|
else if (std::isdigit(nextChar))
|
||||||
{
|
{
|
||||||
textToken.push_back(nextChar);
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
||||||
}
|
}
|
||||||
else if (nextChar == '.')
|
else if (nextChar == '.')
|
||||||
{
|
{
|
||||||
textToken.push_back(nextChar);
|
if (!tryPushBack(textToken, '.')) { newToken = TokenError; }
|
||||||
state = State::Fraction;
|
state = State::Fraction;
|
||||||
}
|
}
|
||||||
else if (nextChar == 'e' || nextChar == 'E')
|
else if (nextChar == 'e' || nextChar == 'E')
|
||||||
{
|
{
|
||||||
textToken.push_back('e');
|
if (!tryPushBack(textToken, 'e')) { newToken = TokenError; }
|
||||||
state = State::ExponentStart;
|
state = State::ExponentStart;
|
||||||
}
|
}
|
||||||
else if (isSeparator(nextChar))
|
else if (isSeparator(nextChar))
|
||||||
|
@ -216,11 +238,11 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
}
|
}
|
||||||
else if (std::isdigit(nextChar))
|
else if (std::isdigit(nextChar))
|
||||||
{
|
{
|
||||||
textToken.push_back(nextChar);
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
||||||
}
|
}
|
||||||
else if (nextChar == 'e' || nextChar == 'E')
|
else if (nextChar == 'e' || nextChar == 'E')
|
||||||
{
|
{
|
||||||
textToken.push_back('e');
|
if (!tryPushBack(textToken, 'e')) { newToken = TokenError; }
|
||||||
state = State::ExponentStart;
|
state = State::ExponentStart;
|
||||||
}
|
}
|
||||||
else if (isSeparator(nextChar))
|
else if (isSeparator(nextChar))
|
||||||
|
@ -243,7 +265,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
}
|
}
|
||||||
else if (std::isdigit(nextChar) || nextChar == '+' || nextChar == '-')
|
else if (std::isdigit(nextChar) || nextChar == '+' || nextChar == '-')
|
||||||
{
|
{
|
||||||
textToken.push_back(nextChar);
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
||||||
state = State::Exponent;
|
state = State::Exponent;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -260,7 +282,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
}
|
}
|
||||||
else if (std::isdigit(nextChar))
|
else if (std::isdigit(nextChar))
|
||||||
{
|
{
|
||||||
textToken.push_back(nextChar);
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
||||||
}
|
}
|
||||||
else if (isSeparator(nextChar))
|
else if (isSeparator(nextChar))
|
||||||
{
|
{
|
||||||
|
@ -281,7 +303,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
}
|
}
|
||||||
else if (std::isalpha(nextChar) || std::isdigit(nextChar) || nextChar == '_')
|
else if (std::isalpha(nextChar) || std::isdigit(nextChar) || nextChar == '_')
|
||||||
{
|
{
|
||||||
textToken.push_back(nextChar);
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -306,7 +328,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
textToken.push_back(nextChar);
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -318,17 +340,17 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
}
|
}
|
||||||
else if (nextChar == '\\')
|
else if (nextChar == '\\')
|
||||||
{
|
{
|
||||||
textToken.push_back('\\');
|
if (!tryPushBack(textToken, '\\')) { newToken = TokenError; }
|
||||||
state = State::String;
|
state = State::String;
|
||||||
}
|
}
|
||||||
else if (nextChar == 'n')
|
else if (nextChar == 'n')
|
||||||
{
|
{
|
||||||
textToken.push_back('\n');
|
if (!tryPushBack(textToken, '\n')) { newToken = TokenError; }
|
||||||
state = State::String;
|
state = State::String;
|
||||||
}
|
}
|
||||||
else if (nextChar == '"')
|
else if (nextChar == '"')
|
||||||
{
|
{
|
||||||
textToken.push_back('"');
|
if (!tryPushBack(textToken, '"')) { newToken = TokenError; }
|
||||||
state = State::String;
|
state = State::String;
|
||||||
}
|
}
|
||||||
else if (nextChar == 'u')
|
else if (nextChar == 'u')
|
||||||
|
@ -350,8 +372,16 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
if (unicodeDigits == 4)
|
if (unicodeDigits == 4)
|
||||||
{
|
{
|
||||||
auto unicodeValue = static_cast<std::uint32_t>(std::strtoul(unicode, nullptr, 16));
|
auto unicodeValue = static_cast<std::uint32_t>(std::strtoul(unicode, nullptr, 16));
|
||||||
UTF8Encode(unicodeValue, textToken);
|
if (textToken.size() + UTF8EncodedSize(unicodeValue) <= maxTokenLength)
|
||||||
state = State::String;
|
{
|
||||||
|
UTF8Encode(unicodeValue, textToken);
|
||||||
|
state = State::String;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
reportError("Token too long");
|
||||||
|
newToken = TokenError;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -416,7 +446,7 @@ void Tokenizer::pushBack()
|
||||||
|
|
||||||
double Tokenizer::getNumberValue() const
|
double Tokenizer::getNumberValue() const
|
||||||
{
|
{
|
||||||
return tokenValue;
|
return tokenType == TokenNumber ? tokenValue : std::nan("");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -440,9 +440,17 @@ void UTF8Encode(std::uint32_t ch, std::string& dest)
|
||||||
}
|
}
|
||||||
else if (ch < 0x10000)
|
else if (ch < 0x10000)
|
||||||
{
|
{
|
||||||
dest.push_back(static_cast<char>(0xe0 | (ch >> 12)));
|
if (ch < 0xd800 || ch >= 0xe000)
|
||||||
dest.push_back(static_cast<char>(0x80 | ((ch & 0xfff) >> 6)));
|
{
|
||||||
dest.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
|
dest.push_back(static_cast<char>(0xe0 | (ch >> 12)));
|
||||||
|
dest.push_back(static_cast<char>(0x80 | ((ch & 0xfff) >> 6)));
|
||||||
|
dest.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// disallow surrogates
|
||||||
|
dest.append(UTF8_REPLACEMENT_CHAR);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#if WCHAR_MAX > 0xFFFFu
|
#if WCHAR_MAX > 0xFFFFu
|
||||||
else if (ch < 0x110000)
|
else if (ch < 0x110000)
|
||||||
|
@ -457,7 +465,7 @@ void UTF8Encode(std::uint32_t ch, std::string& dest)
|
||||||
{
|
{
|
||||||
// not a valid Unicode code point, or we only support BMP characters,
|
// not a valid Unicode code point, or we only support BMP characters,
|
||||||
// so fall back to U+FFFD REPLACEMENT CHARACTER
|
// so fall back to U+FFFD REPLACEMENT CHARACTER
|
||||||
dest.append("\357\277\275");
|
dest.append(UTF8_REPLACEMENT_CHAR);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -999,3 +1007,51 @@ std::vector<std::string> getGreekCompletion(const std::string &s)
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
UTF8Validator::check(char c)
|
||||||
|
{
|
||||||
|
return check(static_cast<unsigned char>(c));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
UTF8Validator::check(unsigned char c)
|
||||||
|
{
|
||||||
|
switch (state)
|
||||||
|
{
|
||||||
|
case State::Initial:
|
||||||
|
if (c < 0x80) { return true; }
|
||||||
|
if (c < 0xc2) { return false; }
|
||||||
|
if (c < 0xe0) { state = State::Continuation1; return true; }
|
||||||
|
if (c == 0xe0) { state = State::E0Continuation; return true; }
|
||||||
|
if (c < 0xed) { state = State::Continuation2; return true; }
|
||||||
|
if (c== 0xed) { state = State::EDContinuation; return true; }
|
||||||
|
if (c < 0xf0) { state = State::Continuation2; return true; }
|
||||||
|
if (c == 0xf0) { state = State::F0Continuation; return true; }
|
||||||
|
if (c < 0xf4) { state = State::Continuation3; return true; }
|
||||||
|
if (c == 0xf4) { state = State::F4Continuation; return true; }
|
||||||
|
return false;
|
||||||
|
case State::Continuation1:
|
||||||
|
if (c >= 0x80 && c < 0xc0) { state = State::Initial; return true; }
|
||||||
|
break;
|
||||||
|
case State::Continuation2:
|
||||||
|
if (c >= 0x80 && c < 0xc0) { state = State::Continuation1; return true; }
|
||||||
|
break;
|
||||||
|
case State::Continuation3:
|
||||||
|
if (c >= 0x80 && c < 0xc0) { state = State::Continuation2; return true; }
|
||||||
|
break;
|
||||||
|
case State::E0Continuation: // disallow overlong sequences
|
||||||
|
if (c >= 0xa0 && c < 0xc0) { state = State::Continuation1; return true; }
|
||||||
|
break;
|
||||||
|
case State::EDContinuation: // disallow surrogate pairs
|
||||||
|
if (c >= 0x80 && c < 0xa0) { state = State::Continuation1; return true; }
|
||||||
|
break;
|
||||||
|
case State::F0Continuation: // disallow overlong sequences
|
||||||
|
if (c >= 0x90 && c < 0xc0) { state = State::Continuation2; return true; }
|
||||||
|
break;
|
||||||
|
case State::F4Continuation: // disallow out-of-range
|
||||||
|
if (c >= 0x80 && c < 0x90) { state = State::Continuation2; return true; }
|
||||||
|
}
|
||||||
|
state = State::Initial;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
|
@ -7,8 +7,7 @@
|
||||||
// as published by the Free Software Foundation; either version 2
|
// as published by the Free Software Foundation; either version 2
|
||||||
// of the License, or (at your option) any later version.
|
// of the License, or (at your option) any later version.
|
||||||
|
|
||||||
#ifndef _CELUTIL_UTF8_
|
#pragma once
|
||||||
#define _CELUTIL_UTF8_
|
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
@ -26,6 +25,7 @@
|
||||||
#define UTF8_SUPERSCRIPT_7 "\342\201\267"
|
#define UTF8_SUPERSCRIPT_7 "\342\201\267"
|
||||||
#define UTF8_SUPERSCRIPT_8 "\342\201\270"
|
#define UTF8_SUPERSCRIPT_8 "\342\201\270"
|
||||||
#define UTF8_SUPERSCRIPT_9 "\342\201\271"
|
#define UTF8_SUPERSCRIPT_9 "\342\201\271"
|
||||||
|
#define UTF8_REPLACEMENT_CHAR "\357\277\275"
|
||||||
|
|
||||||
|
|
||||||
bool UTF8Decode(const std::string& str, int pos, wchar_t& ch);
|
bool UTF8Decode(const std::string& str, int pos, wchar_t& ch);
|
||||||
|
@ -136,4 +136,27 @@ class Greek
|
||||||
|
|
||||||
std::vector<std::string> getGreekCompletion(const std::string &);
|
std::vector<std::string> getGreekCompletion(const std::string &);
|
||||||
|
|
||||||
#endif // _CELUTIL_UTF8_
|
// Incremental UTF-8 validator.
//
// Bytes are supplied one at a time through check(). Between calls the object
// remembers which bytes may follow the ones already accepted, so a multi-byte
// sequence can be validated across successive calls.
//
// The class deliberately declares no constructor or destructor (Rule of Zero):
// the default member initializer below is all the setup it needs, and leaving
// the destructor undeclared keeps the implicit copy and move operations.
class UTF8Validator
{
public:
    // Feeds the next byte of the stream. Returns true if the byte is acceptable
    // at this position, false if it is not; after a rejected byte the validator
    // is in its initial state again.
    bool check(char c);
    bool check(unsigned char c);

private:
    enum class State
    {
        Initial,        // not inside a multi-byte sequence
        Continuation1,  // one trailing byte (0x80..0xbf) still expected
        Continuation2,  // two trailing bytes still expected
        Continuation3,  // three trailing bytes still expected
        E0Continuation, // after lead 0xe0: next byte must be 0xa0..0xbf
        EDContinuation, // after lead 0xed: next byte must be 0x80..0x9f
        F0Continuation, // after lead 0xf0: next byte must be 0x90..0xbf
        F4Continuation, // after lead 0xf4: next byte must be 0x80..0x8f
    };

    State state{ State::Initial };
};
|
||||||
|
|
Loading…
Reference in New Issue