More forgiving handling of invalid UTF-8 sequences
- Report an error (once per stream)
- In string literals, append U+FFFD REPLACEMENT CHARACTER
pull/1173/head
parent
8ae1fcb88e
commit
e92c3d9bd5
|
@ -11,6 +11,7 @@
|
|||
#include <cctype>
|
||||
#include <cerrno>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
|
||||
#include "utf8.h"
|
||||
|
@ -36,16 +37,19 @@ enum class State
|
|||
Comment,
|
||||
};
|
||||
|
||||
|
||||
// Emits a diagnostic on the standard error stream.
//
// message: NUL-terminated text to print; a single '\n' is written after it.
void reportError(const char* message)
{
    std::ostream& err = std::cerr;
    err << message;
    err << '\n';
}
|
||||
|
||||
|
||||
// Classifies a byte as a separator.
//
// Returns true when c is neither a decimal digit, nor an alphabetic character
// (as decided by std::isalpha in the current C locale), nor '.'.
bool isSeparator(char c)
{
    // The <cctype> classifiers require an argument representable as
    // unsigned char (or EOF). Where plain char is signed, a byte >= 0x80 is
    // negative and passing it directly is undefined behavior, so convert first.
    const auto uc = static_cast<unsigned char>(c);
    return !std::isdigit(uc) && !std::isalpha(uc) && c != '.';
}
|
||||
|
||||
|
||||
bool tryPushBack(std::string& s, char c)
|
||||
{
|
||||
if (s.size() < maxTokenLength)
|
||||
|
@ -57,7 +61,33 @@ bool tryPushBack(std::string& s, char c)
|
|||
reportError("Token too long");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool handleUtf8Error(std::string& s, UTF8Status status)
|
||||
{
|
||||
if (status == UTF8Status::InvalidTrailingByte)
|
||||
{
|
||||
// remove the partial UTF-8 sequence
|
||||
std::string::size_type pos = s.size();
|
||||
while (pos > 0)
|
||||
{
|
||||
unsigned char u = static_cast<unsigned char>(s[--pos]);
|
||||
if (u >= 0xc0) { break; }
|
||||
}
|
||||
|
||||
s.resize(pos);
|
||||
}
|
||||
|
||||
if (s.size() <= maxTokenLength - std::strlen(UTF8_REPLACEMENT_CHAR))
|
||||
{
|
||||
s.append(UTF8_REPLACEMENT_CHAR);
|
||||
return true;
|
||||
}
|
||||
|
||||
reportError("Token too long");
|
||||
return false;
|
||||
}
|
||||
} // end unnamed namespace
|
||||
|
||||
|
||||
Tokenizer::Tokenizer(std::istream* _in) :
|
||||
|
@ -92,6 +122,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
|||
TokenType newToken = TokenBegin;
|
||||
int unicodeDigits = 0;
|
||||
char unicode[5] = {};
|
||||
UTF8Status utf8Status = UTF8Status::Ok;
|
||||
|
||||
while (newToken == TokenBegin)
|
||||
{
|
||||
|
@ -102,6 +133,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
|||
}
|
||||
else
|
||||
{
|
||||
utf8Status = UTF8Status::Ok;
|
||||
in->get(nextChar);
|
||||
if (in->eof())
|
||||
{
|
||||
|
@ -113,15 +145,18 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
|||
newToken = TokenError;
|
||||
break;
|
||||
}
|
||||
else if (!validator.check(nextChar))
|
||||
else
|
||||
{
|
||||
reportError("Invalid UTF-8 sequence detected");
|
||||
newToken = TokenError;
|
||||
break;
|
||||
}
|
||||
else if (nextChar == '\n')
|
||||
{
|
||||
++lineNumber;
|
||||
utf8Status = validator.check(nextChar);
|
||||
if (utf8Status != UTF8Status::Ok && !hasUtf8Errors)
|
||||
{
|
||||
reportError("Invalid UTF-8 sequence detected");
|
||||
hasUtf8Errors = true;
|
||||
}
|
||||
else if (nextChar == '\n')
|
||||
{
|
||||
++lineNumber;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -378,6 +413,10 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
|||
reportError("Unexpected EOF in string");
|
||||
newToken = TokenError;
|
||||
}
|
||||
else if (utf8Status != UTF8Status::Ok)
|
||||
{
|
||||
if (!handleUtf8Error(textToken, utf8Status)) { newToken = TokenError; }
|
||||
}
|
||||
else if (nextChar == '\\')
|
||||
{
|
||||
state = State::StringEscape;
|
||||
|
@ -437,7 +476,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
|||
if (unicodeDigits == 4)
|
||||
{
|
||||
auto unicodeValue = static_cast<std::uint32_t>(std::strtoul(unicode, nullptr, 16));
|
||||
if (textToken.size() + UTF8EncodedSize(unicodeValue) <= maxTokenLength)
|
||||
if (textToken.size() <= maxTokenLength - UTF8EncodedSizeChecked(unicodeValue))
|
||||
{
|
||||
UTF8Encode(unicodeValue, textToken);
|
||||
state = State::String;
|
||||
|
|
|
@ -60,6 +60,7 @@ private:
|
|||
int lineNumber{ 1 };
|
||||
char nextChar{ '\0' };
|
||||
bool reprocess{ false };
|
||||
bool hasUtf8Errors{ false };
|
||||
|
||||
bool skipUtf8Bom();
|
||||
};
|
||||
|
|
|
@ -1008,50 +1008,50 @@ std::vector<std::string> getGreekCompletion(const std::string &s)
|
|||
return ret;
|
||||
}
|
||||
|
||||
bool
|
||||
UTF8Status
|
||||
UTF8Validator::check(char c)
|
||||
{
|
||||
return check(static_cast<unsigned char>(c));
|
||||
}
|
||||
|
||||
bool
|
||||
UTF8Status
|
||||
UTF8Validator::check(unsigned char c)
|
||||
{
|
||||
switch (state)
|
||||
{
|
||||
case State::Initial:
|
||||
if (c < 0x80) { return true; }
|
||||
if (c < 0xc2) { return false; }
|
||||
if (c < 0xe0) { state = State::Continuation1; return true; }
|
||||
if (c == 0xe0) { state = State::E0Continuation; return true; }
|
||||
if (c < 0xed) { state = State::Continuation2; return true; }
|
||||
if (c== 0xed) { state = State::EDContinuation; return true; }
|
||||
if (c < 0xf0) { state = State::Continuation2; return true; }
|
||||
if (c == 0xf0) { state = State::F0Continuation; return true; }
|
||||
if (c < 0xf4) { state = State::Continuation3; return true; }
|
||||
if (c == 0xf4) { state = State::F4Continuation; return true; }
|
||||
return false;
|
||||
if (c < 0x80) { return UTF8Status::Ok; }
|
||||
if (c < 0xc2) { return UTF8Status::InvalidFirstByte; }
|
||||
if (c < 0xe0) { state = State::Continuation1; return UTF8Status::Ok; }
|
||||
if (c == 0xe0) { state = State::E0Continuation; return UTF8Status::Ok; }
|
||||
if (c < 0xed) { state = State::Continuation2; return UTF8Status::Ok; }
|
||||
if (c== 0xed) { state = State::EDContinuation; return UTF8Status::Ok; }
|
||||
if (c < 0xf0) { state = State::Continuation2; return UTF8Status::Ok; }
|
||||
if (c == 0xf0) { state = State::F0Continuation; return UTF8Status::Ok; }
|
||||
if (c < 0xf4) { state = State::Continuation3; return UTF8Status::Ok; }
|
||||
if (c == 0xf4) { state = State::F4Continuation; return UTF8Status::Ok; }
|
||||
return UTF8Status::InvalidFirstByte;
|
||||
case State::Continuation1:
|
||||
if (c >= 0x80 && c < 0xc0) { state = State::Initial; return true; }
|
||||
if (c >= 0x80 && c < 0xc0) { state = State::Initial; return UTF8Status::Ok; }
|
||||
break;
|
||||
case State::Continuation2:
|
||||
if (c >= 0x80 && c < 0xc0) { state = State::Continuation1; return true; }
|
||||
if (c >= 0x80 && c < 0xc0) { state = State::Continuation1; return UTF8Status::Ok; }
|
||||
break;
|
||||
case State::Continuation3:
|
||||
if (c >= 0x80 && c < 0xc0) { state = State::Continuation2; return true; }
|
||||
if (c >= 0x80 && c < 0xc0) { state = State::Continuation2; return UTF8Status::Ok; }
|
||||
break;
|
||||
case State::E0Continuation: // disallow overlong sequences
|
||||
if (c >= 0xa0 && c < 0xc0) { state = State::Continuation1; return true; }
|
||||
if (c >= 0xa0 && c < 0xc0) { state = State::Continuation1; return UTF8Status::Ok; }
|
||||
break;
|
||||
case State::EDContinuation: // disallow surrogate pairs
|
||||
if (c >= 0x80 && c < 0xa0) { state = State::Continuation1; return true; }
|
||||
if (c >= 0x80 && c < 0xa0) { state = State::Continuation1; return UTF8Status::Ok; }
|
||||
break;
|
||||
case State::F0Continuation: // disallow overlong sequences
|
||||
if (c >= 0x90 && c < 0xc0) { state = State::Continuation2; return true; }
|
||||
if (c >= 0x90 && c < 0xc0) { state = State::Continuation2; return UTF8Status::Ok; }
|
||||
break;
|
||||
case State::F4Continuation: // disallow out-of-range
|
||||
if (c >= 0x80 && c < 0x90) { state = State::Continuation2; return true; }
|
||||
if (c >= 0x80 && c < 0x90) { state = State::Continuation2; return UTF8Status::Ok; }
|
||||
}
|
||||
state = State::Initial;
|
||||
return false;
|
||||
return UTF8Status::InvalidTrailingByte;
|
||||
}
|
||||
|
|
|
@ -66,6 +66,24 @@ inline int UTF8EncodedSize(wchar_t ch)
|
|||
#endif
|
||||
}
|
||||
|
||||
// Number of bytes needed to encode the code point ch in UTF-8.
//
// ch: a 32-bit code point value; it need not be a valid Unicode scalar.
//
// Returns 1-4 for values below 0x110000. Values at or above 0x110000 are out
// of range and are sized as U+FFFD REPLACEMENT CHARACTER (3 bytes).
//
// The argument is always 32 bits wide, so the result must not depend on the
// platform's wchar_t width: a supplementary-plane code point needs 4 bytes
// even where wchar_t is 16 bits.
constexpr inline int UTF8EncodedSizeChecked(std::uint32_t ch)
{
    if (ch < 0x80)
        return 1;
    if (ch < 0x800)
        return 2;
    if (ch < 0x10000)
        return 3;
    if (ch < 0x110000)
        return 4;
    // out-of-range: assume U+FFFD REPLACEMENT CHARACTER
    return 3;
}
|
||||
|
||||
inline int UTF8EncodedSizeFromFirstByte(unsigned int ch)
|
||||
{
|
||||
if (ch < 0x80)
|
||||
|
@ -136,14 +154,21 @@ class Greek
|
|||
|
||||
std::vector<std::string> getGreekCompletion(const std::string &);
|
||||
|
||||
// Result of feeding a single byte to UTF8Validator::check().
enum class UTF8Status
|
||||
{
|
||||
// The byte was accepted: an ASCII byte, a valid lead byte, or a
// continuation byte within the range allowed by the preceding bytes.
Ok,
|
||||
// The validator was expecting the start of a character and the byte is not
// a valid lead byte (0x80-0xc1, or anything above 0xf4).
InvalidFirstByte,
|
||||
// The validator was inside a multi-byte sequence and the byte is not an
// acceptable continuation; the validator resets to its initial state.
InvalidTrailingByte,
|
||||
};
|
||||
|
||||
class UTF8Validator
|
||||
{
|
||||
public:
|
||||
UTF8Validator() = default;
|
||||
~UTF8Validator() = default;
|
||||
|
||||
bool check(char c);
|
||||
bool check(unsigned char c);
|
||||
UTF8Status check(char c);
|
||||
UTF8Status check(unsigned char c);
|
||||
|
||||
private:
|
||||
enum class State
|
||||
|
|
Loading…
Reference in New Issue