More forgiving handling of invalid UTF-8 sequences

- Report an error (once per stream)
- In string literals, append U+FFFD REPLACEMENT CHARACTER
pull/1173/head
Andrew Tribick 2021-11-16 22:13:08 +01:00 committed by ajtribick
parent 8ae1fcb88e
commit e92c3d9bd5
4 changed files with 97 additions and 32 deletions

View File

@ -11,6 +11,7 @@
#include <cctype>
#include <cerrno>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include "utf8.h"
@ -36,16 +37,19 @@ enum class State
Comment,
};
// Emits a diagnostic: the given message followed by a single newline,
// written to the standard error stream.
void reportError(const char* message)
{
    std::ostream& errorStream = std::cerr;
    errorStream << message;
    errorStream << '\n';
}
// Reports whether c is a separator, i.e. any character that is not a
// decimal digit, a letter, or '.'.
//
// The character is converted to unsigned char before classification:
// calling std::isdigit/std::isalpha with a negative value other than EOF is
// undefined behavior, and bytes >= 0x80 are negative when plain char is
// signed.
bool isSeparator(char c)
{
    const auto uc = static_cast<unsigned char>(c);
    return !std::isdigit(uc) && !std::isalpha(uc) && c != '.';
}
bool tryPushBack(std::string& s, char c)
{
if (s.size() < maxTokenLength)
@ -57,7 +61,33 @@ bool tryPushBack(std::string& s, char c)
reportError("Token too long");
return false;
}
// Recovers from an invalid UTF-8 byte encountered while building the token s.
//
// For UTF8Status::InvalidTrailingByte, the incomplete multi-byte sequence at
// the end of s (any trailing continuation bytes plus the lead byte that
// started them) is removed first. In all cases UTF8_REPLACEMENT_CHAR is then
// appended in place of the bad data.
//
// Returns true when the replacement was appended. Returns false, after
// reporting "Token too long", when appending it would exceed maxTokenLength.
bool handleUtf8Error(std::string& s, UTF8Status status)
{
    if (status == UTF8Status::InvalidTrailingByte)
    {
        // Walk backwards over the partial sequence. Stop *before* an ASCII
        // byte: it cannot belong to a multi-byte sequence, so it (and
        // everything preceding it) is valid token content that must be kept
        // even if no lead byte is found.
        std::string::size_type pos = s.size();
        while (pos > 0)
        {
            const auto u = static_cast<unsigned char>(s[pos - 1]);
            if (u < 0x80) { break; }
            --pos;
            // bytes >= 0xc0 are lead bytes: the sequence starts here
            if (u >= 0xc0) { break; }
        }

        s.resize(pos);
    }

    // Written as a subtraction on the limit so the comparison cannot
    // overflow for a token that is already near maxTokenLength.
    if (s.size() <= maxTokenLength - std::strlen(UTF8_REPLACEMENT_CHAR))
    {
        s.append(UTF8_REPLACEMENT_CHAR);
        return true;
    }

    reportError("Token too long");
    return false;
}
} // end unnamed namespace
Tokenizer::Tokenizer(std::istream* _in) :
@ -92,6 +122,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
TokenType newToken = TokenBegin;
int unicodeDigits = 0;
char unicode[5] = {};
UTF8Status utf8Status = UTF8Status::Ok;
while (newToken == TokenBegin)
{
@ -102,6 +133,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
}
else
{
utf8Status = UTF8Status::Ok;
in->get(nextChar);
if (in->eof())
{
@ -113,15 +145,18 @@ Tokenizer::TokenType Tokenizer::nextToken()
newToken = TokenError;
break;
}
else if (!validator.check(nextChar))
else
{
reportError("Invalid UTF-8 sequence detected");
newToken = TokenError;
break;
}
else if (nextChar == '\n')
{
++lineNumber;
utf8Status = validator.check(nextChar);
if (utf8Status != UTF8Status::Ok && !hasUtf8Errors)
{
reportError("Invalid UTF-8 sequence detected");
hasUtf8Errors = true;
}
else if (nextChar == '\n')
{
++lineNumber;
}
}
}
@ -378,6 +413,10 @@ Tokenizer::TokenType Tokenizer::nextToken()
reportError("Unexpected EOF in string");
newToken = TokenError;
}
else if (utf8Status != UTF8Status::Ok)
{
if (!handleUtf8Error(textToken, utf8Status)) { newToken = TokenError; }
}
else if (nextChar == '\\')
{
state = State::StringEscape;
@ -437,7 +476,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
if (unicodeDigits == 4)
{
auto unicodeValue = static_cast<std::uint32_t>(std::strtoul(unicode, nullptr, 16));
if (textToken.size() + UTF8EncodedSize(unicodeValue) <= maxTokenLength)
if (textToken.size() <= maxTokenLength - UTF8EncodedSizeChecked(unicodeValue))
{
UTF8Encode(unicodeValue, textToken);
state = State::String;

View File

@ -60,6 +60,7 @@ private:
int lineNumber{ 1 };
char nextChar{ '\0' };
bool reprocess{ false };
bool hasUtf8Errors{ false };
bool skipUtf8Bom();
};

View File

@ -1008,50 +1008,50 @@ std::vector<std::string> getGreekCompletion(const std::string &s)
return ret;
}
bool
// Convenience overload for plain char input: reinterprets the byte as
// unsigned char and delegates to the unsigned char overload.
UTF8Status
UTF8Validator::check(char c)
{
    const auto byte = static_cast<unsigned char>(c);
    return check(byte);
}
bool
UTF8Status
UTF8Validator::check(unsigned char c)
{
switch (state)
{
case State::Initial:
if (c < 0x80) { return true; }
if (c < 0xc2) { return false; }
if (c < 0xe0) { state = State::Continuation1; return true; }
if (c == 0xe0) { state = State::E0Continuation; return true; }
if (c < 0xed) { state = State::Continuation2; return true; }
if (c== 0xed) { state = State::EDContinuation; return true; }
if (c < 0xf0) { state = State::Continuation2; return true; }
if (c == 0xf0) { state = State::F0Continuation; return true; }
if (c < 0xf4) { state = State::Continuation3; return true; }
if (c == 0xf4) { state = State::F4Continuation; return true; }
return false;
if (c < 0x80) { return UTF8Status::Ok; }
if (c < 0xc2) { return UTF8Status::InvalidFirstByte; }
if (c < 0xe0) { state = State::Continuation1; return UTF8Status::Ok; }
if (c == 0xe0) { state = State::E0Continuation; return UTF8Status::Ok; }
if (c < 0xed) { state = State::Continuation2; return UTF8Status::Ok; }
if (c== 0xed) { state = State::EDContinuation; return UTF8Status::Ok; }
if (c < 0xf0) { state = State::Continuation2; return UTF8Status::Ok; }
if (c == 0xf0) { state = State::F0Continuation; return UTF8Status::Ok; }
if (c < 0xf4) { state = State::Continuation3; return UTF8Status::Ok; }
if (c == 0xf4) { state = State::F4Continuation; return UTF8Status::Ok; }
return UTF8Status::InvalidFirstByte;
case State::Continuation1:
if (c >= 0x80 && c < 0xc0) { state = State::Initial; return true; }
if (c >= 0x80 && c < 0xc0) { state = State::Initial; return UTF8Status::Ok; }
break;
case State::Continuation2:
if (c >= 0x80 && c < 0xc0) { state = State::Continuation1; return true; }
if (c >= 0x80 && c < 0xc0) { state = State::Continuation1; return UTF8Status::Ok; }
break;
case State::Continuation3:
if (c >= 0x80 && c < 0xc0) { state = State::Continuation2; return true; }
if (c >= 0x80 && c < 0xc0) { state = State::Continuation2; return UTF8Status::Ok; }
break;
case State::E0Continuation: // disallow overlong sequences
if (c >= 0xa0 && c < 0xc0) { state = State::Continuation1; return true; }
if (c >= 0xa0 && c < 0xc0) { state = State::Continuation1; return UTF8Status::Ok; }
break;
case State::EDContinuation: // disallow surrogate pairs
if (c >= 0x80 && c < 0xa0) { state = State::Continuation1; return true; }
if (c >= 0x80 && c < 0xa0) { state = State::Continuation1; return UTF8Status::Ok; }
break;
case State::F0Continuation: // disallow overlong sequences
if (c >= 0x90 && c < 0xc0) { state = State::Continuation2; return true; }
if (c >= 0x90 && c < 0xc0) { state = State::Continuation2; return UTF8Status::Ok; }
break;
case State::F4Continuation: // disallow out-of-range
if (c >= 0x80 && c < 0x90) { state = State::Continuation2; return true; }
if (c >= 0x80 && c < 0x90) { state = State::Continuation2; return UTF8Status::Ok; }
}
state = State::Initial;
return false;
return UTF8Status::InvalidTrailingByte;
}

View File

@ -66,6 +66,24 @@ inline int UTF8EncodedSize(wchar_t ch)
#endif
}
// Returns the number of bytes needed to encode the code point ch in UTF-8.
//
// Values at or above 0x110000 are out of range and report 3, the encoded
// size of U+FFFD REPLACEMENT CHARACTER.
//
// The argument is a 32-bit integer, not a wchar_t, so the result depends
// only on the code point value: code points in U+10000..U+10FFFF need four
// bytes regardless of how wide wchar_t is on the target platform.
constexpr inline int UTF8EncodedSizeChecked(std::uint32_t ch)
{
    if (ch < 0x80)
        return 1;
    if (ch < 0x800)
        return 2;
    if (ch < 0x10000)
        return 3;
    if (ch < 0x110000)
        return 4;
    // out-of-range: assume U+FFFD REPLACEMENT CHARACTER
    return 3;
}
inline int UTF8EncodedSizeFromFirstByte(unsigned int ch)
{
if (ch < 0x80)
@ -136,14 +154,21 @@ class Greek
std::vector<std::string> getGreekCompletion(const std::string &);
// Result of feeding one byte to UTF8Validator::check.
enum class UTF8Status
{
// The byte is acceptable at this point in the stream.
Ok,
// The byte was seen where a new sequence must begin but cannot start one.
InvalidFirstByte,
// The byte was seen inside a multi-byte sequence but is not a permitted
// continuation for it.
InvalidTrailingByte,
};
class UTF8Validator
{
public:
UTF8Validator() = default;
~UTF8Validator() = default;
bool check(char c);
bool check(unsigned char c);
UTF8Status check(char c);
UTF8Status check(unsigned char c);
private:
enum class State