More forgiving handling of invalid UTF-8 sequences

- Report an error (once per stream)
- In string literals, append U+FFFD REPLACEMENT CHARACTER
pull/1173/head
Andrew Tribick 2021-11-16 22:13:08 +01:00 committed by ajtribick
parent 8ae1fcb88e
commit e92c3d9bd5
4 changed files with 97 additions and 32 deletions

View File

@ -11,6 +11,7 @@
#include <cctype> #include <cctype>
#include <cerrno> #include <cerrno>
#include <cstdlib> #include <cstdlib>
#include <cstring>
#include <iostream> #include <iostream>
#include "utf8.h" #include "utf8.h"
@ -36,16 +37,19 @@ enum class State
Comment, Comment,
}; };
// Writes a diagnostic message, followed by a newline, to the standard
// error stream. The stream is not explicitly flushed.
void reportError(const char* message)
{
    std::ostream& err = std::cerr;
    err << message;
    err << '\n';
}
// Returns true when c terminates a number/name token, i.e. when it is
// neither a digit, nor a letter, nor '.'.
//
// The <cctype> classification functions have undefined behaviour when given
// a negative value other than EOF, which is what a plain `char` holding a
// byte >= 0x80 becomes on platforms where char is signed. The byte is
// therefore converted to unsigned char before classification.
bool isSeparator(char c)
{
    const auto uc = static_cast<unsigned char>(c);
    return !std::isdigit(uc) && !std::isalpha(uc) && c != '.';
}
bool tryPushBack(std::string& s, char c) bool tryPushBack(std::string& s, char c)
{ {
if (s.size() < maxTokenLength) if (s.size() < maxTokenLength)
@ -57,7 +61,33 @@ bool tryPushBack(std::string& s, char c)
reportError("Token too long"); reportError("Token too long");
return false; return false;
} }
// Recovers from an invalid UTF-8 byte encountered while building a token.
//
// s       - the token text accumulated so far; modified in place.
// status  - the validator result for the offending byte.
//
// For UTF8Status::InvalidTrailingByte the incomplete multi-byte sequence at
// the end of s (a lead byte plus any continuation bytes already appended)
// is removed first. In every case U+FFFD REPLACEMENT CHARACTER is then
// appended, provided the result still fits within maxTokenLength.
//
// Returns true if the replacement character was appended; otherwise reports
// "Token too long" and returns false.
bool handleUtf8Error(std::string& s, UTF8Status status)
{
    if (status == UTF8Status::InvalidTrailingByte)
    {
        // An incomplete sequence is a lead byte followed by at most two
        // continuation bytes. Only look that far back, and only truncate
        // when the tail really is incomplete, so that ASCII text and
        // complete characters already in the token are never discarded.
        const std::string::size_type size = s.size();
        std::string::size_type trailing = 0;
        while (trailing < size && trailing < 3 &&
               (static_cast<unsigned char>(s[size - 1 - trailing]) & 0xc0) == 0x80)
        {
            ++trailing;
        }

        if (trailing < size)
        {
            const auto lead = static_cast<unsigned char>(s[size - 1 - trailing]);

            // Total sequence length announced by the lead byte; 0 means the
            // byte is not a lead byte at all.
            std::string::size_type expected = 0;
            if (lead >= 0xf0)
                expected = 4;
            else if (lead >= 0xe0)
                expected = 3;
            else if (lead >= 0xc0)
                expected = 2;

            if (trailing + 1 < expected)
                s.resize(size - 1 - trailing);
        }
    }

    if (s.size() <= maxTokenLength - std::strlen(UTF8_REPLACEMENT_CHAR))
    {
        s.append(UTF8_REPLACEMENT_CHAR);
        return true;
    }

    reportError("Token too long");
    return false;
}
} // end unnamed namespace
Tokenizer::Tokenizer(std::istream* _in) : Tokenizer::Tokenizer(std::istream* _in) :
@ -92,6 +122,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
TokenType newToken = TokenBegin; TokenType newToken = TokenBegin;
int unicodeDigits = 0; int unicodeDigits = 0;
char unicode[5] = {}; char unicode[5] = {};
UTF8Status utf8Status = UTF8Status::Ok;
while (newToken == TokenBegin) while (newToken == TokenBegin)
{ {
@ -102,6 +133,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
} }
else else
{ {
utf8Status = UTF8Status::Ok;
in->get(nextChar); in->get(nextChar);
if (in->eof()) if (in->eof())
{ {
@ -113,15 +145,18 @@ Tokenizer::TokenType Tokenizer::nextToken()
newToken = TokenError; newToken = TokenError;
break; break;
} }
else if (!validator.check(nextChar)) else
{ {
reportError("Invalid UTF-8 sequence detected"); utf8Status = validator.check(nextChar);
newToken = TokenError; if (utf8Status != UTF8Status::Ok && !hasUtf8Errors)
break; {
} reportError("Invalid UTF-8 sequence detected");
else if (nextChar == '\n') hasUtf8Errors = true;
{ }
++lineNumber; else if (nextChar == '\n')
{
++lineNumber;
}
} }
} }
@ -378,6 +413,10 @@ Tokenizer::TokenType Tokenizer::nextToken()
reportError("Unexpected EOF in string"); reportError("Unexpected EOF in string");
newToken = TokenError; newToken = TokenError;
} }
else if (utf8Status != UTF8Status::Ok)
{
if (!handleUtf8Error(textToken, utf8Status)) { newToken = TokenError; }
}
else if (nextChar == '\\') else if (nextChar == '\\')
{ {
state = State::StringEscape; state = State::StringEscape;
@ -437,7 +476,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
if (unicodeDigits == 4) if (unicodeDigits == 4)
{ {
auto unicodeValue = static_cast<std::uint32_t>(std::strtoul(unicode, nullptr, 16)); auto unicodeValue = static_cast<std::uint32_t>(std::strtoul(unicode, nullptr, 16));
if (textToken.size() + UTF8EncodedSize(unicodeValue) <= maxTokenLength) if (textToken.size() <= maxTokenLength - UTF8EncodedSizeChecked(unicodeValue))
{ {
UTF8Encode(unicodeValue, textToken); UTF8Encode(unicodeValue, textToken);
state = State::String; state = State::String;

View File

@ -60,6 +60,7 @@ private:
int lineNumber{ 1 }; int lineNumber{ 1 };
char nextChar{ '\0' }; char nextChar{ '\0' };
bool reprocess{ false }; bool reprocess{ false };
bool hasUtf8Errors{ false };
bool skipUtf8Bom(); bool skipUtf8Bom();
}; };

View File

@ -1008,50 +1008,50 @@ std::vector<std::string> getGreekCompletion(const std::string &s)
return ret; return ret;
} }
bool UTF8Status
UTF8Validator::check(char c) UTF8Validator::check(char c)
{ {
return check(static_cast<unsigned char>(c)); return check(static_cast<unsigned char>(c));
} }
bool UTF8Status
UTF8Validator::check(unsigned char c) UTF8Validator::check(unsigned char c)
{ {
switch (state) switch (state)
{ {
case State::Initial: case State::Initial:
if (c < 0x80) { return true; } if (c < 0x80) { return UTF8Status::Ok; }
if (c < 0xc2) { return false; } if (c < 0xc2) { return UTF8Status::InvalidFirstByte; }
if (c < 0xe0) { state = State::Continuation1; return true; } if (c < 0xe0) { state = State::Continuation1; return UTF8Status::Ok; }
if (c == 0xe0) { state = State::E0Continuation; return true; } if (c == 0xe0) { state = State::E0Continuation; return UTF8Status::Ok; }
if (c < 0xed) { state = State::Continuation2; return true; } if (c < 0xed) { state = State::Continuation2; return UTF8Status::Ok; }
if (c== 0xed) { state = State::EDContinuation; return true; } if (c== 0xed) { state = State::EDContinuation; return UTF8Status::Ok; }
if (c < 0xf0) { state = State::Continuation2; return true; } if (c < 0xf0) { state = State::Continuation2; return UTF8Status::Ok; }
if (c == 0xf0) { state = State::F0Continuation; return true; } if (c == 0xf0) { state = State::F0Continuation; return UTF8Status::Ok; }
if (c < 0xf4) { state = State::Continuation3; return true; } if (c < 0xf4) { state = State::Continuation3; return UTF8Status::Ok; }
if (c == 0xf4) { state = State::F4Continuation; return true; } if (c == 0xf4) { state = State::F4Continuation; return UTF8Status::Ok; }
return false; return UTF8Status::InvalidFirstByte;
case State::Continuation1: case State::Continuation1:
if (c >= 0x80 && c < 0xc0) { state = State::Initial; return true; } if (c >= 0x80 && c < 0xc0) { state = State::Initial; return UTF8Status::Ok; }
break; break;
case State::Continuation2: case State::Continuation2:
if (c >= 0x80 && c < 0xc0) { state = State::Continuation1; return true; } if (c >= 0x80 && c < 0xc0) { state = State::Continuation1; return UTF8Status::Ok; }
break; break;
case State::Continuation3: case State::Continuation3:
if (c >= 0x80 && c < 0xc0) { state = State::Continuation2; return true; } if (c >= 0x80 && c < 0xc0) { state = State::Continuation2; return UTF8Status::Ok; }
break; break;
case State::E0Continuation: // disallow overlong sequences case State::E0Continuation: // disallow overlong sequences
if (c >= 0xa0 && c < 0xc0) { state = State::Continuation1; return true; } if (c >= 0xa0 && c < 0xc0) { state = State::Continuation1; return UTF8Status::Ok; }
break; break;
case State::EDContinuation: // disallow surrogate pairs case State::EDContinuation: // disallow surrogate pairs
if (c >= 0x80 && c < 0xa0) { state = State::Continuation1; return true; } if (c >= 0x80 && c < 0xa0) { state = State::Continuation1; return UTF8Status::Ok; }
break; break;
case State::F0Continuation: // disallow overlong sequences case State::F0Continuation: // disallow overlong sequences
if (c >= 0x90 && c < 0xc0) { state = State::Continuation2; return true; } if (c >= 0x90 && c < 0xc0) { state = State::Continuation2; return UTF8Status::Ok; }
break; break;
case State::F4Continuation: // disallow out-of-range case State::F4Continuation: // disallow out-of-range
if (c >= 0x80 && c < 0x90) { state = State::Continuation2; return true; } if (c >= 0x80 && c < 0x90) { state = State::Continuation2; return UTF8Status::Ok; }
} }
state = State::Initial; state = State::Initial;
return false; return UTF8Status::InvalidTrailingByte;
} }

View File

@ -66,6 +66,24 @@ inline int UTF8EncodedSize(wchar_t ch)
#endif #endif
} }
// Returns the number of bytes occupied by the UTF-8 encoding of ch.
//
// Values below 0x80 take one byte and values below 0x800 take two. Where
// wchar_t is wider than 16 bits, values in [0x10000, 0x110000) take four
// bytes and values at or above 0x110000 are out of range: they are sized as
// U+FFFD REPLACEMENT CHARACTER, i.e. three bytes. Every remaining value is
// reported as three bytes.
constexpr inline int UTF8EncodedSizeChecked(std::uint32_t ch)
{
    if (ch < 0x80) { return 1; }
    if (ch < 0x800) { return 2; }
#if WCHAR_MAX > 0xFFFFu
    // Only the supplementary planes need four bytes; the rest of the BMP and
    // out-of-range values (treated as U+FFFD) need three.
    return (ch >= 0x10000 && ch < 0x110000) ? 4 : 3;
#else
    return 3;
#endif
}
inline int UTF8EncodedSizeFromFirstByte(unsigned int ch) inline int UTF8EncodedSizeFromFirstByte(unsigned int ch)
{ {
if (ch < 0x80) if (ch < 0x80)
@ -136,14 +154,21 @@ class Greek
std::vector<std::string> getGreekCompletion(const std::string &); std::vector<std::string> getGreekCompletion(const std::string &);
// Outcome of validating a single byte of a UTF-8 stream.
enum class UTF8Status
{
    // The byte is acceptable at its position in the stream.
    Ok                  = 0,
    // The byte was rejected as the first byte of a sequence.
    InvalidFirstByte    = 1,
    // The byte was rejected as a trailing byte of a multi-byte sequence.
    InvalidTrailingByte = 2,
};
class UTF8Validator class UTF8Validator
{ {
public: public:
UTF8Validator() = default; UTF8Validator() = default;
~UTF8Validator() = default; ~UTF8Validator() = default;
bool check(char c); UTF8Status check(char c);
bool check(unsigned char c); UTF8Status check(unsigned char c);
private: private:
enum class State enum class State