More forgiving handling of invalid UTF-8 sequences
- Report an error (once per stream)
- In string literals, append U+FFFD REPLACEMENT CHARACTER
pull/1173/head
parent
8ae1fcb88e
commit
e92c3d9bd5
|
@ -11,6 +11,7 @@
|
||||||
#include <cctype>
|
#include <cctype>
|
||||||
#include <cerrno>
|
#include <cerrno>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
#include "utf8.h"
|
#include "utf8.h"
|
||||||
|
@ -36,16 +37,19 @@ enum class State
|
||||||
Comment,
|
Comment,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
void reportError(const char* message)
|
void reportError(const char* message)
|
||||||
{
|
{
|
||||||
std::cerr << message << '\n';
|
std::cerr << message << '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool isSeparator(char c)
|
bool isSeparator(char c)
|
||||||
{
|
{
|
||||||
return !std::isdigit(c) && !std::isalpha(c) && c != '.';
|
return !std::isdigit(c) && !std::isalpha(c) && c != '.';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool tryPushBack(std::string& s, char c)
|
bool tryPushBack(std::string& s, char c)
|
||||||
{
|
{
|
||||||
if (s.size() < maxTokenLength)
|
if (s.size() < maxTokenLength)
|
||||||
|
@ -57,7 +61,33 @@ bool tryPushBack(std::string& s, char c)
|
||||||
reportError("Token too long");
|
reportError("Token too long");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool handleUtf8Error(std::string& s, UTF8Status status)
|
||||||
|
{
|
||||||
|
if (status == UTF8Status::InvalidTrailingByte)
|
||||||
|
{
|
||||||
|
// remove the partial UTF-8 sequence
|
||||||
|
std::string::size_type pos = s.size();
|
||||||
|
while (pos > 0)
|
||||||
|
{
|
||||||
|
unsigned char u = static_cast<unsigned char>(s[--pos]);
|
||||||
|
if (u >= 0xc0) { break; }
|
||||||
|
}
|
||||||
|
|
||||||
|
s.resize(pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s.size() <= maxTokenLength - std::strlen(UTF8_REPLACEMENT_CHAR))
|
||||||
|
{
|
||||||
|
s.append(UTF8_REPLACEMENT_CHAR);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
reportError("Token too long");
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
} // end unnamed namespace
|
||||||
|
|
||||||
|
|
||||||
Tokenizer::Tokenizer(std::istream* _in) :
|
Tokenizer::Tokenizer(std::istream* _in) :
|
||||||
|
@ -92,6 +122,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
TokenType newToken = TokenBegin;
|
TokenType newToken = TokenBegin;
|
||||||
int unicodeDigits = 0;
|
int unicodeDigits = 0;
|
||||||
char unicode[5] = {};
|
char unicode[5] = {};
|
||||||
|
UTF8Status utf8Status = UTF8Status::Ok;
|
||||||
|
|
||||||
while (newToken == TokenBegin)
|
while (newToken == TokenBegin)
|
||||||
{
|
{
|
||||||
|
@ -102,6 +133,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
utf8Status = UTF8Status::Ok;
|
||||||
in->get(nextChar);
|
in->get(nextChar);
|
||||||
if (in->eof())
|
if (in->eof())
|
||||||
{
|
{
|
||||||
|
@ -113,15 +145,18 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
newToken = TokenError;
|
newToken = TokenError;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
else if (!validator.check(nextChar))
|
else
|
||||||
{
|
{
|
||||||
reportError("Invalid UTF-8 sequence detected");
|
utf8Status = validator.check(nextChar);
|
||||||
newToken = TokenError;
|
if (utf8Status != UTF8Status::Ok && !hasUtf8Errors)
|
||||||
break;
|
{
|
||||||
}
|
reportError("Invalid UTF-8 sequence detected");
|
||||||
else if (nextChar == '\n')
|
hasUtf8Errors = true;
|
||||||
{
|
}
|
||||||
++lineNumber;
|
else if (nextChar == '\n')
|
||||||
|
{
|
||||||
|
++lineNumber;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -378,6 +413,10 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
reportError("Unexpected EOF in string");
|
reportError("Unexpected EOF in string");
|
||||||
newToken = TokenError;
|
newToken = TokenError;
|
||||||
}
|
}
|
||||||
|
else if (utf8Status != UTF8Status::Ok)
|
||||||
|
{
|
||||||
|
if (!handleUtf8Error(textToken, utf8Status)) { newToken = TokenError; }
|
||||||
|
}
|
||||||
else if (nextChar == '\\')
|
else if (nextChar == '\\')
|
||||||
{
|
{
|
||||||
state = State::StringEscape;
|
state = State::StringEscape;
|
||||||
|
@ -437,7 +476,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
|
||||||
if (unicodeDigits == 4)
|
if (unicodeDigits == 4)
|
||||||
{
|
{
|
||||||
auto unicodeValue = static_cast<std::uint32_t>(std::strtoul(unicode, nullptr, 16));
|
auto unicodeValue = static_cast<std::uint32_t>(std::strtoul(unicode, nullptr, 16));
|
||||||
if (textToken.size() + UTF8EncodedSize(unicodeValue) <= maxTokenLength)
|
if (textToken.size() <= maxTokenLength - UTF8EncodedSizeChecked(unicodeValue))
|
||||||
{
|
{
|
||||||
UTF8Encode(unicodeValue, textToken);
|
UTF8Encode(unicodeValue, textToken);
|
||||||
state = State::String;
|
state = State::String;
|
||||||
|
|
|
@ -60,6 +60,7 @@ private:
|
||||||
int lineNumber{ 1 };
|
int lineNumber{ 1 };
|
||||||
char nextChar{ '\0' };
|
char nextChar{ '\0' };
|
||||||
bool reprocess{ false };
|
bool reprocess{ false };
|
||||||
|
bool hasUtf8Errors{ false };
|
||||||
|
|
||||||
bool skipUtf8Bom();
|
bool skipUtf8Bom();
|
||||||
};
|
};
|
||||||
|
|
|
@ -1008,50 +1008,50 @@ std::vector<std::string> getGreekCompletion(const std::string &s)
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool
|
UTF8Status
|
||||||
UTF8Validator::check(char c)
|
UTF8Validator::check(char c)
|
||||||
{
|
{
|
||||||
return check(static_cast<unsigned char>(c));
|
return check(static_cast<unsigned char>(c));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool
|
UTF8Status
|
||||||
UTF8Validator::check(unsigned char c)
|
UTF8Validator::check(unsigned char c)
|
||||||
{
|
{
|
||||||
switch (state)
|
switch (state)
|
||||||
{
|
{
|
||||||
case State::Initial:
|
case State::Initial:
|
||||||
if (c < 0x80) { return true; }
|
if (c < 0x80) { return UTF8Status::Ok; }
|
||||||
if (c < 0xc2) { return false; }
|
if (c < 0xc2) { return UTF8Status::InvalidFirstByte; }
|
||||||
if (c < 0xe0) { state = State::Continuation1; return true; }
|
if (c < 0xe0) { state = State::Continuation1; return UTF8Status::Ok; }
|
||||||
if (c == 0xe0) { state = State::E0Continuation; return true; }
|
if (c == 0xe0) { state = State::E0Continuation; return UTF8Status::Ok; }
|
||||||
if (c < 0xed) { state = State::Continuation2; return true; }
|
if (c < 0xed) { state = State::Continuation2; return UTF8Status::Ok; }
|
||||||
if (c== 0xed) { state = State::EDContinuation; return true; }
|
if (c== 0xed) { state = State::EDContinuation; return UTF8Status::Ok; }
|
||||||
if (c < 0xf0) { state = State::Continuation2; return true; }
|
if (c < 0xf0) { state = State::Continuation2; return UTF8Status::Ok; }
|
||||||
if (c == 0xf0) { state = State::F0Continuation; return true; }
|
if (c == 0xf0) { state = State::F0Continuation; return UTF8Status::Ok; }
|
||||||
if (c < 0xf4) { state = State::Continuation3; return true; }
|
if (c < 0xf4) { state = State::Continuation3; return UTF8Status::Ok; }
|
||||||
if (c == 0xf4) { state = State::F4Continuation; return true; }
|
if (c == 0xf4) { state = State::F4Continuation; return UTF8Status::Ok; }
|
||||||
return false;
|
return UTF8Status::InvalidFirstByte;
|
||||||
case State::Continuation1:
|
case State::Continuation1:
|
||||||
if (c >= 0x80 && c < 0xc0) { state = State::Initial; return true; }
|
if (c >= 0x80 && c < 0xc0) { state = State::Initial; return UTF8Status::Ok; }
|
||||||
break;
|
break;
|
||||||
case State::Continuation2:
|
case State::Continuation2:
|
||||||
if (c >= 0x80 && c < 0xc0) { state = State::Continuation1; return true; }
|
if (c >= 0x80 && c < 0xc0) { state = State::Continuation1; return UTF8Status::Ok; }
|
||||||
break;
|
break;
|
||||||
case State::Continuation3:
|
case State::Continuation3:
|
||||||
if (c >= 0x80 && c < 0xc0) { state = State::Continuation2; return true; }
|
if (c >= 0x80 && c < 0xc0) { state = State::Continuation2; return UTF8Status::Ok; }
|
||||||
break;
|
break;
|
||||||
case State::E0Continuation: // disallow overlong sequences
|
case State::E0Continuation: // disallow overlong sequences
|
||||||
if (c >= 0xa0 && c < 0xc0) { state = State::Continuation1; return true; }
|
if (c >= 0xa0 && c < 0xc0) { state = State::Continuation1; return UTF8Status::Ok; }
|
||||||
break;
|
break;
|
||||||
case State::EDContinuation: // disallow surrogate pairs
|
case State::EDContinuation: // disallow surrogate pairs
|
||||||
if (c >= 0x80 && c < 0xa0) { state = State::Continuation1; return true; }
|
if (c >= 0x80 && c < 0xa0) { state = State::Continuation1; return UTF8Status::Ok; }
|
||||||
break;
|
break;
|
||||||
case State::F0Continuation: // disallow overlong sequences
|
case State::F0Continuation: // disallow overlong sequences
|
||||||
if (c >= 0x90 && c < 0xc0) { state = State::Continuation2; return true; }
|
if (c >= 0x90 && c < 0xc0) { state = State::Continuation2; return UTF8Status::Ok; }
|
||||||
break;
|
break;
|
||||||
case State::F4Continuation: // disallow out-of-range
|
case State::F4Continuation: // disallow out-of-range
|
||||||
if (c >= 0x80 && c < 0x90) { state = State::Continuation2; return true; }
|
if (c >= 0x80 && c < 0x90) { state = State::Continuation2; return UTF8Status::Ok; }
|
||||||
}
|
}
|
||||||
state = State::Initial;
|
state = State::Initial;
|
||||||
return false;
|
return UTF8Status::InvalidTrailingByte;
|
||||||
}
|
}
|
||||||
|
|
|
@ -66,6 +66,24 @@ inline int UTF8EncodedSize(wchar_t ch)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
constexpr inline int UTF8EncodedSizeChecked(std::uint32_t ch)
|
||||||
|
{
|
||||||
|
if (ch < 0x80)
|
||||||
|
return 1;
|
||||||
|
if (ch < 0x800)
|
||||||
|
return 2;
|
||||||
|
#if WCHAR_MAX > 0xFFFFu
|
||||||
|
if (ch < 0x10000)
|
||||||
|
#endif
|
||||||
|
return 3;
|
||||||
|
#if WCHAR_MAX > 0xFFFFu
|
||||||
|
if (ch < 0x110000)
|
||||||
|
return 4;
|
||||||
|
// out-of-range: assume U+FFFD REPLACEMENT CHARACTER
|
||||||
|
return 3;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
inline int UTF8EncodedSizeFromFirstByte(unsigned int ch)
|
inline int UTF8EncodedSizeFromFirstByte(unsigned int ch)
|
||||||
{
|
{
|
||||||
if (ch < 0x80)
|
if (ch < 0x80)
|
||||||
|
@ -136,14 +154,21 @@ class Greek
|
||||||
|
|
||||||
std::vector<std::string> getGreekCompletion(const std::string &);
|
std::vector<std::string> getGreekCompletion(const std::string &);
|
||||||
|
|
||||||
|
enum class UTF8Status
|
||||||
|
{
|
||||||
|
Ok,
|
||||||
|
InvalidFirstByte,
|
||||||
|
InvalidTrailingByte,
|
||||||
|
};
|
||||||
|
|
||||||
class UTF8Validator
|
class UTF8Validator
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
UTF8Validator() = default;
|
UTF8Validator() = default;
|
||||||
~UTF8Validator() = default;
|
~UTF8Validator() = default;
|
||||||
|
|
||||||
bool check(char c);
|
UTF8Status check(char c);
|
||||||
bool check(unsigned char c);
|
UTF8Status check(unsigned char c);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
enum class State
|
enum class State
|
||||||
|
|
Loading…
Reference in New Issue