624 lines
16 KiB
C++
624 lines
16 KiB
C++
// tokenizer.cpp
|
|
//
|
|
// Copyright (C) 2001-2021, the Celestia Development Team
|
|
// Original version by Chris Laurel <claurel@gmail.com>
|
|
//
|
|
// This program is free software; you can redistribute it and/or
|
|
// modify it under the terms of the GNU General Public License
|
|
// as published by the Free Software Foundation; either version 2
|
|
// of the License, or (at your option) any later version.
|
|
|
|
#include <cctype>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <iostream>
|
|
|
|
#include "logger.h"
|
|
#include "utf8.h"
|
|
#include "tokenizer.h"
|
|
|
|
using celestia::util::GetLogger;
|
|
|
|
namespace
|
|
{
|
|
constexpr std::string::size_type maxTokenLength = 1024;
|
|
|
|
enum class State
|
|
{
|
|
Start,
|
|
NumberSigned,
|
|
Number,
|
|
Fraction,
|
|
ExponentStart,
|
|
ExponentSigned,
|
|
Exponent,
|
|
Name,
|
|
String,
|
|
StringEscape,
|
|
UnicodeEscape,
|
|
Comment,
|
|
};
|
|
|
|
|
|
bool isSeparator(char c)
|
|
{
|
|
return !std::isdigit(c) && !std::isalpha(c) && c != '.';
|
|
}
|
|
|
|
|
|
bool tryPushBack(std::string& s, char c)
|
|
{
|
|
if (s.size() < maxTokenLength)
|
|
{
|
|
s.push_back(c);
|
|
return true;
|
|
}
|
|
|
|
GetLogger()->error("Token too long\n");
|
|
return false;
|
|
}
|
|
|
|
|
|
bool handleUtf8Error(std::string& s, UTF8Status status)
|
|
{
|
|
if (status == UTF8Status::InvalidTrailingByte)
|
|
{
|
|
// remove the partial UTF-8 sequence
|
|
std::string::size_type pos = s.size();
|
|
while (pos > 0)
|
|
{
|
|
unsigned char u = static_cast<unsigned char>(s[--pos]);
|
|
if (u >= 0xc0) { break; }
|
|
}
|
|
|
|
s.resize(pos);
|
|
}
|
|
|
|
if (s.size() <= maxTokenLength - std::strlen(UTF8_REPLACEMENT_CHAR))
|
|
{
|
|
s.append(UTF8_REPLACEMENT_CHAR);
|
|
return true;
|
|
}
|
|
|
|
GetLogger()->error("Token too long\n");
|
|
return false;
|
|
}
|
|
} // end unnamed namespace
|
|
|
|
|
|
Tokenizer::Tokenizer(std::istream* _in) :
|
|
in(_in)
|
|
{
|
|
textToken.reserve(maxTokenLength);
|
|
}
|
|
|
|
|
|
Tokenizer::TokenType Tokenizer::nextToken()
|
|
{
|
|
if (isPushedBack)
|
|
{
|
|
isPushedBack = false;
|
|
return tokenType;
|
|
}
|
|
|
|
if (isStart)
|
|
{
|
|
isStart = false;
|
|
if (!skipUtf8Bom())
|
|
{
|
|
tokenType = TokenError;
|
|
return tokenType;
|
|
}
|
|
}
|
|
|
|
UTF8Validator validator;
|
|
textToken.clear();
|
|
tokenValue = std::nan("");
|
|
State state = State::Start;
|
|
TokenType newToken = TokenBegin;
|
|
int unicodeDigits = 0;
|
|
char unicode[5] = {};
|
|
UTF8Status utf8Status = UTF8Status::Ok;
|
|
|
|
while (newToken == TokenBegin)
|
|
{
|
|
bool isEof = false;
|
|
if (reprocess)
|
|
{
|
|
reprocess = false;
|
|
}
|
|
else
|
|
{
|
|
utf8Status = UTF8Status::Ok;
|
|
in->get(nextChar);
|
|
if (in->eof())
|
|
{
|
|
isEof = true;
|
|
}
|
|
else if (in->fail())
|
|
{
|
|
GetLogger()->error("Unexpected error reading stream\n");
|
|
newToken = TokenError;
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
utf8Status = validator.check(nextChar);
|
|
if (utf8Status != UTF8Status::Ok && !hasUtf8Errors)
|
|
{
|
|
GetLogger()->error("Invalid UTF-8 sequence detected\n");
|
|
hasUtf8Errors = true;
|
|
}
|
|
else if (nextChar == '\n')
|
|
{
|
|
++lineNumber;
|
|
}
|
|
}
|
|
}
|
|
|
|
switch (state)
|
|
{
|
|
case State::Start:
|
|
if (isEof)
|
|
{
|
|
newToken = TokenEnd;
|
|
}
|
|
else if (std::isspace(nextChar))
|
|
{
|
|
// no-op
|
|
}
|
|
else if (std::isdigit(nextChar))
|
|
{
|
|
textToken.push_back(nextChar);
|
|
state = State::Number;
|
|
}
|
|
else if (nextChar == '-')
|
|
{
|
|
textToken.push_back(nextChar);
|
|
state = State::NumberSigned;
|
|
}
|
|
else if (nextChar == '+')
|
|
{
|
|
state = State::NumberSigned;
|
|
}
|
|
else if (nextChar == '.')
|
|
{
|
|
textToken.append("0.");
|
|
state = State::Fraction;
|
|
}
|
|
else if (std::isalpha(nextChar) || nextChar == '_')
|
|
{
|
|
textToken.push_back(nextChar);
|
|
state = State::Name;
|
|
}
|
|
else if (nextChar == '"')
|
|
{
|
|
state = State::String;
|
|
}
|
|
else if (nextChar == '#')
|
|
{
|
|
state = State::Comment;
|
|
}
|
|
else if (nextChar == '{')
|
|
{
|
|
newToken = TokenBeginGroup;
|
|
}
|
|
else if (nextChar == '}')
|
|
{
|
|
newToken = TokenEndGroup;
|
|
}
|
|
else if (nextChar == '[')
|
|
{
|
|
newToken = TokenBeginArray;
|
|
}
|
|
else if (nextChar == ']')
|
|
{
|
|
newToken = TokenEndArray;
|
|
}
|
|
else if (nextChar == '=')
|
|
{
|
|
newToken = TokenEquals;
|
|
}
|
|
else if (nextChar == '|')
|
|
{
|
|
newToken = TokenBar;
|
|
}
|
|
else if (nextChar == '<')
|
|
{
|
|
newToken = TokenBeginUnits;
|
|
}
|
|
else if (nextChar == '>')
|
|
{
|
|
newToken = TokenEndUnits;
|
|
}
|
|
else
|
|
{
|
|
GetLogger()->error("Bad character in stream\n");
|
|
newToken = TokenError;
|
|
}
|
|
break;
|
|
|
|
case State::NumberSigned:
|
|
if (isEof)
|
|
{
|
|
GetLogger()->error("Unexpected EOF in number\n");
|
|
newToken = TokenError;
|
|
}
|
|
else if (std::isdigit(nextChar))
|
|
{
|
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
|
state = State::Number;
|
|
}
|
|
else if (nextChar == '.')
|
|
{
|
|
if (textToken.size() + 2 <= maxTokenLength)
|
|
{
|
|
textToken.append("0.");
|
|
state = State::Fraction;
|
|
}
|
|
else
|
|
{
|
|
GetLogger()->error("Token too long\n");
|
|
newToken = TokenError;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
GetLogger()->error("Bad character in number\n");
|
|
newToken = TokenError;
|
|
}
|
|
break;
|
|
|
|
case State::Number:
|
|
if (isEof)
|
|
{
|
|
newToken = TokenNumber;
|
|
}
|
|
else if (std::isdigit(nextChar))
|
|
{
|
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
|
}
|
|
else if (nextChar == '.')
|
|
{
|
|
if (!tryPushBack(textToken, '.')) { newToken = TokenError; }
|
|
state = State::Fraction;
|
|
}
|
|
else if (nextChar == 'e' || nextChar == 'E')
|
|
{
|
|
if (!tryPushBack(textToken, 'e')) { newToken = TokenError; }
|
|
state = State::ExponentStart;
|
|
}
|
|
else if (isSeparator(nextChar))
|
|
{
|
|
newToken = TokenNumber;
|
|
reprocess = true;
|
|
}
|
|
else
|
|
{
|
|
GetLogger()->error("Bad character in number\n");
|
|
newToken = TokenError;
|
|
}
|
|
break;
|
|
|
|
case State::Fraction:
|
|
if (isEof)
|
|
{
|
|
newToken = TokenNumber;
|
|
}
|
|
else if (std::isdigit(nextChar))
|
|
{
|
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
|
}
|
|
else if (nextChar == 'e' || nextChar == 'E')
|
|
{
|
|
if (!tryPushBack(textToken, 'e')) { newToken = TokenError; }
|
|
state = State::ExponentStart;
|
|
}
|
|
else if (isSeparator(nextChar))
|
|
{
|
|
newToken = TokenNumber;
|
|
reprocess = true;
|
|
}
|
|
else
|
|
{
|
|
GetLogger()->error("Bad character in number\n");
|
|
newToken = TokenError;
|
|
}
|
|
break;
|
|
|
|
case State::ExponentStart:
|
|
if (isEof)
|
|
{
|
|
GetLogger()->error("Unexpected EOF in number\n");
|
|
newToken = TokenError;
|
|
}
|
|
else if (std::isdigit(nextChar))
|
|
{
|
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
|
state = State::Exponent;
|
|
}
|
|
else if (nextChar == '+' || nextChar == '-')
|
|
{
|
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
|
state = State::ExponentSigned;
|
|
}
|
|
else
|
|
{
|
|
GetLogger()->error("Bad character in number\n");
|
|
newToken = TokenError;
|
|
}
|
|
break;
|
|
|
|
case State::ExponentSigned:
|
|
if (isEof)
|
|
{
|
|
GetLogger()->error("Unexpected EOF in number\n");
|
|
newToken = TokenError;
|
|
}
|
|
else if (std::isdigit(nextChar))
|
|
{
|
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
|
state = State::Exponent;
|
|
}
|
|
else
|
|
{
|
|
GetLogger()->error("Bad character in number\n");
|
|
newToken = TokenError;
|
|
}
|
|
break;
|
|
|
|
case State::Exponent:
|
|
if (isEof)
|
|
{
|
|
newToken = TokenNumber;
|
|
}
|
|
else if (std::isdigit(nextChar))
|
|
{
|
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
|
}
|
|
else if (isSeparator(nextChar))
|
|
{
|
|
newToken = TokenNumber;
|
|
reprocess = true;
|
|
}
|
|
else
|
|
{
|
|
GetLogger()->error("Bad character in number\n");
|
|
newToken = TokenError;
|
|
}
|
|
break;
|
|
|
|
case State::Name:
|
|
if (isEof)
|
|
{
|
|
newToken = TokenName;
|
|
}
|
|
else if (std::isalpha(nextChar) || std::isdigit(nextChar) || nextChar == '_')
|
|
{
|
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
|
}
|
|
else
|
|
{
|
|
newToken = TokenName;
|
|
reprocess = true;
|
|
}
|
|
break;
|
|
|
|
case State::String:
|
|
if (isEof)
|
|
{
|
|
GetLogger()->error("Unexpected EOF in string\n");
|
|
newToken = TokenError;
|
|
}
|
|
else if (utf8Status != UTF8Status::Ok)
|
|
{
|
|
if (!handleUtf8Error(textToken, utf8Status)) { newToken = TokenError; }
|
|
}
|
|
else if (nextChar == '\\')
|
|
{
|
|
state = State::StringEscape;
|
|
}
|
|
else if (nextChar == '"')
|
|
{
|
|
newToken = TokenString;
|
|
}
|
|
else
|
|
{
|
|
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
|
|
}
|
|
break;
|
|
|
|
case State::StringEscape:
|
|
if (isEof)
|
|
{
|
|
GetLogger()->error("Unexpected EOF in string\n");
|
|
newToken = TokenError;
|
|
}
|
|
else if (nextChar == '\\')
|
|
{
|
|
if (!tryPushBack(textToken, '\\')) { newToken = TokenError; }
|
|
state = State::String;
|
|
}
|
|
else if (nextChar == 'n')
|
|
{
|
|
if (!tryPushBack(textToken, '\n')) { newToken = TokenError; }
|
|
state = State::String;
|
|
}
|
|
else if (nextChar == '"')
|
|
{
|
|
if (!tryPushBack(textToken, '"')) { newToken = TokenError; }
|
|
state = State::String;
|
|
}
|
|
else if (nextChar == 'u')
|
|
{
|
|
state = State::UnicodeEscape;
|
|
unicodeDigits = 0;
|
|
}
|
|
else
|
|
{
|
|
GetLogger()->error("Invalid string escape sequence\n");
|
|
newToken = TokenError;
|
|
}
|
|
break;
|
|
|
|
case State::UnicodeEscape:
|
|
if (isEof)
|
|
{
|
|
GetLogger()->error("Unexpected EOF in string\n");
|
|
newToken = TokenError;
|
|
}
|
|
else if (std::isxdigit(nextChar))
|
|
{
|
|
unicode[unicodeDigits++] = nextChar;
|
|
if (unicodeDigits == 4)
|
|
{
|
|
auto unicodeValue = static_cast<std::uint32_t>(std::strtoul(unicode, nullptr, 16));
|
|
if (textToken.size() <= maxTokenLength - UTF8EncodedSizeChecked(unicodeValue))
|
|
{
|
|
UTF8Encode(unicodeValue, textToken);
|
|
state = State::String;
|
|
}
|
|
else
|
|
{
|
|
GetLogger()->error("Token too long\n");
|
|
newToken = TokenError;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
GetLogger()->error("Bad character in Unicode escape\n");
|
|
newToken = TokenError;
|
|
}
|
|
break;
|
|
|
|
case State::Comment:
|
|
if (isEof)
|
|
{
|
|
newToken = TokenEnd;
|
|
}
|
|
else if (nextChar == '\n' || nextChar == '\r')
|
|
{
|
|
state = State::Start;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (newToken == TokenNumber)
|
|
{
|
|
const char* start = textToken.c_str();
|
|
char* end;
|
|
errno = 0;
|
|
double value = std::strtod(start, &end);
|
|
if (end == start)
|
|
{
|
|
GetLogger()->error("Could not parse number\n");
|
|
newToken = TokenError;
|
|
}
|
|
else if (errno == ERANGE)
|
|
{
|
|
GetLogger()->error("Number out of range\n");
|
|
newToken = TokenError;
|
|
errno = 0;
|
|
}
|
|
else
|
|
{
|
|
tokenValue = value;
|
|
}
|
|
}
|
|
|
|
tokenType = newToken;
|
|
return tokenType;
|
|
}
|
|
|
|
|
|
Tokenizer::TokenType Tokenizer::getTokenType() const
|
|
{
|
|
return tokenType;
|
|
}
|
|
|
|
|
|
void Tokenizer::pushBack()
|
|
{
|
|
isPushedBack = true;
|
|
}
|
|
|
|
|
|
double Tokenizer::getNumberValue() const
|
|
{
|
|
return tokenType == TokenNumber ? tokenValue : std::nan("");
|
|
}
|
|
|
|
|
|
bool Tokenizer::isInteger() const
|
|
{
|
|
return tokenType == TokenNumber
|
|
&& textToken.find_first_of(".eE") == std::string::npos
|
|
&& tokenValue >= INT32_MIN && tokenValue <= INT32_MAX;
|
|
}
|
|
|
|
|
|
std::int32_t Tokenizer::getIntegerValue() const
|
|
{
|
|
return static_cast<std::int32_t>(tokenValue);
|
|
}
|
|
|
|
|
|
std::string Tokenizer::getNameValue() const
|
|
{
|
|
return textToken;
|
|
}
|
|
|
|
|
|
std::string Tokenizer::getStringValue() const
|
|
{
|
|
return textToken;
|
|
}
|
|
|
|
|
|
int Tokenizer::getLineNumber() const
|
|
{
|
|
return lineNumber;
|
|
}
|
|
|
|
|
|
bool Tokenizer::skipUtf8Bom()
|
|
{
|
|
for (int i = 0; i < 3; ++i)
|
|
{
|
|
in->get(nextChar);
|
|
if (in->eof())
|
|
{
|
|
if (i == 0)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
GetLogger()->error("Incomplete UTF-8 sequence\n");
|
|
return false;
|
|
}
|
|
else if (in->fail())
|
|
{
|
|
GetLogger()->error("Unexpected error reading stream\n");
|
|
return false;
|
|
}
|
|
else if (i == 0)
|
|
{
|
|
if (nextChar != '\357')
|
|
{
|
|
lineNumber += nextChar == '\n' ? 1 : 0;
|
|
reprocess = true;
|
|
return true;
|
|
}
|
|
}
|
|
else if ((i == 1 && nextChar != '\273') || (i == 2 && nextChar != '\277'))
|
|
{
|
|
GetLogger()->error("Bad character in stream\n");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|