Minor tokenizer fixes

- Functions in cctype should be called with unsigned char.
- Use charconv to convert numeric tokens.
pull/1315/head
Andrew Tribick 2022-01-08 12:44:01 +01:00 committed by ajtribick
parent e04ed379a5
commit 56751882e6
1 changed files with 38 additions and 26 deletions

View File

@ -9,9 +9,12 @@
// of the License, or (at your option) any later version.
#include <cctype>
#include <cstdlib>
#include <cmath>
#include <cstring>
#include <iostream>
#include <system_error>
#include <celcompat/charconv.h>
#include "logger.h"
#include "utf8.h"
@ -40,9 +43,10 @@ enum class State
};
bool isSeparator(char c)
bool isSeparator(unsigned char u)
{
return !std::isdigit(c) && !std::isalpha(c) && c != '.';
constexpr auto period = static_cast<unsigned char>('.');
return !std::isdigit(u) && !std::isalpha(u) && u != period;
}
@ -123,6 +127,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
while (newToken == TokenBegin)
{
bool isEof = false;
auto uNextChar = static_cast<unsigned char>(nextChar);
if (reprocess)
{
reprocess = false;
@ -143,7 +148,8 @@ Tokenizer::TokenType Tokenizer::nextToken()
}
else
{
utf8Status = validator.check(nextChar);
uNextChar = static_cast<unsigned char>(nextChar);
utf8Status = validator.check(uNextChar);
if (utf8Status != UTF8Status::Ok && !hasUtf8Errors)
{
GetLogger()->error("Invalid UTF-8 sequence detected\n");
@ -163,11 +169,11 @@ Tokenizer::TokenType Tokenizer::nextToken()
{
newToken = TokenEnd;
}
else if (std::isspace(nextChar))
else if (std::isspace(uNextChar))
{
// no-op
}
else if (std::isdigit(nextChar))
else if (std::isdigit(uNextChar))
{
textToken.push_back(nextChar);
state = State::Number;
@ -186,7 +192,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
textToken.append("0.");
state = State::Fraction;
}
else if (std::isalpha(nextChar) || nextChar == '_')
else if (std::isalpha(uNextChar) || nextChar == '_')
{
textToken.push_back(nextChar);
state = State::Name;
@ -244,7 +250,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
GetLogger()->error("Unexpected EOF in number\n");
newToken = TokenError;
}
else if (std::isdigit(nextChar))
else if (std::isdigit(uNextChar))
{
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
state = State::Number;
@ -274,7 +280,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
{
newToken = TokenNumber;
}
else if (std::isdigit(nextChar))
else if (std::isdigit(uNextChar))
{
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
}
@ -288,7 +294,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
if (!tryPushBack(textToken, 'e')) { newToken = TokenError; }
state = State::ExponentStart;
}
else if (isSeparator(nextChar))
else if (isSeparator(uNextChar))
{
newToken = TokenNumber;
reprocess = true;
@ -305,7 +311,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
{
newToken = TokenNumber;
}
else if (std::isdigit(nextChar))
else if (std::isdigit(uNextChar))
{
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
}
@ -314,7 +320,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
if (!tryPushBack(textToken, 'e')) { newToken = TokenError; }
state = State::ExponentStart;
}
else if (isSeparator(nextChar))
else if (isSeparator(uNextChar))
{
newToken = TokenNumber;
reprocess = true;
@ -332,7 +338,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
GetLogger()->error("Unexpected EOF in number\n");
newToken = TokenError;
}
else if (std::isdigit(nextChar))
else if (std::isdigit(uNextChar))
{
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
state = State::Exponent;
@ -355,7 +361,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
GetLogger()->error("Unexpected EOF in number\n");
newToken = TokenError;
}
else if (std::isdigit(nextChar))
else if (std::isdigit(uNextChar))
{
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
state = State::Exponent;
@ -372,11 +378,11 @@ Tokenizer::TokenType Tokenizer::nextToken()
{
newToken = TokenNumber;
}
else if (std::isdigit(nextChar))
else if (std::isdigit(uNextChar))
{
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
}
else if (isSeparator(nextChar))
else if (isSeparator(uNextChar))
{
newToken = TokenNumber;
reprocess = true;
@ -393,7 +399,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
{
newToken = TokenName;
}
else if (std::isalpha(nextChar) || std::isdigit(nextChar) || nextChar == '_')
else if (std::isalpha(uNextChar) || std::isdigit(uNextChar) || nextChar == '_')
{
if (!tryPushBack(textToken, nextChar)) { newToken = TokenError; }
}
@ -467,7 +473,7 @@ Tokenizer::TokenType Tokenizer::nextToken()
GetLogger()->error("Unexpected EOF in string\n");
newToken = TokenError;
}
else if (std::isxdigit(nextChar))
else if (std::isxdigit(uNextChar))
{
unicode[unicodeDigits++] = nextChar;
if (unicodeDigits == 4)
@ -507,24 +513,30 @@ Tokenizer::TokenType Tokenizer::nextToken()
if (newToken == TokenNumber)
{
const char* start = textToken.c_str();
char* end;
errno = 0;
double value = std::strtod(start, &end);
if (end == start)
double value;
auto [p, ec] = celestia::compat::from_chars(textToken.data(), textToken.data() + textToken.size(), value);
if (ec == std::errc())
{
tokenValue = value;
if (p != textToken.data() + textToken.size())
{
GetLogger()->warn("Incomplete parsing of numeric token");
}
}
else if (ec == std::errc::invalid_argument)
{
GetLogger()->error("Could not parse number\n");
newToken = TokenError;
}
else if (errno == ERANGE)
else if (ec == std::errc::result_out_of_range)
{
GetLogger()->error("Number out of range\n");
newToken = TokenError;
errno = 0;
}
else
{
tokenValue = value;
GetLogger()->error("Unexpected error parsing number\n");
newToken = TokenError;
}
}