Split greek-letters handling from utf8 and refactor a bit

pull/1315/head
Hleb Valoshka 2021-12-24 18:23:08 +02:00
parent eaae852f8f
commit c251dac856
23 changed files with 427 additions and 648 deletions

View File

@ -8,6 +8,7 @@
// of the License, or (at your option) any later version.
#include <celutil/gettext.h>
#include <celutil/greek.h>
#include <celutil/logger.h>
#include <celutil/tokenizer.h>
#include "stardb.h"

View File

@ -179,7 +179,7 @@ void Console::print(char* s)
while (i < length && validChar)
{
wchar_t ch = 0;
validChar = UTF8Decode(s, i, length, ch);
validChar = UTF8Decode(string_view(s, length), i, ch);
i += UTF8EncodedSize(ch);
print(ch);
}

View File

@ -1,5 +1,6 @@
#include <celutil/logger.h>
#include <celutil/gettext.h>
#include <celutil/greek.h>
#include "name.h"
uint32_t NameDatabase::getNameCount() const
@ -97,41 +98,25 @@ NameDatabase::NumberIndex::const_iterator NameDatabase::getFinalNameIter() const
return numberIndex.end();
}
std::vector<std::string> NameDatabase::getCompletion(const std::string& name, bool i18n, bool greek) const
std::vector<std::string> NameDatabase::getCompletion(const std::string& name, bool i18n) const
{
if (greek)
{
auto compList = getGreekCompletion(name);
compList.push_back(name);
return getCompletion(compList, i18n);
}
std::string name2 = ReplaceGreekLetter(name);
std::vector<std::string> completion;
int name_length = UTF8Length(name);
const int name_length = UTF8Length(name2);
for (NameIndex::const_iterator iter = nameIndex.begin(); iter != nameIndex.end(); ++iter)
for (const auto &[n, _] : nameIndex)
{
if (!UTF8StringCompare(iter->first, name, name_length, true))
completion.push_back(iter->first);
if (!UTF8StringCompare(n, name2, name_length, true))
completion.push_back(n);
}
if (i18n)
{
for (NameIndex::const_iterator iter = localizedNameIndex.begin(); iter != localizedNameIndex.end(); ++iter)
for (const auto &[n, _] : localizedNameIndex)
{
if (!UTF8StringCompare(iter->first, name, name_length, true))
completion.push_back(iter->first);
if (!UTF8StringCompare(n, name2, name_length, true))
completion.push_back(n);
}
}
return completion;
}
std::vector<std::string> NameDatabase::getCompletion(const std::vector<std::string> &list, bool i18n) const
{
std::vector<std::string> completion;
for (const auto &n : list)
{
for (const auto &nn : getCompletion(n, i18n, false))
completion.emplace_back(nn);
}
return completion;
}

View File

@ -45,8 +45,7 @@ class NameDatabase
NumberIndex::const_iterator getFirstNameIter(const AstroCatalog::IndexNumber catalogNumber) const;
NumberIndex::const_iterator getFinalNameIter() const;
std::vector<std::string> getCompletion(const std::string& name, bool i18n, bool greek = true) const;
std::vector<std::string> getCompletion(const std::vector<std::string> &list, bool i18n) const;
std::vector<std::string> getCompletion(const std::string& name, bool i18n) const;
protected:
NameIndex nameIndex;

View File

@ -159,7 +159,7 @@ void Overlay::print_impl(const std::string& s)
while (i < length && validChar)
{
wchar_t ch = 0;
validChar = UTF8Decode(s.c_str(), i, length, ch);
validChar = UTF8Decode(s, i, ch);
i += UTF8EncodedSize(ch);
print(ch);
}

View File

@ -10,8 +10,10 @@
//
//
#include <fmt/format.h>
#include <celengine/constellation.h>
#include <celengine/starname.h>
#include <celutil/greek.h>
using namespace std;
@ -61,26 +63,26 @@ uint32_t StarNameDatabase::findCatalogNumberByName(const string& name, bool i18n
// We have a valid constellation as the last part
// of the name. Next, we see if the first part of
// the name is a greek letter.
const string& letter = Greek::canonicalAbbreviation(string(prefix, 0, len));
std::string_view letter = GetCanonicalGreekAbbreviation(std::string_view(prefix).substr(0, len));
if (!letter.empty())
{
// Matched . . . this is a Bayer designation
if (digit == ' ')
{
priName = letter + ' ' + con->getAbbreviation();
priName = fmt::format("{} {}", letter, con->getAbbreviation());
// If 'let con' doesn't match, try using
// 'let1 con' instead.
altName = letter + '1' + ' ' + con->getAbbreviation();
altName = fmt::format("{}1 {}", letter, con->getAbbreviation());
}
else
{
priName = letter + digit + ' ' + con->getAbbreviation();
priName = fmt::format("{}{} {}", letter, digit, con->getAbbreviation());
}
}
else
{
// Something other than a Bayer designation
priName = prefix + ' ' + con->getAbbreviation();
priName = fmt::format("{} {}", prefix, con->getAbbreviation());
}
if (isOrbitingStar)

View File

@ -22,7 +22,7 @@
#include <celmath/mathlib.h>
#include <celmath/intersect.h>
#include <celmath/ray.h>
#include <celutil/utf8.h>
#include <celutil/greek.h>
#include <cassert>
static const double ANGULAR_RES = 3.5e-6;

View File

@ -970,7 +970,7 @@ void CelestiaCore::charEntered(const char *c_p, int modifiers)
if (textEnterMode & KbAutoComplete)
{
wchar_t wc = 0; // Null wide character
UTF8Decode(c_p, 0, strlen(c_p), wc);
UTF8Decode(c_p, 0, wc);
#ifdef __APPLE__
if ( wc && (!iscntrl(wc)) )
#else

View File

@ -19,7 +19,7 @@
#include <celengine/starbrowser.h>
#include <celengine/stardb.h>
#include <celengine/univcoord.h>
#include <celutil/utf8.h>
#include <celutil/greek.h>
#include "dialog-star.h"
#include "actions.h"

View File

@ -16,7 +16,7 @@
#include <celengine/simulation.h>
#include <celestia/celestiacore.h>
#include <celestia/helper.h>
#include <celutil/utf8.h>
#include <celutil/greek.h>
#include "menu-context.h"
#include "actions.h"

View File

@ -41,6 +41,7 @@
#include <vector>
#include <string>
#include <celutil/gettext.h>
#include <celutil/greek.h>
#include <celutil/tzutil.h>
#include "qtappwin.h"
#include "qtglwidget.h"

View File

@ -12,6 +12,7 @@
#include <celestia/celestiacore.h>
#include <celutil/gettext.h>
#include <celutil/greek.h>
#include "qtcelestialbrowser.h"
#include "qtcolorswatchwidget.h"
#include "qtinfopanel.h"

View File

@ -12,6 +12,7 @@
#include <celestia/celestiacore.h>
#include <celutil/gettext.h>
#include <celutil/greek.h>
#include "qtdeepskybrowser.h"
#include "qtcolorswatchwidget.h"
#include "qtinfopanel.h"

View File

@ -15,7 +15,7 @@
#include <celengine/astro.h>
#include <celutil/gettext.h>
#include <celutil/logger.h>
#include <celutil/utf8.h>
#include <celutil/greek.h>
#include <celengine/universe.h>
#include <QTextBrowser>
#include <QIODevice>

View File

@ -17,6 +17,7 @@
#include <celengine/axisarrow.h>
#include <celengine/planetgrid.h>
#include <celutil/gettext.h>
#include <celutil/greek.h>
#include <fmt/printf.h>
#include "qtselectionpopup.h"
#include "qtappwin.h"

View File

@ -12,6 +12,7 @@
#include <celestia/celestiacore.h>
#include <celutil/gettext.h>
#include <celutil/greek.h>
#include "qtsolarsystembrowser.h"
#include "qtinfopanel.h"
#include "qtcolorswatchwidget.h"

View File

@ -13,6 +13,8 @@ set(CELUTIL_SOURCES
formatnum.h
fsutils.cpp
fsutils.h
greek.cpp
greek.h
logger.cpp
logger.h
reshandle.h

View File

@ -0,0 +1,270 @@
// greek.cpp
//
// Copyright (C) 2004, Chris Laurel <claurel@shatters.net>
// 2018-present, Celestia Development Team
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
#include "greek.h"
#include "stringutils.h"
#include "utf8.h"
#include <algorithm>
#include <array>
#include <cctype>
using namespace std::string_view_literals;
namespace
{
constexpr int nLetters = 24;
constexpr std::string_view UTF8_SUPERSCRIPT_0 = "\342\201\260"sv;
constexpr std::string_view UTF8_SUPERSCRIPT_1 = "\302\271"sv;
constexpr std::string_view UTF8_SUPERSCRIPT_2 = "\302\262"sv;
constexpr std::string_view UTF8_SUPERSCRIPT_3 = "\302\263"sv;
constexpr std::string_view UTF8_SUPERSCRIPT_4 = "\342\201\264"sv;
constexpr std::string_view UTF8_SUPERSCRIPT_5 = "\342\201\265"sv;
constexpr std::string_view UTF8_SUPERSCRIPT_6 = "\342\201\266"sv;
constexpr std::string_view UTF8_SUPERSCRIPT_7 = "\342\201\267"sv;
constexpr std::string_view UTF8_SUPERSCRIPT_8 = "\342\201\270"sv;
constexpr std::string_view UTF8_SUPERSCRIPT_9 = "\342\201\271"sv;
// clang-format off
// Lookup tables for the 24 Greek letters. The three tables are parallel:
// entry i of each one describes the same letter (Latin name, UTF-8 encoded
// lowercase letter, canonical abbreviation). They are constexpr, like the
// superscript constants above, so they are constant-initialized.
constexpr std::array<std::string_view, nLetters> greekAlphabet =
{
    "Alpha"sv,
    "Beta"sv,
    "Gamma"sv,
    "Delta"sv,
    "Epsilon"sv,
    "Zeta"sv,
    "Eta"sv,
    "Theta"sv,
    "Iota"sv,
    "Kappa"sv,
    "Lambda"sv,
    "Mu"sv,
    "Nu"sv,
    "Xi"sv,
    "Omicron"sv,
    "Pi"sv,
    "Rho"sv,
    "Sigma"sv,
    "Tau"sv,
    "Upsilon"sv,
    "Phi"sv,
    "Chi"sv,
    "Psi"sv,
    "Omega"sv
};

// UTF-8 encodings of the lowercase Greek letters; the trailing comment on
// each line is the matching canonical abbreviation.
constexpr std::array<std::string_view, nLetters> greekAlphabetUTF8 =
{
    "\316\261"sv, // ALF
    "\316\262"sv, // BET
    "\316\263"sv, // GAM
    "\316\264"sv, // DEL
    "\316\265"sv, // EPS
    "\316\266"sv, // ZET
    "\316\267"sv, // ETA
    "\316\270"sv, // TET
    "\316\271"sv, // IOT
    "\316\272"sv, // KAP
    "\316\273"sv, // LAM
    "\316\274"sv, // MU
    "\316\275"sv, // NU
    "\316\276"sv, // XI
    "\316\277"sv, // OMI
    "\317\200"sv, // PI
    "\317\201"sv, // RHO
    "\317\203"sv, // SIG
    "\317\204"sv, // TAU
    "\317\205"sv, // UPS
    "\317\206"sv, // PHI
    "\317\207"sv, // CHI
    "\317\210"sv, // PSI
    "\317\211"sv, // OME
};

// Canonical upper-case abbreviations of the Greek letters.
constexpr std::array<std::string_view, nLetters> canonicalAbbrevs =
{
    "ALF"sv,
    "BET"sv,
    "GAM"sv,
    "DEL"sv,
    "EPS"sv,
    "ZET"sv,
    "ETA"sv,
    "TET"sv,
    "IOT"sv,
    "KAP"sv,
    "LAM"sv,
    "MU"sv,
    "NU"sv,
    "XI"sv,
    "OMI"sv,
    "PI"sv,
    "RHO"sv,
    "SIG"sv,
    "TAU"sv,
    "UPS"sv,
    "PHI"sv,
    "CHI"sv,
    "PSI"sv,
    "OME"sv,
};
// clang-format on
/**
 * Returns the length of the leading word of str: everything before the
 * first space (or the whole string when it contains no space), with any
 * trailing ASCII digits excluded. Returns 0 for an empty string or when
 * the leading word consists of digits only.
 */
std::string_view::size_type
getFirstWordLength(std::string_view str)
{
    auto sp = str.find(' ');
    if (sp == std::string_view::npos)
        sp = str.length();

    // Skip trailing digits. The cast to unsigned char is required: the input
    // may contain UTF-8 bytes >= 0x80, and passing a negative char value to
    // std::isdigit is undefined behaviour.
    while (sp > 0 && std::isdigit(static_cast<unsigned char>(str[sp - 1])) != 0)
        sp--;

    return sp;
}
// Maps an ASCII digit to the UTF-8 encoding of the corresponding superscript
// digit. Any character that is not '0'..'9' yields an empty view.
std::string_view
toSuperscript(char c)
{
    if (c < '0' || c > '9')
        return {};

    // Indexed by digit value; '0'..'9' are guaranteed to be contiguous.
    static constexpr std::string_view superscripts[] =
    {
        UTF8_SUPERSCRIPT_0,
        UTF8_SUPERSCRIPT_1,
        UTF8_SUPERSCRIPT_2,
        UTF8_SUPERSCRIPT_3,
        UTF8_SUPERSCRIPT_4,
        UTF8_SUPERSCRIPT_5,
        UTF8_SUPERSCRIPT_6,
        UTF8_SUPERSCRIPT_7,
        UTF8_SUPERSCRIPT_8,
        UTF8_SUPERSCRIPT_9,
    };
    return superscripts[c - '0'];
}
} // namespace
/**
 * Replaces a Greek letter abbreviation (or Latin letter name) at the start
 * of a string with the UTF-8 encoded Greek letter, and turns the digits
 * that directly follow it into UTF-8 superscripts.
 *
 * The replacement is attempted only when the string starts with an
 * upper-case ASCII letter; in every other case the input is returned
 * unchanged. An empty input yields an empty string.
 */
std::string
ReplaceGreekLetterAbbr(std::string_view str)
{
    if (str.empty())
        return {};

    const auto wordLen = getFirstWordLength(str);
    const bool startsUpper = str[0] >= 'A' && str[0] <= 'Z';
    if (wordLen == 0 || !startsUpper)
        return std::string(str);

    // True when the leading word is exactly `candidate`, ignoring case.
    const auto isLeadingWord = [&](std::string_view candidate)
    {
        return wordLen == candidate.length()
               && UTF8StringCompare(str, candidate, wordLen, true) == 0;
    };

    // Linear search through all letter abbreviations and names
    for (int idx = 0; idx < nLetters; ++idx)
    {
        if (!isLeadingWord(canonicalAbbrevs[idx]) && !isLeadingWord(greekAlphabet[idx]))
            continue;

        std::string result(greekAlphabetUTF8[idx]);
        auto pos = wordLen;
        while (str.length() > pos && std::isdigit(str[pos]))
        {
            result.append(toSuperscript(str[pos]));
            ++pos;
        }
        result.append(str.substr(pos));
        return result;
    }

    return std::string(str);
}
/**
 * Looks up the canonical abbreviation of a Greek letter.
 *
 * The letter may be given as its Latin name or its canonical abbreviation
 * (both compared ignoring case), or as the two-byte UTF-8 encoding of the
 * Greek letter itself. An empty view is returned when nothing matches.
 */
std::string_view
GetCanonicalGreekAbbreviation(std::string_view letter)
{
    for (int idx = 0; idx < nLetters; ++idx)
    {
        const bool isName = compareIgnoringCase(letter, greekAlphabet[idx]) == 0;
        if (isName || compareIgnoringCase(letter, canonicalAbbrevs[idx]) == 0)
            return canonicalAbbrevs[idx];
    }

    // Every entry of greekAlphabetUTF8 is two bytes long, so any other
    // length cannot be a UTF-8 Greek letter.
    if (letter.length() != 2)
        return {};

    const auto hit = std::find(greekAlphabetUTF8.begin(), greekAlphabetUTF8.end(), letter);
    if (hit == greekAlphabetUTF8.end())
        return {};

    return canonicalAbbrevs[hit - greekAlphabetUTF8.begin()];
}
/**
 * Replaces the Greek letter or abbreviation at the beginning
 * of a string by the UTF-8 representation of that letter.
 * Also, replaces digits following Greek letters with UTF-8
 * superscripts.
 *
 * The first word of the string (up to the first space, without trailing
 * digits) may be a canonical abbreviation or a Latin letter name, both
 * matched ignoring case, or a Greek letter that is already UTF-8 encoded.
 * Strings whose first word is none of these are returned unchanged; an
 * empty input yields an empty string.
 */
std::string
ReplaceGreekLetter(std::string_view str)
{
    if (str.empty())
        return {};

    const auto len = getFirstWordLength(str);
    if (len == 0)
        return std::string(str);

    // Compare only the first word against the UTF-8 letters. Comparing the
    // whole string would reject inputs such as "<letter>1 Cen" and leave
    // their digits unconverted.
    const std::string_view word = str.substr(0, len);

    // True when the first word is exactly `candidate`, ignoring case.
    const auto isFirstWord = [&](std::string_view candidate)
    {
        return len == candidate.length()
               && UTF8StringCompare(str, candidate, len, true) == 0;
    };

    // Linear search through all letters
    for (int i = 0; i < nLetters; i++)
    {
        if (word != greekAlphabetUTF8[i]
            && !isFirstWord(canonicalAbbrevs[i])
            && !isFirstWord(greekAlphabet[i]))
        {
            continue;
        }

        std::string ret(greekAlphabetUTF8[i]);
        auto pos = len;
        // The cast keeps std::isdigit well defined for any byte value.
        for (; pos < str.length() && std::isdigit(static_cast<unsigned char>(str[pos])) != 0; pos++)
            ret.append(toSuperscript(str[pos]));
        ret.append(str.substr(pos));
        return ret;
    }

    return std::string(str);
}

View File

@ -0,0 +1,19 @@
// greek.h
//
// Copyright (C) 2004, Chris Laurel <claurel@shatters.net>
// 2018-present, Celestia Development Team
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
#pragma once
#include <string>
#include <string_view>
#include <vector>
// Replaces a Greek letter abbreviation or Latin letter name at the start of
// str with the UTF-8 encoded Greek letter and converts the digits directly
// following it to UTF-8 superscripts. Only attempted when str starts with an
// upper-case ASCII letter; otherwise str is returned unchanged.
std::string ReplaceGreekLetterAbbr(std::string_view str);
// Same replacement as ReplaceGreekLetterAbbr, but without the upper-case
// requirement on the first character.
std::string ReplaceGreekLetter(std::string_view str);
// Returns the canonical abbreviation for a Greek letter given as its Latin
// name, its abbreviation (both compared ignoring case) or the UTF-8 encoded
// letter itself. Returns an empty view when nothing matches.
std::string_view GetCanonicalGreekAbbreviation(std::string_view letter);

View File

@ -1,20 +1,21 @@
// utf8.cpp
//
// Copyright (C) 2004, Chris Laurel <claurel@shatters.net>
// 2018-present, Celestia Development Team
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
#include <cctype>
#include <cstring>
#include <wchar.h>
#include <climits>
#include <fmt/printf.h>
#include "stringutils.h"
#include "utf8.h"
namespace
{
// clang-format off
uint16_t WGL4_Normalization_00[256] = {
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
@ -300,11 +301,45 @@ uint16_t* WGL4NormalizationTables[256] = {
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
};
// clang-format on
// Returns the number of bytes in the UTF-8 sequence introduced by the lead
// byte ch (1 to 6). A value that matches no lead-byte pattern, such as a
// continuation byte, is reported as a single byte.
inline int UTF8EncodedSizeFromFirstByte(unsigned int ch)
{
    if (ch < 0x80)
        return 1;

    struct LeadPattern
    {
        unsigned int mask;
        unsigned int bits;
        int size;
    };

    // Lead-byte bit patterns, from 2-byte up to 6-byte sequences.
    constexpr LeadPattern patterns[] =
    {
        { 0xe0, 0xc0, 2 },
        { 0xf0, 0xe0, 3 },
        { 0xf8, 0xf0, 4 },
        { 0xfc, 0xf8, 5 },
        { 0xfe, 0xfc, 6 },
    };

    for (const auto& pattern : patterns)
    {
        if ((ch & pattern.mask) == pattern.bits)
            return pattern.size;
    }

    return 1;
}
// Maps ch through the WGL4 normalization tables. The upper bits of the code
// point select a 256-entry page and the low 8 bits index into it. Characters
// on pages >= 256, or on pages that have no table, are returned unchanged.
inline wchar_t UTF8Normalize(wchar_t ch)
{
    const auto code = static_cast<unsigned int>(ch);
    const auto page = code >> 8;

    if (page < 256)
    {
        if (uint16_t* table = WGL4NormalizationTables[page]; table != nullptr)
            return static_cast<wchar_t>(table[code & 0xff]);
    }

    return ch;
}
} // namespace
//! Decode the UTF-8 characters in string str beginning at position pos.
//! The decoded character is returned in ch; the return value of the function
//! is true if a valid UTF-8 sequence was successfully decoded.
bool UTF8Decode(const std::string& str, int pos, wchar_t& ch)
bool UTF8Decode(std::string_view str, int pos, wchar_t& ch)
{
auto c0 = (unsigned int) str[pos];
int charlen = UTF8EncodedSizeFromFirstByte(c0);
@ -362,69 +397,6 @@ bool UTF8Decode(const std::string& str, int pos, wchar_t& ch)
}
}
//! Decode the UTF-8 characters in string str beginning at position pos.
//! The decoded character is returned in ch; the return value of the function
//! is true if a valid UTF-8 sequence was successfully decoded.
bool UTF8Decode(const char* str, int pos, int length, wchar_t& ch)
{
auto c0 = (unsigned int) str[pos];
int charlen = UTF8EncodedSizeFromFirstByte(c0);
// Bad UTF-8 character that extends past end of string
if (pos + charlen > length)
return false;
// TODO: Should check that the bytes of characters after the first are all
// of the form 01xxxxxx
// TODO: Need to reject overlong encoding sequences
switch (charlen)
{
case 1:
ch = c0;
return true;
case 2:
ch = ((c0 & 0x1f) << 6) | ((unsigned int) str[pos + 1] & 0x3f);
return true;
case 3:
ch = ((c0 & 0x0f) << 12) |
(((unsigned int) str[pos + 1] & 0x3f) << 6) |
((unsigned int) str[pos + 2] & 0x3f);
return true;
case 4:
ch = ((c0 & 0x07) << 18) |
(((unsigned int) str[pos + 1] & 0x3f) << 12) |
(((unsigned int) str[pos + 2] & 0x3f) << 6) |
((unsigned int) str[pos + 3] & 0x3f);
return true;
case 5:
ch = ((c0 & 0x03) << 24) |
(((unsigned int) str[pos + 1] & 0x3f) << 18) |
(((unsigned int) str[pos + 2] & 0x3f) << 12) |
(((unsigned int) str[pos + 3] & 0x3f) << 6) |
((unsigned int) str[pos + 4] & 0x3f);
return true;
case 6:
ch = ((c0 & 0x01) << 30) |
(((unsigned int) str[pos + 1] & 0x3f) << 24) |
(((unsigned int) str[pos + 2] & 0x3f) << 18) |
(((unsigned int) str[pos + 3] & 0x3f) << 12) |
(((unsigned int) str[pos + 4] & 0x3f) << 6) |
((unsigned int) str[pos + 5] & 0x3f);
return true;
default:
return false;
}
}
//! Appends the UTF-8 encoded version of the code point ch to the
//! destination string
void UTF8Encode(std::uint32_t ch, std::string& dest)
@ -469,9 +441,8 @@ void UTF8Encode(std::uint32_t ch, std::string& dest)
}
}
//! Return the number of characters encoded by a UTF-8 string
int UTF8Length(const std::string& s)
int UTF8Length(std::string_view s)
{
int len = s.length();
int count = 0;
@ -485,25 +456,10 @@ int UTF8Length(const std::string& s)
return count;
}
inline wchar_t UTF8Normalize(wchar_t ch)
{
auto page = (unsigned int) ch >> 8;
if (page >= 256)
return ch;
uint16_t* normTable = WGL4NormalizationTables[page];
if (normTable == nullptr)
return ch;
return (wchar_t) normTable[(unsigned int) ch & 0xff];
}
//! Perform a normalized comparison of two UTF-8 strings. The normalization
//! only works for characters in the WGL-4 subset, and no multicharacter
//! translations are performed.
int UTF8StringCompare(const std::string& s0, const std::string& s1)
int UTF8StringCompare(std::string_view s0, std::string_view s1)
{
int len0 = s0.length();
int len1 = s1.length();
@ -542,7 +498,7 @@ int UTF8StringCompare(const std::string& s0, const std::string& s1)
return 0;
}
int UTF8StringCompare(const std::string& s0, const std::string& s1, size_t n, bool ignoreCase)
int UTF8StringCompare(std::string_view s0, std::string_view s1, size_t n, bool ignoreCase)
{
int len0 = s0.length();
int len1 = s1.length();
@ -588,432 +544,6 @@ int UTF8StringCompare(const std::string& s0, const std::string& s1, size_t n, bo
return 0;
}
#if 0
//! Currently incomplete, but could be a helpful class for dealing with
//! UTF-8 streams
class UTF8StringIterator
{
public:
UTF8StringIterator(const std::string& _str) : str(_str) {};
UTF8StringIterator(const UTF8StringIterator& iter) = default;
UTF8StringIterator& operator++();
UTF8StringIterator& operator++(int);
private:
const std::string& str;
int position{ 0 };
};
UTF8StringIterator& UTF8StringIterator::operator++()
{
return *this;
}
UTF8StringIterator& UTF8StringIterator::operator++(int)
{
return *this;
}
#endif
static const char *greekAlphabet[] =
{
"Alpha",
"Beta",
"Gamma",
"Delta",
"Epsilon",
"Zeta",
"Eta",
"Theta",
"Iota",
"Kappa",
"Lambda",
"Mu",
"Nu",
"Xi",
"Omicron",
"Pi",
"Rho",
"Sigma",
"Tau",
"Upsilon",
"Phi",
"Chi",
"Psi",
"Omega"
};
static const char* greekAlphabetUTF8[] =
{
"\316\261",
"\316\262",
"\316\263",
"\316\264",
"\316\265",
"\316\266",
"\316\267",
"\316\270",
"\316\271",
"\316\272",
"\316\273",
"\316\274",
"\316\275",
"\316\276",
"\316\277",
"\317\200",
"\317\201",
"\317\203",
"\317\204",
"\317\205",
"\317\206",
"\317\207",
"\317\210",
"\317\211",
};
static const char* canonicalAbbrevs[] =
{
"ALF", "BET", "GAM", "DEL", "EPS", "ZET", "ETA", "TET",
"IOT", "KAP", "LAM", "MU" , "NU" , "XI" , "OMI", "PI" ,
"RHO", "SIG", "TAU", "UPS", "PHI", "CHI", "PSI", "OME",
};
static std::string noAbbrev;
// Greek alphabet crud . . . should probably moved to it's own module.
static size_t greekChunkLength(const std::string&);
Greek* Greek::m_instance = nullptr;
Greek* Greek::getInstance()
{
if (m_instance == nullptr)
m_instance = new Greek();
return m_instance;
}
Greek::Greek()
{
nLetters = sizeof(greekAlphabet) / sizeof(greekAlphabet[0]);
names = new std::string[nLetters];
abbrevs = new std::string[nLetters];
for (int i = 0; i < nLetters; i++)
{
names[i] = std::string(greekAlphabet[i]);
abbrevs[i] = std::string(canonicalAbbrevs[i]);
}
}
Greek::~Greek()
{
delete[] names;
delete[] abbrevs;
}
const std::string& Greek::canonicalAbbreviation(const std::string& letter)
{
Greek *instance = Greek::getInstance();
int i;
for (i = 0; i < instance->nLetters; i++)
{
if (compareIgnoringCase(letter, instance->names[i]) == 0)
return instance->abbrevs[i];
}
for (i = 0; i < instance->nLetters; i++)
{
if (compareIgnoringCase(letter, instance->abbrevs[i]) == 0)
return instance->abbrevs[i];
}
if (letter.length() == 2)
{
for (i = 0; i < instance->nLetters; i++)
{
if (letter[0] == greekAlphabetUTF8[i][0] &&
letter[1] == greekAlphabetUTF8[i][1])
{
return instance->abbrevs[i];
}
}
}
return noAbbrev;
}
static const char* toSuperscript(char c)
{
switch (c)
{
case '0':
return UTF8_SUPERSCRIPT_0;
case '1':
return UTF8_SUPERSCRIPT_1;
case '2':
return UTF8_SUPERSCRIPT_2;
case '3':
return UTF8_SUPERSCRIPT_3;
case '4':
return UTF8_SUPERSCRIPT_4;
case '5':
return UTF8_SUPERSCRIPT_5;
case '6':
return UTF8_SUPERSCRIPT_6;
case '7':
return UTF8_SUPERSCRIPT_7;
case '8':
return UTF8_SUPERSCRIPT_8;
case '9':
return UTF8_SUPERSCRIPT_9;
default:
return nullptr;
}
}
//! Replaces the Greek letter abbreviation at the beginning
//! of a string by the UTF-8 representation of that letter.
//! Also, replace digits following Greek letters with UTF-8
//! superscripts.
std::string ReplaceGreekLetterAbbr(const std::string& str)
{
Greek *instance = Greek::getInstance();
size_t len = greekChunkLength(str);
if (str[0] >= 'A' && str[0] <= 'Z')
{
// Linear search through all letter abbreviations
for (int i = 0; i < instance->nLetters; i++)
{
std::string prefix = instance->abbrevs[i];
if (len != prefix.length() || UTF8StringCompare(str, prefix, len, true) != 0)
{
prefix = instance->names[i];
if (len != prefix.length() || UTF8StringCompare(str, prefix, len, true) != 0)
continue;
}
std::string ret = greekAlphabetUTF8[i];
auto len = prefix.length();
for (; str.length() > len && isdigit(str[len]); len++)
ret += toSuperscript(str[len]);
ret += str.substr(len);
return ret;
}
}
return str;
}
//! Replaces the Greek letter abbreviation at the beginning
//! of a string by the UTF-8 representation of that letter.
//! Also, replace digits following Greek letters with UTF-8
//! superscripts. Operates on char* instead of strings--less
//! convenient, but more efficient. Return the number of
//! characters copied to the destination string, not
//! including the zero terminator.
#if 0
unsigned int
ReplaceGreekLetterAbbr(char *dst, unsigned int dstSize, const char* src, unsigned int srcLength)
{
Greek *instance = Greek::getInstance();
if (src[0] >= 'A' && src[0] <= 'Z' &&
src[1] >= 'A' && src[1] <= 'Z')
{
// Linear search through all letter abbreviations
for (unsigned int i = 0; i < (unsigned int) instance->nLetters; i++)
{
const char* abbr = canonicalAbbrevs[i];
unsigned int j = 0;
while (abbr[j] == src[j] && abbr[j] != '\0' && src[j] != '\0')
j++;
// It's a match if we reached the end of the abbreviation string
if (abbr[j] == '\0')
{
unsigned int abbrevLength = j;
unsigned int srcIndex = j;
const char *superscript = toSuperscript(src[abbrevLength]);
const char* utfGreek = greekAlphabetUTF8[i];
unsigned int utfGreekLength = strlen(utfGreek);
unsigned int requiredLength = srcLength;
if (utfGreekLength > abbrevLength)
requiredLength += utfGreekLength - abbrevLength;
if (superscript != nullptr)
{
requiredLength += strlen(superscript) - 1;
srcIndex++;
}
// If there's not enough room, give up translating and just copy as much as possible
if (requiredLength + 1 > dstSize)
break;
unsigned int dstIndex = 0;
j = 0;
while (utfGreek[j] != 0)
{
dst[dstIndex++] = utfGreek[j];
j++;
}
if (superscript != nullptr)
{
j = 0;
while (superscript[j] != 0)
{
dst[dstIndex++] = superscript[j];
j++;
}
}
while (src[srcIndex] != 0)
{
dst[dstIndex++] = src[srcIndex++];
}
dst[dstIndex] = '\0';
return dstIndex;
}
}
}
strncpy(dst, src, dstSize);
if (dstSize > srcLength)
return srcLength;
if (dstSize > 0)
{
dst[dstSize - 1] = '\0';
return dstSize - 1;
}
return 0;
}
#endif
static int findGreekNameIndexBySubstr(const std::string &, int = 0, unsigned int = UINT_MAX);
#if 0
static std::string firstGreekAbbrCompletion(const std::string &);
#endif
bool inline isSubstringIgnoringCase(const std::string &s0, const std::string &s1, size_t n)
{
return UTF8StringCompare(s0, s1, n, true) == 0;
}
static int findGreekNameIndexBySubstr(const std::string &s, int start, unsigned int n)
{
Greek *instance = Greek::getInstance();
if (s.empty())
return -1;
for (int i = start; i < instance->nLetters; i++)
{
if (isSubstringIgnoringCase(instance->names[i], s, n))
return i;
}
for (int i = start; i < instance->nLetters; i++)
{
if (isSubstringIgnoringCase(instance->abbrevs[i], s, n))
return i;
}
return -1;
}
static size_t greekChunkLength(const std::string& str)
{
bool npos = false;
size_t sp = str.find_first_of(' ');
if (sp == std::string::npos)
{
sp = str.length();
npos = true;
}
if (sp != 0 && isdigit(static_cast<unsigned char>(str[sp - 1])))
while(sp != 0 && isdigit(static_cast<unsigned char>(str[sp - 1]))) sp--;
else if (npos)
sp = std::string::npos;
return sp;
}
#if 0
static std::string firstGreekAbbrCompletion(const std::string &s)
{
std::string ret;
size_t sp = greekChunkLength(s);
if (sp == std::string::npos)
{
int i = findGreekNameIndexBySubstr(s);
return (i >= 0) ? Greek::getInstance()->abbrevs[i] : s;
}
else
{
std::string prefix = s.substr(0, sp);
ret = Greek::canonicalAbbreviation(prefix);
return ret.empty() ? s : prefix + s.substr(sp);
}
return ret;
}
#endif
std::vector<std::string> getGreekCompletion(const std::string &s)
{
std::vector<std::string> ret;
if (s.empty())
return ret;
size_t sp = greekChunkLength(s);
if (sp == std::string::npos)
{
sp = UTF8Length(s);
for(int i = 0; i >= 0;)
{
std::string rets;
i = findGreekNameIndexBySubstr(s, i, sp);
if (i >= 0)
{
rets = Greek::getInstance()->abbrevs[i];
rets += " ";
ret.emplace_back(ReplaceGreekLetterAbbr(rets));
i++;
}
}
}
else
{
std::string prefix = s.substr(0, sp);
std::string rets = Greek::canonicalAbbreviation(prefix);
if (!rets.empty())
{
rets += s.substr(sp);
ret.emplace_back(ReplaceGreekLetterAbbr(rets));
}
}
return ret;
}
UTF8Status
UTF8Validator::check(char c)
{
return check(static_cast<unsigned char>(c));
}
UTF8Status
UTF8Validator::check(unsigned char c)
{

View File

@ -1,6 +1,7 @@
// utf8.h
//
// Copyright (C) 2004, Chris Laurel <claurel@shatters.net>
// 2018-present, Celestia Development Team
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
@ -11,42 +12,30 @@
#include <cstdint>
#include <string>
#include <vector>
#include <string_view>
#define UTF8_DEGREE_SIGN "\302\260"
#define UTF8_MULTIPLICATION_SIGN "\303\227"
#define UTF8_SUPERSCRIPT_0 "\342\201\260"
#define UTF8_SUPERSCRIPT_1 "\302\271"
#define UTF8_SUPERSCRIPT_2 "\302\262"
#define UTF8_SUPERSCRIPT_3 "\302\263"
#define UTF8_SUPERSCRIPT_4 "\342\201\264"
#define UTF8_SUPERSCRIPT_5 "\342\201\265"
#define UTF8_SUPERSCRIPT_6 "\342\201\266"
#define UTF8_SUPERSCRIPT_7 "\342\201\267"
#define UTF8_SUPERSCRIPT_8 "\342\201\270"
#define UTF8_SUPERSCRIPT_9 "\342\201\271"
#define UTF8_REPLACEMENT_CHAR "\357\277\275"
bool UTF8Decode(const std::string& str, int pos, wchar_t& ch);
bool UTF8Decode(const char* str, int pos, int length, wchar_t& ch);
void UTF8Encode(std::uint32_t ch, std::string& dest);
int UTF8StringCompare(const std::string& s0, const std::string& s1);
int UTF8StringCompare(const std::string& s0, const std::string& s1, size_t n, bool ignoreCase = false);
bool UTF8Decode(std::string_view str, int pos, wchar_t &ch);
void UTF8Encode(std::uint32_t ch, std::string &dest);
int UTF8StringCompare(std::string_view s0, std::string_view s1);
int UTF8StringCompare(std::string_view s0, std::string_view s1, size_t n, bool ignoreCase = false);
class UTF8StringOrderingPredicate
{
public:
bool operator()(const std::string& s0, const std::string& s1) const
bool operator()(std::string_view s0, std::string_view s1) const
{
return UTF8StringCompare(s0, s1) == -1;
}
};
int UTF8Length(std::string_view s);
int UTF8Length(const std::string& s);
inline int UTF8EncodedSize(wchar_t ch)
constexpr int
UTF8EncodedSize(wchar_t ch)
{
if (ch < 0x80)
return 1;
@ -66,7 +55,8 @@ inline int UTF8EncodedSize(wchar_t ch)
#endif
}
constexpr inline int UTF8EncodedSizeChecked(std::uint32_t ch)
constexpr int
UTF8EncodedSizeChecked(std::uint32_t ch)
{
if (ch < 0x80)
return 1;
@ -84,76 +74,6 @@ constexpr inline int UTF8EncodedSizeChecked(std::uint32_t ch)
#endif
}
inline int UTF8EncodedSizeFromFirstByte(unsigned int ch)
{
if (ch < 0x80)
return 1;
if ((ch & 0xe0) == 0xc0)
return 2;
if ((ch & 0xf0) == 0xe0)
return 3;
if ((ch & 0xf8) == 0xf0)
return 4;
if ((ch & 0xfc) == 0xf8)
return 5;
if ((ch & 0xfe) == 0xfc)
return 6;
else
return 1;
}
std::string ReplaceGreekLetterAbbr(const std::string&);
#if 0
unsigned int ReplaceGreekLetterAbbr(char* dst, unsigned int dstSize, const char* src, unsigned int srcLength);
#endif
class Greek
{
private:
Greek();
~Greek();
public:
enum Letter
{
Alpha = 1,
Beta = 2,
Gamma = 3,
Delta = 4,
Epsilon = 5,
Zeta = 6,
Eta = 7,
Theta = 8,
Iota = 9,
Kappa = 10,
Lambda = 11,
Mu = 12,
Nu = 13,
Xi = 14,
Omicron = 15,
Pi = 16,
Rho = 17,
Sigma = 18,
Tau = 19,
Upsilon = 20,
Phi = 21,
Chi = 22,
Psi = 23,
Omega = 24,
};
static const std::string& canonicalAbbreviation(const std::string&);
private:
static Greek* m_instance;
public:
static Greek* getInstance();
int nLetters;
std::string* names;
std::string* abbrevs;
};
std::vector<std::string> getGreekCompletion(const std::string &);
enum class UTF8Status
{
Ok,
@ -164,9 +84,6 @@ enum class UTF8Status
class UTF8Validator
{
public:
UTF8Validator() = default;
~UTF8Validator() = default;
UTF8Status check(char c);
UTF8Status check(unsigned char c);
@ -185,3 +102,9 @@ private:
State state{ State::Initial };
};
// Convenience overload for plain char input: converts the value to
// unsigned char and forwards it to check(unsigned char).
inline UTF8Status
UTF8Validator::check(char c)
{
    const auto byte = static_cast<unsigned char>(c);
    return check(byte);
}

View File

@ -1,4 +1,5 @@
test_case(charconv_compat)
test_case(greek)
test_case(hash)
test_case(logger)
test_case(stellarclass)

View File

@ -0,0 +1,42 @@
#include <celutil/greek.h>
#include <catch.hpp>
// Unit tests for the Greek letter helpers declared in celutil/greek.h.
TEST_CASE("Greek", "[Greek]")
{
    // Replacement is attempted only for strings starting with an upper-case
    // ASCII letter.
    SECTION("ReplaceGreekLetterAbbr")
    {
        REQUIRE(ReplaceGreekLetterAbbr("XI") == "\316\276");
        REQUIRE(ReplaceGreekLetterAbbr("XI12") == "\316\276\302\271\302\262");
        REQUIRE(ReplaceGreekLetterAbbr("XI Foo") == "\316\276 Foo");
        REQUIRE(ReplaceGreekLetterAbbr("XI12 Bar") == "\316\276\302\271\302\262 Bar");
        REQUIRE(ReplaceGreekLetterAbbr("xi") == "xi");
        REQUIRE(ReplaceGreekLetterAbbr("xi12") == "xi12");
        REQUIRE(ReplaceGreekLetterAbbr("xi Foo") == "xi Foo");
        REQUIRE(ReplaceGreekLetterAbbr("xi12 Bar") == "xi12 Bar");
        REQUIRE(ReplaceGreekLetterAbbr("alpha") == "alpha");

        // Empty input, canonical abbreviation, Latin name and a word that
        // is not a Greek letter at all.
        REQUIRE(ReplaceGreekLetterAbbr("") == "");
        REQUIRE(ReplaceGreekLetterAbbr("ALF Cen") == "\316\261 Cen");
        REQUIRE(ReplaceGreekLetterAbbr("Alpha Cen") == "\316\261 Cen");
        REQUIRE(ReplaceGreekLetterAbbr("Foo Bar") == "Foo Bar");
    }

    // Same replacement, but the first word is matched regardless of case.
    SECTION("ReplaceGreekLetter")
    {
        REQUIRE(ReplaceGreekLetter("XI") == "\316\276");
        REQUIRE(ReplaceGreekLetter("XI12") == "\316\276\302\271\302\262");
        REQUIRE(ReplaceGreekLetter("XI Foo") == "\316\276 Foo");
        REQUIRE(ReplaceGreekLetter("XI12 Bar") == "\316\276\302\271\302\262 Bar");
        REQUIRE(ReplaceGreekLetter("xi") == "\316\276");
        REQUIRE(ReplaceGreekLetter("xi12") == "\316\276\302\271\302\262");
        REQUIRE(ReplaceGreekLetter("xi Foo") == "\316\276 Foo");
        REQUIRE(ReplaceGreekLetter("xi12 Bar") == "\316\276\302\271\302\262 Bar");
        REQUIRE(ReplaceGreekLetter("alpha") == "\316\261");

        // Empty input and a word that is not a Greek letter.
        REQUIRE(ReplaceGreekLetter("") == "");
        REQUIRE(ReplaceGreekLetter("Foo Bar") == "Foo Bar");
    }

    SECTION("GetCanonicalGreekAbbreviation")
    {
        REQUIRE(GetCanonicalGreekAbbreviation("xi") == "XI");
        REQUIRE(GetCanonicalGreekAbbreviation("alpha") == "ALF");

        // Canonical abbreviation, UTF-8 encoded letter and unknown input.
        REQUIRE(GetCanonicalGreekAbbreviation("ALF") == "ALF");
        REQUIRE(GetCanonicalGreekAbbreviation("\316\276") == "XI");
        REQUIRE(GetCanonicalGreekAbbreviation("foo").empty());
        REQUIRE(GetCanonicalGreekAbbreviation("").empty());
    }
}