soft string cleanup with normalization and invisible char detection

pull/9923/head
Thibault Duplessis 2021-10-03 10:10:05 +02:00
parent 872fa736e4
commit ab46589a52
1 changed files with 16 additions and 9 deletions

View File

@ -31,28 +31,25 @@ object String {
/** Collects the distinct chars of `str` that `isGarbageChar` flags.
  *
  * The string is inspected one UTF-16 code unit at a time, so the result can
  * only contain chars from the Basic Multilingual Plane ranges that
  * `isGarbageChar` tests for.
  *
  * @param str the text to inspect
  * @return the set of flagged chars; empty when none are found
  */
def distinctGarbageChars(str: String): Set[Char] =
  // A single native filter replaces the Java IntStream round trip (boxing +
  // asScala) and the predicate that used to be applied twice in a row.
  str.filter(c => isGarbageChar(c.toInt)).toSet
/** Removes from `str` every char flagged by `isGarbageChar`.
  *
  * Thin wrapper over `removeChars` so the public name stays available.
  */
def removeGarbageChars(str: String): String = removeChars(str, isGarbageChar)
/** Drops from `str` every char for which `isRemoveable` returns true.
  *
  * The predicate receives each UTF-16 code unit widened to an `Int`; chars it
  * rejects are kept in their original order.
  *
  * @param str          the text to filter
  * @param isRemoveable decides, per code unit, whether the char is dropped
  * @return `str` without the removeable chars
  */
private def removeChars(str: String, isRemoveable: Int => Boolean): String =
  // Only the supplied predicate decides what goes: a hard-coded garbage-char
  // filter here would make every caller strip garbage chars as well.
  str.filterNot(c => isRemoveable(c.toInt))
def isGarbageChar(c: Int) =
// invisible chars https://www.compart.com/en/unicode/block/U+2000
(c >= '\u2000' && c <= '\u200F') ||
// weird stuff https://www.compart.com/en/unicode/block/U+2000
(c >= '\u2028' && c <= '\u202F') ||
private def isGarbageChar(c: Int) =
isInvisibleChar(c) ||
// bunch of probably useless blocks https://www.compart.com/en/unicode/block/U+2100
// but keep maths operators cause maths are cool https://www.compart.com/en/unicode/block/U+2200
// and chess symbols https://www.compart.com/en/unicode/block/U+2600
@ -68,6 +65,12 @@ object String {
// IPA extensions https://www.compart.com/en/unicode/block/U+0250
(c >= '\u0250' && c <= '\u02af')
/** Tells whether code unit `c` falls in one of the two "invisible" ranges of
  * the General Punctuation block
  * (https://www.compart.com/en/unicode/block/U+2000).
  */
private def isInvisibleChar(c: Int) = {
  // U+2000..U+200F: fixed-width spaces, zero-width space/joiners,
  // left-to-right and right-to-left marks
  val spacesAndMarks = 0x2000 <= c && c <= 0x200f
  // U+2028..U+202F: line/paragraph separators, bidi embedding and override
  // controls, narrow no-break space
  val separatorsAndBidi = 0x2028 <= c && c <= 0x202f
  spacesAndMarks || separatorsAndBidi
}
object normalize {
private val ordinalRegex = "[º°ª]".r
@ -93,7 +96,11 @@ object String {
// One or more consecutive chars of the Unicode general category "Symbol, other".
private val multibyteSymbolsRegex = "\\p{So}+".r

/** Deletes every run of "Symbol, other" chars from `str`; all other text is
  * returned untouched.
  */
def removeMultibyteSymbols(str: String): String =
  multibyteSymbolsRegex.pattern.matcher(str).replaceAll("")
/** Strict clean-up for publicly listed text like team names, study names,
  * forum topics...
  *
  * Trims `str`, runs it through `normalize`, drops every char flagged by
  * `isGarbageChar`, then strips the symbols matched by
  * `removeMultibyteSymbols`.
  */
def fullCleanUp(str: String): String =
  removeMultibyteSymbols(removeChars(normalize(str.trim), isGarbageChar))
/** Lenient clean-up for inner text like study move annotations, possibly
  * forum posts and team descriptions.
  *
  * Trims `str`, runs it through `normalize`, and drops only the chars flagged
  * by `isInvisibleChar`.
  */
def softCleanUp(str: String): String =
  removeChars(normalize(str.trim), isInvisibleChar)
def decodeUriPath(input: String): Option[String] = {
try {