soft string cleanup with normalization and invisible char detection
parent
872fa736e4
commit
ab46589a52
|
@ -31,28 +31,25 @@ object String {
|
|||
/** Collects the distinct garbage characters that occur in `str`.
  *
  * Every UTF-16 code unit of the string is widened to `Int` and tested with
  * `isGarbageChar`; the ones that match are gathered into a set, so each
  * offending character is reported once no matter how often it appears.
  *
  * @param str text to inspect
  * @return the set of matching chars, empty when `str` contains none
  */
def distinctGarbageChars(str: String): Set[Char] =
  // Scala's own iterator over the chars: no IntStream, no boxed Integer, no Java/Scala conversion.
  str.iterator.filter(c => isGarbageChar(c.toInt)).toSet
|
||||
|
||||
def removeGarbageChars(str: String): String =
|
||||
/** Returns `str` without the UTF-16 code units for which `isRemoveable` holds.
  *
  * The predicate receives each `Char` widened to `Int`, so it is applied to
  * individual code units (the two halves of a surrogate pair are tested separately).
  *
  * @param str          text to filter
  * @param isRemoveable returns true for the code units that must be dropped
  * @return `str` with the matching code units removed, remaining order preserved
  */
private def removeChars(str: String, isRemoveable: Int => Boolean): String =
  // Plain string filtering replaces the IntStream -> boxed Integer -> Scala iterator round trip.
  str.filterNot(c => isRemoveable(c.toInt))
|
||||
|
||||
def isGarbageChar(c: Int) =
|
||||
// invisible chars https://www.compart.com/en/unicode/block/U+2000
|
||||
(c >= '\u2000' && c <= '\u200F') ||
|
||||
// weird stuff https://www.compart.com/en/unicode/block/U+2000
|
||||
(c >= '\u2028' && c <= '\u202F') ||
|
||||
private def isGarbageChar(c: Int) =
|
||||
isInvisibleChar(c) ||
|
||||
// bunch of probably useless blocks https://www.compart.com/en/unicode/block/U+2100
|
||||
// but keep maths operators cause maths are cool https://www.compart.com/en/unicode/block/U+2200
|
||||
// and chess symbols https://www.compart.com/en/unicode/block/U+2600
|
||||
|
@ -68,6 +65,12 @@ object String {
|
|||
// IPA extensions https://www.compart.com/en/unicode/block/U+0250
|
||||
(c >= '\u0250' && c <= '\u02af')
|
||||
|
||||
/** Tells whether code unit `c` falls in one of the two General Punctuation
  * ranges treated as invisible.
  * See https://www.compart.com/en/unicode/block/U+2000
  */
private def isInvisibleChar(c: Int) = {
  // U+2000..U+200F: fixed-width spaces, zero-width space/joiners, LTR/RTL marks
  val spaceOrFormatMark = c >= 0x2000 && c <= 0x200f
  // U+2028..U+202F: line/paragraph separators, bidi embedding controls, narrow no-break space
  val separatorOrBidiControl = c >= 0x2028 && c <= 0x202f
  spaceOrFormatMark || separatorOrBidiControl
}
|
||||
|
||||
object normalize {
|
||||
|
||||
private val ordinalRegex = "[º°ª]".r
|
||||
|
@ -93,7 +96,11 @@ object String {
|
|||
// One or more consecutive characters of the Unicode general category So ("Symbol, other").
private val multibyteSymbolsRegex = "\\p{So}+".r

/** Deletes every run of `So`-category characters from `str`.
  *
  * @param str text to strip
  * @return `str` with all matches removed; text outside the matches is left as is
  */
def removeMultibyteSymbols(str: String): String =
  multibyteSymbolsRegex.pattern.matcher(str).replaceAll("")
|
||||
|
||||
def fullCleanUp(str: String) = removeMultibyteSymbols(removeGarbageChars(normalize(str.trim)))
|
||||
// for publicly listed text like team names, study names, forum topics...
// trims and normalizes the input, drops every char matched by isGarbageChar,
// then strips the symbol runs handled by removeMultibyteSymbols
def fullCleanUp(str: String): String =
  removeMultibyteSymbols(removeChars(normalize(str.trim), isGarbageChar))
|
||||
|
||||
// for inner text like study move annotations, possibly forum posts and team descriptions
// trims and normalizes the input, then drops only the chars matched by isInvisibleChar
def softCleanUp(str: String): String =
  removeChars(normalize(str.trim), isInvisibleChar)
|
||||
|
||||
def decodeUriPath(input: String): Option[String] = {
|
||||
try {
|
||||
|
|
Loading…
Reference in New Issue