improve shutup analyser

This commit is contained in:
Thibault Duplessis 2015-04-26 14:45:10 +02:00
parent 9861d909f6
commit 2073889849
3 changed files with 136 additions and 234 deletions

View file

@ -7,41 +7,9 @@ object Analyser {
bigRegex.findAllMatchIn(text).map(_.toString).toList
)
// based on https://github.com/snipe/banbuilder/blob/master/src/CensorWords.php#L97
/** Look-alike ("leet") spellings for each lowercase ASCII letter.
  *
  * Every value is a parenthesised regex alternation that matches the plain
  * letter, the letter followed by `.` or `-`, and a set of digits, symbols
  * and accented/Greek characters that resemble it.
  *
  * The `'l'` entry accepts `l\.` like every other letter does; the `1\.`
  * alternative from the upstream list is kept so nothing that matched
  * before stops matching.
  */
private val leetReplace: Map[Char, String] = Map(
  'a' -> """(a|a\.|a\-|4|@|Á|á|À|Â|à|Â|â|Ä|ä|Ã|ã|Å|å|α|Δ|Λ|λ)""",
  'b' -> """(b|b\.|b\-|8|\|3|ß|Β|β)""",
  'c' -> """(c|c\.|c\-|Ç|ç|¢|€|<|\(|\{|©)""",
  'd' -> """(d|d\.|d\-|&part;|\|\)|Þ|þ|Ð|ð)""",
  'e' -> """(e|e\.|e\-|3|€|È|è|É|é|Ê|ê|∑)""",
  'f' -> """(f|f\.|f\-|ƒ)""",
  'g' -> """(g|g\.|g\-|6|9)""",
  'h' -> """(h|h\.|h\-|Η)""",
  'i' -> """(i|i\.|i\-|!|\||\]\[|]|1|∫|Ì|Í|Î|Ï|ì|í|î|ï)""",
  'j' -> """(j|j\.|j\-)""",
  'k' -> """(k|k\.|k\-|Κ|κ)""",
  // `l\.` added: the upstream list only had `1\.` (digit one) here.
  'l' -> """(l|l\.|1\.|l\-|!|\||\]\[|]|£|∫|Ì|Í|Î|Ï)""",
  'm' -> """(m|m\.|m\-)""",
  'n' -> """(n|n\.|n\-|η|Ν|Π)""",
  'o' -> """(o|o\.|o\-|0|Ο|ο|Φ|¤|°|ø)""",
  'p' -> """(p|p\.|p\-|ρ|Ρ|¶|þ)""",
  'q' -> """(q|q\.|q\-)""",
  'r' -> """(r|r\.|r\-|®)""",
  's' -> """(s|s\.|s\-|5|\$|§)""",
  't' -> """(t|t\.|t\-|Τ|τ)""",
  'u' -> """(u|u\.|u\-|υ|µ)""",
  'v' -> """(v|v\.|v\-|υ|ν)""",
  'w' -> """(w|w\.|w\-|ω|ψ|Ψ)""",
  'x' -> """(x|x\.|x\-|Χ|χ)""",
  'y' -> """(y|y\.|y\-|¥|γ|ÿ|ý|Ÿ|Ý)""",
  'z' -> """(z|z\.|z\-|Ζ)""")
/** Regex fragment for every dictionary entry, with an optional plural.
  *
  * Entries of `Dictionary.en` are used as they are. An entry that does not
  * already end in "s" gets `s?` appended so that its plural is matched too.
  *
  * The per-character `leetReplace` expansion that used to be built here was
  * computed and then discarded (only the last expression of the lambda is
  * its value), so it has been removed; the returned list is unchanged.
  */
private def wordsRegexes: List[String] = Dictionary.en.map { word =>
  if (word.endsWith("s")) word
  else word + "s?"
}
private val bigRegex = {

View file

@ -7,235 +7,154 @@ package lila.shutup
*/
object Dictionary {
val en: List[String] = (enBase ++ enUk ++ enUs).distinct
private def enBase = dict("""
anal
def en: List[String] = dict("""
(c|k)oc?k
(c|k)oc?ksuc?ker
(c|k)um(shot|)
(c|k)unt(ing|)
(f+|ph)(u{1,}|a{1,}|e{1,})c?k(er|r|u|k|ed|d|t|ing|tar|tard|face|)
abortion
anal(plug|sex|)
anus
arse(hole|)
ass
asshole
bastard
bitch
boob
cock
coward
cum
cunnilingu
cunt
cunting
dick
dildo
dyke
fack
fag
faggot
fuck
fucking
fuckstick
fucktard
fuk
handjob
homo
incest
jerk
jizz
kike
kunt
muff
niger
nigger
pederast
penis
piss
poop
pussy
queer
rape
retard
retarded
scum
scumbag
semen
sex
shit
shitbag
shite
shitty
shity
slut
titties
twat
vagin
vagina
vulva
wank
""").pp
private def enUk = dict("""
analplug
analsex
arse
arsehole
balls
ass?hole
ball
bastard?
bewb
bimbo
bitche?
blow
blowjob
bollocks
blumpkin
bollock
boner
boobies
boobs
boob
bugger
bukkake
bullshit
buk?kake
bull?shit
cawk
chink
choad
clit
clitoris
cocksucker
condom
coon
crap
cumshot
damm
dammit
damn
dickhead
doggystyle
f0ck
fags
fanny
fck
fcker
fckr
fcku
fcuk
fucked
fucker
fuckface
fuckr
fuct
genital
genitalia
genitals
glory hole
gloryhole
gobshite
godammet
godammit
goddammet
goddammit
goddamn
gypo
hitler
hooker
hore
horny
jesussucks
jizzum
kaffir
kill
killer
killin
killing
lesbo
masturbate
milf
molest
moron
motherfuck
mthrfckr
murder
murderer
nazi
negro
niga
nigah
nigga
niggah
nonce
paedo
paedophile
paki
pecker
pedo
pedofile
pedophile
phuk
pig
pimp
poof
porn
prick
pron
prostitute
raped
rapes
rapist
schlong
screw
scrotum
shag
shemale
shite
shiz
slag
spastic
spaz
sperm
spunk
stripper
stupid
tart
terrorist
tit
tittyfuck
tosser
turd
vaginal
vibrator
wanker
weed
wetback
whor
whore
wog
wtf
xxx
""")
private def enUs = dict("""
abortion
anus
bewb
blow
blumpkin
cawk
choad
cooter
cornhole
coward?
crap
cunn?ilingu
dic?k
dic?khead
dildo
dogg?ystyle
dong
douche
douche(bag|)
dyke
fag
fagg?ot
fanny
fart
foreskin
gangbang
genital
genitalia
gobshite?
gook
gypo
handjob
hell
hitler
homm?o
honkey
hooker
hore
horny
humping
jiz
incest
jerk
jizz?(um|)
kaffir
kike
labia
nutsack
pen1s
lesbo
masturbat(e|ion)
milf
molest
moron
mother
motherfuc?k
mthrfckr
muff
nazi
negro
nigg?(er|a|ah)
nonce
nutsac?k
pa?edo
pa?edo(f|ph)ile
paki
pecker
pederast
pen(1|i)s
pig
pimp
piss
poof
poon
poop
porn
pric?k
pron
prostitute
punani
puss(i|y|ie)
queef
queer
quim
rectal
rectum
raped?
rapist
rect(al|um)
retard(ed|)
rimjob
schlong
screw(d|ed|)
scrotum
scum(bag|)
semen
sex
shag
shemale
shit(z|e|y|ty|bag|)
sister
slag
slut
spastic
spaz
sperm
spick
spoo
spooge
spunk
stripper
stupid
suc?k
taint
titty
tart
terrorist
tit(s|ies|ties|ty)(fuc?k)
tosser
turd
twat
vag
whore
vagin(a|al|)
vibrator
vulva
wanc?k(er|)
wetback
whore?
wog
""")
/** Splits a multi-line dictionary literal into its entries, one per line.
  *
  * Empty lines (such as the ones right after the opening and right before
  * the closing triple quote) are dropped; every other line is kept verbatim.
  *
  * `linesIterator` is used instead of `lines` because on JDK 11+ `lines`
  * resolves to `java.lang.String#lines`, which returns a Java `Stream`
  * rather than a Scala iterator.
  *
  * @param words newline-separated entries
  * @return the non-empty lines, in their original order
  */
private def dict(words: String): List[String] =
  words.linesIterator.filter(_.nonEmpty).toList

View file

@ -15,8 +15,8 @@ class DetectTest extends Specification {
}
// Every offending word in a sentence is expected back, in order of appearance;
// the surrounding ordinary words are not part of the result.
"find many bad words" in {
find("fuck that shit") must_== List("fuck", "shit")
find("Beat them cunting nigger faggots with a bitchin' fuckstick") must_==
List("cunting", "nigger", "faggots", "fuckstick")
find("Beat them cunting nigger faggots with a communist dick") must_==
List("cunting", "nigger", "faggots", "dick")
}
"find no bad words" in {
find("") must_== Nil
@ -24,9 +24,24 @@ class DetectTest extends Specification {
find("A sonnet is a poetic form which originated in Italy; Giacomo Da Lentini is credited with its invention.") must_== Nil
find("computer analysis") must_== Nil
}
// Misspellings (a dropped "c", a single "s", a shortened "-ed" ending) must
// still be reported, whether the word stands alone or sits inside a sentence.
"find badly spelled words" in {
find("fuk") must_== List("fuk")
find("well fuk me") must_== List("fuk")
find("foo ashole bar fukd") must_== List("ashole", "fukd")
}
// Alternate spellings and "-ing" forms are reported, while the harmless
// words in the same input ("kawa", "as") are expected to be left out.
"find variants" in {
find("cunt kunt cunting kawa kunting") must_== List("cunt", "kunt", "cunting", "kunting")
find("ass as ashole") must_== List("ass", "ashole")
}
// A trailing "s" is expected to be part of the reported match; "kawas"
// is not expected in the result.
"find plurals" in {
find("cunts kunts cuntings kawas kuntings") must_== List("cunts", "kunts", "cuntings", "kuntings")
}
// Repeated letters, swapped vowels, suffixes and the "ph" spelling are all
// expected to be reported; "feak" is in the input but not in the expected list.
"fucks" in {
find("fuck fffuuk fektard feak fak phuk") must_== List("fuck", "fffuuk", "fektard", "fak", "phuk")
}
// The expected values are consistent with (offending words) / (total words),
// e.g. 2 of 3 and 4 of 9 — NOTE(review): confirm against the `ratio` implementation.
// Text without offending words, including the empty string, is expected to give 0.
"compute ratio" in {
ratio("fuck that shit") must_== 2d/3
ratio("Beat them cunting nigger faggots with a bitchin' fuckstick") must_== 4d/9
ratio("Beat them cunting nigger faggots with a communist dick") must_== 4d/9
ratio("hello there") must_== 0
ratio("") must_== 0
}