-
Notifications
You must be signed in to change notification settings - Fork 45
/
Copy pathSpellCheck.jl
152 lines (138 loc) · 7.37 KB
/
SpellCheck.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
module SpellCheck
using ..ProblemBenchmarks: PROBLEM_DATA_DIR
using Printf
# Peter Norvig's Spelling Corrector (http://norvig.com/spell-correct.html)
# I've attempted to keep this close to the python version, although in some
# cases the python version is tighter because of some syntax that is not
# available in julia (e.g., guards in comprehensions). The python version, at
# the time of original submission, is also 5-6x faster than the julia version.
#
# @kmsquire, 2013-11-29
const ALPHABET = "abcdefghijklmnopqrstuvwxyz"
words(text) = eachmatch(r"[a-z]+", lowercase(text))
function train(features)
model = Dict{String, Int}()
for f in features
model[f.match] = 2
end
return model
end
const NWORDS = train(words(read(joinpath(PROBLEM_DATA_DIR, "norvig_spellcheck.txt"), String)))
function edits(word::AbstractString)
splits = [(word[1:i], word[i+1:end]) for i=0:length(word) ]
deletes = ["$a$(b[2:end])" for (a,b) in splits[1:end-1]]
transposes = ["$a$(b[2])$(b[1])$(b[3:end])" for (a,b) in splits[1:end-2]]
replaces = ["$a$c$(b[2:end])" for (a,b) in splits[1:end-1], c in ALPHABET]
inserts = ["$a$c$b" for (a,b) in splits, c in ALPHABET]
return Set([deletes; transposes; replaces[:]; inserts[:]])
end
function known_edits(word::AbstractString)
xs = Set{String}()
for e1 in edits(word)
for e2 in edits(e1)
haskey(NWORDS, e2) && push!(xs, e2)
end
end
return xs
end
function known(words)
xs = Set{String}()
for word in words
haskey(NWORDS, word) && push!(xs, word)
end
return xs
end
function correct(word::AbstractString)
candidates = known([word])
length(candidates) == 0 && (candidates = known(edits(word)))
length(candidates) == 0 && (candidates = known_edits(word) )
length(candidates) == 0 && (candidates = Set([word]) )
return maximum(x->(get(NWORDS, x, 0),x), candidates)[2]
end
function perf_spellcheck(; bias=0, verbose=false)
n, bad, unknown, start = 0, 0, 0, time_ns()
if bias > 0
for target in keys(TEST_DATA)
NWORDS[target] = get(NWORDS, target, 1) + bias
end
end
for (target,wrongs) in TEST_DATA
for wrong in split(wrongs)
n += 1
w = correct(wrong)
if w!=target
bad += 1
unknown += !(target in keys(NWORDS))
if verbose
@printf("correct(%s) => %s (%d); expected %s (%d)\n",
wrong, w, NWORDS[w], target, NWORDS[target])
end
end
end
end
return Dict("bad"=>bad, "n"=>n, "bias"=>bias,
"pct"=>round(Int, 100.0 - 100.0 * bad / n),
"unknown"=>unknown, "secs"=>(time_ns()-start)/1e9)
end
const TEST_DATA = Dict("access"=> "acess", "accessing"=> "accesing", "accommodation"=>
"accomodation acommodation acomodation", "account"=> "acount", "address"=>
"adress adres", "addressable"=> "addresable", "arranged"=> "aranged arrainged",
"arrangeing"=> "aranging", "arrangement"=> "arragment", "articles"=> "articals",
"aunt"=> "annt anut arnt", "auxiliary"=> "auxillary", "available"=> "avaible",
"awful"=> "awfall afful", "basically"=> "basicaly", "beginning"=> "begining",
"benefit"=> "benifit", "benefits"=> "benifits", "between"=> "beetween", "bicycle"=>
"bicycal bycicle bycycle", "biscuits"=>
"biscits biscutes biscuts bisquits buiscits buiscuts", "built"=> "biult",
"cake"=> "cak", "career"=> "carrer",
"cemetery"=> "cemetary semetary", "centrally"=> "centraly", "certain"=> "cirtain",
"challenges"=> "chalenges chalenges", "chapter"=> "chaper chaphter chaptur",
"choice"=> "choise", "choosing"=> "chosing", "clerical"=> "clearical",
"committee"=> "comittee", "compare"=> "compair", "completely"=> "completly",
"consider"=> "concider", "considerable"=> "conciderable", "contented"=>
"contenpted contende contended contentid", "curtains"=>
"cartains certans courtens cuaritains curtans curtians curtions", "decide"=> "descide", "decided"=>
"descided", "definitely"=> "definately difinately", "definition"=> "defenition",
"definitions"=> "defenitions", "description"=> "discription", "desiccate"=>
"desicate dessicate dessiccate", "diagrammatically"=> "diagrammaticaally",
"different"=> "diffrent", "driven"=> "dirven", "ecstasy"=> "exstacy ecstacy",
"embarrass"=> "embaras embarass", "establishing"=> "astablishing establising",
"experience"=> "experance experiance", "experiences"=> "experances", "extended"=>
"extented", "extremely"=> "extreamly", "fails"=> "failes", "families"=> "familes",
"february"=> "febuary", "further"=> "futher", "gallery"=> "galery gallary gallerry gallrey",
"hierarchal"=> "hierachial", "hierarchy"=> "hierchy", "inconvenient"=>
"inconvienient inconvient inconvinient", "independent"=> "independant independant",
"initial"=> "intial", "initials"=> "inetials inistals initails initals intials",
"juice"=> "guic juce jucie juise juse", "latest"=> "lates latets latiest latist",
"laugh"=> "lagh lauf laught lugh", "level"=> "leval",
"levels"=> "levals", "liaison"=> "liaision liason", "lieu"=> "liew", "literature"=>
"litriture", "loans"=> "lones", "locally"=> "localy", "magnificent"=>
"magnificnet magificent magnifcent magnifecent magnifiscant magnifisent magnificant",
"management"=> "managment", "meant"=> "ment", "minuscule"=> "miniscule",
"minutes"=> "muinets", "monitoring"=> "monitering", "necessary"=>
"neccesary necesary neccesary necassary necassery neccasary", "occurrence"=>
"occurence occurence", "often"=> "ofen offen offten ofton", "opposite"=>
"opisite oppasite oppesite oppisit oppisite opposit oppossite oppossitte", "parallel"=>
"paralel paralell parrallel parralell parrallell", "particular"=> "particulaur",
"perhaps"=> "perhapse", "personnel"=> "personnell", "planned"=> "planed", "poem"=>
"poame", "poems"=> "poims pomes", "poetry"=> "poartry poertry poetre poety powetry",
"position"=> "possition", "possible"=> "possable", "pretend"=>
"pertend protend prtend pritend", "problem"=> "problam proble promblem proplen",
"pronunciation"=> "pronounciation", "purple"=> "perple perpul poarple",
"questionnaire"=> "questionaire", "really"=> "realy relley relly", "receipt"=>
"receit receite reciet recipt", "receive"=> "recieve", "refreshment"=>
"reafreshment refreshmant refresment refressmunt", "remember"=> "rember remeber rememmer rermember",
"remind"=> "remine remined", "scarcely"=> "scarcly scarecly scarely scarsely",
"scissors"=> "scisors sissors", "separate"=> "seperate",
"singular"=> "singulaur", "someone"=> "somone", "sources"=> "sorces", "southern"=>
"southen", "special"=> "speaical specail specal speical", "splendid"=>
"spledid splended splened splended", "standardizing"=> "stanerdizing", "stomach"=>
"stomac stomache stomec stumache", "supersede"=> "supercede superceed", "there"=> "ther",
"totally"=> "totaly", "transferred"=> "transfred", "transportability"=>
"transportibility", "triangular"=> "triangulaur", "understand"=> "undersand undistand",
"unexpected"=> "unexpcted unexpeted unexspected", "unfortunately"=>
"unfortunatly", "unique"=> "uneque", "useful"=> "usefull", "valuable"=> "valubale valuble",
"variable"=> "varable", "variant"=> "vairiant", "various"=> "vairious",
"visited"=> "fisited viseted vistid vistied", "visitors"=> "vistors",
"voluntary"=> "volantry", "voting"=> "voteing", "wanted"=> "wantid wonted",
"whether"=> "wether", "wrote"=> "rote wote")
end # module