From e4f3ef2b07b1110a68bb3ab555869f06f86e2799 Mon Sep 17 00:00:00 2001 From: Jordan Kay Date: Fri, 20 Feb 2015 11:58:13 -0800 Subject: [PATCH 1/4] Add Swift versions of basic components Include tests that are identical to previous tests, but test the corresponding Swift, not Objective-C, component. This is to ensure the new Swift components behave exactly the same as their Objective-C counterparts. --- Parsimmon/Parsimmon.xcodeproj/project.pbxproj | 72 +++++++++++++++---- Parsimmon/Parsimmon/Analyzer.swift | 34 +++++++++ Parsimmon/Parsimmon/Lemmatizer.swift | 47 ++++++++++++ Parsimmon/Parsimmon/Seed.swift | 21 ++++++ Parsimmon/Parsimmon/TaggedToken.swift | 54 ++++++++++++++ Parsimmon/Parsimmon/Tagger.swift | 49 +++++++++++++ Parsimmon/Parsimmon/Tokenizer.swift | 47 ++++++++++++ Parsimmon/ParsimmonTests/LemmatizerTests.m | 54 ++++++++++++++ Parsimmon/ParsimmonTests/TaggerTests.m | 51 +++++++++++++ Parsimmon/ParsimmonTests/TokenizerTests.swift | 53 ++++++++++++++ 10 files changed, 468 insertions(+), 14 deletions(-) create mode 100644 Parsimmon/Parsimmon/Analyzer.swift create mode 100644 Parsimmon/Parsimmon/Lemmatizer.swift create mode 100644 Parsimmon/Parsimmon/Seed.swift create mode 100644 Parsimmon/Parsimmon/TaggedToken.swift create mode 100644 Parsimmon/Parsimmon/Tagger.swift create mode 100644 Parsimmon/Parsimmon/Tokenizer.swift create mode 100644 Parsimmon/ParsimmonTests/LemmatizerTests.m create mode 100644 Parsimmon/ParsimmonTests/TaggerTests.m create mode 100644 Parsimmon/ParsimmonTests/TokenizerTests.swift diff --git a/Parsimmon/Parsimmon.xcodeproj/project.pbxproj b/Parsimmon/Parsimmon.xcodeproj/project.pbxproj index f494ffe..3fd67e4 100644 --- a/Parsimmon/Parsimmon.xcodeproj/project.pbxproj +++ b/Parsimmon/Parsimmon.xcodeproj/project.pbxproj @@ -8,16 +8,29 @@ /* Begin PBXBuildFile section */ 45166BDE1A94265800D0E013 /* ParsimmonTokenizer.swift in Sources */ = {isa = PBXBuildFile; fileRef = B6139F6919442FB700FC6CAA /* ParsimmonTokenizer.swift */; }; + 452C8BA51A96B19D003D7441 /* Seed.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BA41A96B19D003D7441 /* Seed.swift */; }; + 452C8BA71A96B557003D7441 /* Tokenizer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BA61A96B557003D7441 /* Tokenizer.swift */; }; + 452C8BAA1A96BC81003D7441 /* Tokenizer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BA61A96B557003D7441 /* Tokenizer.swift */; }; + 452C8BAB1A96BCBB003D7441 /* Seed.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BA41A96B19D003D7441 /* Seed.swift */; }; + 452C8BAF1A96C27F003D7441 /* Analyzer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BAE1A96C27F003D7441 /* Analyzer.swift */; }; + 452C8BB01A96E501003D7441 /* Analyzer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BAE1A96C27F003D7441 /* Analyzer.swift */; }; + 452C8BB21A96E589003D7441 /* Tagger.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BB11A96E589003D7441 /* Tagger.swift */; }; + 452C8BB41A96E58F003D7441 /* Lemmatizer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BB31A96E58F003D7441 /* Lemmatizer.swift */; }; + 452C8BB61A96E5DC003D7441 /* TaggedToken.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BB51A96E5DC003D7441 /* TaggedToken.swift */; }; + 452C8BB81A97B944003D7441 /* Lemmatizer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BB31A96E58F003D7441 /* Lemmatizer.swift */; }; + 452C8BB91A97B94E003D7441 /* TaggedToken.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BB51A96E5DC003D7441 /* TaggedToken.swift */; }; 459B01491A9534B0000859A1 /* NaiveBayesClassifier.swift in Sources */ = {isa = PBXBuildFile; fileRef = 459B01481A9534B0000859A1 /* NaiveBayesClassifier.swift */; }; 459B014B1A954B98000859A1 /* NaiveBayesClassifierTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 459B014A1A954B98000859A1 /* NaiveBayesClassifierTests.swift */; }; 459B014D1A955E3D000859A1 /* Functions.swift in Sources */ = {isa = PBXBuildFile; fileRef = 459B014C1A955E3D000859A1 /* Functions.swift */; }; + 459BE4051A97C99D0008714F /* ParsimmonLemmatizerTests.m in Sources */ = {isa = PBXBuildFile; fileRef = 459BE4021A97C99D0008714F /* ParsimmonLemmatizerTests.m */; }; + 459BE4061A97C99D0008714F /* ParsimmonTaggerTests.m in Sources */ = {isa = PBXBuildFile; fileRef = 459BE4031A97C99D0008714F /* ParsimmonTaggerTests.m */; }; + 459BE4071A97C99D0008714F /* ParsimmonTokenizerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 459BE4041A97C99D0008714F /* ParsimmonTokenizerTests.swift */; }; + 45B4EC5B1A97C65900B7B038 /* Tagger.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BB11A96E589003D7441 /* Tagger.swift */; }; B6139F6A19442FB700FC6CAA /* ParsimmonTokenizer.swift in Sources */ = {isa = PBXBuildFile; fileRef = B6139F6919442FB700FC6CAA /* ParsimmonTokenizer.swift */; }; - B6139F701944D59F00FC6CAA /* ParsimmonTokenizerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = B6139F6E194433BA00FC6CAA /* ParsimmonTokenizerTests.swift */; }; + B6139F701944D59F00FC6CAA /* TokenizerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = B6139F6E194433BA00FC6CAA /* TokenizerTests.swift */; }; B63E18C418E618160006BD3E /* InfoPlist.strings in Resources */ = {isa = PBXBuildFile; fileRef = B63E18BC18E618160006BD3E /* InfoPlist.strings */; }; B63E18C518E618160006BD3E /* ParsimmonTests-Info.plist in Resources */ = {isa = PBXBuildFile; fileRef = B63E18BE18E618160006BD3E /* ParsimmonTests-Info.plist */; }; B63E18CB18E6196A0006BD3E /* ParsimmonDecisionTreeTests.m in Sources */ = {isa = PBXBuildFile; fileRef = B63E18C018E618160006BD3E /* ParsimmonDecisionTreeTests.m */; }; - B63E18CC18E6196D0006BD3E /* ParsimmonLemmatizerTests.m in Sources */ = {isa = PBXBuildFile; fileRef = B63E18C118E618160006BD3E /* ParsimmonLemmatizerTests.m */; }; - B63E18CD18E619710006BD3E /* ParsimmonTaggerTests.m in Sources */ = {isa = PBXBuildFile; fileRef = B63E18C218E618160006BD3E /* ParsimmonTaggerTests.m */; }; B670057B1807D79500CFF860 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = B670057A1807D79500CFF860 /* Foundation.framework */; }; B670057D1807D79500CFF860 /* CoreGraphics.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = B670057C1807D79500CFF860 /* CoreGraphics.framework */; }; B67005851807D79500CFF860 /* InfoPlist.strings in Resources */ = {isa = PBXBuildFile; fileRef = B67005831807D79500CFF860 /* InfoPlist.strings */; }; @@ -49,18 +62,27 @@ /* End PBXContainerItemProxy section */ /* Begin PBXFileReference section */ + 452C8BA41A96B19D003D7441 /* Seed.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Seed.swift; sourceTree = ""; }; + 452C8BA61A96B557003D7441 /* Tokenizer.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Tokenizer.swift; sourceTree = ""; }; + 452C8BAE1A96C27F003D7441 /* Analyzer.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Analyzer.swift; sourceTree = ""; }; + 452C8BB11A96E589003D7441 /* Tagger.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Tagger.swift; sourceTree = ""; }; + 452C8BB31A96E58F003D7441 /* Lemmatizer.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Lemmatizer.swift; sourceTree = ""; }; + 452C8BB51A96E5DC003D7441 /* TaggedToken.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = TaggedToken.swift; sourceTree = ""; }; 459B01481A9534B0000859A1 /* NaiveBayesClassifier.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = NaiveBayesClassifier.swift; sourceTree = ""; }; 459B014A1A954B98000859A1 /* NaiveBayesClassifierTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = NaiveBayesClassifierTests.swift; sourceTree = ""; }; 459B014C1A955E3D000859A1 /* Functions.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Functions.swift; sourceTree = ""; }; + 459BE4021A97C99D0008714F /* ParsimmonLemmatizerTests.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = ParsimmonLemmatizerTests.m; sourceTree = ""; }; + 459BE4031A97C99D0008714F /* ParsimmonTaggerTests.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = ParsimmonTaggerTests.m; sourceTree = ""; }; + 459BE4041A97C99D0008714F /* ParsimmonTokenizerTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ParsimmonTokenizerTests.swift; sourceTree = ""; }; B6139F6919442FB700FC6CAA /* ParsimmonTokenizer.swift */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.swift; path = ParsimmonTokenizer.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; B6139F6B1944318F00FC6CAA /* ParsimmonTests-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "ParsimmonTests-Bridging-Header.h"; sourceTree = ""; }; - B6139F6E194433BA00FC6CAA /* ParsimmonTokenizerTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.swift; path = ParsimmonTokenizerTests.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; + B6139F6E194433BA00FC6CAA /* TokenizerTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.swift; path = TokenizerTests.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; B61BB186194429CF003E7B7B /* Parsimmon-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "Parsimmon-Bridging-Header.h"; sourceTree = ""; }; B63E18BD18E618160006BD3E /* en */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = en; path = en.lproj/InfoPlist.strings; sourceTree = ""; }; B63E18BE18E618160006BD3E /* ParsimmonTests-Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = "ParsimmonTests-Info.plist"; sourceTree = ""; }; B63E18C018E618160006BD3E /* ParsimmonDecisionTreeTests.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = ParsimmonDecisionTreeTests.m; sourceTree = ""; }; - B63E18C118E618160006BD3E /* ParsimmonLemmatizerTests.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = ParsimmonLemmatizerTests.m; sourceTree = ""; }; - B63E18C218E618160006BD3E /* ParsimmonTaggerTests.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = ParsimmonTaggerTests.m; sourceTree = ""; }; + B63E18C118E618160006BD3E /* LemmatizerTests.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = LemmatizerTests.m; sourceTree = ""; }; + B63E18C218E618160006BD3E /* TaggerTests.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = TaggerTests.m; sourceTree = ""; }; B67005771807D79500CFF860 /* Parsimmon.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Parsimmon.app; sourceTree = BUILT_PRODUCTS_DIR; }; B670057A1807D79500CFF860 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; B670057C1807D79500CFF860 /* CoreGraphics.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreGraphics.framework; path = System/Library/Frameworks/CoreGraphics.framework; sourceTree = SDKROOT; }; @@ -124,10 +146,13 @@ 459B014A1A954B98000859A1 /* NaiveBayesClassifierTests.swift */, B63E18BE18E618160006BD3E /* ParsimmonTests-Info.plist */, B63E18C018E618160006BD3E /* ParsimmonDecisionTreeTests.m */, - B63E18C118E618160006BD3E /* ParsimmonLemmatizerTests.m */, - B63E18C218E618160006BD3E /* ParsimmonTaggerTests.m */, + 459BE4021A97C99D0008714F /* ParsimmonLemmatizerTests.m */, + 459BE4031A97C99D0008714F /* ParsimmonTaggerTests.m */, + 459BE4041A97C99D0008714F /* ParsimmonTokenizerTests.swift */, + B63E18C118E618160006BD3E /* LemmatizerTests.m */, + B63E18C218E618160006BD3E /* TaggerTests.m */, B6139F6B1944318F00FC6CAA /* ParsimmonTests-Bridging-Header.h */, - B6139F6E194433BA00FC6CAA /* ParsimmonTokenizerTests.swift */, + B6139F6E194433BA00FC6CAA /* TokenizerTests.swift */, ); path = ParsimmonTests; sourceTree = ""; @@ -194,13 +219,18 @@ B67005AD1807D7F600CFF860 /* Parsimmon */ = { isa = PBXGroup; children = ( + 452C8BAE1A96C27F003D7441 /* Analyzer.swift */, + 459B014C1A955E3D000859A1 /* Functions.swift */, B6700608180A2A5A00CFF860 /* Parsimmon.h */, + B61BB186194429CF003E7B7B /* Parsimmon-Bridging-Header.h */, B6139F6919442FB700FC6CAA /* ParsimmonTokenizer.swift */, B67005B91809CD5600CFF860 /* ParsimmonTagger.h */, B67005BA1809CD5600CFF860 /* ParsimmonTagger.m */, B67005BF180A05FC00CFF860 /* ParsimmonLemmatizer.h */, B67005C0180A05FC00CFF860 /* ParsimmonLemmatizer.m */, - 459B014C1A955E3D000859A1 /* Functions.swift */, + 452C8BB31A96E58F003D7441 /* Lemmatizer.swift */, + 452C8BA61A96B557003D7441 /* Tokenizer.swift */, + 452C8BB11A96E589003D7441 /* Tagger.swift */, B6B05E05180A858000D7F34F /* Classifiers */, B67005C6180A19BD00CFF860 /* Seedlings */, ); @@ -214,7 +244,8 @@ B67005BD1809CE2F00CFF860 /* ParsimmonTaggedToken.m */, B67005C2180A0A1D00CFF860 /* ParsimmonSeed.h */, B67005C3180A0A1D00CFF860 /* ParsimmonSeed.m */, - B61BB186194429CF003E7B7B /* Parsimmon-Bridging-Header.h */, + 452C8BA41A96B19D003D7441 /* Seed.swift */, + 452C8BB51A96E5DC003D7441 /* TaggedToken.swift */, ); name = Seedlings; sourceTree = ""; @@ -336,18 +367,24 @@ buildActionMask = 2147483647; files = ( B67005C4180A0A1D00CFF860 /* ParsimmonSeed.m in Sources */, + 452C8BB61A96E5DC003D7441 /* TaggedToken.swift in Sources */, + 452C8BA51A96B19D003D7441 /* Seed.swift in Sources */, B6A43FD618837CF6000F61BA /* ParsimmonNode.m in Sources */, B67005911807D79500CFF860 /* TaggerViewController.m in Sources */, B6A43FD318837077000F61BA /* ParsimmonDecisionTree.m in Sources */, + 452C8BB41A96E58F003D7441 /* Lemmatizer.swift in Sources */, + 452C8BB21A96E589003D7441 /* Tagger.swift in Sources */, B670058B1807D79500CFF860 /* AppDelegate.m in Sources */, 459B014D1A955E3D000859A1 /* Functions.swift in Sources */, B67005871807D79500CFF860 /* main.m in Sources */, B6139F6A19442FB700FC6CAA /* ParsimmonTokenizer.swift in Sources */, B67005C1180A05FC00CFF860 /* ParsimmonLemmatizer.m in Sources */, B67005BE1809CE2F00CFF860 /* ParsimmonTaggedToken.m in Sources */, + 452C8BA71A96B557003D7441 /* Tokenizer.swift in Sources */, B6B05E36180B633F00D7F34F /* ClassifierViewController.m in Sources */, B67005BB1809CD5600CFF860 /* ParsimmonTagger.m in Sources */, 459B01491A9534B0000859A1 /* NaiveBayesClassifier.swift in Sources */, + 452C8BAF1A96C27F003D7441 /* Analyzer.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -355,12 +392,19 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( - B6139F701944D59F00FC6CAA /* ParsimmonTokenizerTests.swift in Sources */, 459B014B1A954B98000859A1 /* NaiveBayesClassifierTests.swift in Sources */, - B63E18CC18E6196D0006BD3E /* ParsimmonLemmatizerTests.m in Sources */, - B63E18CD18E619710006BD3E /* ParsimmonTaggerTests.m in Sources */, + 459BE4071A97C99D0008714F /* ParsimmonTokenizerTests.swift in Sources */, 45166BDE1A94265800D0E013 /* ParsimmonTokenizer.swift in Sources */, + B6139F701944D59F00FC6CAA /* TokenizerTests.swift in Sources */, + 452C8BB81A97B944003D7441 /* Lemmatizer.swift in Sources */, + 459BE4051A97C99D0008714F /* ParsimmonLemmatizerTests.m in Sources */, + 452C8BB91A97B94E003D7441 /* TaggedToken.swift in Sources */, + 45B4EC5B1A97C65900B7B038 /* Tagger.swift in Sources */, B63E18CB18E6196A0006BD3E /* ParsimmonDecisionTreeTests.m in Sources */, + 452C8BAB1A96BCBB003D7441 /* Seed.swift in Sources */, + 452C8BAA1A96BC81003D7441 /* Tokenizer.swift in Sources */, + 452C8BB01A96E501003D7441 /* Analyzer.swift in Sources */, + 459BE4061A97C99D0008714F /* ParsimmonTaggerTests.m in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/Parsimmon/Parsimmon/Analyzer.swift b/Parsimmon/Parsimmon/Analyzer.swift new file mode 100644 index 0000000..f0a8b52 --- /dev/null +++ b/Parsimmon/Parsimmon/Analyzer.swift @@ -0,0 +1,34 @@ +// +// Enumerator.swift +// Parsimmon +// +// Created by Jordan Kay on 2/19/15. +// +// + +import Foundation + +typealias Pair = (String, String) + +protocol Analyzer { + var seed: Seed { get } + var scheme: String { get } +} + +internal func analyze(analyzer: Analyzer, text: String, options: NSLinguisticTaggerOptions?) -> [Pair] { + var pairs: [Pair] = [] + + let range = NSRange(location: 0, length: count(text)) + let options = options ?? analyzer.seed.linguisticTaggerOptions + let tagger = analyzer.seed.linguisticTaggerWithOptions(options) + + tagger.string = text + tagger.enumerateTagsInRange(range, scheme: analyzer.scheme, options: options) { (tag: String?, tokenRange, range, stop) in + if let tag = tag { + let token = (text as NSString).substringWithRange(tokenRange) + let pair = (token, tag) + pairs.append(pair) + } + } + return pairs +} diff --git a/Parsimmon/Parsimmon/Lemmatizer.swift b/Parsimmon/Parsimmon/Lemmatizer.swift new file mode 100644 index 0000000..1c21501 --- /dev/null +++ b/Parsimmon/Parsimmon/Lemmatizer.swift @@ -0,0 +1,47 @@ +// Tokenizer.swift +// +// Copyright (c) 2015 Ayaka Nonaka +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +import Foundation + +public class Lemmatizer: NSObject, Analyzer { + let seed: Seed + + var scheme: String { + return NSLinguisticTagSchemeLemma + } + + init(seed: Seed = Seed()) { + self.seed = seed + } + + override convenience init() { + self.init(seed: Seed()) + } + + func lemmatizeWordsInText(text: String) -> [String] { + return lemmatizeText(text, options: nil) + } + + func lemmatizeText(text: String, options: NSLinguisticTaggerOptions?) -> [String] { + return analyze(self, text, options).map { (token, lemma) in lemma } + } +} diff --git a/Parsimmon/Parsimmon/Seed.swift b/Parsimmon/Parsimmon/Seed.swift new file mode 100644 index 0000000..bf72ddf --- /dev/null +++ b/Parsimmon/Parsimmon/Seed.swift @@ -0,0 +1,21 @@ +// +// Seed.swift +// Parsimmon +// +// Created by Jordan Kay on 2/19/15. +// +// + +import Foundation + +struct Seed { + typealias Language = String + + private let language: Language = "en" + let linguisticTaggerOptions: NSLinguisticTaggerOptions = .OmitWhitespace | .OmitPunctuation | .OmitOther + + func linguisticTaggerWithOptions(options: NSLinguisticTaggerOptions) -> NSLinguisticTagger { + let tagSchemes = NSLinguisticTagger.availableTagSchemesForLanguage(self.language) + return NSLinguisticTagger(tagSchemes: tagSchemes, options: Int(options.rawValue)) + } +} diff --git a/Parsimmon/Parsimmon/TaggedToken.swift b/Parsimmon/Parsimmon/TaggedToken.swift new file mode 100644 index 0000000..d4237a8 --- /dev/null +++ b/Parsimmon/Parsimmon/TaggedToken.swift @@ -0,0 +1,54 @@ +// +// TaggedToken.swift +// Parsimmon +// +// Created by Jordan Kay on 2/19/15. +// +// + +import Foundation + +// +// ParsimmonTaggedToken.swift +// Parsimmon +// +// Created by Jordan Kay on 2/17/15. +// +// + +import Foundation + +class TaggedToken: NSObject, Equatable { + let token: String + let tag: String + + init(token: String, tag: String) { + self.token = token + self.tag = tag + } + + override var hash: Int { + return token.hash ^ tag.hash + } + + override func isEqual(object: AnyObject?) -> Bool { + if let taggedToken = object as? TaggedToken { + return isEqualToTaggedToken(taggedToken) + } + return false + } + + private func isEqualToTaggedToken(taggedToken: TaggedToken) -> Bool { + return token == taggedToken.token && tag == taggedToken.tag + } +} + +extension TaggedToken: Printable { + override var description: String { + return "('\(token)' \(tag))" + } +} + +func ==(lhs: TaggedToken, rhs: TaggedToken) -> Bool { + return lhs.token == rhs.token && lhs.tag == rhs.tag +} diff --git a/Parsimmon/Parsimmon/Tagger.swift b/Parsimmon/Parsimmon/Tagger.swift new file mode 100644 index 0000000..da36d1b --- /dev/null +++ b/Parsimmon/Parsimmon/Tagger.swift @@ -0,0 +1,49 @@ +// Tokenizer.swift +// +// Copyright (c) 2015 Ayaka Nonaka +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +import Foundation + +public class Tagger: NSObject, Analyzer { + let seed: Seed + + var scheme: String { + return NSLinguisticTagSchemeNameTypeOrLexicalClass + } + + init(seed: Seed) { + self.seed = seed + } + + override convenience init() { + self.init(seed: Seed()) + } + + func tagWordsInText(text: String) -> [TaggedToken] { + return tagText(text, options: nil) + } + + func tagText(text: String, options: NSLinguisticTaggerOptions?) -> [TaggedToken] { + return analyze(self, text, options).map { (token, tag) in + TaggedToken(token: token, tag: tag) + } + } +} diff --git a/Parsimmon/Parsimmon/Tokenizer.swift b/Parsimmon/Parsimmon/Tokenizer.swift new file mode 100644 index 0000000..d1d0641 --- /dev/null +++ b/Parsimmon/Parsimmon/Tokenizer.swift @@ -0,0 +1,47 @@ +// Tokenizer.swift +// +// Copyright (c) 2015 Ayaka Nonaka +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +import Foundation + +public class Tokenizer: NSObject, Analyzer { + let seed: Seed + + var scheme: String { + return NSLinguisticTagSchemeNameTypeOrLexicalClass + } + + init(seed: Seed) { + self.seed = seed + } + + override convenience init() { + self.init(seed: Seed()) + } + + func tokenize(text: String) -> [String] { + return tokenize(text, options: nil) + } + + func tokenize(text: String, options: NSLinguisticTaggerOptions?) -> [String] { + return analyze(self, text, options).map { (token, tag) in token } + } +} diff --git a/Parsimmon/ParsimmonTests/LemmatizerTests.m b/Parsimmon/ParsimmonTests/LemmatizerTests.m new file mode 100644 index 0000000..721c4ac --- /dev/null +++ b/Parsimmon/ParsimmonTests/LemmatizerTests.m @@ -0,0 +1,54 @@ +// +// LemmatizerTests.m +// Parsimmon +// +// Created by Hector Zarate on 10/24/13. +// +// + +#import +#import "ParsimmonTests-Swift.h" + +@interface LemmatizerTests : XCTestCase + +@end + +@implementation LemmatizerTests + +- (void)setUp +{ + [super setUp]; + // Put setup code here; it will be run once, before the first test case. +} + +- (void)tearDown +{ + // Put teardown code here; it will be run once, after the last test case. + [super tearDown]; +} + +- (void)testLemmatizeWordsInText +{ + NSString *testString = @"Diane, I'm holding in my hand a small box of chocolate bunnies."; + + NSArray *expectedOutput = @[@"diane", + @"i", + @"hold", + @"in", + @"my", + @"hand", + @"a", + @"small", + @"box", + @"of", + @"chocolate", + @"bunny"]; + + Lemmatizer *lemmatizer = [[Lemmatizer alloc] init]; + + NSArray *lemmatizedTokenStrings = [lemmatizer lemmatizeWordsInText:testString]; + + XCTAssertEqualObjects(expectedOutput, lemmatizedTokenStrings, @"Failed to lematize words in text"); +} + +@end diff --git a/Parsimmon/ParsimmonTests/TaggerTests.m b/Parsimmon/ParsimmonTests/TaggerTests.m new file mode 100644 index 0000000..c404a59 --- /dev/null +++ b/Parsimmon/ParsimmonTests/TaggerTests.m @@ -0,0 +1,51 @@ +// +// ParsimmonTaggedTokenTests.m +// Parsimmon +// +// Created by Hector Zarate on 10/18/13. +// +// + +#import +#import "ParsimmonTests-Swift.h" + +@interface TaggerTests : XCTestCase + +@end + +@implementation TaggerTests + +- (void)setUp +{ + [super setUp]; + // Put setup code here; it will be run once, before the first test case. +} + +- (void)tearDown +{ + // Put teardown code here; it will be run once, after the last test case. + [super tearDown]; +} + +- (void)testTagWordsInText +{ + NSArray *expectedTaggedTokens = @[[[TaggedToken alloc] initWithToken:@"The" tag:@"Determiner"], + [[TaggedToken alloc] initWithToken:@"quick" tag:@"Adjective"], + [[TaggedToken alloc] initWithToken:@"brown" tag:@"Adjective"], + [[TaggedToken alloc] initWithToken:@"fox" tag:@"Noun"], + [[TaggedToken alloc] initWithToken:@"jumps" tag:@"Noun"], + [[TaggedToken alloc] initWithToken:@"over" tag:@"Preposition"], + [[TaggedToken alloc] initWithToken:@"the" tag:@"Determiner"], + [[TaggedToken alloc] initWithToken:@"lazy" tag:@"Adjective"], + [[TaggedToken alloc] initWithToken:@"dog" tag:@"Noun"]]; + + NSString *testStringOne = @"The quick brown fox jumps over the lazy dog"; + + Tagger *tagger = [[Tagger alloc] init]; + + NSArray *taggedTokens = [tagger tagWordsInText:testStringOne]; + + XCTAssertEqualObjects(taggedTokens, expectedTaggedTokens, @"Failed to tagged words in text"); +} + +@end diff --git a/Parsimmon/ParsimmonTests/TokenizerTests.swift b/Parsimmon/ParsimmonTests/TokenizerTests.swift new file mode 100644 index 0000000..0de36ad --- /dev/null +++ b/Parsimmon/ParsimmonTests/TokenizerTests.swift @@ -0,0 +1,53 @@ +// TokenizerTests.swift +// +// Copyright (c) 2014 Ayaka Nonaka +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +import XCTest +import Parsimmon + +class TokenizerTests : XCTestCase { + + func testTokenizeWords() { + let expectedTokens = ["I", + "the", + "quick", + "brown", + "fox", + "jumped", + "over", + "the", + "lazy", + "dog"] + + let testStringOne = "I, the quick brown fox jumped over the lazy dog..." + + let tokenizer = Tokenizer(); + let tokens = tokenizer.tokenize(testStringOne); + + XCTAssertEqual(tokens, expectedTokens, "Failed to tokenize words in text") + } + + func testTokenizeAllWhitespace() { + let tokenizer = Tokenizer(); + let tokens = tokenizer.tokenize(" "); + XCTAssertEqual(tokens, [], "Failed to tokenize all whitespace") + } +} From 44cbd7c440f4c597279a0a3f0ab84270006171b5 Mon Sep 17 00:00:00 2001 From: Jordan Kay Date: Fri, 20 Feb 2015 12:17:57 -0800 Subject: [PATCH 2/4] Remove Objective-C versions of files no longer needed --- Parsimmon/Example/ClassifierViewController.m | 1 - Parsimmon/Example/TaggerViewController.m | 6 +- Parsimmon/Parsimmon.xcodeproj/project.pbxproj | 48 +--- .../Parsimmon/NaiveBayesClassifier.swift | 6 +- .../Parsimmon/Parsimmon-Bridging-Header.h | 1 - Parsimmon/Parsimmon/ParsimmonLemmatizer.h | 43 ---- Parsimmon/Parsimmon/ParsimmonLemmatizer.m | 49 ---- ...Seed.h => ParsimmonNaiveBayesClassifier.h} | 42 ++-- .../Parsimmon/ParsimmonNaiveBayesClassifier.m | 218 ++++++++++++++++++ Parsimmon/Parsimmon/ParsimmonSeed.m | 59 ----- Parsimmon/Parsimmon/ParsimmonTaggedToken.h | 41 ---- Parsimmon/Parsimmon/ParsimmonTaggedToken.m | 73 ------ Parsimmon/Parsimmon/ParsimmonTagger.h | 43 ---- Parsimmon/Parsimmon/ParsimmonTagger.m | 50 ---- .../ParsimmonTests/ParsimmonLemmatizerTests.m | 55 ----- .../ParsimmonTests/ParsimmonTaggerTests.m | 52 ----- .../ParsimmonTokenizerTests.swift | 53 ----- 17 files changed, 253 insertions(+), 587 deletions(-) delete mode 100644 Parsimmon/Parsimmon/ParsimmonLemmatizer.h delete mode 100644 Parsimmon/Parsimmon/ParsimmonLemmatizer.m rename Parsimmon/Parsimmon/{ParsimmonSeed.h => ParsimmonNaiveBayesClassifier.h} (53%) create mode 100644 Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.m delete mode 100644 Parsimmon/Parsimmon/ParsimmonSeed.m delete mode 100644 Parsimmon/Parsimmon/ParsimmonTaggedToken.h delete mode 100644 Parsimmon/Parsimmon/ParsimmonTaggedToken.m delete mode 100644 Parsimmon/Parsimmon/ParsimmonTagger.h delete mode 100644 Parsimmon/Parsimmon/ParsimmonTagger.m delete mode 100644 Parsimmon/ParsimmonTests/ParsimmonLemmatizerTests.m delete mode 100644 Parsimmon/ParsimmonTests/ParsimmonTaggerTests.m delete mode 100644 Parsimmon/ParsimmonTests/ParsimmonTokenizerTests.swift diff --git a/Parsimmon/Example/ClassifierViewController.m b/Parsimmon/Example/ClassifierViewController.m index c983021..62605d5 100644 --- a/Parsimmon/Example/ClassifierViewController.m +++ b/Parsimmon/Example/ClassifierViewController.m @@ -7,7 +7,6 @@ // #import "ClassifierViewController.h" -#import "Parsimmon.h" #import "Parsimmon-Swift.h" @interface ClassifierViewController () diff --git a/Parsimmon/Example/TaggerViewController.m b/Parsimmon/Example/TaggerViewController.m index 7a4bc36..36f0599 100644 --- a/Parsimmon/Example/TaggerViewController.m +++ b/Parsimmon/Example/TaggerViewController.m @@ -7,13 +7,13 @@ // #import "TaggerViewController.h" -#import "Parsimmon.h" +#import "Parsimmon-Swift.h" @interface TaggerViewController () @property (weak, nonatomic) IBOutlet UITextField *inputTextField; @property (weak, nonatomic) IBOutlet UITextView *outputTextView; -@property (strong, nonatomic) ParsimmonTagger *tagger; +@property (strong, nonatomic) Tagger *tagger; @end @implementation TaggerViewController @@ -22,7 +22,7 @@ - (void)viewDidLoad { [super viewDidLoad]; // Do any additional setup after loading the view, typically from a nib. - self.tagger = [[ParsimmonTagger alloc] init]; + self.tagger = [[Tagger alloc] init]; UITapGestureRecognizer *tap = [[UITapGestureRecognizer alloc] initWithTarget:self action:@selector(dismissKeyboard)]; [self.view addGestureRecognizer:tap]; diff --git a/Parsimmon/Parsimmon.xcodeproj/project.pbxproj b/Parsimmon/Parsimmon.xcodeproj/project.pbxproj index 3fd67e4..1dba79f 100644 --- a/Parsimmon/Parsimmon.xcodeproj/project.pbxproj +++ b/Parsimmon/Parsimmon.xcodeproj/project.pbxproj @@ -7,7 +7,8 @@ objects = { /* Begin PBXBuildFile section */ - 45166BDE1A94265800D0E013 /* ParsimmonTokenizer.swift in Sources */ = {isa = PBXBuildFile; fileRef = B6139F6919442FB700FC6CAA /* ParsimmonTokenizer.swift */; }; + 452631151A97CCD900F51473 /* LemmatizerTests.m in Sources */ = {isa = PBXBuildFile; fileRef = B63E18C118E618160006BD3E /* LemmatizerTests.m */; }; + 452631161A97CCDC00F51473 /* TaggerTests.m in Sources */ = {isa = PBXBuildFile; fileRef = B63E18C218E618160006BD3E /* TaggerTests.m */; }; 452C8BA51A96B19D003D7441 /* Seed.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BA41A96B19D003D7441 /* Seed.swift */; }; 452C8BA71A96B557003D7441 /* Tokenizer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BA61A96B557003D7441 /* Tokenizer.swift */; }; 452C8BAA1A96BC81003D7441 /* Tokenizer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BA61A96B557003D7441 /* Tokenizer.swift */; }; @@ -22,11 +23,7 @@ 459B01491A9534B0000859A1 /* NaiveBayesClassifier.swift in Sources */ = {isa = PBXBuildFile; fileRef = 459B01481A9534B0000859A1 /* NaiveBayesClassifier.swift */; }; 459B014B1A954B98000859A1 /* NaiveBayesClassifierTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 459B014A1A954B98000859A1 /* NaiveBayesClassifierTests.swift */; }; 459B014D1A955E3D000859A1 /* Functions.swift in Sources */ = {isa = PBXBuildFile; fileRef = 459B014C1A955E3D000859A1 /* Functions.swift */; }; - 459BE4051A97C99D0008714F /* ParsimmonLemmatizerTests.m in Sources */ = {isa = PBXBuildFile; fileRef = 459BE4021A97C99D0008714F /* ParsimmonLemmatizerTests.m */; }; - 459BE4061A97C99D0008714F /* ParsimmonTaggerTests.m in Sources */ = {isa = PBXBuildFile; fileRef = 459BE4031A97C99D0008714F /* ParsimmonTaggerTests.m */; }; - 459BE4071A97C99D0008714F /* ParsimmonTokenizerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 459BE4041A97C99D0008714F /* ParsimmonTokenizerTests.swift */; }; 45B4EC5B1A97C65900B7B038 /* Tagger.swift in Sources */ = {isa = PBXBuildFile; fileRef = 452C8BB11A96E589003D7441 /* Tagger.swift */; }; - B6139F6A19442FB700FC6CAA /* ParsimmonTokenizer.swift in Sources */ = {isa = PBXBuildFile; fileRef = B6139F6919442FB700FC6CAA /* ParsimmonTokenizer.swift */; }; B6139F701944D59F00FC6CAA /* TokenizerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = B6139F6E194433BA00FC6CAA /* TokenizerTests.swift */; }; B63E18C418E618160006BD3E /* InfoPlist.strings in Resources */ = {isa = PBXBuildFile; fileRef = B63E18BC18E618160006BD3E /* InfoPlist.strings */; }; B63E18C518E618160006BD3E /* ParsimmonTests-Info.plist in Resources */ = {isa = PBXBuildFile; fileRef = B63E18BE18E618160006BD3E /* ParsimmonTests-Info.plist */; }; @@ -42,10 +39,6 @@ B670059A1807D79500CFF860 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = B67005991807D79500CFF860 /* XCTest.framework */; }; B670059B1807D79500CFF860 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = B670057A1807D79500CFF860 /* Foundation.framework */; }; B67005B31808595600CFF860 /* UIKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = B67005B21808595600CFF860 /* UIKit.framework */; }; - B67005BB1809CD5600CFF860 /* ParsimmonTagger.m in Sources */ = {isa = PBXBuildFile; fileRef = B67005BA1809CD5600CFF860 /* ParsimmonTagger.m */; }; - B67005BE1809CE2F00CFF860 /* ParsimmonTaggedToken.m in Sources */ = {isa = PBXBuildFile; fileRef = B67005BD1809CE2F00CFF860 /* ParsimmonTaggedToken.m */; }; - B67005C1180A05FC00CFF860 /* ParsimmonLemmatizer.m in Sources */ = {isa = PBXBuildFile; fileRef = B67005C0180A05FC00CFF860 /* ParsimmonLemmatizer.m */; }; - B67005C4180A0A1D00CFF860 /* ParsimmonSeed.m in Sources */ = {isa = PBXBuildFile; fileRef = B67005C3180A0A1D00CFF860 /* ParsimmonSeed.m */; }; B6A43FD318837077000F61BA /* ParsimmonDecisionTree.m in Sources */ = {isa = PBXBuildFile; fileRef = B6A43FD218837077000F61BA /* ParsimmonDecisionTree.m */; }; B6A43FD618837CF6000F61BA /* ParsimmonNode.m in Sources */ = {isa = PBXBuildFile; fileRef = B6A43FD518837CF6000F61BA /* ParsimmonNode.m */; }; B6B05E36180B633F00D7F34F /* ClassifierViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = B6B05E35180B633F00D7F34F /* ClassifierViewController.m */; }; @@ -71,10 +64,6 @@ 459B01481A9534B0000859A1 /* NaiveBayesClassifier.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = NaiveBayesClassifier.swift; sourceTree = ""; }; 459B014A1A954B98000859A1 /* NaiveBayesClassifierTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = NaiveBayesClassifierTests.swift; sourceTree = ""; }; 459B014C1A955E3D000859A1 /* Functions.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Functions.swift; sourceTree = ""; }; - 459BE4021A97C99D0008714F /* ParsimmonLemmatizerTests.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = ParsimmonLemmatizerTests.m; sourceTree = ""; }; - 459BE4031A97C99D0008714F /* ParsimmonTaggerTests.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = ParsimmonTaggerTests.m; sourceTree = ""; }; - 459BE4041A97C99D0008714F /* ParsimmonTokenizerTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ParsimmonTokenizerTests.swift; sourceTree = ""; }; - B6139F6919442FB700FC6CAA /* ParsimmonTokenizer.swift */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.swift; path = ParsimmonTokenizer.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; B6139F6B1944318F00FC6CAA /* ParsimmonTests-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "ParsimmonTests-Bridging-Header.h"; sourceTree = ""; }; B6139F6E194433BA00FC6CAA /* TokenizerTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.swift; path = TokenizerTests.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; B61BB186194429CF003E7B7B /* Parsimmon-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "Parsimmon-Bridging-Header.h"; sourceTree = ""; }; @@ -99,15 +88,6 @@ B67005981807D79500CFF860 /* ParsimmonTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = ParsimmonTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; B67005991807D79500CFF860 /* XCTest.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = XCTest.framework; path = Library/Frameworks/XCTest.framework; sourceTree = DEVELOPER_DIR; }; B67005B21808595600CFF860 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS7.0.sdk/System/Library/Frameworks/UIKit.framework; sourceTree = DEVELOPER_DIR; }; - B67005B91809CD5600CFF860 /* ParsimmonTagger.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParsimmonTagger.h; sourceTree = ""; }; - B67005BA1809CD5600CFF860 /* ParsimmonTagger.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = ParsimmonTagger.m; sourceTree = ""; }; - B67005BC1809CE2F00CFF860 /* ParsimmonTaggedToken.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParsimmonTaggedToken.h; sourceTree = ""; }; - B67005BD1809CE2F00CFF860 /* ParsimmonTaggedToken.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = ParsimmonTaggedToken.m; sourceTree = ""; }; - B67005BF180A05FC00CFF860 /* ParsimmonLemmatizer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParsimmonLemmatizer.h; sourceTree = ""; }; - B67005C0180A05FC00CFF860 /* ParsimmonLemmatizer.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = ParsimmonLemmatizer.m; sourceTree = ""; }; - B67005C2180A0A1D00CFF860 /* ParsimmonSeed.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParsimmonSeed.h; sourceTree = ""; }; - B67005C3180A0A1D00CFF860 /* ParsimmonSeed.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = ParsimmonSeed.m; sourceTree = ""; }; - B6700608180A2A5A00CFF860 /* Parsimmon.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Parsimmon.h; sourceTree = ""; }; B6A43FD118837077000F61BA /* ParsimmonDecisionTree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParsimmonDecisionTree.h; sourceTree = ""; }; B6A43FD218837077000F61BA /* ParsimmonDecisionTree.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = ParsimmonDecisionTree.m; sourceTree = ""; }; B6A43FD418837CF6000F61BA /* ParsimmonNode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParsimmonNode.h; sourceTree = ""; }; @@ -146,9 +126,6 @@ 459B014A1A954B98000859A1 /* NaiveBayesClassifierTests.swift */, B63E18BE18E618160006BD3E /* ParsimmonTests-Info.plist */, B63E18C018E618160006BD3E /* ParsimmonDecisionTreeTests.m */, - 459BE4021A97C99D0008714F /* ParsimmonLemmatizerTests.m */, - 459BE4031A97C99D0008714F /* ParsimmonTaggerTests.m */, - 459BE4041A97C99D0008714F /* ParsimmonTokenizerTests.swift */, B63E18C118E618160006BD3E /* LemmatizerTests.m */, B63E18C218E618160006BD3E /* TaggerTests.m */, B6139F6B1944318F00FC6CAA /* ParsimmonTests-Bridging-Header.h */, @@ -221,13 +198,7 @@ children = ( 452C8BAE1A96C27F003D7441 /* Analyzer.swift */, 459B014C1A955E3D000859A1 /* Functions.swift */, - B6700608180A2A5A00CFF860 /* Parsimmon.h */, B61BB186194429CF003E7B7B /* Parsimmon-Bridging-Header.h */, - B6139F6919442FB700FC6CAA /* ParsimmonTokenizer.swift */, - B67005B91809CD5600CFF860 /* ParsimmonTagger.h */, - B67005BA1809CD5600CFF860 /* ParsimmonTagger.m */, - B67005BF180A05FC00CFF860 /* ParsimmonLemmatizer.h */, - B67005C0180A05FC00CFF860 /* ParsimmonLemmatizer.m */, 452C8BB31A96E58F003D7441 /* Lemmatizer.swift */, 452C8BA61A96B557003D7441 /* Tokenizer.swift */, 452C8BB11A96E589003D7441 /* Tagger.swift */, @@ -240,10 +211,6 @@ B67005C6180A19BD00CFF860 /* Seedlings */ = { isa = PBXGroup; children = ( - B67005BC1809CE2F00CFF860 /* ParsimmonTaggedToken.h */, - B67005BD1809CE2F00CFF860 /* ParsimmonTaggedToken.m */, - B67005C2180A0A1D00CFF860 /* ParsimmonSeed.h */, - B67005C3180A0A1D00CFF860 /* ParsimmonSeed.m */, 452C8BA41A96B19D003D7441 /* Seed.swift */, 452C8BB51A96E5DC003D7441 /* TaggedToken.swift */, ); @@ -366,7 +333,6 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( - B67005C4180A0A1D00CFF860 /* ParsimmonSeed.m in Sources */, 452C8BB61A96E5DC003D7441 /* TaggedToken.swift in Sources */, 452C8BA51A96B19D003D7441 /* Seed.swift in Sources */, B6A43FD618837CF6000F61BA /* ParsimmonNode.m in Sources */, @@ -377,12 +343,8 @@ B670058B1807D79500CFF860 /* AppDelegate.m in Sources */, 459B014D1A955E3D000859A1 /* Functions.swift in Sources */, B67005871807D79500CFF860 /* main.m in Sources */, - B6139F6A19442FB700FC6CAA /* ParsimmonTokenizer.swift in Sources */, - B67005C1180A05FC00CFF860 /* ParsimmonLemmatizer.m in Sources */, - B67005BE1809CE2F00CFF860 /* ParsimmonTaggedToken.m in Sources */, 452C8BA71A96B557003D7441 /* Tokenizer.swift in Sources */, B6B05E36180B633F00D7F34F /* ClassifierViewController.m in Sources */, - B67005BB1809CD5600CFF860 /* ParsimmonTagger.m in Sources */, 459B01491A9534B0000859A1 /* NaiveBayesClassifier.swift in Sources */, 452C8BAF1A96C27F003D7441 /* Analyzer.swift in Sources */, ); @@ -393,18 +355,16 @@ buildActionMask = 2147483647; files = ( 459B014B1A954B98000859A1 /* NaiveBayesClassifierTests.swift in Sources */, - 459BE4071A97C99D0008714F /* ParsimmonTokenizerTests.swift in Sources */, - 45166BDE1A94265800D0E013 /* ParsimmonTokenizer.swift in Sources */, B6139F701944D59F00FC6CAA /* TokenizerTests.swift in Sources */, 452C8BB81A97B944003D7441 /* Lemmatizer.swift in Sources */, - 459BE4051A97C99D0008714F /* ParsimmonLemmatizerTests.m in Sources */, 452C8BB91A97B94E003D7441 /* TaggedToken.swift in Sources */, + 452631161A97CCDC00F51473 /* TaggerTests.m in Sources */, 45B4EC5B1A97C65900B7B038 /* Tagger.swift in Sources */, B63E18CB18E6196A0006BD3E /* ParsimmonDecisionTreeTests.m in Sources */, 452C8BAB1A96BCBB003D7441 /* Seed.swift in Sources */, + 452631151A97CCD900F51473 /* LemmatizerTests.m in Sources */, 452C8BAA1A96BC81003D7441 /* Tokenizer.swift in Sources */, 452C8BB01A96E501003D7441 /* Analyzer.swift in Sources */, - 459BE4061A97C99D0008714F /* ParsimmonTaggerTests.m in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/Parsimmon/Parsimmon/NaiveBayesClassifier.swift b/Parsimmon/Parsimmon/NaiveBayesClassifier.swift index 45e403d..49fcecc 100644 --- a/Parsimmon/Parsimmon/NaiveBayesClassifier.swift +++ b/Parsimmon/Parsimmon/NaiveBayesClassifier.swift @@ -45,19 +45,19 @@ public class NaiveBayesClassifier: NSObject { public typealias Word = String public typealias Category = String - private let tokenizer: ParsimmonTokenizer + private let tokenizer: Tokenizer private var categoryOccurrences: [Category: Int] = [:] private var wordOccurrences: [Word: [Category: Int]] = [:] private var trainingCount = 0 private var wordCount = 0 - public init(tokenizer: ParsimmonTokenizer) { + public init(tokenizer: Tokenizer) { self.tokenizer = tokenizer } public convenience override init() { - self.init(tokenizer: ParsimmonTokenizer()) + self.init(tokenizer: Tokenizer()) } // MARK: - Training diff --git a/Parsimmon/Parsimmon/Parsimmon-Bridging-Header.h b/Parsimmon/Parsimmon/Parsimmon-Bridging-Header.h index 3e3313d..e69de29 100644 --- a/Parsimmon/Parsimmon/Parsimmon-Bridging-Header.h +++ b/Parsimmon/Parsimmon/Parsimmon-Bridging-Header.h @@ -1 +0,0 @@ -#import "ParsimmonSeed.h" diff --git a/Parsimmon/Parsimmon/ParsimmonLemmatizer.h b/Parsimmon/Parsimmon/ParsimmonLemmatizer.h deleted file mode 100644 index e949488..0000000 --- a/Parsimmon/Parsimmon/ParsimmonLemmatizer.h +++ /dev/null @@ -1,43 +0,0 @@ -// ParsimmonLemmatizer.h -// -// Copyright (c) 2013 Ayaka Nonaka -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#import -#import "ParsimmonSeed.h" - -@interface ParsimmonLemmatizer : ParsimmonSeed - -/** - Returns the lemmatized tokens for the input text, omitting any whitespace, punctuation, and other symbols. - @param text Text to lemmatized - @return The lemmatized tokens - */ -- (NSArray *)lemmatizeWordsInText:(NSString *)text; - -/** - Returns the lemmatized tokens for the input text using the specified linguistic tagger options. - @param text Text to lemmatized - @param options Linguistic tagger options - @return The lemmatized tokens - */ -- (NSArray *)lemmatizeText:(NSString *)text options:(NSLinguisticTaggerOptions)options; - -@end diff --git a/Parsimmon/Parsimmon/ParsimmonLemmatizer.m b/Parsimmon/Parsimmon/ParsimmonLemmatizer.m deleted file mode 100644 index d3e5a7e..0000000 --- a/Parsimmon/Parsimmon/ParsimmonLemmatizer.m +++ /dev/null @@ -1,49 +0,0 @@ -// ParsimmonLemmatizer.m -// -// Copyright (c) 2013 Ayaka Nonaka -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#import "ParsimmonLemmatizer.h" - -@implementation ParsimmonLemmatizer - -- (NSArray *)lemmatizeWordsInText:(NSString *)text -{ - return [self lemmatizeText:text options:self.defaultLinguisticTaggerOptions]; -} - -- (NSArray *)lemmatizeText:(NSString *)text options:(NSLinguisticTaggerOptions)options -{ - NSMutableArray *tags = [NSMutableArray array]; - NSLinguisticTagger *tagger = [self linguisticTaggerWithOptions:options]; - tagger.string = text; - [tagger enumerateTagsInRange:NSMakeRange(0, [text length]) - scheme:NSLinguisticTagSchemeLemma - options:options - usingBlock:^(NSString *tag, NSRange tokenRange, NSRange sentenceRange, BOOL *stop) { - if (tag) { - [tags addObject:tag]; - } - } - ]; - return tags; -} - -@end diff --git a/Parsimmon/Parsimmon/ParsimmonSeed.h b/Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.h similarity index 53% rename from Parsimmon/Parsimmon/ParsimmonSeed.h rename to Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.h index e52c53d..6658fab 100644 --- a/Parsimmon/Parsimmon/ParsimmonSeed.h +++ b/Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.h @@ -1,4 +1,4 @@ -// ParsimmonSeed.h +// ParsimmonNaiveBayesClassifier.h // // Copyright (c) 2013 Ayaka Nonaka // @@ -22,34 +22,42 @@ #import -@interface ParsimmonSeed : NSObject - -@property (assign, nonatomic) NSLinguisticTaggerOptions linguisticTaggerOptions; +@class Tokenizer; +@interface ParsimmonNaiveBayesClassifier : NSObject /** - Creates a parsimmon seed instance for the English language. - @return The initialized seed + Creates a parsimmon naive bayes classifier instance that uses the default tokenizer. + @return The initialized classifier */ - (instancetype)init; /** - Creates a parsimmon seed instance for the specified language. - @param language The language to use - @return The initialized seed + Trains the classifier with text and its category. + @param text The text + @param category The category of the text */ -- (instancetype)initWithLanguage:(NSString *)language; +- (void)trainWithText:(NSString *)text category:(NSString *)category; /** - Returns a linguistic tagger in its language for specified options. - @param options Linguistic tagger options - @return The linguistic tagger + Trains the classifier with tokenized text and its category. + This is useful if you wish to use your own tokenization method. + @param tokens The tokenized text + @param category The category of the text */ -- (NSLinguisticTagger *)linguisticTaggerWithOptions:(NSLinguisticTaggerOptions)options; +- (void)trainWithTokens:(NSArray *)tokens category:(NSString *)category; /** - Returns the default linguistic tagger options for subclasses to use. - @return The default linguistic tagger options + Classifies the given text based on its training data. + @param text The text to classify + @return The category classification */ -- (NSLinguisticTaggerOptions)defaultLinguisticTaggerOptions; +- (NSString *)classify:(NSString *)text; + +/** + Classifies the given tokenized text based on its training data. + @param text The tokenized text to classify + @return The category classification +*/ +- (NSString *)classifyTokens:(NSArray *)tokens; @end diff --git a/Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.m b/Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.m new file mode 100644 index 0000000..29479e8 --- /dev/null +++ b/Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.m @@ -0,0 +1,218 @@ +// ParsimmonNaiveBayesClassifier.m +// +// Copyright (c) 2013 Ayaka Nonaka +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#import "ParsimmonNaiveBayesClassifier.h" +#import "Parsimmon-Swift.h" + +#define kParsimmonSmoothingParameter 1 + +@interface ParsimmonNaiveBayesClassifier () +@property (strong, nonatomic) Tokenizer *tokenizer; +@property (strong, nonatomic) NSMutableDictionary *wordOccurrences; +@property (strong, nonatomic) NSMutableDictionary *categoryOccurrences; +@property (assign, nonatomic) NSUInteger trainingCount; +@property (assign, nonatomic) NSUInteger wordCount; +@end + +@implementation ParsimmonNaiveBayesClassifier + +- (instancetype)init +{ + Tokenizer *tokenizer = [[Tokenizer alloc] init]; + return [self initWithTokenizer:tokenizer]; +} + +- (instancetype)initWithTokenizer:(Tokenizer *)tokenizer +{ + self = [super init]; + if (self) { + self.tokenizer = tokenizer; + } + return self; +} + + +#pragma mark - Training + +- (void)trainWithText:(NSString *)text category:(NSString *)category +{ + [self trainWithTokens:[self.tokenizer tokenize:text] category:category]; +} + +- (void)trainWithTokens:(NSArray *)tokens category:(NSString *)category +{ + NSArray *words = [self removeDuplicates:tokens]; + for (NSString *word in words) { + [self incrementWord:word category:category]; + } + [self incrementCategory:category]; + self.trainingCount += 1; +} + + +#pragma mark - Classifying + +- (NSString *)classify:(NSString *)text +{ + return [self classifyTokens:[self.tokenizer tokenize:text]]; +} + +- (NSString *)classifyTokens:(NSArray *)tokens +{ + // Compute argmax_cat [log(P(C=cat)) + sum_token(log(P(W=token|C=cat)))] + CGFloat maxScore = -CGFLOAT_MAX; + NSString *bestCategory; + for (NSString *category in [self.categoryOccurrences allKeys]) { + CGFloat currentCategoryScore = 0; + CGFloat pCategory = [self pCategory:category]; // P(C=cat) + currentCategoryScore += log(pCategory); // log(P(C=cat)) + for (NSString *token in tokens) { // sum_token + // P(W=token|C=cat) = P(C=cat|W=token) * P(W=token) / P(C=token) [Bayes Theorem] + CGFloat numerator = [self pCategory:category givenWord:token] * [self pWord:token]; + // Do some smoothing + CGFloat pWordGivenCategory = (numerator + kParsimmonSmoothingParameter) / + (pCategory + kParsimmonSmoothingParameter * self.wordCount); + currentCategoryScore += log(pWordGivenCategory); // log(P(W=token|C=cat)) + } + // Update the argmax if necessary + if (currentCategoryScore > maxScore) { + maxScore = currentCategoryScore; + bestCategory = category; + } + } + return bestCategory; +} + + +#pragma mark - Probabilities + +/** + Returns P(C=category|W=word). + @param category The category + @param word The word + @return P(C=category|W=word) + */ +- (CGFloat)pCategory:(NSString *)category givenWord:(NSString *)word +{ + if (!self.wordOccurrences[word]) { + return 0; + } + if (!self.wordOccurrences[word][category]) { + return 0; + } + return ([self.wordOccurrences[word][category] floatValue]) / [self totalOccurrencesOfWord:word]; +} + +/** + Returns P(W=word). + @param word The word + @return P(W=word) + */ +- (CGFloat)pWord:(NSString *)word +{ + return [self totalOccurrencesOfWord:word] / self.wordCount; +} + +/** + Return P(C=category). + @param category The category. + @return P(C=category) + */ +- (CGFloat)pCategory:(NSString *)category +{ + return [self totalOccurrencesOfCategory:category] / self.trainingCount; +} + + +#pragma mark - Counting + +- (void)incrementWord:(NSString *)word category:(NSString *)category +{ + if (!self.wordOccurrences[word]) { + self.wordOccurrences[word] = [NSMutableDictionary new]; + self.wordCount += 1; + } + if (!self.wordOccurrences[word][category]) { + self.wordOccurrences[word][category] = @0; + } + NSUInteger wordCategoryCount = [self.wordOccurrences[word][category] integerValue]; + self.wordOccurrences[word][category] = @(wordCategoryCount + 1); +} + +- (void)incrementCategory:(NSString *)category +{ + if (!self.categoryOccurrences[category]) { + self.categoryOccurrences[category] = @0; + } + NSUInteger categoryCount = [self.categoryOccurrences[category] integerValue]; + self.categoryOccurrences[category] = @(categoryCount + 1); +} + +- (CGFloat)totalOccurrencesOfWord:(NSString *)word +{ + if (!self.wordOccurrences[word]) { + return 0; + } + CGFloat totalOccurrencesOfWord = 0; + for (NSString *category in self.wordOccurrences[word]) { + totalOccurrencesOfWord += [self.wordOccurrences[word][category] floatValue]; + } + return totalOccurrencesOfWord; +} + +- (CGFloat)totalOccurrencesOfCategory:(NSString *)category +{ + if (!self.categoryOccurrences[category]) { + return 0; + } + return [self.categoryOccurrences[category] floatValue]; +} + + +#pragma mark - Helpers + +- (NSArray *)removeDuplicates:(NSArray *)array +{ + NSSet *set = [NSSet setWithArray:array]; + return [set allObjects]; +} + + +#pragma mark - Properties + +- (NSMutableDictionary *)wordOccurrences +{ + if (!_wordOccurrences) { + _wordOccurrences = [NSMutableDictionary new]; + } + return _wordOccurrences; +} + +- (NSMutableDictionary *)categoryOccurrences +{ + if (!_categoryOccurrences) { + _categoryOccurrences = [NSMutableDictionary new]; + } + return _categoryOccurrences; +} + +@end diff --git a/Parsimmon/Parsimmon/ParsimmonSeed.m b/Parsimmon/Parsimmon/ParsimmonSeed.m deleted file mode 100644 index 97f55b3..0000000 --- a/Parsimmon/Parsimmon/ParsimmonSeed.m +++ /dev/null @@ -1,59 +0,0 @@ -// ParsimmonSeed.m -// -// Copyright (c) 2013 Ayaka Nonaka -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#import "ParsimmonSeed.h" - -@interface ParsimmonSeed () -@property (copy, nonatomic) NSString *language; -@end - -@implementation ParsimmonSeed - -- (instancetype)init -{ - return [self initWithLanguage:@"en"]; -} - -- (instancetype)initWithLanguage:(NSString *)language -{ - self = [super init]; - if (self) { - self.language = language; - } - return self; -} - -- (NSLinguisticTagger *)linguisticTaggerWithOptions:(NSLinguisticTaggerOptions)options -{ - return [[NSLinguisticTagger alloc] initWithTagSchemes:[NSLinguisticTagger availableTagSchemesForLanguage:self.language] - options:options]; -} - -- (NSLinguisticTaggerOptions)defaultLinguisticTaggerOptions -{ - if (!_linguisticTaggerOptions) { - _linguisticTaggerOptions = NSLinguisticTaggerOmitWhitespace | NSLinguisticTaggerOmitPunctuation | NSLinguisticTaggerOmitOther; - } - return _linguisticTaggerOptions; -} - -@end diff --git a/Parsimmon/Parsimmon/ParsimmonTaggedToken.h b/Parsimmon/Parsimmon/ParsimmonTaggedToken.h deleted file mode 100644 index bb59eda..0000000 --- a/Parsimmon/Parsimmon/ParsimmonTaggedToken.h +++ /dev/null @@ -1,41 +0,0 @@ -// ParsimmonToken.h -// -// Copyright (c) 2013 Ayaka Nonaka -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#import - -@interface ParsimmonTaggedToken : NSObject - -@property (copy, nonatomic, readonly) NSString *token; -@property (copy, nonatomic, readonly) NSString *tag; - -/** - Creates a parsimmon tagged token instance. - @param token The token - @param tag The tag - @return The initialized tagged token - */ -- (instancetype)initWithToken:(NSString *)token tag:(NSString *)tag; -- (id) init __unavailable; - -- (BOOL)isEqualToTaggedToken:(ParsimmonTaggedToken *)taggedToken; - -@end diff --git a/Parsimmon/Parsimmon/ParsimmonTaggedToken.m b/Parsimmon/Parsimmon/ParsimmonTaggedToken.m deleted file mode 100644 index e079172..0000000 --- a/Parsimmon/Parsimmon/ParsimmonTaggedToken.m +++ /dev/null @@ -1,73 +0,0 @@ -// ParsimmonToken.m -// -// Copyright (c) 2013 Ayaka Nonaka -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#import "ParsimmonTaggedToken.h" - -@interface ParsimmonTaggedToken () -@property (copy, nonatomic, readwrite) NSString *token; -@property (copy, nonatomic, readwrite) NSString *tag; -@end - -@implementation ParsimmonTaggedToken - -- (instancetype)initWithToken:(NSString *)token tag:(NSString *)tag -{ - self = [super init]; - if (self) { - self.token = token; - self.tag = tag; - } - return self; -} - - -#pragma mark - NSObject - -- (NSString *)description -{ - return [NSString stringWithFormat:@"('%@', %@)", self.token, self.tag]; -} - -- (NSUInteger)hash -{ - NSUInteger hash = self.token.hash ^ self.tag.hash; - - return hash; -} - -- (BOOL)isEqual:(id)object -{ - BOOL isEqual = NO; - - if ([object isKindOfClass:[self class]]) { - isEqual = [self isEqualToTaggedToken:object]; - } - return isEqual; -} - -- (BOOL)isEqualToTaggedToken:(ParsimmonTaggedToken *)taggedToken -{ - return ([self.token isEqualToString:taggedToken.token] && - [self.tag isEqualToString:taggedToken.tag]); -} - -@end diff --git a/Parsimmon/Parsimmon/ParsimmonTagger.h b/Parsimmon/Parsimmon/ParsimmonTagger.h deleted file mode 100644 index 9d2be8e..0000000 --- a/Parsimmon/Parsimmon/ParsimmonTagger.h +++ /dev/null @@ -1,43 +0,0 @@ -// ParsimmonTagger.h -// -// Copyright (c) 2013 Ayaka Nonaka -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#import -#import "ParsimmonSeed.h" - -@interface ParsimmonTagger : ParsimmonSeed - -/** - Returns the tagged tokens for the input text, omitting any whitespace, punctuation, and other symbols. - @param text The text to tag - @return The tagged tokens - */ -- (NSArray *)tagWordsInText:(NSString *)text; - -/** - Returns the tagged tokens for the input text using the specified linguistic tagger options. - @param text Text to tag - @param options Linguistic tagger options - @return The tagged tokens - */ -- (NSArray *)tagText:(NSString *)text options:(NSLinguisticTaggerOptions)options; - -@end diff --git a/Parsimmon/Parsimmon/ParsimmonTagger.m b/Parsimmon/Parsimmon/ParsimmonTagger.m deleted file mode 100644 index fda00b7..0000000 --- a/Parsimmon/Parsimmon/ParsimmonTagger.m +++ /dev/null @@ -1,50 +0,0 @@ -// ParsimmonTagger.m -// -// Copyright (c) 2013 Ayaka Nonaka -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#import "ParsimmonTagger.h" -#import "ParsimmonTaggedToken.h" - -@implementation ParsimmonTagger - -- (NSArray *)tagWordsInText:(NSString *)text -{ - return [self tagText:text options:self.defaultLinguisticTaggerOptions]; -} - -- (NSArray *)tagText:(NSString *)text options:(NSLinguisticTaggerOptions)options -{ - NSMutableArray *taggedTokens = [NSMutableArray array]; - NSLinguisticTagger *tagger = [self linguisticTaggerWithOptions:options]; - tagger.string = text; - [tagger enumerateTagsInRange:NSMakeRange(0, [text length]) - scheme:NSLinguisticTagSchemeNameTypeOrLexicalClass - options:options - usingBlock:^(NSString *tag, NSRange tokenRange, NSRange sentenceRange, BOOL *stop) { - NSString *token = [text substringWithRange:tokenRange]; - ParsimmonTaggedToken *taggedToken = [[ParsimmonTaggedToken alloc] initWithToken:token tag:tag]; - [taggedTokens addObject:taggedToken]; - } - ]; - return taggedTokens; -} - -@end diff --git a/Parsimmon/ParsimmonTests/ParsimmonLemmatizerTests.m b/Parsimmon/ParsimmonTests/ParsimmonLemmatizerTests.m deleted file mode 100644 index 3493a8f..0000000 --- a/Parsimmon/ParsimmonTests/ParsimmonLemmatizerTests.m +++ /dev/null @@ -1,55 +0,0 @@ -// -// ParsimmonLemmatizerTests.m -// Parsimmon -// -// Created by Hector Zarate on 10/24/13. -// -// - -#import -#import "ParsimmonLemmatizer.h" - - -@interface ParsimmonLemmatizerTests : XCTestCase - -@end - -@implementation ParsimmonLemmatizerTests - -- (void)setUp -{ - [super setUp]; - // Put setup code here; it will be run once, before the first test case. -} - -- (void)tearDown -{ - // Put teardown code here; it will be run once, after the last test case. - [super tearDown]; -} - -- (void)testLemmatizeWordsInText -{ - NSString *testString = @"Diane, I'm holding in my hand a small box of chocolate bunnies."; - - NSArray *expectedOutput = @[@"diane", - @"i", - @"hold", - @"in", - @"my", - @"hand", - @"a", - @"small", - @"box", - @"of", - @"chocolate", - @"bunny"]; - - ParsimmonLemmatizer *lemmatizer = [[ParsimmonLemmatizer alloc] init]; - - NSArray *lemmatizedTokenStrings = [lemmatizer lemmatizeWordsInText:testString]; - - XCTAssertEqualObjects(expectedOutput, lemmatizedTokenStrings, @"Failed to lematize words in text"); -} - -@end diff --git a/Parsimmon/ParsimmonTests/ParsimmonTaggerTests.m b/Parsimmon/ParsimmonTests/ParsimmonTaggerTests.m deleted file mode 100644 index a5a3aea..0000000 --- a/Parsimmon/ParsimmonTests/ParsimmonTaggerTests.m +++ /dev/null @@ -1,52 +0,0 @@ -// -// ParsimmonTaggedTokenTests.m -// Parsimmon -// -// Created by Hector Zarate on 10/18/13. -// -// - -#import -#import "ParsimmonTaggedToken.h" -#import "ParsimmonTagger.h" - -@interface ParsimmonTaggerTests : XCTestCase - -@end - -@implementation ParsimmonTaggerTests - -- (void)setUp -{ - [super setUp]; - // Put setup code here; it will be run once, before the first test case. -} - -- (void)tearDown -{ - // Put teardown code here; it will be run once, after the last test case. - [super tearDown]; -} - -- (void)testTagWordsInText -{ - NSArray *expectedTaggedTokens = @[[[ParsimmonTaggedToken alloc] initWithToken:@"The" tag:@"Determiner"], - [[ParsimmonTaggedToken alloc] initWithToken:@"quick" tag:@"Adjective"], - [[ParsimmonTaggedToken alloc] initWithToken:@"brown" tag:@"Adjective"], - [[ParsimmonTaggedToken alloc] initWithToken:@"fox" tag:@"Noun"], - [[ParsimmonTaggedToken alloc] initWithToken:@"jumps" tag:@"Noun"], - [[ParsimmonTaggedToken alloc] initWithToken:@"over" tag:@"Preposition"], - [[ParsimmonTaggedToken alloc] initWithToken:@"the" tag:@"Determiner"], - [[ParsimmonTaggedToken alloc] initWithToken:@"lazy" tag:@"Adjective"], - [[ParsimmonTaggedToken alloc] initWithToken:@"dog" tag:@"Noun"]]; - - NSString *testStringOne = @"The quick brown fox jumps over the lazy dog"; - - ParsimmonTagger *tagger = [[ParsimmonTagger alloc] init]; - - NSArray *taggedTokens = [tagger tagWordsInText:testStringOne]; - - XCTAssertEqualObjects(taggedTokens, expectedTaggedTokens, @"Failed to tagged words in text"); -} - -@end diff --git a/Parsimmon/ParsimmonTests/ParsimmonTokenizerTests.swift b/Parsimmon/ParsimmonTests/ParsimmonTokenizerTests.swift deleted file mode 100644 index b5d8a66..0000000 --- a/Parsimmon/ParsimmonTests/ParsimmonTokenizerTests.swift +++ /dev/null @@ -1,53 +0,0 @@ -// ParsimmonTokenizerTests.swift -// -// Copyright (c) 2014 Ayaka Nonaka -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -import XCTest -import Parsimmon - -class ParsimmonTokenizerTests : XCTestCase { - - func testTokenizeWords() { - let expectedTokens = ["I", - "the", - "quick", - "brown", - "fox", - "jumped", - "over", - "the", - "lazy", - "dog"] - - let testStringOne = "I, the quick brown fox jumped over the lazy dog..." - - let tokenizer = ParsimmonTokenizer(); - let tokens = tokenizer.tokenize(testStringOne); - - XCTAssertEqual(tokens, expectedTokens, "Failed to tokenize words in text") - } - - func testTokenizeAllWhitespace() { - let tokenizer = ParsimmonTokenizer(); - let tokens = tokenizer.tokenize(" "); - XCTAssertEqual(tokens, [], "Failed to tokenize all whitespace") - } -} From df779615cd9c7e4ed4b75570c96db8dfdb0e8177 Mon Sep 17 00:00:00 2001 From: Jordan Kay Date: Fri, 20 Feb 2015 12:31:40 -0800 Subject: [PATCH 3/4] Fix accidental duplicate boilerplate form copy-paste --- Parsimmon/Parsimmon/TaggedToken.swift | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/Parsimmon/Parsimmon/TaggedToken.swift b/Parsimmon/Parsimmon/TaggedToken.swift index d4237a8..3b6df5d 100644 --- a/Parsimmon/Parsimmon/TaggedToken.swift +++ b/Parsimmon/Parsimmon/TaggedToken.swift @@ -8,16 +8,6 @@ import Foundation -// -// ParsimmonTaggedToken.swift -// Parsimmon -// -// Created by Jordan Kay on 2/17/15. -// -// - -import Foundation - class TaggedToken: NSObject, Equatable { let token: String let tag: String From 475187e336e1b9a66dd599b849c0cf7145d6ec8a Mon Sep 17 00:00:00 2001 From: Jordan Kay Date: Fri, 20 Feb 2015 17:16:44 -0800 Subject: [PATCH 4/4] Delete files no longer needed --- .../Parsimmon/ParsimmonNaiveBayesClassifier.h | 63 ----- .../Parsimmon/ParsimmonNaiveBayesClassifier.m | 218 ------------------ Parsimmon/Parsimmon/ParsimmonTokenizer.swift | 44 ---- 3 files changed, 325 deletions(-) delete mode 100644 Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.h delete mode 100644 Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.m delete mode 100644 Parsimmon/Parsimmon/ParsimmonTokenizer.swift diff --git a/Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.h b/Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.h deleted file mode 100644 index 6658fab..0000000 --- a/Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.h +++ /dev/null @@ -1,63 +0,0 @@ -// ParsimmonNaiveBayesClassifier.h -// -// Copyright (c) 2013 Ayaka Nonaka -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#import - -@class Tokenizer; -@interface ParsimmonNaiveBayesClassifier : NSObject - -/** - Creates a parsimmon naive bayes classifier instance that uses the default tokenizer. - @return The initialized classifier - */ -- (instancetype)init; - -/** - Trains the classifier with text and its category. - @param text The text - @param category The category of the text - */ -- (void)trainWithText:(NSString *)text category:(NSString *)category; - -/** - Trains the classifier with tokenized text and its category. - This is useful if you wish to use your own tokenization method. - @param tokens The tokenized text - @param category The category of the text - */ -- (void)trainWithTokens:(NSArray *)tokens category:(NSString *)category; - -/** - Classifies the given text based on its training data. - @param text The text to classify - @return The category classification - */ -- (NSString *)classify:(NSString *)text; - -/** - Classifies the given tokenized text based on its training data. - @param text The tokenized text to classify - @return The category classification -*/ -- (NSString *)classifyTokens:(NSArray *)tokens; - -@end diff --git a/Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.m b/Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.m deleted file mode 100644 index 29479e8..0000000 --- a/Parsimmon/Parsimmon/ParsimmonNaiveBayesClassifier.m +++ /dev/null @@ -1,218 +0,0 @@ -// ParsimmonNaiveBayesClassifier.m -// -// Copyright (c) 2013 Ayaka Nonaka -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#import "ParsimmonNaiveBayesClassifier.h" -#import "Parsimmon-Swift.h" - -#define kParsimmonSmoothingParameter 1 - -@interface ParsimmonNaiveBayesClassifier () -@property (strong, nonatomic) Tokenizer *tokenizer; -@property (strong, nonatomic) NSMutableDictionary *wordOccurrences; -@property (strong, nonatomic) NSMutableDictionary *categoryOccurrences; -@property (assign, nonatomic) NSUInteger trainingCount; -@property (assign, nonatomic) NSUInteger wordCount; -@end - -@implementation ParsimmonNaiveBayesClassifier - -- (instancetype)init -{ - Tokenizer *tokenizer = [[Tokenizer alloc] init]; - return [self initWithTokenizer:tokenizer]; -} - -- (instancetype)initWithTokenizer:(Tokenizer *)tokenizer -{ - self = [super init]; - if (self) { - self.tokenizer = tokenizer; - } - return self; -} - - -#pragma mark - Training - -- (void)trainWithText:(NSString *)text category:(NSString *)category -{ - [self trainWithTokens:[self.tokenizer tokenize:text] category:category]; -} - -- (void)trainWithTokens:(NSArray *)tokens category:(NSString *)category -{ - NSArray *words = [self removeDuplicates:tokens]; - for (NSString *word in words) { - [self incrementWord:word category:category]; - } - [self incrementCategory:category]; - self.trainingCount += 1; -} - - -#pragma mark - Classifying - -- (NSString *)classify:(NSString *)text -{ - return [self classifyTokens:[self.tokenizer tokenize:text]]; -} - -- (NSString *)classifyTokens:(NSArray *)tokens -{ - // Compute argmax_cat [log(P(C=cat)) + sum_token(log(P(W=token|C=cat)))] - CGFloat maxScore = -CGFLOAT_MAX; - NSString *bestCategory; - for (NSString *category in [self.categoryOccurrences allKeys]) { - CGFloat currentCategoryScore = 0; - CGFloat pCategory = [self pCategory:category]; // P(C=cat) - currentCategoryScore += log(pCategory); // log(P(C=cat)) - for (NSString *token in tokens) { // sum_token - // P(W=token|C=cat) = P(C=cat|W=token) * P(W=token) / P(C=token) [Bayes Theorem] - CGFloat numerator = [self pCategory:category givenWord:token] * [self pWord:token]; - // Do some smoothing - CGFloat pWordGivenCategory = (numerator + kParsimmonSmoothingParameter) / - (pCategory + kParsimmonSmoothingParameter * self.wordCount); - currentCategoryScore += log(pWordGivenCategory); // log(P(W=token|C=cat)) - } - // Update the argmax if necessary - if (currentCategoryScore > maxScore) { - maxScore = currentCategoryScore; - bestCategory = category; - } - } - return bestCategory; -} - - -#pragma mark - Probabilities - -/** - Returns P(C=category|W=word). - @param category The category - @param word The word - @return P(C=category|W=word) - */ -- (CGFloat)pCategory:(NSString *)category givenWord:(NSString *)word -{ - if (!self.wordOccurrences[word]) { - return 0; - } - if (!self.wordOccurrences[word][category]) { - return 0; - } - return ([self.wordOccurrences[word][category] floatValue]) / [self totalOccurrencesOfWord:word]; -} - -/** - Returns P(W=word). - @param word The word - @return P(W=word) - */ -- (CGFloat)pWord:(NSString *)word -{ - return [self totalOccurrencesOfWord:word] / self.wordCount; -} - -/** - Return P(C=category). - @param category The category. - @return P(C=category) - */ -- (CGFloat)pCategory:(NSString *)category -{ - return [self totalOccurrencesOfCategory:category] / self.trainingCount; -} - - -#pragma mark - Counting - -- (void)incrementWord:(NSString *)word category:(NSString *)category -{ - if (!self.wordOccurrences[word]) { - self.wordOccurrences[word] = [NSMutableDictionary new]; - self.wordCount += 1; - } - if (!self.wordOccurrences[word][category]) { - self.wordOccurrences[word][category] = @0; - } - NSUInteger wordCategoryCount = [self.wordOccurrences[word][category] integerValue]; - self.wordOccurrences[word][category] = @(wordCategoryCount + 1); -} - -- (void)incrementCategory:(NSString *)category -{ - if (!self.categoryOccurrences[category]) { - self.categoryOccurrences[category] = @0; - } - NSUInteger categoryCount = [self.categoryOccurrences[category] integerValue]; - self.categoryOccurrences[category] = @(categoryCount + 1); -} - -- (CGFloat)totalOccurrencesOfWord:(NSString *)word -{ - if (!self.wordOccurrences[word]) { - return 0; - } - CGFloat totalOccurrencesOfWord = 0; - for (NSString *category in self.wordOccurrences[word]) { - totalOccurrencesOfWord += [self.wordOccurrences[word][category] floatValue]; - } - return totalOccurrencesOfWord; -} - -- (CGFloat)totalOccurrencesOfCategory:(NSString *)category -{ - if (!self.categoryOccurrences[category]) { - return 0; - } - return [self.categoryOccurrences[category] floatValue]; -} - - -#pragma mark - Helpers - -- (NSArray *)removeDuplicates:(NSArray *)array -{ - NSSet *set = [NSSet setWithArray:array]; - return [set allObjects]; -} - - -#pragma mark - Properties - -- (NSMutableDictionary *)wordOccurrences -{ - if (!_wordOccurrences) { - _wordOccurrences = [NSMutableDictionary new]; - } - return _wordOccurrences; -} - -- (NSMutableDictionary *)categoryOccurrences -{ - if (!_categoryOccurrences) { - _categoryOccurrences = [NSMutableDictionary new]; - } - return _categoryOccurrences; -} - -@end diff --git a/Parsimmon/Parsimmon/ParsimmonTokenizer.swift b/Parsimmon/Parsimmon/ParsimmonTokenizer.swift deleted file mode 100644 index 4e1a6dc..0000000 --- a/Parsimmon/Parsimmon/ParsimmonTokenizer.swift +++ /dev/null @@ -1,44 +0,0 @@ -// ParsimmonTokenizer.swift -// -// Copyright (c) 2014 Ayaka Nonaka -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -import Foundation - -public class ParsimmonTokenizer : ParsimmonSeed { - - func tokenize(text: String) -> [String] { - return self.tokenize(text, options:self.defaultLinguisticTaggerOptions()) - } - - func tokenize(text: String, options: NSLinguisticTaggerOptions) -> [String] { - var tokens = [String]() - let tagger = self.linguisticTaggerWithOptions(options) - tagger.string = text - tagger.enumerateTagsInRange(NSRange(location:0, length:count(text)), - scheme:NSLinguisticTagSchemeNameTypeOrLexicalClass, - options:options) { - (tag: String!, tokenRange: NSRange, sentenceRange: NSRange, stop: UnsafeMutablePointer) -> Void in - let token = (text as NSString).substringWithRange(tokenRange) - tokens.append(token) - } - return tokens - } -}