Line data Source code
1 : // License
2 :
3 : // (The MIT License)
4 :
5 : // Copyright (c) by Tolga Tezel tolgatezel11@gmail.com
6 :
7 : // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
8 : // to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute,
9 : // sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
10 : // subject to the following conditions:
11 :
12 : // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
13 : // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
14 : // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
15 : // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
16 : // IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17 :
18 : /**
19 : * INFO: a port of ttezel's naive Bayes implementation
20 : */
21 :
/*
  This module exposes the state-key list, the default tokenizer and the
  NaiveBayes classifier class.
*/
25 :
/**
 * Names of the `NaiveBayes` instance properties that together make up a
 * classifier's serializable state.
 *
 * Exported as a `const` binding: nothing in this module reassigns it, and a
 * mutable exported binding would let the list be swapped out from under
 * anything that relies on it.
 */
export const STATE_KEYS = [
  "categories",
  "docCount",
  "totalDocuments",
  "vocabulary",
  "vocabularySize",
  "wordCount",
  "wordFrequencyCount",
  "options",
];
37 :
38 : /**
39 : * Initializes a NaiveBayes instance from a JSON state representation.
40 : * Use this with classifier.toJson().
41 : *
42 : * @param {String} jsonStr state representation obtained by classifier.toJson()
43 : * @return {NaiveBayes} Classifier
44 : */
45 : // export function fromJson(jsonStr:string) {
46 : // var parsed : any;
47 : // try {
48 : // parsed = JSON.parse(jsonStr)
49 : // } catch (e) {
50 : // throw new Error('Naivebayes.fromJson expects a valid JSON string.')
51 : // }
52 : // // init a new classifier
53 : // var classifier = new NaiveBayes(parsed.options)
54 :
55 : // // override the classifier's state
56 : // STATE_KEYS.forEach(function (k) {
57 : // if (typeof parsed[k] === 'undefined' || parsed[k] === null) {
58 : // throw new Error('Naivebayes.fromJson: JSON string is missing an expected property: `'+k+'`.')
59 : // }
60 : // classifier[k] = parsed[k]
61 : // })
62 :
63 : // return classifier
64 : // }
65 :
/**
 * Split an input string into an array of word tokens.
 *
 * Every character that is not an ASCII letter, an ASCII digit, an underscore,
 * a Cyrillic letter (`А`–`я`, plus `Ё`/`ё`) or whitespace is replaced by a
 * space; the result is then split on runs of whitespace. Empty tokens (from
 * empty input or leading/trailing whitespace) are dropped so they are never
 * counted as words.
 *
 * This is the tokenizer `NaiveBayes` falls back to when no `tokenizer` is
 * supplied in its options.
 *
 * @param text Text to tokenize.
 * @returns The word tokens in order of appearance; `[]` when there are none.
 */
export function defaultTokenizer(text: string): string[] {
  // Inside a character class `(`, `)` and `+` are literal characters, not
  // grouping/quantifier syntax, so they must NOT be listed here - otherwise
  // they would be kept as "word" characters instead of being stripped.
  // The Cyrillic ranges are spelled as escapes so they cannot be confused
  // with look-alike Latin glyphs.
  const nonWordChars =
    /[^a-zA-Z0-9_\u0410-\u042F\u0430-\u044F\u0401\u0451\s]/g;

  const sanitized = text.replace(nonWordChars, " ");

  // `"".split(/\s+/)` yields `[""]` and leading/trailing whitespace yields
  // empty edge tokens; filter them out.
  return sanitized.split(/\s+/).filter((token) => token.length > 0);
}
81 :
/**
 * Naive-Bayes text classifier that uses Laplace (add-one) smoothing.
 *
 * Train it with `learn(text, category)` and query it with `categorize(text)`.
 * Both are async because the tokenizer's result is awaited, so a custom
 * tokenizer may return a promise.
 *
 * All lookup tables are null-prototype objects (`Object.create(null)`):
 * tokens and category names are arbitrary strings, and on a plain `{}` a key
 * such as `constructor` or `toString` would resolve to an inherited
 * `Object.prototype` member and be mistaken for an existing entry. The tables
 * remain plain data and still work with `Object.keys` / `JSON.stringify`.
 */
export class NaiveBayes {
  /** Options object given to the constructor (`{}` when none was passed). */
  options: any;
  /** Tokenizer in use: `options.tokenizer` when truthy, else `defaultTokenizer`. */
  tokenizer: Function;
  /** Set of every token seen during training (token => `true`). */
  vocabulary: any;
  /** Number of distinct tokens in `vocabulary`. */
  vocabularySize: number;
  /** Number of documents learned so far, across all categories. */
  totalDocuments: number;
  /** Per category: how many documents were mapped to it. */
  docCount: any;
  /** Per category: how many words in total were mapped to it. */
  wordCount: any;
  /** Per category: table of token => number of occurrences in that category. */
  wordFrequencyCount: any;
  /** Set of known category names (name => `true`). */
  categories: any;

  /**
   * Create an untrained classifier.
   *
   * @param options Optional options object. The only key read here is
   *   `tokenizer`, a custom tokenization function.
   * @throws {TypeError} If `options` is passed but is not a non-array object.
   */
  constructor(options?: any) {
    this.options = {};
    if (typeof options !== "undefined") {
      if (!options || typeof options !== "object" || Array.isArray(options)) {
        throw new TypeError(
          "NaiveBayes got invalid `options`: `" + options +
            "`. Pass in an object.",
        );
      }
      this.options = options;
    }

    this.tokenizer = this.options.tokenizer || defaultTokenizer;

    // Vocabulary and its size.
    this.vocabulary = Object.create(null);
    this.vocabularySize = 0;

    // Number of documents we have learned from.
    this.totalDocuments = 0;

    // Document frequency table: category => number of documents.
    this.docCount = Object.create(null);

    // category => total number of words mapped to it.
    this.wordCount = Object.create(null);

    // category => (token => occurrences of that token in the category).
    this.wordFrequencyCount = Object.create(null);

    // Known category names.
    this.categories = Object.create(null);
  }

  /**
   * Create the bookkeeping entries for a category the first time it is seen.
   * Calling it again for a known category changes nothing.
   *
   * @param categoryName Name of the category.
   * @returns This classifier, for chaining.
   */
  initializeCategory(categoryName: string): this {
    if (!this.categories[categoryName]) {
      this.docCount[categoryName] = 0;
      this.wordCount[categoryName] = 0;
      this.wordFrequencyCount[categoryName] = Object.create(null);
      this.categories[categoryName] = true;
    }
    return this;
  }

  /**
   * Train the classifier by telling it which `category` the `text` belongs to.
   *
   * @param text Document text.
   * @param category Category (label) the document is mapped to.
   * @returns A promise resolving to this classifier once the counts are updated.
   */
  async learn(text: string, category: any): Promise<this> {
    // Tokenize and count BEFORE touching any state: if the tokenizer throws
    // or rejects, the document/category counters must not have been bumped
    // for a document whose words were never recorded.
    const tokens = await this.tokenizer(text);
    const frequencyTable = this.frequencyTable(tokens);

    // Initialize category data structures if we've never seen this category.
    this.initializeCategory(category);

    // One more document for this category, and one more overall.
    this.docCount[category]++;
    this.totalDocuments++;

    const categoryFrequencies = this.wordFrequencyCount[category];

    for (const [token, frequencyInText] of Object.entries(frequencyTable)) {
      // Add this word to our vocabulary if it is new.
      if (!this.vocabulary[token]) {
        this.vocabulary[token] = true;
        this.vocabularySize++;
      }

      // Update the frequency of this word within the category...
      categoryFrequencies[token] =
        (categoryFrequencies[token] || 0) + frequencyInText;

      // ...and the count of all words seen for the category.
      this.wordCount[category] += frequencyInText;
    }

    return this;
  }

  /**
   * Determine which learned category `text` most probably belongs to.
   *
   * For each category the score is
   * `log(P(category)) + sum(count(w) * log(P(w | category)))` over the tokens
   * `w` of the text; logarithms are used to avoid floating-point underflow.
   *
   * @param text Text to classify.
   * @returns A promise resolving to the best-scoring category name, or `null`
   *   when no category has been learned yet.
   */
  async categorize(text: string): Promise<string | null> {
    const tokens = await this.tokenizer(text);
    const tokenFrequencies = Object.entries(this.frequencyTable(tokens));

    let maxProbability = -Infinity;
    let chosenCategory: string | null = null;

    for (const category of Object.keys(this.categories)) {
      // Prior: share of all learned documents that were mapped to this category.
      let logProbability = Math.log(
        this.docCount[category] / this.totalDocuments,
      );

      // Add log P(w | c) for each word, weighted by its frequency in the text.
      for (const [token, frequencyInText] of tokenFrequencies) {
        logProbability += frequencyInText *
          Math.log(this.tokenProbability(token, category));
      }

      if (logProbability > maxProbability) {
        maxProbability = logProbability;
        chosenCategory = category;
      }
    }

    return chosenCategory;
  }

  /**
   * Calculate the probability that `token` belongs to `category`, using
   * Laplace add-one smoothing:
   * `(count(token in category) + 1) / (words in category + vocabularySize)`.
   *
   * @param token Word token.
   * @param category Name of an already initialized category.
   * @returns The smoothed probability.
   */
  tokenProbability(token: string, category: string): number {
    // How many times this word occurred in documents mapped to this category.
    const occurrences = this.wordFrequencyCount[category][token] || 0;

    // Count of all words that have ever been mapped to this category.
    const wordsInCategory = this.wordCount[category];

    return (occurrences + 1) / (wordsInCategory + this.vocabularySize);
  }

  /**
   * Build a frequency table where the keys are the entries of `tokens` and
   * the values are how many times each entry occurs.
   *
   * @param tokens Collection of tokens exposing `forEach` (e.g. an array).
   * @returns A null-prototype object mapping token => count.
   */
  frequencyTable(tokens: any): Record<string, number> {
    const table: Record<string, number> = Object.create(null);

    tokens.forEach((token: any) => {
      table[token] = (table[token] ?? 0) + 1;
    });

    return table;
  }

  /**
   * Dump the classifier's state as a JSON string.
   * (Currently disabled - kept for reference.)
   * @return {String} Representation of the classifier.
   */
  // toJson() {
  //   var state:any = {}
  //   STATE_KEYS.forEach(function (k) {
  //     state[k] = this[k]
  //   })

  //   var jsonStr = JSON.stringify(state)

  //   return jsonStr
  // }
}
|