LCOV - coverage.lcov - ml/naivebayes.ts

LCOV - code coverage report

Current view:	top level - ml - naivebayes.ts (source / functions)		Hit	Total	Coverage
Test:	coverage.lcov	Lines:	99	111	89.2 %
Date:	2021-03-12 10:43:40	Functions:	7	7	100.0 %

          Line data    Source code

       1             : // License
       2             : 
       3             : // (The MIT License)
       4             : 
       5             : // Copyright (c) by Tolga Tezel tolgatezel11@gmail.com
       6             : 
       7             : // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
       8             : //  to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute,
       9             : // sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
      10             : // subject to the following conditions:
      11             : 
      12             : // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
      13             : // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
      14             : // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
      15             : // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
      16             : // IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      17             : 
      18             : /**
      19             :  * INFO: a port of ttezel's naive Bayes implementation
      20             :  */
      21             : 
      22             : /*
      23             :     Expose our naive-bayes generator function
      24             :  */
      25             : 
      26             : // keys we use to serialize a classifier's state
      27           1 : export let STATE_KEYS = [
      28           1 :   "categories",
      29           1 :   "docCount",
      30           1 :   "totalDocuments",
      31           1 :   "vocabulary",
      32           1 :   "vocabularySize",
      33           1 :   "wordCount",
      34           1 :   "wordFrequencyCount",
      35           1 :   "options",
      36           1 : ];
      37             : 
      38             : /**
      39             :  * Initializes a NaiveBayes instance from a JSON state representation.
      40             :  * Use this with classifier.toJson().
      41             :  *
      42             :  * @param  {String} jsonStr   state representation obtained by classifier.toJson()
      43             :  * @return {NaiveBayes}       Classifier
      44             :  */
      45             : // export function fromJson(jsonStr:string) {
      46             : //   var parsed : any;
      47             : //   try {
      48             : //     parsed = JSON.parse(jsonStr)
      49             : //   } catch (e) {
      50             : //     throw new Error('Naivebayes.fromJson expects a valid JSON string.')
      51             : //   }
      52             : //   // init a new classifier
      53             : //   var classifier = new NaiveBayes(parsed.options)
      54             : 
      55             : //   // override the classifier's state
      56             : //   STATE_KEYS.forEach(function (k) {
      57             : //     if (typeof parsed[k] === 'undefined' || parsed[k] === null) {
      58             : //       throw new Error('Naivebayes.fromJson: JSON string is missing an expected property: `'+k+'`.')
      59             : //     }
      60             : //     classifier[k] = parsed[k]
      61             : //   })
      62             : 
      63             : //   return classifier
      64             : // }
      65             : 
      66             : /**
      67             :  * Given an input string, tokenize it into an array of word tokens.
      68             :  * This is the default tokenization function used if user does not provide one in `options`.
      69             :  *
      70             :  * @param  {String} text
      71             :  * @return {Array}
      72             :  */
      73           1 : export function defaultTokenizer(text: string) {
      74             :   //remove punctuation from text - remove anything that isn't a word char or a space
      75           5 :   var rgxPunctuation = /[^(a-zA-ZA-Яa-я0-9_)+\s]/g;
      76             : 
      77           5 :   var sanitized = text.replace(rgxPunctuation, " ");
      78             : 
      79           1 :   return sanitized.split(/\s+/);
      80           1 : }
      81             : 
      82           1 : export class NaiveBayes {
      83             :   options: any;
      84             :   tokenizer: Function;
      85             :   vocabulary: any;
      86             :   vocabularySize: number;
      87             :   totalDocuments: number;
      88             :   docCount: any;
      89             :   wordCount: any;
      90             :   wordFrequencyCount: any;
      91             :   categories: any;
      92             :   /**
      93             :  * Naive-Bayes Classifier
      94             :  *
      95             :  * This is a naive-bayes classifier that uses Laplace Smoothing.
      96             :  *
      97             :  * Takes an (optional) options object containing:
      98             :  *   - `tokenizer`  => custom tokenization function
      99             :  *
     100             :  */
     101           1 :   constructor(options?: any) {
     102             :     // set options object
     103           2 :     this.options = {};
     104           0 :     if (typeof options !== "undefined") {
     105           0 :       if (!options || typeof options !== "object" || Array.isArray(options)) {
     106           0 :         throw TypeError(
     107           0 :           "NaiveBayes got invalid `options`: `" + options +
     108           0 :             "`. Pass in an object.",
     109           0 :         );
     110           0 :       }
     111           0 :       this.options = options;
     112           0 :     }
     113             : 
     114           2 :     this.tokenizer = this.options.tokenizer || defaultTokenizer;
     115             : 
     116             :     //initialize our vocabulary and its size
     117           2 :     this.vocabulary = {};
     118           2 :     this.vocabularySize = 0;
     119             : 
     120             :     //number of documents we have learned from
     121           2 :     this.totalDocuments = 0;
     122             : 
     123             :     //document frequency table for each of our categories
     124             :     //=> for each category, how often were documents mapped to it
     125           2 :     this.docCount = {};
     126             : 
     127             :     //for each category, how many words total were mapped to it
     128           2 :     this.wordCount = {};
     129             : 
     130             :     //word frequency table for each category
     131             :     //=> for each category, how frequent was a given word mapped to it
     132           2 :     this.wordFrequencyCount = {};
     133             : 
     134             :     //hashmap of our category names
     135           2 :     this.categories = {};
     136           1 :   }
     137             : 
     138             :   /**
     139             :  * Initialize each of our data structure entries for this new category
     140             :  *
     141             :  * @param  {String} categoryName
     142             :  */
     143           1 :   initializeCategory(categoryName: string) {
     144           4 :     if (!this.categories[categoryName]) {
     145           6 :       this.docCount[categoryName] = 0;
     146           6 :       this.wordCount[categoryName] = 0;
     147           6 :       this.wordFrequencyCount[categoryName] = {};
     148           6 :       this.categories[categoryName] = true;
     149           4 :     }
     150           4 :     return this;
     151           1 :   }
     152             : 
     153             :   /**
     154             :  * train our naive-bayes classifier by telling it what `category`
     155             :  * the `text` corresponds to.
     156             :  *
     157             :  * @param  {String} text
     158             :  * @param  {Promise<String>} class
     159             :  */
     160           1 :   async learn(text: string, category: any) {
     161           4 :     var self = this;
     162             : 
     163             :     //initialize category data structures if we've never seen this category
     164           4 :     self.initializeCategory(category);
     165             : 
     166             :     //update our count of how many documents mapped to this category
     167           4 :     self.docCount[category]++;
     168             : 
     169             :     //update the total number of documents we have learned from
     170           4 :     self.totalDocuments++;
     171             : 
     172             :     //normalize the text into a word array
     173           4 :     var tokens = await self.tokenizer(text);
     174             : 
     175             :     //get a frequency count for each token in the text
     176           4 :     var frequencyTable = self.frequencyTable(tokens);
     177             : 
     178             :     /*
     179             :       Update our vocabulary and our word frequency count for this category
     180             :    */
     181             : 
     182           4 :     Object
     183           4 :       .keys(frequencyTable)
     184           4 :       .forEach(function (token) {
     185             :         //add this word to our vocabulary if not already existing
     186          25 :         if (!self.vocabulary[token]) {
     187          43 :           self.vocabulary[token] = true;
     188          43 :           self.vocabularySize++;
     189          25 :         }
     190             : 
     191          25 :         var frequencyInText = frequencyTable[token];
     192             : 
     193             :         //update the frequency information for this word in this category
     194          25 :         if (!self.wordFrequencyCount[category][token]) {
     195          44 :           self.wordFrequencyCount[category][token] = frequencyInText;
     196          25 :         } else {
     197          27 :           self.wordFrequencyCount[category][token] += frequencyInText;
     198          25 :         }
     199             : 
     200             :         //update the count of all words we have seen mapped to this category
     201          25 :         self.wordCount[category] += frequencyInText;
     202           4 :       });
     203             : 
     204           4 :     return self;
     205           1 :   }
     206             : 
     207             :   /**
     208             :  * Determine what category `text` belongs to.
     209             :  *
     210             :  * @param  {String} text
     211             :  * @return {Promise<string>} category
     212             :  */
     213           1 :   async categorize(text: string) {
     214           2 :     var self = this,
     215           2 :       maxProbability = -Infinity,
     216           2 :       chosenCategory = null;
     217             : 
     218           2 :     var tokens = await self.tokenizer(text);
     219           2 :     var frequencyTable = self.frequencyTable(tokens);
     220             : 
     221             :     //iterate thru our categories to find the one with max probability for this text
     222           2 :     Object
     223           2 :       .keys(self.categories)
     224           2 :       .forEach(function (category) {
     225             :         //start by calculating the overall probability of this category
     226             :         //=>  out of all documents we've ever looked at, how many were
     227             :         //    mapped to this category
     228           4 :         var categoryProbability = self.docCount[category] / self.totalDocuments;
     229             : 
     230             :         //take the log to avoid underflow
     231           4 :         var logProbability = Math.log(categoryProbability);
     232             : 
     233             :         //now determine P( w | c ) for each word `w` in the text
     234           4 :         Object
     235           4 :           .keys(frequencyTable)
     236           4 :           .forEach(function (token) {
     237          14 :             var frequencyInText = frequencyTable[token];
     238          14 :             var tokenProbability = self.tokenProbability(token, category);
     239             : 
     240             :             // console.log('token: %s category: `%s` tokenProbability: %d', token, category, tokenProbability)
     241             : 
     242             :             //determine the log of the P( w | c ) for this word
     243          14 :             logProbability += frequencyInText * Math.log(tokenProbability);
     244           4 :           });
     245             : 
     246           4 :         if (logProbability > maxProbability) {
     247           5 :           maxProbability = logProbability;
     248           5 :           chosenCategory = category;
     249           4 :         }
     250           2 :       });
     251             : 
     252           2 :     return chosenCategory;
     253           1 :   }
     254             : 
     255             :   /**
     256             :  * Calculate probability that a `token` belongs to a `category`
     257             :  *
     258             :  * @param  {String} token
     259             :  * @param  {String} category
     260             :  * @return {Number} probability
     261             :  */
     262           1 :   tokenProbability(token: string, category: string) {
     263             :     //how many times this word has occurred in documents mapped to this category
     264          11 :     var wordFrequencyCount = this.wordFrequencyCount[category][token] || 0;
     265             : 
     266             :     //what is the count of all words that have ever been mapped to this category
     267          11 :     var wordCount = this.wordCount[category];
     268             : 
     269             :     //use laplace Add-1 Smoothing equation
     270          11 :     return (wordFrequencyCount + 1) / (wordCount + this.vocabularySize);
     271           1 :   }
     272             : 
     273             :   /**
     274             :  * Build a frequency hashmap where
     275             :  * - the keys are the entries in `tokens`
     276             :  * - the values are the frequency of each entry in `tokens`
     277             :  *
     278             :  * @param  {Array} tokens  Normalized word array
     279             :  * @return {Object}
     280             :  */
     281           1 :   frequencyTable(tokens: any) {
     282           5 :     var frequencyTable = Object.create(null);
     283             : 
     284           5 :     tokens.forEach((token: any) => {
     285          31 :       if (!frequencyTable[token]) {
     286          31 :         frequencyTable[token] = 1;
     287           0 :       } else {
     288           0 :         frequencyTable[token]++;
     289           0 :       }
     290           5 :     });
     291             : 
     292           5 :     return frequencyTable;
     293           1 :   }
     294             : 
     295             :   /**
     296             :  * Dump the classifier's state as a JSON string.
     297             :  * @return {String} Representation of the classifier.
     298             :  */
     299             :   // toJson() {
     300             :   //   var state:any = {}
     301             :   //   STATE_KEYS.forEach(function (k) {
     302             :   //     state[k] = this[k]
     303             :   //   })
     304             : 
     305             :   //   var jsonStr = JSON.stringify(state)
     306             : 
     307             :   //   return jsonStr
     308             :   // }
     309           1 : }

Generated by: LCOV version 1.15