nlp nlp-classifier.js

92.98% Statements 106/114
84.81% Branches 67/79
100% Functions 18/18
92.98% Lines 106/114

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31x
31x
31x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186x
186x
5x
 
186x
84x
 
186x
84x
 
186x
186x
 
186x
185x
 
186x
185x
 
186x
185x
 
186x
186x
 
 
 
 
 
 
 
 
2165x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463x
463x
463x
5975x
5975x
 
 
 
16x
 
 
447x
 
 
 
 
 
 
 
 
 
453x
 
 
 
 
 
 
 
 
455x
2x
 
453x
2x
 
451x
451x
451x
8x
 
443x
443x
443x
2314x
 
 
 
 
 
 
 
 
 
8x
1x
 
7x
7x
7x
1x
 
6x
6x
4x
4x
10x
10x
10x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
923x
 
 
923x
923x
923x
19022x
 
923x
 
 
 
373x
373x
2141x
2063x
2063x
2051x
 
12x
 
 
 
373x
 
 
 
 
 
 
47x
47x
47x
321x
321x
 
 
 
 
47x
39x
 
 
47x
47x
47x
321x
321x
 
 
 
 
47x
 
 
 
 
602x
117x
52x
 
 
550x
 
 
 
52x
52x
441x
 
52x
52x
52x
441x
 
 
 
 
52x
 
 
 
 
 
 
 
 
 
 
602x
602x
602x
 
 
602x
 
 
602x
550x
 
52x
 
 
 
 
 
52x
51x
38x
 
 
14x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2x
 
 
 
31x
  /*
 * Copyright (c) AXA Shared Services Spain S.A.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 
const NlpUtil = require('./nlp-util');
const LogisticRegressionClassifier = require('../classifiers/logistic-regression-classifier');
const BinaryNeuralNetworkClassifier = require('../classifiers/binary-neural-network-classifier');
 
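/**
 * Class for the NLP Classifier.
 * In the settings you can specify:
 * - classifier (optional): The Machine Learning Classifier instance. If not
 *      provided and useLRC is enabled, a default Logistic Regression
 *      Classifier is created.
 * - stemmer (optional): The language stemmer, which is also used to
 *      tokenize the utterances. If not provided, the default stemmer for
 *      the configured language is used.
 * - language (optional): If you don't provide a stemmer, then you can
 *      provide a language so a default stemmer for this language will
 *      be used. Defaults to 'en'.
 * - useLRC (optional): Whether the logistic regression classifier is used.
 *      Defaults to true.
 * - useNeural (optional): Whether the neural classifier is used.
 *      Defaults to true.
 * - neuralClassifier (optional): The neural classifier instance. If not
 *      provided and useNeural is enabled, a default Binary Neural Network
 *      Classifier is created.
 * - keepStopWords (optional): Flag passed to the stemmer when tokenizing.
 *      Defaults to true.
 */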
class NlpClassifier {
  /**
   * Constructor of the class.
   * The settings object is stored as-is (not copied) and any missing option
   * is filled in on it: language 'en', useNeural true, useLRC true,
   * keepStopWords true, plus default classifier / neuralClassifier / stemmer
   * instances when they are enabled and not provided.
   * @param {Object} settings Settings for this instance.
   */
  constructor(settings) {
    this.settings = settings || {};
    if (!this.settings.language) {
      this.settings.language = 'en';
    }
    if (this.settings.useNeural === undefined) {
      this.settings.useNeural = true;
    }
    if (this.settings.useLRC === undefined) {
      this.settings.useLRC = true;
    }
    if (!this.settings.classifier && this.settings.useLRC) {
      this.settings.classifier = new LogisticRegressionClassifier();
    }
    if (!this.settings.neuralClassifier && this.settings.useNeural) {
      this.settings.neuralClassifier = new BinaryNeuralNetworkClassifier();
    }
    if (!this.settings.stemmer) {
      this.settings.stemmer = NlpUtil.getStemmer(this.settings.language);
    }
    if (this.settings.keepStopWords === undefined) {
      this.settings.keepStopWords = true;
    }
    // Training documents: { intent, utterance } with utterance as token array.
    this.docs = [];
    // Token -> number of occurrences across all stored documents.
    this.features = {};
  }

  /**
   * Generate the vector of features.
   * Anything that is not a string (e.g. an already tokenized array) is
   * returned unchanged.
   * @param {String} utterance Input utterance.
   * @returns {String[]} Vector of features.
   */
  tokenizeAndStem(utterance) {
    if (typeof utterance !== 'string') {
      return utterance;
    }
    return this.settings.stemmer.tokenizeAndStem(
      utterance,
      this.settings.keepStopWords
    );
  }

  /**
   * Indicates if a token is a known feature.
   * Only own properties count: `features` is a plain object, so a token such
   * as "constructor" would otherwise resolve to an inherited
   * Object.prototype member and be mistaken for a trained feature.
   * @param {String} token Token to be checked.
   * @returns {boolean} True if the token is a feature with a non-zero count.
   */
  hasFeature(token) {
    return (
      Object.prototype.hasOwnProperty.call(this.features, token) &&
      Boolean(this.features[token])
    );
  }

  /**
   * Gets the position of a utterance for an intent.
   * Utterances are compared by their tokens joined with a single space.
   * @param {Object} srcUtterance Utterance to be found.
   * @param {Object} intent Intent of the utterance, falsy to match any intent.
   * @returns {Number} Position of the utterance, -1 if not found.
   */
  posUtterance(srcUtterance, intent) {
    const wanted = this.tokenizeAndStem(srcUtterance).join(' ');
    return this.docs.findIndex(
      doc =>
        doc.utterance.join(' ') === wanted &&
        (!intent || doc.intent === intent)
    );
  }

  /**
   * Indicates if an utterance already exists, at the given intent or globally.
   * @param {String} utterance Utterance to be checked.
   * @param {String} intent Intent to check, undefined to search globally.
   * @returns {boolean} True if the utterance exists, false otherwise.
   */
  existsUtterance(utterance, intent) {
    return this.posUtterance(utterance, intent) !== -1;
  }

  /**
   * Adds a new utterance to an intent.
   * Nothing is added when the utterance yields no tokens or when the same
   * tokenized utterance is already stored under any intent.
   * @param {String} srcUtterance Utterance to be added.
   * @param {String} srcIntent Intent for adding the utterance.
   * @throws {Error} If the utterance or the intent is not a string.
   */
  add(srcUtterance, srcIntent) {
    if (typeof srcUtterance !== 'string') {
      throw new Error('Utterance must be an string');
    }
    if (typeof srcIntent !== 'string') {
      throw new Error('Intent must be an string');
    }
    const intent = srcIntent.trim();
    const utterance = this.tokenizeAndStem(srcUtterance);
    if (utterance.length === 0 || this.existsUtterance(utterance)) {
      return;
    }
    this.docs.push({ intent, utterance });
    utterance.forEach(token => {
      // Start from 0 unless the token is an own, already counted feature.
      const count = this.hasFeature(token) ? this.features[token] : 0;
      this.features[token] = count + 1;
    });
  }

  /**
   * Remove an utterance from the classifier.
   * The feature counters of its tokens are decremented and features that
   * are no longer referenced are dropped.
   * @param {String} srcUtterance Utterance to be removed.
   * @param {String} srcIntent Intent of the utterance, undefined to search all
   * @throws {Error} If the utterance is not a string.
   */
  remove(srcUtterance, srcIntent) {
    if (typeof srcUtterance !== 'string') {
      throw new Error('Utterance must be an string');
    }
    const intent = srcIntent ? srcIntent.trim() : undefined;
    const utterance = this.tokenizeAndStem(srcUtterance);
    if (utterance.length === 0) {
      return;
    }
    const pos = this.posUtterance(utterance, intent);
    if (pos === -1) {
      return;
    }
    this.docs.splice(pos, 1);
    utterance.forEach(token => {
      // Never touch inherited Object.prototype members.
      if (!Object.prototype.hasOwnProperty.call(this.features, token)) {
        return;
      }
      this.features[token] -= 1;
      // `!(x > 0)` also clears a counter that is not a valid number.
      if (!(this.features[token] > 0)) {
        delete this.features[token];
      }
    });
  }

  /**
   * Given an utterance, tokenize and stem the utterance and convert it
   * to a vector of binary values, where each position is a feature (a word
   * stemmed) and the value means if the utterance has this feature.
   * The input utterance can be an string or an array of tokens.
   * Positions follow the order of Object.keys(this.features).
   * @param {String} srcUtterance Utterance to be converted to features vector.
   * @returns {Number[]} Features vector of the utterance.
   */
  textToFeatures(srcUtterance) {
    const utterance = Array.isArray(srcUtterance)
      ? srcUtterance
      : this.tokenizeAndStem(srcUtterance);
    // Set lookup avoids scanning the token list once per feature.
    const present = new Set(utterance);
    return Object.keys(this.features).map(key => (present.has(key) ? 1 : 0));
  }

  /**
   * Convert a list of tokens to the input object of the neural classifier.
   * Only tokens that are known features are kept. A token that parseInt can
   * read as a base-10 integer is collapsed into the shared '%number%' key.
   * @param {String[]} tokens Tokens of the utterance.
   * @returns {Object} Object whose keys are the active features, with value 1.
   */
  tokensToNeural(tokens) {
    const result = {};
    tokens.forEach(token => {
      if (!this.hasFeature(token)) {
        return;
      }
      if (Number.isNaN(Number.parseInt(token, 10))) {
        result[token] = 1;
      } else {
        result['%number%'] = 1;
      }
    });
    return result;
  }

  /**
   * Train the classifier with the existing utterances and intents.
   * The logistic regression classifier is cleared and re-fed with every
   * document, and is only trained when it holds at least one observation.
   * The neural classifier is trained in batch with the whole corpus.
   * @returns {Promise<void>} Resolves when the enabled classifiers finish.
   */
  async train() {
    if (this.settings.useLRC) {
      const { classifier } = this.settings;
      classifier.clear();
      this.docs.forEach(doc => {
        const tokens = this.tokenizeAndStem(doc.utterance);
        classifier.addObservation(this.textToFeatures(tokens), doc.intent);
      });
      if (classifier.observationCount > 0) {
        await classifier.train();
      }
    }
    if (this.settings.useNeural) {
      const corpus = this.docs.map(doc => ({
        input: this.tokensToNeural(this.tokenizeAndStem(doc.utterance)),
        output: doc.intent,
      }));
      await this.settings.neuralClassifier.trainBatch(corpus);
    }
  }

  /**
   * Indicates if every classification has exactly the value 0.5, i.e. the
   * classifier could not tell the labels apart. True for an empty list.
   * @param {Object[]} classifications Classifications with label and value.
   * @returns {boolean} True if all the values are 0.5.
   */
  isEqualClassification(classifications) {
    return classifications.every(item => item.value === 0.5);
  }

  /**
   * Scale the values of the classifications so they add up to 1.
   * When the values do not add up to a positive total, the input array is
   * returned untouched.
   * @param {Object[]} classifications Classifications with label and value.
   * @returns {Object[]} Classifications with normalized values.
   */
  normalizeNeural(classifications) {
    const total = classifications.reduce((sum, item) => sum + item.value, 0);
    if (total > 0) {
      return classifications.map(item => ({
        label: item.label,
        value: item.value / total,
      }));
    }
    return classifications;
  }

  /**
   * Get all the labels and score for each label from this utterance.
   * With both classifiers enabled the neural result is preferred, except
   * when the logistic regression result is undecided (all 0.5) or when both
   * agree on the best label and logistic regression scores it higher.
   * @param {String} utterance Utterance to be classified.
   * @returns {Object[]} Sorted array of classifications, with label and score.
   */
  getClassifications(utterance) {
    const tokens = this.tokenizeAndStem(utterance);
    if (this.settings.useLRC) {
      const classificationLRC = this.settings.classifier.getClassifications(
        this.textToFeatures(tokens)
      );
      if (
        !this.settings.useNeural ||
        this.isEqualClassification(classificationLRC)
      ) {
        return classificationLRC;
      }
      const classificationNeural = this.normalizeNeural(
        this.settings.neuralClassifier.classify(
          this.tokensToNeural(tokens),
          true
        )
      );
      const bestLRC = classificationLRC[0];
      const bestNeural = classificationNeural[0];
      if (bestLRC.label === bestNeural.label && bestNeural.value < bestLRC.value) {
        return classificationLRC;
      }
      return classificationNeural;
    }
    if (this.settings.useNeural) {
      const classification = this.settings.neuralClassifier.classify(
        this.tokensToNeural(tokens),
        true
      );
      return this.isEqualClassification(classification)
        ? classification
        : this.normalizeNeural(classification);
    }
    return [];
  }

  /**
   * Given an utterance, get the label and score of the best classification.
   * @param {String} utterance Utterance to be classified.
   * @returns {Object} Best classification of the observation, undefined when
   *      there are no classifications.
   */
  getBestClassification(utterance) {
    return this.getClassifications(utterance)[0];
  }
}
 
module.exports = NlpClassifier;