nlp nlp-classifier.js

99.08% Statements 108/109
97.01% Branches 65/67
100% Functions 18/18
99.08% Lines 108/109

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30x
30x
30x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185x
185x
7x
 
185x
184x
 
185x
90x
 
185x
184x
 
185x
184x
 
185x
184x
 
185x
185x
 
 
 
 
 
 
 
 
1795x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439x
439x
439x
5893x
5893x
 
 
 
15x
 
 
424x
 
 
 
 
 
 
 
 
 
429x
 
 
 
 
 
 
 
 
431x
2x
 
429x
2x
 
427x
427x
427x
7x
 
420x
420x
420x
2243x
 
 
 
 
 
 
 
 
 
8x
1x
 
7x
7x
7x
1x
 
6x
6x
4x
4x
10x
10x
10x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
922x
 
 
922x
922x
922x
19862x
 
922x
 
 
 
345x
345x
2057x
2057x
2045x
 
12x
 
 
345x
345x
17115x
17115x
17115x
 
345x
 
 
 
 
 
 
44x
44x
44x
298x
298x
 
 
 
298x
 
 
 
 
44x
37x
37x
36x
 
 
 
 
 
610x
59x
47x
 
 
563x
 
 
 
47x
47x
109x
 
47x
47x
47x
109x
 
 
 
 
47x
 
 
 
 
 
 
 
 
 
 
622x
622x
 
 
622x
610x
563x
 
47x
 
 
 
 
 
47x
47x
47x
46x
18x
 
28x
 
1x
 
12x
 
 
 
 
 
 
 
 
2x
 
 
 
 
 
30x
  /*
 * Copyright (c) AXA Shared Services Spain S.A.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 
const LogisticRegressionClassifier = require('../classifiers/logistic-regression-classifier');
const NlpUtil = require('./nlp-util');
const BinaryNeuralNetworkClassifier = require('../classifiers/binary-neural-network-classifier');
 
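/**
 * Class for the NLP Classifier.
 * In the settings you can specify:
 * - classifier (optional): The Machine Learning Classifier instance. If not
 *      provided, then a default Logistic Regression Classifier is used.
 * - neuralClassifier (optional): The neural network classifier instance. If
 *      not provided, then a default Binary Neural Network Classifier is used.
 * - useNeural (optional): If true, the neural classifier is trained and
 *      consulted together with the main classifier. Defaults to true.
 * - stemmer (optional): The language stemmer (it also tokenizes). If not
 *      provided, you can provide the language and the default stemmer
 *      for this language will be used.
 * - language (optional): If you don't provide a stemmer, then you can
 *      provide a language so a default stemmer for this language will
 *      be used. Defaults to 'en'.
 * - keepStopWords (optional): Flag passed to the stemmer as the second
 *      argument of tokenizeAndStem. Defaults to true.
 */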
class NlpClassifier {
  /**
   * Constructor of the class.
   * Missing settings are filled in with defaults directly on the provided
   * settings object.
   * @param {Object} settings Settings for this instance.
   */
  constructor(settings) {
    this.settings = settings || {};
    if (!this.settings.language) {
      this.settings.language = 'en';
    }
    if (!this.settings.classifier) {
      this.settings.classifier = new LogisticRegressionClassifier();
    }
    if (this.settings.useNeural === undefined) {
      this.settings.useNeural = true;
    }
    if (!this.settings.neuralClassifier) {
      this.settings.neuralClassifier = new BinaryNeuralNetworkClassifier();
    }
    if (!this.settings.stemmer) {
      this.settings.stemmer = NlpUtil.getStemmer(this.settings.language);
    }
    if (this.settings.keepStopWords === undefined) {
      this.settings.keepStopWords = true;
    }
    // Training documents, each one as { intent, utterance: String[] }.
    this.docs = [];
    // Map from stemmed token to the number of times it appears in docs.
    this.features = {};
  }

  /**
   * Generate the vector of features.
   * Strings are tokenized and stemmed; any other value (usually an array
   * of tokens already stemmed) is returned untouched.
   * @param {String|String[]} utterance Input utterance.
   * @returns {String[]} Vector of features.
   */
  tokenizeAndStem(utterance) {
    if (typeof utterance !== 'string') {
      return utterance;
    }
    return this.settings.stemmer.tokenizeAndStem(
      utterance,
      this.settings.keepStopWords
    );
  }

  /**
   * Gets the position of a utterance for an intent.
   * Utterances are compared by their stemmed tokens joined by spaces.
   * @param {String|String[]} srcUtterance Utterance to be found.
   * @param {String} intent Intent of the utterance, falsy to search globally.
   * @returns {Number} Position of the utterance, -1 if not found.
   */
  posUtterance(srcUtterance, intent) {
    const utteranceStr = this.tokenizeAndStem(srcUtterance).join(' ');
    return this.docs.findIndex(
      doc =>
        doc.utterance.join(' ') === utteranceStr &&
        (!intent || doc.intent === intent)
    );
  }

  /**
   * Indicates if an utterance already exists, at the given intent or globally.
   * @param {String|String[]} utterance Utterance to be checked.
   * @param {String} intent Intent to check, undefined to search globally.
   * @returns {boolean} True if the utterance exists, false otherwise.
   */
  existsUtterance(utterance, intent) {
    return this.posUtterance(utterance, intent) !== -1;
  }

  /**
   * Adds a new utterance to an intent.
   * Nothing is added when the utterance has no tokens or when the same
   * stemmed utterance already exists for any intent.
   * @param {String} srcUtterance Utterance to be added.
   * @param {String} srcIntent Intent for adding the utterance.
   * @throws {Error} If the utterance or the intent is not a string.
   */
  add(srcUtterance, srcIntent) {
    if (typeof srcUtterance !== 'string') {
      throw new Error('Utterance must be an string');
    }
    if (typeof srcIntent !== 'string') {
      throw new Error('Intent must be an string');
    }
    const intent = srcIntent.trim();
    const utterance = this.tokenizeAndStem(srcUtterance);
    if (utterance.length === 0 || this.existsUtterance(utterance)) {
      return;
    }
    this.docs.push({ intent, utterance });
    utterance.forEach(token => {
      // Only own properties are counters: tokens such as "constructor" or
      // "toString" would otherwise read a member of Object.prototype.
      const current = Object.prototype.hasOwnProperty.call(
        this.features,
        token
      )
        ? this.features[token]
        : 0;
      this.features[token] = (current || 0) + 1;
    });
  }

  /**
   * Remove an utterance from the classifier.
   * The counter of each token of the utterance is decreased, and the
   * feature is dropped when no document uses it anymore.
   * @param {String} srcUtterance Utterance to be removed.
   * @param {String} srcIntent Intent of the utterance, undefined to search all
   * @throws {Error} If the utterance is not a string.
   */
  remove(srcUtterance, srcIntent) {
    if (typeof srcUtterance !== 'string') {
      throw new Error('Utterance must be an string');
    }
    const intent = srcIntent ? srcIntent.trim() : undefined;
    const utterance = this.tokenizeAndStem(srcUtterance);
    if (utterance.length === 0) {
      return;
    }
    const pos = this.posUtterance(utterance, intent);
    if (pos === -1) {
      return;
    }
    this.docs.splice(pos, 1);
    utterance.forEach(token => {
      const remaining = (this.features[token] || 0) - 1;
      // Anything that is not a positive count (0, negative, NaN) means the
      // feature is no longer in use.
      if (remaining > 0) {
        this.features[token] = remaining;
      } else {
        delete this.features[token];
      }
    });
  }

  /**
   * Given an utterance, tokenize and stem the utterance and convert it
   * to a vector of binary values, where each position is a feature (a word
   * stemmed) and the value means if the utterance has this feature.
   * The input utterance can be an string or an array of tokens.
   * @param {String|String[]} srcUtterance Utterance to be converted to
   *      features vector.
   * @returns {Number[]} Features vector of the utterance, in the order of
   *      the keys of the features of the classifier.
   */
  textToFeatures(srcUtterance) {
    const utterance = Array.isArray(srcUtterance)
      ? srcUtterance
      : this.tokenizeAndStem(srcUtterance);
    // One Set lookup per feature instead of scanning the tokens each time.
    const present = new Set(utterance);
    return Object.keys(this.features).map(key => (present.has(key) ? 1 : 0));
  }

  /**
   * Converts a list of tokens into the input of the neural classifier: an
   * object with one key per known feature and value 1 if the feature is
   * present in the tokens, 0 otherwise. Every token for which parseInt
   * returns a number is collapsed into the single '%number%' feature.
   * @param {String[]} tokens Stemmed tokens of the utterance.
   * @returns {Object} Map from feature name to 0 or 1.
   */
  tokensToNeural(tokens) {
    const toFeatureName = token =>
      Number.isNaN(Number.parseInt(token, 10)) ? token : '%number%';
    // A Set has no inherited members, so a feature named "constructor" is
    // not reported as present unless it really is in the tokens.
    const present = new Set();
    tokens.forEach(token => {
      present.add(toFeatureName(token));
    });
    const result = {};
    Object.keys(this.features).forEach(srcToken => {
      const token = toFeatureName(srcToken);
      result[token] = present.has(token) ? 1 : 0;
    });
    return result;
  }

  /**
   * Train the classifier with the existing utterances and intents.
   * The main classifier is cleared and fed again with every document. The
   * neural classifier is trained only when useNeural is set and there is at
   * least one observation.
   * @returns {Promise<void>} Resolved when the training has finished.
   */
  async train() {
    this.settings.classifier.clear();
    const corpus = [];
    this.docs.forEach(doc => {
      const tokens = this.tokenizeAndStem(doc.utterance);
      corpus.push({
        input: this.tokensToNeural(tokens),
        output: doc.intent,
      });
      this.settings.classifier.addObservation(
        this.textToFeatures(tokens),
        doc.intent
      );
    });
    if (this.settings.classifier.observationCount > 0) {
      await this.settings.classifier.train();
      if (this.settings.useNeural) {
        await this.settings.neuralClassifier.trainBatch(corpus);
      }
    }
  }

  /**
   * Indicates if every classification has exactly the score 0.5, so no
   * label stands out from the others. An empty list is considered equal.
   * @param {Object[]} classifications Classifications with label and value.
   * @returns {boolean} True if all the values are 0.5, false otherwise.
   */
  isEqualClassification(classifications) {
    return classifications.every(
      classification => classification.value === 0.5
    );
  }

  /**
   * Scales the scores of the classifications so they sum 1.
   * @param {Object[]} classifications Classifications with label and value.
   * @returns {Object[]} New array with the scaled values, or the input
   *      array itself when the sum of the values is not positive.
   */
  normalizeNeural(classifications) {
    const total = classifications.reduce(
      (sum, classification) => sum + classification.value,
      0
    );
    if (total > 0) {
      return classifications.map(classification => ({
        label: classification.label,
        value: classification.value / total,
      }));
    }
    return classifications;
  }

  /**
   * Get all the labels and score for each label from this utterance.
   * When useNeural is set, the result of the main classifier is compared
   * with the normalized result of the neural classifier: if both agree on
   * the first label, the one with the higher first score is returned; if
   * they disagree, the neural result is returned.
   * @param {String} utterance Utterance to be classified.
   * @returns {Object[]} Sorted array of classifications, with label and score.
   */
  getClassifications(utterance) {
    const tokens = this.tokenizeAndStem(utterance);
    const classification = this.settings.classifier.getClassifications(
      this.textToFeatures(tokens)
    );
    if (
      !this.settings.useNeural ||
      this.isEqualClassification(classification)
    ) {
      return classification;
    }
    const neuralClassification = this.normalizeNeural(
      this.settings.neuralClassifier.classify(this.tokensToNeural(tokens), true)
    );
    // Without any answer from the neural classifier there is nothing to
    // compare with, so keep the result of the main classifier.
    if (neuralClassification.length === 0) {
      return classification;
    }
    const sameIntent =
      neuralClassification[0].label === classification[0].label;
    if (
      sameIntent &&
      neuralClassification[0].value <= classification[0].value
    ) {
      return classification;
    }
    return neuralClassification;
  }

  /**
   * Given an utterance, get the label and score of the best classification.
   * Only the main classifier is asked; the neural classifier is not used.
   * @param {String} utterance Utterance to be classified.
   * @returns {Object} Best classification of the observation.
   */
  getBestClassification(utterance) {
    return this.settings.classifier.getBestClassification(
      this.textToFeatures(utterance)
    );
  }
}
 
module.exports = NlpClassifier;