UNPKG

4.26 kBJavaScriptView Raw
1var _ = require("underscore");
2_.str = require('underscore.string');
3var supported_languages = ["danish","dutch","english","french","galician","german","italian","polish","portuguese","romanian","russian","spanish","swedish"];
4var stopwords = require("./stopwords/stopwords");
5
/**
 * Extract keywords from a piece of text: HTML tags and punctuation are
 * stripped, the text is split on whitespace, and every token that is not a
 * stopword is returned in order of appearance.
 *
 * @param {string} str - Text to analyse. An empty value yields `[]`.
 * @param {Object} [options] - Extraction options. When empty or omitted it is
 *   replaced by `{ remove_digits: true, return_changed_case: true }`.
 * @param {string} [options.language="english"] - Must be one of
 *   `supported_languages`.
 * @param {boolean} [options.remove_digits] - Drop tokens made only of digits.
 * @param {boolean} [options.return_changed_case] - Return lower-cased keywords
 *   (tokens containing an http(s) URL keep their original case).
 * @param {boolean} [options.return_chained_words] - Join each run of adjacent
 *   keywords into one space-separated entry.
 * @param {number} [options.return_max_ngrams] - Ignored when
 *   `return_chained_words` is set. Otherwise adjacent keywords are joined
 *   until the entry opening the run holds this many words; the remaining
 *   keywords of that run are emitted individually.
 * @param {boolean} [options.remove_duplicates=false] - De-duplicate the result.
 * @param {Array} [options.stopwords] - Custom stopword list used instead of the
 *   built-in one; it is searched with `indexOf` against lower-cased tokens
 *   (presumably an array of lower-case strings - confirm with callers).
 * @returns {string[]} The extracted keywords / keyword phrases.
 * @throws {Error} If the language is not supported.
 */
function _extract(str, options){
    if(_.isEmpty(str)){
        return [];
    }
    if(_.isEmpty(options)){
        options = {
            remove_digits: true,
            return_changed_case: true
        };
    }
    var return_changed_case = options.return_changed_case;
    var return_chained_words = options.return_chained_words;
    var remove_digits = options.remove_digits;
    var _language = options.language || "english";
    var _remove_duplicates = options.remove_duplicates || false;
    var return_max_ngrams = options.return_max_ngrams;

    if(supported_languages.indexOf(_language) < 0){
        throw new Error("Language must be one of ["+supported_languages.join(",")+"]");
    }

    // strip any HTML and trim whitespace
    var text = _.str.trim(_.str.stripTags(str));
    if(_.isEmpty(text)){
        return [];
    }

    // Loop-invariant patterns, built once instead of once per token.
    var url_pattern = /https?:\/\/.*[\r\n]*/;
    // periods, commas, semi-colons, exclamation/question marks, parentheses,
    // colons, double quotes, leading/trailing apostrophes and curly quotes
    var punctuation_pattern = /\.|,|;|!|\?|\(|\)|:|"|^'|'$|“|”|‘|’/g;
    var lone_symbol_pattern = /-|_|@|&|#/g;
    var all_digits_pattern = /^\d+$/;

    var words = text.split(/\s/);
    var unchanged_words = [];
    var low_words = [];
    for(var x = 0; x < words.length; x++){
        // URLs are kept verbatim; everything else loses its punctuation
        var w = url_pattern.test(words[x]) ? words[x] : words[x].replace(punctuation_pattern, '');
        // a single leftover character must not be a stray symbol
        if(w.length === 1){
            w = w.replace(lone_symbol_pattern, '');
        }
        // if it's a number, remove it
        if(remove_digits && all_digits_pattern.test(w)){
            w = "";
        }
        if(w.length > 0){
            low_words.push(w.toLowerCase());
            unchanged_words.push(w);
        }
    }

    var results = [];
    var _stopwords = options.stopwords || _getStopwords({ language: _language });
    // Sentinel that can never equal `y - 1` (which is >= -1). Starting at 0
    // made a keyword at index 1 look adjacent to a *stopword* at index 0, so
    // the run start stayed at 0 and the first n-gram was capped a word early.
    var _last_result_word_index = -2;
    var _start_result_word_index = 0;
    for(var y = 0; y < low_words.length; y++){
        if(_stopwords.indexOf(low_words[y]) >= 0){
            continue;
        }

        // true when the previous token was also emitted as a keyword
        var _unbroken_word_chain = _last_result_word_index === y - 1;
        if(!_unbroken_word_chain){
            _start_result_word_index = y;
        }

        var result_word = return_changed_case && !url_pattern.test(unchanged_words[y]) ? low_words[y] : unchanged_words[y];

        // y - _start_result_word_index is the number of words already in the run
        var ngram_has_room = return_max_ngrams && return_max_ngrams > (y - _start_result_word_index);
        if(_unbroken_word_chain && (return_chained_words || ngram_has_room)){
            // an unbroken chain implies the previous keyword is the last result
            results[results.length - 1] += ' ' + result_word;
        } else {
            results.push(result_word);
        }

        _last_result_word_index = y;
    }

    if(_remove_duplicates){
        results = _.uniq(results);
    }

    return results;
}
95
/**
 * Look up the stopword list for a language.
 *
 * @param {Object} [options] - Lookup options.
 * @param {string} [options.language="english"] - Must be one of
 *   `supported_languages`.
 * @returns {*} The `stopwords` entry stored under the requested language.
 * @throws {Error} If the language is not supported.
 */
function _getStopwords(options){
    var requested_language = (options || {}).language || "english";
    var is_supported = supported_languages.indexOf(requested_language) >= 0;

    if(!is_supported){
        throw new Error("Language must be one of ["+supported_languages.join(",")+"]");
    }

    return stopwords[requested_language];
}
106
// Public API of the module: keyword extraction and stopword lookup.
var public_api = {};
public_api.extract = _extract;
public_api.getStopwords = _getStopwords;
module.exports = public_api;