1 | var _ = require("underscore");
|
2 | _.str = require('underscore.string');
|
// Languages that ship with a stopword list; any other value passed as
// `options.language` is rejected with an Error.
var supported_languages = [
    "danish",
    "dutch",
    "english",
    "french",
    "galician",
    "german",
    "italian",
    "polish",
    "portuguese",
    "romanian",
    "russian",
    "spanish",
    "swedish"
];
|
4 | var stopwords = require("./stopwords/stopwords");
|
5 |
|
/**
 * Extract keywords from a piece of text.
 *
 * The text is stripped of tags and trimmed, split on whitespace, cleaned of
 * punctuation, and every word whose lower-cased form appears in the stopword
 * list is dropped. Depending on the options, adjacent keywords are either
 * returned one by one or joined with a single space.
 *
 * @param {string} str - Text to analyse. An empty value yields `[]`.
 * @param {Object} [options] - Extraction settings. When omitted or empty,
 *   `{ remove_digits: true, return_changed_case: true }` is used. A non-empty
 *   object is used exactly as given; those defaults are NOT merged into it.
 * @param {string} [options.language="english"] - Must be listed in
 *   `supported_languages`.
 * @param {Array} [options.stopwords] - Stopword list to use instead of the one
 *   returned by `_getStopwords` for the language.
 * @param {boolean} [options.remove_digits] - Drop words made only of digits.
 * @param {boolean} [options.return_changed_case] - Return lower-cased words
 *   (URLs are always returned unchanged).
 * @param {boolean} [options.return_chained_words] - Join every run of
 *   adjacent keywords into one space-separated entry.
 * @param {number} [options.return_max_ngrams] - Join runs of adjacent keywords
 *   into entries of at most this many words (ignored when
 *   `return_chained_words` is set).
 * @param {boolean} [options.remove_duplicates=false] - De-duplicate the result.
 * @returns {string[]} Keywords in order of appearance.
 * @throws {Error} When the language is not supported.
 */
function _extract(str, options){
    if(_.isEmpty(str)){
        return [];
    }

    var settings = options;
    if(_.isEmpty(settings)){
        settings = {
            remove_digits: true,
            return_changed_case: true
        };
    }

    var return_changed_case = settings.return_changed_case;
    var return_chained_words = settings.return_chained_words;
    var remove_digits = settings.remove_digits;
    var language = settings.language || "english";
    var remove_duplicates = settings.remove_duplicates || false;
    var return_max_ngrams = settings.return_max_ngrams;

    if(supported_languages.indexOf(language) < 0){
        throw new Error("Language must be one of ["+supported_languages.join(",")+"]");
    }

    var text = _.str.trim(_.str.stripTags(str));
    if(_.isEmpty(text)){
        return [];
    }

    // Only used with String#match / String#replace, which reset lastIndex,
    // so sharing these global regexes across iterations is safe.
    var url_pattern = /https?:\/\/.*[\r\n]*/g;
    var punctuation_pattern = /\.|,|;|!|\?|\(|\)|:|"|^'|'$|“|”|‘|’/g;
    var lone_symbol_pattern = /-|_|@|&|#/g;
    var all_digits_pattern = /^\d+$/;

    var words = text.split(/\s/);
    var unchanged_words = [];
    var low_words = [];

    for(var x = 0; x < words.length; x++){
        // URLs are kept verbatim; everything else loses its punctuation.
        var w = words[x].match(url_pattern) ? words[x] : words[x].replace(punctuation_pattern, '');

        // A word reduced to a single symbol character carries no meaning.
        if(w.length === 1){
            w = w.replace(lone_symbol_pattern, '');
        }

        if(remove_digits && all_digits_pattern.test(w)){
            w = "";
        }

        if(w.length > 0){
            low_words.push(w.toLowerCase());
            unchanged_words.push(w);
        }
    }

    var results = [];
    var stopword_list = settings.stopwords || _getStopwords({ language: language });

    // Index of the most recent keyword. `null` means "none yet", so a leading
    // stopword at index 0 is never mistaken for the start of a keyword run.
    var last_keyword_index = null;
    // Index at which the current run of adjacent keywords started.
    var run_start_index = 0;

    for(var y = 0; y < low_words.length; y++){
        if(stopword_list.indexOf(low_words[y]) >= 0){
            continue;
        }

        var continues_run = last_keyword_index === y - 1;
        if(!continues_run){
            run_start_index = y;
        }

        var is_url = unchanged_words[y].match(url_pattern);
        var result_word = (return_changed_case && !is_url) ? low_words[y] : unchanged_words[y];

        // Chained mode joins without limit; n-gram mode joins while the run is
        // still shorter than the cap. NOTE(review): once a run reaches the cap,
        // its remaining words are emitted individually rather than starting a
        // new n-gram -- kept as in the previous implementation.
        var join_with_previous = continues_run &&
            (return_chained_words || (return_max_ngrams && return_max_ngrams > (y - run_start_index)));

        if(join_with_previous){
            // continues_run guarantees that results already holds the previous keyword.
            results[results.length - 1] = results[results.length - 1] + ' ' + result_word;
        } else {
            results.push(result_word);
        }

        last_keyword_index = y;
    }

    if(remove_duplicates){
        results = _.uniq(results);
    }

    return results;
}
|
95 |
|
/**
 * Look up the stopword list for a language.
 *
 * @param {Object} [options] - Lookup settings.
 * @param {string} [options.language="english"] - Must be listed in
 *   `supported_languages`.
 * @returns {*} The entry stored under that language in the `stopwords` module.
 * @throws {Error} When the language is not supported.
 */
function _getStopwords(options){
    var settings = options || {};
    var requested_language = settings.language || "english";

    var is_supported = supported_languages.indexOf(requested_language) >= 0;
    if(!is_supported){
        throw new Error("Language must be one of ["+supported_languages.join(",")+"]");
    }

    return stopwords[requested_language];
}
|
106 |
|
// Public API: keyword extraction plus access to the bundled stopword lists.
var public_api = {};
public_api.extract = _extract;
public_api.getStopwords = _getStopwords;

module.exports = public_api;
|