// UNPKG — 4.02 kB — JavaScript — View Raw
// Languages accepted by the `language` option of `extract` and `getStopwords`.
// Any other value makes those functions throw.
const supported_languages = [
  "danish",
  "dutch",
  "english",
  "french",
  "galician",
  "german",
  "italian",
  "polish",
  "portuguese",
  "romanian",
  "russian",
  "spanish",
  "swedish",
  "persian",
  "arabic",
];
const stopwords = require("./stopwords/stopwords");
19
// Options used when `extract` is called without an options object.
const DEFAULT_EXTRACT_OPTIONS = Object.freeze({
  remove_digits: true,
  return_changed_case: true,
});

/**
 * Extract keywords from a piece of text by discarding stopwords.
 *
 * HTML tags are stripped, the text is split on whitespace, punctuation is
 * removed from each token (tokens containing an http(s) URL are kept
 * verbatim), and every remaining token that is not a stopword is returned in
 * its original order.
 *
 * The default options apply only when `options` is omitted or `null`. When an
 * options object is supplied, every flag missing from it is treated as falsy
 * (e.g. `{ language: "english" }` keeps digits and the original case).
 *
 * @param {string} str - Text to extract keywords from. Falsy input yields `[]`.
 * @param {Object} [options] - Extraction settings.
 * @param {boolean} [options.remove_digits] - Drop tokens made up solely of the digits 0-9.
 * @param {boolean} [options.return_changed_case] - Return keywords lower-cased; URL tokens keep their case.
 * @param {boolean} [options.return_chained_words] - Join each run of adjacent keywords into one space-separated entry.
 * @param {number} [options.return_max_ngrams] - Join adjacent keywords into entries of at most this many words.
 *   Ignored when `return_chained_words` is set. Once a run has reached the limit, the remaining keywords of
 *   that run are emitted one per entry.
 * @param {boolean} [options.remove_duplicates=false] - Keep only the first occurrence of each result entry.
 * @param {string} [options.language="english"] - Language whose stopword list is used.
 * @param {Iterable<string>} [options.stopwords] - Custom stopwords, compared against lower-cased tokens;
 *   replaces the language's list when provided.
 * @returns {string[]} The extracted keywords (or keyword chains).
 * @throws {Error} If `options.language` is not one of the supported languages.
 */
function extract(str, options = DEFAULT_EXTRACT_OPTIONS) {
  if (!str) {
    return [];
  }

  // A default parameter only covers `undefined`; tolerate an explicit `null` too.
  const settings = options === null ? DEFAULT_EXTRACT_OPTIONS : options;

  const return_changed_case = settings.return_changed_case;
  const return_chained_words = settings.return_chained_words;
  const remove_digits = settings.remove_digits;
  const language = settings.language || "english";
  const remove_duplicates = settings.remove_duplicates || false;
  const return_max_ngrams = settings.return_max_ngrams;

  if (supported_languages.indexOf(language) < 0) {
    throw new Error(
      "Language must be one of [" + supported_languages.join(",") + "]"
    );
  }

  // Strip any HTML and trim whitespace.
  const text = str.replace(/(<([^>]+)>)/gi, "").trim();
  if (!text) {
    return [];
  }

  const url_pattern = /https?:\/\/.*[\r\n]*/;
  const punctuation_pattern = /\.|,|;|!|\?|\(|\)|:|"|^'|'$|“|”|‘|’/g;

  // Parallel lists: the cleaned tokens as written, and their lower-cased form.
  const original_words = [];
  const lowered_words = [];
  for (const token of text.split(/\s/)) {
    // URLs are kept intact; everything else loses its punctuation.
    let word = url_pattern.test(token)
      ? token
      : token.replace(punctuation_pattern, "");
    // A lone leftover symbol is not a word.
    if (word.length === 1) {
      word = word.replace(/_|@|&|#/g, "");
    }
    // Purely numeric tokens are dropped on request.
    if (remove_digits && /^\d+$/.test(word)) {
      word = "";
    }
    if (word.length > 0) {
      lowered_words.push(word.toLowerCase());
      original_words.push(word);
    }
  }

  // A Set gives constant-time membership checks instead of a scan per word.
  const stopword_set = new Set(
    settings.stopwords || getStopwords({ language: language })
  );

  let results = [];
  // Index of the first keyword of the current run of adjacent keywords.
  let run_start_index = 0;
  // Whether the immediately preceding word was a keyword. Tracking this
  // explicitly (rather than comparing against a "last index" that starts at 0)
  // keeps a leading stopword from being mistaken for a keyword at index 0.
  let previous_was_keyword = false;

  for (let i = 0; i < lowered_words.length; i++) {
    if (stopword_set.has(lowered_words[i])) {
      previous_was_keyword = false;
      continue;
    }

    if (!previous_was_keyword) {
      run_start_index = i;
    }

    const keyword =
      return_changed_case && !url_pattern.test(original_words[i])
        ? lowered_words[i]
        : original_words[i];

    const fits_in_ngram =
      Boolean(return_max_ngrams) &&
      !return_chained_words &&
      return_max_ngrams > i - run_start_index;

    if (previous_was_keyword && (return_chained_words || fits_in_ngram)) {
      // Extend the entry produced by the previous keyword.
      results[results.length - 1] = results[results.length - 1] + " " + keyword;
    } else {
      results.push(keyword);
    }

    previous_was_keyword = true;
  }

  if (remove_duplicates) {
    // A Set preserves insertion order, so first occurrences are kept.
    results = Array.from(new Set(results));
  }

  return results;
}
125
/**
 * Look up the stopword list for a language.
 *
 * @param {Object} [options] - Lookup settings; may be omitted or `null`.
 * @param {string} [options.language="english"] - Language to look up.
 * @returns {*} The entry stored under that language in the stopwords module,
 *   returned as-is (not copied). `extract` uses it as a list of lower-cased words.
 * @throws {Error} If the language is not one of the supported languages.
 */
function getStopwords(options) {
  const language = (options || {}).language || "english";

  if (supported_languages.indexOf(language) < 0) {
    throw new Error(
      "Language must be one of [" + supported_languages.join(",") + "]"
    );
  }

  return stopwords[language];
}
138
139module.exports = {
140 getStopwords,
141 extract,
142}