// Source: UNPKG file view (4.03 kB, JavaScript).
// Language names accepted by `extract` and `getStopwords`; each name is used
// as the lookup key into the stopwords module. Frozen because the list is
// only ever read (membership check and join for the error message).
const supported_languages = Object.freeze([
  "danish",
  "dutch",
  "english",
  "french",
  "galician",
  "german",
  "italian",
  "polish",
  "portuguese",
  "romanian",
  "russian",
  "spanish",
  "swedish",
  "persian",
  "arabic",
  "czech",
]);
const stopwords = require("./stopwords/stopwords");
20
// A word containing an http(s) URL is kept verbatim (punctuation and case).
const url_pattern = /https?:\/\//;
// Punctuation removed from ordinary words; straight apostrophes are only
// removed at the very start or end of a word.
const punctuation_pattern = /\.|,|;|!|\?|\(|\)|:|"|^'|'$|“|”|‘|’/g;

/**
 * Extracts the non-stopword words ("keywords") from a piece of text.
 *
 * HTML tags are stripped, the text is split on whitespace, punctuation is
 * removed from every word that is not a URL, and words found in the stopword
 * list are dropped.
 *
 * Note on defaults: `remove_digits` and `return_changed_case` default to
 * `true` only when `options` is omitted (or `null`). A caller-supplied
 * options object is used as-is, so any flag it leaves out is treated as
 * falsy.
 *
 * @param {string} str - Text to extract keywords from. Falsy input yields [].
 * @param {Object} [options]
 * @param {boolean} [options.remove_digits] - Drop words made only of digits.
 * @param {boolean} [options.return_changed_case] - Return lower-cased words
 *   (URLs always keep their original case).
 * @param {boolean} [options.return_chained_words] - Join every run of
 *   adjacent keywords into one space-separated result.
 * @param {number} [options.return_max_ngrams] - Join adjacent keywords into
 *   one result of at most this many words; once a run exceeds the limit, the
 *   remaining words of that run are returned individually. Ignored when
 *   `return_chained_words` is set.
 * @param {boolean} [options.remove_duplicates=false] - Keep only the first
 *   occurrence of each result.
 * @param {string} [options.language="english"] - Stopword language.
 * @param {string[]} [options.stopwords] - Custom stopword list; replaces the
 *   list for `language` (the language is still validated).
 * @returns {string[]} Keywords in order of appearance.
 * @throws {Error} If the language is not one of `supported_languages`.
 */
function extract(
  str,
  options = {
    remove_digits: true,
    return_changed_case: true,
  }
) {
  if (!str) {
    return [];
  }

  // Default parameters only cover `undefined`; treat an explicit `null` the
  // same way instead of failing on the first property read.
  const settings =
    options === null
      ? { remove_digits: true, return_changed_case: true }
      : options;
  const {
    return_changed_case,
    return_chained_words,
    remove_digits,
    return_max_ngrams,
  } = settings;
  const language = settings.language || "english";
  const remove_duplicates = settings.remove_duplicates || false;

  if (!supported_languages.includes(language)) {
    throw new Error(
      `Language must be one of [${supported_languages.join(",")}]`
    );
  }

  // Strip any HTML and trim whitespace.
  const text = str.replace(/(<([^>]+)>)/gi, "").trim();
  if (!text) {
    return [];
  }

  // Parallel arrays: the cleaned word and its lower-cased form.
  const original_words = [];
  const lowered_words = [];
  for (const raw_word of text.split(/\s/)) {
    let word = url_pattern.test(raw_word)
      ? raw_word
      : raw_word.replace(punctuation_pattern, "");
    // A lone symbol left over after cleaning is not a keyword.
    if (word.length === 1) {
      word = word.replace(/_|@|&|#/g, "");
    }
    // A word consisting solely of digits is a number; drop it on request.
    if (remove_digits && /^\d+$/.test(word)) {
      word = "";
    }
    if (word.length > 0) {
      lowered_words.push(word.toLowerCase());
      original_words.push(word);
    }
  }

  const stopword_list = settings.stopwords || getStopwords({ language });
  // Use constant-time membership for array lists; anything else keeps the
  // original `indexOf` lookup.
  let is_stopword;
  if (Array.isArray(stopword_list)) {
    const stopword_set = new Set(stopword_list);
    is_stopword = (word) => stopword_set.has(word);
  } else {
    is_stopword = (word) => stopword_list.indexOf(word) >= 0;
  }

  let results = [];
  // -2 means "no keyword seen yet": it can never equal `index - 1`, so a
  // stopword in position 0 is not mistaken for the start of a keyword run.
  let last_keyword_index = -2;
  let run_start_index = 0;
  for (let index = 0; index < lowered_words.length; index++) {
    if (is_stopword(lowered_words[index])) {
      continue;
    }

    const follows_keyword = last_keyword_index === index - 1;
    if (!follows_keyword) {
      run_start_index = index;
    }

    const is_url = url_pattern.test(original_words[index]);
    const result_word =
      return_changed_case && !is_url
        ? lowered_words[index]
        : original_words[index];

    const fits_ngram =
      return_max_ngrams &&
      !return_chained_words &&
      return_max_ngrams > index - run_start_index;

    if (follows_keyword && (return_chained_words || fits_ngram)) {
      // The previous keyword is the tail of the last result; extend it.
      results[results.length - 1] += " " + result_word;
    } else {
      results.push(result_word);
    }

    last_keyword_index = index;
  }

  if (remove_duplicates) {
    // A Set keeps first-occurrence order.
    results = [...new Set(results)];
  }

  return results;
}
126
/**
 * Returns the stopword list for a language.
 *
 * @param {Object} [options]
 * @param {string} [options.language="english"] - One of `supported_languages`.
 * @returns {string[]} The list stored under that language in the stopwords
 *   module. The stored value itself is returned, not a copy.
 * @throws {Error} If the language is not one of `supported_languages`.
 */
function getStopwords(options) {
  // Tolerate a missing or null options argument without reassigning it.
  const language = (options || {}).language || "english";

  if (!supported_languages.includes(language)) {
    throw new Error(
      `Language must be one of [${supported_languages.join(",")}]`
    );
  }

  return stopwords[language];
}
139
140module.exports = {
141 getStopwords,
142 extract,
143}