// Languages accepted by extract() and getStopwords(). Element order is kept
// stable because the list is joined verbatim into the "Language must be one
// of [...]" error message.
const supported_languages = [
  "danish", "dutch", "english", "french",
  "galician", "german", "italian", "polish",
  "portuguese", "romanian", "russian", "spanish",
  "swedish", "persian", "arabic", "czech",
];
|
19 | const stopwords = require("./stopwords/stopwords");
|
20 |
|
/**
 * Extracts keywords from a piece of text.
 *
 * Steps: strip anything that looks like a markup tag, split on whitespace,
 * remove punctuation from every non-URL word, optionally discard words made
 * up only of digits, then keep every word that is not in the stopword list.
 *
 * NOTE: the default `options` object only applies when the argument is omitted
 * entirely. When a caller passes its own object, any flag it leaves out is
 * read as `undefined` (falsy).
 *
 * @param {string} str - Text to extract keywords from. Falsy input yields [].
 * @param {Object} [options]
 * @param {boolean} [options.remove_digits] - Drop words consisting solely of digits.
 * @param {boolean} [options.return_changed_case] - Return keywords lower-cased
 *   (words containing an http(s):// URL always keep their original case).
 * @param {boolean} [options.return_chained_words] - Join every run of adjacent
 *   keywords (no stopword in between) into one space-separated entry.
 * @param {number|boolean} [options.return_max_ngrams] - Join adjacent keywords
 *   into entries of at most this many words; once a run reaches the limit, each
 *   remaining word of that run is emitted on its own. Ignored when
 *   `return_chained_words` is set.
 * @param {string} [options.language="english"] - One of `supported_languages`.
 * @param {boolean} [options.remove_duplicates=false] - Keep only the first
 *   occurrence of each result entry.
 * @param {string[]} [options.stopwords] - Custom stopword list; when absent the
 *   list for `options.language` is loaded through getStopwords().
 * @returns {string[]} Keywords (or keyword chains) in order of appearance.
 * @throws {Error} If the language is not in `supported_languages`.
 */
function extract(
  str,
  options = {
    remove_digits: true,
    return_changed_case: true,
  }
) {
  if (!str) {
    return [];
  }

  const {
    return_changed_case,
    return_chained_words,
    remove_digits,
    return_max_ngrams,
  } = options;
  const language = options.language || "english";
  const remove_duplicates = options.remove_duplicates || false;

  // The language is validated even when a custom stopword list is supplied.
  if (!supported_languages.includes(language)) {
    throw new Error(
      `Language must be one of [${supported_languages.join(",")}]`
    );
  }

  // Remove anything that looks like an HTML/XML tag before tokenizing.
  const text = str.replace(/(<([^>]+)>)/gi, "").trim();
  if (!text) {
    return [];
  }

  // Non-global on purpose: it is only used with .test(), which is stateful
  // for /g regexes.
  const url_pattern = /https?:\/\//;
  const punctuation_pattern = /\.|,|;|!|\?|\(|\)|:|"|^'|'$|“|”|‘|’/g;

  // One entry per surviving word: `lower` is used for stopword matching,
  // `output` is the form that ends up in the result.
  const tokens = [];
  for (const raw of text.split(/\s/)) {
    // URLs are kept verbatim so their punctuation and case survive.
    const is_url = url_pattern.test(raw);
    let word = is_url ? raw : raw.replace(punctuation_pattern, "");

    // A lone symbol left over after cleaning is not a keyword.
    if (word.length === 1) {
      word = word.replace(/_|@|&|#/g, "");
    }
    if (remove_digits && /^\d+$/.test(word)) {
      word = "";
    }
    if (word.length > 0) {
      const lower = word.toLowerCase();
      tokens.push({
        lower,
        output: return_changed_case && !is_url ? lower : word,
      });
    }
  }

  const stopword_list = options.stopwords || getStopwords({ language });
  let results = [];
  // `null` means "no keyword seen yet". Using 0 here would make a keyword at
  // index 1 look like the continuation of a chain starting at index 0 even
  // when word 0 was a stopword, shortening n-grams after a leading stopword.
  let last_keyword_index = null;
  let chain_start_index = 0;

  for (let i = 0; i < tokens.length; i++) {
    const { lower, output } = tokens[i];
    if (stopword_list.indexOf(lower) >= 0) {
      continue;
    }

    const is_chained = last_keyword_index === i - 1;
    if (!is_chained) {
      chain_start_index = i;
    }

    const within_ngram_limit =
      return_max_ngrams && return_max_ngrams > i - chain_start_index;

    if (is_chained && (return_chained_words || within_ngram_limit)) {
      // A chained keyword always has a predecessor in `results`.
      results[results.length - 1] += ` ${output}`;
    } else {
      results.push(output);
    }

    last_keyword_index = i;
  }

  if (remove_duplicates) {
    // Set iteration preserves first-occurrence order.
    results = [...new Set(results)];
  }

  return results;
}
|
126 |
|
/**
 * Looks up the stopword list for a language.
 *
 * @param {Object} [options] - May be omitted or null.
 * @param {string} [options.language="english"] - One of `supported_languages`.
 * @returns {*} Whatever the stopwords module holds under that language key.
 * @throws {Error} If the language is not in `supported_languages`.
 */
function getStopwords(options) {
  const settings = options ? options : {};
  const language = settings.language ? settings.language : "english";

  const is_supported = supported_languages.includes(language);
  if (!is_supported) {
    const allowed = supported_languages.join(",");
    throw new Error("Language must be one of [" + allowed + "]");
  }

  return stopwords[language];
}
|
139 |
|
140 | module.exports = {
|
141 | getStopwords,
|
142 | extract,
|
143 | }
|