1 | 'use strict';
|
2 |
|
3 | const async = require('async');
|
4 | const entities = require('entities');
|
5 | const fs = require('fs');
|
6 | const isutf8 = require('isutf8');
|
7 | const fetch = require('node-fetch');
|
8 | const pth = require('path');
|
9 | const xml2js = require('xml2js');
|
10 | const yaspellerApi = require('yandex-speller');
|
11 | const MarkdownIt = require('markdown-it');
|
12 | const md = new MarkdownIt();
|
13 |
|
14 | const eyo = require('./plugins/eyo');
|
15 | const { getFormat, getApiFormat } = require('./helpers/format');
|
16 | const {
|
17 | hasIgnoredText,
|
18 | ignoreComments,
|
19 | ignoreBlocks,
|
20 | ignoreTags,
|
21 | ignoreLines,
|
22 | } = require('./helpers/ignore');
|
23 | const { consoleDebug } = require('../lib/helpers/console');
|
24 | const { jsonStringify, stripTags } = require('./helpers/string');
|
25 |
|
26 | const MAX_LEN_TEXT = 10000;
|
27 |
|
/**
 * Get the limit of simultaneous requests.
 *
 * @param {Object} settings
 * @param {number} [settings.maxRequest]
 * @returns {number} `settings.maxRequest` when it is truthy, otherwise 2.
 */
function getMaxRequest(settings) {
    const { maxRequest } = settings;

    return maxRequest ? maxRequest : 2;
}
|
31 |
|
32 |
|
33 |
|
34 |
|
35 |
|
36 |
|
37 |
|
38 |
|
39 |
|
40 |
|
41 |
|
42 |
|
43 |
|
/**
 * Check a text for typos with the Yandex.Speller API.
 *
 * The text is cleaned first (user-defined ignored fragments, ignored
 * lines/blocks, HTML/Markdown markup, special symbols), then split into
 * chunks; every chunk is sent as a separate API request.
 *
 * @param {string} originalText - Text to check.
 * @param {Function} callback - Called as callback(err, data, originalText).
 *     On failure `err` is true and `data` is the first request error.
 * @param {Object} [settings] - Copied before use; the copy is what the API receives.
 * @param {string|string[]} [settings.lang] - Defaults to 'en,ru'; arrays are joined with ','.
 * @param {RegExp[]} [settings.ignoreText] - Patterns whose first match (or all, with /g) is removed.
 * @param {*} [settings.ignoreTags] - Passed to ignoreTags() for html/markdown texts.
 * @param {boolean} [settings.checkYo] - Additionally run the Ё (Yo) plugin on success.
 * @param {number} [settings.maxRequest] - Limit of parallel requests.
 */
function checkText(originalText, callback, settings) {
    consoleDebug(`Original text: ${originalText}`);

    const apiSettings = Object.assign({}, settings);
    const format = getFormat(originalText, apiSettings);
    const lang = apiSettings.lang || 'en,ru';

    apiSettings.lang = Array.isArray(lang) ? lang.join(',') : lang;

    let text = originalText;

    if (Array.isArray(apiSettings.ignoreText)) {
        text = apiSettings.ignoreText.reduce((acc, re) => acc.replace(re, ''), text);
    }

    if (hasIgnoredText(text)) {
        text = ignoreBlocks(ignoreLines(text));
    }

    const isMarkdown = format === 'markdown';
    if (isMarkdown || format === 'html') {
        // Markdown is rendered to HTML first, so both formats share the clean-up below.
        if (isMarkdown) {
            text = md.render(text);
        }

        if (apiSettings.ignoreTags) {
            text = ignoreTags(text, apiSettings.ignoreTags);
        }

        text = entities.decodeHTML(stripTags(ignoreComments(text)));
    }

    text = prepareText(text, format);
    consoleDebug(`Prepared text for API: ${text}`);

    const chunks = splitText(text);

    apiSettings.format = getApiFormat(format);

    const tasks = chunks.map((chunk, index) => {
        consoleDebug({
            request: index,
            format,
            apiFormat: apiSettings.format,
            lang: apiSettings.lang,
            options: apiSettings.options,
            text: chunk.substring(0, 128),
        });

        // A failed request is reported as a value, not as a task error,
        // so the remaining requests are still executed.
        return (done) => {
            yaspellerApi.checkText(chunk, (error, body) => {
                done(null, error ? [true, error] : [false, body]);
            }, apiSettings);
        };
    });

    async.parallelLimit(tasks, getMaxRequest(apiSettings), (err, responses) => {
        const merged = mergeResults(responses);

        consoleDebug('Yandex.Speller API response:');
        consoleDebug(jsonStringify(merged));

        if (!merged.err && apiSettings.checkYo) {
            checkYo(text, merged.data);
        }

        callback(merged.err, merged.data, originalText);
    });
}
|
120 |
|
/**
 * Append the findings of the Ё (Yo) plugin to the list of typos.
 *
 * Every finding is stored with the error code 100.
 *
 * @param {string} text - Text handed to the eyo plugin.
 * @param {Object[]} data - List of typos; modified in place.
 */
function checkYo(text, data) {
    eyo(text).forEach(({ position, before, after, count }) => {
        data.push({
            code: 100,
            position,
            word: before,
            s: [after],
            count,
        });
    });
}
|
132 |
|
/**
 * Split a text into chunks of at most MAX_LEN_TEXT characters.
 *
 * The cut point of every chunk but the last one is chosen by getPosition().
 *
 * @param {string} text
 * @returns {string[]} Chunks in their original order; empty for an empty text.
 */
function splitText(text) {
    const chunks = [];
    let begin = 0;

    while (begin < text.length) {
        // The rest fits into a single request.
        if (text.length - begin <= MAX_LEN_TEXT) {
            chunks.push(text.substring(begin));
            break;
        }

        const end = getPosition(text, begin + MAX_LEN_TEXT);
        chunks.push(text.substring(begin, end));
        begin = end;
    }

    return chunks;
}
|
152 |
|
/**
 * Find a cut position that does not break a word.
 *
 * Looks backwards from `start - 1` over at most 500 characters for a space,
 * a line feed or a tab.
 *
 * @param {string} text
 * @param {number} start - Preferred cut position.
 * @returns {number} Index of the nearest whitespace before `start`,
 *     or `start` itself when none is found in the window.
 */
function getPosition(text, start) {
    const depth = 500;
    const separators = new Set([' ', '\n', '\t']);
    const lowest = start - depth;

    let index = start;
    while (--index >= lowest) {
        if (separators.has(text[index])) {
            return index;
        }
    }

    return start;
}
|
164 |
|
/**
 * Merge the results of several API requests.
 *
 * @param {Array[]} res - Pairs of [isError, payload], one per request.
 * @returns {{err: boolean, data: *}} The payload of the first failed request
 *     with `err: true`, otherwise all payloads concatenated with `err: false`.
 */
function mergeResults(res) {
    const failed = res.find((pair) => pair[0]);

    if (failed) {
        return {
            err: true,
            data: failed[1],
        };
    }

    return {
        err: false,
        data: res.reduce((all, pair) => all.concat(pair[1]), []),
    };
}
|
190 |
|
191 |
|
192 |
|
193 |
|
194 |
|
195 |
|
196 |
|
197 |
|
/**
 * Check a text file for typos.
 *
 * The file must exist, be a regular file and contain valid UTF-8.
 * Any failure (including file system errors such as a permission error
 * or a file removed between the checks) is reported through the callback
 * rather than thrown.
 *
 * @param {string} file - Path to the file.
 * @param {Function} callback - Called as callback(err, data, originalText).
 *     On failure: callback(true, Error). On success `data` is
 *     {resource, data, time}, where `time` is the duration of the check in ms.
 * @param {Object} [settings] - Settings for checkText(); not modified.
 *     A copy extended with `extname` (the file extension) is passed on.
 */
function checkFile(file, callback, settings) {
    // Work on a copy so that the caller's (possibly shared) settings stay untouched.
    const fileSettings = Object.assign({}, settings, { extname: pth.extname(file) });

    consoleDebug('Get file: ' + file);

    let content;
    let failure = null;

    // Only the file system access is guarded: the callback is invoked outside
    // of try/catch so that an exception thrown by it cannot trigger a second call.
    try {
        if (!fs.existsSync(file)) {
            failure = Error(file + ': is not exists');
        } else if (!fs.statSync(file).isFile()) {
            failure = Error(file + ': is not file');
        } else {
            const buf = fs.readFileSync(file);
            if (isutf8(buf)) {
                content = buf.toString();
            } else {
                failure = Error(file + ': is not UTF-8');
            }
        }
    } catch (error) {
        failure = error;
    }

    if (failure) {
        callback(true, failure);
        return;
    }

    consoleDebug('Post text → Yandex.Speller API: ' + file);

    const startTime = Date.now();
    checkText(content, function(err, data, originalText) {
        callback(
            err,
            err ? data : { resource: file, data: data, time: Date.now() - startTime },
            originalText
        );
    }, fileSettings);
}
|
228 |
|
229 |
|
230 |
|
231 |
|
232 |
|
233 |
|
234 |
|
235 |
|
/**
 * Download a page and check its text for typos.
 *
 * @param {string} url
 * @param {Function} callback - Called as callback(err, data, originalText).
 *     On failure: callback(true, Error). On success `data` is
 *     {resource, data, time}, where `time` is the duration of the check in ms.
 * @param {Object} [settings] - Settings for checkText(); not modified.
 *     A copy extended with `extname` is passed on.
 */
function checkUrl(url, callback, settings) {
    // The extension is taken from the path only, so a query string or a hash
    // ("page.html?x=1") does not leak into it.
    let pathname = url;
    try {
        pathname = new URL(url).pathname;
    } catch (error) {
        // Not an absolute URL: keep the raw string, fetch() reports the real problem below.
    }

    // A private copy per call: the settings object may be shared by several
    // parallel checkUrl() calls, and `extname` is read only after the download.
    const urlSettings = Object.assign({}, settings, { extname: pth.extname(pathname) });

    consoleDebug('Get url: ' + url);

    fetch(url)
        .then(response => {
            if (!response.ok) {
                // fetch responses expose the HTTP code as `status`.
                throw Error(url + ': returns status code is ' + response.status);
            }

            return response.text();
        })
        .then(text => {
            const startTime = Date.now();
            checkText(text, function(err, data, originalText) {
                callback(
                    err,
                    err ? data : { resource: url, data: data, time: Date.now() - startTime },
                    originalText
                );
            }, urlSettings);
        })
        .catch(error => {
            callback(true, error);
        });
}
|
264 |
|
265 |
|
266 |
|
267 |
|
268 |
|
269 |
|
270 |
|
271 |
|
272 |
|
/**
 * Check every page listed in a sitemap for typos.
 *
 * @param {string} url - URL of the sitemap (XML with urlset/url/loc elements).
 * @param {Function} commonCallback - Called once with the list of results,
 *     each result being a pair [err, data] for one page. When the sitemap
 *     cannot be downloaded or parsed the list holds a single [true, Error].
 * @param {Object} [settings] - Settings for checkUrl(); not modified.
 * @param {number} [settings.maxRequest] - Limit of parallel page checks.
 * @param {Function} [callback] - Called after every checked page as
 *     callback(err, data, originalText), and as callback(true, Error)
 *     when the sitemap itself fails.
 */
function checkSitemap(url, commonCallback, settings, callback) {
    const baseSettings = settings || {};

    consoleDebug('Get sitemap: ' + url);

    // Shared failure path for download and parse errors.
    const fail = (error) => {
        callback && callback(true, error);
        commonCallback([[true, error]]);
    };

    fetch(url)
        .then(res => {
            if (!res.ok) {
                // fetch responses expose the HTTP code as `status`.
                throw Error(url + ': returns status code is ' + res.status);
            }

            return res.text();
        })
        .then(xml => {
            const parser = new xml2js.Parser();
            parser.parseString(xml, function(err, result) {
                if (err) {
                    fail(Error(url + ': error parsing xml'));
                    return;
                }

                const tasks = [];
                const entries = result && result.urlset && Array.isArray(result.urlset.url) ?
                    result.urlset.url :
                    [];

                entries.forEach(function(entry) {
                    entry.loc && entry.loc.forEach(function(loc) {
                        tasks.push(function(cb) {
                            // Every page gets its own copy of the settings: pages are
                            // checked in parallel and must not share per-page state.
                            checkUrl(loc, function(err, data, originalText) {
                                callback && callback(err, data, originalText);
                                cb(null, [err, data]);
                            }, Object.assign({}, baseSettings));
                        });
                    });
                });

                async.parallelLimit(tasks, getMaxRequest(baseSettings), function(err, data) {
                    commonCallback(data);
                });
            });
        })
        .catch(fail);
}
|
325 |
|
326 |
|
327 |
|
328 |
|
329 |
|
330 |
|
331 |
|
/**
 * Add positions (line and column, both 1-based) to the found typos.
 *
 * Items with the code ERROR_TOO_MANY_ERRORS and items that already have
 * a position are left as they are. For ERROR_REPEATED_WORD the searched
 * fragment is the word repeated twice with whitespace in between.
 *
 * A word is matched only as a whole word: it must not be preceded or
 * followed by a Latin or Cyrillic letter.
 *
 * @param {string} text - Text in which the words are searched.
 * @param {Object[]} data - List of typos; `position` is set in place to
 *     an array of {line, column}. The array is empty when more occurrences
 *     are found than `item.count` allows.
 */
function addPositions(text, data) {
    const letters = '[^a-zA-Zа-яА-ЯЁёҐґЄєІіЇї]';
    const nonLetter = new RegExp(letters);
    const lineBreak = /\r\n|\n|\r/;

    data.forEach(function(item) {
        if (item.code === yaspellerApi.ERROR_TOO_MANY_ERRORS || item.position) {
            return;
        }

        // The word is plain text, not a pattern: escape regular expression
        // metacharacters ("c++", "a.b", "(foo") before building the RegExp.
        const escaped = String(item.word).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        const fragment = item.code === yaspellerApi.ERROR_REPEATED_WORD ?
            escaped + '\\s+' + escaped :
            escaped;

        const pattern = new RegExp(fragment + '(?:' + letters + '|$)', 'mg');
        const result = [];
        let match;

        while ((match = pattern.exec(text)) !== null) {
            // Protect against an endless loop on an empty match.
            if (match[0] === '') {
                pattern.lastIndex += 1;
            }

            // Skip occurrences that are the tail of a longer word.
            const prevSymbol = text[match.index - 1];
            if (prevSymbol && !nonLetter.test(prevSymbol)) {
                continue;
            }

            const lines = text.substring(0, match.index).split(lineBreak);

            result.push({
                line: lines.length,
                column: lines[lines.length - 1].length + 1
            });
        }

        item.position = item.count >= result.length ? result : [];
    });
}
|
363 |
|
364 |
|
365 |
|
366 |
|
367 |
|
368 |
|
369 |
|
/**
 * Merge duplicate typos.
 *
 * Items with the same code and word are merged into one: counts are summed
 * (an item without `count` counts as 1) and positions are concatenated.
 * Suggestions are taken from the first occurrence only. Items without
 * a word are dropped.
 *
 * @param {Object[]} data - List of typos ({code, word, s, count, position}).
 * @returns {Object[]} New list of {code, word, count, [suggest], [position]},
 *     grouped by code, words sorted inside each group.
 */
function removeDuplicates(data) {
    // Dictionaries without a prototype: words such as "constructor" or
    // "toString" must not collide with inherited Object properties.
    const byCode = Object.create(null);

    data.forEach(function(el) {
        const { code, word, s } = el;
        const hasPosition = Array.isArray(el.position);

        if (!word) {
            return;
        }

        if (!byCode[code]) {
            byCode[code] = Object.create(null);
        }

        const words = byCode[code];
        const known = words[word];

        if (!known) {
            const entry = {
                code,
                word,
                count: el.count || 1,
            };

            if (Array.isArray(s) && s.length) {
                entry.suggest = s;
            }

            if (hasPosition) {
                entry.position = el.position;
            }

            words[word] = entry;

            return;
        }

        known.count += el.count || 1;
        if (hasPosition) {
            known.position = Array.isArray(known.position) ?
                known.position.concat(el.position) :
                el.position;
        }
    });

    const result = [];

    Object.keys(byCode).forEach(function(code) {
        Object.keys(byCode[code]).sort().forEach(function(word) {
            result.push(byCode[code][word]);
        });
    });

    return result;
}
|
419 |
|
420 |
|
421 |
|
422 |
|
423 |
|
424 |
|
/**
 * Sort typos in place: by code, then by the first position (line, column).
 *
 * Inside one code, typos without positions (missing or empty `position`)
 * go after the typos with positions and are ordered by the lowercased word.
 *
 * @param {Object[]} data - List of typos ({code, word, [position]}).
 */
function sortByPositions(data) {
    const compare = (x, y) => {
        if (x > y) {
            return 1;
        }

        return x < y ? -1 : 0;
    };

    data.sort(function(a, b) {
        const byCode = compare(a.code, b.code);
        if (byCode !== 0) {
            return byCode;
        }

        // A typo may have no `position` at all; treat it as "no positions".
        const posA = Array.isArray(a.position) ? a.position : [];
        const posB = Array.isArray(b.position) ? b.position : [];

        if (!posA.length || !posB.length) {
            if (posA.length === posB.length) {
                // Both are without positions. Equal words give 0 to keep
                // the comparator consistent.
                return compare(a.word.toLowerCase(), b.word.toLowerCase());
            }

            // The typo without positions goes last.
            return posA.length < posB.length ? 1 : -1;
        }

        const byLine = compare(posA[0].line, posB[0].line);

        return byLine !== 0 ? byLine : compare(posA[0].column, posB[0].column);
    });
}
|
480 |
|
/**
 * Prepare a text for the API: normalize line endings,
 * then remove special symbols.
 *
 * @param {string} text
 * @returns {string}
 */
function prepareText(text) {
    return removeSpecialSymbols(fixLineEndings(text));
}
|
487 |
|
/**
 * Normalize line endings.
 *
 * CRLF and CR become LF, whitespace in front of a line feed is removed
 * (which also collapses runs of empty lines), and trailing whitespace
 * at the end of the text is trimmed.
 *
 * @param {string} text
 * @returns {string}
 */
function fixLineEndings(text) {
    const withoutCrLf = text.replace(/\r\n/g, '\n');
    const withoutCr = withoutCrLf.replace(/\r/g, '\n');
    const withoutTrailingSpaces = withoutCr.replace(/\s+\n/g, '\n');

    return withoutTrailingSpaces.trimRight();
}
|
495 |
|
/**
 * Remove symbols that prevent words from being recognized.
 *
 * @param {string} text
 * @returns {string}
 */
function removeSpecialSymbols(text) {
    // Combining acute accent (U+0301) after a Latin or Cyrillic vowel: a stress mark.
    const withoutStressMarks = text.replace(/([aeiouyаеёиоуыэюяєії])\u0301/gi, '$1');

    // Zero-width non-joiner, zero-width joiner and soft hyphen.
    return withoutStressMarks.replace(/[\u200c\u200d\u00ad]/g, '');
}
|
505 |
|
/**
 * Build the list of known error kinds.
 *
 * Takes the errors of the Yandex.Speller API except ERROR_TOO_MANY_ERRORS
 * and appends the error of the Ё (Yo) plugin with the code 100.
 *
 * @returns {Array<{code: number, title: string}>}
 */
function getErrors() {
    const list = [];

    for (const { code, text } of yaspellerApi.errors) {
        if (code !== yaspellerApi.ERROR_TOO_MANY_ERRORS) {
            list.push({ code, title: text });
        }
    }

    list.push({
        code: 100,
        title: 'Letter Ё (Yo)',
    });

    return list;
}
|
517 |
|
518 | module.exports = {
|
519 | addPositions,
|
520 | errors: getErrors(),
|
521 | checkFile,
|
522 | checkSitemap,
|
523 | checkText,
|
524 | checkUrl,
|
525 | removeDuplicates,
|
526 | sortByPositions
|
527 | };
|