1 | 'use strict';
|
2 |
|
3 | const async = require('async');
|
4 | const entities = require('entities');
|
5 | const fs = require('fs');
|
6 | const eyo = require('./eyo');
|
7 | const formatModule = require('./format');
|
8 | const ignore = require('./ignore');
|
9 | const isutf8 = require('isutf8');
|
10 | const request = require('request');
|
11 | const pth = require('path');
|
12 | const showdown = require('showdown');
|
13 | const xml2js = require('xml2js');
|
14 | const yaspellerApi = require('yandex-speller');
|
15 | const markdownConverter = new showdown.Converter();
|
16 | const printDebug = require('../lib/debug').print;
|
17 | const MAX_LEN_TEXT = 10000;
|
18 | const TOO_MANY_ERRORS = 4;
|
19 |
|
/**
 * Get the limit of simultaneous requests.
 *
 * @param {Object} settings
 * @param {number} [settings.maxRequest] - Used as is when truthy.
 * @returns {number} `settings.maxRequest`, or 2 when it is missing or falsy.
 */
function getMaxRequest(settings) {
    const DEFAULT_MAX_REQUEST = 2;
    const { maxRequest } = settings;

    return maxRequest ? maxRequest : DEFAULT_MAX_REQUEST;
}
|
23 |
|
/**
 * Replace every opening or closing tag (`<` + optional `/` + a latin letter
 * ... `>`) with a single space so that neighbouring words do not merge.
 *
 * @param {string} html
 * @returns {string}
 */
function stripTags(html) {
    const tagPattern = /<\/?[a-z][^>]*>/gi;
    const replacement = ' ';

    return html.replace(tagPattern, replacement);
}
|
27 |
|
28 |
|
29 |
|
30 |
|
31 |
|
32 |
|
33 |
|
34 |
|
35 |
|
36 |
|
37 |
|
38 |
|
39 |
|
/**
 * Check text for typos.
 *
 * The text is cleaned up (ignored fragments, markup, line endings), split
 * into chunks and every chunk is sent to the speller API; the chunk results
 * are merged before `callback` is invoked.
 *
 * @param {string} originalText
 * @param {Function} callback - Called as `callback(err, data, originalText)`.
 * @param {Object} [settings] - Copied, never modified. Fields read here:
 *   `lang` (string or array, defaults to 'en,ru'), `ignoreText` (array of
 *   patterns removed from the text), `ignoreTags`, `checkYo`, `options`
 *   and, through `getMaxRequest`, `maxRequest`.
 */
function checkText(originalText, callback, settings) {
    const apiSettings = Object.assign({}, settings);
    const format = formatModule.getFormat(originalText, apiSettings);

    const lang = apiSettings.lang || 'en,ru';
    apiSettings.lang = Array.isArray(lang) ? lang.join(',') : lang;

    let text = originalText;

    if (Array.isArray(apiSettings.ignoreText)) {
        text = apiSettings.ignoreText.reduce(function(acc, re) {
            return acc.replace(re, '');
        }, text);
    }

    if (ignore.hasIgnoredText(text)) {
        text = ignore.blocks(ignore.lines(text));
    }

    const isMarkdown = format === 'markdown';
    if (isMarkdown || format === 'html') {
        if (isMarkdown) {
            text = markdownConverter.makeHtml(text);
        }

        if (apiSettings.ignoreTags) {
            text = ignore.tags(text, apiSettings.ignoreTags);
        }

        // Comments first, then tags, then entities: decoded entities must
        // not be mistaken for markup.
        text = entities.decodeHTML(stripTags(ignore.comments(text)));
    }

    text = prepareText(text, format);

    const chunks = splitText(text);

    apiSettings.format = formatModule.getApiFormat(format);

    const tasks = chunks.map(function(chunk, index) {
        printDebug({
            request: index,
            format: format,
            apiFormat: apiSettings.format,
            lang: apiSettings.lang,
            options: apiSettings.options,
            text: chunk.substring(0, 128)
        });

        return function(done) {
            yaspellerApi.checkText(chunk, function(error, body) {
                // A failure is reported as data so that the other chunks
                // still run; mergeResults() sorts it out afterwards.
                done(false, error ? [true, error] : [false, body]);
            }, apiSettings);
        };
    });

    async.parallelLimit(tasks, getMaxRequest(apiSettings), function(err, data) {
        const merged = mergeResults(data);

        if (apiSettings.checkYo && !merged.err) {
            checkYo(text, merged.data);
        }

        callback(merged.err, merged.data, originalText);
    });
}
|
110 |
|
/**
 * Append the findings of `eyo(text)` to `data` as typos with code 100.
 *
 * @param {string} text
 * @param {Array} data - Extended in place.
 */
function checkYo(text, data) {
    const YO_CODE = 100;

    eyo(text).forEach(({position, before, after, count}) => {
        data.push({
            code: YO_CODE,
            position,
            word: before,
            s: [after],
            count
        });
    });
}
|
122 |
|
/**
 * Split text into consecutive chunks for separate API requests.
 *
 * A chunk normally ends where `getPosition` says (at most MAX_LEN_TEXT
 * characters after its start); the last chunk takes whatever is left.
 *
 * @param {string} text
 * @returns {string[]} Chunks that give back `text` when joined.
 */
function splitText(text) {
    const chunks = [];
    let from = 0;

    while (from < text.length) {
        const limit = from + MAX_LEN_TEXT;

        if (limit >= text.length) {
            chunks.push(text.substring(from));
            break;
        }

        const to = getPosition(text, limit);
        chunks.push(text.substring(from, to));
        from = to;
    }

    return chunks;
}
|
142 |
|
/**
 * Find where to cut the text: the nearest space, line feed or tab located
 * before `start`, looking back no further than 500 characters.
 *
 * @param {string} text
 * @param {number} start
 * @returns {number} Index of the whitespace found, otherwise `start`.
 */
function getPosition(text, start) {
    const LOOKBACK = 500;
    const lowest = start - LOOKBACK;
    let index = start;

    while (--index >= lowest) {
        const ch = text[index];
        const isBreak = ch === ' ' || ch === '\n' || ch === '\t';

        if (isBreak) {
            return index;
        }
    }

    return start;
}
|
154 |
|
/**
 * Merge the `[hasError, payload]` pairs collected from the requests.
 *
 * @param {Array[]} res
 * @returns {{err: boolean, data: *}} When some pair is flagged as an error,
 *   `err` is true and `data` is the payload of the first such pair.
 *   Otherwise `err` is false and `data` is all payloads concatenated.
 */
function mergeResults(res) {
    for (const pair of res) {
        if (pair[0]) {
            return {
                err: true,
                data: pair[1]
            };
        }
    }

    return {
        err: false,
        data: res.reduce(function(all, pair) {
            return all.concat(pair[1]);
        }, [])
    };
}
|
180 |
|
181 |
|
182 |
|
183 |
|
184 |
|
185 |
|
186 |
|
187 |
|
/**
 * Check a text file for typos.
 *
 * The file is read synchronously and must exist, be a regular file and
 * contain valid UTF-8; otherwise `callback(true, Error)` is called.
 *
 * @param {string} file
 * @param {Function} callback - On success called as
 *   `callback(err, {resource, data, time}, originalText)`, where `time` is
 *   the duration of the check in milliseconds; when the check itself fails
 *   the second argument is the error data instead.
 * @param {Object} [settings] - Receives an `extname` property with the
 *   extension of `file`.
 */
function checkFile(file, callback, settings) {
    settings = settings || {};
    settings.extname = pth.extname(file);

    printDebug('get: ' + file);

    if (!fs.existsSync(file)) {
        callback(true, Error(file + ': is not exists'));
        return;
    }

    if (!fs.statSync(file).isFile()) {
        callback(true, Error(file + ': is not file'));
        return;
    }

    const content = fs.readFileSync(file);
    if (!isutf8(content)) {
        callback(true, Error(file + ': is not utf-8'));
        return;
    }

    printDebug('post text -> Yandex.Speller API: ' + file);

    const startedAt = Date.now();
    checkText(content.toString(), function(err, data, originalText) {
        const payload = err ?
            data :
            {resource: file, data: data, time: Date.now() - startedAt};

        callback(err, payload, originalText);
    }, settings);
}
|
218 |
|
219 |
|
220 |
|
221 |
|
222 |
|
223 |
|
224 |
|
225 |
|
/**
 * Download a page and check its text for typos.
 *
 * @param {string} url
 * @param {Function} callback - On success called as
 *   `callback(err, {resource, data, time}, originalText)`, where `time` is
 *   the duration of the check in milliseconds. On a request error or a
 *   status code other than 200 it is called as `callback(true, error)`.
 * @param {Object} [settings] - Left untouched; the check runs with a copy
 *   extended with `extname` (the extension of `url`).
 */
function checkUrl(url, callback, settings) {
    // Work on a private copy. The request below is asynchronous and
    // checkSitemap() runs several checkUrl() calls in parallel with one
    // shared settings object, so writing `extname` into that object would
    // let one URL's extension be overwritten by another URL's before
    // checkText() reads it.
    const urlSettings = Object.assign({}, settings, {extname: pth.extname(url)});

    printDebug('get: ' + url);

    request.get({
        method: 'GET',
        uri: url,
        gzip: true
    },
    function(error, response, text) {
        if (error) {
            callback(true, error);
            return;
        }

        if (response.statusCode !== 200) {
            callback(true, Error(url + ': returns status code is ' + response.statusCode));
            return;
        }

        const startTime = Date.now();
        checkText(text, function(err, data, originalText) {
            callback(
                err,
                err ? data : {resource: url, data: data, time: Date.now() - startTime},
                originalText
            );
        }, urlSettings);
    });
}
|
258 |
|
259 |
|
260 |
|
261 |
|
262 |
|
263 |
|
264 |
|
265 |
|
266 |
|
/**
 * Check every page listed in a sitemap for typos.
 *
 * @param {string} url - Address of the sitemap XML.
 * @param {Function} commonCallback - Called once with an array of
 *   `[err, data]` pairs: one pair per checked page, or a single
 *   `[true, Error]` pair when the sitemap cannot be loaded or parsed.
 * @param {Object} [settings] - Passed to `checkUrl` for every page;
 *   `maxRequest` limits the number of parallel page checks.
 * @param {Function} [callback] - Called as `callback(err, data)` after each
 *   page, and also for the sitemap-level failure.
 */
function checkSitemap(url, commonCallback, settings, callback) {
    settings = settings || {};

    const results = [];

    // Report a failure of the sitemap itself. `context` becomes `this` of
    // `callback`, matching the `this` of the handler that detected it.
    function reportFailure(context, reason) {
        const entry = [true, reason];

        results.push(entry);
        if (callback) {
            callback.apply(context, entry);
        }
        commonCallback(results);
    }

    printDebug('get: ' + url);

    request.get(url, function(error, response, xml) {
        if (error) {
            reportFailure(this, error);
            return;
        }

        if (response.statusCode !== 200) {
            reportFailure(this, Error(url + ': returns status code is ' + response.statusCode));
            return;
        }

        const parser = new xml2js.Parser();
        parser.parseString(xml, function(parseError, result) {
            if (parseError) {
                reportFailure(this, Error(url + ': error parsing xml'));
                return;
            }

            const tasks = [];
            const hasUrls = result && result.urlset && Array.isArray(result.urlset.url);

            if (hasUrls) {
                result.urlset.url.forEach(function(entry) {
                    if (!entry.loc) {
                        return;
                    }

                    entry.loc.forEach(function(pageUrl) {
                        tasks.push(function(done) {
                            checkUrl(pageUrl, function(err, data) {
                                if (callback) {
                                    callback(err, data);
                                }
                                // Never fail the queue: a broken page must
                                // not cancel the remaining ones.
                                done(false, [err, data]);
                            }, settings);
                        });
                    });
                });
            }

            async.parallelLimit(tasks, getMaxRequest(settings), function(err, data) {
                commonCallback(data);
            });
        });
    });
}
|
325 |
|
326 |
|
327 |
|
328 |
|
329 |
|
330 |
|
331 |
|
/**
 * Add positions (line and column, both 1-based) to found typos.
 *
 * Items with the TOO_MANY_ERRORS code or with an already filled `position`
 * are skipped. For the others every whole-word occurrence of `item.word`
 * in `text` is located; `item.position` receives the list of occurrences,
 * or an empty array when more occurrences are found than `item.count`.
 *
 * @param {string} text
 * @param {Object[]} data - Array of typos, modified in place.
 */
function addPositions(text, data) {
    const letters = '[^a-zA-Zа-яА-ЯЁёҐґЄєІіЇї]';
    const nonLetter = new RegExp(letters);

    data.forEach(function(item) {
        if (item.code === TOO_MANY_ERRORS || item.position) {
            return;
        }

        // The word is plain text, not a pattern: without escaping, words
        // such as "C++" or "(foo" break or distort the regular expression.
        const escapedWord = String(item.word).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        const wordRe = new RegExp(escapedWord + '(?:' + letters + '|$)', 'g');
        const result = [];
        let match;

        while ((match = wordRe.exec(text)) !== null) {
            if (match[0] === '') {
                // Step over an empty match, otherwise exec() never advances.
                wordRe.lastIndex++;
            }

            const index = match.index;
            const prevSymbol = text[index - 1];

            // A letter right before the match means the match is only
            // the tail of a longer word.
            if (prevSymbol && !nonLetter.test(prevSymbol)) {
                continue;
            }

            const lines = text.slice(0, index).split(/\r\n|\n|\r/);

            result.push({
                line: lines.length,
                column: lines[lines.length - 1].length + 1
            });
        }

        item.position = item.count >= result.length ? result : [];
    });
}
|
358 |
|
359 |
|
360 |
|
361 |
|
362 |
|
363 |
|
364 |
|
/**
 * Remove duplicates in typos.
 *
 * Typos are merged by the pair (code, word): counts are summed up
 * (a missing count is taken as 1), positions are concatenated, and the
 * suggestions (`s`, exposed as `suggest`) are taken from the first
 * occurrence only. Items without a word are dropped.
 *
 * @param {Object[]} data - Array of typos.
 * @returns {Object[]} New items `{code, word, count[, suggest][, position]}`
 *   grouped by code, with the words of each code in default sort order.
 */
function removeDuplicates(data) {
    // Dictionaries without a prototype: the keys are arbitrary words from
    // the text, so "constructor", "toString", "__proto__" etc. must not
    // resolve to members inherited from Object.prototype.
    const byCode = Object.create(null);

    data.forEach(function(el) {
        const code = el.code;
        const word = el.word;
        const s = el.s;
        const hasPosition = Array.isArray(el.position);

        if (!word) {
            return;
        }

        if (!byCode[code]) {
            byCode[code] = Object.create(null);
        }

        const words = byCode[code];
        const known = words[word];

        if (!known) {
            const entry = {
                code,
                word,
                count: el.count || 1,
            };

            if (Array.isArray(s) && s.length) {
                entry.suggest = s;
            }

            if (hasPosition) {
                entry.position = el.position;
            }

            words[word] = entry;
            return;
        }

        known.count += el.count || 1;
        if (hasPosition) {
            known.position = Array.isArray(known.position) ?
                known.position.concat(el.position) :
                el.position;
        }
    });

    const result = [];

    Object.keys(byCode).forEach(function(code) {
        Object.keys(byCode[code]).sort().forEach(function(word) {
            result.push(byCode[code][word]);
        });
    });

    return result;
}
|
414 |
|
415 |
|
416 |
|
417 |
|
418 |
|
419 |
|
/**
 * Sort typos in place.
 *
 * Order: by code; within one code the typos with positions go first,
 * ordered by line and then by column of their first position; the typos
 * without positions follow, ordered by lower-cased word. A missing
 * `position` is treated as "no positions".
 *
 * @param {Object[]} data - Array of typos.
 */
function sortByPositions(data) {
    // Three-way comparison; returns 0 for equal (or incomparable) values so
    // that the comparator stays consistent.
    function compare(x, y) {
        if (x > y) {
            return 1;
        }

        if (x < y) {
            return -1;
        }

        return 0;
    }

    data.sort(function(a, b) {
        const byCode = compare(a.code, b.code);
        if (byCode !== 0) {
            return byCode;
        }

        const posA = a.position || [];
        const posB = b.position || [];

        if (!posA.length || !posB.length) {
            if (posA.length === posB.length) {
                // Neither typo has positions.
                return compare(a.word.toLowerCase(), b.word.toLowerCase());
            }

            // Exactly one typo has positions: it goes first.
            return posA.length < posB.length ? 1 : -1;
        }

        return compare(posA[0].line, posB[0].line) ||
            compare(posA[0].column, posB[0].column);
    });
}
|
475 |
|
/**
 * Prepare text for the API: normalize line endings and whitespace, drop
 * special symbols, then trim both ends.
 *
 * @param {string} text
 * @returns {string}
 */
function prepareText(text) {
    return removeSpecialSymbols(fixLineEndings(text)).trim();
}
|
482 |
|
/**
 * Normalize line endings and whitespace by applying the replacements below
 * one after another.
 *
 * Note: `\s` also matches `\n`, so once the "whitespace runs" step has run
 * no line feed is left in the text; the net result is that every run of
 * whitespace, line breaks included, becomes a single space.
 *
 * @param {string} text
 * @returns {string}
 */
function fixLineEndings(text) {
    const steps = [
        [/\r\n/g, '\n'], // Windows line endings
        [/\r/g, '\n'], // lone carriage returns
        [/\s+\n/g, '\n'], // whitespace in front of a line feed
        [/\s+/g, ' '], // whitespace runs
        [/\n+/g, '\n'] // repeated line feeds
    ];

    return steps.reduce(function(result, step) {
        return result.replace(step[0], step[1]);
    }, text);
}
|
490 |
|
/**
 * Remove symbols that get in the way of spell checking:
 * - U+0301 (combining acute accent) placed after one of the listed Latin
 *   or Cyrillic vowels, in either case;
 * - U+200C, U+200D (zero-width non-joiner / joiner) and U+00AD (soft
 *   hyphen) anywhere in the text.
 *
 * @param {string} text
 * @returns {string}
 */
function removeSpecialSymbols(text) {
    const stressedVowel = /([aeiouyаеёиоуыэюяєії])\u0301/gi;
    const invisible = /[\u200c\u200d\u00ad]/g;

    const withoutStress = text.replace(stressedVowel, '$1');

    return withoutStress.replace(invisible, '');
}
|
499 |
|
/**
 * Build the list of known typo kinds: every entry of
 * `yaspellerApi.errorCodes` except the TOO_MANY_ERRORS one, as
 * `{code, title}`, followed by the entry for the letter Ё (code 100).
 *
 * @returns {Array<{code: number, title: string}>}
 */
function getErrors() {
    const errors = [];

    yaspellerApi.errorCodes.forEach(function(el) {
        if (el.code === TOO_MANY_ERRORS) {
            return;
        }

        errors.push({
            code: el.code,
            title: el.text
        });
    });

    errors.push({
        code: 100,
        title: 'Letter Ё (Yo)'
    });

    return errors;
}
|
513 |
|
514 | module.exports = {
|
515 | addPositions,
|
516 | errors: getErrors(),
|
517 | checkFile,
|
518 | checkSitemap,
|
519 | checkText,
|
520 | checkUrl,
|
521 | removeDuplicates,
|
522 | sortByPositions
|
523 | };
|