// Retrieved from the UNPKG file viewer (14 kB, JavaScript).
'use strict';

const async = require('async');
const entities = require('entities');
const fs = require('fs');
const isutf8 = require('isutf8');
const fetch = require('node-fetch');
const pth = require('path');
const xml2js = require('xml2js');
const yaspellerApi = require('yandex-speller');
const MarkdownIt = require('markdown-it');
const md = new MarkdownIt();

const eyo = require('./plugins/eyo');
const { getFormat, getApiFormat } = require('./helpers/format');
const {
    hasIgnoredText,
    ignoreComments,
    ignoreBlocks,
    ignoreTags,
    ignoreLines,
} = require('./helpers/ignore');
const { consoleDebug } = require('../lib/helpers/console');
const { jsonStringify, stripTags } = require('./helpers/string');

const MAX_LEN_TEXT = 10000; // Max length of text for Yandex.Speller API
27
/**
 * Get the limit of simultaneous requests.
 *
 * @param {Object} [settings]
 * @param {number} [settings.maxRequest]
 * @returns {number} `settings.maxRequest` when it is truthy, otherwise 2.
 */
function getMaxRequest(settings) {
    return (settings && settings.maxRequest) || 2;
}
31
/**
 * Check text for typos.
 *
 * The text is cleaned up (ignored fragments removed, markup stripped for
 * html/markdown), split into chunks and every chunk is sent to the
 * Yandex.Speller API. The passed settings object is copied, not mutated.
 *
 * @tutorial settings
 * @param {string} originalText
 * @param {Function} callback Called as `callback(err, data, originalText)`.
 *  `err` is a boolean; when it is true `data` is the error of the first
 *  failed request, otherwise `data` is the merged array of typos.
 * @param {Object} [settings]
 * @param {string} [settings.format] Text format: plain or html.
 * @param {string|Array} [settings.lang] Language: en, ru or uk.
 * @param {Array<RegExp>} [settings.ignoreText]
 * @param {*} [settings.ignoreTags] Passed through to `ignoreTags()`.
 * @param {boolean} [settings.checkYo] Also run the letter Ё check.
 * @param {number} [settings.maxRequest] Limit of parallel API requests.
 * @param {Object} [settings.options]
 */
function checkText(originalText, callback, settings) {
    let text = originalText;

    consoleDebug(`Original text: ${originalText}`);

    const apiSettings = Object.assign({}, settings);
    const format = getFormat(text, apiSettings);
    const lang = apiSettings.lang || 'en,ru';

    apiSettings.lang = Array.isArray(lang) ? lang.join(',') : lang;

    if (Array.isArray(apiSettings.ignoreText)) {
        apiSettings.ignoreText.forEach(function(re) {
            text = text.replace(re, '');
        });
    }

    if (hasIgnoredText(text)) {
        text = ignoreLines(text);
        text = ignoreBlocks(text);
    }

    if (format === 'html' || format === 'markdown') {
        if (format === 'markdown') {
            // Markdown is rendered to HTML first and then handled as HTML.
            text = md.render(text);
        }

        if (apiSettings.ignoreTags) {
            text = ignoreTags(text, apiSettings.ignoreTags);
        }

        text = ignoreComments(text);
        text = stripTags(text);
        text = entities.decodeHTML(text);
    }

    text = prepareText(text);
    consoleDebug(`Prepared text for API: ${text}`);

    const texts = splitText(text);

    apiSettings.format = getApiFormat(format);

    const tasks = texts.map(function(chunk, i) {
        consoleDebug({
            request: i,
            format: format,
            apiFormat: apiSettings.format,
            lang: apiSettings.lang,
            options: apiSettings.options,
            text: chunk.substring(0, 128),
        });

        return function(cb) {
            yaspellerApi.checkText(chunk, function(error, body) {
                // The error is passed as a value (first argument is always
                // null), so one failed request does not abort the others;
                // mergeResults() picks the failure up afterwards.
                if (error) {
                    cb(null, [true, error]);
                } else {
                    cb(null, [false, body]);
                }
            }, apiSettings);
        };
    });

    async.parallelLimit(tasks, getMaxRequest(apiSettings), function(err, data) {
        const buf = mergeResults(data);

        consoleDebug('Yandex.Speller API response:');
        consoleDebug(jsonStringify(buf));

        if (!buf.err && apiSettings.checkYo) {
            checkYo(text, buf.data);
        }

        callback(buf.err, buf.data, originalText);
    });
}
120
/**
 * Append the results of the letter Ё (Yo) check to the list of typos.
 *
 * Every item returned by `eyo(text)` is pushed to `data` with code 100
 * (the same code that `getErrors()` lists as 'Letter Ё (Yo)').
 *
 * @param {string} text
 * @param {Object[]} data - Array of typos, modified in place.
 */
function checkYo(text, data) {
    eyo(text).forEach(function(found) {
        data.push({
            code: 100,
            position: found.position,
            word: found.before,
            s: [found.after],
            count: found.count
        });
    });
}
132
/**
 * Split text into chunks of at most MAX_LEN_TEXT characters.
 *
 * The cut position of every chunk except the last one is chosen by
 * `getPosition()`. No characters are dropped: joining the chunks gives
 * the original text.
 *
 * @param {string} text
 * @returns {string[]} Empty array for an empty text.
 */
function splitText(text) {
    const texts = [];

    let pos = 0;

    while (pos < text.length) {
        if (pos + MAX_LEN_TEXT >= text.length) {
            // The rest fits into a single request.
            texts.push(text.substring(pos));
            break;
        }

        const newPos = getPosition(text, pos + MAX_LEN_TEXT);
        texts.push(text.substring(pos, newPos));
        pos = newPos;
    }

    return texts;
}
152
/**
 * Find a position to cut the text at, preferring a whitespace character.
 *
 * Scans backwards from `start - 1` over at most 500 characters and
 * returns the index of the first space, line feed or tab found.
 *
 * @param {string} text
 * @param {number} start - Desired cut position.
 * @returns {number} Index of the whitespace, or `start` if none was found.
 */
function getPosition(text, start) {
    const depth = 500; // MAX_LEN_TEXT / 20
    // Never scan below index 0: there is nothing to find there.
    const lowest = Math.max(0, start - depth);

    for (let i = start - 1; i >= lowest; i--) {
        const sym = text[i];
        if (sym === ' ' || sym === '\n' || sym === '\t') {
            return i;
        }
    }

    return start;
}
164
/**
 * Merge the results of several API requests into one.
 *
 * @param {Array[]} res - List of `[hasError, payload]` pairs.
 * @returns {{err: boolean, data: *}} If any pair has an error, `err` is true
 *  and `data` is the payload of the first such pair. Otherwise `err` is
 *  false and `data` is the concatenation of all payloads.
 */
function mergeResults(res) {
    const list = res || [];
    const failed = list.find(function(el) {
        return el[0];
    });

    if (failed) {
        return {
            err: true,
            data: failed[1]
        };
    }

    return {
        err: false,
        data: list.reduce(function(acc, el) {
            return acc.concat(el[1]);
        }, [])
    };
}
190
/**
 * Check text in file on typos.
 *
 * The file is read synchronously; it must exist, be a regular file and
 * contain valid UTF-8. The passed settings object is not modified: a copy
 * with `extname` set to the extension of the file is given to `checkText()`.
 *
 * @param {string} file
 * @param {Function} callback Called as `callback(err, data, originalText)`.
 *  On failure `err` is true and `data` is the error (no `originalText`
 *  when the file could not be read). On success `data` is
 *  `{resource, data, time}` where `time` is the duration in milliseconds.
 * @param {Object} [settings] See {@tutorial options}
 */
function checkFile(file, callback, settings) {
    const fileSettings = Object.assign({}, settings, {
        extname: pth.extname(file)
    });

    consoleDebug('Get file: ' + file);

    let buf;
    let error = null;

    // The callback is deliberately called outside of this try block, so an
    // exception thrown by the callback itself is never reported back to it.
    try {
        if (!fs.existsSync(file)) {
            error = Error(file + ': is not exists');
        } else if (!fs.statSync(file).isFile()) {
            error = Error(file + ': is not file');
        } else {
            buf = fs.readFileSync(file);
            if (!isutf8(buf)) {
                error = Error(file + ': is not UTF-8');
            }
        }
    } catch (e) {
        // E.g. the file disappeared or is not readable.
        error = e;
    }

    if (error) {
        callback(true, error);
        return;
    }

    consoleDebug('Post text → Yandex.Speller API: ' + file);

    const startTime = Date.now();
    checkText(buf.toString(), function(err, data, originalText) {
        callback(
            err,
            err ? data : {resource: file, data: data, time: Date.now() - startTime},
            originalText
        );
    }, fileSettings);
}
228
/**
 * Check text on link for typos.
 *
 * The passed settings object is not modified: a copy with `extname` set to
 * the extension of the url is given to `checkText()`. This matters when the
 * same settings object is shared by several requests running in parallel.
 *
 * @param {string} url
 * @param {Function} callback Called as `callback(err, data, originalText)`.
 *  On failure `err` is true and `data` is the error. On success `data` is
 *  `{resource, data, time}` where `time` is the duration in milliseconds.
 * @param {Object} [settings] See {@tutorial settings}
 */
function checkUrl(url, callback, settings) {
    const urlSettings = Object.assign({}, settings, {
        extname: pth.extname(url)
    });

    consoleDebug('Get url: ' + url);

    fetch(url)
        .then(response => {
            if (!response.ok) {
                // Response exposes the HTTP code as `status`.
                throw Error(url + ': returns status code is ' + response.status);
            }

            return response.text();
        })
        .then(text => {
            const startTime = Date.now();
            checkText(text, function(err, data, originalText) {
                callback(
                    err,
                    err ? data : {resource: url, data: data, time: Date.now() - startTime},
                    originalText
                );
            }, urlSettings);
        })
        .catch(error => {
            callback(true, error);
        });
}
264
/**
 * Check text on pages of sitemap.xml.
 *
 * Downloads and parses the sitemap, then checks every `<loc>` of every
 * `<url>` of the `<urlset>` with `checkUrl()`, running at most
 * `getMaxRequest(settings)` checks at the same time.
 *
 * @param {string} url
 * @param {Function} commonCallback - Common callback. Called with an array
 *  of `[err, data]` pairs, one per checked page. If the sitemap itself
 *  could not be loaded or parsed, the array holds one `[true, Error]` pair.
 * @param {Object} [settings] See {@tutorial settings}
 * @param {Function} [callback] callback - Callback on each url.
 */
function checkSitemap(url, commonCallback, settings, callback) {
    const baseSettings = settings || {};
    const results = [];

    // Report a failure of the sitemap itself to both callbacks.
    const fail = function(error) {
        const obj = [true, error];
        results.push(obj);
        callback && callback(obj[0], obj[1]);
        commonCallback(results);
    };

    consoleDebug('Get sitemap: ' + url);

    fetch(url)
        .then(res => {
            if (!res.ok) {
                // Response exposes the HTTP code as `status`.
                throw Error(url + ': returns status code is ' + res.status);
            }

            return res.text();
        })
        .then(xml => {
            const parser = new xml2js.Parser();
            parser.parseString(xml, function(parseError, result) {
                if (parseError) {
                    fail(Error(url + ': error parsing xml'));
                    return;
                }

                const tasks = [];
                if (result && result.urlset && Array.isArray(result.urlset.url)) {
                    result.urlset.url.forEach(function(el) {
                        if (!Array.isArray(el.loc)) {
                            return;
                        }

                        el.loc.forEach(function(pageUrl) {
                            tasks.push(function(cb) {
                                // Every page gets its own copy of the settings,
                                // so parallel checks cannot affect each other.
                                checkUrl(pageUrl, function(err, data, originalText) {
                                    callback && callback(err, data, originalText);
                                    cb(null, [err, data]);
                                }, Object.assign({}, baseSettings));
                            });
                        });
                    });
                }

                async.parallelLimit(tasks, getMaxRequest(baseSettings), function(err, data) {
                    commonCallback(data);
                });
            });
        })
        .catch(error => {
            fail(error);
        });
}
325
/**
 * Add positions (line number and column number) for typos.
 *
 * Items that already have a `position` and the "too many errors" item are
 * left untouched. For every other item all whole-word occurrences of
 * `item.word` are searched in the text. The found positions are stored in
 * `item.position` only if there are not more of them than `item.count`,
 * otherwise `item.position` becomes an empty array.
 *
 * @param {string} text
 * @param {Object[]} data - Array of typos, modified in place.
 */
function addPositions(text, data) {
    const nonLetterClass = '[^a-zA-Zа-яА-ЯЁёҐґЄєІіЇї]';
    const nonLetterRe = new RegExp(nonLetterClass);

    // The word comes from the checked text and may contain characters that
    // are special in regular expressions ("C++", "a.b", "(c)").
    const escapeRegExp = function(str) {
        return String(str).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    };

    data.forEach(function(item) {
        if (item.code === yaspellerApi.ERROR_TOO_MANY_ERRORS || item.position) {
            return;
        }

        const result = [];
        let pattern = escapeRegExp(item.word);

        if (item.code === yaspellerApi.ERROR_REPEATED_WORD) {
            pattern = pattern + '\\s+' + pattern;
        }

        // The word must be followed by a non-letter or by the end of a line.
        const re = new RegExp(pattern + '(?:' + nonLetterClass + '|$)', 'mg');
        let match;

        while ((match = re.exec(text)) !== null) {
            const index = match.index;

            if (match[0] === '') {
                // Step over an empty match, otherwise exec() never advances.
                re.lastIndex++;
            }

            // ...and must not be preceded by a letter.
            const prevSymbol = text[index - 1];
            if (prevSymbol && !nonLetterRe.test(prevSymbol)) {
                continue;
            }

            const lines = text.slice(0, index).split(/\r\n|\n|\r/);

            result.push({
                line: lines.length,
                column: lines[lines.length - 1].length + 1
            });
        }

        item.position = item.count >= result.length ? result : [];
    });
}
363
/**
 * Remove duplicates in typos.
 *
 * Items with the same code and word are merged into one: their counts are
 * summed (an item without `count` counts as 1) and their positions are
 * concatenated. Suggestions (`s`) are taken from the first item of a group
 * and stored as `suggest`. Items without a word are dropped.
 *
 * @param {Object[]} data - Array of typos.
 * @returns {Object[]} New array grouped by code, words sorted inside a code.
 */
function removeDuplicates(data) {
    const result = [];
    // Prototype-less dictionaries: words such as "constructor" or
    // "toString" must not be mistaken for already seen entries.
    const byCode = Object.create(null);

    data.forEach(function(el) {
        const code = el.code;
        const word = el.word;
        const s = el.s;
        const hasPosition = Array.isArray(el.position);

        if (!word) {
            return;
        }

        byCode[code] = byCode[code] || Object.create(null);

        const known = byCode[code][word];
        if (!known) {
            const entry = {
                code,
                word,
                count: el.count || 1,
            };

            if (Array.isArray(s) && s.length) {
                entry.suggest = s;
            }

            if (hasPosition) {
                entry.position = el.position;
            }

            byCode[code][word] = entry;
            return;
        }

        known.count += el.count || 1;
        if (hasPosition) {
            known.position = Array.isArray(known.position) ?
                known.position.concat(el.position) :
                el.position;
        }
    });

    Object.keys(byCode).forEach(function(code) {
        Object.keys(byCode[code]).sort().forEach(function(word) {
            result.push(byCode[code][word]);
        });
    });

    return result;
}
419
/**
 * Sort results by positions.
 *
 * Order: by code, then items with a position before items without one.
 * Items with positions are ordered by line and column of their first
 * position, items without positions by lowercased word.
 * A missing `position` is treated like an empty one.
 *
 * @param {Object[]} data - Array of typos, sorted in place.
 */
function sortByPositions(data) {
    const compare = function(x, y) {
        if (x > y) {
            return 1;
        }

        return x < y ? -1 : 0;
    };

    data.sort(function(a, b) {
        // Sort by a code
        const byCode = compare(a.code, b.code);
        if (byCode) {
            return byCode;
        }

        const posA = Array.isArray(a.position) ? a.position : [];
        const posB = Array.isArray(b.position) ? b.position : [];

        // No position
        if (!posA.length || !posB.length) {
            if (posA.length === posB.length) {
                // Sort by a word; equal words compare as equal to keep
                // the comparator consistent.
                return compare(a.word.toLowerCase(), b.word.toLowerCase());
            }

            // The item without a position goes last.
            return posA.length < posB.length ? 1 : -1;
        }

        // Sort by a line, then by a column
        return compare(posA[0].line, posB[0].line) ||
            compare(posA[0].column, posB[0].column);
    });
}
480
/**
 * Prepare text for the API: normalize line endings first, then remove
 * special symbols.
 *
 * @param {string} text
 * @returns {string}
 */
function prepareText(text) {
    return removeSpecialSymbols(fixLineEndings(text));
}
487
/**
 * Normalize line endings to "\n" and remove trailing whitespace.
 *
 * NOTE(review): `\s` also matches "\n", so the third replacement collapses
 * runs of blank lines into a single line break as well. Kept as is, since
 * callers may rely on it.
 *
 * @param {string} text
 * @returns {string}
 */
function fixLineEndings(text) {
    return text
        .replace(/\r\n/g, '\n') // Fix Windows
        .replace(/\r/g, '\n') // Fix MacOS
        .replace(/\s+\n/g, '\n') // Trailing spaces
        .trimRight();
}
495
/**
 * Remove symbols that are invisible or only mark stress.
 *
 * Removes the combining acute accent (U+0301) after a vowel, and every
 * zero-width non-joiner (U+200C), zero-width joiner (U+200D) and
 * soft hyphen (U+00AD).
 *
 * @param {string} text
 * @returns {string}
 */
function removeSpecialSymbols(text) {
    return text
        // en: aeiouy
        // ru: аеёиоуыэюя
        // uk: аеєиіїоуюя
        .replace(/([aeiouyаеёиоуыэюяєії])\u0301/gi, '$1') // Acute accent
        // eslint-disable-next-line no-misleading-character-class
        .replace(/[\u200c\u200d\u00ad]/g, ''); // Zero-width non-joiner, Zero-width joiner and shy
}
505
/**
 * Get the list of error types.
 *
 * Takes the error list of the yandex-speller module without the
 * "too many errors" entry and appends the letter Ё (Yo) error.
 *
 * @returns {Array<{code: number, title: string}>}
 */
function getErrors() {
    const apiErrors = yaspellerApi.errors
        .filter(el => el.code !== yaspellerApi.ERROR_TOO_MANY_ERRORS)
        .map(el => ({
            code: el.code,
            title: el.text
        }));

    return apiErrors.concat({
        code: 100, // ERROR_EYO
        title: 'Letter Ё (Yo)'
    });
}
517
518module.exports = {
519 addPositions,
520 errors: getErrors(),
521 checkFile,
522 checkSitemap,
523 checkText,
524 checkUrl,
525 removeDuplicates,
526 sortByPositions
527};