// Source listing retrieved from UNPKG (JavaScript, 13.6 kB).
1'use strict';
2
3const async = require('async');
4const entities = require('entities');
5const fs = require('fs');
6const eyo = require('./eyo');
7const formatModule = require('./format');
8const ignore = require('./ignore');
9const isutf8 = require('isutf8');
10const request = require('request');
11const pth = require('path');
12const showdown = require('showdown');
13const xml2js = require('xml2js');
14const yaspellerApi = require('yandex-speller');
// Single Markdown -> HTML converter instance reused by checkText().
const markdownConverter = new showdown.Converter();
const {print: printDebug} = require('../lib/debug');

// Max length of text for Yandex.Speller API
const MAX_LEN_TEXT = 10000;

// Error code that is left out of the exported error list and is never given positions.
const TOO_MANY_ERRORS = 4;
19
/**
 * Get the limit of simultaneous requests.
 *
 * @param {Object} settings
 * @param {number} [settings.maxRequest]
 * @returns {number} `settings.maxRequest` when it is truthy, otherwise 2.
 */
function getMaxRequest(settings) {
    const limit = settings.maxRequest;

    return limit ? limit : 2;
}
23
/**
 * Replace every opening or closing HTML tag with a single space.
 *
 * @param {string} html
 * @returns {string}
 */
function stripTags(html) {
    const tagPattern = /<\/?[a-z][^>]*>/gi;

    return html.replace(tagPattern, ' ');
}
27
/**
 * Check text for typos.
 *
 * The text is cleaned up (ignored fragments, HTML/Markdown markup, special
 * symbols), split into chunks and every chunk is sent to Yandex.Speller.
 *
 * @param {string} originalText
 * @param {Function} callback Called as `callback(err, data, originalText)`.
 *   `err` is `true` when any request failed (then `data` is that error),
 *   otherwise `false` and `data` is the merged array of typos.
 * @tutorial settings
 * @param {Object} [settings]
 * @param {string} [settings.format] Text format: plain or html.
 * @param {string|Array} [settings.lang] Language: en, ru or uk.
 * @param {Array<RegExp>} [settings.ignoreText]
 * @param {*} [settings.ignoreTags] Passed to `ignore.tags()` for html and markdown.
 * @param {boolean} [settings.checkYo] Also report words found by the Yo checker.
 * @param {number} [settings.maxRequest] Limit of simultaneous requests.
 * @param {Object} [settings.options]
 */
function checkText(originalText, callback, settings) {
    // Never touch the caller's settings: everything below works on a copy.
    const apiSettings = Object.assign({}, settings);
    let text = originalText;

    const format = formatModule.getFormat(text, apiSettings);
    const lang = apiSettings.lang || 'en,ru';

    apiSettings.lang = Array.isArray(lang) ? lang.join(',') : lang;

    if (Array.isArray(apiSettings.ignoreText)) {
        apiSettings.ignoreText.forEach(function(re) {
            text = text.replace(re, '');
        });
    }

    if (ignore.hasIgnoredText(text)) {
        text = ignore.blocks(ignore.lines(text));
    }

    const isMarkdown = format === 'markdown';
    if (isMarkdown || format === 'html') {
        // Markdown is checked through its HTML rendering.
        if (isMarkdown) {
            text = markdownConverter.makeHtml(text);
        }

        if (apiSettings.ignoreTags) {
            text = ignore.tags(text, apiSettings.ignoreTags);
        }

        text = entities.decodeHTML(stripTags(ignore.comments(text)));
    }

    text = prepareText(text, format);

    const chunks = splitText(text);

    apiSettings.format = formatModule.getApiFormat(format);

    const tasks = chunks.map(function(chunk, index) {
        printDebug({
            request: index,
            format: format,
            apiFormat: apiSettings.format,
            lang: apiSettings.lang,
            options: apiSettings.options,
            text: chunk.substring(0, 128)
        });

        return function(done) {
            yaspellerApi.checkText(chunk, function(error, body) {
                // A failure is reported as data, so the other chunks still run.
                done(false, error ? [true, error] : [false, body]);
            }, apiSettings);
        };
    });

    async.parallelLimit(tasks, getMaxRequest(apiSettings), function(err, data) {
        const merged = mergeResults(data);

        if (!merged.err && apiSettings.checkYo) {
            checkYo(text, merged.data);
        }

        callback(merged.err, merged.data, originalText);
    });
}
110
/**
 * Append the words reported by the Yo checker to the list of typos.
 *
 * @param {string} text
 * @param {Object[]} data - Array of typos, extended in place with code 100 items.
 */
function checkYo(text, data) {
    const found = eyo(text);

    found.forEach(function(item) {
        const typo = {
            code: 100,
            position: item.position,
            word: item.before,
            s: [item.after],
            count: item.count
        };

        data.push(typo);
    });
}
122
/**
 * Split text into chunks no longer than MAX_LEN_TEXT.
 *
 * The end of every chunk except the last one is chosen by getPosition().
 *
 * @param {string} text
 * @returns {string[]} Empty array for empty text.
 */
function splitText(text) {
    const chunks = [];
    let offset = 0;

    while (offset < text.length) {
        const limit = offset + MAX_LEN_TEXT;

        // The rest fits into a single request.
        if (limit >= text.length) {
            chunks.push(text.substring(offset));
            break;
        }

        const cut = getPosition(text, limit);
        chunks.push(text.substring(offset, cut));
        offset = cut;
    }

    return chunks;
}
142
/**
 * Find a place to cut the text: the nearest space, line feed or tab
 * within 500 symbols before `start`.
 *
 * @param {string} text
 * @param {number} start
 * @returns {number} Index of the found symbol, or `start` when there is none.
 */
function getPosition(text, start) {
    const depth = 500; // MAX_LEN_TEXT / 20
    const lowest = start - depth;
    let index = start - 1;

    while (index >= lowest) {
        const symbol = text[index];
        const isSeparator = symbol === ' ' || symbol === '\n' || symbol === '\t';

        if (isSeparator) {
            return index;
        }

        index--;
    }

    return start;
}
154
/**
 * Merge results of separate requests.
 *
 * @param {Array[]} res - Pairs `[hasError, payload]`.
 * @returns {{err: boolean, data: *}} The payload of the first failed pair,
 *   or the concatenation of all payloads when nothing failed.
 */
function mergeResults(res) {
    const failed = res.find(function(pair) {
        return pair[0];
    });

    if (failed) {
        return {
            err: true,
            data: failed[1]
        };
    }

    const merged = res.reduce(function(all, pair) {
        return all.concat(pair[1]);
    }, []);

    return {
        err: false,
        data: merged
    };
}
180
/**
 * Check text in file on typos.
 *
 * The file is read synchronously and must be a regular UTF-8 file.
 *
 * @param {string} file
 * @param {Function} callback Called as `callback(err, data, originalText)`.
 *   On failure `err` is `true` and `data` is the error; on success `data` is
 *   `{resource, data, time}` where `time` is the duration of the check in ms.
 * @param {Object} [settings] See {@tutorial options}
 */
function checkFile(file, callback, settings) {
    // The extension is added to a copy so the caller's settings object is never modified.
    const fileSettings = Object.assign({}, settings, {extname: pth.extname(file)});

    printDebug('get: ' + file);

    if (!fs.existsSync(file)) {
        callback(true, Error(file + ': is not exists'));
        return;
    }

    if (!fs.statSync(file).isFile()) {
        callback(true, Error(file + ': is not file'));
        return;
    }

    const buf = fs.readFileSync(file);
    if (!isutf8(buf)) {
        callback(true, Error(file + ': is not utf-8'));
        return;
    }

    printDebug('post text -> Yandex.Speller API: ' + file);

    const startTime = Date.now();
    checkText(buf.toString(), function(err, data, originalText) {
        callback(
            err,
            err ? data : {resource: file, data: data, time: Date.now() - startTime},
            originalText
        );
    }, fileSettings);
}
218
/**
 * Check text on link for typos.
 *
 * @param {string} url
 * @param {Function} callback Called as `callback(err, data, originalText)`.
 *   On failure `err` is `true` and `data` is the error; on success `data` is
 *   `{resource, data, time}` where `time` is the duration of the check in ms.
 * @param {Object} [settings] See {@tutorial settings}
 */
function checkUrl(url, callback, settings) {
    // Per-call copy: the same settings object may be shared by several
    // requests running in parallel (see checkSitemap), and the response
    // arrives later, so the extension must not be stored on the shared object.
    const urlSettings = Object.assign({}, settings, {extname: pth.extname(url)});

    printDebug('get: ' + url);

    request.get({
        method: 'GET',
        uri: url,
        gzip: true
    },
    function(error, response, text) {
        if (error) {
            callback(true, error);
            return;
        }

        if (response.statusCode !== 200) {
            callback(true, Error(url + ': returns status code is ' + response.statusCode));
            return;
        }

        const startTime = Date.now();
        checkText(text, function(err, data, originalText) {
            callback(
                err,
                err ? data : {resource: url, data: data, time: Date.now() - startTime},
                originalText
            );
        }, urlSettings);
    });
}
258
/**
 * Check text on pages of sitemap.xml.
 *
 * Every `loc` of every `urlset.url` entry is checked with checkUrl().
 *
 * @param {string} url
 * @param {Function} commonCallback - Common callback, receives an array of
 *   `[err, data]` pairs (a single pair when the sitemap itself cannot be used).
 * @param {Object} [settings] See {@tutorial settings}
 * @param {Function} [callback] callback - Callback on each url.
 */
function checkSitemap(url, commonCallback, settings, callback) {
    const options = settings || {};
    const results = [];

    // Store a failure, notify the per-url callback (if any) and finish.
    function fail(context, error) {
        const entry = [true, error];

        results.push(entry);
        callback && callback.apply(context, entry);
        commonCallback(results);
    }

    printDebug('get: ' + url);

    request.get(url, function(error, response, xml) {
        if (error) {
            fail(this, error);
            return;
        }

        if (response.statusCode !== 200) {
            fail(this, Error(url + ': returns status code is ' + response.statusCode));
            return;
        }

        new xml2js.Parser().parseString(xml, function(err, result) {
            if (err) {
                fail(this, Error(url + ': error parsing xml'));
                return;
            }

            const hasUrls = result && result.urlset && Array.isArray(result.urlset.url);
            const entries = hasUrls ? result.urlset.url : [];
            const tasks = [];

            entries.forEach(function(entry) {
                entry.loc && entry.loc.forEach(function(pageUrl) {
                    tasks.push(function(done) {
                        checkUrl(pageUrl, function(err, data) {
                            callback && callback(err, data);
                            // Errors travel as data, so one bad page does not stop the rest.
                            done(false, [err, data]);
                        }, options);
                    });
                });
            });

            async.parallelLimit(tasks, getMaxRequest(options), function(err, data) {
                commonCallback(data);
            });
        });
    });
}
325
/**
 * Add positions (line number and column number) for typos.
 *
 * Items with the TOO_MANY_ERRORS code or with an existing position are left
 * untouched. A word is only counted when it is not glued to a letter on
 * either side. The found positions are kept only when their number does not
 * exceed `item.count`; otherwise the position becomes an empty array.
 *
 * @param {string} text
 * @param {Object[]} data - Array of typos, modified in place.
 */
function addPositions(text, data) {
    const letters = '[^a-zA-Zа-яА-ЯЁёҐґЄєІіЇї]';
    const nonLetter = new RegExp(letters);
    const lineBreak = /\r\n|\n|\r/;

    data.forEach(function(item) {
        if (item.code === TOO_MANY_ERRORS || item.position) {
            return;
        }

        const result = [];

        // The word comes from the checked text, so it must be matched
        // literally: "c++" or "e.g" would otherwise be read as a pattern.
        const escapedWord = String(item.word).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        const pattern = new RegExp(escapedWord + '(?:' + letters + '|$)', 'g');

        let match;
        while ((match = pattern.exec(text)) !== null) {
            const index = match.index;

            // An empty match does not advance lastIndex by itself.
            if (match[0] === '') {
                pattern.lastIndex += 1;
            }

            // Skip matches that are the tail of a longer word.
            const prevSymbol = text[index - 1];
            if (prevSymbol && !nonLetter.test(prevSymbol)) {
                continue;
            }

            const lines = text.slice(0, index).split(lineBreak);

            result.push({
                line: lines.length,
                column: lines[lines.length - 1].length + 1
            });
        }

        item.position = item.count >= result.length ? result : [];
    });
}
358
/**
 * Remove duplicates in typos.
 *
 * Items with the same code and word are merged: counts are summed (an item
 * without `count` counts as 1) and positions are concatenated. The
 * suggestions of the first item are kept as `suggest`. Items without a word
 * are dropped.
 *
 * @param {Object[]} data - Array of typos.
 * @returns {Object[]} New objects grouped by code, words sorted inside a code.
 */
function removeDuplicates(data) {
    // Prototype-less dictionaries: a word such as "toString" or "constructor"
    // must not be mistaken for a member inherited from Object.prototype.
    const byCode = Object.create(null);

    data.forEach(function(el) {
        const code = el.code;
        const word = el.word;

        if (!word) {
            return;
        }

        const hasPosition = Array.isArray(el.position);
        const words = byCode[code] || (byCode[code] = Object.create(null));
        const known = words[word];

        if (!known) {
            const entry = {
                code,
                word,
                count: el.count || 1,
            };

            if (Array.isArray(el.s) && el.s.length) {
                entry.suggest = el.s;
            }

            if (hasPosition) {
                entry.position = el.position;
            }

            words[word] = entry;

            return;
        }

        known.count += el.count || 1;
        if (hasPosition) {
            known.position = Array.isArray(known.position) ?
                known.position.concat(el.position) :
                el.position;
        }
    });

    const result = [];

    Object.keys(byCode).forEach(function(code) {
        Object.keys(byCode[code]).sort().forEach(function(word) {
            result.push(byCode[code][word]);
        });
    });

    return result;
}
414
/**
 * Sort results by positions.
 *
 * Order: by code, then items with positions (by line and column of the
 * first position), then items without positions (by lower-cased word).
 * A missing `position` is treated as an empty one.
 *
 * @param {Object[]} data - Sorted in place.
 */
function sortByPositions(data) {
    function compare(x, y) {
        if (x > y) {
            return 1;
        }

        if (x < y) {
            return -1;
        }

        return 0;
    }

    function positionsOf(item) {
        return Array.isArray(item.position) ? item.position : [];
    }

    function wordOf(item) {
        return String(item.word || '').toLowerCase();
    }

    data.sort(function(a, b) {
        // Sort by a code
        const byCode = compare(a.code, b.code);
        if (byCode) {
            return byCode;
        }

        const posA = positionsOf(a);
        const posB = positionsOf(b);

        // No position
        if (!posA.length || !posB.length) {
            if (posA.length === posB.length) {
                // Sort by a word; equal words compare as 0 to keep the comparator consistent
                return compare(wordOf(a), wordOf(b));
            }

            // The item without positions goes last
            return posA.length < posB.length ? 1 : -1;
        }

        // Sort by a line, then by a column
        return compare(posA[0].line, posB[0].line) ||
            compare(posA[0].column, posB[0].column);
    });
}
475
/**
 * Prepare text for the request: fix line endings, remove special symbols
 * and trim the result.
 *
 * @param {string} text
 * @returns {string}
 */
function prepareText(text) {
    return removeSpecialSymbols(fixLineEndings(text)).trim();
}
482
/**
 * Normalize line endings and whitespace.
 *
 * Line breaks are preserved: only spaces, tabs and other non-newline
 * whitespace are collapsed into a single space.
 *
 * @param {string} text
 * @returns {string}
 */
function fixLineEndings(text) {
    return text
        .replace(/\r\n?/g, '\n') // Windows (\r\n) and old MacOS (\r) line endings
        .replace(/\s+\n/g, '\n') // Trailing spaces (and blank lines)
        // Repeat spaces. `\s` alone would also match `\n` and glue all lines together.
        .replace(/[^\S\n]+/g, ' ')
        .replace(/\n+/g, '\n'); // Repeat line endings
}
490
/**
 * Remove symbols that only get in the way of the check: the acute accent
 * after a vowel and invisible formatting characters.
 *
 * @param {string} text
 * @returns {string}
 */
function removeSpecialSymbols(text) {
    // Vowels that may carry an acute accent (U+0301):
    // en: aeiouy
    // ru: аеёиоуыэюя
    // uk: аеєиіїоуюя
    const accentedVowel = /([aeiouyаеёиоуыэюяєії])\u0301/gi;

    // Zero-width non-joiner, zero-width joiner and soft hyphen
    const invisible = /[\u200c\u200d\u00ad]/g;

    const withoutAccents = text.replace(accentedVowel, '$1');

    return withoutAccents.replace(invisible, '');
}
499
/**
 * Build the list of error types: every Yandex.Speller error code except
 * TOO_MANY_ERRORS, followed by the module's own code for the letter Yo.
 *
 * @returns {Array<{code: number, title: string}>}
 */
function getErrors() {
    const list = [];

    yaspellerApi.errorCodes.forEach(function(el) {
        if (el.code === TOO_MANY_ERRORS) {
            return;
        }

        list.push({
            code: el.code,
            title: el.text
        });
    });

    list.push({
        code: 100, // ERROR_EYO
        title: 'Letter Ё (Yo)'
    });

    return list;
}
513
514module.exports = {
515 addPositions,
516 errors: getErrors(),
517 checkFile,
518 checkSitemap,
519 checkText,
520 checkUrl,
521 removeDuplicates,
522 sortByPositions
523};