1 | var events = require('events');
|
2 | var timers = require('timers');
|
3 | var util = require("util");
|
4 | var _ = require("underscore");
|
5 | var async = require('async');
|
6 | var log = require("crawler-ninja-logger").Logger;
|
7 | var Map = require("collections/fast-map");
|
8 | var Set = require("collections/fast-set");
|
9 | var requester = require("./lib/queue-requester");
|
10 | var URI = require('./lib/uri.js');
|
11 | var html = require("./lib/html.js");
|
12 | var store = require("./lib/store/store.js");
|
13 | var pm = require("./lib/plugin-manager.js");
|
14 |
|
15 |
|
16 |
|
17 | var domainBlackList = require("./default-lists/domain-black-list.js").list();
|
18 | var suffixBlackList = require("./default-lists/suffix-black-list.js").list();
|
19 |
|
20 |
|
// Default values used by Crawler.prototype.createDefaultConfig. The config key
// fed by each constant is noted next to it.

// -> maxConnections (the constructor forces it to 1 when rateLimits !== 0)
var DEFAULT_NUMBER_OF_CONNECTIONS = 30;
// -> depthLimit; checkUrlToCrawl treats -1 as "no depth limit"
var DEFAULT_DEPTH_LIMIT = -1;
// -> timeout (presumably milliseconds - TODO confirm against the requester)
var DEFAULT_TIME_OUT = 20000;
// -> retries and maxRetries
var DEFAULT_RETRIES = 3;
// -> retryTimeout (presumably milliseconds - TODO confirm against the requester)
var DEFAULT_RETRY_TIMEOUT = 10000;
// -> skipDuplicates
var DEFAULT_SKIP_DUPLICATES = true;
// -> rateLimits; 0 leaves maxConnections untouched in the constructor
var DEFAULT_RATE_LIMITS = 0;
// -> maxErrors
var DEFAULT_MAX_ERRORS = 5;
// -> errorRates (consumed outside this file)
var DEFAULT_ERROR_RATES = [200, 350, 500];

// -> firstExternalLinkOnly (read by isAGoodLinkToCrawl)
var DEFAULT_FIRST_EXTERNAL_LINK_ONLY = false;
// -> externalDomains (read by isAGoodLinkToCrawl)
var DEFAULT_CRAWL_EXTERNAL_DOMAINS = false;
// -> externalHosts (read by isAGoodLinkToCrawl)
var DEFAULT_CRAWL_EXTERNAL_HOSTS = false;
// -> scripts; crawlScripts does nothing when false
var DEFAULT_CRAWL_SCRIPTS = true;
// -> links; crawlLinks does nothing when false
var DEFAULT_CRAWL_LINKS = true;
// -> images; crawlImages does nothing when false
var DEFAULT_CRAWL_IMAGES = true;

// -> protocols; isAGoodLinkToCrawl rejects links using any other protocol
var DEFAULT_PROTOCOLS_TO_CRAWL = ["http", "https"];
// -> followRedirect; applyRedirect queues the redirect target itself when false
var DEFAULT_FOLLOW_301 = false;

// -> linkTypes; the `rel` values of <link> tags considered by crawLink
var DEFAULT_LINKS_TYPES = ["canonical", "stylesheet"];
// -> userAgent
var DEFAULT_USER_AGENT = "NinjaBot";
// -> cache
var DEFAULT_CACHE = false;
// -> method
var DEFAULT_METHOD = 'GET';
// -> referer
var DEFAULT_REFERER = false;

// -> storeModuleName; passed to store.createStore by the constructor
var DEFAULT_STORE_MODULE = "./memory-store.js";
|
48 |
|
49 |
|
50 |
|
51 |
|
52 |
|
53 |
|
54 |
|
55 |
|
56 |
|
57 |
|
58 |
|
59 |
|
60 |
|
61 |
|
62 |
|
63 |
|
64 |
|
65 |
|
66 |
|
67 |
|
68 |
|
69 |
|
70 |
|
71 |
|
72 |
|
73 |
|
74 |
|
75 |
|
76 |
|
77 |
|
/**
 * Crawler constructor.
 *
 * Builds the effective configuration (createDefaultConfig() overridden by
 * `config`), creates the store, the plugin manager and the HTTP requester,
 * then runs the EventEmitter constructor on the instance.
 *
 * @param {Object} [config] Values overriding the default configuration.
 *   `config.updateDepth`, when set, replaces the module-level updateDepth
 *   function; `config.storeParams`, when set, is forwarded to store.createStore.
 */
function Crawler(config) {
  var settings = this.createDefaultConfig();
  this.config = settings;

  if (config) {
    _.extend(settings, config);
  }

  // Any rate limit other than 0 forces a single connection.
  if (settings.rateLimits !== 0) {
    settings.maxConnections = 1;
  }

  store.createStore(settings.storeModuleName, settings.storeParams || null);

  // A custom depth function from the config wins over the module default.
  this.updateDepth = settings.updateDepth || updateDepth;

  this.pm = new pm.PluginManager();
  this.httpRequester = new requester.Requester(settings);

  events.EventEmitter.call(this);
}

util.inherits(Crawler, events.EventEmitter);
|
114 |
|
115 |
|
116 |
|
117 |
|
118 |
|
119 |
|
120 |
|
121 |
|
/**
 * Add one or several start URLs to the crawl.
 *
 * Accepted values for `options`:
 *  - a string: used as both the `url` and `uri` of the request,
 *  - an object with a `url` or a `uri` property (completed in place with the
 *    crawler config by addDefaultOptions),
 *  - an array of the above; each entry is queued in turn.
 *
 * Invalid input (no options, or an object without url/uri) and a failure of
 * the store to register the start URL are reported through `config.onCrawl`
 * with an error; `config.onDrain` is then called if the requester is idle.
 * Nothing is queued in those cases.
 *
 * @param {string|Object|Array} options The start URL(s) / request options.
 */
Crawler.prototype.queue = function(options) {
  var self = this;

  // Report a request that cannot be queued through the regular onCrawl path.
  function reportFailure(error, url) {
    if (! self.config.onCrawl) {
      return;
    }
    self.config.onCrawl(error, {method : "GET", url : url, proxy : "", error : true}, function() {
      if (self.httpRequester.idle()) {
        self.config.onDrain();
      }
    });
  }

  // Register the start URL in the store, then hand the request to the requester.
  function addStartUrl(url, requestOptions) {
    store.getStore().addStartUrl(url, function(error) {
      if (error) {
        // Previously ignored: the URL was queued even though the store
        // failed to register it as a start URL.
        log.warn({"url" : url, "step" : "queue", "message" : "Unable to register the start url : " + error});
        reportFailure(error, url);
        return;
      }
      self.httpRequester.queue(addDefaultOptions(requestOptions, self.config));
    });
  }

  if (! options) {
    reportFailure({errorCode : "NO_OPTIONS"}, "unknown");
    return;
  }

  if (_.isArray(options)) {
    options.forEach(function(opt) {
      self.queue(opt);
    });
    return;
  }

  if (_.isString(options)) {
    addStartUrl(options, {uri : options, url : options});
    return;
  }

  if (! _.has(options, "url") && ! _.has(options, "uri")) {
    reportFailure({errorCode : "NO_URL_OPTION"}, "unknown");
    return;
  }

  addStartUrl(_.has(options, "url") ? options.url : options.uri, options);
};
|
181 |
|
182 |
|
183 |
|
184 |
|
185 |
|
186 |
|
187 |
|
188 |
|
/**
 * Complete request options with default values.
 *
 * `options` is modified in place: every property it leaves undefined is taken
 * from `defaultOptions` (via _.defaults), then `maxRetries` is set to the
 * resulting `retries` value.
 *
 * @param {Object} options Request options to complete.
 * @param {Object} defaultOptions Values used for the missing properties.
 * @returns {Object} The completed options.
 */
function addDefaultOptions(options, defaultOptions) {
  var completed = _.defaults(options, defaultOptions);
  completed.maxRetries = completed.retries;
  return completed;
}
|
197 |
|
198 |
|
199 |
|
200 |
|
201 |
|
202 |
|
203 |
|
204 |
|
205 |
|
/**
 * Build the request options for a URL found while crawling.
 *
 * Starts from createDefaultConfig(newUrl), then copies from `options` every
 * property that exists in the default config except `url` and `uri`.
 * `maxRetries` is reset to `retries`, `depthLimit` is always taken from
 * `options`, and `canCrawl` is carried over when `options` defines it.
 *
 * @param {Object} options Options (or result) of the request that produced the URL.
 * @param {string} newUrl The URL to request.
 * @returns {Object} A new options object for `newUrl`.
 */
Crawler.prototype.buildNewOptions = function(options, newUrl) {
  var next = this.createDefaultConfig(newUrl);

  // The new URL must not be replaced by the one of the previous request.
  var inheritedKeys = _.without(_.keys(next), "url", "uri");
  _.extend(next, _.pick(options, inheritedKeys));

  next.maxRetries = next.retries;
  next.depthLimit = options.depthLimit;

  if (options.canCrawl) {
    next.canCrawl = options.canCrawl;
  }

  return next;
};
|
225 |
|
226 |
|
227 |
|
228 |
|
229 |
|
230 |
|
231 |
|
/**
 * Create a configuration object filled with the module defaults.
 *
 * The returned object also carries two handlers bound to this crawler:
 * `onCrawl` forwards to this.crawl, and `onDrain` emits the 'end' event from
 * a timers.setImmediate callback.
 *
 * @param {string} [url] When given, set as both `url` and `uri`.
 * @returns {Object} A new configuration object.
 */
Crawler.prototype.createDefaultConfig = function(url) {
  var crawler = this;

  // Default response handler: delegate to the crawler itself.
  function handleCrawl(error, result, callback) {
    crawler.crawl(error, result, callback);
  }

  // Default drain handler: the 'end' event is emitted asynchronously.
  function handleDrain() {
    timers.setImmediate(function() {
      log.debug({ "step" : "onDrain", "message" : "End of the crawl"});
      crawler.emit('end');
    });
  }

  var defaults = {
    cache                 : DEFAULT_CACHE,
    method                : DEFAULT_METHOD,
    referer               : DEFAULT_REFERER,
    maxConnections        : DEFAULT_NUMBER_OF_CONNECTIONS,
    timeout               : DEFAULT_TIME_OUT,
    retries               : DEFAULT_RETRIES,
    maxRetries            : DEFAULT_RETRIES,
    retryTimeout          : DEFAULT_RETRY_TIMEOUT,
    maxErrors             : DEFAULT_MAX_ERRORS,
    errorRates            : DEFAULT_ERROR_RATES,
    skipDuplicates        : DEFAULT_SKIP_DUPLICATES,
    rateLimits            : DEFAULT_RATE_LIMITS,
    externalDomains       : DEFAULT_CRAWL_EXTERNAL_DOMAINS,
    externalHosts         : DEFAULT_CRAWL_EXTERNAL_HOSTS,
    firstExternalLinkOnly : DEFAULT_FIRST_EXTERNAL_LINK_ONLY,
    protocols             : DEFAULT_PROTOCOLS_TO_CRAWL,
    depthLimit            : DEFAULT_DEPTH_LIMIT,
    followRedirect        : DEFAULT_FOLLOW_301,
    images                : DEFAULT_CRAWL_IMAGES,
    links                 : DEFAULT_CRAWL_LINKS,
    linkTypes             : DEFAULT_LINKS_TYPES,
    scripts               : DEFAULT_CRAWL_SCRIPTS,
    userAgent             : DEFAULT_USER_AGENT,
    domainBlackList       : domainBlackList,
    suffixBlackList       : suffixBlackList,
    storeModuleName       : DEFAULT_STORE_MODULE,
    onCrawl               : handleCrawl,
    onDrain               : handleDrain
  };

  if (url) {
    defaults.url = url;
    defaults.uri = url;
  }

  return defaults;
};
|
286 |
|
287 |
|
288 |
|
289 |
|
290 |
|
291 |
|
292 |
|
293 |
|
/**
 * Default handler for a finished request.
 *
 * On error, the error is handed to the plugin manager (pm.error) and nothing
 * else is done. Otherwise the body is parsed with html.$ when html.isHTML
 * accepts it, and three tasks are run through async.parallel: the plugins
 * (pm.crawl), the HTML analysis and the redirect handling.
 *
 * @param {*} error Error of the request, if any.
 * @param {Object} result Result of the request (`body` is read here).
 * @param {Function} callback Called when pm.error or the three tasks are done.
 */
Crawler.prototype.crawl = function (error, result, callback) {
  var crawler = this;

  if (error) {
    crawler.pm.error(error, result, callback);
    return;
  }

  var $ = null;
  if (html.isHTML(result.body)) {
    $ = html.$(result.body);
  }

  var tasks = [
    function(done) { crawler.pm.crawl(result, $, done); },
    function(done) { crawler.analyzeHTML(result, $, done); },
    function(done) { crawler.applyRedirect(result, done); }
  ];

  async.parallel(tasks, callback);
};
|
314 |
|
315 |
|
/**
 * Handle a 3xx response when the crawler does not follow redirects itself.
 *
 * When `result.statusCode` is in [300, 399] and `config.followRedirect` is
 * off, the redirect is reported to the plugins (pm.crawlRedirect) and the
 * target built from the `location` header is queued as a new request.
 *
 * A 3xx response without a `location` header (304 Not Modified for example)
 * has no target: it is logged and skipped instead of passing `undefined` to
 * URI.linkToURI and queueing a bogus request.
 *
 * @param {Object} result Result of the request (statusCode, uri, headers).
 * @param {Function} callback Called without arguments once done.
 */
Crawler.prototype.applyRedirect = function(result, callback) {
  var isRedirect = result.statusCode >= 300 && result.statusCode <= 399;

  if (! isRedirect || this.config.followRedirect) {
    callback();
    return;
  }

  var location = result.headers ? result.headers["location"] : null;

  if (! location) {
    log.warn({"url" : result.uri, "step" : "applyRedirect", "message" : "Redirect without location header : " + result.statusCode});
    callback();
    return;
  }

  var self = this;
  var from = result.uri;
  var to = URI.linkToURI(from, location);

  this.pm.crawlRedirect(from, to, result.statusCode, function(){
    self.httpRequester.queue(self.buildNewOptions(result, to));
    callback();
  });
};
|
335 |
|
336 |
|
337 |
|
338 |
|
339 |
|
340 |
|
341 |
|
342 |
|
/**
 * Look for resources to crawl in an HTML page.
 *
 * Runs crawlHrefs, crawlLinks, crawlScripts and crawlImages through
 * async.parallel. When `$` is falsy (crawl() passes null when the body is
 * not HTML) the callback is called right away.
 *
 * @param {Object} result Result of the request.
 * @param {Function|null} $ The parsed document, as returned by html.$.
 * @param {Function} callback Called when the four tasks are done.
 */
Crawler.prototype.analyzeHTML = function(result, $, callback) {
  if (! $) {
    return callback();
  }

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "Start check HTML code"});

  var crawler = this;
  var tasks = [
    function(done) { crawler.crawlHrefs(result, $, done); },
    function(done) { crawler.crawlLinks(result, $, done); },
    function(done) { crawler.crawlScripts(result, $, done); },
    function(done) { crawler.crawlImages(result, $, done); }
  ];

  async.parallel(tasks, callback);
};
|
363 |
|
364 |
|
365 |
|
366 |
|
367 |
|
368 |
|
369 |
|
370 |
|
371 |
|
/**
 * Process every <a> tag of a page with crawlHref.
 *
 * @param {Object} result Result of the request for the page.
 * @param {Function} $ The parsed document.
 * @param {Function} endCallback Called by async.each when all tags are processed.
 */
Crawler.prototype.crawlHrefs = function(result, $, endCallback) {
  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "CrawlHrefs"});

  var crawler = this;

  function visitAnchor(anchorTag, done) {
    crawler.crawlHref($, result, anchorTag, done);
  }

  async.each($('a'), visitAnchor, endCallback);
};
|
381 |
|
/**
 * Process one <a> tag: report the link to the plugins, then check whether it
 * has to be crawled.
 *
 * Tags without an `href` value are skipped. The link is flagged as nofollow
 * when its `rel` attribute contains the `nofollow` token. `rel` is a
 * space-separated, case-insensitive token list, so values such as
 * "nofollow noopener" or "NoFollow" are recognised too.
 *
 * @param {Function} $ The parsed document.
 * @param {Object} result Result of the request for the page (`uri` is the parent URI).
 * @param {Object} a The <a> element.
 * @param {Function} callback Called once the link is processed.
 */
Crawler.prototype.crawlHref = function($, result, a, callback) {
  var element = $(a);
  var link = element.attr('href');
  var parentUri = result.uri;

  if (! link) {
    callback();
    return;
  }

  var anchor = element.text() || "";

  var rel = element.attr("rel");
  var relTokens = rel ? String(rel).toLowerCase().split(/\s+/) : [];
  var isDoFollow = relTokens.indexOf("nofollow") === -1;

  var linkUri = URI.linkToURI(parentUri, link);

  var self = this;
  this.pm.crawlLink(parentUri, linkUri, anchor, isDoFollow, function(){
    self.checkUrlToCrawl(result, parentUri, linkUri, anchor, isDoFollow, callback);
  });
};
|
405 |
|
406 |
|
407 |
|
408 |
|
409 |
|
410 |
|
411 |
|
412 |
|
413 |
|
/**
 * Process every <link> tag of a page with crawLink.
 *
 * Does nothing (apart from calling endCallback) when `config.links` is off.
 *
 * @param {Object} result Result of the request for the page.
 * @param {Function} $ The parsed document.
 * @param {Function} endCallback Called when all tags are processed.
 */
Crawler.prototype.crawlLinks = function(result, $, endCallback) {
  var crawler = this;

  if (! crawler.config.links) {
    return endCallback();
  }

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "CrawlLinks"});

  function visitLinkTag(linkTag, done) {
    crawler.crawLink($, result, linkTag, done);
  }

  async.each($('link'), visitLinkTag, endCallback);
};
|
427 |
|
/**
 * Process one <link> tag: when its `rel` value is one of `config.linkTypes`,
 * report the link to the plugins, then check whether it has to be crawled.
 *
 * Tags without an `href` value, or with another `rel` value, are skipped.
 *
 * @param {Function} $ The parsed document.
 * @param {Object} result Result of the request for the page (`uri` is the parent URI).
 * @param {Object} linkTag The <link> element.
 * @param {Function} callback Called once the tag is processed.
 */
Crawler.prototype.crawLink = function($, result, linkTag, callback) {
  var link = $(linkTag).attr('href');
  var parentUri = result.uri;

  if (! link) {
    callback();
    return;
  }

  var rel = $(linkTag).attr('rel');

  // indexOf returns 0 for the first configured type ("canonical" by default),
  // so the match has to be tested against -1 and not with "> 0".
  if (this.config.linkTypes.indexOf(rel) === -1) {
    callback();
    return;
  }

  var linkUri = URI.linkToURI(parentUri, link);
  var self = this;
  this.pm.crawlLink(parentUri, linkUri, null, null, function(){
    self.checkUrlToCrawl(result, parentUri, linkUri, null, null, callback);
  });
};
|
451 |
|
452 |
|
453 |
|
454 |
|
455 |
|
456 |
|
457 |
|
458 |
|
/**
 * Process every <script> tag of a page with crawlScript.
 *
 * Does nothing (apart from calling endCallback) when `config.scripts` is off.
 *
 * @param {Object} result Result of the request for the page.
 * @param {Function} $ The parsed document.
 * @param {Function} endCallback Called when all tags are processed.
 */
Crawler.prototype.crawlScripts = function(result, $, endCallback) {
  var crawler = this;

  if (! crawler.config.scripts) {
    return endCallback();
  }

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "CrawlScripts"});

  function visitScriptTag(scriptTag, done) {
    crawler.crawlScript($, result, scriptTag, done);
  }

  async.each($('script'), visitScriptTag, endCallback);
};
|
472 |
|
/**
 * Process one <script> tag: report its `src` to the plugins as a link, then
 * check whether it has to be crawled. Tags without a `src` value are skipped.
 *
 * @param {Function} $ The parsed document.
 * @param {Object} result Result of the request for the page (`uri` is the parent URI).
 * @param {Object} script The <script> element.
 * @param {Function} callback Called once the tag is processed.
 */
Crawler.prototype.crawlScript = function($, result, script, callback) {
  var src = $(script).attr('src');

  if (! src) {
    callback();
    return;
  }

  var crawler = this;
  var pageUri = result.uri;
  var scriptUri = URI.linkToURI(pageUri, src);

  crawler.pm.crawlLink(pageUri, scriptUri, null, null, function() {
    crawler.checkUrlToCrawl(result, pageUri, scriptUri, null, null, callback);
  });
};
|
492 |
|
493 |
|
494 |
|
495 |
|
496 |
|
497 |
|
498 |
|
499 |
|
/**
 * Process every <img> tag of a page with crawlImage.
 *
 * Does nothing (apart from calling endCallback) when `config.images` is off.
 *
 * @param {Object} result Result of the request for the page.
 * @param {Function} $ The parsed document.
 * @param {Function} endCallback Called when all tags are processed.
 */
Crawler.prototype.crawlImages = function(result, $, endCallback) {
  var crawler = this;

  if (! crawler.config.images) {
    return endCallback();
  }

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "CrawlImages"});

  function visitImageTag(imageTag, done) {
    crawler.crawlImage($, result, imageTag, done);
  }

  async.each($('img'), visitImageTag, endCallback);
};
|
513 |
|
/**
 * Process one <img> tag: report its `src` and `alt` to the plugins
 * (pm.crawlImage), then check whether the image has to be crawled.
 * Tags without a `src` value are skipped.
 *
 * @param {Function} $ The parsed document.
 * @param {Object} result Result of the request for the page (`uri` is the parent URI).
 * @param {Object} img The <img> element.
 * @param {Function} callback Called once the tag is processed.
 */
Crawler.prototype.crawlImage = function($, result, img, callback) {
  var pageUri = result.uri;
  var src = $(img).attr('src');
  var alt = $(img).attr('alt');

  if (! src) {
    callback();
    return;
  }

  var crawler = this;
  var imageUri = URI.linkToURI(pageUri, src);

  crawler.pm.crawlImage(pageUri, imageUri, alt, function() {
    crawler.checkUrlToCrawl(result, pageUri, imageUri, null, null, callback);
  });
};
|
531 |
|
/**
 * Decide what to do with a link found in a page.
 *
 * Two steps are chained with async.waterfall:
 *  1. this.updateDepth computes the depth of the link,
 *  2. isAGoodLinkToCrawl is asked whether the link can be crawled. The link
 *     is queued when it can and when the depth limit allows it (a limit of -1
 *     means no limit); otherwise the plugins are told through pm.unCrawl.
 *
 * @param {Object} result Result of the request for the parent page.
 * @param {string} parentUri URI of the page containing the link.
 * @param {string} linkUri URI of the link.
 * @param {string|null} anchor Anchor text, when there is one.
 * @param {boolean|null} isDoFollow False for a nofollow link.
 * @param {Function} endCallback Called with an error if one of the steps fails.
 */
Crawler.prototype.checkUrlToCrawl = function(result, parentUri, linkUri, anchor, isDoFollow, endCallback) {
  var crawler = this;

  function resolveDepth(next) {
    // Only the error and the depth are forwarded to the next step.
    crawler.updateDepth(parentUri, linkUri, function(error, currentDepth) {
      next(error, currentDepth);
    });
  }

  function queueOrReject(currentDepth, next) {
    crawler.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri, anchor, isDoFollow, function(error, toCrawl) {
      if (error) {
        return next(error);
      }

      var withinDepth = result.depthLimit == -1 || currentDepth <= result.depthLimit;

      if (! (toCrawl && withinDepth)) {
        crawler.pm.unCrawl(parentUri, linkUri, anchor, isDoFollow, next);
        return;
      }

      crawler.httpRequester.queue(crawler.buildNewOptions(result, linkUri));
      next();
    });
  }

  async.waterfall([resolveDepth, queueOrReject], endCallback);
};
|
561 |
|
562 |
|
563 |
|
564 |
|
565 |
|
566 |
|
567 |
|
568 |
|
569 |
|
570 |
|
/**
 * Check whether a link found in a page can be crawled.
 *
 * The link is rejected (callback(null, false)) when:
 *  - it is outside the start host / start domains and the matching
 *    `externalHosts` / `externalDomains` option does not allow it,
 *  - `firstExternalLinkOnly` is on, the link is external and its parent is
 *    not on a start host,
 *  - its protocol is not listed in `result.protocols`,
 *  - its domain is listed in `result.domainBlackList`,
 *  - its suffix is listed in `result.suffixBlackList`.
 * Otherwise the answer is true, or the value returned by `result.canCrawl`
 * when that function is defined.
 *
 * An error reported by the store is passed to the callback.
 *
 * @param {Object} result Result of the request for the parent page; also
 *   carries the crawl options read here.
 * @param {number} currentDepth Depth of the link (not used by the checks).
 * @param {string} parentUri URI of the page containing the link.
 * @param {string} link URI of the link.
 * @param {string|null} anchor Anchor text, when there is one.
 * @param {boolean|null} isDoFollow False for a nofollow link.
 * @param {Function} callback Called with (error, toCrawl).
 */
Crawler.prototype.isAGoodLinkToCrawl = function(result, currentDepth, parentUri, link, anchor, isDoFollow, callback) {

  store.getStore().isStartFromUrl(parentUri, link, function(error, startFrom){

    // Without this guard a store failure ends in a TypeError on `startFrom`.
    if (error) {
      return callback(error);
    }

    if ((! startFrom.link.isStartFromHost && ! result.externalHosts) &&
        (! (! startFrom.link.isStartFromDomains && result.externalDomains))) {
      log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - no external host or domain"});
      return callback(null, false);
    }

    if (result.firstExternalLinkOnly && ((! startFrom.link.isStartFromHost) || (! startFrom.link.isStartFromDomains))) {

      if (! startFrom.parentUri.isStartFromHost) {
        log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - no external host or domain (not the first link)"});
        return callback(null, false);
      }
    }

    var protocol = URI.protocol(link);
    if (result.protocols.indexOf(protocol) < 0) {
      log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - no valid protocol : " + protocol});
      return callback(null, false);
    }

    // indexOf returns 0 for the first entry of a list: compare with -1 so
    // that the first blacklisted domain / suffix is rejected too.
    if (result.domainBlackList.indexOf(URI.domainName(link)) !== -1) {
      log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - domain is blacklisted" });
      return callback(null, false);
    }

    var suffix = URI.suffix(link);
    if (result.suffixBlackList.indexOf(suffix) !== -1) {
      log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - suffix is blacklisted"});
      return callback(null, false);
    }

    if (! result.canCrawl) {
      log.info({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "URL can be crawled"});
      return callback(null, true);
    }

    var check = result.canCrawl(parentUri, link, anchor, isDoFollow);
    // The value is appended to the message, not to the log object.
    log.debug({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "method options.canCrawl has been called and return " + check});
    return callback(null, check);

  });

};
|
624 |
|
/**
 * Register a plugin on the plugin manager of this crawler.
 *
 * @param {Object} plugin The plugin to register.
 */
Crawler.prototype.registerPlugin = function (plugin) {
  var manager = this.pm;
  manager.registerPlugin(plugin);
};
|
628 |
|
/**
 * Unregister a plugin from the plugin manager of this crawler.
 *
 * @param {Object} plugin The plugin to unregister.
 */
Crawler.prototype.unregisterPlugin = function (plugin) {
  var manager = this.pm;
  manager.unregisterPlugin(plugin);
};
|
632 |
|
633 |
|
634 |
|
635 |
|
636 |
|
637 |
|
638 |
|
639 |
|
640 |
|
641 |
|
/**
 * Default depth computation for a link found in a page.
 *
 * Chains getDepths, calcultateDepths and saveDepths with async.seq: the
 * stored depths of the parent and of the link are read, completed, then
 * saved again.
 *
 * @param {string} parentUri URI of the page containing the link.
 * @param {string} linkUri URI of the link.
 * @param {Function} callback Called exactly once, with (error) when one of
 *   the steps fails, or with (null, linkDepth) otherwise.
 */
var updateDepth = function(parentUri, linkUri, callback) {

  var depths = {parentUri : parentUri, linkUri : linkUri, parentDepth : 0, linkDepth : 0};
  var execFns = async.seq(getDepths, calcultateDepths, saveDepths);

  execFns(depths, function (error, result) {
    if (error) {
      // Stop here: falling through called the callback a second time and
      // read `linkDepth` on an undefined result.
      return callback(error);
    }
    return callback(null, result.linkDepth);
  });

};
|
655 |
|
656 |
|
657 |
|
658 |
|
659 |
|
660 |
|
661 |
|
662 |
|
663 |
|
/**
 * Read from the store the depths of the parent URI and of the link URI.
 *
 * Both reads are run with async.parallel; the values found are written into
 * `depths.parentDepth` and `depths.linkDepth`.
 *
 * @param {Object} depths Object with `parentUri` and `linkUri`; updated in place.
 * @param {Function} callback Called with (error) or (null, depths).
 */
var getDepths = function (depths, callback) {

  var readParentDepth = async.apply(store.getStore().getDepth.bind(store.getStore()), depths.parentUri);
  var readLinkDepth = async.apply(store.getStore().getDepth.bind(store.getStore()), depths.linkUri);

  function onDepthsRead(error, found) {
    if (error) {
      return callback(error);
    }
    depths.parentDepth = found[0];
    depths.linkDepth = found[1];
    callback(null, depths);
  }

  async.parallel([readParentDepth, readLinkDepth], onDepthsRead);
};
|
679 |
|
680 |
|
681 |
|
682 |
|
683 |
|
684 |
|
685 |
|
686 |
|
687 |
|
688 |
|
/**
 * Complete the depths read from the store.
 *
 * - Parent depth falsy (unknown, or 0): the parent depth becomes 0 and the
 *   link depth becomes 1, whatever the link depth was.
 * - Parent depth truthy: a falsy link depth becomes parent depth + 1; a
 *   truthy link depth is kept.
 *
 * NOTE(review): a stored depth of 0 is handled like an unknown depth by both
 * tests - confirm that this is what is wanted.
 *
 * @param {Object} depths Object with `parentDepth` and `linkDepth`; updated in place.
 * @param {Function} callback Called with (null, depths).
 */
var calcultateDepths = function (depths, callback) {
  if (! depths.parentDepth) {
    depths.parentDepth = 0;
    depths.linkDepth = 1;
  }
  else if (! depths.linkDepth) {
    depths.linkDepth = depths.parentDepth + 1;
  }

  callback(null, depths);
};
|
703 |
|
704 |
|
705 |
|
706 |
|
707 |
|
708 |
|
709 |
|
710 |
|
711 |
|
/**
 * Save in the store the depths of the parent URI and of the link URI.
 *
 * Both writes are run with async.parallel.
 *
 * @param {Object} depths Object with parentUri, parentDepth, linkUri and linkDepth.
 * @param {Function} callback Called with (error, depths) when both writes are done.
 */
var saveDepths = function(depths, callback) {

  var writeParentDepth = async.apply(store.getStore().setDepth.bind(store.getStore()), depths.parentUri, depths.parentDepth);
  var writeLinkDepth = async.apply(store.getStore().setDepth.bind(store.getStore()), depths.linkUri, depths.linkDepth);

  function onDepthsSaved(error) {
    callback(error, depths);
  }

  async.parallel([writeParentDepth, writeLinkDepth], onDepthsSaved);
};
|
722 |
|
723 | module.exports.Crawler = Crawler;
|