var events = require('events');
var timers = require('timers');
var util = require("util");
var _ = require("underscore");
var async = require('async');
var log = require("crawler-ninja-logger").Logger;
var Map = require("collections/fast-map");
var Set = require("collections/fast-set");
var requester = require("./lib/queue-requester");
var URI = require('./lib/uri.js');
var html = require("./lib/html.js");
var store = require("./lib/store/store.js");
var pm = require("./lib/plugin-manager.js");


var domainBlackList = require("./default-lists/domain-black-list.js").list();
var suffixBlackList = require("./default-lists/suffix-black-list.js").list();


var DEFAULT_NUMBER_OF_CONNECTIONS = 30;
var DEFAULT_DEPTH_LIMIT = -1; // no limit
var DEFAULT_TIME_OUT = 20000;
var DEFAULT_RETRIES = 3;
var DEFAULT_RETRY_TIMEOUT = 10000;
var DEFAULT_SKIP_DUPLICATES = true;
var DEFAULT_RATE_LIMITS = 0;
var DEFAULT_MAX_ERRORS = 5;
var DEFAULT_ERROR_RATES = [200, 350, 500];

var DEFAULT_FIRST_EXTERNAL_LINK_ONLY = false;
var DEFAULT_CRAWL_EXTERNAL_DOMAINS = false;
var DEFAULT_CRAWL_EXTERNAL_HOSTS = false;
var DEFAULT_CRAWL_SCRIPTS = true;  // Crawl <script>
var DEFAULT_CRAWL_LINKS = true;    // Crawl <link>
var DEFAULT_CRAWL_IMAGES = true;

var DEFAULT_PROTOCOLS_TO_CRAWL = ["http", "https"];
var DEFAULT_FOLLOW_301 = false;

var DEFAULT_LINKS_TYPES = ["canonical", "stylesheet"];
var DEFAULT_USER_AGENT = "NinjaBot";
var DEFAULT_CACHE = false;
var DEFAULT_METHOD = 'GET';
var DEFAULT_REFERER = false;

var DEFAULT_STORE_MODULE = "./memory-store.js";

/**
 * The crawler object
 *
 * @param config used to customize the crawler.
 *
 * The current config attributes are :
 * - maxConnections : the number of connections used to crawl - default is 30
 * - externalDomains : if true, crawl the external domains. This option can follow a large number of linked domains, default = false
 * - externalHosts : if true, crawl the other hosts of the same domain, default = false
 * - firstExternalLinkOnly : crawl only the first link found for external domains/hosts. externalHosts or externalDomains must be true
 * - scripts : if true, crawl the script tags
 * - links : if true, crawl the link tags
 * - linkTypes : the types of link tags to crawl (matched against the rel attribute), default : ["canonical", "stylesheet"]
 * - images : if true, crawl the images
 * - protocols : list of the protocols to crawl, default = ["http", "https"]
 * - timeout : timeout per request in milliseconds (default 20000)
 * - retries : number of retries if the request fails (default 3)
 * - retryTimeout : number of milliseconds to wait before retrying (default 10000)
 * - maxErrors : number of timeout errors before changing the crawl rate, default is 5
 * - errorRates : list of rates to use when too many timeout errors occur
 * - skipDuplicates : if true, skip URIs that were already crawled - default is true
 * - rateLimits : number of milliseconds to delay between each request (default 0).
 *   Note that this option will force the crawler to use only one connection
 * - depthLimit : the depth limit for the crawl
 * - followRedirect : if true, the crawl will not return the 301, it will directly follow the redirection
 * - proxyList : the list of proxies (see the project simple-proxies on npm)
 *
 * + all options provided by the nodejs request module : https://github.com/request/request
 */
function Crawler(config) {

  // Default config
  this.config = this.createDefaultConfig();

  // Merge the default config values & the overridden values provided by the config arg
  if (config) {
    _.extend(this.config, config);
  }

  // if using rateLimits, we want to use only one connection with a delay between requests
  if (this.config.rateLimits !== 0) {
    this.config.maxConnections = 1;
  }

  // create the crawl store
  store.createStore(this.config.storeModuleName, this.config.storeParams ? this.config.storeParams : null);

  // assign the default updateDepth method used to calculate the crawl depth
  this.updateDepth = updateDepth;

  // If the config object contains a new implementation of the updateDepth method
  if (this.config.updateDepth) {
    this.updateDepth = this.config.updateDepth;
  }

  this.pm = new pm.PluginManager();

  this.httpRequester = new requester.Requester(this.config);

  events.EventEmitter.call(this);

}

util.inherits(Crawler, events.EventEmitter);

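/*
 * Usage example (a minimal sketch, not executed by the module). It assumes this file is the
 * entry point of the "crawler-ninja" package; the urls and option values are placeholders.
 *
 *   var ninja = require("crawler-ninja");
 *
 *   var crawler = new ninja.Crawler({
 *     maxConnections : 10,   // override the default of 30 connections
 *     depthLimit : 3,        // don't follow links deeper than 3 levels
 *     rateLimits : 500       // 500 ms between requests (forces a single connection)
 *   });
 *
 *   // the 'end' event is emitted by the default onDrain handler when the crawl is finished
 *   crawler.on("end", function() {
 *     console.log("Crawl done");
 *   });
 *
 *   crawler.queue("http://www.example.com");
 */
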
/**
 * Add one or more urls to crawl
 *
 * @param options the url(s) to crawl : a string, an array or an options object
 *
 */
Crawler.prototype.queue = function(options) {

  var self = this;

  // Error if no options
  if (! options) {
    if (self.config.onCrawl) {
      self.config.onCrawl({errorCode : "NO_OPTIONS"}, {method : "GET", url : "unknown", proxy : "", error : true},
        function(error) {
          if (self.httpRequester.idle()) {
            self.config.onDrain();
          }
        });
    }
    return;
  }

  // if Array => recall this method for each element
  if (_.isArray(options)) {
    options.forEach(function(opt) {
      self.queue(opt);
    });

    return;
  }

  // if String, we expect to receive a url
  if (_.isString(options)) {
    store.getStore().addStartUrl(options, function(error) {
      self.httpRequester.queue(addDefaultOptions({uri : options, url : options}, self.config));
    });
  }
  // Last possibility, this is a JSON object
  else {

    if (! _.has(options, "url") && ! _.has(options, "uri")) {
      if (self.config.onCrawl) {
        self.config.onCrawl({errorCode : "NO_URL_OPTION"}, {method : "GET", url : "unknown", proxy : "", error : true},
          function(error) {
            if (self.httpRequester.idle()) {
              self.config.onDrain();
            }
          });
      }
    }
    else {
      store.getStore().addStartUrl(_.has(options, "url") ? options.url : options.uri, function(error) {
        self.httpRequester.queue(addDefaultOptions(options, self.config));
      });
    }
  }

};

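/*
 * Examples of the queue() call forms handled above (a sketch; "crawler" is a Crawler instance
 * and the urls are placeholders) :
 *
 *   // a single url
 *   crawler.queue("http://www.example.com");
 *
 *   // an array of urls : queue() recalls itself for each element
 *   crawler.queue(["http://www.example.com", "http://blog.example.com"]);
 *
 *   // an options object : must contain url or uri; the other attributes
 *   // override the crawler config for this request and the links found from it
 *   crawler.queue({url : "http://www.example.com", depthLimit : 2, userAgent : "MyBot"});
 */
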
/**
 * Add the default crawler options into the options used for the current request
 *
 * @param options the options used for the current request
 * @param defaultOptions the default crawler options
 * @return the merged options
 */
function addDefaultOptions(options, defaultOptions) {

  _.defaults(options, defaultOptions);
  options.maxRetries = options.retries;

  return options;

}

/**
 * Make a copy of an options object for a specific url
 *
 * @param options the options object to copy
 * @param newUrl the url to apply to the new options object
 * @return the new options object
 */
Crawler.prototype.buildNewOptions = function(options, newUrl) {

  var o = this.createDefaultConfig(newUrl);

  // Copy only the attributes that are present in the options used for the previous request
  // Could be more simple ? ;-)
  o = _.extend(o, _.pick(options, _.without(_.keys(o), "url", "uri")));

  // Reset the settings used for retries when an error such as a timeout occurs
  o.maxRetries = o.retries;
  o.depthLimit = options.depthLimit;

  if (options.canCrawl) {
    o.canCrawl = options.canCrawl;
  }
  return o;

};


/**
 * Create the default crawler config
 *
 * @param url the start url to assign to the config (optional)
 * @returns the config object
 */
Crawler.prototype.createDefaultConfig = function(url) {
  var self = this;
  var config = {

    cache : DEFAULT_CACHE,
    method : DEFAULT_METHOD,
    referer : DEFAULT_REFERER,
    maxConnections : DEFAULT_NUMBER_OF_CONNECTIONS,
    timeout : DEFAULT_TIME_OUT,
    retries : DEFAULT_RETRIES,
    maxRetries : DEFAULT_RETRIES,
    retryTimeout : DEFAULT_RETRY_TIMEOUT,
    maxErrors : DEFAULT_MAX_ERRORS,
    errorRates : DEFAULT_ERROR_RATES,
    skipDuplicates : DEFAULT_SKIP_DUPLICATES,
    rateLimits : DEFAULT_RATE_LIMITS,
    externalDomains : DEFAULT_CRAWL_EXTERNAL_DOMAINS,
    externalHosts : DEFAULT_CRAWL_EXTERNAL_HOSTS,
    firstExternalLinkOnly : DEFAULT_FIRST_EXTERNAL_LINK_ONLY,
    protocols : DEFAULT_PROTOCOLS_TO_CRAWL,
    depthLimit : DEFAULT_DEPTH_LIMIT,
    followRedirect : DEFAULT_FOLLOW_301,
    images : DEFAULT_CRAWL_IMAGES,
    links : DEFAULT_CRAWL_LINKS,
    linkTypes : DEFAULT_LINKS_TYPES,
    scripts : DEFAULT_CRAWL_SCRIPTS,
    userAgent : DEFAULT_USER_AGENT,
    domainBlackList : domainBlackList,
    suffixBlackList : suffixBlackList,
    storeModuleName : DEFAULT_STORE_MODULE,

    onCrawl : function(error, result, callback) {
      self.crawl(error, result, callback);
    },

    onDrain : function() {
      timers.setImmediate(function() {
        log.debug({"step" : "onDrain", "message" : "End of the crawl"});
        self.emit('end');
      });
    }

  };

  if (url) {
    config.url = url;
    config.uri = url;
  }

  return config;

};

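/*
 * Example : using another crawl store (a sketch, based only on the createStore call in the
 * constructor). storeModuleName & storeParams are forwarded as-is to store.createStore();
 * "./my-redis-store.js" and its params are hypothetical - any module implementing the same
 * store interface as the default "./memory-store.js" could be plugged in this way.
 *
 *   var crawler = new ninja.Crawler({
 *     storeModuleName : "./my-redis-store.js",
 *     storeParams : {host : "127.0.0.1", port : 6379}
 *   });
 */
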
/**
 * Default callback function used when the http queue requester gets a resource (html, pdf, css, ...)
 *
 * @param error : the usual nodejs error
 * @param result : the crawled resource
 *
 */
Crawler.prototype.crawl = function(error, result, callback) {

  var self = this;

  // if HTTP error, delegate to the plugins
  if (error) {
    this.pm.error(error, result, callback);
    return;
  }
  var $ = html.isHTML(result.body) ? html.$(result.body) : null;

  // Analyze the HTTP response in order to check the content (links, images, ...)
  // or apply a redirect
  async.parallel([
    async.apply(self.pm.crawl.bind(self.pm), result, $),
    async.apply(self.analyzeHTML.bind(self), result, $),
    async.apply(self.applyRedirect.bind(self), result)
  ], callback);

};


Crawler.prototype.applyRedirect = function(result, callback) {
  // if 30* & followRedirect = false => chain the 30* to the redirection target
  if (result.statusCode >= 300 && result.statusCode <= 399 && ! this.config.followRedirect) {

    var from = result.uri;
    var to = URI.linkToURI(from, result.headers["location"]);
    var self = this;
    this.pm.crawlRedirect(from, to, result.statusCode, function() {
      self.httpRequester.queue(self.buildNewOptions(result, to));
      callback();
    });
  }
  else {
    callback();
  }

};

/**
 * Analyze an HTML page. Mainly, find the a.href, link, script & image tags in the page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags. Null if the resource
 *            is not HTML
 */
Crawler.prototype.analyzeHTML = function(result, $, callback) {

  // if $ is not defined, this is not an HTML page with an http status 200
  if (! $) {
    return callback();
  }

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "Start check HTML code"});
  var self = this;

  async.parallel([

    async.apply(self.crawlHrefs.bind(self), result, $),
    async.apply(self.crawlLinks.bind(self), result, $),
    async.apply(self.crawlScripts.bind(self), result, $),
    async.apply(self.crawlImages.bind(self), result, $)

  ], callback);

};


/**
 * Crawl the urls found in the a.href tags of a page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags.
 *
 */
Crawler.prototype.crawlHrefs = function(result, $, endCallback) {

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "CrawlHrefs"});
  var self = this;
  async.each($('a'), function(a, callback) {
    self.crawlHref($, result, a, callback);
  }, endCallback);

};

Crawler.prototype.crawlHref = function($, result, a, callback) {

  var link = $(a).attr('href');
  var parentUri = result.uri;
  if (link) {

    var anchor = $(a).text() ? $(a).text() : "";
    var noFollow = $(a).attr("rel");
    var isDoFollow = ! (noFollow && noFollow === "nofollow");

    var linkUri = URI.linkToURI(parentUri, link);

    var self = this;
    this.pm.crawlLink(parentUri, linkUri, anchor, isDoFollow, function() {
      self.checkUrlToCrawl(result, parentUri, linkUri, anchor, isDoFollow, callback);
    });

  }
  else {
    callback();
  }

};


/**
 * Crawl the link tags found in the HTML page
 * eg. : <link rel="stylesheet" href="/css/bootstrap.min.css">
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags.
 */
Crawler.prototype.crawlLinks = function(result, $, endCallback) {

  if (! this.config.links) {
    return endCallback();
  }

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "CrawlLinks"});
  var self = this;

  async.each($('link'), function(linkTag, callback) {
    self.crawlLink($, result, linkTag, callback);
  }, endCallback);
};

Crawler.prototype.crawlLink = function($, result, linkTag, callback) {
  var link = $(linkTag).attr('href');
  var parentUri = result.uri;

  if (link) {

    var rel = $(linkTag).attr('rel');

    if (this.config.linkTypes.indexOf(rel) !== -1) {
      var linkUri = URI.linkToURI(parentUri, link);
      var self = this;
      this.pm.crawlLink(parentUri, linkUri, null, null, function() {
        self.checkUrlToCrawl(result, parentUri, linkUri, null, null, callback);
      });
    }
    else {
      callback();
    }
  }
  else {
    callback();
  }
};


/**
 * Crawl the script tags found in the HTML page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags.
 */
Crawler.prototype.crawlScripts = function(result, $, endCallback) {

  if (! this.config.scripts) {
    return endCallback();
  }

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "CrawlScripts"});
  var self = this;

  async.each($('script'), function(script, callback) {
    self.crawlScript($, result, script, callback);
  }, endCallback);
};

Crawler.prototype.crawlScript = function($, result, script, callback) {

  var link = $(script).attr('src');
  var parentUri = result.uri;

  if (link) {
    var linkUri = URI.linkToURI(parentUri, link);
    var self = this;
    this.pm.crawlLink(parentUri, linkUri, null, null, function() {
      self.checkUrlToCrawl(result, parentUri, linkUri, null, null, callback);
    });

  }
  else {
    callback();
  }

};


/**
 * Crawl the image tags found in the HTML page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags.
 */
Crawler.prototype.crawlImages = function(result, $, endCallback) {

  if (! this.config.images) {
    return endCallback();
  }

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "CrawlImages"});
  var self = this;

  async.each($('img'), function(img, callback) {
    self.crawlImage($, result, img, callback);
  }, endCallback);
};

Crawler.prototype.crawlImage = function($, result, img, callback) {
  var parentUri = result.uri;

  var link = $(img).attr('src');
  var alt = $(img).attr('alt');
  if (link) {
    var linkUri = URI.linkToURI(parentUri, link);
    var self = this;
    this.pm.crawlImage(parentUri, linkUri, alt, function() {
      self.checkUrlToCrawl(result, parentUri, linkUri, null, null, callback);
    });

  }
  else {
    callback();
  }
};

Crawler.prototype.checkUrlToCrawl = function(result, parentUri, linkUri, anchor, isDoFollow, endCallback) {
  var self = this;

  async.waterfall([
    function(callback) {

      self.updateDepth(parentUri, linkUri, function(error, currentDepth) {
        callback(error, currentDepth);
      });

    },
    function(currentDepth, callback) {
      self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri, anchor, isDoFollow, function(error, toCrawl) {
        if (error) {
          return callback(error);
        }
        if (toCrawl && (result.depthLimit == -1 || currentDepth <= result.depthLimit)) {
          self.httpRequester.queue(self.buildNewOptions(result, linkUri));
          callback();
        }
        else {
          self.pm.unCrawl(parentUri, linkUri, anchor, isDoFollow, callback);
        }

      });

    }
  ], endCallback);
};


/**
 * Check if a link has to be crawled
 *
 * @param result : the result of the crawled resource that contains the link
 * @param currentDepth : the crawl depth of the link
 * @param parentUri : the uri of the page that contains the link
 * @param link : the link url
 * @param anchor : the anchor text of the link
 * @param isDoFollow : true if the link is dofollow
 * @param callback(error, toCrawl)
 */
Crawler.prototype.isAGoodLinkToCrawl = function(result, currentDepth, parentUri, link, anchor, isDoFollow, callback) {

  store.getStore().isStartFromUrl(parentUri, link, function(error, startFrom) {

    // 1. Check if we need to crawl other hosts & domains
    if ((! startFrom.link.isStartFromHost && ! result.externalHosts) &&
        (! (! startFrom.link.isStartFromDomains && result.externalDomains))) {
      log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - no external host or domain"});
      return callback(null, false);
    }

    // 2. Check if we need to crawl only the first pages of external hosts/domains
    if (result.firstExternalLinkOnly && ((! startFrom.link.isStartFromHost) || (! startFrom.link.isStartFromDomains))) {

      if (! startFrom.parentUri.isStartFromHost) {
        log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - no external host or domain (not the first link)"});
        return callback(null, false);
      }
    }

    // 3. Check if the link is based on a valid protocol
    var protocol = URI.protocol(link);
    if (result.protocols.indexOf(protocol) < 0) {
      log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - no valid protocol : " + protocol});
      return callback(null, false);
    }

    // 4. Check if the domain is in the domain black-list
    if (result.domainBlackList.indexOf(URI.domainName(link)) !== -1) {
      log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - domain is blacklisted"});
      return callback(null, false);
    }

    // 5. Check if the suffix is in the suffix black-list
    var suffix = URI.suffix(link);
    if (result.suffixBlackList.indexOf(suffix) !== -1) {
      log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - suffix is blacklisted"});
      return callback(null, false);
    }

    // 6. Check if there is a custom rule in the crawler configuration
    if (! result.canCrawl) {
      log.info({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "URL can be crawled"});
      return callback(null, true);
    }
    // TODO : asynch this function ?
    var check = result.canCrawl(parentUri, link, anchor, isDoFollow);
    log.debug({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "method options.canCrawl has been called and returned " + check});
    return callback(null, check);

  });

};

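/*
 * Example : a custom canCrawl rule (sketch). As used in step 6 above, canCrawl can be provided
 * in the crawler config or per queued url; it is called synchronously with
 * (parentUri, link, anchor, isDoFollow) and returning false prevents the link from being queued.
 *
 *   crawler.queue({
 *     url : "http://www.example.com",
 *     canCrawl : function(parentUri, link, anchor, isDoFollow) {
 *       // hypothetical rule : follow only dofollow links that stay under /blog/
 *       return isDoFollow && link.indexOf("http://www.example.com/blog/") === 0;
 *     }
 *   });
 */
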
Crawler.prototype.registerPlugin = function(plugin) {
  this.pm.registerPlugin(plugin);
};

Crawler.prototype.unregisterPlugin = function(plugin) {
  this.pm.unregisterPlugin(plugin);
};

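/*
 * Example : registering a plugin (sketch). This file only shows which hooks are invoked on the
 * plugin manager (crawl, error, crawlLink, crawlImage, crawlRedirect, unCrawl); it is assumed
 * here that the manager forwards them to registered plugins with the same names and signatures.
 *
 *   var logPlugin = {
 *     crawl : function(result, $, callback) {
 *       console.log("Crawled : " + result.url + " - status : " + result.statusCode);
 *       callback();
 *     },
 *     error : function(error, result, callback) {
 *       console.error("Error on : " + result.url, error);
 *       callback();
 *     }
 *   };
 *
 *   crawler.registerPlugin(logPlugin);
 */
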
/**
 * Compute the crawl depth for a link as a function of the crawl depth
 * of the page that contains the link
 *
 * @param parentUri The URI of the page that contains the link
 * @param linkUri The link for which the crawl depth has to be calculated
 * @param callback(error, depth)
 *
 */
var updateDepth = function(parentUri, linkUri, callback) {

  var depths = {parentUri : parentUri, linkUri : linkUri, parentDepth : 0, linkDepth : 0};
  var execFns = async.seq(getDepths, calculateDepths, saveDepths);

  execFns(depths, function(error, result) {
    if (error) {
      return callback(error);
    }
    return callback(error, result.linkDepth);
  });

};

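/*
 * Example : overriding the depth computation (sketch). The constructor accepts an
 * updateDepth(parentUri, linkUri, callback) function in the config and uses it instead of the
 * default implementation above; the callback expects (error, depth).
 *
 *   var crawler = new ninja.Crawler({
 *     updateDepth : function(parentUri, linkUri, callback) {
 *       // hypothetical flat strategy : consider every discovered link to be at depth 1
 *       callback(null, 1);
 *     }
 *   });
 */
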
/**
 * Get the crawl depths for a parent & link uri
 *
 * @param depths a structure containing both urls
 *        {parentUri : parentUri, linkUri : linkUri}
 * @param callback(error, depths)
 */
var getDepths = function(depths, callback) {

  async.parallel([
    async.apply(store.getStore().getDepth.bind(store.getStore()), depths.parentUri),
    async.apply(store.getStore().getDepth.bind(store.getStore()), depths.linkUri)
  ],
  function(error, results) {
    if (error) {
      return callback(error);
    }
    depths.parentDepth = results[0];
    depths.linkDepth = results[1];
    callback(null, depths);
  });
};

/**
 * Calculate the depth of the link
 *
 * @param depths a structure containing both urls & their current depths
 *        {parentUri : parentUri, linkUri : linkUri, parentDepth : parentDepth, linkDepth : linkDepth}
 * @param callback(error, depths)
 */
var calculateDepths = function(depths, callback) {
  if (depths.parentDepth) {
    // if the link has no depth yet : assign parentDepth + 1
    // otherwise the link has already been found in the past => don't update its depth
    if (! depths.linkDepth) {
      depths.linkDepth = depths.parentDepth + 1;
    }
  }
  else {
    depths.parentDepth = 0;
    depths.linkDepth = 1;
  }
  callback(null, depths);
};

/**
 * Save the crawl depths for a parent & link uri
 *
 * @param depths a structure containing both urls & their depths
 *        {parentUri : parentUri, linkUri : linkUri, parentDepth : parentDepth, linkDepth : linkDepth}
 * @param callback(error, depths)
 */
var saveDepths = function(depths, callback) {

  async.parallel([
    async.apply(store.getStore().setDepth.bind(store.getStore()), depths.parentUri, depths.parentDepth),
    async.apply(store.getStore().setDepth.bind(store.getStore()), depths.linkUri, depths.linkDepth)
  ],
  function(error) {
    callback(error, depths);
  });
};

module.exports.Crawler = Crawler;