var events = require('events');
var timers = require('timers');
var util = require("util");
var _ = require("underscore");
var async = require('async');
var log = require("crawler-ninja-logger").Logger;
var Map = require("collections/fast-map");
var Set = require("collections/fast-set");
var requester = require("./lib/queue-requester");
var URI = require('./lib/uri.js');
var html = require("./lib/html.js");
var store = require("./lib/store/store.js");
var pm = require("./lib/plugin-manager.js");


var domainBlackList = require("./default-lists/domain-black-list.js").list();
var suffixBlackList = require("./default-lists/suffix-black-list.js").list();


var DEFAULT_NUMBER_OF_CONNECTIONS = 30;
var DEFAULT_DEPTH_LIMIT = -1; // no limit
var DEFAULT_TIME_OUT = 20000;
var DEFAULT_RETRIES = 3;
var DEFAULT_RETRY_TIMEOUT = 10000;
var DEFAULT_SKIP_DUPLICATES = true;
var DEFAULT_RATE_LIMITS = 0;
var DEFAULT_MAX_ERRORS = 5;
var DEFAULT_ERROR_RATES = [200, 350, 500];

var DEFAULT_FIRST_EXTERNAL_LINK_ONLY = false;
var DEFAULT_CRAWL_EXTERNAL_DOMAINS = false;
var DEFAULT_CRAWL_EXTERNAL_HOSTS = false;
var DEFAULT_CRAWL_SCRIPTS = true;  // Crawl <script>
var DEFAULT_CRAWL_LINKS = true;    // Crawl <link>
var DEFAULT_CRAWL_IMAGES = true;

var DEFAULT_PROTOCOLS_TO_CRAWL = ["http", "https"];
var DEFAULT_FOLLOW_301 = false;

var DEFAULT_LINKS_TYPES = ["canonical", "stylesheet"];
var DEFAULT_USER_AGENT = "NinjaBot";
var DEFAULT_CACHE = false;
var DEFAULT_METHOD = 'GET';
var DEFAULT_REFERER = false;

var DEFAULT_STORE_MODULE = "./memory-store.js";

/**
 * The crawler object
 *
 * @param config used to customize the crawler.
 *
 * The current config attributes are :
 * - maxConnections : the number of connections used to crawl - default is 30
 * - externalDomains : if true, crawl the external domains. This option can follow a large number of linked domains, default = false
 * - externalHosts : if true, crawl the other hosts of the same domain, default = false
 * - firstExternalLinkOnly : crawl only the first link found for external domains/hosts. externalHosts or externalDomains must be true
 * - scripts : if true, crawl the script tags
 * - links : if true, crawl the link tags
 * - linkTypes : the types of link tags to crawl (matched against the rel attribute), default : ["canonical", "stylesheet"]
 * - images : if true, crawl the images
 * - protocols : list of the protocols to crawl, default = ["http", "https"]
 * - timeout : timeout per request in milliseconds (default 20000)
 * - retries : number of retries if the request fails (default 3)
 * - retryTimeout : number of milliseconds to wait before retrying (default 10000)
 * - maxErrors : number of timeout errors before changing the crawl rate, default is 5
 * - errorRates : list of rates to use when too many timeout errors occur
 * - skipDuplicates : if true, skip URIs that were already crawled - default is true
 * - rateLimits : number of milliseconds to delay between each request (default 0).
 *   Note that this option will force the crawler to use only one connection
 * - depthLimit : the depth limit for the crawl
 * - followRedirect : if true, the crawl will not return the 301, it will directly follow the redirection
 * - proxyList : the list of proxies (see the project simple-proxies on npm)
 *
 * + all options provided by the nodejs request module : https://github.com/request/request
 */
function Crawler(config) {

  // Default config
  this.config = this.createDefaultConfig();

  // Merge the default config values & the overridden values provided by the config arg
  if (config) {
    _.extend(this.config, config);
  }

  // if using rateLimits, we want to use only one connection with a delay between requests
  if (this.config.rateLimits !== 0) {
    this.config.maxConnections = 1;
  }

  // create the crawl store
  store.createStore(this.config.storeModuleName, this.config.storeParams ? this.config.storeParams : null);

  // assign the default updateDepth method used to calculate the crawl depth
  this.updateDepth = updateDepth;

  // If the config object contains a new implementation of the updateDepth method
  if (this.config.updateDepth) {
    this.updateDepth = this.config.updateDepth;
  }

  this.pm = new pm.PluginManager();

  this.httpRequester = new requester.Requester(this.config);

  events.EventEmitter.call(this);

}

util.inherits(Crawler, events.EventEmitter);

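/*
 * Usage example (a minimal sketch, not executed by the module). It assumes this file is the
 * entry point of the "crawler-ninja" package; the urls and option values are placeholders.
 *
 *   var ninja = require("crawler-ninja");
 *
 *   var crawler = new ninja.Crawler({
 *     maxConnections : 10,   // override the default of 30 connections
 *     depthLimit : 3,        // don't follow links deeper than 3 levels
 *     rateLimits : 500       // 500 ms between requests (forces a single connection)
 *   });
 *
 *   // the 'end' event is emitted by the default onDrain handler when the crawl is finished
 *   crawler.on("end", function() {
 *     console.log("Crawl done");
 *   });
 *
 *   crawler.queue("http://www.example.com");
 */
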
/**
 * Add one or more urls to crawl
 *
 * @param options the url(s) to crawl : a string, an array or an options object
 *
 */
Crawler.prototype.queue = function(options) {

  var self = this;

  // Error if no options
  if (! options) {
    if (self.config.onCrawl) {
      self.config.onCrawl({errorCode : "NO_OPTIONS"}, {method : "GET", url : "unknown", proxy : "", error : true},
        function(error) {
          if (self.httpRequester.idle()) {
            self.config.onDrain();
          }
        });
    }
    return;
  }

  // if Array => recall this method for each element
  if (_.isArray(options)) {
    options.forEach(function(opt) {
      self.queue(opt);
    });

    return;
  }

  // if String, we expect to receive a url
  if (_.isString(options)) {
    store.getStore().addStartUrl(options, function(error) {
      self.httpRequester.queue(addDefaultOptions({uri : options, url : options}, self.config));
    });
  }
  // Last possibility, this is a JSON object
  else {

    if (! _.has(options, "url") && ! _.has(options, "uri")) {
      if (self.config.onCrawl) {
        self.config.onCrawl({errorCode : "NO_URL_OPTION"}, {method : "GET", url : "unknown", proxy : "", error : true},
          function(error) {
            if (self.httpRequester.idle()) {
              self.config.onDrain();
            }
          });
      }
    }
    else {
      store.getStore().addStartUrl(_.has(options, "url") ? options.url : options.uri, function(error) {
        self.httpRequester.queue(addDefaultOptions(options, self.config));
      });
    }
  }

};

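/*
 * Examples of the queue() call forms handled above (a sketch; "crawler" is a Crawler instance
 * and the urls are placeholders) :
 *
 *   // a single url
 *   crawler.queue("http://www.example.com");
 *
 *   // an array of urls : queue() recalls itself for each element
 *   crawler.queue(["http://www.example.com", "http://blog.example.com"]);
 *
 *   // an options object : must contain url or uri; the other attributes
 *   // override the crawler config for this request and the links found from it
 *   crawler.queue({url : "http://www.example.com", depthLimit : 2, userAgent : "MyBot"});
 */
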
/**
 * Add the default crawler options into the options used for the current request
 *
 * @param options the options used for the current request
 * @param defaultOptions the default crawler options
 * @return the merged options
 */
function addDefaultOptions(options, defaultOptions) {

  _.defaults(options, defaultOptions);
  options.maxRetries = options.retries;

  return options;

}

/**
 * Make a copy of an options object for a specific url
 *
 * @param options the options object to copy
 * @param newUrl the url to apply to the new options object
 * @return the new options object
 */
Crawler.prototype.buildNewOptions = function(options, newUrl) {

  var o = this.createDefaultConfig(newUrl);

  // Copy only the attributes that are present in the options used for the previous request
  // Could be more simple ? ;-)
  o = _.extend(o, _.pick(options, _.without(_.keys(o), "url", "uri")));

  // Reset the settings used for retries when an error such as a timeout occurs
  o.maxRetries = o.retries;
  o.depthLimit = options.depthLimit;

  if (options.canCrawl) {
    o.canCrawl = options.canCrawl;
  }
  return o;

};


/**
 * Create the default crawler config
 *
 * @param url the start url to assign to the config (optional)
 * @returns the config object
 */
Crawler.prototype.createDefaultConfig = function(url) {
  var self = this;
  var config = {

    cache : DEFAULT_CACHE,
    method : DEFAULT_METHOD,
    referer : DEFAULT_REFERER,
    maxConnections : DEFAULT_NUMBER_OF_CONNECTIONS,
    timeout : DEFAULT_TIME_OUT,
    retries : DEFAULT_RETRIES,
    maxRetries : DEFAULT_RETRIES,
    retryTimeout : DEFAULT_RETRY_TIMEOUT,
    maxErrors : DEFAULT_MAX_ERRORS,
    errorRates : DEFAULT_ERROR_RATES,
    skipDuplicates : DEFAULT_SKIP_DUPLICATES,
    rateLimits : DEFAULT_RATE_LIMITS,
    externalDomains : DEFAULT_CRAWL_EXTERNAL_DOMAINS,
    externalHosts : DEFAULT_CRAWL_EXTERNAL_HOSTS,
    firstExternalLinkOnly : DEFAULT_FIRST_EXTERNAL_LINK_ONLY,
    protocols : DEFAULT_PROTOCOLS_TO_CRAWL,
    depthLimit : DEFAULT_DEPTH_LIMIT,
    followRedirect : DEFAULT_FOLLOW_301,
    images : DEFAULT_CRAWL_IMAGES,
    links : DEFAULT_CRAWL_LINKS,
    linkTypes : DEFAULT_LINKS_TYPES,
    scripts : DEFAULT_CRAWL_SCRIPTS,
    userAgent : DEFAULT_USER_AGENT,
    domainBlackList : domainBlackList,
    suffixBlackList : suffixBlackList,
    storeModuleName : DEFAULT_STORE_MODULE,

    onCrawl : function(error, result, callback) {
      self.crawl(error, result, callback);
    },

    onDrain : function() {
      timers.setImmediate(function() {
        log.debug({"step" : "onDrain", "message" : "End of the crawl"});
        self.emit('end');
      });
    }

  };

  if (url) {
    config.url = url;
    config.uri = url;
  }

  return config;

};

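/*
 * Example : using another crawl store (a sketch, based only on the createStore call in the
 * constructor). storeModuleName & storeParams are forwarded as-is to store.createStore();
 * "./my-redis-store.js" and its params are hypothetical - any module implementing the same
 * store interface as the default "./memory-store.js" could be plugged in this way.
 *
 *   var crawler = new ninja.Crawler({
 *     storeModuleName : "./my-redis-store.js",
 *     storeParams : {host : "127.0.0.1", port : 6379}
 *   });
 */
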
/**
 * Default callback function used when the http queue requester gets a resource (html, pdf, css, ...)
 *
 * @param error : the usual nodejs error
 * @param result : the crawled resource
 *
 */
Crawler.prototype.crawl = function(error, result, callback) {

  var self = this;

  // if HTTP error, delegate to the plugins
  if (error) {
    this.pm.error(error, result, callback);
    return;
  }
  var $ = html.isHTML(result.body) ? html.$(result.body) : null;

  // Analyze the HTTP response in order to check the content (links, images, ...)
  // or apply a redirect
  async.parallel([
    async.apply(self.pm.crawl.bind(self.pm), result, $),
    async.apply(self.analyzeHTML.bind(self), result, $),
    async.apply(self.applyRedirect.bind(self), result)
  ], callback);

};


Crawler.prototype.applyRedirect = function(result, callback) {
  // if 30* & followRedirect = false => chain the 30* to the redirection target
  if (result.statusCode >= 300 && result.statusCode <= 399 && ! this.config.followRedirect) {

    var from = result.uri;
    var to = URI.linkToURI(from, result.headers["location"]);
    var self = this;
    this.pm.crawlRedirect(from, to, result.statusCode, function() {
      self.httpRequester.queue(self.buildNewOptions(result, to));
      callback();
    });
  }
  else {
    callback();
  }

};

/**
 * Analyze an HTML page. Mainly, find the a.href, link, script & image tags in the page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags. Null if the resource
 *            is not HTML
 */
Crawler.prototype.analyzeHTML = function(result, $, callback) {

  // if $ is not defined, this is not an HTML page with an http status 200
  if (! $) {
    return callback();
  }

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "Start check HTML code"});
  var self = this;

  async.parallel([

    async.apply(self.crawlHrefs.bind(self), result, $),
    async.apply(self.crawlLinks.bind(self), result, $),
    async.apply(self.crawlScripts.bind(self), result, $),
    async.apply(self.crawlImages.bind(self), result, $)

  ], callback);

};


/**
 * Crawl the urls found in the a.href tags of a page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags.
 *
 */
Crawler.prototype.crawlHrefs = function(result, $, endCallback) {

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "CrawlHrefs"});
  var self = this;
  async.each($('a'), function(a, callback) {
    self.crawlHref($, result, a, callback);
  }, endCallback);

};

Crawler.prototype.crawlHref = function($, result, a, callback) {

  var link = $(a).attr('href');
  var parentUri = result.uri;
  if (link) {

    var anchor = $(a).text() ? $(a).text() : "";
    var noFollow = $(a).attr("rel");
    var isDoFollow = ! (noFollow && noFollow === "nofollow");

    var linkUri = URI.linkToURI(parentUri, link);

    var self = this;
    this.pm.crawlLink(parentUri, linkUri, anchor, isDoFollow, function() {
      self.checkUrlToCrawl(result, parentUri, linkUri, anchor, isDoFollow, callback);
    });

  }
  else {
    callback();
  }

};


/**
 * Crawl the link tags found in the HTML page
 * eg. : <link rel="stylesheet" href="/css/bootstrap.min.css">
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags.
 */
Crawler.prototype.crawlLinks = function(result, $, endCallback) {

  if (! this.config.links) {
    return endCallback();
  }

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "CrawlLinks"});
  var self = this;

  async.each($('link'), function(linkTag, callback) {
    self.crawlLink($, result, linkTag, callback);
  }, endCallback);
};

Crawler.prototype.crawlLink = function($, result, linkTag, callback) {
  var link = $(linkTag).attr('href');
  var parentUri = result.uri;

  if (link) {

    var rel = $(linkTag).attr('rel');

    if (this.config.linkTypes.indexOf(rel) !== -1) {
      var linkUri = URI.linkToURI(parentUri, link);
      var self = this;
      this.pm.crawlLink(parentUri, linkUri, null, null, function() {
        self.checkUrlToCrawl(result, parentUri, linkUri, null, null, callback);
      });
    }
    else {
      callback();
    }
  }
  else {
    callback();
  }
};


/**
 * Crawl the script tags found in the HTML page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags.
 */
Crawler.prototype.crawlScripts = function(result, $, endCallback) {

  if (! this.config.scripts) {
    return endCallback();
  }

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "CrawlScripts"});
  var self = this;

  async.each($('script'), function(script, callback) {
    self.crawlScript($, result, script, callback);
  }, endCallback);
};

Crawler.prototype.crawlScript = function($, result, script, callback) {

  var link = $(script).attr('src');
  var parentUri = result.uri;

  if (link) {
    var linkUri = URI.linkToURI(parentUri, link);
    var self = this;
    this.pm.crawlLink(parentUri, linkUri, null, null, function() {
      self.checkUrlToCrawl(result, parentUri, linkUri, null, null, callback);
    });

  }
  else {
    callback();
  }

};


/**
 * Crawl the image tags found in the HTML page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags.
 */
Crawler.prototype.crawlImages = function(result, $, endCallback) {

  if (! this.config.images) {
    return endCallback();
  }

  log.debug({"url" : result.url, "step" : "analyzeHTML", "message" : "CrawlImages"});
  var self = this;

  async.each($('img'), function(img, callback) {
    self.crawlImage($, result, img, callback);
  }, endCallback);
};

Crawler.prototype.crawlImage = function($, result, img, callback) {
  var parentUri = result.uri;

  var link = $(img).attr('src');
  var alt = $(img).attr('alt');
  if (link) {
    var linkUri = URI.linkToURI(parentUri, link);
    var self = this;
    this.pm.crawlImage(parentUri, linkUri, alt, function() {
      self.checkUrlToCrawl(result, parentUri, linkUri, null, null, callback);
    });

  }
  else {
    callback();
  }
};

Crawler.prototype.checkUrlToCrawl = function(result, parentUri, linkUri, anchor, isDoFollow, endCallback) {
  var self = this;

  async.waterfall([
    function(callback) {

      self.updateDepth(parentUri, linkUri, function(error, currentDepth) {
        callback(error, currentDepth);
      });

    },
    function(currentDepth, callback) {
      self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri, anchor, isDoFollow, function(error, toCrawl) {
        if (error) {
          return callback(error);
        }
        if (toCrawl && (result.depthLimit == -1 || currentDepth <= result.depthLimit)) {
          self.httpRequester.queue(self.buildNewOptions(result, linkUri));
          callback();
        }
        else {
          self.pm.unCrawl(parentUri, linkUri, anchor, isDoFollow, callback);
        }

      });

    }
  ], endCallback);
};


/**
 * Check if a link has to be crawled
 *
 * @param result : the result of the crawled resource that contains the link
 * @param currentDepth : the crawl depth of the link
 * @param parentUri : the uri of the page that contains the link
 * @param link : the link url
 * @param anchor : the anchor text of the link
 * @param isDoFollow : true if the link is dofollow
 * @param callback(error, toCrawl)
 */
Crawler.prototype.isAGoodLinkToCrawl = function(result, currentDepth, parentUri, link, anchor, isDoFollow, callback) {

  store.getStore().isStartFromUrl(parentUri, link, function(error, startFrom) {

    // 1. Check if we need to crawl other hosts & domains
    if ((! startFrom.link.isStartFromHost && ! result.externalHosts) &&
        (! (! startFrom.link.isStartFromDomains && result.externalDomains))) {
      log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - no external host or domain"});
      return callback(null, false);
    }

    // 2. Check if we need to crawl only the first pages of external hosts/domains
    if (result.firstExternalLinkOnly && ((! startFrom.link.isStartFromHost) || (! startFrom.link.isStartFromDomains))) {

      if (! startFrom.parentUri.isStartFromHost) {
        log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - no external host or domain (not the first link)"});
        return callback(null, false);
      }
    }

    // 3. Check if the link is based on a valid protocol
    var protocol = URI.protocol(link);
    if (result.protocols.indexOf(protocol) < 0) {
      log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - no valid protocol : " + protocol});
      return callback(null, false);
    }

    // 4. Check if the domain is in the domain black-list
    if (result.domainBlackList.indexOf(URI.domainName(link)) !== -1) {
      log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - domain is blacklisted"});
      return callback(null, false);
    }

    // 5. Check if the suffix is in the suffix black-list
    var suffix = URI.suffix(link);
    if (result.suffixBlackList.indexOf(suffix) !== -1) {
      log.warn({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "Don't crawl url - suffix is blacklisted"});
      return callback(null, false);
    }

    // 6. Check if there is a custom rule in the crawler configuration
    if (! result.canCrawl) {
      log.info({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "URL can be crawled"});
      return callback(null, true);
    }
    // TODO : asynch this function ?
    var check = result.canCrawl(parentUri, link, anchor, isDoFollow);
    log.debug({"url" : link, "step" : "isAGoodLinkToCrawl", "message" : "method options.canCrawl has been called and returned " + check});
    return callback(null, check);

  });

};

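/*
 * Example : a custom canCrawl rule (sketch). As used in step 6 above, canCrawl can be provided
 * in the crawler config or per queued url; it is called synchronously with
 * (parentUri, link, anchor, isDoFollow) and returning false prevents the link from being queued.
 *
 *   crawler.queue({
 *     url : "http://www.example.com",
 *     canCrawl : function(parentUri, link, anchor, isDoFollow) {
 *       // hypothetical rule : follow only dofollow links that stay under /blog/
 *       return isDoFollow && link.indexOf("http://www.example.com/blog/") === 0;
 *     }
 *   });
 */
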
Crawler.prototype.registerPlugin = function(plugin) {
  this.pm.registerPlugin(plugin);
};

Crawler.prototype.unregisterPlugin = function(plugin) {
  this.pm.unregisterPlugin(plugin);
};

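/*
 * Example : registering a plugin (sketch). This file only shows which hooks are invoked on the
 * plugin manager (crawl, error, crawlLink, crawlImage, crawlRedirect, unCrawl); it is assumed
 * here that the manager forwards them to registered plugins with the same names and signatures.
 *
 *   var logPlugin = {
 *     crawl : function(result, $, callback) {
 *       console.log("Crawled : " + result.url + " - status : " + result.statusCode);
 *       callback();
 *     },
 *     error : function(error, result, callback) {
 *       console.error("Error on : " + result.url, error);
 *       callback();
 *     }
 *   };
 *
 *   crawler.registerPlugin(logPlugin);
 */
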
/**
 * Compute the crawl depth for a link as a function of the crawl depth
 * of the page that contains the link
 *
 * @param parentUri The URI of the page that contains the link
 * @param linkUri The link for which the crawl depth has to be calculated
 * @param callback(error, depth)
 *
 */
var updateDepth = function(parentUri, linkUri, callback) {

  var depths = {parentUri : parentUri, linkUri : linkUri, parentDepth : 0, linkDepth : 0};
  var execFns = async.seq(getDepths, calculateDepths, saveDepths);

  execFns(depths, function(error, result) {
    if (error) {
      return callback(error);
    }
    return callback(error, result.linkDepth);
  });

};

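/*
 * Example : overriding the depth computation (sketch). The constructor accepts an
 * updateDepth(parentUri, linkUri, callback) function in the config and uses it instead of the
 * default implementation above; the callback expects (error, depth).
 *
 *   var crawler = new ninja.Crawler({
 *     updateDepth : function(parentUri, linkUri, callback) {
 *       // hypothetical flat strategy : consider every discovered link to be at depth 1
 *       callback(null, 1);
 *     }
 *   });
 */
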
/**
 * Get the crawl depths for a parent & link uri
 *
 * @param depths a structure containing both urls
 *        {parentUri : parentUri, linkUri : linkUri}
 * @param callback(error, depths)
 */
var getDepths = function(depths, callback) {

  async.parallel([
    async.apply(store.getStore().getDepth.bind(store.getStore()), depths.parentUri),
    async.apply(store.getStore().getDepth.bind(store.getStore()), depths.linkUri)
  ],
  function(error, results) {
    if (error) {
      return callback(error);
    }
    depths.parentDepth = results[0];
    depths.linkDepth = results[1];
    callback(null, depths);
  });
};

/**
 * Calculate the depth of the link
 *
 * @param depths a structure containing both urls & their current depths
 *        {parentUri : parentUri, linkUri : linkUri, parentDepth : parentDepth, linkDepth : linkDepth}
 * @param callback(error, depths)
 */
var calculateDepths = function(depths, callback) {
  if (depths.parentDepth) {
    // if the link has no depth yet : assign parentDepth + 1
    // otherwise the link has already been found in the past => don't update its depth
    if (! depths.linkDepth) {
      depths.linkDepth = depths.parentDepth + 1;
    }
  }
  else {
    depths.parentDepth = 0;
    depths.linkDepth = 1;
  }
  callback(null, depths);
};

/**
 * Save the crawl depths for a parent & link uri
 *
 * @param depths a structure containing both urls & their depths
 *        {parentUri : parentUri, linkUri : linkUri, parentDepth : parentDepth, linkDepth : linkDepth}
 * @param callback(error, depths)
 */
var saveDepths = function(depths, callback) {

  async.parallel([
    async.apply(store.getStore().setDepth.bind(store.getStore()), depths.parentUri, depths.parentDepth),
    async.apply(store.getStore().setDepth.bind(store.getStore()), depths.linkUri, depths.linkDepth)
  ],
  function(error) {
    callback(error, depths);
  });
};

module.exports.Crawler = Crawler;