var events = require('events');
var timers = require('timers');
var util = require("util");
var _ = require("underscore");
var requester = require("./lib/queue-requester");
var URI = require('./lib/uri.js');
var Map = require("collections/fast-map");
var Set = require("collections/fast-set");
var html = require("./lib/html.js");
var domainBlackList = require("./default-lists/domain-black-list.js").list();

var DEFAULT_NUMBER_OF_CONNECTIONS = 10;
var DEFAULT_DEPTH_LIMIT = -1; // no limit
var DEFAULT_TIME_OUT = 20000;
var DEFAULT_RETRIES = 3;
var DEFAULT_RETRY_TIMEOUT = 10000;
var DEFAULT_SKIP_DUPLICATES = true;
var DEFAULT_RATE_LIMITS = 0;
var DEFAULT_MAX_ERRORS = 5;
var DEFAULT_ERROR_RATES = [200, 350, 500];

var DEFAULT_CRAWL_EXTERNAL_LINKS = false;
var DEFAULT_CRAWL_EXTERNAL_DOMAINS = false;
var DEFAULT_CRAWL_EXTERNAL_HOSTS = false;
var DEFAULT_CRAWL_SCRIPTS = true; // Crawl <script>
var DEFAULT_CRAWL_LINKS = true;   // Crawl <link>
var DEFAULT_CRAWL_IMAGES = true;

var DEFAULT_PROTOCOLS_TO_CRAWL = ["http", "https"];
var DEFAULT_FOLLOW_301 = false;

var DEFAULT_LINKS_TYPES = ["canonical", "stylesheet"];
var DEFAULT_USER_AGENT = "NinjaBot";
var DEFAULT_CACHE = false;
var DEFAULT_METHOD = 'GET';
var DEFAULT_REFERER = false;

/**
 * The crawler object
 *
 * @param config used to customize the crawler.
 *
 * The supported config attributes are :
 * - maxConnections  : the number of connections used to crawl - default is 10
 * - externalLinks   : if true, crawl external links
 * - externalDomains : if true, crawl the complete external domains. This option can lead to crawling a lot of different domains
 * - scripts         : if true, crawl script tags
 * - links           : if true, crawl link tags
 * - linkTypes       : the types of link tags to crawl (matched against the rel attribute), default : ["canonical", "stylesheet"]
 * - images          : if true, crawl images
 * - protocols       : list of the protocols to crawl, default = ["http", "https"]
 * - timeout         : timeout per request in milliseconds (default 20000)
 * - retries         : number of retries if the request fails (default 3)
 * - retryTimeout    : number of milliseconds to wait before retrying (default 10000)
 * - maxErrors       : number of timeout errors before changing the crawl rate, default is 5
 * - errorRates      : list of rates to use when too many timeout errors occur
 * - skipDuplicates  : if true, skip URIs that were already crawled - default is true
 * - rateLimits      : number of milliseconds to delay between each request (default 0).
 *                     Note that this option forces the crawler to use only one connection
 * - depthLimit      : the depth limit for the crawl
 * - followRedirect  : if true, the crawl will not return the 301; it will directly follow the redirection
 * - proxyList       : the list of proxies (see the project simple-proxies on npm)
 *
 * + all options provided by the nodejs request module : https://github.com/request/request
 */
function Crawler(config) {

    // Store the depth for each crawled url
    // Override config.updateDepth in order to use another storage
    // This default implementation is not recommended for big crawls
    // TODO : use an external store
    this.depthUrls = new Map();

    // List of the hosts from which the crawl starts
    this.startFromHosts = new Set();

    // List of the domains from which the crawl starts
    this.startFromDomains = new Set();

    // Default config
    this.config = this.createDefaultConfig();

    // Merge default config values & overridden values provided by the arg config
    if (config) {
        _.extend(this.config, config);
    }

    // If rateLimits is used, we want to use only one connection with a delay between requests
    if (this.config.rateLimits !== 0) {
        this.config.maxConnections = 1;
    }

    // Assign the default updateDepth method used to calculate the crawl depth
    this.updateDepth = updateDepth;

    // If the config object contains a new implementation of the updateDepth method
    if (this.config.updateDepth) {
        this.updateDepth = this.config.updateDepth;
    }

    this.httpRequester = new requester.Requester(this.config);

    events.EventEmitter.call(this);
}

util.inherits(Crawler, events.EventEmitter);
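
// Illustrative usage (not part of the module) : a minimal sketch of how this
// crawler is typically instantiated and driven, assuming this file is the
// package entry point (the require path is an assumption). The event names
// ("crawl", "error", "end") match those emitted further down in this file.
//
//   var Crawler = require("./index.js").Crawler;
//   var crawler = new Crawler({ maxConnections : 5, images : false, depthLimit : 2 });
//
//   crawler.on("crawl", function(result, $) {
//       console.log("Crawled : " + result.url);
//   });
//
//   crawler.on("error", function(error, result) {
//       console.log("Error on " + result.url + " : " + error.errorCode);
//   });
//
//   crawler.on("end", function() {
//       console.log("Crawl finished");
//   });
//
//   crawler.queue("http://www.example.com");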


/**
 * Add one or more urls to crawl
 *
 * @param options : a url (string), an array of urls/options, or an options object containing at least a url or uri attribute
 *
 */
Crawler.prototype.queue = function(options) {

    var self = this;

    // Error if no options
    if (! options) {
        if (self.config.onCrawl) {
            self.config.onCrawl({errorCode : "NO_OPTIONS"}, {method : "GET", url : "unknown", proxy : "", error : true});
        }

        if (this.httpRequester.idle()) {
            self.config.onDrain();
        }
        return;
    }

    // If options is an array => call this method recursively for each element
    if (_.isArray(options)) {
        options.forEach(function(opt) {
            self.queue(opt);
        });

        return;
    }

    // If options is a string, we expect it to be a url
    if (_.isString(options)) {
        this.startFromHosts.add(URI.host(options));
        this.startFromDomains.add(URI.domain(options));
        this.httpRequester.queue(this.addDefaultOptions({uri : options, url : options}, this.config));
    }
    // Otherwise, this should be a json object containing the url/uri and request options
    else {

        if (! _.has(options, "url") && ! _.has(options, "uri")) {
            if (self.config.onCrawl) {
                self.config.onCrawl({errorCode : "NO_URL_OPTION"}, {method : "GET", url : "unknown", proxy : "", error : true});
            }

            if (this.httpRequester.idle()) {
                self.config.onDrain();
            }
        }
        else {
            this.startFromHosts.add(URI.host(_.has(options, "url") ? options.url : options.uri));
            this.startFromDomains.add(URI.domain(_.has(options, "url") ? options.url : options.uri));
            this.httpRequester.queue(this.addDefaultOptions(options, this.config));
        }
    }

};
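
// Illustrative sketch (not part of the module) : the three input forms queue()
// accepts, based on the type checks above. The urls are placeholders.
//
//   crawler.queue("http://www.example.com");                                 // a single url string
//   crawler.queue(["http://a.example.com", "http://b.example.com"]);        // an array of urls
//   crawler.queue({ url : "http://www.example.com", timeout : 5000 });      // an options object with at least url or uri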

Crawler.prototype.addDefaultOptions = function(options, defaultOptions) {

    _.defaults(options, defaultOptions);
    options.maxRetries = options.retries;
    return options;

};

Crawler.prototype.buildNewOptions = function(options, newUrl) {

    var o = this.createDefaultConfig(newUrl);

    // Copy only the option attributes that were present in the options used for the previous request
    // Could probably be simpler ;-)
    o = _.extend(o, _.pick(options, _.without(_.keys(o), "url", "uri")));

    // Reset the setting used for retries when an error such as a timeout occurs
    o.maxRetries = o.retries;

    if (options.canCrawl) {
        o.canCrawl = options.canCrawl;
    }
    return o;

};


/**
 * Default crawler config
 *
 * @returns the config object
 */
Crawler.prototype.createDefaultConfig = function(url) {
    var self = this;
    var config = {

        cache           : DEFAULT_CACHE,
        method          : DEFAULT_METHOD,
        referer         : DEFAULT_REFERER,
        maxConnections  : DEFAULT_NUMBER_OF_CONNECTIONS,
        timeout         : DEFAULT_TIME_OUT,
        retries         : DEFAULT_RETRIES,
        maxRetries      : DEFAULT_RETRIES,
        retryTimeout    : DEFAULT_RETRY_TIMEOUT,
        maxErrors       : DEFAULT_MAX_ERRORS,
        errorRates      : DEFAULT_ERROR_RATES,
        skipDuplicates  : DEFAULT_SKIP_DUPLICATES,
        rateLimits      : DEFAULT_RATE_LIMITS,
        externalLinks   : DEFAULT_CRAWL_EXTERNAL_LINKS,
        externalDomains : DEFAULT_CRAWL_EXTERNAL_DOMAINS,
        externalHosts   : DEFAULT_CRAWL_EXTERNAL_HOSTS,
        protocols       : DEFAULT_PROTOCOLS_TO_CRAWL,
        depthLimit      : DEFAULT_DEPTH_LIMIT,
        followRedirect  : DEFAULT_FOLLOW_301,
        images          : DEFAULT_CRAWL_IMAGES,
        links           : DEFAULT_CRAWL_LINKS,
        linkTypes       : DEFAULT_LINKS_TYPES,
        scripts         : DEFAULT_CRAWL_SCRIPTS,
        userAgent       : DEFAULT_USER_AGENT,
        domainBlackList : domainBlackList,

        onCrawl : function(error, result) {
            self.crawl(error, result);
        },

        onDrain : function() {
            timers.setImmediate(function() {
                self.emit('end');
            });
        }

    };

    if (url) {
        config.url = url;
        config.uri = url;
    }

    return config;

};

/**
 * Default callback function used when the http queue requester gets a resource (html, pdf, css, ...)
 *
 * @param error : the usual nodejs error
 * @param result : the result of the resource crawl
 */
Crawler.prototype.crawl = function (error, result) {

    var self = this;
    if (error) {
        //console.log(error);
        timers.setImmediate(emitErrorEvent, self, error, result);
        return;
    }

    var $ = html.isHTML(result.body) ? html.$(result.body) : null;

    timers.setImmediate(emitCrawlEvent, self, result, $);

    // If $ is defined, this is an HTML page with an http status 200
    if ($) {
        this.analyzeHTML(result, $);
    }

    // If 30x & followRedirect = false => emit the redirect event & queue the redirect target
    if (result.statusCode >= 300 && result.statusCode <= 399 && ! this.config.followRedirect) {

        var from = result.uri;
        var to = URI.linkToURI(from, result.headers["location"]);
        timers.setImmediate(emitRedirectEvent, self, from, to, result.statusCode);

        this.httpRequester.queue(this.buildNewOptions(result, to));
    }
};
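
// Illustrative sketch (not part of the module) : with the default
// followRedirect : false, 30x responses are surfaced through the
// "crawlRedirect" event emitted above, and the redirect target is queued.
//
//   crawler.on("crawlRedirect", function(from, to, statusCode) {
//       console.log(statusCode + " redirect : " + from + " -> " + to);
//   });
//
// Setting followRedirect : true should instead let the underlying request
// module follow the redirection transparently, without a "crawlRedirect" event.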



/**
 * Analyze an HTML page. Mainly, find the a.href, link, script & img tags in the page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags
 */
Crawler.prototype.analyzeHTML = function(result, $) {

    this.crawlHrefs(result, $);

    if (this.config.links) {
        this.crawlLinks(result, $);
    }

    if (this.config.scripts) {
        this.crawlScripts(result, $);
    }

    if (this.config.images) {
        this.crawlImages(result, $);
    }

};
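
// Illustrative sketch (not part of the module) : the tag crawlers below surface
// their findings through events that a caller can observe, e.g. :
//
//   crawler.on("crawlLink", function(parentUri, linkUri, anchor, isDoFollow) {
//       // anchor & isDoFollow are only provided for <a> tags; <link> & <script> tags pass the uris only
//       console.log("link : " + parentUri + " -> " + linkUri);
//   });
//
//   crawler.on("crawlImage", function(parentUri, linkUri, alt) {
//       console.log("image : " + linkUri + " (alt : " + alt + ")");
//   });
//
//   crawler.on("uncrawl", function(parentUri, linkUri) {
//       console.log("rejected : " + linkUri);
//   });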


/**
 * Crawl the urls found in the a.href attributes of the page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags
 *
 */
Crawler.prototype.crawlHrefs = function(result, $) {
    var parentUri = result.uri;
    var self = this;

    $('a').each(function(index, a) {

        var link = $(a).attr('href');

        if (link) {

            var anchor = $(a).text() ? $(a).text() : "";
            var noFollow = $(a).attr("rel");
            var isDoFollow = ! (noFollow && noFollow === "nofollow");

            var linkUri = URI.linkToURI(parentUri, link);

            var currentDepth = self.updateDepth(parentUri, linkUri);

            timers.setImmediate(emitCrawlHrefEvent, self, "crawlLink", parentUri, linkUri, anchor, isDoFollow);

            if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri, anchor, isDoFollow)) {
                self.httpRequester.queue(self.buildNewOptions(result, linkUri));
            }
            else {
                timers.setImmediate(emitCrawlHrefEvent, self, "uncrawl", parentUri, linkUri, anchor, isDoFollow);
            }
        }

    });

};

/**
 * Crawl the link tags found in the HTML page
 * eg. : <link rel="stylesheet" href="/css/bootstrap.min.css">
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags
 */
Crawler.prototype.crawlLinks = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('link').each(function(index, linkTag) {

        var link = $(linkTag).attr('href');

        if (link) {

            var rel = $(linkTag).attr('rel');

            if (self.config.linkTypes.indexOf(rel) > -1) {
                var linkUri = URI.linkToURI(parentUri, link);
                var currentDepth = self.updateDepth(parentUri, linkUri);

                timers.setImmediate(emitCrawlLinkEvent, self, parentUri, linkUri);

                if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {
                    self.httpRequester.queue(self.buildNewOptions(result, linkUri));
                }
                else {
                    timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
                }
            }

        }

    });

};

/**
 * Crawl the script tags found in the HTML page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags
 */
Crawler.prototype.crawlScripts = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('script').each(function(index, script) {

        var link = $(script).attr('src');
        if (link) {
            var linkUri = URI.linkToURI(parentUri, link);
            var currentDepth = self.updateDepth(parentUri, linkUri);

            timers.setImmediate(emitCrawlLinkEvent, self, parentUri, linkUri);

            if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {
                self.httpRequester.queue(self.buildNewOptions(result, linkUri));
            }
            else {
                timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
            }
        }

    });

};

/**
 * Crawl the image tags found in the HTML page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags
 */
Crawler.prototype.crawlImages = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('img').each(function(index, img) {

        var link = $(img).attr('src');
        var alt = $(img).attr('alt');
        if (link) {
            var linkUri = URI.linkToURI(parentUri, link);

            var currentDepth = self.updateDepth(parentUri, linkUri);

            timers.setImmediate(emitCrawlImage, self, parentUri, linkUri, alt);

            if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {
                self.httpRequester.queue(self.buildNewOptions(result, linkUri));
            }
            else {
                timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
            }
        }

    });

};

/**
 * Check if a link has to be crawled
 *
 * @param result : the result (and options) of the parent resource crawl
 * @param currentDepth : the crawl depth of the link
 * @param parentUri : the uri of the page that contains the link
 * @param link : the link url
 * @param anchor : the anchor text of the link
 * @param isDoFollow : true if the link is dofollow
 * @returns true if the link has to be crawled
 */
Crawler.prototype.isAGoodLinkToCrawl = function(result, currentDepth, parentUri, link, anchor, isDoFollow) {

    // 1. Check the depthLimit
    if (result.depthLimit > -1 && currentDepth > result.depthLimit) {
        return false;
    }

    // 2. Check if we need to crawl external links
    if (URI.isExternalLink(parentUri, link) && ! result.externalLinks) {
        return false;
    }

    // 3. Check if we need to crawl other hosts
    if (! this.startFromHosts.has(URI.host(parentUri)) && ! result.externalHosts) {
        return false;
    }

    // 4. Check if we need to crawl other domains
    if (! this.startFromDomains.has(URI.domain(parentUri)) && ! result.externalDomains) {
        return false;
    }

    // 5. Check if the link is based on a good protocol
    if (result.protocols.indexOf(URI.protocol(link)) < 0) {
        return false;
    }

    // 6. Check if the domain is in the black-list
    if (result.domainBlackList.indexOf(URI.domainName(link)) > -1) {
        return false;
    }

    // 7. Check if there is a rule in the crawler configuration
    if (! result.canCrawl) {
        return true;
    }

    var check = result.canCrawl(parentUri, link, anchor, isDoFollow);
    //console.log(parentUri + " - " + link + " : " + check);
    return check;
};
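
// Illustrative sketch (not part of the module) : the canCrawl hook checked in
// step 7 above can be supplied in the crawler config to add a custom rule.
// The predicate below (stay inside one path prefix, follow only dofollow links)
// is an assumption for the sake of the example.
//
//   var crawler = new Crawler({
//       canCrawl : function(parentUri, link, anchor, isDoFollow) {
//           return isDoFollow && link.indexOf("http://www.example.com/blog/") === 0;
//       }
//   });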

/**
 * Compute the crawl depth of a link as a function of the crawl depth
 * of the page that contains the link
 *
 * @param parentUri : the URI of the page that contains the link
 * @param linkUri : the link for which the crawl depth has to be calculated
 * @returns the crawl depth of the link
 *
 */
var updateDepth = function(parentUri, linkUri) {

    if (this.depthUrls.has(parentUri)) {

        var parentDepth = this.depthUrls.get(parentUri);
        if (this.depthUrls.has(linkUri)) {
            return this.depthUrls.get(linkUri);
        }
        else {
            var depth = parentDepth + 1;
            this.depthUrls.set(linkUri, depth);
            return depth;
        }
    }
    else {
        this.depthUrls.set(parentUri, 0);
        this.depthUrls.set(linkUri, 1);
        return 1;
    }

};
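
// Illustrative sketch (not part of the module) : as noted in the constructor,
// config.updateDepth can replace this in-memory implementation, e.g. to back
// the depth map with an external store for big crawls. The native Map used
// below is only a placeholder; a real override would typically talk to Redis
// or a similar store.
//
//   var depths = new Map();
//   var crawler = new Crawler({
//       updateDepth : function(parentUri, linkUri) {
//           var parentDepth = depths.has(parentUri) ? depths.get(parentUri) : 0;
//           depths.set(parentUri, parentDepth);
//           var depth = depths.has(linkUri) ? depths.get(linkUri) : parentDepth + 1;
//           depths.set(linkUri, depth);
//           return depth;
//       }
//   });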

function emitCrawlEvent(crawler, result, $) {

    crawler.emit("crawl", result, $);
}

function emitErrorEvent(crawler, error, result) {
    crawler.emit("error", error, result);
}

function emitRedirectEvent(crawler, from, to, statusCode) {
    crawler.emit("crawlRedirect", from, to, statusCode);
}


function emitCrawlHrefEvent(crawler, eventName, parentUri, linkUri, anchor, isDoFollow) {
    crawler.emit(eventName, parentUri, linkUri, anchor, isDoFollow);
}

function emitCrawlLinkEvent(crawler, parentUri, linkUri) {
    crawler.emit("crawlLink", parentUri, linkUri);
}

function emitUnCrawlEvent(crawler, parentUri, linkUri) {
    crawler.emit("uncrawl", parentUri, linkUri);
}

function emitCrawlImage(crawler, parentUri, linkUri, alt) {

    crawler.emit("crawlImage", parentUri, linkUri, alt);
}

module.exports.Crawler = Crawler;