var events = require('events');
var timers = require('timers');
var util = require("util");
var _ = require("underscore");
var requester = require("./lib/queue-requester");
var URI = require('./lib/uri.js');
var Map = require("collections/fast-map");
var Set = require("collections/fast-set");
var html = require("./lib/html.js");
var domainBlackList = require("./default-lists/domain-black-list.js").list();

var DEFAULT_NUMBER_OF_CONNECTIONS = 10;
var DEFAULT_DEPTH_LIMIT = -1; // no limit
var DEFAULT_TIME_OUT = 20000;
var DEFAULT_RETRIES = 3;
var DEFAULT_RETRY_TIMEOUT = 10000;
var DEFAULT_SKIP_DUPLICATES = true;
var DEFAULT_RATE_LIMITS = 0;
var DEFAULT_MAX_ERRORS = 5;
var DEFAULT_ERROR_RATES = [200, 350, 500];

var DEFAULT_CRAWL_EXTERNAL_LINKS = false;
var DEFAULT_CRAWL_EXTERNAL_DOMAINS = false;
var DEFAULT_CRAWL_SCRIPTS = true; // Crawl <script>
var DEFAULT_CRAWL_LINKS = true;   // Crawl <link>
var DEFAULT_CRAWL_IMAGES = true;

var DEFAULT_PROTOCOLS_TO_CRAWL = ["http", "https"];
var DEFAULT_FOLLOW_301 = false;

var DEFAULT_LINKS_TYPES = ["canonical", "stylesheet"];
var DEFAULT_USER_AGENT = "NinjaBot";
var DEFAULT_CACHE = false;
var DEFAULT_METHOD = 'GET';
var DEFAULT_REFERER = false;

/**
 * The crawler object
 *
 * @param config used to customize the crawler.
 *
 * The current config attributes are :
 * - maxConnections : the number of connections used to crawl - default is 10
 * - externalLinks : if true, crawl external links
 * - externalDomains : if true, crawl the complete external domains. This option can lead to crawling a lot of different domains
 * - scripts : if true, crawl script tags
 * - links : if true, crawl link tags
 * - linkTypes : the types of link tags to crawl (matched against the rel attribute), default : ["canonical", "stylesheet"]
 * - images : if true, crawl images
 * - protocols : list of the protocols to crawl, default = ["http", "https"]
 * - timeout : timeout per request in milliseconds (default 20000)
 * - retries : number of retries if the request fails (default 3)
 * - retryTimeout : number of milliseconds to wait before retrying (default 10000)
 * - maxErrors : number of timeout errors before changing the crawl rate, default is 5
 * - errorRates : list of rates to use when too many timeout errors occur
 * - skipDuplicates : if true, skip URIs that were already crawled - default is true
 * - rateLimits : number of milliseconds to delay between each request (default 0).
 *                Note that this option will force the crawler to use only one connection
 * - depthLimit : the depth limit for the crawl
 * - followRedirect : if true, the crawl will not return the 301, it will directly follow the redirection
 * - proxyList : the list of proxies (see the project simple-proxies on npm)
 *
 * + all options provided by nodejs request : https://github.com/request/request
 *
 * A minimal usage sketch follows the constructor below.
 */
function Crawler(config) {

    // Store the depth for each crawled url
    // Override the config.updateDepth function in order to use another storage
    // This default implementation is not recommended for big crawls
    // TODO : use an external store
    this.depthUrls = new Map();

    // List of the hosts from which the crawl starts
    this.startFromHosts = new Set();

    // Default config
    this.config = this.createDefaultConfig();

    // Merge default config values & overridden values provided by the arg config
    if (config) {
        _.extend(this.config, config);
    }

    // If using rateLimits, we want to use only one connection with a delay between requests
    if (this.config.rateLimits !== 0) {
        this.config.maxConnections = 1;
    }

    // Assign the default updateDepth method used to calculate the crawl depth
    this.updateDepth = updateDepth;

    // If the config object contains a new implementation of the updateDepth method
    if (this.config.updateDepth) {
        this.updateDepth = this.config.updateDepth;
    }

    this.httpRequester = new requester.Requester(this.config);

    events.EventEmitter.call(this);

}

util.inherits(Crawler, events.EventEmitter);
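
// Minimal usage sketch (the url is a placeholder; "crawl" and "end" are the events
// emitted by the helpers at the bottom of this file):
//
//   var crawler = new Crawler({ maxConnections : 5, images : false });
//
//   crawler.on("crawl", function(result, $) {
//       console.log(result.statusCode + " - " + result.uri);
//   });
//
//   crawler.on("end", function() {
//       console.log("crawl done");
//   });
//
//   crawler.queue("http://example.com");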

/**
 * Add one or more urls to crawl
 *
 * @param options : the url to crawl (String), an array of urls, or an option object
 *        (a short sketch of the accepted forms follows this function)
 *
 */
Crawler.prototype.queue = function(options) {

    var self = this;

    // Error if no options
    if (! options) {
        if (self.config.onCrawl) {
            self.config.onCrawl({errorCode : "NO_OPTIONS"}, {method : "GET", url : "unknown", proxy : "", error : true});
        }

        if (this.httpRequester.idle()) {
            self.config.onDrain();
        }
        return;
    }

    // If Array => recall this method for each element
    if (_.isArray(options)) {
        options.forEach(function(opt) {
            self.queue(opt);
        });

        return;
    }

    // If String, we expect to receive an url
    if (_.isString(options)) {
        this.startFromHosts.add(URI.host(options));
        this.httpRequester.queue(this.addDefaultOptions({uri : options, url : options}, this.config));
    }
    // Last possibility, this is a json object
    else {

        if (! _.has(options, "url") && ! _.has(options, "uri")) {
            if (self.config.onCrawl) {
                self.config.onCrawl({errorCode : "NO_URL_OPTION"}, {method : "GET", url : "unknown", proxy : "", error : true});
            }

            if (this.httpRequester.idle()) {
                self.config.onDrain();
            }
        }
        else {
            this.startFromHosts.add(URI.host(_.has(options, "url") ? options.url : options.uri));
            this.httpRequester.queue(this.addDefaultOptions(options, this.config));
        }
    }

}
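
// Sketch of the forms accepted by queue() - the urls are placeholders:
//
//   crawler.queue("http://example.com");                             // a single url
//   crawler.queue(["http://example.com/a", "http://example.com/b"]); // an array of urls
//   crawler.queue({ url : "http://example.com", timeout : 5000 });   // an option object (url or uri is required)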

Crawler.prototype.addDefaultOptions = function(options, defaultOptions) {

    _.defaults(options, defaultOptions);
    options.maxRetries = options.retries;
    return options;

}

Crawler.prototype.buildNewOptions = function(options, newUrl) {

    var o = this.createDefaultConfig(newUrl);

    // Copy only the attributes that were in the options used for the previous request
    // Could this be simpler? ;-)
    o = _.extend(o, _.pick(options, _.without(_.keys(o), "url", "uri")));

    // Reset the setting used for retries when an error like a timeout occurs
    o.maxRetries = o.retries;

    if (options.canCrawl) {
        o.canCrawl = options.canCrawl;
    }
    return o;

}

/**
 * Default crawler config
 *
 * @returns the config object
 */
Crawler.prototype.createDefaultConfig = function(url) {
    var self = this;
    var config = {

        cache : DEFAULT_CACHE,
        method : DEFAULT_METHOD,
        referer : DEFAULT_REFERER,
        maxConnections : DEFAULT_NUMBER_OF_CONNECTIONS,
        timeout : DEFAULT_TIME_OUT,
        retries : DEFAULT_RETRIES,
        maxRetries : DEFAULT_RETRIES,
        retryTimeout : DEFAULT_RETRY_TIMEOUT,
        maxErrors : DEFAULT_MAX_ERRORS,
        errorRates : DEFAULT_ERROR_RATES,
        skipDuplicates : DEFAULT_SKIP_DUPLICATES,
        rateLimits : DEFAULT_RATE_LIMITS,
        externalLinks : DEFAULT_CRAWL_EXTERNAL_LINKS,
        externalDomains : DEFAULT_CRAWL_EXTERNAL_DOMAINS,
        protocols : DEFAULT_PROTOCOLS_TO_CRAWL,
        depthLimit : DEFAULT_DEPTH_LIMIT,
        followRedirect : DEFAULT_FOLLOW_301,
        images : DEFAULT_CRAWL_IMAGES,
        links : DEFAULT_CRAWL_LINKS,
        linkTypes : DEFAULT_LINKS_TYPES,
        scripts : DEFAULT_CRAWL_SCRIPTS,
        userAgent : DEFAULT_USER_AGENT,
        domainBlackList : domainBlackList,

        onCrawl : function(error, result) {
            self.crawl(error, result);
        },

        onDrain : function() {
            timers.setImmediate(function() {
                self.emit('end');
            });
        }

    };

    if (url) {
        config.url = url;
        config.uri = url;
    }

    return config;

}

/**
 * Default callback function used when the http queue requester gets a resource (html, pdf, css, ...)
 *
 * @param error The usual nodejs error
 * @param result : the result of the resource crawl
 */
Crawler.prototype.crawl = function (error, result) {

    var self = this;
    if (error) {
        //console.log(error);
        timers.setImmediate(emitErrorEvent, self, error, result);
        return;
    }

    var $ = html.isHTML(result.body) ? html.$(result.body) : null;

    timers.setImmediate(emitCrawlEvent, self, result, $);

    // If $ is defined, this is an HTML page with an http status 200
    if ($) {
        this.analyzeHTML(result, $);
    }

    // If 30* & followRedirect = false => chain the 30*
    if (result.statusCode >= 300 && result.statusCode <= 399 && ! this.config.followRedirect) {

        var from = result.uri;
        var to = URI.linkToURI(from, result.headers["location"]);
        timers.setImmediate(emitRedirectEvent, self, from, to, result.statusCode);

        this.httpRequester.queue(this.buildNewOptions(result, to));

    }
}


/**
 * Analyze an HTML page. Mainly, find the a.href & link tags in the page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object for accessing the HTML tags. Null if the resource
 *        is not HTML
 */
Crawler.prototype.analyzeHTML = function(result, $) {

    this.crawlHrefs(result, $);

    if (this.config.links) {
        this.crawlLinks(result, $);
    }

    if (this.config.scripts) {
        this.crawlScripts(result, $);
    }

    if (this.config.images) {
        this.crawlImages(result, $);
    }

}


/**
 * Crawl the urls that match the a.href tags found in one page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object for accessing the HTML tags.
 *
 */
Crawler.prototype.crawlHrefs = function(result, $) {
    var parentUri = result.uri;
    var self = this;

    $('a').each(function(index, a) {

        var link = $(a).attr('href');

        if (link) {

            var anchor = $(a).text() ? $(a).text() : "";
            var noFollow = $(a).attr("rel");
            var isDoFollow = ! (noFollow && noFollow === "nofollow");

            var linkUri = URI.linkToURI(parentUri, link);

            var currentDepth = self.updateDepth(parentUri, linkUri);

            timers.setImmediate(emitCrawlHrefEvent, self, "crawlLink", parentUri, linkUri, anchor, isDoFollow);

            if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri, anchor, isDoFollow)) {
                self.httpRequester.queue(self.buildNewOptions(result, linkUri));
            }
            else {
                timers.setImmediate(emitCrawlHrefEvent, self, "uncrawl", parentUri, linkUri, anchor, isDoFollow);
            }
        }

    });

}

/**
 * Crawl the link tags found in the HTML page
 * eg. : <link rel="stylesheet" href="/css/bootstrap.min.css">
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object for accessing the HTML tags.
 */
Crawler.prototype.crawlLinks = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('link').each(function(index, linkTag) {

        var link = $(linkTag).attr('href');

        if (link) {

            var rel = $(linkTag).attr('rel');

            if (self.config.linkTypes.indexOf(rel) >= 0) {
                var linkUri = URI.linkToURI(parentUri, link);
                var currentDepth = self.updateDepth(parentUri, linkUri);

                timers.setImmediate(emitCrawlLinkEvent, self, parentUri, linkUri);

                if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {

                    self.httpRequester.queue(self.buildNewOptions(result, linkUri));

                }
                else {
                    timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
                }
            }

        }

    });

}

/**
 * Crawl the script tags found in the HTML page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object for accessing the HTML tags.
 */
Crawler.prototype.crawlScripts = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('script').each(function(index, script) {

        var link = $(script).attr('src');
        if (link) {
            var linkUri = URI.linkToURI(parentUri, link);
            var currentDepth = self.updateDepth(parentUri, linkUri);

            timers.setImmediate(emitCrawlLinkEvent, self, parentUri, linkUri);

            if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {

                self.httpRequester.queue(self.buildNewOptions(result, linkUri));

            }
            else {
                timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
            }
        }

    });

}

/**
 * Crawl the image tags found in the HTML page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object for accessing the HTML tags.
 */
Crawler.prototype.crawlImages = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('img').each(function(index, img) {

        var link = $(img).attr('src');
        var alt = $(img).attr('alt');
        if (link) {
            var linkUri = URI.linkToURI(parentUri, link);

            var currentDepth = self.updateDepth(parentUri, linkUri);

            timers.setImmediate(emitCrawlImage, self, parentUri, linkUri, alt);

            if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {

                self.httpRequester.queue(self.buildNewOptions(result, linkUri));

            }
            else {
                timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
            }
        }

    });

}

/**
 * Check if a link has to be crawled
 *
 * @param result : the result of the previously crawled resource (it carries the crawl options)
 * @param currentDepth : the crawl depth of the link
 * @param parentUri : the URI of the page that contains the link
 * @param link : the link url
 * @param anchor : the anchor text of the link
 * @param isDoFollow : true if the link is dofollow
 * @returns true if the link has to be crawled
 */
Crawler.prototype.isAGoodLinkToCrawl = function(result, currentDepth, parentUri, link, anchor, isDoFollow) {

    // 1. Check the depthLimit
    if (result.depthLimit > -1 && currentDepth > result.depthLimit) {
        return false;
    }

    // 2. Check if we need to crawl external links
    if (URI.isExternalLink(parentUri, link) && ! result.externalLinks) {
        return false;
    }

    // 3. Check if we need to crawl external domains
    if (! this.startFromHosts.has(URI.host(parentUri)) && ! result.externalDomains) {
        return false;
    }

    // 4. Check if the link is based on a good protocol
    if (result.protocols.indexOf(URI.protocol(link)) < 0) {
        return false;
    }

    // 5. Check if the domain is in the black-list
    if (result.domainBlackList.indexOf(URI.domainName(link)) > -1) {
        return false;
    }

    // 6. Check if there is a rule in the crawler configuration (a canCrawl sketch follows this function)
    if (! result.canCrawl) {
        return true;
    }

    var check = result.canCrawl(parentUri, link, anchor, isDoFollow);
    //console.log(parentUri + " - " + link + " : " + check);
    return check;
}
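
// Sketch of a custom canCrawl rule passed via the config (the "/blog/" filter is a placeholder):
//
//   var crawler = new Crawler({
//       canCrawl : function(parentUri, link, anchor, isDoFollow) {
//           // Only follow dofollow links that stay under /blog/
//           return isDoFollow && link.indexOf("/blog/") > -1;
//       }
//   });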

/**
 * Compute the crawl depth for a link as a function of the crawl depth
 * of the page that contains the link
 * (a short example of the depth propagation follows this function)
 *
 * @param parentUri : the URI of the page that contains the link
 * @param linkUri : the link for which the crawl depth has to be calculated
 * @returns the crawl depth of the link
 *
 */
var updateDepth = function(parentUri, linkUri) {

    if (this.depthUrls.has(parentUri)) {

        var parentDepth = this.depthUrls.get(parentUri);
        if (this.depthUrls.has(linkUri)) {
            return this.depthUrls.get(linkUri);
        }
        else {
            var depth = parentDepth + 1;
            this.depthUrls.set(linkUri, depth);
            return depth;
        }
    }
    else {
        this.depthUrls.set(parentUri, 0);
        this.depthUrls.set(linkUri, 1);
        return 1;
    }

}
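
// Example of how depths propagate on a fresh crawl (urls are placeholders):
//
//   crawler.updateDepth("http://example.com/", "http://example.com/a");  // parent unknown : parent set to 0, link set to 1 -> returns 1
//   crawler.updateDepth("http://example.com/a", "http://example.com/b"); // parent at depth 1 : link set to 2 -> returns 2
//   crawler.updateDepth("http://example.com/", "http://example.com/b");  // link already known : returns its stored depth (2)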

function emitCrawlEvent(crawler, result, $) {
    crawler.emit("crawl", result, $);
}

function emitErrorEvent(crawler, error, result) {
    crawler.emit("error", error, result);
}

function emitRedirectEvent(crawler, from, to, statusCode) {
    crawler.emit("crawlRedirect", from, to, statusCode);
}

function emitCrawlHrefEvent(crawler, eventName, parentUri, linkUri, anchor, isDoFollow) {
    crawler.emit(eventName, parentUri, linkUri, anchor, isDoFollow);
}

function emitCrawlLinkEvent(crawler, parentUri, linkUri) {
    crawler.emit("crawlLink", parentUri, linkUri);
}

function emitUnCrawlEvent(crawler, parentUri, linkUri) {
    crawler.emit("uncrawl", parentUri, linkUri);
}

function emitCrawlImage(crawler, parentUri, linkUri, alt) {
    crawler.emit("crawlImage", parentUri, linkUri, alt);
}

module.exports.Crawler = Crawler;