var events = require('events');
var timers = require('timers');
var util = require("util");
var _ = require("underscore");
var requester = require("./lib/queue-requester");
var URI = require('./lib/uri.js');
var Map = require("collections/fast-map");
var Set = require("collections/fast-set");
var html = require("./lib/html.js");
var domainBlackList = require("./default-lists/domain-black-list.js").list();

var DEFAULT_NUMBER_OF_CONNECTIONS = 10;
var DEFAULT_DEPTH_LIMIT = -1; // no limit
var DEFAULT_TIME_OUT = 20000;
var DEFAULT_RETRIES = 3;
var DEFAULT_RETRY_TIMEOUT = 10000;
var DEFAULT_SKIP_DUPLICATES = true;
var DEFAULT_RATE_LIMITS = 0;
var DEFAULT_MAX_ERRORS = 5;
var DEFAULT_ERROR_RATES = [200, 350, 500];

var DEFAULT_CRAWL_EXTERNAL_LINKS = false;
var DEFAULT_CRAWL_EXTERNAL_DOMAINS = false;
var DEFAULT_CRAWL_EXTERNAL_HOSTS = false;
var DEFAULT_CRAWL_SCRIPTS = true; // Crawl <script>
var DEFAULT_CRAWL_LINKS = true;   // Crawl <link>
var DEFAULT_CRAWL_IMAGES = true;

var DEFAULT_PROTOCOLS_TO_CRAWL = ["http", "https"];
var DEFAULT_FOLLOW_301 = false;

var DEFAULT_LINKS_TYPES = ["canonical", "stylesheet"];
var DEFAULT_USER_AGENT = "NinjaBot";
var DEFAULT_CACHE = false;
var DEFAULT_METHOD = 'GET';
var DEFAULT_REFERER = false;

/**
 * The crawler object
 *
 * @param config used to customize the crawler.
 *
 * The supported config attributes are :
 * - maxConnections  : the number of connections used to crawl - default is 10
 * - externalLinks   : if true, crawl external links
 * - externalDomains : if true, crawl the complete external domains. This option can lead to crawling a lot of different domains
 * - scripts         : if true, crawl script tags
 * - links           : if true, crawl link tags
 * - linkTypes       : the types of link tags to crawl (matched against the rel attribute), default : ["canonical", "stylesheet"]
 * - images          : if true, crawl images
 * - protocols       : list of the protocols to crawl, default = ["http", "https"]
 * - timeout         : timeout per request in milliseconds (default 20000)
 * - retries         : number of retries if the request fails (default 3)
 * - retryTimeout    : number of milliseconds to wait before retrying (default 10000)
 * - maxErrors       : number of timeout errors before changing the crawl rate, default is 5
 * - errorRates      : list of rates to use when too many timeout errors occur
 * - skipDuplicates  : if true, skip URIs that were already crawled - default is true
 * - rateLimits      : number of milliseconds to delay between each request (default 0).
 *                     Note that this option forces the crawler to use only one connection
 * - depthLimit      : the depth limit for the crawl
 * - followRedirect  : if true, the crawl will not return the 301; it will directly follow the redirection
 * - proxyList       : the list of proxies (see the project simple-proxies on npm)
 *
 * + all options provided by the nodejs request module : https://github.com/request/request
 */
function Crawler(config) {

    // Store the depth for each crawled url
    // Override config.updateDepth in order to use another storage
    // This default implementation is not recommended for big crawls
    // TODO : use an external store
    this.depthUrls = new Map();

    // List of the hosts from which the crawl starts
    this.startFromHosts = new Set();

    // List of the domains from which the crawl starts
    this.startFromDomains = new Set();

    // Default config
    this.config = this.createDefaultConfig();

    // Merge default config values & overridden values provided by the arg config
    if (config) {
        _.extend(this.config, config);
    }

    // If rateLimits is used, we want to use only one connection with a delay between requests
    if (this.config.rateLimits !== 0) {
        this.config.maxConnections = 1;
    }

    // Assign the default updateDepth method used to calculate the crawl depth
    this.updateDepth = updateDepth;

    // If the config object contains a new implementation of the updateDepth method
    if (this.config.updateDepth) {
        this.updateDepth = this.config.updateDepth;
    }

    this.httpRequester = new requester.Requester(this.config);

    events.EventEmitter.call(this);
}

util.inherits(Crawler, events.EventEmitter);
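
// Illustrative usage (not part of the module) : a minimal sketch of how this
// crawler is typically instantiated and driven, assuming this file is the
// package entry point (the require path is an assumption). The event names
// ("crawl", "error", "end") match those emitted further down in this file.
//
//   var Crawler = require("./index.js").Crawler;
//   var crawler = new Crawler({ maxConnections : 5, images : false, depthLimit : 2 });
//
//   crawler.on("crawl", function(result, $) {
//       console.log("Crawled : " + result.url);
//   });
//
//   crawler.on("error", function(error, result) {
//       console.log("Error on " + result.url + " : " + error.errorCode);
//   });
//
//   crawler.on("end", function() {
//       console.log("Crawl finished");
//   });
//
//   crawler.queue("http://www.example.com");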


/**
 * Add one or more urls to crawl
 *
 * @param options : a url (string), an array of urls/options, or an options object containing at least a url or uri attribute
 *
 */
Crawler.prototype.queue = function(options) {

    var self = this;

    // Error if no options
    if (! options) {
        if (self.config.onCrawl) {
            self.config.onCrawl({errorCode : "NO_OPTIONS"}, {method : "GET", url : "unknown", proxy : "", error : true});
        }

        if (this.httpRequester.idle()) {
            self.config.onDrain();
        }
        return;
    }

    // If options is an array => call this method recursively for each element
    if (_.isArray(options)) {
        options.forEach(function(opt) {
            self.queue(opt);
        });

        return;
    }

    // If options is a string, we expect it to be a url
    if (_.isString(options)) {
        this.startFromHosts.add(URI.host(options));
        this.startFromDomains.add(URI.domain(options));
        this.httpRequester.queue(this.addDefaultOptions({uri : options, url : options}, this.config));
    }
    // Otherwise, this should be a json object containing the url/uri and request options
    else {

        if (! _.has(options, "url") && ! _.has(options, "uri")) {
            if (self.config.onCrawl) {
                self.config.onCrawl({errorCode : "NO_URL_OPTION"}, {method : "GET", url : "unknown", proxy : "", error : true});
            }

            if (this.httpRequester.idle()) {
                self.config.onDrain();
            }
        }
        else {
            this.startFromHosts.add(URI.host(_.has(options, "url") ? options.url : options.uri));
            this.startFromDomains.add(URI.domain(_.has(options, "url") ? options.url : options.uri));
            this.httpRequester.queue(this.addDefaultOptions(options, this.config));
        }
    }

};
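
// Illustrative sketch (not part of the module) : the three input forms queue()
// accepts, based on the type checks above. The urls are placeholders.
//
//   crawler.queue("http://www.example.com");                                 // a single url string
//   crawler.queue(["http://a.example.com", "http://b.example.com"]);        // an array of urls
//   crawler.queue({ url : "http://www.example.com", timeout : 5000 });      // an options object with at least url or uri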

Crawler.prototype.addDefaultOptions = function(options, defaultOptions) {

    _.defaults(options, defaultOptions);
    options.maxRetries = options.retries;
    return options;

};

Crawler.prototype.buildNewOptions = function(options, newUrl) {

    var o = this.createDefaultConfig(newUrl);

    // Copy only the option attributes that were present in the options used for the previous request
    // Could probably be simpler ;-)
    o = _.extend(o, _.pick(options, _.without(_.keys(o), "url", "uri")));

    // Reset the setting used for retries when an error such as a timeout occurs
    o.maxRetries = o.retries;

    if (options.canCrawl) {
        o.canCrawl = options.canCrawl;
    }
    return o;

};


/**
 * Default crawler config
 *
 * @returns the config object
 */
Crawler.prototype.createDefaultConfig = function(url) {
    var self = this;
    var config = {

        cache           : DEFAULT_CACHE,
        method          : DEFAULT_METHOD,
        referer         : DEFAULT_REFERER,
        maxConnections  : DEFAULT_NUMBER_OF_CONNECTIONS,
        timeout         : DEFAULT_TIME_OUT,
        retries         : DEFAULT_RETRIES,
        maxRetries      : DEFAULT_RETRIES,
        retryTimeout    : DEFAULT_RETRY_TIMEOUT,
        maxErrors       : DEFAULT_MAX_ERRORS,
        errorRates      : DEFAULT_ERROR_RATES,
        skipDuplicates  : DEFAULT_SKIP_DUPLICATES,
        rateLimits      : DEFAULT_RATE_LIMITS,
        externalLinks   : DEFAULT_CRAWL_EXTERNAL_LINKS,
        externalDomains : DEFAULT_CRAWL_EXTERNAL_DOMAINS,
        externalHosts   : DEFAULT_CRAWL_EXTERNAL_HOSTS,
        protocols       : DEFAULT_PROTOCOLS_TO_CRAWL,
        depthLimit      : DEFAULT_DEPTH_LIMIT,
        followRedirect  : DEFAULT_FOLLOW_301,
        images          : DEFAULT_CRAWL_IMAGES,
        links           : DEFAULT_CRAWL_LINKS,
        linkTypes       : DEFAULT_LINKS_TYPES,
        scripts         : DEFAULT_CRAWL_SCRIPTS,
        userAgent       : DEFAULT_USER_AGENT,
        domainBlackList : domainBlackList,

        onCrawl : function(error, result) {
            self.crawl(error, result);
        },

        onDrain : function() {
            timers.setImmediate(function() {
                self.emit('end');
            });
        }

    };

    if (url) {
        config.url = url;
        config.uri = url;
    }

    return config;

};

/**
 * Default callback function used when the http queue requester gets a resource (html, pdf, css, ...)
 *
 * @param error : the usual nodejs error
 * @param result : the result of the resource crawl
 */
Crawler.prototype.crawl = function (error, result) {

    var self = this;
    if (error) {
        //console.log(error);
        timers.setImmediate(emitErrorEvent, self, error, result);
        return;
    }

    var $ = html.isHTML(result.body) ? html.$(result.body) : null;

    timers.setImmediate(emitCrawlEvent, self, result, $);

    // If $ is defined, this is an HTML page with an http status 200
    if ($) {
        this.analyzeHTML(result, $);
    }

    // If 30x & followRedirect = false => emit the redirect event & queue the redirect target
    if (result.statusCode >= 300 && result.statusCode <= 399 && ! this.config.followRedirect) {

        var from = result.uri;
        var to = URI.linkToURI(from, result.headers["location"]);
        timers.setImmediate(emitRedirectEvent, self, from, to, result.statusCode);

        this.httpRequester.queue(this.buildNewOptions(result, to));
    }
};
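
// Illustrative sketch (not part of the module) : with the default
// followRedirect : false, 30x responses are surfaced through the
// "crawlRedirect" event emitted above, and the redirect target is queued.
//
//   crawler.on("crawlRedirect", function(from, to, statusCode) {
//       console.log(statusCode + " redirect : " + from + " -> " + to);
//   });
//
// Setting followRedirect : true should instead let the underlying request
// module follow the redirection transparently, without a "crawlRedirect" event.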



/**
 * Analyze an HTML page. Mainly, find the a.href, link, script & img tags in the page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags
 */
Crawler.prototype.analyzeHTML = function(result, $) {

    this.crawlHrefs(result, $);

    if (this.config.links) {
        this.crawlLinks(result, $);
    }

    if (this.config.scripts) {
        this.crawlScripts(result, $);
    }

    if (this.config.images) {
        this.crawlImages(result, $);
    }

};
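
// Illustrative sketch (not part of the module) : the tag crawlers below surface
// their findings through events that a caller can observe, e.g. :
//
//   crawler.on("crawlLink", function(parentUri, linkUri, anchor, isDoFollow) {
//       // anchor & isDoFollow are only provided for <a> tags; <link> & <script> tags pass the uris only
//       console.log("link : " + parentUri + " -> " + linkUri);
//   });
//
//   crawler.on("crawlImage", function(parentUri, linkUri, alt) {
//       console.log("image : " + linkUri + " (alt : " + alt + ")");
//   });
//
//   crawler.on("uncrawl", function(parentUri, linkUri) {
//       console.log("rejected : " + linkUri);
//   });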


/**
 * Crawl the urls found in the a.href attributes of the page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags
 *
 */
Crawler.prototype.crawlHrefs = function(result, $) {
    var parentUri = result.uri;
    var self = this;

    $('a').each(function(index, a) {

        var link = $(a).attr('href');

        if (link) {

            var anchor = $(a).text() ? $(a).text() : "";
            var noFollow = $(a).attr("rel");
            var isDoFollow = ! (noFollow && noFollow === "nofollow");

            var linkUri = URI.linkToURI(parentUri, link);

            var currentDepth = self.updateDepth(parentUri, linkUri);

            timers.setImmediate(emitCrawlHrefEvent, self, "crawlLink", parentUri, linkUri, anchor, isDoFollow);

            if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri, anchor, isDoFollow)) {
                self.httpRequester.queue(self.buildNewOptions(result, linkUri));
            }
            else {
                timers.setImmediate(emitCrawlHrefEvent, self, "uncrawl", parentUri, linkUri, anchor, isDoFollow);
            }
        }

    });

};

/**
 * Crawl the link tags found in the HTML page
 * eg. : <link rel="stylesheet" href="/css/bootstrap.min.css">
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags
 */
Crawler.prototype.crawlLinks = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('link').each(function(index, linkTag) {

        var link = $(linkTag).attr('href');

        if (link) {

            var rel = $(linkTag).attr('rel');

            if (self.config.linkTypes.indexOf(rel) > -1) {
                var linkUri = URI.linkToURI(parentUri, link);
                var currentDepth = self.updateDepth(parentUri, linkUri);

                timers.setImmediate(emitCrawlLinkEvent, self, parentUri, linkUri);

                if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {
                    self.httpRequester.queue(self.buildNewOptions(result, linkUri));
                }
                else {
                    timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
                }
            }

        }

    });

};

/**
 * Crawl the script tags found in the HTML page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags
 */
Crawler.prototype.crawlScripts = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('script').each(function(index, script) {

        var link = $(script).attr('src');
        if (link) {
            var linkUri = URI.linkToURI(parentUri, link);
            var currentDepth = self.updateDepth(parentUri, linkUri);

            timers.setImmediate(emitCrawlLinkEvent, self, parentUri, linkUri);

            if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {
                self.httpRequester.queue(self.buildNewOptions(result, linkUri));
            }
            else {
                timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
            }
        }

    });

};

/**
 * Crawl the image tags found in the HTML page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object used to access the HTML tags
 */
Crawler.prototype.crawlImages = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('img').each(function(index, img) {

        var link = $(img).attr('src');
        var alt = $(img).attr('alt');
        if (link) {
            var linkUri = URI.linkToURI(parentUri, link);

            var currentDepth = self.updateDepth(parentUri, linkUri);

            timers.setImmediate(emitCrawlImage, self, parentUri, linkUri, alt);

            if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {
                self.httpRequester.queue(self.buildNewOptions(result, linkUri));
            }
            else {
                timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
            }
        }

    });

};

/**
 * Check if a link has to be crawled
 *
 * @param result : the result (and options) of the parent resource crawl
 * @param currentDepth : the crawl depth of the link
 * @param parentUri : the uri of the page that contains the link
 * @param link : the link url
 * @param anchor : the anchor text of the link
 * @param isDoFollow : true if the link is dofollow
 * @returns true if the link has to be crawled
 */
Crawler.prototype.isAGoodLinkToCrawl = function(result, currentDepth, parentUri, link, anchor, isDoFollow) {

    // 1. Check the depthLimit
    if (result.depthLimit > -1 && currentDepth > result.depthLimit) {
        return false;
    }

    // 2. Check if we need to crawl external links
    if (URI.isExternalLink(parentUri, link) && ! result.externalLinks) {
        return false;
    }

    // 3. Check if we need to crawl other hosts
    if (! this.startFromHosts.has(URI.host(parentUri)) && ! result.externalHosts) {
        return false;
    }

    // 4. Check if we need to crawl other domains
    if (! this.startFromDomains.has(URI.domain(parentUri)) && ! result.externalDomains) {
        return false;
    }

    // 5. Check if the link is based on a good protocol
    if (result.protocols.indexOf(URI.protocol(link)) < 0) {
        return false;
    }

    // 6. Check if the domain is in the black-list
    if (result.domainBlackList.indexOf(URI.domainName(link)) > -1) {
        return false;
    }

    // 7. Check if there is a rule in the crawler configuration
    if (! result.canCrawl) {
        return true;
    }

    var check = result.canCrawl(parentUri, link, anchor, isDoFollow);
    //console.log(parentUri + " - " + link + " : " + check);
    return check;
};
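
// Illustrative sketch (not part of the module) : the canCrawl hook checked in
// step 7 above can be supplied in the crawler config to add a custom rule.
// The predicate below (stay inside one path prefix, follow only dofollow links)
// is an assumption for the sake of the example.
//
//   var crawler = new Crawler({
//       canCrawl : function(parentUri, link, anchor, isDoFollow) {
//           return isDoFollow && link.indexOf("http://www.example.com/blog/") === 0;
//       }
//   });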

/**
 * Compute the crawl depth of a link as a function of the crawl depth
 * of the page that contains the link
 *
 * @param parentUri : the URI of the page that contains the link
 * @param linkUri : the link for which the crawl depth has to be calculated
 * @returns the crawl depth of the link
 *
 */
var updateDepth = function(parentUri, linkUri) {

    if (this.depthUrls.has(parentUri)) {

        var parentDepth = this.depthUrls.get(parentUri);
        if (this.depthUrls.has(linkUri)) {
            return this.depthUrls.get(linkUri);
        }
        else {
            var depth = parentDepth + 1;
            this.depthUrls.set(linkUri, depth);
            return depth;
        }
    }
    else {
        this.depthUrls.set(parentUri, 0);
        this.depthUrls.set(linkUri, 1);
        return 1;
    }

};
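
// Illustrative sketch (not part of the module) : as noted in the constructor,
// config.updateDepth can replace this in-memory implementation, e.g. to back
// the depth map with an external store for big crawls. The native Map used
// below is only a placeholder; a real override would typically talk to Redis
// or a similar store.
//
//   var depths = new Map();
//   var crawler = new Crawler({
//       updateDepth : function(parentUri, linkUri) {
//           var parentDepth = depths.has(parentUri) ? depths.get(parentUri) : 0;
//           depths.set(parentUri, parentDepth);
//           var depth = depths.has(linkUri) ? depths.get(linkUri) : parentDepth + 1;
//           depths.set(linkUri, depth);
//           return depth;
//       }
//   });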

function emitCrawlEvent(crawler, result, $) {

    crawler.emit("crawl", result, $);
}

function emitErrorEvent(crawler, error, result) {
    crawler.emit("error", error, result);
}

function emitRedirectEvent(crawler, from, to, statusCode) {
    crawler.emit("crawlRedirect", from, to, statusCode);
}


function emitCrawlHrefEvent(crawler, eventName, parentUri, linkUri, anchor, isDoFollow) {
    crawler.emit(eventName, parentUri, linkUri, anchor, isDoFollow);
}

function emitCrawlLinkEvent(crawler, parentUri, linkUri) {
    crawler.emit("crawlLink", parentUri, linkUri);
}

function emitUnCrawlEvent(crawler, parentUri, linkUri) {
    crawler.emit("uncrawl", parentUri, linkUri);
}

function emitCrawlImage(crawler, parentUri, linkUri, alt) {

    crawler.emit("crawlImage", parentUri, linkUri, alt);
}

module.exports.Crawler = Crawler;