var events = require('events');
var timers = require('timers');
var util = require("util");
var _ = require("underscore");
var requester = require("./lib/queue-requester");
var URI = require('./lib/uri.js');
var Map = require("collections/fast-map");
var Set = require("collections/fast-set");
var html = require("./lib/html.js");
var domainBlackList = require("./default-lists/domain-black-list.js").list();

var DEFAULT_NUMBER_OF_CONNECTIONS = 10;
var DEFAULT_DEPTH_LIMIT = -1; // no limit
var DEFAULT_TIME_OUT = 20000;
var DEFAULT_RETRIES = 3;
var DEFAULT_RETRY_TIMEOUT = 10000;
var DEFAULT_SKIP_DUPLICATES = true;
var DEFAULT_RATE_LIMITS = 0;
var DEFAULT_MAX_ERRORS = 5;
var DEFAULT_ERROR_RATES = [200, 350, 500];

var DEFAULT_CRAWL_EXTERNAL_LINKS = false;
var DEFAULT_CRAWL_EXTERNAL_DOMAINS = false;
var DEFAULT_CRAWL_SCRIPTS = true; // Crawl <script>
var DEFAULT_CRAWL_LINKS = true;   // Crawl <link>
var DEFAULT_CRAWL_IMAGES = true;

var DEFAULT_PROTOCOLS_TO_CRAWL = ["http", "https"];
var DEFAULT_FOLLOW_301 = false;

var DEFAULT_LINKS_TYPES = ["canonical", "stylesheet"];
var DEFAULT_USER_AGENT = "NinjaBot";
var DEFAULT_CACHE = false;
var DEFAULT_METHOD = 'GET';
var DEFAULT_REFERER = false;

/**
 * The crawler object
 *
 * @param config used to customize the crawler.
 *
 * The current config attributes are :
 * - maxConnections : the number of connections used to crawl - default is 10
 * - externalLinks : if true, crawl external links
 * - externalDomains : if true, crawl the complete external domains. This option can lead to crawling a lot of different domains
 * - scripts : if true, crawl script tags
 * - links : if true, crawl link tags
 * - linkTypes : the types of link tags to crawl (matched against the rel attribute), default : ["canonical", "stylesheet"]
 * - images : if true, crawl images
 * - protocols : list of the protocols to crawl, default = ["http", "https"]
 * - timeout : timeout per request in milliseconds (default 20000)
 * - retries : number of retries if the request fails (default 3)
 * - retryTimeout : number of milliseconds to wait before retrying (default 10000)
 * - maxErrors : number of timeout errors before changing the crawl rate, default is 5
 * - errorRates : list of rates to use when too many timeout errors occur
 * - skipDuplicates : if true, skip URIs that were already crawled - default is true
 * - rateLimits : number of milliseconds to delay between each request (default 0).
 *                Note that this option will force the crawler to use only one connection
 * - depthLimit : the depth limit for the crawl
 * - followRedirect : if true, the crawl will not return the 301, it will directly follow the redirection
 * - proxyList : the list of proxies (see the project simple-proxies on npm)
 *
 * + all options provided by nodejs request : https://github.com/request/request
 *
 * A minimal usage sketch follows the constructor below.
 */
function Crawler(config) {

    // Store the depth for each crawled url
    // Override the config.updateDepth function in order to use another storage
    // This default implementation is not recommended for big crawls
    // TODO : use an external store
    this.depthUrls = new Map();

    // List of the hosts from which the crawl starts
    this.startFromHosts = new Set();

    // Default config
    this.config = this.createDefaultConfig();

    // Merge default config values & overridden values provided by the arg config
    if (config) {
        _.extend(this.config, config);
    }

    // If using rateLimits, we want to use only one connection with a delay between requests
    if (this.config.rateLimits !== 0) {
        this.config.maxConnections = 1;
    }

    // Assign the default updateDepth method used to calculate the crawl depth
    this.updateDepth = updateDepth;

    // If the config object contains a new implementation of the updateDepth method
    if (this.config.updateDepth) {
        this.updateDepth = this.config.updateDepth;
    }

    this.httpRequester = new requester.Requester(this.config);

    events.EventEmitter.call(this);

}

util.inherits(Crawler, events.EventEmitter);
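
// Minimal usage sketch (the url is a placeholder; "crawl" and "end" are the events
// emitted by the helpers at the bottom of this file):
//
//   var crawler = new Crawler({ maxConnections : 5, images : false });
//
//   crawler.on("crawl", function(result, $) {
//       console.log(result.statusCode + " - " + result.uri);
//   });
//
//   crawler.on("end", function() {
//       console.log("crawl done");
//   });
//
//   crawler.queue("http://example.com");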

/**
 * Add one or more urls to crawl
 *
 * @param options : the url to crawl (String), an array of urls, or an option object
 *        (a short sketch of the accepted forms follows this function)
 *
 */
Crawler.prototype.queue = function(options) {

    var self = this;

    // Error if no options
    if (! options) {
        if (self.config.onCrawl) {
            self.config.onCrawl({errorCode : "NO_OPTIONS"}, {method : "GET", url : "unknown", proxy : "", error : true});
        }

        if (this.httpRequester.idle()) {
            self.config.onDrain();
        }
        return;
    }

    // If Array => recall this method for each element
    if (_.isArray(options)) {
        options.forEach(function(opt) {
            self.queue(opt);
        });

        return;
    }

    // If String, we expect to receive an url
    if (_.isString(options)) {
        this.startFromHosts.add(URI.host(options));
        this.httpRequester.queue(this.addDefaultOptions({uri : options, url : options}, this.config));
    }
    // Last possibility, this is a json object
    else {

        if (! _.has(options, "url") && ! _.has(options, "uri")) {
            if (self.config.onCrawl) {
                self.config.onCrawl({errorCode : "NO_URL_OPTION"}, {method : "GET", url : "unknown", proxy : "", error : true});
            }

            if (this.httpRequester.idle()) {
                self.config.onDrain();
            }
        }
        else {
            this.startFromHosts.add(URI.host(_.has(options, "url") ? options.url : options.uri));
            this.httpRequester.queue(this.addDefaultOptions(options, this.config));
        }
    }

}
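
// Sketch of the forms accepted by queue() - the urls are placeholders:
//
//   crawler.queue("http://example.com");                             // a single url
//   crawler.queue(["http://example.com/a", "http://example.com/b"]); // an array of urls
//   crawler.queue({ url : "http://example.com", timeout : 5000 });   // an option object (url or uri is required)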

Crawler.prototype.addDefaultOptions = function(options, defaultOptions) {

    _.defaults(options, defaultOptions);
    options.maxRetries = options.retries;
    return options;

}

Crawler.prototype.buildNewOptions = function(options, newUrl) {

    var o = this.createDefaultConfig(newUrl);

    // Copy only the attributes that were in the options used for the previous request
    // Could this be simpler? ;-)
    o = _.extend(o, _.pick(options, _.without(_.keys(o), "url", "uri")));

    // Reset the setting used for retries when an error like a timeout occurs
    o.maxRetries = o.retries;

    if (options.canCrawl) {
        o.canCrawl = options.canCrawl;
    }
    return o;

}

/**
 * Default crawler config
 *
 * @returns the config object
 */
Crawler.prototype.createDefaultConfig = function(url) {
    var self = this;
    var config = {

        cache : DEFAULT_CACHE,
        method : DEFAULT_METHOD,
        referer : DEFAULT_REFERER,
        maxConnections : DEFAULT_NUMBER_OF_CONNECTIONS,
        timeout : DEFAULT_TIME_OUT,
        retries : DEFAULT_RETRIES,
        maxRetries : DEFAULT_RETRIES,
        retryTimeout : DEFAULT_RETRY_TIMEOUT,
        maxErrors : DEFAULT_MAX_ERRORS,
        errorRates : DEFAULT_ERROR_RATES,
        skipDuplicates : DEFAULT_SKIP_DUPLICATES,
        rateLimits : DEFAULT_RATE_LIMITS,
        externalLinks : DEFAULT_CRAWL_EXTERNAL_LINKS,
        externalDomains : DEFAULT_CRAWL_EXTERNAL_DOMAINS,
        protocols : DEFAULT_PROTOCOLS_TO_CRAWL,
        depthLimit : DEFAULT_DEPTH_LIMIT,
        followRedirect : DEFAULT_FOLLOW_301,
        images : DEFAULT_CRAWL_IMAGES,
        links : DEFAULT_CRAWL_LINKS,
        linkTypes : DEFAULT_LINKS_TYPES,
        scripts : DEFAULT_CRAWL_SCRIPTS,
        userAgent : DEFAULT_USER_AGENT,
        domainBlackList : domainBlackList,

        onCrawl : function(error, result) {
            self.crawl(error, result);
        },

        onDrain : function() {
            timers.setImmediate(function() {
                self.emit('end');
            });
        }

    };

    if (url) {
        config.url = url;
        config.uri = url;
    }

    return config;

}

/**
 * Default callback function used when the http queue requester gets a resource (html, pdf, css, ...)
 *
 * @param error The usual nodejs error
 * @param result : the result of the resource crawl
 */
Crawler.prototype.crawl = function (error, result) {

    var self = this;
    if (error) {
        //console.log(error);
        timers.setImmediate(emitErrorEvent, self, error, result);
        return;
    }

    var $ = html.isHTML(result.body) ? html.$(result.body) : null;

    timers.setImmediate(emitCrawlEvent, self, result, $);

    // If $ is defined, this is an HTML page with an http status 200
    if ($) {
        this.analyzeHTML(result, $);
    }

    // If 30* & followRedirect = false => chain the 30*
    if (result.statusCode >= 300 && result.statusCode <= 399 && ! this.config.followRedirect) {

        var from = result.uri;
        var to = URI.linkToURI(from, result.headers["location"]);
        timers.setImmediate(emitRedirectEvent, self, from, to, result.statusCode);

        this.httpRequester.queue(this.buildNewOptions(result, to));

    }
}


/**
 * Analyze an HTML page. Mainly, find the a.href & link tags in the page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object for accessing the HTML tags. Null if the resource
 *        is not HTML
 */
Crawler.prototype.analyzeHTML = function(result, $) {

    this.crawlHrefs(result, $);

    if (this.config.links) {
        this.crawlLinks(result, $);
    }

    if (this.config.scripts) {
        this.crawlScripts(result, $);
    }

    if (this.config.images) {
        this.crawlImages(result, $);
    }

}


/**
 * Crawl the urls that match the a.href tags found in one page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object for accessing the HTML tags.
 *
 */
Crawler.prototype.crawlHrefs = function(result, $) {
    var parentUri = result.uri;
    var self = this;

    $('a').each(function(index, a) {

        var link = $(a).attr('href');

        if (link) {

            var anchor = $(a).text() ? $(a).text() : "";
            var noFollow = $(a).attr("rel");
            var isDoFollow = ! (noFollow && noFollow === "nofollow");

            var linkUri = URI.linkToURI(parentUri, link);

            var currentDepth = self.updateDepth(parentUri, linkUri);

            timers.setImmediate(emitCrawlHrefEvent, self, "crawlLink", parentUri, linkUri, anchor, isDoFollow);

            if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri, anchor, isDoFollow)) {
                self.httpRequester.queue(self.buildNewOptions(result, linkUri));
            }
            else {
                timers.setImmediate(emitCrawlHrefEvent, self, "uncrawl", parentUri, linkUri, anchor, isDoFollow);
            }
        }

    });

}

/**
 * Crawl the link tags found in the HTML page
 * eg. : <link rel="stylesheet" href="/css/bootstrap.min.css">
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object for accessing the HTML tags.
 */
Crawler.prototype.crawlLinks = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('link').each(function(index, linkTag) {

        var link = $(linkTag).attr('href');

        if (link) {

            var rel = $(linkTag).attr('rel');

            if (self.config.linkTypes.indexOf(rel) >= 0) {
                var linkUri = URI.linkToURI(parentUri, link);
                var currentDepth = self.updateDepth(parentUri, linkUri);

                timers.setImmediate(emitCrawlLinkEvent, self, parentUri, linkUri);

                if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {

                    self.httpRequester.queue(self.buildNewOptions(result, linkUri));

                }
                else {
                    timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
                }
            }

        }

    });

}

/**
 * Crawl the script tags found in the HTML page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object for accessing the HTML tags.
 */
Crawler.prototype.crawlScripts = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('script').each(function(index, script) {

        var link = $(script).attr('src');
        if (link) {
            var linkUri = URI.linkToURI(parentUri, link);
            var currentDepth = self.updateDepth(parentUri, linkUri);

            timers.setImmediate(emitCrawlLinkEvent, self, parentUri, linkUri);

            if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {

                self.httpRequester.queue(self.buildNewOptions(result, linkUri));

            }
            else {
                timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
            }
        }

    });

}

/**
 * Crawl the image tags found in the HTML page
 *
 * @param result : the result of the crawled resource
 * @param $ : the jquery-like object for accessing the HTML tags.
 */
Crawler.prototype.crawlImages = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('img').each(function(index, img) {

        var link = $(img).attr('src');
        var alt = $(img).attr('alt');
        if (link) {
            var linkUri = URI.linkToURI(parentUri, link);

            var currentDepth = self.updateDepth(parentUri, linkUri);

            timers.setImmediate(emitCrawlImage, self, parentUri, linkUri, alt);

            if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {

                self.httpRequester.queue(self.buildNewOptions(result, linkUri));

            }
            else {
                timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
            }
        }

    });

}

/**
 * Check if a link has to be crawled
 *
 * @param result : the result of the previously crawled resource (it carries the crawl options)
 * @param currentDepth : the crawl depth of the link
 * @param parentUri : the URI of the page that contains the link
 * @param link : the link url
 * @param anchor : the anchor text of the link
 * @param isDoFollow : true if the link is dofollow
 * @returns true if the link has to be crawled
 */
Crawler.prototype.isAGoodLinkToCrawl = function(result, currentDepth, parentUri, link, anchor, isDoFollow) {

    // 1. Check the depthLimit
    if (result.depthLimit > -1 && currentDepth > result.depthLimit) {
        return false;
    }

    // 2. Check if we need to crawl external links
    if (URI.isExternalLink(parentUri, link) && ! result.externalLinks) {
        return false;
    }

    // 3. Check if we need to crawl external domains
    if (! this.startFromHosts.has(URI.host(parentUri)) && ! result.externalDomains) {
        return false;
    }

    // 4. Check if the link is based on a good protocol
    if (result.protocols.indexOf(URI.protocol(link)) < 0) {
        return false;
    }

    // 5. Check if the domain is in the black-list
    if (result.domainBlackList.indexOf(URI.domainName(link)) > -1) {
        return false;
    }

    // 6. Check if there is a rule in the crawler configuration (a canCrawl sketch follows this function)
    if (! result.canCrawl) {
        return true;
    }

    var check = result.canCrawl(parentUri, link, anchor, isDoFollow);
    //console.log(parentUri + " - " + link + " : " + check);
    return check;
}
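
// Sketch of a custom canCrawl rule passed via the config (the "/blog/" filter is a placeholder):
//
//   var crawler = new Crawler({
//       canCrawl : function(parentUri, link, anchor, isDoFollow) {
//           // Only follow dofollow links that stay under /blog/
//           return isDoFollow && link.indexOf("/blog/") > -1;
//       }
//   });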

/**
 * Compute the crawl depth for a link as a function of the crawl depth
 * of the page that contains the link
 * (a short example of the depth propagation follows this function)
 *
 * @param parentUri : the URI of the page that contains the link
 * @param linkUri : the link for which the crawl depth has to be calculated
 * @returns the crawl depth of the link
 *
 */
var updateDepth = function(parentUri, linkUri) {

    if (this.depthUrls.has(parentUri)) {

        var parentDepth = this.depthUrls.get(parentUri);
        if (this.depthUrls.has(linkUri)) {
            return this.depthUrls.get(linkUri);
        }
        else {
            var depth = parentDepth + 1;
            this.depthUrls.set(linkUri, depth);
            return depth;
        }
    }
    else {
        this.depthUrls.set(parentUri, 0);
        this.depthUrls.set(linkUri, 1);
        return 1;
    }

}
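
// Example of how depths propagate on a fresh crawl (urls are placeholders):
//
//   crawler.updateDepth("http://example.com/", "http://example.com/a");  // parent unknown : parent set to 0, link set to 1 -> returns 1
//   crawler.updateDepth("http://example.com/a", "http://example.com/b"); // parent at depth 1 : link set to 2 -> returns 2
//   crawler.updateDepth("http://example.com/", "http://example.com/b");  // link already known : returns its stored depth (2)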

function emitCrawlEvent(crawler, result, $) {
    crawler.emit("crawl", result, $);
}

function emitErrorEvent(crawler, error, result) {
    crawler.emit("error", error, result);
}

function emitRedirectEvent(crawler, from, to, statusCode) {
    crawler.emit("crawlRedirect", from, to, statusCode);
}

function emitCrawlHrefEvent(crawler, eventName, parentUri, linkUri, anchor, isDoFollow) {
    crawler.emit(eventName, parentUri, linkUri, anchor, isDoFollow);
}

function emitCrawlLinkEvent(crawler, parentUri, linkUri) {
    crawler.emit("crawlLink", parentUri, linkUri);
}

function emitUnCrawlEvent(crawler, parentUri, linkUri) {
    crawler.emit("uncrawl", parentUri, linkUri);
}

function emitCrawlImage(crawler, parentUri, linkUri, alt) {
    crawler.emit("crawlImage", parentUri, linkUri, alt);
}

module.exports.Crawler = Crawler;