1 | var events = require('events');
|
2 | var timers = require('timers');
|
3 | var util = require("util");
|
4 | var _ = require("underscore");
|
5 | var requester = require("./lib/queue-requester");
|
6 | var URI = require('./lib/uri.js');
|
7 | var Map = require("collections/fast-map");
|
8 | var Set = require("collections/fast-set");
|
9 | var html = require("./lib/html.js");
|
10 | var domainBlackList = require("./default-lists/domain-black-list.js").list();
|
11 |
|
// ---------------------------------------------------------------------------
// Default values used by Crawler.prototype.createDefaultConfig
// ---------------------------------------------------------------------------

// Connection & retry behaviour.
var DEFAULT_NUMBER_OF_CONNECTIONS = 10;
var DEFAULT_DEPTH_LIMIT           = -1;     // -1 disables the depth check in isAGoodLinkToCrawl
var DEFAULT_TIME_OUT              = 20000;  // presumably milliseconds - TODO confirm against the requester
var DEFAULT_RETRIES               = 3;
var DEFAULT_RETRY_TIMEOUT         = 10000;  // presumably milliseconds - TODO confirm against the requester
var DEFAULT_SKIP_DUPLICATES       = true;
var DEFAULT_RATE_LIMITS           = 0;      // any other value makes the constructor force maxConnections to 1
var DEFAULT_MAX_ERRORS            = 5;
var DEFAULT_ERROR_RATES           = [200, 350, 500];

// What the crawler is allowed to follow.
var DEFAULT_CRAWL_EXTERNAL_LINKS   = false;
var DEFAULT_CRAWL_EXTERNAL_DOMAINS = false;
var DEFAULT_CRAWL_EXTERNAL_HOSTS   = false;
var DEFAULT_CRAWL_SCRIPTS          = true;
var DEFAULT_CRAWL_LINKS            = true;
var DEFAULT_CRAWL_IMAGES           = true;

// Protocols accepted by isAGoodLinkToCrawl.
var DEFAULT_PROTOCOLS_TO_CRAWL = ["http", "https"];
// Exposed as config.followRedirect; when false, crawl() queues redirect targets itself.
var DEFAULT_FOLLOW_301         = false;

// rel values of <link> tags that crawlLinks() considers.
var DEFAULT_LINKS_TYPES = ["canonical", "stylesheet"];
var DEFAULT_USER_AGENT  = "NinjaBot";
var DEFAULT_CACHE       = false;
var DEFAULT_METHOD      = 'GET';
var DEFAULT_REFERER     = false;
|
37 |
|
38 |
|
39 |
|
40 |
|
41 |
|
42 |
|
43 |
|
44 |
|
45 |
|
46 |
|
47 |
|
48 |
|
49 |
|
50 |
|
51 |
|
52 |
|
53 |
|
54 |
|
55 |
|
56 |
|
57 |
|
58 |
|
59 |
|
60 |
|
61 |
|
62 |
|
63 |
|
64 |
|
65 |
|
/**
 * Crawler constructor. Instances are event emitters (see util.inherits below).
 *
 * @param {Object} [config] - Overrides merged on top of createDefaultConfig().
 *        A truthy `updateDepth` entry replaces the module-level depth
 *        function used by the crawl* methods.
 */
function Crawler(config) {

    // uri -> depth, filled by updateDepth().
    this.depthUrls = new Map();

    // Hosts and domains of the URLs handed to queue().
    this.startFromHosts = new Set();
    this.startFromDomains = new Set();

    // Defaults first, caller overrides on top.
    this.config = _.extend(this.createDefaultConfig(), config || {});

    // Any rate limit forces a single connection.
    if (this.config.rateLimits !== 0) {
        this.config.maxConnections = 1;
    }

    // A caller-supplied depth function wins over the module-level one.
    this.updateDepth = this.config.updateDepth || updateDepth;

    this.httpRequester = new requester.Requester(this.config);

    events.EventEmitter.call(this);
}

util.inherits(Crawler, events.EventEmitter);
|
110 |
|
111 |
|
112 |
|
113 |
|
114 |
|
115 |
|
116 |
|
117 |
|
/**
 * Adds one or several start URLs to the request queue.
 *
 * @param {string|Object|Array} options - A URL string, an options object
 *        owning a `url` or `uri` key, or an array of either (each entry is
 *        queued in turn).
 *
 * Missing options, or an object without `url`/`uri`, are reported through
 * config.onCrawl (when set) with an errorCode; config.onDrain is then called
 * if the requester reports itself idle.
 */
Crawler.prototype.queue = function(options) {

    var crawler = this;

    // Shared rejection path for the two invalid-input cases.
    function reject(errorCode) {
        if (crawler.config.onCrawl) {
            crawler.config.onCrawl({errorCode : errorCode}, {method:"GET", url : "unknown", proxy : "", error : true});
        }

        if (crawler.httpRequester.idle()) {
            crawler.config.onDrain();
        }
    }

    if (! options) {
        reject("NO_OPTIONS");
        return;
    }

    // Arrays are queued entry by entry.
    if (_.isArray(options)) {
        options.forEach(crawler.queue, crawler);
        return;
    }

    var isPlainUrl = _.isString(options);

    if (! isPlainUrl && ! _.has(options, "url") && ! _.has(options, "uri")) {
        reject("NO_URL_OPTION");
        return;
    }

    var startUrl;
    var request;

    if (isPlainUrl) {
        startUrl = options;
        request = {uri:options, url:options};
    }
    else {
        startUrl = _.has(options, "url") ? options.url : options.uri;
        request = options;
    }

    // Remember where the crawl started; isAGoodLinkToCrawl reads these sets.
    crawler.startFromHosts.add(URI.host(startUrl));
    crawler.startFromDomains.add(URI.domain(startUrl));
    crawler.httpRequester.queue(crawler.addDefaultOptions(request, crawler.config));
};
|
172 |
|
/**
 * Fills the missing entries of `options` from `defaultOptions` (in place)
 * and aligns `maxRetries` with the resulting `retries` value.
 *
 * @param {Object} options - Request options; mutated and returned.
 * @param {Object} defaultOptions - Source of the fallback values.
 * @returns {Object} The same `options` object.
 */
Crawler.prototype.addDefaultOptions = function(options, defaultOptions) {

    var merged = _.defaults(options, defaultOptions);

    merged.maxRetries = merged.retries;

    return merged;
};
|
180 |
|
/**
 * Builds the request options for a newly discovered URL.
 *
 * Starts from a fresh default config for `newUrl`, then copies from
 * `options` every value whose key exists in that default config, except
 * `url` and `uri`. `canCrawl` is carried over separately when it is truthy.
 *
 * @param {Object} options - Options (or result) of the page the URL came from.
 * @param {string} newUrl - The URL to request next.
 * @returns {Object} A new options object.
 */
Crawler.prototype.buildNewOptions = function(options, newUrl) {

    var fresh = this.createDefaultConfig(newUrl);

    // Every default key is inherited from the parent, but never its address.
    var inheritedKeys = _.without(_.keys(fresh), "url", "uri");
    _.extend(fresh, _.pick(options, inheritedKeys));

    fresh.maxRetries = fresh.retries;

    if (options.canCrawl) {
        fresh.canCrawl = options.canCrawl;
    }

    return fresh;
};
|
199 |
|
200 |
|
201 |
|
202 |
|
203 |
|
204 |
|
205 |
|
/**
 * Creates a config object filled with the module defaults.
 *
 * The array-valued entries (errorRates, protocols, linkTypes,
 * domainBlackList) reference the shared module-level arrays; they are not
 * copied.
 *
 * @param {string} [url] - When truthy, stored as both `url` and `uri`.
 * @returns {Object} A new config object bound to this crawler through its
 *          onCrawl / onDrain callbacks.
 */
Crawler.prototype.createDefaultConfig = function(url) {

    var crawler = this;

    var config = {
        cache           : DEFAULT_CACHE,
        method          : DEFAULT_METHOD,
        referer         : DEFAULT_REFERER,
        maxConnections  : DEFAULT_NUMBER_OF_CONNECTIONS,
        timeout         : DEFAULT_TIME_OUT,
        retries         : DEFAULT_RETRIES,
        maxRetries      : DEFAULT_RETRIES,
        retryTimeout    : DEFAULT_RETRY_TIMEOUT,
        maxErrors       : DEFAULT_MAX_ERRORS,
        errorRates      : DEFAULT_ERROR_RATES,
        skipDuplicates  : DEFAULT_SKIP_DUPLICATES,
        rateLimits      : DEFAULT_RATE_LIMITS,
        externalLinks   : DEFAULT_CRAWL_EXTERNAL_LINKS,
        externalDomains : DEFAULT_CRAWL_EXTERNAL_DOMAINS,
        externalHosts   : DEFAULT_CRAWL_EXTERNAL_HOSTS,
        protocols       : DEFAULT_PROTOCOLS_TO_CRAWL,
        depthLimit      : DEFAULT_DEPTH_LIMIT,
        followRedirect  : DEFAULT_FOLLOW_301,
        images          : DEFAULT_CRAWL_IMAGES,
        links           : DEFAULT_CRAWL_LINKS,
        linkTypes       : DEFAULT_LINKS_TYPES,
        scripts         : DEFAULT_CRAWL_SCRIPTS,
        userAgent       : DEFAULT_USER_AGENT,
        domainBlackList : domainBlackList
    };

    // Every finished request is handed to this crawler's crawl().
    config.onCrawl = function(error, result) {
        crawler.crawl(error, result);
    };

    // 'end' is emitted asynchronously, on the next setImmediate turn.
    config.onDrain = function() {
        timers.setImmediate(function() {
            crawler.emit('end');
        });
    };

    if (url) {
        config.url = url;
        config.uri = url;
    }

    return config;
};
|
257 |
|
258 |
|
259 |
|
260 |
|
261 |
|
262 |
|
263 |
|
264 |
|
265 |
|
/**
 * Handles the outcome of one HTTP request (wired in as config.onCrawl).
 *
 * - On error: schedules an "error" event and stops.
 * - Otherwise: schedules a "crawl" event (with a parsed document when the
 *   body is HTML, else null) and analyzes the HTML for further links.
 * - When the response is a 3xx carrying a Location header and
 *   config.followRedirect is off, schedules a "crawlRedirect" event and
 *   queues the redirect target itself.
 *
 * @param {*} error - Truthy when the request failed.
 * @param {Object} result - Request result; this method reads `body`,
 *        `statusCode`, `headers` and `uri`.
 */
Crawler.prototype.crawl = function (error, result) {

    var self = this;

    if (error) {
        timers.setImmediate(emitErrorEvent, self, error, result);
        return;
    }

    var $ = html.isHTML(result.body) ? html.$(result.body) : null;

    timers.setImmediate(emitCrawlEvent, self, result, $);

    if ($) {
        this.analyzeHTML(result, $);
    }

    var isRedirect = result.statusCode >= 300 && result.statusCode <= 399;
    if (! isRedirect || this.config.followRedirect) {
        return;
    }

    // Not every 3xx names a target (e.g. 304 Not Modified); without a
    // Location header there is nothing to resolve or queue.
    var location = result.headers ? result.headers["location"] : undefined;
    if (! location) {
        return;
    }

    var from = result.uri;
    var to = URI.linkToURI(from, location);

    timers.setImmediate(emitRedirectEvent, self, from, to, result.statusCode);

    this.httpRequester.queue(this.buildNewOptions(result, to));
};
|
298 |
|
299 |
|
300 |
|
301 |
|
302 |
|
303 |
|
304 |
|
305 |
|
306 |
|
307 |
|
/**
 * Extracts the resources referenced by an HTML page.
 *
 * Anchors are always processed; <link>, <script> and <img> tags only when
 * the matching config flag (links / scripts / images) is truthy.
 *
 * @param {Object} result - Result of the request that returned the page.
 * @param {Function} $ - Selector function for the parsed page.
 */
Crawler.prototype.analyzeHTML = function(result, $) {

    var crawler = this;

    // [config flag, method to run when the flag is on], in execution order.
    var optionalSteps = [
        ["links", "crawlLinks"],
        ["scripts", "crawlScripts"],
        ["images", "crawlImages"]
    ];

    crawler.crawlHrefs(result, $);

    optionalSteps.forEach(function(step) {
        if (crawler.config[step[0]]) {
            crawler[step[1]](result, $);
        }
    });
};
|
327 |
|
328 |
|
329 |
|
330 |
|
331 |
|
332 |
|
333 |
|
334 |
|
335 |
|
/**
 * Processes every <a> tag of a page that has a non-empty href.
 *
 * For each such anchor a "crawlLink" event is scheduled; the target is then
 * either queued (when isAGoodLinkToCrawl accepts it) or reported through a
 * scheduled "uncrawl" event.
 *
 * An anchor counts as nofollow when its rel attribute contains the token
 * "nofollow". rel is a whitespace-separated, case-insensitive token list,
 * so rel="nofollow noopener" and rel="NoFollow" are nofollow too.
 *
 * @param {Object} result - Result of the request that returned the page;
 *        `result.uri` is used as the parent URI.
 * @param {Function} $ - Selector function for the parsed page.
 */
Crawler.prototype.crawlHrefs = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    // True when the rel attribute value carries the "nofollow" token.
    function hasNoFollowToken(rel) {
        if (! rel) {
            return false;
        }
        return String(rel).toLowerCase().split(/\s+/).indexOf("nofollow") >= 0;
    }

    $('a').each(function(index, a) {

        var $a = $(a);
        var link = $a.attr('href');

        if (! link) {
            return;
        }

        var anchor = $a.text() || "";
        var isDoFollow = ! hasNoFollowToken($a.attr("rel"));

        var linkUri = URI.linkToURI(parentUri, link);
        var currentDepth = self.updateDepth(parentUri, linkUri);

        timers.setImmediate(emitCrawlHrefEvent, self, "crawlLink", parentUri, linkUri, anchor, isDoFollow);

        if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri, anchor, isDoFollow)) {
            self.httpRequester.queue(self.buildNewOptions(result, linkUri));
        }
        else {
            timers.setImmediate(emitCrawlHrefEvent, self, "uncrawl", parentUri, linkUri, anchor, isDoFollow);
        }
    });
};
|
368 |
|
369 |
|
370 |
|
371 |
|
372 |
|
373 |
|
374 |
|
375 |
|
/**
 * Processes the <link> tags of a page whose rel value is listed in
 * config.linkTypes and that have a non-empty href.
 *
 * For each such tag a "crawlLink" event is scheduled; the target is then
 * either queued (when isAGoodLinkToCrawl accepts it) or reported through a
 * scheduled "uncrawl" event.
 *
 * @param {Object} result - Result of the request that returned the page;
 *        `result.uri` is used as the parent URI.
 * @param {Function} $ - Selector function for the parsed page.
 */
Crawler.prototype.crawlLinks = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('link').each(function(index, linkTag) {

        var link = $(linkTag).attr('href');

        if (! link) {
            return;
        }

        var rel = $(linkTag).attr('rel');

        // indexOf returns 0 for the first configured type, so the test must
        // be "found at any position" (>= 0), not "> 0".
        if (self.config.linkTypes.indexOf(rel) < 0) {
            return;
        }

        var linkUri = URI.linkToURI(parentUri, link);
        var currentDepth = self.updateDepth(parentUri, linkUri);

        timers.setImmediate(emitCrawlLinkEvent, self, parentUri, linkUri);

        if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {
            self.httpRequester.queue(self.buildNewOptions(result, linkUri));
        }
        else {
            timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
        }
    });
};
|
410 |
|
411 |
|
412 |
|
413 |
|
414 |
|
415 |
|
416 |
|
/**
 * Processes every <script> tag of a page that has a non-empty src.
 *
 * For each such tag a "crawlLink" event is scheduled; the target is then
 * either queued (when isAGoodLinkToCrawl accepts it) or reported through a
 * scheduled "uncrawl" event.
 *
 * @param {Object} result - Result of the request that returned the page;
 *        `result.uri` is used as the parent URI.
 * @param {Function} $ - Selector function for the parsed page.
 */
Crawler.prototype.crawlScripts = function(result, $) {

    var crawler = this;
    var pageUri = result.uri;

    $('script').each(function(position, scriptTag) {

        var src = $(scriptTag).attr('src');

        // Inline scripts have no src and nothing to fetch.
        if (! src) {
            return;
        }

        var scriptUri = URI.linkToURI(pageUri, src);
        var depth = crawler.updateDepth(pageUri, scriptUri);

        timers.setImmediate(emitCrawlLinkEvent, crawler, pageUri, scriptUri);

        if (! crawler.isAGoodLinkToCrawl(result, depth, pageUri, scriptUri)) {
            timers.setImmediate(emitUnCrawlEvent, crawler, pageUri, scriptUri);
            return;
        }

        crawler.httpRequester.queue(crawler.buildNewOptions(result, scriptUri));
    });
};
|
444 |
|
445 |
|
446 |
|
447 |
|
448 |
|
449 |
|
450 |
|
/**
 * Processes every <img> tag of a page that has a non-empty src.
 *
 * For each such tag a "crawlImage" event (carrying the alt text) is
 * scheduled; the target is then either queued (when isAGoodLinkToCrawl
 * accepts it) or reported through a scheduled "uncrawl" event.
 *
 * @param {Object} result - Result of the request that returned the page;
 *        `result.uri` is used as the parent URI.
 * @param {Function} $ - Selector function for the parsed page.
 */
Crawler.prototype.crawlImages = function(result, $) {

    var crawler = this;
    var pageUri = result.uri;

    $('img').each(function(position, imgTag) {

        var $img = $(imgTag);
        var src = $img.attr('src');
        var altText = $img.attr('alt');

        if (! src) {
            return;
        }

        var imageUri = URI.linkToURI(pageUri, src);
        var depth = crawler.updateDepth(pageUri, imageUri);

        timers.setImmediate(emitCrawlImage, crawler, pageUri, imageUri, altText);

        if (! crawler.isAGoodLinkToCrawl(result, depth, pageUri, imageUri)) {
            timers.setImmediate(emitUnCrawlEvent, crawler, pageUri, imageUri);
            return;
        }

        crawler.httpRequester.queue(crawler.buildNewOptions(result, imageUri));
    });
};
|
480 |
|
481 |
|
482 |
|
483 |
|
484 |
|
485 |
|
486 |
|
487 |
|
488 |
|
/**
 * Decides whether a discovered link should be queued.
 *
 * The checks run in this order; the first failing one rejects the link:
 *  1. depth limit (skipped when result.depthLimit is -1 or lower),
 *  2. external links, unless result.externalLinks is on,
 *  3. parent page host not among the start hosts, unless
 *     result.externalHosts is on,
 *  4. parent page domain not among the start domains, unless
 *     result.externalDomains is on,
 *  5. link protocol not listed in result.protocols,
 *  6. link domain name listed in result.domainBlackList,
 *  7. the optional result.canCrawl callback.
 *
 * NOTE(review): checks 3 and 4 look at the parent page, not at the link
 * itself - confirm this is the intended meaning of externalHosts /
 * externalDomains.
 *
 * @param {Object} result - Options/result of the parent page's request.
 * @param {number} currentDepth - Depth of the link (see updateDepth).
 * @param {string} parentUri - URI of the page containing the link.
 * @param {string} link - Absolute URI of the link.
 * @param {string} [anchor] - Anchor text, forwarded to canCrawl.
 * @param {boolean} [isDoFollow] - Follow flag, forwarded to canCrawl.
 * @returns {*} false when rejected, true when accepted without a canCrawl
 *          callback, otherwise whatever canCrawl returns.
 */
Crawler.prototype.isAGoodLinkToCrawl = function(result, currentDepth, parentUri, link, anchor, isDoFollow) {

    if (result.depthLimit > -1 && currentDepth > result.depthLimit) {
        return false;
    }

    if (URI.isExternalLink(parentUri, link) && ! result.externalLinks) {
        return false;
    }

    if (! this.startFromHosts.has(URI.host(parentUri)) && ! result.externalHosts) {
        return false;
    }

    if (! this.startFromDomains.has(URI.domain(parentUri)) && ! result.externalDomains) {
        return false;
    }

    if (result.protocols.indexOf(URI.protocol(link)) < 0) {
        return false;
    }

    // indexOf returns 0 for the first blacklist entry, so the test must be
    // "found at any position" (>= 0), not "> 0".
    if (result.domainBlackList.indexOf(URI.domainName(link)) >= 0) {
        return false;
    }

    if (! result.canCrawl) {
        return true;
    }

    return result.canCrawl(parentUri, link, anchor, isDoFollow);
};
|
531 |
|
532 |
|
533 |
|
534 |
|
535 |
|
536 |
|
537 |
|
538 |
|
539 |
|
540 |
|
/**
 * Default depth function; called with a crawler as `this`.
 *
 * - Unknown parent: the parent is recorded at depth 0 and the link at
 *   depth 1 (overwriting any depth already stored for the link).
 * - Known parent, known link: the stored link depth is returned unchanged.
 * - Known parent, unknown link: the link is recorded at parent depth + 1.
 *
 * @param {string} parentUri - URI of the page containing the link.
 * @param {string} linkUri - URI of the link.
 * @returns {number} The depth recorded for `linkUri`.
 */
var updateDepth = function(parentUri, linkUri) {

    var depths = this.depthUrls;

    if (! depths.has(parentUri)) {
        depths.set(parentUri, 0);
        depths.set(linkUri, 1);
        return 1;
    }

    if (depths.has(linkUri)) {
        return depths.get(linkUri);
    }

    var linkDepth = depths.get(parentUri) + 1;
    depths.set(linkUri, linkDepth);
    return linkDepth;
};
|
563 |
|
/**
 * Emits "crawl" on the crawler; used as a setImmediate callback by crawl().
 *
 * @param {Object} crawler - Emitter to fire the event on.
 * @param {Object} result - Result of the finished request.
 * @param {Function|null} $ - Parsed page, or null when the body is not HTML.
 */
function emitCrawlEvent(crawler, result, $) {
    var eventName = "crawl";
    crawler.emit(eventName, result, $);
}
|
568 |
|
/**
 * Emits "error" on the crawler; used as a setImmediate callback by crawl().
 *
 * @param {Object} crawler - Emitter to fire the event on.
 * @param {*} error - The request error.
 * @param {Object} result - Result/options of the failed request.
 */
function emitErrorEvent(crawler, error, result) {
    var eventName = "error";
    crawler.emit(eventName, error, result);
}
|
572 |
|
/**
 * Emits "crawlRedirect" on the crawler; used as a setImmediate callback by
 * crawl().
 *
 * @param {Object} crawler - Emitter to fire the event on.
 * @param {string} from - URI that answered with the redirect.
 * @param {string} to - Resolved redirect target.
 * @param {number} statusCode - HTTP status code of the redirect response.
 */
function emitRedirectEvent(crawler, from, to, statusCode) {
    var eventName = "crawlRedirect";
    crawler.emit(eventName, from, to, statusCode);
}
|
576 |
|
577 |
|
/**
 * Emits an anchor-related event on the crawler; used as a setImmediate
 * callback by crawlHrefs(), which passes "crawlLink" or "uncrawl" as the
 * event name.
 *
 * @param {Object} crawler - Emitter to fire the event on.
 * @param {string} eventName - Name of the event to emit.
 * @param {string} parentUri - URI of the page containing the anchor.
 * @param {string} linkUri - Resolved URI of the anchor target.
 * @param {string} anchor - Anchor text.
 * @param {boolean} isDoFollow - False when the anchor is nofollow.
 */
function emitCrawlHrefEvent(crawler, eventName, parentUri, linkUri, anchor, isDoFollow) {
    var payload = [eventName, parentUri, linkUri, anchor, isDoFollow];
    crawler.emit.apply(crawler, payload);
}
|
581 |
|
/**
 * Emits "crawlLink" on the crawler; used as a setImmediate callback by
 * crawlLinks() and crawlScripts().
 *
 * @param {Object} crawler - Emitter to fire the event on.
 * @param {string} parentUri - URI of the page containing the tag.
 * @param {string} linkUri - Resolved URI of the referenced resource.
 */
function emitCrawlLinkEvent(crawler, parentUri, linkUri) {
    var eventName = "crawlLink";
    crawler.emit(eventName, parentUri, linkUri);
}
|
585 |
|
/**
 * Emits "uncrawl" on the crawler; used as a setImmediate callback when
 * isAGoodLinkToCrawl rejects a <link>, <script> or <img> target.
 *
 * @param {Object} crawler - Emitter to fire the event on.
 * @param {string} parentUri - URI of the page containing the tag.
 * @param {string} linkUri - Resolved URI of the rejected resource.
 */
function emitUnCrawlEvent(crawler, parentUri, linkUri) {
    var eventName = "uncrawl";
    crawler.emit(eventName, parentUri, linkUri);
}
|
589 |
|
/**
 * Emits "crawlImage" on the crawler; used as a setImmediate callback by
 * crawlImages().
 *
 * @param {Object} crawler - Emitter to fire the event on.
 * @param {string} parentUri - URI of the page containing the image.
 * @param {string} linkUri - Resolved URI of the image.
 * @param {string} [alt] - Value of the image's alt attribute, if any.
 */
function emitCrawlImage(crawler, parentUri, linkUri, alt) {
    var eventName = "crawlImage";
    crawler.emit(eventName, parentUri, linkUri, alt);
}
|
594 |
|
595 | module.exports.Crawler = Crawler;
|