1 | var events = require('events');
|
2 | var timers = require('timers');
|
3 | var util = require("util");
|
4 | var _ = require("underscore");
|
5 | var requester = require("./lib/queue-requester");
|
6 | var URI = require('./lib/uri.js');
|
7 | var Map = require("collections/fast-map");
|
8 | var Set = require("collections/fast-set");
|
9 | var html = require("./lib/html.js");
|
10 | var domainBlackList = require("./default-lists/domain-black-list.js").list();
|
11 |
|
// ---------------------------------------------------------------------------
// Default values for the crawler options (see createDefaultConfig).
// ---------------------------------------------------------------------------

// HTTP request settings.
var DEFAULT_METHOD = 'GET';
var DEFAULT_USER_AGENT = "NinjaBot";
var DEFAULT_REFERER = false;
var DEFAULT_CACHE = false;
var DEFAULT_FOLLOW_301 = false;
var DEFAULT_TIME_OUT = 20000;

// Concurrency, throttling and retry settings.
// A rate limit other than 0 makes the Crawler constructor force a single connection.
var DEFAULT_NUMBER_OF_CONNECTIONS = 10;
var DEFAULT_RATE_LIMITS = 0;
var DEFAULT_RETRIES = 3;
var DEFAULT_RETRY_TIMEOUT = 10000;
var DEFAULT_MAX_ERRORS = 5;
var DEFAULT_ERROR_RATES = [200, 350, 500];

// Crawl scope. A depth limit of -1 disables the depth check in isAGoodLinkToCrawl.
var DEFAULT_DEPTH_LIMIT = -1;
var DEFAULT_SKIP_DUPLICATES = true;
var DEFAULT_CRAWL_EXTERNAL_LINKS = false;
var DEFAULT_CRAWL_EXTERNAL_DOMAINS = false;
var DEFAULT_PROTOCOLS_TO_CRAWL = ["http", "https"];

// Kinds of resources extracted from HTML pages.
var DEFAULT_CRAWL_SCRIPTS = true;
var DEFAULT_CRAWL_LINKS = true;
var DEFAULT_CRAWL_IMAGES = true;
var DEFAULT_LINKS_TYPES = ["canonical", "stylesheet"];
|
36 |
|
37 |
|
38 |
|
39 |
|
40 |
|
41 |
|
42 |
|
43 |
|
44 |
|
45 |
|
46 |
|
47 |
|
48 |
|
49 |
|
50 |
|
51 |
|
52 |
|
53 |
|
54 |
|
55 |
|
56 |
|
57 |
|
58 |
|
59 |
|
60 |
|
61 |
|
62 |
|
63 |
|
64 |
|
/**
 * Crawler constructor.
 *
 * Builds the effective configuration (defaults overridden by `config`),
 * prepares the bookkeeping structures used while crawling and creates the
 * HTTP requester. Crawler instances are EventEmitters.
 *
 * @param {Object} [config] Options overriding the values returned by
 *        createDefaultConfig(). A function passed as `config.updateDepth`
 *        replaces the default depth computation.
 */
function Crawler(config) {

    // Depth recorded for each URI, read and written by updateDepth.
    this.depthUrls = new Map();

    // Hosts of the URLs handed to queue().
    this.startFromHosts = new Set();

    // Start from the defaults, then lay the caller's options on top.
    var settings = this.createDefaultConfig();
    if (config) {
        _.extend(settings, config);
    }
    this.config = settings;

    // Any rate limit other than 0 forces a single connection.
    if (settings.rateLimits !== 0) {
        settings.maxConnections = 1;
    }

    // Depth computation: the caller's function when supplied, the module default otherwise.
    this.updateDepth = settings.updateDepth ? settings.updateDepth : updateDepth;

    this.httpRequester = new requester.Requester(settings);

    events.EventEmitter.call(this);
}

util.inherits(Crawler, events.EventEmitter);
|
106 |
|
107 |
|
108 |
|
109 |
|
110 |
|
111 |
|
112 |
|
113 |
|
/**
 * Add one or more URLs to the crawl queue.
 *
 * @param {string|Object|Array} options
 *        - a string: the URL to crawl;
 *        - an object: request options, which must own a `url` or `uri` key;
 *        - an array: each entry is queued in turn (strings and objects may be mixed).
 *
 * An unusable entry (falsy, or an object without `url`/`uri`) is reported
 * through `config.onCrawl` with the error code "NO_OPTIONS" or
 * "NO_URL_OPTION"; if the requester is idle at that point `config.onDrain`
 * is called as well. Nothing is thrown for such entries.
 *
 * Note: an options object is completed in place (missing `url`/`uri` mirror,
 * then the defaults added by addDefaultOptions).
 */
Crawler.prototype.queue = function(options) {

    var self = this;

    // Report an entry that cannot be queued. Both callbacks are optional:
    // the caller's config may have overridden them with a falsy value.
    function reject(errorCode) {
        if (self.config.onCrawl) {
            self.config.onCrawl({errorCode : errorCode}, {method:"GET", url : "unknown", proxy : "", error : true});
        }

        if (self.httpRequester.idle() && self.config.onDrain) {
            self.config.onDrain();
        }
    }

    if (! options) {
        reject("NO_OPTIONS");
        return;
    }

    // An array is queued entry by entry.
    if (_.isArray(options)) {
        options.forEach(function(opt) {
            self.queue(opt);
        });
        return;
    }

    var request;

    if (_.isString(options)) {
        request = {uri : options, url : options};
    }
    else {
        var hasUrl = _.has(options, "url");
        var hasUri = _.has(options, "uri");

        if (! hasUrl && ! hasUri) {
            reject("NO_URL_OPTION");
            return;
        }

        // Keep `url` and `uri` in sync, as the string form and
        // createDefaultConfig do (crawl() later reads `result.uri`).
        if (! hasUrl) {
            options.url = options.uri;
        }
        else if (! hasUri) {
            options.uri = options.url;
        }

        request = options;
    }

    self.startFromHosts.add(URI.host(request.url));
    self.httpRequester.queue(self.addDefaultOptions(request, self.config));
};
|
166 |
|
/**
 * Complete a request options object with default values.
 *
 * The object is modified in place: every option it leaves undefined is
 * taken from `defaultOptions`, then `maxRetries` is set to its `retries`.
 *
 * @param {Object} options Request options to complete.
 * @param {Object} defaultOptions Values used for the missing options.
 * @returns {Object} The same `options` object.
 */
Crawler.prototype.addDefaultOptions = function(options, defaultOptions) {

    // _.defaults hands back its first argument once it is filled in.
    var completed = _.defaults(options, defaultOptions);

    completed.maxRetries = completed.retries;

    return completed;
};
|
174 |
|
/**
 * Build the request options for a URL discovered while crawling.
 *
 * Starts from a fresh default config targeting `newUrl`, then copies from
 * `options` every value whose key exists in that default config, except
 * `url` and `uri`. `maxRetries` is reset to `retries`, and a `canCrawl`
 * callback present on `options` is carried over.
 *
 * @param {Object} options Options (or crawl result) the new request derives from.
 * @param {string} newUrl URL of the new request.
 * @returns {Object} A new options object.
 */
Crawler.prototype.buildNewOptions = function(options, newUrl) {

    var fresh = this.createDefaultConfig(newUrl);

    // Every default option can be inherited, but never the target itself.
    var inheritedKeys = _.without(_.keys(fresh), "url", "uri");
    var inherited = _.pick(options, inheritedKeys);
    fresh = _.extend(fresh, inherited);

    // The new request starts with its full retry budget.
    fresh.maxRetries = fresh.retries;

    // canCrawl is not part of the default config, so it is copied explicitly.
    if (options.canCrawl) {
        fresh.canCrawl = options.canCrawl;
    }

    return fresh;
};
|
193 |
|
194 |
|
195 |
|
196 |
|
197 |
|
198 |
|
199 |
|
/**
 * Create a config object holding the default value of every option.
 *
 * The default `onCrawl` callback forwards to this crawler's crawl() method
 * and the default `onDrain` callback emits the 'end' event on the next
 * setImmediate turn.
 *
 * @param {string} [url] When given, stored as both `url` and `uri`.
 * @returns {Object} A new config object. Array-valued defaults are the
 *          module-level arrays themselves, not copies.
 */
Crawler.prototype.createDefaultConfig = function(url) {
    var crawler = this;

    // Default result handler: hand everything over to crawl().
    function handleCrawl(error, result) {
        crawler.crawl(error, result);
    }

    // Default drain handler: announce the end of the crawl asynchronously.
    function handleDrain() {
        timers.setImmediate(function() {
            crawler.emit('end');
        });
    }

    var defaults = {
        cache           : DEFAULT_CACHE,
        method          : DEFAULT_METHOD,
        referer         : DEFAULT_REFERER,
        maxConnections  : DEFAULT_NUMBER_OF_CONNECTIONS,
        timeout         : DEFAULT_TIME_OUT,
        retries         : DEFAULT_RETRIES,
        maxRetries      : DEFAULT_RETRIES,
        retryTimeout    : DEFAULT_RETRY_TIMEOUT,
        maxErrors       : DEFAULT_MAX_ERRORS,
        errorRates      : DEFAULT_ERROR_RATES,
        skipDuplicates  : DEFAULT_SKIP_DUPLICATES,
        rateLimits      : DEFAULT_RATE_LIMITS,
        externalLinks   : DEFAULT_CRAWL_EXTERNAL_LINKS,
        externalDomains : DEFAULT_CRAWL_EXTERNAL_DOMAINS,
        protocols       : DEFAULT_PROTOCOLS_TO_CRAWL,
        depthLimit      : DEFAULT_DEPTH_LIMIT,
        followRedirect  : DEFAULT_FOLLOW_301,
        images          : DEFAULT_CRAWL_IMAGES,
        links           : DEFAULT_CRAWL_LINKS,
        linkTypes       : DEFAULT_LINKS_TYPES,
        scripts         : DEFAULT_CRAWL_SCRIPTS,
        userAgent       : DEFAULT_USER_AGENT,
        domainBlackList : domainBlackList,
        onCrawl         : handleCrawl,
        onDrain         : handleDrain
    };

    if (url) {
        defaults.url = url;
        defaults.uri = url;
    }

    return defaults;
};
|
250 |
|
251 |
|
252 |
|
253 |
|
254 |
|
255 |
|
256 |
|
257 |
|
258 |
|
/**
 * Handle the outcome of one HTTP request (default `onCrawl` callback).
 *
 * - On error: schedules an "error" event and stops.
 * - Otherwise: schedules a "crawl" event (with a parsed document when the
 *   body is HTML, null otherwise) and, for HTML bodies, analyzes the page
 *   to find further resources.
 * - For a 3xx response, when `config.followRedirect` is falsy and the
 *   response carries a Location header: schedules a "crawlRedirect" event
 *   and queues the redirect target itself.
 *
 * @param {Object} error Error reported for the request, falsy on success.
 * @param {Object} result Request result; `body`, `statusCode`, `headers`
 *        and `uri` are read here.
 */
Crawler.prototype.crawl = function (error, result) {

    var self = this;

    if (error) {
        timers.setImmediate(emitErrorEvent, self, error, result);
        return;
    }

    var $ = html.isHTML(result.body) ? html.$(result.body) : null;

    timers.setImmediate(emitCrawlEvent, self, result, $);

    if ($) {
        self.analyzeHTML(result, $);
    }

    var isRedirect = result.statusCode >= 300 && result.statusCode <= 399;
    if (! isRedirect || self.config.followRedirect) {
        return;
    }

    // Not every 3xx response names a target (e.g. 304 Not Modified):
    // without a Location header there is nothing to resolve or queue.
    var location = result.headers ? result.headers["location"] : undefined;
    if (! location) {
        return;
    }

    var from = result.uri;
    var to = URI.linkToURI(from, location);

    timers.setImmediate(emitRedirectEvent, self, from, to, result.statusCode);

    self.httpRequester.queue(self.buildNewOptions(result, to));
};
|
291 |
|
292 |
|
293 |
|
294 |
|
295 |
|
296 |
|
297 |
|
298 |
|
299 |
|
300 |
|
/**
 * Extract the crawlable resources of an HTML page.
 *
 * Anchors are always processed. `link`, `script` and `img` tags are
 * processed only when the matching option (`links`, `scripts`, `images`)
 * is enabled in the crawler config.
 *
 * @param {Object} result Result of the HTTP request for the page.
 * @param {Function} $ Query function over the parsed page.
 */
Crawler.prototype.analyzeHTML = function(result, $) {

    this.crawlHrefs(result, $);

    // [config flag, method handling that kind of tag], in processing order.
    var optionalSteps = [
        ["links", "crawlLinks"],
        ["scripts", "crawlScripts"],
        ["images", "crawlImages"]
    ];

    for (var i = 0; i < optionalSteps.length; i++) {
        var flag = optionalSteps[i][0];
        var handler = optionalSteps[i][1];

        if (this.config[flag]) {
            this[handler](result, $);
        }
    }
};
|
320 |
|
321 |
|
322 |
|
323 |
|
324 |
|
325 |
|
326 |
|
327 |
|
328 |
|
/**
 * Process every anchor (`a` tag) of a page.
 *
 * For each anchor with an href: resolves the link against the page URI,
 * updates its depth, schedules a "crawlLink" event, then either queues the
 * link or schedules an "uncrawl" event, depending on isAGoodLinkToCrawl.
 * Both events carry (parentUri, linkUri, anchor text, isDoFollow).
 *
 * @param {Object} result Result of the HTTP request for the page; its
 *        `uri` is the parent of every link found.
 * @param {Function} $ Query function over the parsed page.
 */
Crawler.prototype.crawlHrefs = function(result, $) {
    var parentUri = result.uri;
    var self = this;

    $('a').each(function(index, a) {

        var $a = $(a);
        var link = $a.attr('href');

        if (! link) {
            return;
        }

        var anchor = $a.text() || "";

        // `rel` is a space-separated, case-insensitive list of link types,
        // so "nofollow" must be searched among its tokens
        // (e.g. rel="nofollow noopener").
        var rel = $a.attr("rel");
        var relTokens = rel ? String(rel).toLowerCase().split(/\s+/) : [];
        var isDoFollow = relTokens.indexOf("nofollow") < 0;

        var linkUri = URI.linkToURI(parentUri, link);
        var currentDepth = self.updateDepth(parentUri, linkUri);

        timers.setImmediate(emitCrawlHrefEvent, self, "crawlLink", parentUri, linkUri, anchor, isDoFollow);

        if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri, anchor, isDoFollow)) {
            self.httpRequester.queue(self.buildNewOptions(result, linkUri));
        }
        else {
            timers.setImmediate(emitCrawlHrefEvent, self, "uncrawl", parentUri, linkUri, anchor, isDoFollow);
        }
    });
};
|
361 |
|
362 |
|
363 |
|
364 |
|
365 |
|
366 |
|
367 |
|
368 |
|
/**
 * Process the `link` tags of a page.
 *
 * Only tags with an href whose `rel` value is listed in
 * `config.linkTypes` are considered. For each of them: resolves the link
 * against the page URI, updates its depth, schedules a "crawlLink" event,
 * then either queues the link or schedules an "uncrawl" event, depending
 * on isAGoodLinkToCrawl.
 *
 * @param {Object} result Result of the HTTP request for the page; its
 *        `uri` is the parent of every link found.
 * @param {Function} $ Query function over the parsed page.
 */
Crawler.prototype.crawlLinks = function(result, $) {

    var parentUri = result.uri;
    var self = this;

    $('link').each(function(index, linkTag) {

        var $tag = $(linkTag);
        var link = $tag.attr('href');

        if (! link) {
            return;
        }

        var rel = $tag.attr('rel');

        // indexOf returns 0 for the first configured type, so "found"
        // means any index other than -1.
        if (self.config.linkTypes.indexOf(rel) < 0) {
            return;
        }

        var linkUri = URI.linkToURI(parentUri, link);
        var currentDepth = self.updateDepth(parentUri, linkUri);

        timers.setImmediate(emitCrawlLinkEvent, self, parentUri, linkUri);

        if (self.isAGoodLinkToCrawl(result, currentDepth, parentUri, linkUri)) {
            self.httpRequester.queue(self.buildNewOptions(result, linkUri));
        }
        else {
            timers.setImmediate(emitUnCrawlEvent, self, parentUri, linkUri);
        }
    });
};
|
403 |
|
404 |
|
405 |
|
406 |
|
407 |
|
408 |
|
409 |
|
/**
 * Process the `script` tags of a page.
 *
 * For each tag with a `src`: resolves it against the page URI, updates its
 * depth, schedules a "crawlLink" event, then either queues the script URL
 * or schedules an "uncrawl" event, depending on isAGoodLinkToCrawl.
 *
 * @param {Object} result Result of the HTTP request for the page; its
 *        `uri` is the parent of every script found.
 * @param {Function} $ Query function over the parsed page.
 */
Crawler.prototype.crawlScripts = function(result, $) {

    var crawler = this;
    var pageUri = result.uri;

    $('script').each(function(position, scriptTag) {

        var src = $(scriptTag).attr('src');

        // Scripts without a src have nothing to fetch.
        if (! src) {
            return;
        }

        var scriptUri = URI.linkToURI(pageUri, src);
        var depth = crawler.updateDepth(pageUri, scriptUri);

        timers.setImmediate(emitCrawlLinkEvent, crawler, pageUri, scriptUri);

        var accepted = crawler.isAGoodLinkToCrawl(result, depth, pageUri, scriptUri);

        if (! accepted) {
            timers.setImmediate(emitUnCrawlEvent, crawler, pageUri, scriptUri);
            return;
        }

        crawler.httpRequester.queue(crawler.buildNewOptions(result, scriptUri));
    });
};
|
437 |
|
438 |
|
439 |
|
440 |
|
441 |
|
442 |
|
443 |
|
/**
 * Process the `img` tags of a page.
 *
 * For each image with a `src`: resolves it against the page URI, updates
 * its depth, schedules a "crawlImage" event (carrying the `alt` text),
 * then either queues the image URL or schedules an "uncrawl" event,
 * depending on isAGoodLinkToCrawl.
 *
 * @param {Object} result Result of the HTTP request for the page; its
 *        `uri` is the parent of every image found.
 * @param {Function} $ Query function over the parsed page.
 */
Crawler.prototype.crawlImages = function(result, $) {

    var crawler = this;
    var pageUri = result.uri;

    $('img').each(function(position, imgTag) {

        var src = $(imgTag).attr('src');
        var altText = $(imgTag).attr('alt');

        // Images without a src have nothing to fetch.
        if (! src) {
            return;
        }

        var imageUri = URI.linkToURI(pageUri, src);
        var depth = crawler.updateDepth(pageUri, imageUri);

        timers.setImmediate(emitCrawlImage, crawler, pageUri, imageUri, altText);

        var accepted = crawler.isAGoodLinkToCrawl(result, depth, pageUri, imageUri);

        if (! accepted) {
            timers.setImmediate(emitUnCrawlEvent, crawler, pageUri, imageUri);
            return;
        }

        crawler.httpRequester.queue(crawler.buildNewOptions(result, imageUri));
    });
};
|
473 |
|
474 |
|
475 |
|
476 |
|
477 |
|
478 |
|
479 |
|
480 |
|
481 |
|
/**
 * Decide whether a link found on a page should be queued.
 *
 * The link is rejected as soon as one of these checks fails, in order:
 *  1. depth: `result.depthLimit` is greater than -1 and `currentDepth` exceeds it;
 *  2. external link (per URI.isExternalLink) while `result.externalLinks` is falsy;
 *  3. the host of `parentUri` is not one of the hosts passed to queue()
 *     while `result.externalDomains` is falsy;
 *  4. the link protocol is not listed in `result.protocols`;
 *  5. the link domain name is listed in `result.domainBlackList`.
 * When all checks pass, the decision is delegated to `result.canCrawl`
 * if that callback exists; otherwise the link is accepted.
 *
 * @param {Object} result Options/result of the page the link comes from.
 * @param {number} currentDepth Depth of the link.
 * @param {string} parentUri URI of the page containing the link.
 * @param {string} link Absolute URI of the link.
 * @param {string} [anchor] Anchor text, passed through to `canCrawl`.
 * @param {boolean} [isDoFollow] Follow flag, passed through to `canCrawl`.
 * @returns {*} false when a check fails, true when accepted without a
 *          `canCrawl` callback, otherwise whatever `canCrawl` returns.
 */
Crawler.prototype.isAGoodLinkToCrawl = function(result, currentDepth, parentUri, link, anchor, isDoFollow) {

    // 1. Depth limit (-1 means no limit).
    if (result.depthLimit > -1 && currentDepth > result.depthLimit) {
        return false;
    }

    // 2. External links.
    if (URI.isExternalLink(parentUri, link) && ! result.externalLinks) {
        return false;
    }

    // 3. Pages outside the hosts the crawl started from.
    if (! this.startFromHosts.has(URI.host(parentUri)) && ! result.externalDomains) {
        return false;
    }

    // 4. Protocols.
    if (result.protocols.indexOf(URI.protocol(link)) < 0) {
        return false;
    }

    // 5. Blacklisted domains. indexOf returns 0 for the first entry of the
    //    list, so "listed" means any index other than -1.
    if (result.domainBlackList.indexOf(URI.domainName(link)) >= 0) {
        return false;
    }

    // 6. Optional caller-supplied filter.
    if (! result.canCrawl) {
        return true;
    }

    return result.canCrawl(parentUri, link, anchor, isDoFollow);
};
|
519 |
|
520 |
|
521 |
|
522 |
|
523 |
|
524 |
|
525 |
|
526 |
|
527 |
|
528 |
|
/**
 * Default depth computation, called with the crawler as `this`.
 *
 * Reads and updates `this.depthUrls` (URI -> depth):
 *  - unknown parent: the parent is recorded at depth 0 and the link at depth 1;
 *  - known parent, known link: the depth already recorded for the link is kept;
 *  - known parent, new link: the link is recorded at the parent's depth + 1.
 *
 * @param {string} parentUri URI of the page containing the link.
 * @param {string} linkUri URI of the link.
 * @returns {number} Depth of the link.
 */
var updateDepth = function(parentUri, linkUri) {

    var depths = this.depthUrls;

    // First time this parent is seen: it becomes a depth-0 page.
    if (! depths.has(parentUri)) {
        depths.set(parentUri, 0);
        depths.set(linkUri, 1);
        return 1;
    }

    // A link keeps the depth it was first recorded with.
    if (depths.has(linkUri)) {
        return depths.get(linkUri);
    }

    var linkDepth = depths.get(parentUri) + 1;
    depths.set(linkUri, linkDepth);
    return linkDepth;
};
|
551 |
|
/**
 * Emit the "crawl" event for a fetched resource.
 *
 * @param {Object} crawler Emitter the event is sent on.
 * @param {Object} result Result of the HTTP request.
 * @param {?Function} $ Query function over the parsed page, or null when
 *        the body is not HTML.
 */
function emitCrawlEvent(crawler, result, $) {
    var eventName = "crawl";

    crawler.emit(eventName, result, $);
}
|
556 |
|
/**
 * Emit the "error" event for a failed request.
 *
 * @param {Object} crawler Emitter the event is sent on.
 * @param {Object} error Error reported for the request.
 * @param {Object} result Details of the request that failed.
 */
function emitErrorEvent(crawler, error, result) {
    var eventName = "error";

    crawler.emit(eventName, error, result);
}
|
560 |
|
/**
 * Emit the "crawlRedirect" event for a redirect response.
 *
 * @param {Object} crawler Emitter the event is sent on.
 * @param {string} from URI that answered with the redirect.
 * @param {string} to URI of the redirect target.
 * @param {number} statusCode HTTP status code of the response.
 */
function emitRedirectEvent(crawler, from, to, statusCode) {
    var eventName = "crawlRedirect";

    crawler.emit(eventName, from, to, statusCode);
}
|
564 |
|
565 |
|
/**
 * Emit an anchor-related event under the given name.
 *
 * @param {Object} crawler Emitter the event is sent on.
 * @param {string} eventName Name of the event to emit.
 * @param {string} parentUri URI of the page containing the anchor.
 * @param {string} linkUri Absolute URI the anchor points to.
 * @param {string} anchor Text of the anchor.
 * @param {boolean} isDoFollow Follow flag computed for the anchor.
 */
function emitCrawlHrefEvent(crawler, eventName, parentUri, linkUri, anchor, isDoFollow) {
    var payload = [eventName, parentUri, linkUri, anchor, isDoFollow];

    crawler.emit.apply(crawler, payload);
}
|
569 |
|
/**
 * Emit the "crawlLink" event for a resource found on a page.
 *
 * @param {Object} crawler Emitter the event is sent on.
 * @param {string} parentUri URI of the page containing the resource.
 * @param {string} linkUri Absolute URI of the resource.
 */
function emitCrawlLinkEvent(crawler, parentUri, linkUri) {
    var eventName = "crawlLink";

    crawler.emit(eventName, parentUri, linkUri);
}
|
573 |
|
/**
 * Emit the "uncrawl" event for a resource that will not be crawled.
 *
 * @param {Object} crawler Emitter the event is sent on.
 * @param {string} parentUri URI of the page containing the resource.
 * @param {string} linkUri Absolute URI of the rejected resource.
 */
function emitUnCrawlEvent(crawler, parentUri, linkUri) {
    var eventName = "uncrawl";

    crawler.emit(eventName, parentUri, linkUri);
}
|
577 |
|
/**
 * Emit the "crawlImage" event for an image found on a page.
 *
 * @param {Object} crawler Emitter the event is sent on.
 * @param {string} parentUri URI of the page containing the image.
 * @param {string} linkUri Absolute URI of the image.
 * @param {string} alt Value of the image's `alt` attribute.
 */
function emitCrawlImage(crawler, parentUri, linkUri, alt) {
    var eventName = "crawlImage";

    crawler.emit(eventName, parentUri, linkUri, alt);
}
|
582 |
|
583 | module.exports.Crawler = Crawler;
|