UNPKG

1.9 kBJavaScriptView Raw
1/**
2 * Basic crawler plugin that can be used to harvest some statistics
3 *
4 */
5var URI = require('crawler-ninja-uri');
6
7
// Lower-case HTTP header names used as lookup keys on a crawl result's headers.
var CONTENT_TYPE_HEADER = 'content-type';
// NOTE(review): not referenced anywhere in the visible file; kept for callers/future use.
var CONTENT_LENGTH_HEADER = 'content-length';
10
11
/**
 * Statistics plugin: holds the counters that the handlers on
 * Plugin.prototype update while the crawler runs.
 *
 * @param {Object} crawler - stored on `this.crawler`; the constructor does
 *                           nothing else with it.
 */
function Plugin(crawler) {
  this.crawler = crawler;

  this.data = {
    numberOfUrls: 0,         // could be any kind of content type
    numberOfHTMLs: 0,        // number of html pages
    numberOfUncrawlUrls: 0,  // number of uncrawl urls

    // The two tables below are keyed by arbitrary strings, so they are plain
    // objects rather than arrays: string keys on an array do not change its
    // length and are dropped by JSON.stringify, which made the collected
    // stats vanish as soon as `data` was serialised.
    hostnames: {},           // number of crawled resources per hostname
    contentTypes: {},        // number of crawled resources per content types

    errors: []               // list of the errors, in the order they were reported
  };
}
25
/**
 * Handler for the "unCrawl" hook: counts one more uncrawled URL and then
 * hands control back through `callback`.
 *
 * The link details (parentUri, linkUri, anchor, isDoFollow) are accepted but
 * not used by this plugin.
 *
 * @param {*} parentUri - unused
 * @param {*} linkUri - unused
 * @param {*} anchor - unused
 * @param {*} isDoFollow - unused
 * @param {Function} callback - invoked with no arguments once the counter is updated
 */
Plugin.prototype.unCrawl = function(parentUri, linkUri, anchor, isDoFollow, callback) {
  var stats = this.data;
  stats.numberOfUncrawlUrls++;
  callback();
};
30
/**
 * Handler for the "error" hook: appends the error to `this.data.errors` and
 * then hands control back through `callback`.
 * NOTE(review): presumably called by the crawler when a request fails — confirm
 * against the crawler's plugin contract.
 *
 * @param {*} error - value recorded as-is in the errors list
 * @param {*} result - unused by this plugin
 * @param {Function} callback - invoked with no arguments after the error is stored
 */
Plugin.prototype.error = function(error, result, callback) {
  // Must be `this`: the file never declares `self`, so the previous
  // `self.data` threw before the error was stored or the callback was called.
  this.data.errors.push(error);
  callback();
};
36
37
38
/**
 * Callback for the "crawl" event: updates the statistics for one crawled
 * resource and then calls `callback`.
 *
 * Every call increments `numberOfUrls` and the per-hostname counter for
 * `result.uri`. The per-content-type counter is updated only when the
 * response carries a non-empty content-type header, and `numberOfHTMLs`
 * only when `$` is truthy.
 *
 * @param {Object} result - crawl result; `result.headers` and `result.uri` are read
 * @param {*} $ - truthy when the resource is counted as an HTML page
 *                (presumably a parsed-document handle — confirm with the crawler)
 * @param {Function} callback - invoked with no arguments when the stats are updated
 */
Plugin.prototype.crawl = function(result, $, callback) {
  var stats = this.data;

  stats.numberOfUrls++;

  var type = result.headers[CONTENT_TYPE_HEADER];
  if (type) {
    this.addContentType(type);
  }

  var host = URI.host(result.uri);
  this.addHostname(host);

  if ($) {
    stats.numberOfHTMLs++;
  }

  callback();
};
61
62
/**
 * Add stat for the resource content type: increments the counter stored under
 * `contentType` in `this.data.contentTypes`, creating it at 1 on first sight.
 *
 * @param {string} contentType - header value used verbatim as the counter key
 */
Plugin.prototype.addContentType = function (contentType) {
  var counts = this.data.contentTypes;

  // Own-property check instead of a truthiness test: a key that matches an
  // inherited member (e.g. "constructor", "toString") used to look "already
  // counted" and had ++ applied to a function, producing NaN.
  if (Object.prototype.hasOwnProperty.call(counts, contentType)) {
    counts[contentType]++;
  } else {
    counts[contentType] = 1;
  }
};
75
76
/**
 * Add stat for the hostname: increments the counter stored under `hostname`
 * in `this.data.hostnames`, creating it at 1 on first sight.
 *
 * @param {string} hostname - used verbatim as the counter key
 */
Plugin.prototype.addHostname = function (hostname) {
  var counts = this.data.hostnames;

  // Own-property check instead of a truthiness test: a hostname that matches
  // an inherited member (e.g. "constructor") used to look "already counted"
  // and had ++ applied to a function, producing NaN.
  if (Object.prototype.hasOwnProperty.call(counts, hostname)) {
    counts[hostname]++;
  } else {
    counts[hostname] = 1;
  }
};
89
90
91module.exports.Plugin = Plugin;