1 |
|
2 |
|
3 |
|
4 |
|
5 | var URI = require('crawler-ninja-uri');
|
6 |
|
7 |
|
/**
 * Lower-case name of the response header holding the media type.
 * Used by crawl() as the key into `result.headers`.
 */
var CONTENT_TYPE_HEADER = 'content-type';

/**
 * Lower-case name of the response header holding the body size.
 * Not referenced by the code visible in this file.
 */
var CONTENT_LENGTH_HEADER = 'content-length';
|
10 |
|
11 |
|
/**
 * Crawler plugin that accumulates simple statistics in `this.data`.
 *
 * Fields of `this.data`:
 *  - numberOfUrls        : incremented by crawl()
 *  - numberOfHTMLs       : incremented by crawl() when it receives a truthy `$`
 *  - numberOfUncrawlUrls : incremented by unCrawl()
 *  - hostnames           : map hostname -> count, filled by addHostname()
 *  - contentTypes        : map content type -> count, filled by addContentType()
 *  - errors              : list of the errors passed to error()
 *
 * @param {Object} crawler - Stored on the instance as `this.crawler`; the
 *     constructor does not use it in any other way.
 */
function Plugin(crawler) {
  this.crawler = crawler;

  this.data = {
    numberOfUrls: 0,
    numberOfHTMLs: 0,
    numberOfUncrawlUrls: 0,
    // These two are keyed by string, so they must be plain objects: string
    // keys set on an array are skipped by JSON.stringify and do not affect
    // `length`, which made the counters invisible when the stats were dumped.
    hostnames: {},
    contentTypes: {},
    errors: []
  };
}
|
25 |
|
/**
 * Counts one un-crawled link and then signals completion.
 *
 * Presumably invoked by the crawler for a link it decided not to fetch
 * (inferred from the name — confirm against the crawler's plugin contract).
 * None of the link details are inspected here.
 *
 * @param {*} parentUri - Unused.
 * @param {*} linkUri - Unused.
 * @param {*} anchor - Unused.
 * @param {*} isDoFollow - Unused.
 * @param {Function} callback - Called once, with no arguments, after counting.
 */
Plugin.prototype.unCrawl = function (parentUri, linkUri, anchor, isDoFollow, callback) {
  var stats = this.data;
  ++stats.numberOfUncrawlUrls;
  callback();
};
|
30 |
|
/**
 * Records an error in `this.data.errors` and then signals completion.
 *
 * @param {*} error - The error to record; appended unchanged.
 * @param {*} result - Unused.
 * @param {Function} callback - Called once, with no arguments, after the
 *     error has been stored.
 */
Plugin.prototype.error = function (error, result, callback) {
  // Must be `this`: no `self` variable is defined in this file, so the
  // previous `self.data` threw before the error was stored or the callback ran.
  this.data.errors.push(error);
  callback();
};
|
36 |
|
37 |
|
38 |
|
39 |
|
40 |
|
41 |
|
42 |
|
/**
 * Updates the statistics for one crawled resource and then signals completion.
 *
 * Steps, in order:
 *  1. increments `data.numberOfUrls`;
 *  2. if the result carries a non-empty content-type header, counts it via
 *     addContentType();
 *  3. counts the host of `result.uri` via addHostname();
 *  4. increments `data.numberOfHTMLs` when `$` is truthy.
 *
 * @param {Object} result - Crawl result; `result.uri` and, when present,
 *     `result.headers` (keyed by lower-case header name) are read.
 * @param {*} $ - Any truthy value marks the resource as HTML. Presumably a
 *     parsed-document handle supplied by the crawler — confirm with the caller.
 * @param {Function} callback - Called once, with no arguments, at the end.
 */
Plugin.prototype.crawl = function (result, $, callback) {
  this.data.numberOfUrls++;

  // A result without a headers map must not throw here, otherwise the
  // callback below would never be reached.
  var headers = result.headers;
  var contentType = headers ? headers[CONTENT_TYPE_HEADER] : undefined;
  if (contentType) {
    this.addContentType(contentType);
  }

  this.addHostname(URI.host(result.uri));

  if ($) {
    this.data.numberOfHTMLs++;
  }

  callback();
};
|
61 |
|
62 |
|
63 |
|
64 |
|
65 |
|
66 |
|
/**
 * Adds one occurrence of a content type to `this.data.contentTypes`.
 *
 * The first occurrence stores 1; each later occurrence increments the count.
 *
 * @param {string} contentType - Key to count, used verbatim (crawl() passes
 *     the raw content-type header value).
 */
Plugin.prototype.addContentType = function (contentType) {
  var counts = this.data.contentTypes;

  // Test for an own property rather than truthiness: a key that matches an
  // inherited member (e.g. "constructor", "toString") would otherwise look
  // already counted and be incremented into NaN.
  if (Object.prototype.hasOwnProperty.call(counts, contentType)) {
    counts[contentType]++;
  } else {
    counts[contentType] = 1;
  }
};
|
75 |
|
76 |
|
77 |
|
78 |
|
79 |
|
80 |
|
/**
 * Adds one occurrence of a hostname to `this.data.hostnames`.
 *
 * The first occurrence stores 1; each later occurrence increments the count.
 *
 * @param {string} hostname - Key to count, used verbatim (crawl() passes the
 *     value returned by URI.host()).
 */
Plugin.prototype.addHostname = function (hostname) {
  var counts = this.data.hostnames;

  // Test for an own property rather than truthiness: a key that matches an
  // inherited member (e.g. "constructor", "valueOf") would otherwise look
  // already counted and be incremented into NaN.
  if (Object.prototype.hasOwnProperty.call(counts, hostname)) {
    counts[hostname]++;
  } else {
    counts[hostname] = 1;
  }
};
|
89 |
|
90 |
|
91 | module.exports.Plugin = Plugin;
|