UNPKG

8.66 kBJavaScriptView Raw
1var crypto = require('crypto');
2var Map = require("collections/fast-map");
3var Set = require("collections/fast-set");
4var URI = require('crawler-ninja-uri');
5
// Lower-case HTTP header names.
// NOTE(review): neither header constant is referenced in the visible code, which
// reads result.headers["content-type"] with a string literal - confirm before removing.
var CONTENT_TYPE_HEADER = "content-type";
var CONTENT_LENGTH_HEADER = "content-length";

// Values compared against error.code in Plugin.prototype.error.
var ERROR_CODE_TIMEOUT = "ETIMEDOUT";
var ERROR_DNS_LOOKUP = "ENOTFOUND";

// Stored as resourceInfo.statusCode when error.code equals ERROR_DNS_LOOKUP.
var STATUS_DNS_LOOKUP_ERROR = "DNS lookup failed";
/**
 * Basic crawler plugin that gathers the data needed for an SEO audit of one site.
 * This is just an example that requires some customizations.
 *
 * The constructor takes no arguments: it only creates the empty stores that
 * the event callbacks defined on the prototype fill in.
 */
function Plugin() {
  var self = this;

  self.name = "Audit-Plugin";

  // Per-url resource details and content-hash buckets
  self.resources = new Map();
  self.duplicateContents = new Map();

  // Link graph
  self.inLinks = new Map();
  self.outLinks = new Map();
  self.externalLinks = new Map();
  self.unparsedLinks = new Set();

  // Images, errors and redirects
  self.images = new Map();
  self.errors = new Set();
  self.redirects = new Map();
}
34
/**
 * Callback for the crawler error event.
 *
 * The failure is always recorded in this.errors. For the two recognized error
 * codes a status is also written on the resource entry: 408 for
 * ERROR_CODE_TIMEOUT and STATUS_DNS_LOOKUP_ERROR for ERROR_DNS_LOOKUP.
 * Any other code leaves the resources store untouched.
 *
 * @param error : the reported error; its code property is inspected
 * @param result : the crawl result; only result.uri is read
 * @param callback : called without arguments once the error is recorded
 */
Plugin.prototype.error = function (error, result, callback) {
  var uri = result.uri;
  this.errors.add({uri : uri, error : error});

  var failureCode = error.code;

  if (failureCode == ERROR_CODE_TIMEOUT) {
    this.getresourceInfo(uri).statusCode = 408;
  }

  if (failureCode == ERROR_DNS_LOOKUP) {
    this.getresourceInfo(uri).statusCode = STATUS_DNS_LOOKUP_ERROR;
  }

  callback();
};
48
49
/**
 * Callback for the crawl event: dispatches the result to the analyzer that
 * matches the class of its HTTP status code, then calls back.
 *
 *   2xx       -> analyzeResource
 *   3xx       -> analyzeRedirect
 *   4xx & 5xx -> analyzeHttpError
 *
 * Any other status is ignored.
 *
 * @param result : the result of the resource crawl
 * @param $ : the jquery like object for accessing the HTML tags. Null if the
 *            resource is not HTML
 * @param callback : called without arguments when the analysis is done
 */
Plugin.prototype.crawl = function(result, $, callback) {
  var status = result.statusCode;

  if (status >= 200 && status <= 299) {
    this.analyzeResource(result, $);
  } else if (status >= 300 && status <= 399) {
    this.analyzeRedirect(result);
  } else if (status >= 400 && status <= 599) {
    this.analyzeHttpError(result);
  }

  callback();
};
81
/**
 * Analyze a successfully fetched resource & store the infos into the audit maps
 *
 * Status, response time, size, content type and headers are always recorded.
 * When a jquery like object is provided, the HTML specific data is added:
 * title, meta description/keywords/refresh, canonical link, word count,
 * h1/h2 headers and a sha1 hash of the body used to detect duplicate content.
 *
 * A response without a body (undefined or null) is treated as an empty body
 * instead of throwing.
 *
 * @param result : the result of the resource crawl
 * @param $ : the jquery like object for accessing the HTML tags. Null if the
 *            resource is not HTML
 */
Plugin.prototype.analyzeResource = function(result, $) {

  var resourceInfo = this.getresourceInfo(result.uri);

  // Reading .length (or hashing) a missing body would throw
  var body = (result.body === undefined || result.body === null) ? "" : result.body;

  resourceInfo.statusCode = result.statusCode;
  resourceInfo.responseTime = result.responseTime;

  // Length of the body as delivered by the crawler
  resourceInfo.size = body.length;
  resourceInfo.contentType = result.headers["content-type"];

  // last modified & other header attributes
  resourceInfo.headers = result.headers;

  // if HTML
  if ($) {
    var titleElement = $("title");
    resourceInfo.title = ! titleElement ? "" : titleElement.text();
    resourceInfo.titleLen = ! titleElement ? 0 : resourceInfo.title.length;

    // attr() yields undefined when the tag or the attribute is missing
    var description = $('meta[name=description]').attr("content");
    resourceInfo.description = ! description ? "" : description;
    resourceInfo.descriptionLen = resourceInfo.description.length;

    var keywords = $('meta[name=keywords]').attr("content");
    resourceInfo.keywords = ! keywords ? "" : keywords;
    resourceInfo.keywordsLen = resourceInfo.keywords.length;

    var refresh = $('meta[http-equiv=Refresh]').attr("content");
    resourceInfo.refresh = ! refresh ? "" : refresh;

    var canonicalLink = $('link[rel=canonical]').attr("href");
    resourceInfo.canonicalLink = ! canonicalLink ? "" : canonicalLink;

    resourceInfo.wordCount = this.getNumberOfWords($);

    resourceInfo.h1 = this.getHeaders($, "h1");
    resourceInfo.h2 = this.getHeaders($, "h2");

    // Use hash for detecting duplicate content
    var shasum = crypto.createHash('sha1');
    shasum.update(body);
    resourceInfo.hash = shasum.digest('hex');
    this.addHash(result.uri, resourceInfo.hash);
  }

  this.resources.set(result.uri, resourceInfo);
};
136
/**
 * Analyze redirect & store info into the audit maps
 *
 * Records status, response time, size, content type and headers for the
 * redirecting url, then registers the "location" header as an outlink of that
 * url with the anchor 'Redirect'.
 *
 * A response without a body (undefined or null) is reported with a size of 0
 * instead of throwing.
 *
 * @param result : the result of the resource crawl
 *
 */
Plugin.prototype.analyzeRedirect = function(result) {

  var resourceInfo = this.getresourceInfo(result.uri);

  // Reading .length on a missing body would throw
  var body = (result.body === undefined || result.body === null) ? "" : result.body;

  resourceInfo.statusCode = result.statusCode;
  resourceInfo.responseTime = result.responseTime;

  resourceInfo.size = body.length;
  resourceInfo.contentType = result.headers["content-type"];

  // last modified & other header attributes
  resourceInfo.headers = result.headers;

  this.resources.set(result.uri, resourceInfo);

  addToListMap(this.outLinks, result.uri, {page: result.headers["location"], anchor : 'Redirect', isDoFollow : true});
};
160
161
162
/**
 * Analyze an HTTP error response & store info into the audit maps
 *
 * Copies the status code, the response time and the headers of the result
 * onto the resource entry of its url.
 *
 * @param result : the result of the resource crawl
 *
 */
Plugin.prototype.analyzeHttpError = function(result) {
  var info = this.getresourceInfo(result.uri);
  var copiedFields = ["statusCode", "responseTime", "headers"];

  for (var i = 0; i < copiedFields.length; i++) {
    info[copiedFields[i]] = result[copiedFields[i]];
  }

  this.resources.set(result.uri, info);
};
175
/**
 * Callback for the event crawlLink. Triggered when the crawler finds a link
 * on a page.
 *
 * The link is stored as an outlink of the page and as an inlink of the target.
 * When page and link do not share the same host, the page is also listed
 * under the link in externalLinks.
 *
 * @param page : the page url that contains the link
 * @param link : the link found in the page
 * @param anchor : the link anchor text
 * @param isDoFollow : true if the link is dofollow
 * @param callback : called without arguments once the link is stored
 */
Plugin.prototype.crawlLink = function(page, link, anchor, isDoFollow, callback) {
  var outgoing = {page: link, anchor : anchor, isDoFollow : isDoFollow};
  var incoming = {page: page, anchor : anchor, isDoFollow : isDoFollow};

  addToListMap(this.outLinks, page, outgoing);
  addToListMap(this.inLinks, link, incoming);

  var sameHost = URI.host(page) == URI.host(link);
  if (!sameHost) {
    addToListMap(this.externalLinks, link, page);
  }

  callback();
};
203
/**
 * Callback for the event crawlImage. Triggered when the crawler finds an image
 * on a page.
 *
 * The image is always stored as an outlink of the page (with isDoFollow set to
 * null). It is added to the images map only when it is served from the same
 * host as the page.
 *
 * @param page : the page url that contains the image
 * @param link : the image url found in the page
 * @param alt : the alt text of the image
 * @param callback : called without arguments once the image is stored
 *
 */
Plugin.prototype.crawlImage = function(page, link, alt, callback) {
  var outgoing = {page: link, anchor : alt, isDoFollow : null};
  addToListMap(this.outLinks, page, outgoing);

  // External images are not tracked
  var isInternal = URI.host(page) == URI.host(link);
  if (isInternal) {
    addToListMap(this.images, link, {page: page, alt : alt});
  }

  callback();
};
227
/**
 * Add the redirect into a map in order to build the complete redirect chain.
 * A later redirect from the same url replaces the stored one.
 *
 * @param from : the from url
 * @param to : the to url
 * @param statusCode : the status code of the redirect
 * @param callback : called without arguments once the redirect is stored
 *
 */
Plugin.prototype.crawlRedirect = function(from, to, statusCode, callback) {
  var target = {};
  target.to = to;
  target.statusCode = statusCode;

  this.redirects.set(from, target);
  callback();
};
239
240
/**
 * Get the info for one url from the store (actually a map).
 * When the url is not in the store yet, an entry holding only the url is
 * created, stored and returned.
 *
 * @param url : the page url
 * @returns The page info
 */
Plugin.prototype.getresourceInfo = function(url) {
  var store = this.resources;

  if (!store.has(url)) {
    var created = {url : url};
    store.set(url, created);
    return created;
  }

  return store.get(url);
};
263
/**
 * Register a page under the hex digest of its content in the
 * duplicateContents map. Pages that share a digest end up in the same list.
 *
 * @param pageUri : the page uri
 * @param hex : the hex representation of the page content hash
 *
 */
Plugin.prototype.addHash = function(pageUri, hex) {
  var pagesByHash = this.duplicateContents;
  addToListMap(pagesByHash, hex, pageUri);
};
276
/**
 * Get the number of words in a page
 *
 * Words are the runs of non-whitespace characters in the text of the body
 * tag. Any whitespace (spaces, tabs, line breaks) separates words, repeated
 * whitespace counts once, and an empty or whitespace-only body has 0 words.
 *
 * @param $ : the jquery like representation of the page
 * @returns the number of words
 */
Plugin.prototype.getNumberOfWords = function($) {
  var text = $('body').text().trim();

  // "".split(...) yields [""], which would be counted as one word
  if (text === "") {
    return 0;
  }

  return text.split(/\s+/).length;
};
291
/**
 * Find Hn tags in a page
 *
 * @param $ : the jquery like representation of the page
 * @param headerTag : the tag to look for, e.g. "h1"
 * @returns an array of Hn tags with their text & len
 */
Plugin.prototype.getHeaders = function($, headerTag) {
  // Declared locally: assigning an undeclared name would create a global
  // (or throw a ReferenceError in strict mode)
  var headers = [];

  $(headerTag).each(function(index, element) {
    var headerText = $(element).text();
    headers.push({"text" : headerText, "len" : headerText.length});
  });

  return headers;
};
308
309
/**
 * Generic helper to append an element to a list that is indexed in a map,
 * i.e. the value of each key is a list.
 *
 * A new list is started when the key is unknown or when its stored value is
 * falsy. The list is written back into the map after the append.
 *
 * @param map : the map
 * @param key : the key
 * @param value : the new value to add to the list stored for that key
 *
 */
var addToListMap = function(map, key, value) {
  var bucket = map.has(key) ? map.get(key) : null;

  if (!bucket) {
    bucket = [];
  }

  bucket.push(value);
  map.set(key, bucket);
};
335
336
337module.exports.Plugin = Plugin;