/*!
 * Robots
 * Copyright(c) 2011 Eugene Kalinin
 * MIT Licensed
 */

var Rule = require('./rule').Rule
  , Entry = require('./entry').Entry
  , urlparser = require('url')
  , http = require('http')
  , https = require('https')
  , ut = require('./utils');

exports.RobotsParser = RobotsParser;

/**
 * Provides a set of methods to read, parse and answer
 * questions about a single robots.txt file.
 *
 * @constructor
 * @param {String} url URL of the robots.txt file
 * @param {String} userAgent User-Agent for fetching robots.txt
 * @param {Function} after_parse Optional callback, invoked as
 *    after_parse(parser, success) once robots.txt has been fetched and parsed
 */
function RobotsParser (url, userAgent, after_parse) {
  this.entries = [];
  this.defaultEntry = '';
  this.disallowAll = false;
  this.statusCode = -1;
  this.allowAll = false;
  this.userAgent = userAgent || 'Mozilla/5.0 (X11; Linux i686; rv:5.0) '+
                                'Gecko/20100101 Firefox/5.0';
  this.setUrl(url, after_parse);
}
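
// Usage sketch for the constructor above. Not executed here; the URL,
// user agent and callback are illustrative assumptions, not part of the module:
//
//   var parser = new RobotsParser(
//     'http://example.com/robots.txt',
//     'MyCrawler/1.0 (+http://example.com/bot.html)',
//     function (parser, success) {
//       // success is false when robots.txt could not be fetched normally
//       console.log('robots.txt parsed:', success);
//     }
//   );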


/**
 * Sets the URL referring to a robots.txt file
 *
 * @param {String} url
 * @param {Boolean|Function} read Optional, default=true. Read robots.txt
 *    immediately; if read is a callback function, it is passed through
 *    to this.read
 */
RobotsParser.prototype.setUrl = function(url, read) {
  this.url = url;
  if (url) {
    if (read === undefined || read === null || read === true) {
      this.read();
    } else if (typeof(read) === "function") {
      this.read(read);
    }
  }
};


/**
 * Reads the robots.txt URL and feeds it to the parser
 *
 * @param {Function} after_parse called after the remote robots.txt is downloaded and parsed
 */
RobotsParser.prototype.read = function(after_parse) {
  var self = this;
  var url = urlparser.parse(this.url);
  var port;
  var protocol;
  if (url.protocol === 'https:') {
    port = 443;
    protocol = https;
  } else {
    port = 80;
    protocol = http;
  }
  var request = protocol.request({
    'host': url.host,
    'headers': { "User-Agent": self.userAgent },
    'port': url.port || port,
    'method': 'GET',
    'path': url.pathname
  });

  ut.d('RobotsParser.read: start ...');
  if (typeof(after_parse) !== "function") {
    after_parse = function(obj, success) { };
  }
  request.on('response', function(resp) {
    ut.d('RobotsParser.read: get response, code: ' + resp.statusCode);

    self.statusCode = resp.statusCode;

    if ( [401, 403].indexOf(resp.statusCode) > -1 ) {
      // access restricted: treat everything as disallowed
      ut.d('RobotsParser.read: set disallowAll');
      self.disallowAll = true;
      after_parse(self, false);
    }
    else if (resp.statusCode >= 400) {
      // robots.txt missing or broken: treat everything as allowed
      ut.d('RobotsParser.read: set allowAll');
      self.allowAll = true;
      after_parse(self, false);
    }
    else if ([301, 302].indexOf(resp.statusCode) > -1) {
      // redirect: follow the Location header
      self.setUrl(resp.headers.location, after_parse);
    }
    else {
      // accumulate the whole body; it may arrive in several chunks
      var body = '';
      resp.setEncoding('utf8');
      resp.on('data', function (chunk) {
        body += chunk;
      });
      resp.on('end', function () {
        ut.d('RobotsParser.read: reads robots.txt');
        self.parse(body.split(/\r\n|\r|\n/));
        after_parse(self, true);
      });
    }

  });
  request.end();
};
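
// Deferred-read sketch: construct without a URL, then call setUrl() with a
// callback, which is passed straight through to read() above. The URL and
// user agent are illustrative assumptions:
//
//   var parser = new RobotsParser(null, 'MyCrawler/1.0', false);
//   parser.setUrl('http://example.com/robots.txt', function (p, success) {
//     if (success) {
//       console.log(p.toString());
//     }
//   });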

/**
 * Adds entry
 *
 * @private
 * @param {Entry} entry
 */
RobotsParser.prototype._addEntry = function(entry) {
  ut.d('Parser._addEntry, entry: ' + entry);
  if ( entry.userAgents.indexOf('*') > -1 ) {
    // the default entry is considered last
    // the first default entry wins
    if ( !this.defaultEntry ) {
      this.defaultEntry = entry;
    }
  }
  else {
    this.entries.push(entry);
  }
};

/**
 * Parse the input lines from a robots.txt file.
 *
 * A user-agent: line is accepted even when it is not preceded by
 * one or more blank lines.
 *
 * @param {Array} lines Array of rows from robots.txt
 */
RobotsParser.prototype.parse = function(lines) {
  // states:
  //   0: start state
  //   1: saw user-agent line
  //   2: saw an allow or disallow line
  var STATE_START = 0
    , STATE_SAW_AGENT = 1
    , STATE_SAW_ALLOW_OR_DISALLOW = 2
    , state = STATE_START
    , entry = new Entry()
    , line
    , comment
    , sep
    , field
    , value;

  for (var i = 0; i < lines.length; i++) {
    line = lines[i];

    if (!line) {
      // a blank line terminates the current record
      if (state === STATE_SAW_AGENT) {
        entry = new Entry();
        state = STATE_START;
      }
      else if (state === STATE_SAW_ALLOW_OR_DISALLOW) {
        this._addEntry(entry);
        entry = new Entry();
        state = STATE_START;
      }
    }

    // remove optional comment
    comment = line.indexOf('#');
    if (comment > -1) {
      line = line.substring(0, comment);
    }

    // strip line
    line = line.trim();
    // find 'field:value'; split on the first ':' only so that
    // values containing ':' are preserved
    sep = line.indexOf(':');
    if (sep === -1) {
      continue;
    }

    field = line.substring(0, sep).trim().toLowerCase();
    value = line.substring(sep + 1).trim();

    switch (field) {
      case 'user-agent':
        if (state === STATE_SAW_ALLOW_OR_DISALLOW) {
          this._addEntry(entry);
          entry = new Entry();
        }
        entry.userAgents.push(value);
        state = STATE_SAW_AGENT;
        break;

      case 'disallow':
      case 'allow':
        if (state !== STATE_START) {
          entry.rules.push(new Rule(value, field === 'allow'));
          state = STATE_SAW_ALLOW_OR_DISALLOW;
        }
        break;

      case 'crawl-delay':
        if (state !== STATE_START) {
          entry.crawl_delay = value;
          state = STATE_SAW_ALLOW_OR_DISALLOW;
        }
        break;
    }
  }
  if (state === STATE_SAW_ALLOW_OR_DISALLOW) {
    this._addEntry(entry);
  }
};
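
// Offline parsing sketch: feed already-fetched robots.txt lines directly to
// parse(). The rules shown are illustrative assumptions:
//
//   var parser = new RobotsParser(null, 'MyCrawler/1.0');
//   parser.parse([
//     'User-agent: *',
//     'Disallow: /private/',
//     'Crawl-delay: 10'
//   ]);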


/**
 * Using the parsed robots.txt decide if userAgent can fetch url.
 *
 * @param {String} userAgent
 * @param {String} url
 * @return {Boolean}
 *
 */
RobotsParser.prototype.canFetchSync = function(userAgent, url) {
  var entry;
  url = url || '/';

  ut.d('Parser.canFetchSync: url: ' + url);

  if (this.disallowAll) {
    return false;
  }
  if (this.allowAll) {
    return true;
  }

  // search for given user agent matches
  // the first match counts
  for (var i = 0; i < this.entries.length; i++) {
    entry = this.entries[i];
    if (entry.appliesTo(userAgent)) {
      return entry.allowance(url);
    }
  }

  // try the default entry last
  if (this.defaultEntry) {
    return this.defaultEntry.allowance(url);
  }

  // agent not found ==> access granted
  return true;
};
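
// Synchronous check sketch; assumes robots.txt has already been read and
// parsed. The agent name and path are illustrative assumptions:
//
//   if (parser.canFetchSync('MyCrawler', '/private/page.html')) {
//     // safe to request the page
//   }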

/**
 * Using the parsed robots.txt decide if userAgent can fetch url.
 *
 * @param {String} userAgent
 * @param {String} url
 * @param {Function} callback function (access, url, reason) { ... } where
 *    reason describes which rule or status code produced the decision
 *
 */
RobotsParser.prototype.canFetch = function(userAgent, url, callback) {
  var self = this
    , entry;
  url = url || '/';

  process.nextTick( function () {
    if (self.disallowAll) {
      callback(false, url, {
        type: "statusCode",
        statusCode: self.statusCode});
      return;
    }

    if (self.allowAll) {
      callback(true, url, {
        type: "statusCode",
        statusCode: self.statusCode});
      return;
    }

    // search for given user agent matches
    // the first match counts
    for (var i = 0; i < self.entries.length; i++) {
      entry = self.entries[i];
      if (entry.appliesTo(userAgent)) {
        callback(entry.allowance(url), url, {
          type: "entry",
          entry: entry});
        return;
      }
    }

    // try the default entry last
    if (self.defaultEntry) {
      callback(self.defaultEntry.allowance(url), url, {
        type: "defaultEntry",
        entry: self.defaultEntry});
      return;
    }

    // agent not found ==> access granted
    callback(true, url, {type: "noRule"});
  });
};
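
// Asynchronous check sketch; the agent name and path are illustrative
// assumptions. The third callback argument describes why access was
// granted or denied (status code, matching entry, or no rule):
//
//   parser.canFetch('MyCrawler', '/private/page.html', function (access, url, reason) {
//     console.log(access ? 'allowed' : 'blocked', url, 'reason:', reason.type);
//   });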


/**
 * Using the parsed robots.txt decide if userAgent has a specified
 * Crawl-delay.
 *
 * @param {String} userAgent
 * @return {String} the Crawl-delay value, or undefined if none applies
 *
 */
RobotsParser.prototype.getCrawlDelay = function (userAgent) {
  var entry;
  for (var i = 0; i < this.entries.length; i++) {
    entry = this.entries[i];
    if (entry.appliesTo(userAgent) && (entry.crawl_delay != null)) {
      return entry.crawl_delay;
    }
  }
  return this.defaultEntry.crawl_delay;
};
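
// Crawl-delay lookup sketch; the agent name is an illustrative assumption:
//
//   var delay = parser.getCrawlDelay('MyCrawler');
//   if (delay) {
//     // wait `delay` seconds between requests to this host
//   }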

/**
 * Returns a string representation of this RobotsParser
 *
 */
RobotsParser.prototype.toString = function() {
  var res = [];
  res.push("<Parser: Crawler User Agent: " + this.userAgent);
  res.push(this.defaultEntry.toString());
  for (var i in this.entries) {
    res.push(this.entries[i].toString());
  }
  res.push(">");
  return res.join('\n');
};

/**
 * Returns a (shorter) string representation of this RobotsParser
 *
 */
RobotsParser.prototype.toStringLite = function() {
  var agent_names = [];
  function list_agent_names(entry) {
    var names = [];
    for (var j in entry.userAgents) {
      names.push(entry.userAgents[j]);
    }
    return names;
  }
  agent_names = list_agent_names(this.defaultEntry);
  for (var i in this.entries) {
    agent_names = agent_names.concat(list_agent_names(this.entries[i]));
  }
  var output = "<Parser: ";
  output += "Crawler User Agent is `" + this.userAgent + "`, ";
  output += "Listed Robot Agents: `" + agent_names.join('`, `');
  output += "`>";
  return output;
};