1 |
|
2 |
|
3 |
|
4 |
|
5 |
|
6 |
|
7 | var Rule = require('./rule').Rule
|
8 | , Entry = require('./entry').Entry
|
9 | , urlparser = require('url')
|
10 | , http = require('http')
|
11 | , https = require('https')
|
12 | , ut = require('./utils');
|
13 |
|
14 | // Public export: the RobotsParser constructor (the function declaration
14 | // below is hoisted, so this assignment is safe here).
14 | exports.RobotsParser = RobotsParser;
|
15 |
|
16 |
|
17 |
|
18 |
|
19 |
|
20 |
|
21 |
|
22 |
|
23 |
|
/**
 * robots.txt parser.
 *
 * @constructor
 * @param {String} [url] - robots.txt URL; when given it is fetched at once.
 * @param {String} [userAgent] - User-Agent header sent when fetching.
 * @param {Function} [after_parse] - callback invoked once parsing finishes.
 */
function RobotsParser (url, userAgent, after_parse) {
  var DEFAULT_UA = 'Mozilla/5.0 (X11; Linux i686; rv:5.0) ' +
                   'Gecko/20100101 Firefox/5.0';
  this.entries = [];
  this.defaultEntry = '';
  this.disallowAll = false;
  this.allowAll = false;
  this.statusCode = -1;
  this.userAgent = userAgent || DEFAULT_UA;
  this.setUrl(url, after_parse);
}
|
34 |
|
35 |
|
36 |
|
37 |
|
38 |
|
39 |
|
40 |
|
41 |
|
42 |
|
/**
 * Point the parser at a new robots.txt URL and, unless suppressed,
 * immediately fetch and parse it.
 *
 * @param {String} url - robots.txt location; falsy values only store it.
 * @param {Boolean|Function} [read] - true/undefined/null: fetch now;
 *   a function: fetch now and invoke it when parsing completes;
 *   anything else (e.g. false): just store the URL.
 */
RobotsParser.prototype.setUrl = function(url, read) {
  this.url = url;
  if (!url) {
    return;
  }
  if (typeof(read) === "function") {
    this.read(read);
  } else if (read === undefined || read === null || read === true) {
    this.read();
  }
};
|
53 |
|
54 |
|
55 |
|
56 |
|
57 |
|
58 |
|
59 |
|
/**
 * Fetch this.url over HTTP(S) and parse the robots.txt response.
 *
 * Status handling:
 *   401/403      -> disallowAll = true
 *   other >= 400 -> allowAll = true
 *   301/302      -> follow the Location header via setUrl()
 *   otherwise    -> buffer the body and parse it
 *
 * @param {Function} [after_parse] - called as after_parse(parser, success)
 *   exactly once when fetching/parsing finishes (or fails).
 */
RobotsParser.prototype.read = function(after_parse) {
  var self = this;
  var url = urlparser.parse(this.url);
  var port;
  var protocol;
  if (url.protocol == 'https:') {
    port = 443;
    protocol = https;
  } else {
    port = 80;
    protocol = http;
  }
  var request = protocol.request({
    'host': url.host,
    'headers': { "User-Agent": self.userAgent },
    'port': url.port || port,
    'method': 'GET',
    'path': url.pathname
  });

  ut.d('RobotsParser.read: start ...');
  if(typeof(after_parse) !== "function") {
    after_parse = function(obj, success) { };
  }
  request.on('response', function(resp) {
    ut.d('RobotsParser.read: get response, code: '+resp.statusCode);

    self.statusCode = resp.statusCode;

    if ( [401, 403].indexOf(resp.statusCode) > -1 ) {
      ut.d('RobotsParser.read: set disallowAll');
      self.disallowAll = true;
      after_parse(self, false);
    }
    else if (resp.statusCode >= 400) {
      ut.d('RobotsParser.read: set allowAll');
      self.allowAll = true;
      after_parse(self, false);
    }
    else if ([301, 302].indexOf(resp.statusCode) > -1) {
      // Follow the redirect; setUrl() re-issues read() with the callback.
      self.setUrl(resp.headers.location, after_parse);
    }
    else {
      // BUGFIX: the body may arrive in several 'data' chunks. The previous
      // code parsed each chunk independently (splitting records mid-line)
      // and invoked after_parse once per chunk. Buffer the whole body and
      // parse it a single time on 'end'.
      var body = '';
      resp.setEncoding('utf8');
      resp.on('data', function (chunk) {
        body += chunk;
      });
      resp.on('end', function () {
        ut.d('RobotsParser.read: reads robots.txt');
        self.parse(body.split(/\r\n|\r|\n/));
        after_parse(self, true);
      });
    }

  });
  // BUGFIX: without an 'error' handler a network failure (DNS, refused
  // connection, ...) raises an uncaught 'error' event and crashes the
  // process; report it as an unsuccessful parse instead.
  request.on('error', function (err) {
    ut.d('RobotsParser.read: request error: ' + err);
    after_parse(self, false);
  });
  request.end();
};
|
115 |
|
116 |
|
117 |
|
118 |
|
119 |
|
120 |
|
121 |
|
/**
 * File an Entry under either the wildcard default slot or the
 * agent-specific list.
 *
 * Only the FIRST entry naming '*' becomes the default; later wildcard
 * entries are silently dropped.
 *
 * @param {Entry} entry - parsed record group to register.
 */
RobotsParser.prototype._addEntry = function(entry) {
  ut.d('Parser._addEntry, entry: '+entry);
  var isWildcard = entry.userAgents.indexOf('*') > -1;
  if (!isWildcard) {
    this.entries.push(entry);
    return;
  }
  if (!this.defaultEntry) {
    this.defaultEntry = entry;
  }
};
|
135 |
|
136 |
|
137 |
|
138 |
|
139 |
|
140 |
|
141 |
|
142 |
|
143 |
|
/**
 * Parse robots.txt content (already split into lines) into Entry objects.
 *
 * Implements a small state machine: a record group starts with one or more
 * User-agent lines, collects Allow/Disallow/Crawl-delay directives, and is
 * closed by a blank line, by the next User-agent line, or by end of input.
 *
 * @param {String[]} lines - the robots.txt body split on newlines.
 */
RobotsParser.prototype.parse = function(lines) {
  var STATE_START = 0                   // expecting a User-agent line
    , STATE_SAW_AGENT = 1               // reading the group's agent names
    , STATE_SAW_ALLOW_OR_DISALLOW = 2   // collecting the group's rules
    , state = STATE_START
    , entry = new Entry()
    , line
    , comment
    , sep
    , field
    , value;

  for (var i = 0; i < lines.length; i++) {
    line = lines[i];

    // A blank line terminates the current record group.
    if (!line) {
      if (state === STATE_SAW_AGENT) {
        entry = new Entry();
        state = STATE_START;
      }
      else if (state === STATE_SAW_ALLOW_OR_DISALLOW) {
        this._addEntry(entry);
        entry = new Entry();
        state = STATE_START;
      }
    }

    // Strip trailing comments.
    comment = line.indexOf('#');
    if (comment > -1) {
      line = line.substring(0, comment);
    }

    line = line.trim();

    // BUGFIX: split at the FIRST ':' only. The old line.split(':') with a
    // strict length === 2 check silently dropped every directive whose
    // value itself contained a colon, e.g.
    //   Disallow: /redirect?to=http://example.com/
    sep = line.indexOf(':');
    if (sep === -1) {
      continue;
    }

    field = line.substring(0, sep).trim().toLowerCase();
    value = line.substring(sep + 1).trim();

    switch(field) {
      case 'user-agent':
        // A User-agent line after rules starts a brand-new group.
        if (state === STATE_SAW_ALLOW_OR_DISALLOW) {
          this._addEntry(entry);
          entry = new Entry();
        }
        entry.userAgents.push(value);
        state = STATE_SAW_AGENT;
        break;

      case 'disallow':
      case 'allow':
        // Rules are only valid after at least one User-agent line.
        if (state !== STATE_START) {
          entry.rules.push(new Rule(value, field === 'allow'));
          state = STATE_SAW_ALLOW_OR_DISALLOW;
        }
        break;

      case 'crawl-delay':
        if (state !== STATE_START) {
          entry.crawl_delay = value;
          state = STATE_SAW_ALLOW_OR_DISALLOW;
        }
    }
  }
  // Flush the final group when the file does not end with a blank line.
  if (state === STATE_SAW_ALLOW_OR_DISALLOW) {
    this._addEntry(entry);
  }
};
|
220 |
|
221 |
|
222 |
|
223 |
|
224 |
|
225 |
|
226 |
|
227 |
|
228 |
|
229 |
|
/**
 * Synchronously decide whether `userAgent` may fetch `url`.
 *
 * Precedence: the disallowAll flag, the allowAll flag, the first
 * agent-specific entry that applies, the wildcard default entry, and
 * finally "allowed" when nothing matched.
 *
 * @param {String} userAgent - robot name to match against entries.
 * @param {String} [url='/'] - path to test.
 * @returns {Boolean} true when fetching is permitted.
 */
RobotsParser.prototype.canFetchSync = function(userAgent, url) {
  var target = url || '/';

  ut.d('Parser.canFetch: url:'+target);

  if (this.disallowAll) {
    return false;
  }
  if (this.allowAll) {
    return true;
  }

  // First matching agent-specific entry wins.
  for (var i = 0; i < this.entries.length; i++) {
    var entry = this.entries[i];
    if (entry.appliesTo(userAgent)) {
      return entry.allowance(target);
    }
  }

  // Fall back to the wildcard ('*') entry, then to "allowed".
  return this.defaultEntry ? this.defaultEntry.allowance(target) : true;
};
|
260 |
|
261 |
|
262 |
|
263 |
|
264 |
|
265 |
|
266 |
|
267 |
|
268 |
|
/**
 * Asynchronous variant of canFetchSync().
 *
 * The decision is deferred via process.nextTick and delivered as
 *   callback(access, url, details)
 * where details.type is one of "statusCode", "entry", "defaultEntry" or
 * "noRule", describing which rule produced the verdict.
 *
 * @param {String} userAgent - robot name to match against entries.
 * @param {String} [url='/'] - path to test.
 * @param {Function} callback - receives (Boolean, String, Object).
 */
RobotsParser.prototype.canFetch = function(userAgent, url, callback) {
  var self = this;
  var target = url || '/';

  process.nextTick( function () {
    if (self.disallowAll) {
      return callback(false, target, {
        type: "statusCode",
        statusCode: self.statusCode});
    }

    if (self.allowAll) {
      return callback(true, target, {
        type: "statusCode",
        statusCode: self.statusCode});
    }

    // First matching agent-specific entry wins.
    for (var i = 0; i < self.entries.length; i++) {
      var entry = self.entries[i];
      if (entry.appliesTo(userAgent)) {
        return callback(entry.allowance(target), target, {
          type: "entry",
          entry: entry});
      }
    }

    // Fall back to the wildcard ('*') entry.
    if (self.defaultEntry) {
      return callback(self.defaultEntry.allowance(target), target, {
        type: "defaultEntry",
        entry: self.defaultEntry});
    }

    // No rule matched at all: allowed.
    callback(true, target, {type: "noRule"});
  });
};
|
314 |
|
315 |
|
316 |
|
317 |
|
318 |
|
319 |
|
320 |
|
321 |
|
322 |
|
323 |
|
324 |
|
325 |
|
/**
 * Look up the Crawl-delay directive that applies to `userAgent`.
 *
 * Agent-specific entries take precedence; otherwise the wildcard default
 * entry's delay is returned (undefined when none was parsed).
 *
 * @param {String} userAgent - robot name to match against entries.
 * @returns {String|undefined} the raw crawl-delay value, if any.
 */
RobotsParser.prototype.getCrawlDelay = function (userAgent) {
  for (var i = 0; i < this.entries.length; i++) {
    var candidate = this.entries[i];
    if (candidate.appliesTo(userAgent) && (candidate.crawl_delay != null)) {
      return candidate.crawl_delay;
    }
  }
  return this.defaultEntry.crawl_delay;
};
|
336 |
|
337 |
|
338 |
|
339 |
|
340 |
|
/**
 * Verbose multi-line description of the parser: crawler User-Agent,
 * the default entry, then every agent-specific entry.
 *
 * @returns {String}
 */
RobotsParser.prototype.toString = function() {
  var parts = ["<Parser: Crawler User Agent: " + this.userAgent,
               this.defaultEntry.toString()];
  for (var i in this.entries) {
    parts.push(this.entries[i].toString());
  }
  parts.push(">");
  return parts.join('\n');
};
|
351 |
|
352 |
|
353 |
|
354 |
|
355 |
|
/**
 * Compact one-line description: the crawler's User-Agent plus every robot
 * agent name mentioned in the parsed entries (default entry first).
 *
 * @returns {String}
 */
RobotsParser.prototype.toStringLite = function() {
  // Copy an entry's agent names; tolerates the default entry still being
  // the initial '' placeholder (no userAgents property).
  function agentNames(entry) {
    var src = (entry && entry.userAgents) || [];
    var names = [];
    for (var j = 0; j < src.length; j++) {
      names.push(src[j]);
    }
    return names;
  }

  var all = agentNames(this.defaultEntry);
  for (var i in this.entries) {
    all = all.concat(agentNames(this.entries[i]));
  }

  return "<Parser: " +
         "Crawler User Agent is `" + this.userAgent + "`, " +
         "Listed Robot Agents: `" + all.join('`, `') +
         "`>";
};
|
376 |
|