1 | var http = require("http"),
|
2 | https = require("https"),
|
3 | urllib = require("url"),
|
4 | utillib = require("util"),
|
5 | zlib = require('zlib'),
|
6 | dns = require('dns'),
|
7 | Stream = require("stream").Stream,
|
8 | CookieJar = require("./cookiejar").CookieJar,
|
9 | encodinglib = require("encoding"),
|
10 | net = require("net");
|
11 |
|
12 | exports.FetchStream = FetchStream;
|
13 | exports.CookieJar = CookieJar;
|
14 |
|
15 | exports.fetchUrl = fetchUrl;
|
16 |
|
/**
 * Stream that fetches `url` and re-emits the response.
 *
 * The request itself is started asynchronously (via `runStream`) so callers
 * can attach listeners right after construction.
 *
 * @constructor
 * @param {String} url URL to fetch
 * @param {Object} [options] Fetch options; stored as `this.options` and
 *        completed with defaults by `normalizeOptions`
 */
function FetchStream(url, options){
    Stream.call(this);

    options = options || {};

    this.url = url;
    this.userAgent = options.userAgent || "FetchStream";
    this._redirect_count = 0;
    this.options = options;

    if(!this.url){
        // Emitting synchronously here would always throw: nobody can have
        // attached an "error" listener before the constructor returns.
        // Deferring the emit lets the caller subscribe first.
        defer(this.emit.bind(this, "error", new Error("url not defined")));
        return;
    }

    this.normalizeOptions();

    defer(this.runStream.bind(this, url));

    // Prefer setImmediate when the runtime has it, otherwise use nextTick
    function defer(fn){
        if(typeof setImmediate != "undefined"){
            setImmediate(fn);
        }else{
            process.nextTick(fn);
        }
    }
}
utillib.inherits(FetchStream, Stream);
|
42 |
|
43 |
|
/**
 * Completes `this.options` with defaults and sets up `this.cookieJar`.
 *
 * - cookie jar: `options.cookieJar` or a fresh `CookieJar`
 * - `maxRedirects`: 0 when redirects are disabled, otherwise 10 unless a
 *   number was supplied
 * - header names are lower-cased and trimmed; `user-agent`, `pragma` and
 *   `cache-control` get defaults; `accept-encoding` is forced to
 *   "gzip, deflate" unless `disableGzip` is set (then it is removed)
 * - `maxResponseLength` defaults to Infinity
 * - `method` defaults to POST when a payload or payload size is given, else GET
 * - `options.cookies` entries are stored in the cookie jar for `this.url`
 * - `rejectUnauthorized` defaults to true
 */
FetchStream.prototype.normalizeOptions = function(){
    var options = this.options,
        // `disableRedirects` is the spelling the request code checks; the
        // misspelled `disableRedirect` was tested here before, so it is
        // still honoured as an alias.
        redirectsDisabled = options.disableRedirects || options.disableRedirect,
        hasRedirectLimit = typeof options.maxRedirects == "number" ||
            options.maxRedirects instanceof Number,
        suppliedHeaders = options.headers || {},
        names = Object.keys(suppliedHeaders),
        headers = {},
        i;

    this.cookieJar = options.cookieJar || new CookieJar();

    if(redirectsDisabled){
        options.maxRedirects = 0;
    }else if(!hasRedirectLimit){
        options.maxRedirects = 10;
    }

    // Walk the keys backwards so that, when two names collide after
    // normalisation, the first supplied one wins.
    for(i=names.length-1; i>=0; i--){
        headers[names[i].toLowerCase().trim()] = suppliedHeaders[names[i]];
    }
    options.headers = headers;

    if(!headers["user-agent"]){
        headers["user-agent"] = this.userAgent;
    }

    if(!headers["pragma"]){
        headers["pragma"] = "no-cache";
    }

    if(!headers["cache-control"]){
        headers["cache-control"] = "no-cache";
    }

    if(!options.disableGzip){
        headers['accept-encoding'] = 'gzip, deflate';
    }else{
        delete headers['accept-encoding'];
    }

    if(!options.maxResponseLength){
        options.maxResponseLength = Infinity;
    }

    if(!options.method){
        options.method = (options.payload || options.payloadSize) ? "POST" : "GET";
    }

    if(options.cookies){
        for(i=0; i<options.cookies.length; i++){
            this.cookieJar.setCookie(options.cookies[i], this.url);
        }
    }

    if(typeof options.rejectUnauthorized === 'undefined'){
        options.rejectUnauthorized = true;
    }
};
|
117 |
|
/**
 * Converts a URL string into request options and picks the transport module.
 *
 * @param {String} url URL to parse
 * @return {Object} `{urloptions, transport}` where `urloptions` holds host,
 *         port, path, method, rejectUnauthorized, headers (and `agent` when
 *         `this.options` has that key) and `transport` is `https` for
 *         "https:" URLs and `http` otherwise
 */
FetchStream.prototype.parseUrl = function(url){
    var urlparts = urllib.parse(url, false, true),
        isSecure = urlparts.protocol == "https:",
        urloptions = {
            host: urlparts.hostname || urlparts.host,
            port: urlparts.port,
            // Default the pathname on its own: concatenating a missing
            // (null) pathname first would yield the string "null".
            path: (urlparts.pathname || "/") + (urlparts.search || ""),
            method: this.options.method,
            rejectUnauthorized: this.options.rejectUnauthorized
        };

    if("agent" in this.options){
        urloptions.agent = this.options.agent;
    }

    // Anything that is not https (including a missing protocol) uses port 80
    if(!urloptions.port){
        urloptions.port = isSecure ? 443 : 80;
    }

    // The headers object is shared with this.options, not copied
    urloptions.headers = this.options.headers;

    return {
        urloptions: urloptions,
        transport: isSecure ? https : http
    };
};
|
162 |
|
/**
 * Stores the encoding on the stream options. When an encoding is set,
 * received chunks are emitted as strings in that encoding instead of Buffers.
 *
 * @param {String} encoding Encoding name passed to `Buffer#toString`
 */
FetchStream.prototype.setEncoding = function(encoding){
    var settings = this.options;
    settings.encoding = encoding;
};
|
166 |
|
/**
 * Resolves a possibly relative URL (e.g. a Location header) against a base URL.
 *
 * @param {String} url Target URL: absolute, protocol-relative ("//host/.."),
 *        host-relative ("/path") or path-relative ("../path")
 * @param {String} [base] URL the target is relative to
 * @return {String} Absolute URL; protocol falls back to "http:" when the
 *         base has none
 */
FetchStream.prototype.absoluteUrl = function(url, base){
    var target_url = urllib.parse(url, false, true),
        base_url = urllib.parse(base || "", false, true),
        protocol = base_url.protocol || "http:",
        // Use `host` rather than `hostname` so that an explicit port on the
        // base URL is carried over to the resolved URL.
        origin = protocol + "//" + (base_url.host || ""),
        segments,
        final_path = [];

    // Already absolute
    if(target_url.protocol){
        return url;
    }

    // Host given but no protocol: inherit the protocol of the base
    if(target_url.hostname){
        return protocol + (url.substr(0,2)!="//"?"//":"") + url;
    }

    // Path starts from the root (a missing path is treated as "/")
    if((target_url.pathname || "/").substr(0,1)=="/"){
        return origin + url;
    }

    // Path-relative: start from the base directory (base path without its
    // last segment) and apply the target segments to it
    segments = (base_url.pathname || "/").split("/");
    segments.pop();
    segments = segments.concat((target_url.pathname || "/").split("/"));

    segments.forEach(function(dir){
        if(dir=="."){
            return;
        }

        if(dir==".."){
            final_path.pop();
            return;
        }

        if(dir){
            final_path.push(dir);
        }
    });

    return origin + "/" + final_path.join("/") + (target_url.search || "");
};
|
217 |
|
/**
 * Prepares the request for `url` and hands it to `_runStream`.
 *
 * Sets the cookie header from the cookie jar (or removes it when the jar has
 * nothing for this URL) and the content-length header from the payload or the
 * explicit payload size. With the `asyncDnsLoookup` option the host is first
 * resolved with `dns.resolve4` (skipped when it already is an IP address) and
 * the request is made against the resolved address with the original host
 * name kept in the host header.
 *
 * @param {String} url URL to request
 */
FetchStream.prototype.runStream = function(url){
    var self = this,
        request = this.parseUrl(url),
        target = request.urloptions,
        jarCookies = this.cookieJar.getCookies(url);

    if(jarCookies){
        target.headers.cookie = jarCookies;
    }else{
        delete target.headers.cookie;
    }

    if(this.options.payload){
        target.headers['content-length'] = Buffer.byteLength(this.options.payload || "","utf-8");
    }

    // An explicit payload size takes precedence over the computed one
    if(this.options.payloadSize){
        target.headers['content-length'] = this.options.payloadSize;
    }

    if(!this.options.asyncDnsLoookup){
        this._runStream(request, url);
        return;
    }

    if(net.isIP(target.host)){
        onResolved(null, [target.host]);
    }else{
        dns.resolve4(target.host, onResolved);
    }

    // Points the request at the first resolved address
    function onResolved(err, addresses){
        if(err){
            self.emit("error", err);
            return;
        }

        target.headers['host'] = target.hostname || target.host;
        target.hostname = addresses[0];
        target.host = target.headers['host'] + (target.port? ':' + target.port: '');

        self._runStream(request, url);
    }
};
|
259 |
|
/**
 * Performs the actual request and re-emits the response on this stream.
 *
 * Response handling: stores `set-cookie` values in the cookie jar, follows
 * 301/302/303/307/308 redirects while allowed, otherwise emits "meta" once,
 * then "data" chunks (cut off at `options.maxResponseLength`, decoded to
 * strings when `options.encoding` is set) and finally "end". Request and
 * response errors are emitted as "error".
 *
 * @param {Object} url_data `{urloptions, transport}` as returned by `parseUrl`
 * @param {String} url URL being requested; reported as `finalUrl` and used
 *        as the base for redirect targets
 */
FetchStream.prototype._runStream = function(url_data, url){
    var self = this,
        req;

    req = url_data.transport.request(url_data.urloptions, function(res){
        var setCookie = res.headers['set-cookie'],
            received = 0,
            i;

        if(Array.isArray(setCookie)){
            for(i=0; i<setCookie.length; i++){
                self.cookieJar.setCookie(setCookie[i], url);
            }
        }

        if([301, 302, 303, 307, 308].indexOf(res.statusCode)>=0){
            if(!self.options.disableRedirects && self.options.maxRedirects>self._redirect_count && res.headers.location){
                self._redirect_count++;
                // The redirect body is never read; drain it so the
                // connection is not held by an unconsumed response.
                if(typeof res.resume == "function"){
                    res.resume();
                }
                self.runStream(self.absoluteUrl(res.headers.location, url));
                return;
            }
        }

        self.meta = {
            status: res.statusCode,
            responseHeaders: res.headers,
            finalUrl: url,
            redirectCount: self._redirect_count,
            cookieJar: self.cookieJar
        };

        // Emits a chunk, truncated so that the total never exceeds
        // maxResponseLength; chunks past the limit are dropped.
        function receive(chunk){
            var allowed = Math.min(chunk.length, self.options.maxResponseLength - received),
                out;

            if(allowed<=0){
                return;
            }
            received += allowed;

            out = allowed<chunk.length ? chunk.slice(0, allowed) : chunk;
            self.emit("data", self.options.encoding ? out.toString(self.options.encoding) : out);
        }

        function error(e){
            self.emit("error", e);
        }

        function end(){
            self.emit("end");
        }

        // Pipes the response through the zlib stream created by zlib["create"+type]
        function unpack(type, source){
            var z = zlib["create"+type]();
            z.on("data", receive);
            z.on("error", error);
            z.on("end", end);
            source.pipe(z);
        }

        self.emit("meta", self.meta);

        // Forward failures of the response stream itself as well
        res.on("error", error);

        if(res.headers['content-encoding']){
            switch(res.headers['content-encoding'].toLowerCase().trim()){
                case "gzip":
                    return unpack("Gunzip", res);
                case "deflate":
                    // NOTE(review): "deflate" bodies are inflated as raw
                    // deflate (InflateRaw); a zlib-wrapped body would fail
                    // here - confirm this is intended.
                    return unpack("InflateRaw", res);
            }
        }

        res.on('data', receive);
        res.on('end', end);
    });

    req.on('error', function(e){
        self.emit("error", e);
    });

    if(this.options.timeout){
        req.setTimeout(this.options.timeout, req.abort.bind(req));
    }

    if(this.options.payload){
        req.end(this.options.payload);
    }else if(this.options.payloadStream){
        this.options.payloadStream.pipe(req);
        this.options.payloadStream.resume();
    }else{
        req.end();
    }
};
|
365 |
|
/**
 * Fetches a URL and passes the complete, buffered response to a callback.
 *
 * The body is converted to UTF-8 unless it already is UTF-8 or
 * `options.disableDecoding` is set. The source charset is taken from
 * `options.overrideCharset`, then (for text/html) from a meta tag in the
 * document, then from the content-type header, then defaults to "utf-8".
 *
 * @param {String} url URL to fetch
 * @param {Object} [options] Options passed on to `FetchStream`; may be omitted
 * @param {Function} callback Called once as `callback(error)` on failure or
 *        `callback(null, meta, body)` on success; `body` is a Buffer, or a
 *        string when `options.outputEncoding` is set
 */
function fetchUrl(url, options, callback){
    if(!callback && typeof options=="function"){
        callback = options;
        options = undefined;
    }
    options = options || {};

    var fetchstream = new FetchStream(url, options),
        response_data,
        chunks = [],
        length = 0,
        content_type,
        callbackFired = false;

    fetchstream.on("meta", function(meta){
        response_data = meta;
        content_type = _parseContentType(meta.responseHeaders['content-type']);
    });

    fetchstream.on("data", function(chunk){
        if(chunk){
            chunks.push(chunk);
            length += chunk.length;
        }
    });

    fetchstream.on("error", function(error){
        if(error && error.code == 'HPE_INVALID_CONSTANT'){
            // this parser error is deliberately not reported
            return;
        }
        if(callbackFired){
            return;
        }
        callbackFired = true;
        callback(error);
    });

    fetchstream.on("end", function(){
        var type = content_type || {},
            buffer;

        if(callbackFired){
            return;
        }
        callbackFired = true;

        // Assembling or converting the body can throw (for instance on an
        // unsupported charset). Report that through the callback: throwing
        // from inside the "end" emit would leave the callback uncalled.
        try{
            buffer = Buffer.concat(chunks, length);

            if(type.mimeType == "text/html"){
                type.charset = _findHTMLCharset(buffer) || type.charset;
            }

            type.charset = (options.overrideCharset || type.charset || "utf-8").trim().toLowerCase();

            if(!options.disableDecoding && !type.charset.match(/^utf-?8$/i)){
                buffer = encodinglib.convert(buffer, "UTF-8", type.charset);
            }
        }catch(E){
            callback(E);
            return;
        }

        if(options.outputEncoding){
            callback(null, response_data, buffer.toString(options.outputEncoding));
        }else{
            callback(null, response_data, buffer);
        }
    });
}
|
433 |
|
/**
 * Splits a Content-Type header value into mime type and charset.
 *
 * @param {String} str Header value, e.g. 'text/html; charset="utf-8"'
 * @return {Object} `{mimeType, charset}`, both lower-cased and trimmed, with
 *         charset defaulting to "utf-8"; an empty object when `str` is empty
 */
function _parseContentType(str){
    if(!str){
        return {};
    }

    var params = str.split(";"),
        mimeType = params.shift(),
        charset,
        pair,
        i, len;

    for(i=0, len=params.length; i<len; i++){
        pair = params[i].split("=");
        if(pair.length>1 && pair[0].trim().toLowerCase() == "charset"){
            // The value may be quoted (charset="utf-8"); drop the quotes so
            // the name is usable as an encoding identifier.
            charset = pair[1].trim().replace(/^["']+|["']+$/g, "").trim();
        }
    }

    return {
        mimeType: (mimeType || "").trim().toLowerCase(),
        charset: (charset || "UTF-8").toLowerCase()
    };
}
|
456 |
|
/**
 * Looks for a charset declaration in the meta tags of an HTML document.
 *
 * Checks `<meta http-equiv="content-type" ... charset=...>` first and falls
 * back to `<meta charset=...>` (quoted or unquoted).
 *
 * @param {Buffer} htmlbuffer Raw HTML document
 * @return {String} Lower-cased charset name, or a falsy value when none is found
 */
function _findHTMLCharset(htmlbuffer){
    // Only markup is inspected, so an ASCII view of the bytes is sufficient
    var body = htmlbuffer.toString("ascii"),
        meta, charset;

    meta = body.match(/<meta\s+http-equiv=["']content-type["'][^>]*?>/i);
    if(meta){
        // Case-insensitive, and allowing "_", ":" and "." so that names
        // such as Shift_JIS are not cut short.
        charset = meta[0].match(/charset\s?=\s?["']?([a-zA-Z0-9\-_:.]*)/i);
        if(charset){
            charset = (charset[1] || "").trim().toLowerCase();
        }
    }

    if(!charset){
        // Quoted value (group 1) or unquoted value (group 2)
        meta = body.match(/<meta\s+charset\s*=\s*(?:["']([^"']*)["']|([^\s"'\/>]+))/i);
        if(meta){
            charset = (meta[1] || meta[2] || "").trim().toLowerCase();
        }
    }

    return charset;
}
|
479 |
|
480 |
|
481 |
|