UNPKG

13.9 kBJavaScriptView Raw
1var http = require("http"),
2 https = require("https"),
3 urllib = require("url"),
4 utillib = require("util"),
5 zlib = require('zlib'),
6 dns = require('dns'),
7 Stream = require("stream").Stream,
8 CookieJar = require("./cookiejar").CookieJar,
9 encodinglib = require("encoding"),
10 net = require("net");
11
// Public interface of the module: the callback-style helper, the
// underlying stream class and the cookie jar class it uses.
exports.fetchUrl = fetchUrl;
exports.FetchStream = FetchStream;
exports.CookieJar = CookieJar;
16
/**
 * Stream that fetches the contents of a URL.
 *
 * Nothing is emitted from inside the constructor: both the request start and
 * the "url not defined" error are postponed to a later tick, so the caller
 * always has the chance to attach its listeners first.
 *
 * @constructor
 * @param {String} url URL to fetch
 * @param {Object} [options] Options object; it is stored as `this.options`
 *        and filled with defaults by normalizeOptions()
 */
function FetchStream(url, options){
    Stream.call(this);

    // run `fn` on a later tick; setImmediate is preferred where it exists
    var defer = function(fn){
        if(typeof setImmediate != "undefined"){
            setImmediate(fn);
        }else{
            process.nextTick(fn);
        }
    };

    this.url = url;
    this.options = options || {};
    this.userAgent = this.options.userAgent || "FetchStream";
    this._redirect_count = 0;

    if(!this.url){
        // An "error" emitted right here could never be handled, because no
        // listener can be attached before the constructor returns - it would
        // surface as a thrown exception instead. Defer it like the request.
        defer(this.emit.bind(this, "error", new Error("url not defined")));
        return;
    }

    this.normalizeOptions();

    // prevent errors before "error" handler is set by deferring actions
    defer(this.runStream.bind(this, url));
}
utillib.inherits(FetchStream, Stream);
42
43
/**
 * Fills `this.options` with defaults and brings it to a canonical form:
 * sets up the cookie jar, the redirect limit, lowercased header names with
 * default user-agent/pragma/cache-control/accept-encoding values, the
 * response size limit, the request method, initial cookies and the
 * rejectUnauthorized flag. Mutates `this.options` in place.
 */
FetchStream.prototype.normalizeOptions = function(){
    var options = this.options,
        hasMaxRedirects = typeof options.maxRedirects == "number" ||
            options.maxRedirects instanceof Number,
        keys, newheaders = {}, i;

    // cookiejar
    this.cookieJar = options.cookieJar || new CookieJar();

    // default redirects - 10
    // if disableRedirects is set, then 0
    if(options.disableRedirects){
        options.maxRedirects = 0;
    }else if(!hasMaxRedirects){
        // the misspelled "disableRedirect" used to suppress the default
        // limit, which effectively turned redirects off - keep honouring it
        options.maxRedirects = options.disableRedirect ? 0 : 10;
    }

    // normalize header keys
    // HTTP and HTTPS takes in key names in case insensitive but to find
    // an exact value from an object key name needs to be case sensitive
    // so we're just lowercasing all input keys
    options.headers = options.headers || {};
    keys = Object.keys(options.headers);

    // walk backwards so that, when two keys differ only by case, the one
    // listed first in the input is written last and wins
    for(i=keys.length-1; i>=0; i--){
        newheaders[keys[i].toLowerCase().trim()] = options.headers[keys[i]];
    }

    options.headers = newheaders;

    if(!options.headers["user-agent"]){
        options.headers["user-agent"] = this.userAgent;
    }

    if(!options.headers["pragma"]){
        options.headers["pragma"] = "no-cache";
    }

    if(!options.headers["cache-control"]){
        options.headers["cache-control"] = "no-cache";
    }

    // accept-encoding is fully controlled by the disableGzip option
    if(!options.disableGzip){
        options.headers['accept-encoding'] = 'gzip, deflate';
    }else{
        delete options.headers['accept-encoding'];
    }

    // max length for the response,
    // if not set, default is Infinity
    if(!options.maxResponseLength){
        options.maxResponseLength = Infinity;
    }

    // method:
    // defaults to GET, or when payload present to POST
    if(!options.method){
        options.method = (options.payload || options.payloadSize) ? "POST" : "GET";
    }

    // set cookies
    // takes full cookie definition strings as params
    if(options.cookies){
        for(i=0; i<options.cookies.length; i++){
            this.cookieJar.setCookie(options.cookies[i], this.url);
        }
    }

    // rejectUnauthorized
    if(typeof options.rejectUnauthorized === 'undefined'){
        options.rejectUnauthorized = true;
    }
};
117
/**
 * Converts a URL string into the pieces needed to start a request.
 *
 * @param {String} url URL to parse
 * @return {Object} `{urloptions, transport}` where `urloptions` is the
 *         options object for `transport.request()` and `transport` is the
 *         `https` module for "https:" URLs and the `http` module otherwise
 */
FetchStream.prototype.parseUrl = function(url){
    var urlparts = urllib.parse(url, false, true),
        secure = urlparts.protocol == "https:",
        urloptions = {
            host: urlparts.hostname || urlparts.host,
            // explicit port, or the default one for the protocol
            port: urlparts.port || (secure ? 443 : 80),
            // default the pathname on its own: concatenating first would
            // turn a missing (null) pathname into the string "null"
            path: (urlparts.pathname || "/") + (urlparts.search || ""),
            method: this.options.method,
            rejectUnauthorized: this.options.rejectUnauthorized
        };

    if("agent" in this.options){
        urloptions.agent = this.options.agent;
    }

    // NB! this is the shared options.headers object, not a copy
    urloptions.headers = this.options.headers;

    return {
        urloptions: urloptions,
        transport: secure ? https : http
    };
};
162
/**
 * Stores the encoding in the stream options. While it is set, "data"
 * chunks are emitted as strings converted with that encoding.
 *
 * @param {String} encoding Encoding name passed to Buffer#toString
 */
FetchStream.prototype.setEncoding = function(encoding){
    var streamOptions = this.options;
    streamOptions.encoding = encoding;
};
166
/**
 * Resolves a possibly relative URL (for example a redirect Location value)
 * against a base URL.
 *
 * @param {String} url URL to resolve
 * @param {String} [base] URL the relative parts are taken from
 * @return {String} `url` unchanged when it carries a protocol, otherwise a
 *         URL built from the protocol and host of `base` (protocol defaults
 *         to "http:")
 */
FetchStream.prototype.absoluteUrl = function(url, base){

    var target_url = urllib.parse(url, false, true),
        base_url = urllib.parse(base || "", false, true),
        protocol = base_url.protocol || "http:",
        origin, segments, final_path;

    // if protocol is set, then it's good to go
    if(target_url.protocol){
        return url;
    }

    // the url might be in the form of "//www.example.com" with leading slashes -
    // the protocol from the base url must be used, defaults to http
    if(target_url.hostname){
        return protocol + (url.slice(0, 2) != "//" ? "//" : "") + url;
    }

    // `host` includes an explicit port ("example.com:8080") while `hostname`
    // does not; using `hostname` alone would silently move the request to
    // the default port
    origin = protocol + "//" + (base_url.host || base_url.hostname || "");

    // this is absolute path for relative domain
    if((target_url.pathname || "/").charAt(0) == "/"){
        return origin + url;
    }

    // relative path
    // remove also .. and . directory references
    segments = (base_url.pathname || "/").split("/");
    segments.pop(); // ditch the last element, empty for dir or a file name
    segments = segments.concat((target_url.pathname || "/").split("/"));

    final_path = [];
    segments.forEach(function(dir){
        if(dir == "."){
            return;
        }

        if(dir == ".."){
            final_path.pop();
            return;
        }

        // empty segments (doubled or leading slashes) are dropped
        if(dir){
            final_path.push(dir);
        }
    });

    return origin + "/" + final_path.join("/") + (target_url.search || "");
};
217
/**
 * Prepares the request for `url` (cookie and content-length headers,
 * optional manual DNS lookup) and hands it over to _runStream().
 *
 * The manual lookup is enabled by the `asyncDnsLoookup` option (historical
 * spelling); the correctly spelled `asyncDnsLookup` is accepted as an alias.
 *
 * @param {String} url URL to request
 */
FetchStream.prototype.runStream = function(url){
    var self = this,
        url_data = this.parseUrl(url),
        urloptions = url_data.urloptions,
        cookies = this.cookieJar.getCookies(url),
        resolveFirst = this.options.asyncDnsLoookup || this.options.asyncDnsLookup,
        dnsCallback;

    if(cookies){
        urloptions.headers.cookie = cookies;
    }else{
        delete urloptions.headers.cookie;
    }

    if(this.options.payload){
        urloptions.headers['content-length'] = Buffer.byteLength(this.options.payload || "","utf-8");
    }

    // an explicit payloadSize overrides the computed length
    if(this.options.payloadSize){
        urloptions.headers['content-length'] = this.options.payloadSize;
    }

    if(!resolveFirst){
        this._runStream(url_data, url);
        return;
    }

    dnsCallback = function(err, addresses){
        if(err){
            self.emit("error", err);
            return;
        }

        // connect to the resolved address but keep the original host name
        // in the Host header
        urloptions.headers['host'] = urloptions.hostname || urloptions.host;
        urloptions.hostname = addresses[0];
        urloptions.host = urloptions.headers['host'] + (urloptions.port ? ':' + urloptions.port : '');

        self._runStream(url_data, url);
    };

    if(net.isIP(urloptions.host)){
        // already an IP address, nothing to resolve
        dnsCallback(null, [urloptions.host]);
    }else{
        dns.resolve4(urloptions.host, dnsCallback);
    }
};
259
/**
 * Performs the actual request and relays the response through this stream.
 *
 * Stores cookies from the response, follows redirects (301, 302, 303, 307,
 * 308) while the redirect limit allows it, emits "meta" once for the final
 * response, then emits the body as "data" events (unpacked for gzip/deflate,
 * truncated to options.maxResponseLength) followed by "end". Request,
 * response and decompression errors are re-emitted as "error".
 *
 * @param {Object} url_data Value returned by parseUrl()
 * @param {String} url URL being requested
 */
FetchStream.prototype._runStream = function(url_data, url){
    var self = this;

    var req = url_data.transport.request(url_data.urloptions, function(res){
        var newCookies = res.headers['set-cookie'],
            contentEncoding = res.headers['content-encoding'],
            received = 0,
            unpacker, i;

        // forwards a body chunk, never letting the total exceed maxResponseLength
        function receive(chunk){
            var allowed = Math.min(chunk.length, self.options.maxResponseLength - received);
            if(allowed <= 0){
                return;
            }
            received += allowed;

            if(allowed < chunk.length){
                chunk = chunk.slice(0, allowed);
            }

            if(self.options.encoding){
                self.emit("data", chunk.toString(self.options.encoding));
            }else{
                self.emit("data", chunk);
            }
        }

        function error(e){
            self.emit("error", e);
        }

        function end(){
            self.emit("end");
        }

        // catch new cookies before potential redirect
        if(Array.isArray(newCookies)){
            for(i=0; i<newCookies.length; i++){
                self.cookieJar.setCookie(newCookies[i], url);
            }
        }

        if([301, 302, 303, 307, 308].indexOf(res.statusCode)>=0){
            if(!self.options.disableRedirects && self.options.maxRedirects>self._redirect_count && res.headers.location){
                self._redirect_count++;
                // nobody is going to read this response - discard its body,
                // otherwise it is never consumed
                res.resume();
                self.runStream(self.absoluteUrl(res.headers.location, url));
                return;
            }
        }

        self.meta = {
            status: res.statusCode,
            responseHeaders: res.headers,
            finalUrl: url,
            redirectCount: self._redirect_count,
            cookieJar: self.cookieJar
        };

        self.emit("meta", self.meta);

        // without a listener an "error" on the response would be thrown
        res.on("error", error);

        if(contentEncoding){
            switch(contentEncoding.toLowerCase().trim()){
                case "gzip":
                    unpacker = zlib.createGunzip();
                    break;
                case "deflate":
                    unpacker = zlib.createInflateRaw();
                    break;
            }
        }

        if(unpacker){
            unpacker.on("data", receive);
            unpacker.on("error", error);
            unpacker.on("end", end);
            res.pipe(unpacker);
            return;
        }

        res.on('data', receive);
        res.on('end', end);
    });

    req.on('error', function(e){
        self.emit("error", e);
    });

    if(this.options.timeout){
        req.setTimeout(this.options.timeout, req.abort.bind(req));
    }

    if(this.options.payload){
        req.end(this.options.payload);
    }else if(this.options.payloadStream){
        this.options.payloadStream.pipe(req);
        this.options.payloadStream.resume();
    }else{
        req.end();
    }
};
365
/**
 * Fetches a URL and delivers the whole response body in one callback.
 *
 * The body is collected from the Buffer chunks of a FetchStream, so the
 * stream must not be switched to string output (options.encoding). Unless
 * options.disableDecoding is set, a body whose charset is not UTF-8 is
 * converted to UTF-8. The charset comes from options.overrideCharset, a
 * <meta> tag of a text/html body or the Content-Type header, in that order.
 *
 * @param {String} url URL to fetch
 * @param {Object} [options] FetchStream options plus overrideCharset,
 *        disableDecoding and outputEncoding
 * @param {Function} callback Called once as `callback(error)` or
 *        `callback(null, meta, body)`; `body` is a Buffer, or a string when
 *        options.outputEncoding is set
 */
function fetchUrl(url, options, callback){
    if(!callback && typeof options=="function"){
        callback = options;
        options = undefined;
    }
    options = options || {};

    var fetchstream = new FetchStream(url, options),
        response_data, chunks = [], length = 0,
        content_type,
        callbackFired = false;

    fetchstream.on("meta", function(meta){
        response_data = meta;
        content_type = _parseContentType(meta.responseHeaders['content-type']);
    });

    fetchstream.on("data", function(chunk){
        if(chunk){
            chunks.push(chunk);
            length += chunk.length;
        }
    });

    fetchstream.on("error", function(error){
        if(error && error.code == 'HPE_INVALID_CONSTANT'){
            // skip invalid formatting errors
            return;
        }
        if(callbackFired){
            return;
        }
        callbackFired = true;
        callback(error);
    });

    fetchstream.on("end", function(){
        if(callbackFired){
            return;
        }
        callbackFired = true;

        // Buffer.concat replaces the manual copy into `new Buffer(length)`,
        // which is deprecated and allocates uninitialized memory
        var buffer = Buffer.concat(chunks, length);

        content_type = content_type || {};

        if(content_type.mimeType == "text/html"){
            content_type.charset = _findHTMLCharset(buffer) || content_type.charset;
        }

        content_type.charset = (options.overrideCharset || content_type.charset || "utf-8").trim().toLowerCase();

        if(!fetchstream.options.disableDecoding && !content_type.charset.match(/^utf-?8$/i)){
            buffer = encodinglib.convert(buffer, "UTF-8", content_type.charset);
        }

        if(fetchstream.options.outputEncoding){
            callback(null, response_data, buffer.toString(fetchstream.options.outputEncoding));
        }else{
            callback(null, response_data, buffer);
        }
    });
}
433
/**
 * Splits a Content-Type header value into mime type and charset.
 *
 * @param {String} [str] Header value, e.g. 'text/html; charset="utf-8"'
 * @return {Object} Empty object when `str` is empty, otherwise
 *         `{mimeType, charset}`, both trimmed and lowercased; `charset`
 *         defaults to "utf-8" when the header does not name one
 */
function _parseContentType(str){
    if(!str){
        return {};
    }

    var parts = str.split(";"),
        mimeType = parts.shift(),
        charset, chparts;

    for(var i=0, len = parts.length; i<len; i++){
        chparts = parts[i].split("=");
        if(chparts.length>1 && chparts[0].trim().toLowerCase() == "charset"){
            // parameter values may be sent as quoted strings (charset="utf-8"),
            // the quotes are not part of the charset name
            charset = chparts[1].trim().replace(/^["']+|["']+$/g, "");
        }
    }

    return {
        mimeType: (mimeType || "").trim().toLowerCase(),
        charset: (charset || "UTF-8").trim().toLowerCase() // defaults to UTF-8
    };
}
456
/**
 * Looks for a charset declaration in the <meta> tags of an HTML document.
 *
 * Checks `<meta http-equiv="content-type" ... charset=...>` first and
 * `<meta charset=...>` (quoted or unquoted value) second.
 *
 * @param {Buffer} htmlbuffer Raw HTML document
 * @return {String} Lowercased charset name, or a falsy value when the
 *         document does not declare one
 */
function _findHTMLCharset(htmlbuffer){

    // the tags of interest are plain ASCII, whatever the document charset is
    var body = htmlbuffer.toString("ascii"),
        tag, match, charset;

    tag = body.match(/<meta\s+http-equiv=["']content-type["'][^>]*?>/i);
    if(tag){
        // case insensitive like the tag match above; "_" is allowed because
        // names such as Shift_JIS would otherwise be cut short
        match = tag[0].match(/charset\s?=\s?([\w\-]*);?/i);
        if(match){
            charset = (match[1] || "").trim().toLowerCase();
        }
    }

    if(!charset){
        // group 1 holds a quoted value, group 2 an unquoted one
        match = body.match(/<meta\s+charset\s*=\s*(?:["']([^"']*)["']|([^\s"'\/>]+))/i);
        if(match){
            charset = (match[1] || match[2] || "").trim().toLowerCase();
        }
    }

    return charset;
}
479
480
481