All files lib.js

75.32% Statements 58/77
65.38% Branches 34/52
80% Functions 12/15
78.08% Lines 57/73

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209                          1x                 1x             7x       7x 7x   7x 267x     7x 7x 7x   7x 7x     7x         7x             7x             7x                     7x       1x 1x     1x   1x                               7x         7x 7x   7x 4x 3x     7x                                     14x   14x 14x 3x 3x         14x         14x 3x 3x 11x 11x 3x     2x 2x   1x       14x       7x   7x 7x 7x 7x             7x   7x   6x 6x 6x 5x     1x               1x                
// @flow strict
import request, { type Options as RequestOptions, type OptionsObject, type Callback, type CookieJar } from 'request';
import cheerio, { type CheerioStatic } from 'cheerio';
import path from 'path';
import fs, { type Stats } from 'fs';
import url from 'url';
import zlib from 'zlib';
 
import { debuglog } from 'util';
 
// Re-export the request option/callback types (imported from 'request' above)
// so consumers of this module can annotate their own code with them.
export type { OptionsObject, Callback };
 
// Base options: default headers that constructOptionsWithJar merges underneath
// any caller-supplied headers. They present a desktop browser User-Agent and
// send no-cache directives with every request.
const BASE_OPTIONS = {
	headers: {
		"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
		"Cache-Control": "no-cache",
		"Pragma": "no-cache"
	}
};
 
// Fix unicode in JSON responses: matches every `\xHH` sequence (a backslash,
// the letter x and exactly two hex digits), capturing the two digits.
// `\x` is not a valid JSON escape, so constructResult rewrites these matches
// before handing the body to JSON.parse.
const UNICODE_HEADER = /\\x([0-9a-fA-F]{2})/g;
 
// adds additional functionality like automatic gunzipping / deflating and 303 redirects
// into mikeal's request.
function betterRequest(options: RequestOptions, callback: Callback) {
 
	// my god why doesn't mikeal just bake this shit into request
	const req = request(options);
 
	// adapted from http://nickfishman.com/post/49533681471/nodejs-http-requests-with-gzip-deflate-compression
	// TODO: Consider a streamed approach next time
	req.on('response', function(res: http$IncomingMessage<>) {
		const chunks = [];
 
		res.on('data', function(chunk: Buffer) {
			chunks.push(chunk);
		});
 
		res.on('end', function(): void {
			const buffer = Buffer.concat(chunks);
			const encoding = res.headers['content-encoding'];
 
			try {
				Iif (encoding === 'gzip') {
					debuglog('Content is gzipped');
					zlib.gunzip(buffer, (err: Error, decoded: string): void => callback(err, res, decoded && decoded.toString()));
				} else Iif (encoding === 'deflate') {
					debuglog('Content is deflated');
					zlib.inflate(buffer, (err: Error, decoded: string): void =>  callback(err, res, decoded && decoded.toString()));
				} else {
					// very special case, although this should really be a 303.
					Iif (res.statusCode === 302) {
						const err = new Error(`Unexpected Redirect to ${res.headers.location}`);
						err.name = 'UnexpectedRedirectError';
						return callback(err);
					}
 
					// manually handle 303... bah
					Iif (res.statusCode === 303) {
						const forwardOptions = typeof options === 'string' ? { uri: res.headers.location } : {
							...options,
							uri: res.headers.location,
						};
						return betterRequest(forwardOptions, callback);
					} else {
						return callback(null, res, buffer && buffer.toString());
					}
				}
			} catch (e) {
				callback(e);
			}
 
		});
 
	});
 
	req.on('error', callback);
}
 
// Builds the Error that describes a failed request.
//
// - options: the URI string or request options object the request was made with.
// - resp:    the response; its statusCode is included in the message.
// - body:    the response body, appended to the message on its own line.
//
// Returns a new Error. For a string `options` the message is `ERROR <uri>`;
// for an options object it is `<METHOD> ERROR <uri> HttpCode <status>\n<body>`,
// with the method defaulting to GET.
function constructError(options: RequestOptions, resp: http$IncomingMessage<>, body: string): Error {
	const error = new Error();
	if (typeof options === 'string') {
		error.message = `ERROR ${options}`;
	} else {
		// request accepts the target under either `uri` or `url`; report whichever was given
		const target = options.uri || options.url;
		error.message = `${options.method || 'GET'} ERROR ${target} HttpCode ${resp.statusCode}\n${body}`;
	}
	return error;
}
 
// Exact object type used as a plain string-to-string dictionary
// (headers, query parameters, form fields).
type StringMap = {|
	[string]: string
|};
 
// Result object produced by constructResult for a response.
// - headers: the response headers.
// - body:    the response body as a string.
// - json:    the parsed body; only set for `application/json` responses.
// - $:       the body loaded into cheerio; only set for `text/html` responses.
export type WebscrapeResult = {|
	headers: StringMap,
	json?: { ... },
	body: string,
	$?: CheerioStatic
|};
 
// TODO: This could throw errors. Deal with it.
function constructResult(resp: http$IncomingMessage<>, body: string): WebscrapeResult {
	const result: WebscrapeResult = {
		body,
		headers: resp.headers
	};
 
	const contentType = resp.headers['content-type'];
	const mimeType = contentType && contentType.split(';')[0];
	// augment the result
	switch (mimeType) {
		case 'text/html': result.$ = cheerio.load(body, { lowerCaseTags: true }); break;
		case 'application/json': result.json = JSON.parse(body.replace(UNICODE_HEADER, (m: string, n: string): string => String.fromCharCode(parseInt(n,16))));
	}
 
	return result;
}
 
// Caller-facing options consumed by constructOptionsWithJar.
// - headers:      extra request headers, merged over the module's base headers.
// - query:        query-string parameters.
// - body:         request body; sent as a form unless exactly one Content-Type header was supplied.
// - jar:          cookie jar passed through to request.
// - agentOptions: passed through to request when truthy.
// - method:       HTTP method; defaults to 'GET'.
// - indicies:     (sic) selects the query-string array format: the indexed format
//                 when true (the default), 'repeat' when false.
export type WebscrapeOptions = {|
	headers?: StringMap,
	query?: StringMap,
	body?: string | StringMap,
	jar?: CookieJar,
	agentOptions?: http$agentOptions,
	method?: 'GET' | 'POST' | 'PUT' | 'DELETE' | 'OPTIONS',
	indicies?: boolean
|};
 
// A header name/value pair; constructOptionsWithJar stores the lower-cased
// header name in `key` while looking for a Content-Type header.
type KVPair = {|
	key: string,
	value: string,
|};
 
function constructOptionsWithJar(uri: string, { headers, query, body, jar, agentOptions, method = 'GET', indicies = true }: WebscrapeOptions): OptionsObject {
	const options: OptionsObject = { uri, jar, method };
 
	options.headers = Object.assign({}, BASE_OPTIONS.headers, headers);
	if (query !== undefined) {
		options.qs = query;
		options.qsStringifyOptions = {
			arrayFormat: indicies ? 'indicies' : 'repeat' // the documentation on this is terrible
		};
	}
 
	Iif (agentOptions) {
		options.agentOptions = agentOptions;
	}
 
	// TODO: this logic may change later, since it is not obvious
	if (body !== undefined) {
		const headers = options.headers || {};
		const contentTypeSet = Object.keys(headers)
			.map((key: string): KVPair => ({ key: key.toLowerCase(), value: headers[key] }))
			.filter((pair: KVPair): boolean => pair.key === 'content-type');
		if (contentTypeSet.length === 1) {
			// since there is a content type, we assume this is not a HTTP form.
			// NOTE: as a result, the user must do encoding manually.
			options.json = contentTypeSet[0].value.toLowerCase().startsWith('application/json');
			options.body = body;
		} else {
			options.form = body;
		}
	}
 
	return options;
}
 
function determineFilename(uri: string, filename: string): Promise<string> {
	return new Promise<string>((resolve: (string) => void, reject: (Error) => void): void => {
		let baseFilename
		try {
			const pathname = url.parse(uri,true).pathname;
			const matchResult = pathname ? /[^/]+$/.exec(pathname) : null;
			baseFilename = matchResult ? matchResult[0] : 'unknown';
		} catch (err) {
			debuglog(`WARNING Unable to determine base filename for ${uri}, using "unknown"`);
			baseFilename = 'unknown';
		}
 
		// why is this the first condition? because we may need baseFilename if filename is a folder
		Iif (!filename && !baseFilename) {
			return reject(new Error(`DOWNLOAD ${uri} - Filename not given and cannot determine base name`)); // TODO: Nicer error
		} else if (filename) {
			// if the filename is actually a folder that already exists, then download to the folder using the baseFilename
			fs.stat(filename, (err: ?Error, result: Stats): void => {
				try {
					if (err || !result.isDirectory()) {
						return resolve(filename); // just carry on using the filename
					} else {
						// we append the basefilename to the directory
						return resolve(path.join(filename, baseFilename));
					}
				} catch (e) {
					return reject(e);
				}
			});
		} else {
			// no filename, but we have a baseFilename
			return resolve(baseFilename);
		}
 
	});
}
 
// Default export: the module's helpers bundled into a single object.
export default {
	betterRequest, constructError, constructResult, constructOptionsWithJar, determineFilename
};