'use strict';

const Crawler = require('../lib/crawler');
const expect = require('chai').expect;
const nock = require('nock');

// Verifies that the crawler decodes non-UTF-8 responses correctly. A single
// latin-1 (ISO-8859-1) fixture file is served by nock in two flavours: with
// an explicit charset in the Content-Type header, and without one (so the
// charset must come from the <meta> tag inside the HTML itself).
describe('Encoding', () => {
    before(() => {
        // Drop any interceptors left over from other suites.
        nock.cleanAll();
    });

    const origin = 'http://czyborra.com';
    const encodingFileName = 'iso8859.html';
    const charsetName = 'ISO-8859-1';
    const path = `/charsets/${encodingFileName}`;
    const url = `${origin}${path}`;
    const pathWithoutCharsetHeader = `/charsets-noheader/${encodingFileName}`;
    const urlWithoutCharsetHeader = `${origin}${pathWithoutCharsetHeader}`;

    let crawler = null;

    beforeEach(() => {
        // retries: 0 so a failing mock surfaces immediately instead of retrying.
        crawler = new Crawler({
            retries: 0,
        });

        // Same fixture twice: once with the charset declared in the header…
        nock(origin)
            .get(path)
            .replyWithFile(200, `${__dirname}/${encodingFileName}`, {
                'Content-Type': `text/html;charset=${charsetName}`,
            });
        // …and once with a bare content type, forcing meta-tag detection.
        nock(origin)
            .get(pathWithoutCharsetHeader)
            .replyWithFile(200, `${__dirname}/${encodingFileName}`, {
                'Content-Type': 'text/html',
            });
    });

    it('should parse latin-1', (done) => {
        crawler.queue([{
            uri: url,
            callback: (error, result) => {
                expect(error).to.be.null;
                expect(result.charset).to.eql(charsetName);
                // 'Jörg' only decodes correctly under ISO-8859-1.
                expect(result.body.indexOf('Jörg')).to.be.above(0);
                done();
            },
        }]);
    });

    it('should return buffer if encoding = null', (done) => {
        crawler.queue([{
            uri: url,
            encoding: null,
            callback: (error, result) => {
                expect(error).to.be.null;
                // With decoding disabled the raw response bytes come back.
                expect(result.body instanceof Buffer).to.be.true;
                done();
            },
        }]);
    });

    it('should parse latin-1 if incomingEncoding = ISO-8859-1', (done) => {
        crawler.queue([{
            uri: url,
            incomingEncoding: charsetName,
            callback: (error, result) => {
                expect(error).to.be.null;
                expect(result.charset).to.eql(charsetName);
                expect(result.body.indexOf('Jörg')).to.be.above(0);
                done();
            },
        }]);
    });

    it('could not parse latin-1 if incomingEncoding = gb2312', (done) => {
        crawler.queue([{
            uri: url,
            incomingEncoding: 'gb2312',
            callback: (error, result) => {
                expect(error).to.be.null;
                // Forcing the wrong encoding must mangle the umlaut.
                expect(result.body.indexOf('Jörg')).to.equal(-1);
                done();
            },
        }]);
    });

    it('should parse charset from header ', (done) => {
        crawler.queue([{
            uri: url,
            callback: (error, result) => {
                expect(error).to.be.null;
                expect(result.charset).to.equal(charsetName);
                expect(result.body.indexOf('Jörg')).to.be.above(0);
                done();
            },
        }]);
    });

    it('should parse charset from meta tag in html if header does not contain content-type key ', (done) => {
        crawler.queue([{
            uri: urlWithoutCharsetHeader,
            callback: (error, result) => {
                expect(error).to.be.null;
                expect(result.charset).to.equal(charsetName);
                expect(result.body.indexOf('Jörg')).to.be.above(0);
                done();
            },
        }]);
    });
});
|