'use strict';

const Crawler = require('../lib/crawler');
const expect = require('chai').expect;
const nock = require('nock');

// Verifies that the crawler decodes non-UTF-8 responses correctly. A single
// latin-1 (ISO-8859-1) fixture file is served by nock in two flavours: with
// an explicit charset in the Content-Type header, and without one (so the
// charset must come from the <meta> tag inside the HTML itself).
describe('Encoding', () => {
    before(() => {
        // Drop any interceptors left over from other suites.
        nock.cleanAll();
    });

    const origin = 'http://czyborra.com';
    const encodingFileName = 'iso8859.html';
    const charsetName = 'ISO-8859-1';
    const path = `/charsets/${encodingFileName}`;
    const url = `${origin}${path}`;
    const pathWithoutCharsetHeader = `/charsets-noheader/${encodingFileName}`;
    const urlWithoutCharsetHeader = `${origin}${pathWithoutCharsetHeader}`;

    let crawler = null;

    beforeEach(() => {
        // retries: 0 so a failing mock surfaces immediately instead of retrying.
        crawler = new Crawler({
            retries: 0,
        });

        // Same fixture twice: once with the charset declared in the header…
        nock(origin)
            .get(path)
            .replyWithFile(200, `${__dirname}/${encodingFileName}`, {
                'Content-Type': `text/html;charset=${charsetName}`,
            });
        // …and once with a bare content type, forcing meta-tag detection.
        nock(origin)
            .get(pathWithoutCharsetHeader)
            .replyWithFile(200, `${__dirname}/${encodingFileName}`, {
                'Content-Type': 'text/html',
            });
    });

    it('should parse latin-1', (done) => {
        crawler.queue([{
            uri: url,
            callback: (error, result) => {
                expect(error).to.be.null;
                expect(result.charset).to.eql(charsetName);
                // 'Jörg' only decodes correctly under ISO-8859-1.
                expect(result.body.indexOf('Jörg')).to.be.above(0);
                done();
            },
        }]);
    });

    it('should return buffer if encoding = null', (done) => {
        crawler.queue([{
            uri: url,
            encoding: null,
            callback: (error, result) => {
                expect(error).to.be.null;
                // With decoding disabled the raw response bytes come back.
                expect(result.body instanceof Buffer).to.be.true;
                done();
            },
        }]);
    });

    it('should parse latin-1 if incomingEncoding = ISO-8859-1', (done) => {
        crawler.queue([{
            uri: url,
            incomingEncoding: charsetName,
            callback: (error, result) => {
                expect(error).to.be.null;
                expect(result.charset).to.eql(charsetName);
                expect(result.body.indexOf('Jörg')).to.be.above(0);
                done();
            },
        }]);
    });

    it('could not parse latin-1 if incomingEncoding = gb2312', (done) => {
        crawler.queue([{
            uri: url,
            incomingEncoding: 'gb2312',
            callback: (error, result) => {
                expect(error).to.be.null;
                // Forcing the wrong encoding must mangle the umlaut.
                expect(result.body.indexOf('Jörg')).to.equal(-1);
                done();
            },
        }]);
    });

    it('should parse charset from header ', (done) => {
        crawler.queue([{
            uri: url,
            callback: (error, result) => {
                expect(error).to.be.null;
                expect(result.charset).to.equal(charsetName);
                expect(result.body.indexOf('Jörg')).to.be.above(0);
                done();
            },
        }]);
    });

    it('should parse charset from meta tag in html if header does not contain content-type key ', (done) => {
        crawler.queue([{
            uri: urlWithoutCharsetHeader,
            callback: (error, result) => {
                expect(error).to.be.null;
                expect(result.charset).to.equal(charsetName);
                expect(result.body.indexOf('Jörg')).to.be.above(0);
                done();
            },
        }]);
    });
});
|