UNPKG

2.82 kBJavaScriptView Raw
1/*jshint expr:true */
2
3'use strict';
4
5const Crawler = require('../lib/crawler');
6const expect = require('chai').expect;
7const nock = require('nock');
8
/**
 * Encoding suite: checks how the crawler reports `result.charset` and decodes
 * `result.body` for a Latin-1 (ISO-8859-1) HTML fixture served through nock.
 *
 * Two mocked endpoints serve the same fixture file:
 *   - `url`                     -> Content-Type carries `charset=ISO-8859-1`
 *   - `urlWithoutCharsetHeader` -> Content-Type is plain `text/html`
 */
describe('Encoding', function() {
  const origin = 'http://czyborra.com';
  const encodingFileName = 'iso8859.html';
  const charsetName = 'ISO-8859-1';
  const path = `/charsets/${encodingFileName}`;
  const url = `${origin}${path}`;
  const pathWithoutCharsetHeader = `/charsets-noheader/${encodingFileName}`;
  const urlWithoutCharsetHeader = `${origin}${pathWithoutCharsetHeader}`;
  // Both mocked endpoints reply with this same on-disk fixture.
  const fixtureFile = `${__dirname}/${encodingFileName}`;

  let crawler = null;

  before(function() {
    // Start from a clean slate in case interceptors were registered earlier.
    nock.cleanAll();
  });

  beforeEach(function() {
    crawler = new Crawler({
      retries: 0,
    });

    nock(origin).get(path).replyWithFile(200, fixtureFile, { 'Content-Type': `text/html;charset=${charsetName}` });
    nock(origin).get(pathWithoutCharsetHeader).replyWithFile(200, fixtureFile, { 'Content-Type': 'text/html' });
  });

  afterEach(function() {
    // Each test requests only one of the two endpoints registered in
    // beforeEach, so the other interceptor is never consumed. Drop leftovers
    // here so they neither pile up across tests nor outlive this suite.
    nock.cleanAll();
  });

  it('should parse latin-1', function(done) {
    crawler.queue([{
      uri: url,
      callback: function(error, result) {
        expect(error).to.be.null;
        expect(result.charset).to.eql(charsetName);
        expect(result.body.indexOf('Jörg')).to.be.above(0);
        done();
      }
    }]);
  });

  it('should return buffer if encoding = null', function(done) {
    crawler.queue([{
      uri: url,
      encoding: null,
      callback: function(error, result) {
        expect(error).to.be.null;
        expect(result.body instanceof Buffer).to.be.true;
        done();
      }
    }]);
  });

  it('should parse latin-1 if incomingEncoding = ISO-8859-1', function(done) {
    crawler.queue([{
      uri: url,
      incomingEncoding: charsetName,
      callback: function(error, result) {
        expect(error).to.be.null;
        expect(result.charset).to.eql(charsetName);
        expect(result.body.indexOf('Jörg')).to.be.above(0);
        done();
      }
    }]);
  });

  it('could not parse latin-1 if incomingEncoding = gb2312', function(done) {
    crawler.queue([{
      uri: url,
      // Deliberately wrong charset: the Latin-1 text must not come out intact.
      incomingEncoding: 'gb2312',
      callback: function(error, result) {
        expect(error).to.be.null;
        expect(result.body.indexOf('Jörg')).to.equal(-1);
        done();
      }
    }]);
  });

  it('should parse charset from header ', function(done) {
    crawler.queue([{
      uri: url,
      callback: function(error, result) {
        expect(error).to.be.null;
        expect(result.charset).to.equal(charsetName);
        expect(result.body.indexOf('Jörg')).to.be.above(0);
        done();
      }
    }]);
  });

  // NOTE(review): the mocked response does send a Content-Type header
  // ('text/html'); it only omits the charset parameter. The title is kept
  // unchanged so existing --grep filters keep matching.
  it('should parse charset from meta tag in html if header does not contain content-type key ', function(done) {
    crawler.queue([{
      uri: urlWithoutCharsetHeader,
      callback: function(error, result) {
        expect(error).to.be.null;
        expect(result.charset).to.equal(charsetName);
        expect(result.body.indexOf('Jörg')).to.be.above(0);
        done();
      }
    }]);
  });
});
106
107
108