// Retrieved from the UNPKG file viewer (7.39 kB, JavaScript, raw view).
1"use strict";
2const whatwgEncoding = require("whatwg-encoding");
3
4// https://html.spec.whatwg.org/#encoding-sniffing-algorithm
5module.exports = (buffer, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => {
6 let encoding = whatwgEncoding.getBOMEncoding(buffer); // see https://github.com/whatwg/html/issues/1910
7
8 if (encoding === null && transportLayerEncodingLabel !== undefined) {
9 encoding = whatwgEncoding.labelToName(transportLayerEncodingLabel);
10 }
11
12 if (encoding === null) {
13 encoding = prescanMetaCharset(buffer);
14 }
15
16 if (encoding === null) {
17 encoding = defaultEncoding;
18 }
19
20 return encoding;
21};
22
23// https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
24function prescanMetaCharset(buffer) {
25 const l = Math.min(buffer.length, 1024);
26 for (let i = 0; i < l; i++) {
27 let c = buffer[i];
28 if (c === 0x3C) {
29 // "<"
30 const c1 = buffer[i + 1];
31 const c2 = buffer[i + 2];
32 const c3 = buffer[i + 3];
33 const c4 = buffer[i + 4];
34 const c5 = buffer[i + 5];
35 // !-- (comment start)
36 if (c1 === 0x21 && c2 === 0x2D && c3 === 0x2D) {
37 i += 4;
38 for (; i < l; i++) {
39 c = buffer[i];
40 const cMinus1 = buffer[i - 1];
41 const cMinus2 = buffer[i - 2];
42 // --> (comment end)
43 if (c === 0x3E && cMinus1 === 0x2D && cMinus2 === 0x2D) {
44 break;
45 }
46 }
47 } else if ((c1 === 0x4D || c1 === 0x6D) &&
48 (c2 === 0x45 || c2 === 0x65) &&
49 (c3 === 0x54 || c3 === 0x74) &&
50 (c4 === 0x41 || c4 === 0x61) &&
51 (isSpaceCharacter(c5) || c5 === 0x2F)) {
52 // "meta" + space or /
53 i += 6;
54 let gotPragma = false;
55 let needPragma = null;
56 let charset = null;
57
58 let attrRes;
59 do {
60 attrRes = getAttribute(buffer, i, l);
61 if (attrRes.attr) {
62 if (attrRes.attr.name === "http-equiv") {
63 gotPragma = attrRes.attr.value === "content-type";
64 } else if (attrRes.attr.name === "content" && !charset) {
65 charset = extractCharacterEncodingFromMeta(attrRes.attr.value);
66 if (charset !== null) {
67 needPragma = true;
68 }
69 } else if (attrRes.attr.name === "charset") {
70 charset = whatwgEncoding.labelToName(attrRes.attr.value);
71 needPragma = false;
72 }
73 }
74 i = attrRes.i;
75 } while (attrRes.attr);
76
77 if (needPragma === null) {
78 continue;
79 }
80 if (needPragma === true && gotPragma === false) {
81 continue;
82 }
83 if (charset === null) {
84 continue;
85 }
86
87 if (charset === "UTF-16LE" || charset === "UTF-16BE") {
88 charset = "UTF-8";
89 }
90 if (charset === "x-user-defined") {
91 charset = "windows-1252";
92 }
93
94 return charset;
95 } else if ((c1 >= 0x41 && c1 <= 0x5A) || (c1 >= 0x61 && c1 <= 0x7A)) {
96 // a-z or A-Z
97 for (i += 2; i < l; i++) {
98 c = buffer[i];
99 // space or >
100 if (isSpaceCharacter(c) || c === 0x3E) {
101 break;
102 }
103 }
104 let attrRes;
105 do {
106 attrRes = getAttribute(buffer, i, l);
107 i = attrRes.i;
108 } while (attrRes.attr);
109 } else if (c1 === 0x21 || c1 === 0x2F || c1 === 0x3F) {
110 // ! or / or ?
111 for (i += 2; i < l; i++) {
112 c = buffer[i];
113 // >
114 if (c === 0x3E) {
115 break;
116 }
117 }
118 }
119 }
120 }
121 return null;
122}
123
124// https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
125function getAttribute(buffer, i, l) {
126 for (; i < l; i++) {
127 let c = buffer[i];
128 // space or /
129 if (isSpaceCharacter(c) || c === 0x2F) {
130 continue;
131 }
132 // ">"
133 if (c === 0x3E) {
134 break;
135 }
136 let name = "";
137 let value = "";
138 nameLoop:for (; i < l; i++) {
139 c = buffer[i];
140 // "="
141 if (c === 0x3D && name !== "") {
142 i++;
143 break;
144 }
145 // space
146 if (isSpaceCharacter(c)) {
147 for (i++; i < l; i++) {
148 c = buffer[i];
149 // space
150 if (isSpaceCharacter(c)) {
151 continue;
152 }
153 // not "="
154 if (c !== 0x3D) {
155 return { attr: { name, value }, i };
156 }
157
158 i++;
159 break nameLoop;
160 }
161 break;
162 }
163 // / or >
164 if (c === 0x2F || c === 0x3E) {
165 return { attr: { name, value }, i };
166 }
167 // A-Z
168 if (c >= 0x41 && c <= 0x5A) {
169 name += String.fromCharCode(c + 0x20); // lowercase
170 } else {
171 name += String.fromCharCode(c);
172 }
173 }
174 c = buffer[i];
175 // space
176 if (isSpaceCharacter(c)) {
177 for (i++; i < l; i++) {
178 c = buffer[i];
179 // space
180 if (isSpaceCharacter(c)) {
181 continue;
182 } else {
183 break;
184 }
185 }
186 }
187 // " or '
188 if (c === 0x22 || c === 0x27) {
189 const quote = c;
190 for (i++; i < l; i++) {
191 c = buffer[i];
192
193 if (c === quote) {
194 i++;
195 return { attr: { name, value }, i };
196 }
197
198 // A-Z
199 if (c >= 0x41 && c <= 0x5A) {
200 value += String.fromCharCode(c + 0x20); // lowercase
201 } else {
202 value += String.fromCharCode(c);
203 }
204 }
205 }
206
207 // >
208 if (c === 0x3E) {
209 return { attr: { name, value }, i };
210 }
211
212 // A-Z
213 if (c >= 0x41 && c <= 0x5A) {
214 value += String.fromCharCode(c + 0x20); // lowercase
215 } else {
216 value += String.fromCharCode(c);
217 }
218
219 for (i++; i < l; i++) {
220 c = buffer[i];
221
222 // space or >
223 if (isSpaceCharacter(c) || c === 0x3E) {
224 return { attr: { name, value }, i };
225 }
226
227 // A-Z
228 if (c >= 0x41 && c <= 0x5A) {
229 value += String.fromCharCode(c + 0x20); // lowercase
230 } else {
231 value += String.fromCharCode(c);
232 }
233 }
234 }
235 return { i };
236}
237
238function extractCharacterEncodingFromMeta(string) {
239 let position = 0;
240
241 while (true) {
242 let subPosition = string.substring(position).search(/charset/i);
243
244 if (subPosition === -1) {
245 return null;
246 }
247 subPosition += "charset".length;
248
249 while (isSpaceCharacter(string[subPosition].charCodeAt(0))) {
250 ++subPosition;
251 }
252
253 if (string[subPosition] !== "=") {
254 position = subPosition - 1;
255 continue;
256 }
257
258 ++subPosition;
259
260 while (isSpaceCharacter(string[subPosition].charCodeAt(0))) {
261 ++subPosition;
262 }
263
264 position = subPosition;
265 break;
266 }
267
268 if (string[position] === "\"" || string[position] === "'") {
269 const nextIndex = string.indexOf(string[position], position + 1);
270
271 if (nextIndex !== -1) {
272 return whatwgEncoding.labelToName(string.substring(position + 1, nextIndex));
273 }
274
275 // It is an unmatched quotation mark
276 return null;
277 }
278
279 if (string.length === position + 1) {
280 return null;
281 }
282
283 let end = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/);
284 if (end === -1) {
285 end = string.length;
286 }
287 return whatwgEncoding.labelToName(string.substring(position, end));
288}
289
290function isSpaceCharacter(c) {
291 return c === 0x09 || c === 0x0A || c === 0x0C || c === 0x0D || c === 0x20;
292}