UNPKG

7.74 kBJavaScriptView Raw
1"use strict";
2const whatwgEncoding = require("whatwg-encoding");
3
4// https://html.spec.whatwg.org/#encoding-sniffing-algorithm
5module.exports = (uint8Array, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => {
6 let encoding = whatwgEncoding.getBOMEncoding(uint8Array);
7
8 if (encoding === null && transportLayerEncodingLabel !== undefined) {
9 encoding = whatwgEncoding.labelToName(transportLayerEncodingLabel);
10 }
11
12 if (encoding === null) {
13 encoding = prescanMetaCharset(uint8Array);
14 }
15
16 if (encoding === null) {
17 encoding = defaultEncoding;
18 }
19
20 return encoding;
21};
22
23// https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
/**
 * Prescans the start of a byte stream for an encoding declared by a `<meta>` tag.
 *
 * Only the first 1024 bytes (or fewer, if the input is shorter) are scanned. Comments are
 * skipped, `<meta` tags have their attributes inspected, other start AND end tags have their
 * attributes consumed (so a ">" inside a quoted attribute value cannot end the tag early),
 * and `<!`, `</`, `<?` constructs are skipped up to the next ">".
 *
 * @param {Uint8Array} uint8Array - Bytes to scan.
 * @returns {string|null} The encoding name obtained for the first usable `<meta>` declaration
 *   (UTF-16LE/UTF-16BE are reported as "UTF-8" and x-user-defined as "windows-1252"),
 *   or `null` when no usable declaration is found.
 */
function prescanMetaCharset(uint8Array) {
  // A-Z or a-z; `undefined` (reads past the end of the array) compares false.
  const isASCIIAlpha = byte => (byte >= 0x41 && byte <= 0x5A) || (byte >= 0x61 && byte <= 0x7A);

  const l = Math.min(uint8Array.byteLength, 1024);
  for (let i = 0; i < l; i++) {
    // Only "<" can start a construct the prescan cares about.
    if (uint8Array[i] !== 0x3C) {
      continue;
    }

    const c1 = uint8Array[i + 1];
    const c2 = uint8Array[i + 2];
    const c3 = uint8Array[i + 3];
    const c4 = uint8Array[i + 4];
    const c5 = uint8Array[i + 5];

    if (c1 === 0x21 && c2 === 0x2D && c3 === 0x2D) {
      // "<!--": advance to the first ">" preceded by "--". Starting at i + 4 lets the two
      // dashes of "<!--" themselves count, so "<!-->" is a complete comment.
      for (i += 4; i < l; i++) {
        if (uint8Array[i] === 0x3E && uint8Array[i - 1] === 0x2D && uint8Array[i - 2] === 0x2D) {
          break;
        }
      }
    } else if ((c1 === 0x4D || c1 === 0x6D) &&
               (c2 === 0x45 || c2 === 0x65) &&
               (c3 === 0x54 || c3 === 0x74) &&
               (c4 === 0x41 || c4 === 0x61) &&
               (isSpaceCharacter(c5) || c5 === 0x2F)) {
      // "<meta" (any case) followed by whitespace or "/".
      i += 6;
      const attributeList = new Set();
      let gotPragma = false;
      let needPragma = null;
      let charset = null;

      let attrRes;
      do {
        attrRes = getAttribute(uint8Array, i, l);
        // Only the first occurrence of each attribute name is considered.
        if (attrRes.attr && !attributeList.has(attrRes.attr.name)) {
          attributeList.add(attrRes.attr.name);
          if (attrRes.attr.name === "http-equiv") {
            gotPragma = attrRes.attr.value === "content-type";
          } else if (attrRes.attr.name === "content" && !charset) {
            charset = extractCharacterEncodingFromMeta(attrRes.attr.value);
            if (charset !== null) {
              needPragma = true;
            }
          } else if (attrRes.attr.name === "charset") {
            charset = whatwgEncoding.labelToName(attrRes.attr.value);
            needPragma = false;
          }
        }
        i = attrRes.i;
      } while (attrRes.attr);

      // Neither `charset` nor a usable `content` attribute was seen.
      if (needPragma === null) {
        continue;
      }
      // A `content` declaration only counts together with http-equiv="content-type".
      if (needPragma === true && gotPragma === false) {
        continue;
      }
      // The declared label did not map to an encoding.
      if (charset === null) {
        continue;
      }

      // A document being prescanned as bytes cannot actually be UTF-16.
      if (charset === "UTF-16LE" || charset === "UTF-16BE") {
        charset = "UTF-8";
      }
      if (charset === "x-user-defined") {
        charset = "windows-1252";
      }

      return charset;
    } else if (isASCIIAlpha(c1) || (c1 === 0x2F && isASCIIAlpha(c2))) {
      // "<" + optional "/" + ASCII letter: a start or end tag. Skip the tag name, then consume
      // its attributes so that quoted values containing ">" or "<meta" are stepped over.
      for (i += 2; i < l; i++) {
        const c = uint8Array[i];
        // space or >
        if (isSpaceCharacter(c) || c === 0x3E) {
          break;
        }
      }
      let attrRes;
      do {
        attrRes = getAttribute(uint8Array, i, l);
        i = attrRes.i;
      } while (attrRes.attr);
    } else if (c1 === 0x21 || c1 === 0x2F || c1 === 0x3F) {
      // "<!", "</" (not followed by a letter) or "<?": skip to the next ">".
      for (i += 2; i < l; i++) {
        if (uint8Array[i] === 0x3E) {
          break;
        }
      }
    }
  }
  return null;
}
125
126// https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
/**
 * Reads one attribute from a tag while prescanning bytes for an encoding declaration.
 *
 * Scanning starts at index `i` and never reads at or beyond index `l`. Names and values are
 * ASCII-lowercased; each byte becomes one character via `String.fromCharCode`.
 *
 * @param {Uint8Array} uint8Array - Bytes being scanned.
 * @param {number} i - Index at which to start looking for an attribute.
 * @param {number} l - Exclusive upper bound of the scan window.
 * @returns {{attr?: {name: string, value: string}, i: number}} `attr` is present when a
 *   complete attribute was read and absent when the tag's ">" was reached or the window ran
 *   out (including in the middle of a name or value). `i` is where the caller should resume:
 *   on the terminating byte (">", "/", whitespace, or the byte following a valueless name),
 *   just past a closing quote, or `l` when the window was exhausted.
 */
function getAttribute(uint8Array, i, l) {
  // ASCII-lowercases a byte (A-Z -> a-z) and returns it as a one-character string.
  const toLowerChar = byte => String.fromCharCode(byte >= 0x41 && byte <= 0x5A ? byte + 0x20 : byte);

  // Skip whitespace and "/" in front of the attribute name.
  while (i < l && (isSpaceCharacter(uint8Array[i]) || uint8Array[i] === 0x2F)) {
    i++;
  }
  // Window exhausted, or ">" closes the tag: there is no attribute.
  if (i >= l || uint8Array[i] === 0x3E) {
    return { i };
  }

  let name = "";
  const noValue = "";

  // Attribute name: runs until "=", whitespace, "/" or ">".
  for (; i < l; i++) {
    const c = uint8Array[i];
    // "=" only ends the name once the name is non-empty; a leading "=" is part of the name.
    if (c === 0x3D && name !== "") {
      break;
    }
    if (isSpaceCharacter(c)) {
      break;
    }
    // / or >
    if (c === 0x2F || c === 0x3E) {
      return { attr: { name, value: noValue }, i };
    }
    name += toLowerChar(c);
  }

  // Whitespace between the name and a possible "=".
  while (i < l && isSpaceCharacter(uint8Array[i])) {
    i++;
  }
  if (i >= l) {
    // Ran out of bytes before learning whether a value follows; report nothing rather than
    // peeking at a byte outside the window.
    return { i };
  }
  if (uint8Array[i] !== 0x3D) {
    // Attribute without a value.
    return { attr: { name, value: noValue }, i };
  }
  i++; // past "="

  // Whitespace between "=" and the value.
  while (i < l && isSpaceCharacter(uint8Array[i])) {
    i++;
  }
  if (i >= l) {
    return { i };
  }

  const first = uint8Array[i];
  let value = "";

  // " or '
  if (first === 0x22 || first === 0x27) {
    for (i++; i < l; i++) {
      const c = uint8Array[i];
      if (c === first) {
        return { attr: { name, value }, i: i + 1 };
      }
      value += toLowerChar(c);
    }
    // The closing quote was never seen: the value is incomplete, so no attribute is reported.
    return { i };
  }

  // ">" directly after "=": empty value.
  if (first === 0x3E) {
    return { attr: { name, value }, i };
  }

  // Unquoted value: runs until whitespace or ">".
  value += toLowerChar(first);
  for (i++; i < l; i++) {
    const c = uint8Array[i];
    if (isSpaceCharacter(c) || c === 0x3E) {
      return { attr: { name, value }, i };
    }
    value += toLowerChar(c);
  }
  // Window exhausted before the unquoted value was terminated.
  return { i };
}
239
/**
 * Extracts the encoding declared in the value of a `<meta content="...">` attribute,
 * e.g. "text/html; charset=utf-8".
 *
 * The first occurrence of "charset" (case-insensitive) that is followed by optional ASCII
 * whitespace and "=" is used. The label after "=" may be wrapped in matching single or double
 * quotes; otherwise it ends at the first ASCII whitespace or ";". The label is converted with
 * `whatwgEncoding.labelToName`.
 *
 * @param {string} string - The attribute value to inspect.
 * @returns {string|null} Whatever `whatwgEncoding.labelToName` returns for the label, or
 *   `null` when no `charset=` declaration is present, nothing follows "=", the label is a
 *   single unquoted character, or an opening quote is never closed.
 */
function extractCharacterEncodingFromMeta(string) {
  let position = 0;

  while (true) {
    const indexOfCharset = string.substring(position).search(/charset/ui);

    if (indexOfCharset === -1) {
      return null;
    }
    let subPosition = position + indexOfCharset + "charset".length;

    // string.charCodeAt() yields NaN past the end of the string, which is not whitespace, so
    // these loops stop at the end instead of throwing on `undefined.charCodeAt`.
    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
      ++subPosition;
    }

    if (string[subPosition] !== "=") {
      // Not a declaration; resume the search just before the byte that broke the match.
      position = subPosition - 1;
      continue;
    }

    ++subPosition;

    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
      ++subPosition;
    }

    position = subPosition;
    break;
  }

  // "charset=" with nothing (or only whitespace) after it declares no encoding.
  if (position >= string.length) {
    return null;
  }

  if (string[position] === "\"" || string[position] === "'") {
    const nextIndex = string.indexOf(string[position], position + 1);

    if (nextIndex !== -1) {
      return whatwgEncoding.labelToName(string.substring(position + 1, nextIndex));
    }

    // It is an unmatched quotation mark
    return null;
  }

  // A label consisting of a single trailing character is rejected outright.
  if (string.length === position + 1) {
    return null;
  }

  const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/u);
  const end = indexOfASCIIWhitespaceOrSemicolon === -1 ?
    string.length :
    position + indexOfASCIIWhitespaceOrSemicolon + 1;

  return whatwgEncoding.labelToName(string.substring(position, end));
}
292
/**
 * Tells whether a byte / char code is one of the five whitespace code points this file's
 * parsers skip: TAB, LF, FF, CR or SPACE.
 *
 * @param {number} c - Byte value or UTF-16 code unit to classify.
 * @returns {boolean} `true` for 0x09, 0x0A, 0x0C, 0x0D and 0x20; `false` for anything else.
 */
function isSpaceCharacter(c) {
  switch (c) {
    case 0x09: // TAB
    case 0x0A: // LF
    case 0x0C: // FF
    case 0x0D: // CR
    case 0x20: // SPACE
      return true;
    default:
      return false;
  }
}