UNPKG

15.9 kBJavaScriptView Raw
1"use strict";
2// *****************************************************************************
3// Copyright (C) 2020 TypeFox and others.
4//
5// This program and the accompanying materials are made available under the
6// terms of the Eclipse Public License v. 2.0 which is available at
7// http://www.eclipse.org/legal/epl-2.0.
8//
9// This Source Code may also be made available under the following Secondary
10// Licenses when the conditions for such availability set forth in the Eclipse
11// Public License v. 2.0 are satisfied: GNU General Public License, version 2
12// with the GNU Classpath Exception which is available at
13// https://www.gnu.org/software/classpath/license.html.
14//
15// SPDX-License-Identifier: EPL-2.0 OR GPL-2.0 WITH Classpath-exception-2.0
16// *****************************************************************************
17/*---------------------------------------------------------------------------------------------
18 * Copyright (c) Microsoft Corporation. All rights reserved.
19 * Licensed under the MIT License. See License.txt in the project root for license information.
20 *--------------------------------------------------------------------------------------------*/
21// based on https://github.com/microsoft/vscode/blob/04c36be045a94fee58e5f8992d3e3fd980294a84/src/vs/workbench/services/textfile/common/encoding.ts
/**
 * TypeScript decorator helper: applies `decorators` to a class (two
 * arguments) or to a class member (three or four arguments) and returns the
 * resulting class / property descriptor.
 *
 * An already installed `this.__decorate` is reused when present.
 *
 * @param decorators decorator functions, applied from last to first
 * @param target the class constructor, or the object owning the member
 * @param key member name (member decorators only)
 * @param desc member descriptor; `null` means "look it up on `target`"
 */
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    const argCount = arguments.length;
    let result;
    if (argCount < 3) {
        // class decorator: the value being decorated is the class itself
        result = target;
    }
    else {
        if (desc === null) {
            desc = Object.getOwnPropertyDescriptor(target, key);
        }
        result = desc;
    }
    if (typeof Reflect === "object" && typeof Reflect.decorate === "function") {
        // delegate to a Reflect.decorate implementation when one is installed
        result = Reflect.decorate(decorators, target, key, desc);
    }
    else {
        for (let index = decorators.length - 1; index >= 0; index--) {
            const decorator = decorators[index];
            if (!decorator) {
                continue;
            }
            let produced;
            if (argCount < 3) {
                produced = decorator(result);
            }
            else if (argCount > 3) {
                produced = decorator(target, key, result);
            }
            else {
                produced = decorator(target, key);
            }
            // a decorator that returns nothing keeps the previous result
            result = produced || result;
        }
    }
    if (argCount > 3 && result) {
        // a descriptor was passed explicitly: install the decorated descriptor
        Object.defineProperty(target, key, result);
    }
    return result;
};
28Object.defineProperty(exports, "__esModule", { value: true });
29exports.EncodingService = void 0;
30/* eslint-disable no-null/no-null */
31const iconv = require("iconv-lite");
32const safer_buffer_1 = require("safer-buffer");
33const inversify_1 = require("inversify");
34const buffer_1 = require("./buffer");
35const encodings_1 = require("./encodings");
36const stream_1 = require("./stream");
// Number of leading bytes inspected to decide whether content is binary.
const ZERO_BYTE_DETECTION_BUFFER_MAX_LEN = 512;
// Without encoding auto guessing, a small number of bytes is enough.
const NO_ENCODING_GUESS_MIN_BYTES = 512;
// With auto guessing, a lot more content should be read before guessing.
const AUTO_ENCODING_GUESS_MIN_BYTES = 8 * 512;
// Upper limit for the number of bytes passed on to jschardet.
const AUTO_ENCODING_GUESS_MAX_BYTES = 128 * 512;
// Encodings that are explicitly ignored when auto guessing:
// - ASCII: never wanted (most UTF-8 files would happily be detected as ASCII
//   and then non-ASCII characters could not be typed anymore)
// - UTF-16: covered by this module's own UTF-16 detection logic
// - UTF-32: not supported (comment inherited from the VS Code implementation
//   this file is based on)
const IGNORE_ENCODINGS = [
    'ascii',
    'utf-16',
    'utf-32'
];
/**
 * Converts between strings and binary buffers using iconv-lite and detects
 * the encoding of binary content (BOM check, zero-byte heuristic for
 * UTF-16 / binary content and, optionally, guessing via jschardet).
 */
let EncodingService = class EncodingService {
    /**
     * Encodes a string into a binary buffer.
     *
     * @param value the text to encode
     * @param options optional `encoding` (UTF-8 when missing) and `hasBOM`
     * @returns the encoded content as a `BinaryBuffer`
     */
    encode(value, options) {
        const addBOM = options ? options.hasBOM : undefined;
        const encoding = this.toIconvEncoding(options ? options.encoding : undefined);
        // UTF-8 without BOM is produced directly, bypassing iconv
        if (encoding === encodings_1.UTF8 && !addBOM) {
            return buffer_1.BinaryBuffer.fromString(value);
        }
        const buffer = iconv.encode(value, encoding, { addBOM });
        return buffer_1.BinaryBuffer.wrap(buffer);
    }
    /**
     * Decodes binary content into a string.
     *
     * @param value object whose `buffer` property holds the bytes to decode
     * @param encoding encoding name; UTF-8 when missing
     * @returns the decoded text
     */
    decode(value, encoding) {
        const buffer = safer_buffer_1.Buffer.from(value.buffer);
        return iconv.decode(buffer, this.toIconvEncoding(encoding));
    }
    /**
     * Tells whether iconv-lite knows the given encoding.
     *
     * @param encoding encoding name; UTF-8 when missing
     */
    exists(encoding) {
        return iconv.encodingExists(this.toIconvEncoding(encoding));
    }
    /**
     * Maps an encoding name to the name that is handed to iconv-lite.
     *
     * @param encoding encoding name, may be missing
     * @returns UTF-8 for a missing encoding and for "UTF-8 with BOM",
     *          otherwise the given name unchanged
     */
    toIconvEncoding(encoding) {
        if (encoding === encodings_1.UTF8_with_bom || !encoding) {
            return encodings_1.UTF8; // iconv does not distinguish UTF 8 with or without BOM, so we need to help it
        }
        return encoding;
    }
    /**
     * Determines whether content written with `encoding` has to carry a BOM.
     *
     * @param encoding the encoding the resource is going to be written with
     * @param options `overwriteEncoding` flag and a `read(length)` function
     *                that is asked for the first bytes of the existing resource
     * @returns `{ encoding, hasBOM }`
     */
    async toResourceEncoding(encoding, options) {
        // Some encodings come with a BOM automatically
        const alwaysHasBOM = encoding === encodings_1.UTF16be
            || encoding === encodings_1.UTF16le
            || encoding === encodings_1.UTF8_with_bom;
        if (alwaysHasBOM) {
            return { encoding, hasBOM: true };
        }
        // Ensure that we preserve an existing BOM if found for UTF8
        // unless we are instructed to overwrite the encoding
        const overwriteEncoding = options ? options.overwriteEncoding : undefined;
        if (!overwriteEncoding && encoding === encodings_1.UTF8) {
            try {
                // only read as many bytes as a UTF-8 BOM has to avoid fetching the whole content on write
                const buffer = await options.read(encodings_1.UTF8_BOM.length);
                const detected = this.detectEncodingByBOMFromBuffer(safer_buffer_1.Buffer.from(buffer), buffer.byteLength);
                if (detected === encodings_1.UTF8_with_bom) {
                    return { encoding, hasBOM: true };
                }
            }
            catch (error) {
                // deliberately best-effort: the file might not exist (this also
                // covers missing `options`), in which case no BOM is reported
            }
        }
        return { encoding, hasBOM: false };
    }
    /**
     * Detects the encoding of binary content.
     *
     * The BOM is checked first. Unless a UTF-16 BOM was found, the first
     * `ZERO_BYTE_DETECTION_BUFFER_MAX_LEN` bytes are scanned for zero bytes
     * to recognize UTF-16 without BOM or binary content. If nothing was
     * detected and `autoGuessEncoding` is set, jschardet is asked.
     *
     * @param data object with `buffer` (the bytes) and `byteLength`
     * @param autoGuessEncoding whether to guess when nothing was detected
     * @returns `{ seemsBinary, encoding }`; `encoding` may be `undefined`
     */
    async detectEncoding(data, autoGuessEncoding) {
        const buffer = safer_buffer_1.Buffer.from(data.buffer);
        const bytesRead = data.byteLength;
        // Always first check for BOM to find out about encoding
        let encoding = this.detectEncodingByBOMFromBuffer(buffer, bytesRead);
        // Detect 0 bytes to see if file is binary or UTF-16 LE/BE
        // unless we already know that this file has a UTF-16 encoding
        let seemsBinary = false;
        if (encoding !== encodings_1.UTF16be && encoding !== encodings_1.UTF16le && buffer) {
            let couldBeUTF16LE = true; // e.g. 0xAA 0x00
            let couldBeUTF16BE = true; // e.g. 0x00 0xAA
            let containsZeroByte = false;
            // This is a simplified guess to detect UTF-16 BE or LE by just checking if
            // the first 512 bytes have the 0-byte at a specific location. For UTF-16 LE
            // this would be the odd byte index and for UTF-16 BE the even one.
            // Note: this can produce false positives (a binary file that uses a 2-byte
            // encoding of the same format as UTF-16) and false negatives (a UTF-16 file
            // that is using 4 bytes to encode a character).
            for (let i = 0; i < bytesRead && i < ZERO_BYTE_DETECTION_BUFFER_MAX_LEN; i++) {
                const isEndian = (i % 2 === 1); // assume 2-byte sequences typical for UTF-16
                const isZeroByte = (buffer.readUInt8(i) === 0);
                if (isZeroByte) {
                    containsZeroByte = true;
                }
                // UTF-16 LE: expect e.g. 0xAA 0x00
                if (couldBeUTF16LE && (isEndian && !isZeroByte || !isEndian && isZeroByte)) {
                    couldBeUTF16LE = false;
                }
                // UTF-16 BE: expect e.g. 0x00 0xAA
                if (couldBeUTF16BE && (isEndian && isZeroByte || !isEndian && !isZeroByte)) {
                    couldBeUTF16BE = false;
                }
                // Stop if this is neither UTF16-LE nor UTF16-BE and thus treat as binary
                if (isZeroByte && !couldBeUTF16LE && !couldBeUTF16BE) {
                    break;
                }
            }
            // Handle case of 0-byte included
            if (containsZeroByte) {
                if (couldBeUTF16LE) {
                    encoding = encodings_1.UTF16le;
                }
                else if (couldBeUTF16BE) {
                    encoding = encodings_1.UTF16be;
                }
                else {
                    seemsBinary = true;
                }
            }
        }
        // Auto guess encoding if configured
        if (autoGuessEncoding && !seemsBinary && !encoding && buffer) {
            const guessedEncoding = await this.guessEncodingByBuffer(buffer.slice(0, bytesRead));
            return {
                seemsBinary: false,
                encoding: guessedEncoding
            };
        }
        return { seemsBinary, encoding };
    }
    /**
     * Detects UTF-16 BE, UTF-16 LE or UTF-8 from a leading byte order mark.
     *
     * @param buffer the bytes to inspect, may be missing
     * @param bytesRead number of valid bytes in `buffer`
     * @returns the detected encoding, or `undefined` when there is no BOM
     *          or not enough bytes to hold one
     */
    detectEncodingByBOMFromBuffer(buffer, bytesRead) {
        if (!buffer || bytesRead < encodings_1.UTF16be_BOM.length) {
            return undefined;
        }
        const b0 = buffer.readUInt8(0);
        const b1 = buffer.readUInt8(1);
        // UTF-16 BE
        if (b0 === encodings_1.UTF16be_BOM[0] && b1 === encodings_1.UTF16be_BOM[1]) {
            return encodings_1.UTF16be;
        }
        // UTF-16 LE
        if (b0 === encodings_1.UTF16le_BOM[0] && b1 === encodings_1.UTF16le_BOM[1]) {
            return encodings_1.UTF16le;
        }
        // the UTF-8 BOM is longer than the UTF-16 ones, so re-check the length
        if (bytesRead < encodings_1.UTF8_BOM.length) {
            return undefined;
        }
        const b2 = buffer.readUInt8(2);
        // UTF-8
        if (b0 === encodings_1.UTF8_BOM[0] && b1 === encodings_1.UTF8_BOM[1] && b2 === encodings_1.UTF8_BOM[2]) {
            return encodings_1.UTF8_with_bom;
        }
        return undefined;
    }
    /**
     * Guesses the encoding of a buffer with jschardet (loaded lazily).
     *
     * @param buffer the bytes to analyze; at most
     *               `AUTO_ENCODING_GUESS_MAX_BYTES` of them are looked at
     * @returns the guessed encoding, or `undefined` when nothing was guessed
     *          or the guess is listed in `IGNORE_ENCODINGS`
     */
    async guessEncodingByBuffer(buffer) {
        const jschardet = await Promise.resolve().then(() => require('jschardet'));
        const guessed = jschardet.detect(buffer.slice(0, AUTO_ENCODING_GUESS_MAX_BYTES)); // ensure to limit buffer for guessing due to https://github.com/aadsm/jschardet/issues/53
        if (!guessed || !guessed.encoding) {
            return undefined;
        }
        const enc = guessed.encoding.toLowerCase();
        if (IGNORE_ENCODINGS.indexOf(enc) >= 0) {
            return undefined; // some encodings are never accepted as a guess
        }
        return this.toIconvEncoding(guessed.encoding);
    }
    /**
     * Decodes a stream of binary chunks into a stream of strings.
     *
     * Chunks are buffered until enough bytes are available to detect the
     * encoding (or until the source ends); then the decoder is created, the
     * buffered content is decoded and the returned promise resolves.
     *
     * @param source stream emitting `data` (chunks with `buffer` and
     *               `byteLength`), `error` and `end`, supporting
     *               `pause()` / `resume()`
     * @param options `guessEncoding`, async `overwriteEncoding(detected)` and
     *                optionally `minBytesRequiredForDetection`
     * @returns promise of `{ stream, detected }`; rejects when creating the
     *          decoder fails
     */
    decodeStream(source, options) {
        // An explicitly requested threshold always wins; only without one does
        // the default depend on whether the encoding is going to be guessed.
        const requestedMinBytes = options.minBytesRequiredForDetection;
        let minBytesRequiredForDetection;
        if (requestedMinBytes !== null && requestedMinBytes !== undefined) {
            minBytesRequiredForDetection = requestedMinBytes;
        }
        else if (options.guessEncoding) {
            minBytesRequiredForDetection = AUTO_ENCODING_GUESS_MIN_BYTES;
        }
        else {
            minBytesRequiredForDetection = NO_ENCODING_GUESS_MIN_BYTES;
        }
        return new Promise((resolve, reject) => {
            const target = (0, stream_1.newWriteableStream)(strings => strings.join(''));
            const bufferedChunks = [];
            let bytesBuffered = 0;
            let decoder = undefined;
            const createDecoder = async () => {
                try {
                    // detect encoding from buffer
                    const detected = await this.detectEncoding(buffer_1.BinaryBuffer.concat(bufferedChunks), options.guessEncoding);
                    // ensure to respect overwrite of encoding
                    detected.encoding = await options.overwriteEncoding(detected.encoding);
                    // decode and write buffered content; the chunks are concatenated
                    // again on purpose so that anything buffered during the awaits
                    // above is not lost
                    decoder = iconv.getDecoder(this.toIconvEncoding(detected.encoding));
                    const decoded = decoder.write(safer_buffer_1.Buffer.from(buffer_1.BinaryBuffer.concat(bufferedChunks).buffer));
                    target.write(decoded);
                    bufferedChunks.length = 0;
                    bytesBuffered = 0;
                    // signal to the outside our detected encoding and final decoder stream
                    resolve({
                        stream: target,
                        detected
                    });
                }
                catch (error) {
                    reject(error);
                }
            };
            // Stream error: forward to target
            source.on('error', error => target.error(error));
            // Stream data
            source.on('data', async (chunk) => {
                // if the decoder is ready, we just write directly
                if (decoder) {
                    target.write(decoder.write(safer_buffer_1.Buffer.from(chunk.buffer)));
                }
                else {
                    bufferedChunks.push(chunk);
                    bytesBuffered += chunk.byteLength;
                    // buffered enough data for encoding detection, create stream
                    if (bytesBuffered >= minBytesRequiredForDetection) {
                        // pause stream here until the decoder is ready
                        source.pause();
                        await createDecoder();
                        // resume stream now that decoder is ready but
                        // outside of this stack to reduce recursion
                        setTimeout(() => source.resume());
                    }
                }
            });
            // Stream end
            source.on('end', async () => {
                // we were still waiting for data to do the encoding
                // detection. thus, wrap up starting the stream even
                // without all the data to get things going
                if (!decoder) {
                    await createDecoder();
                }
                // end the target with the remainders of the decoder
                target.end(decoder ? decoder.end() : undefined);
            });
        });
    }
    /**
     * Encodes a string or a readable of strings.
     *
     * @param value the text, a readable whose `read()` yields strings, or
     *              `undefined`
     * @param options optional `encoding` (UTF-8 when missing) and `hasBOM`
     * @returns for UTF-8 without BOM: `undefined`, a `BinaryBuffer` or a
     *          `BinaryBufferReadable` depending on `value`; otherwise a
     *          readable whose `read()` yields encoded `BinaryBuffer`s and
     *          `null` once the input is exhausted
     */
    async encodeStream(value, options) {
        const addBOM = options ? options.hasBOM : undefined;
        const encoding = this.toIconvEncoding(options ? options.encoding : undefined);
        if (encoding === encodings_1.UTF8 && !addBOM) {
            if (value === undefined) {
                return undefined;
            }
            if (typeof value === 'string') {
                return buffer_1.BinaryBuffer.fromString(value);
            }
            return buffer_1.BinaryBufferReadable.fromReadable(value);
        }
        const content = value || '';
        const readable = typeof content === 'string' ? stream_1.Readable.fromString(content) : content;
        const encoder = iconv.getEncoder(encoding, { addBOM });
        let bytesWritten = false;
        let done = false;
        return {
            read() {
                if (done) {
                    return null;
                }
                const chunk = readable.read();
                if (typeof chunk !== 'string') {
                    // anything but a string marks the end of the input
                    done = true;
                    // If we are instructed to add a BOM but we detect that no
                    // bytes have been written, we must ensure to return the BOM
                    // ourselves so that we comply with the contract.
                    if (!bytesWritten && addBOM) {
                        switch (encoding) {
                            case encodings_1.UTF8:
                            case encodings_1.UTF8_with_bom:
                                return buffer_1.BinaryBuffer.wrap(Uint8Array.from(encodings_1.UTF8_BOM));
                            case encodings_1.UTF16be:
                                return buffer_1.BinaryBuffer.wrap(Uint8Array.from(encodings_1.UTF16be_BOM));
                            case encodings_1.UTF16le:
                                return buffer_1.BinaryBuffer.wrap(Uint8Array.from(encodings_1.UTF16le_BOM));
                        }
                    }
                    // flush whatever the encoder still holds
                    const leftovers = encoder.end();
                    if (leftovers && leftovers.length > 0) {
                        bytesWritten = true;
                        return buffer_1.BinaryBuffer.wrap(leftovers);
                    }
                    return null;
                }
                bytesWritten = true;
                return buffer_1.BinaryBuffer.wrap(encoder.write(chunk));
            }
        };
    }
};
// Apply inversify's `injectable()` decorator to the class and rebind the
// name to whatever the decorator helper returns.
EncodingService = __decorate([(0, inversify_1.injectable)()], EncodingService);
// Publish the decorated class as this module's `EncodingService` export.
exports.EncodingService = EncodingService;
308//# sourceMappingURL=encoding-service.js.map
\No newline at end of file