1 | /* eslint no-use-before-define:0 */
|
2 | import * as pathUtil from 'path';
|
3 | import textExtensions from 'textextensions';
|
4 | import binaryExtensions from 'binaryextensions';
|
/**
 * Decide whether a file is text, using its name and/or its content.
 *
 * The dot-separated segments of the file's base name are compared, from the last one to the first one,
 * against the lists from https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions
 * The first segment found in either list settles the result, so no content inspection happens in that case.
 * Only when no segment is recognised (or no filename was given) is the slower buffer encoding detection used.
 * Names are checked first because that is quicker, and because encoding checks cannot guarantee accuracy for chars between utf8 and utf16.
 *
 * @param filename The filename for the file/buffer if available
 * @param buffer The buffer for the file if available
 * @returns `true` for text and `false` for binary. `null` when no decision could be made, i.e. no filename segment was recognised and no `buffer` was provided.
 */
export function isText(filename, buffer) {
  if (filename) {
    // Walk the name segments right-to-left so the outermost extension wins
    const segments = pathUtil.basename(filename).split('.');
    for (let i = segments.length - 1; i >= 0; i -= 1) {
      const segment = segments[i];
      if (textExtensions.includes(segment)) {
        return true;
      }
      if (binaryExtensions.includes(segment)) {
        return false;
      }
    }
  }

  // The name was inconclusive: inspect the content if there is any, otherwise give up
  return buffer ? getEncoding(buffer) === 'utf8' : null;
}
|
/**
 * Decide whether a file is binary, using its name and/or its content.
 *
 * This is the logical inverse of `isText`: the same filename and buffer are handed to `isText`
 * and its boolean result is negated, while an undecided (`null`) result is passed through.
 *
 * @param filename The filename for the file/buffer if available
 * @param buffer The buffer for the file if available
 * @returns `true` for binary and `false` for text. `null` when `isText` could not reach a decision.
 */
export function isBinary(filename, buffer) {
  const textResult = isText(filename, buffer);
  // Loose comparison on purpose: both `null` and `undefined` count as "undecided"
  return textResult == null ? null : !textResult;
}
|
/**
 * Classify a buffer as `'utf8'` (text) or `'binary'` by decoding samples of it as UTF-8.
 *
 * When `opts.chunkBegin` is not given, up to three samples of `opts.chunkLength` bytes are inspected:
 * one at the start, one ending at the middle, and one at the end of the buffer.
 * Sampling stops at the first sample that is not text.
 * History has shown that inspection at all three locations is necessary.
 * When `opts.chunkBegin` is given, only the single sample starting at that byte offset is inspected.
 *
 * @param buffer The buffer to inspect
 * @param opts Optional settings: `chunkLength` is the sample size in bytes (defaults to 24), `chunkBegin` is the byte offset of the one sample to inspect
 * @returns Will be `null` if `buffer` was not provided. Otherwise will be either `'utf8'` or `'binary'`
 */
export function getEncoding(buffer, opts) {
  // Nothing to inspect
  if (!buffer) {
    return null;
  }

  const TEXT = 'utf8';
  const BINARY = 'binary';

  // Read the options, tolerating a missing options object
  const hasOpts = opts !== null && opts !== undefined;
  const requestedLength = hasOpts ? opts.chunkLength : undefined;
  const requestedBegin = hasOpts ? opts.chunkBegin : undefined;
  const chunkLength = requestedLength == null ? 24 : requestedLength;

  // No explicit offset: sample the start, the middle and the end, in that order
  if (requestedBegin == null) {
    const size = buffer.length;
    const offsets = [
      0,
      Math.max(0, Math.floor(size / 2) - chunkLength),
      Math.max(0, size - chunkLength),
    ];
    let verdict = TEXT;
    for (const chunkBegin of offsets) {
      verdict = getEncoding(buffer, { chunkLength, chunkBegin });
      if (verdict !== TEXT) {
        break;
      }
    }
    return verdict;
  }

  // Explicit offset: widen the sample so it does not cut a multibyte character in half
  const begin = getChunkBegin(buffer, requestedBegin);
  if (begin === -1) {
    // The offset sits on a continuation byte with no lead byte in front of it
    return BINARY;
  }
  const end = getChunkEnd(buffer, Math.min(buffer.length, begin + chunkLength));
  if (end > buffer.length) {
    // The last character of the sample would run past the end of the buffer
    return BINARY;
  }

  // 8 and below are control characters (e.g. backspace, null, eof, etc.)
  // 65533 (U+FFFD) is the unknown character produced for bytes that are not valid UTF-8
  const sample = buffer.toString(TEXT, begin, end);
  // eslint-disable-next-line no-control-regex
  return /[\u0000-\u0008\uFFFD]/.test(sample) ? BINARY : TEXT;
}
|
115 | // ====================================
|
116 | // The functions below are created to handle multibyte utf8 characters.
|
117 | // To understand how the encoding works, check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding
|
118 | // @todo add documentation for these
|
/**
 * Move a chunk's start offset back to the first byte of the UTF-8 character it falls inside.
 *
 * @param buf The bytes being inspected
 * @param chunkBegin The proposed start offset of the chunk
 * @returns `chunkBegin` itself when it is `0` or does not point at a continuation byte.
 * Otherwise the offset of the lead byte, at most three bytes earlier, whose character is long enough to reach `chunkBegin`.
 * `-1` when no such lead byte exists.
 */
function getChunkBegin(buf, chunkBegin) {
  // The very first byte can only be the start of a character
  if (chunkBegin === 0) {
    return 0;
  }
  // Anything other than a continuation byte is already a character boundary
  if (!isLaterByteOfUtf8(buf[chunkBegin])) {
    return chunkBegin;
  }

  // Look back 3, then 2, then 1 byte(s). A lead byte found `distance` bytes back
  // only covers `chunkBegin` when its character is longer than `distance` bytes.
  for (let distance = 3; distance >= 1; distance -= 1) {
    const candidate = chunkBegin - distance;
    if (candidate >= 0) {
      const byte = buf[candidate];
      const coversChunkBegin =
        isFirstByteOf4ByteChar(byte) ||
        (distance <= 2 && isFirstByteOf3ByteChar(byte)) ||
        (distance === 1 && isFirstByteOf2ByteChar(byte));
      if (coversChunkBegin) {
        return candidate;
      }
    }
  }

  // A continuation byte with no matching lead byte in front of it
  return -1;
}
|
/**
 * Move a chunk's end offset forward so the chunk does not stop in the middle of a UTF-8 character.
 *
 * @param buf The bytes being inspected
 * @param chunkEnd The proposed (exclusive) end offset of the chunk
 * @returns `chunkEnd` itself when it equals `buf.length` or when none of the three preceding bytes
 * starts a character that extends past it. Otherwise `chunkEnd` advanced to just past that character,
 * which can be larger than `buf.length` if the character is cut off by the end of the buffer.
 */
function getChunkEnd(buf, chunkEnd) {
  // Ending exactly at the end of the data needs no adjustment
  if (chunkEnd === buf.length) {
    return chunkEnd;
  }

  // Look back 3, then 2, then 1 byte(s) for a lead byte whose character
  // is longer than the distance back, i.e. one that runs past `chunkEnd`.
  for (let distance = 3; distance >= 1; distance -= 1) {
    const index = chunkEnd - distance;
    if (index >= 0) {
      const byte = buf[index];
      let charLength = 0;
      if (isFirstByteOf4ByteChar(byte)) {
        charLength = 4;
      } else if (distance <= 2 && isFirstByteOf3ByteChar(byte)) {
        charLength = 3;
      } else if (distance === 1 && isFirstByteOf2ByteChar(byte)) {
        charLength = 2;
      }
      if (charLength !== 0) {
        // Extend by the bytes of the character that are still missing
        return chunkEnd + (charLength - distance);
      }
    }
  }

  return chunkEnd;
}
|
/**
 * Check whether a byte has the bit pattern `11110xxx`, the lead byte of a 4-byte UTF-8 character.
 *
 * @param byte The byte value to test
 * @returns `true` when the bits above the lowest three are `11110`
 */
function isFirstByteOf4ByteChar(byte) {
  // eslint-disable-next-line no-bitwise
  const prefix = byte >> 3;
  return prefix === 0b11110;
}
|
/**
 * Check whether a byte has the bit pattern `1110xxxx`, the lead byte of a 3-byte UTF-8 character.
 *
 * @param byte The byte value to test
 * @returns `true` when the bits above the lowest four are `1110`
 */
function isFirstByteOf3ByteChar(byte) {
  // eslint-disable-next-line no-bitwise
  const prefix = byte >> 4;
  return prefix === 0b1110;
}
|
/**
 * Check whether a byte has the bit pattern `110xxxxx`, the lead byte of a 2-byte UTF-8 character.
 *
 * @param byte The byte value to test
 * @returns `true` when the bits above the lowest five are `110`
 */
function isFirstByteOf2ByteChar(byte) {
  // eslint-disable-next-line no-bitwise
  const prefix = byte >> 5;
  return prefix === 0b110;
}
|
/**
 * Check whether a byte has the bit pattern `10xxxxxx`, a continuation (non-first) byte of a multibyte UTF-8 character.
 *
 * @param byte The byte value to test
 * @returns `true` when the bits above the lowest six are `10`
 */
function isLaterByteOfUtf8(byte) {
  // eslint-disable-next-line no-bitwise
  const prefix = byte >> 6;
  return prefix === 0b10;
}
|