1 | /* eslint no-use-before-define:0 */
|
2 | import * as pathUtil from 'path';
|
3 | import textExtensions from 'textextensions';
|
4 | import binaryExtensions from 'binaryextensions';
|
/**
 * Decide whether a file should be treated as text, using its name and/or its contents.
 *
 * When a filename is given, its basename is split on `.` and each segment is looked up,
 * last segment first, in the text extension list (https://github.com/bevry/textextensions)
 * and then in the binary extension list (https://github.com/bevry/binaryextensions).
 * The first segment found in either list decides the result. Segments are compared as-is.
 *
 * The extension lookup runs first because it is cheaper than inspecting the contents, and
 * because content inspection cannot guarantee accuracy for chars between utf8 and utf16.
 * If no segment is recognised (or no filename was given), the buffer is inspected instead.
 *
 * @param filename The filename for the file/buffer, if available
 * @param buffer The buffer holding the file contents, if available
 * @returns `true` for text, `false` for binary, or `null` when the extension check was
 * inconclusive (or skipped) and no buffer was provided.
 */
export function isText(filename, buffer) {
    if (filename) {
        const segments = pathUtil.basename(filename).split('.');
        // Walk from the last segment to the first so the trailing extension wins.
        for (let position = segments.length - 1; position >= 0; position -= 1) {
            const segment = segments[position];
            if (textExtensions.includes(segment)) {
                return true;
            }
            if (binaryExtensions.includes(segment)) {
                return false;
            }
        }
    }
    // The extension check was inconclusive: fall back to inspecting the contents, if any.
    return buffer ? getEncoding(buffer) === 'utf8' : null;
}
|
/**
 * Decide whether a file should be treated as binary, using its name and/or its contents.
 *
 * This is the logical inverse of `isText`, called with the same arguments: whatever
 * `isText` concludes is negated, and an inconclusive (`null`/`undefined`) answer is
 * reported as `null`.
 *
 * @param filename The filename for the file/buffer, if available
 * @param buffer The buffer holding the file contents, if available
 * @returns `true` for binary, `false` for text, or `null` when `isText` could not decide.
 */
export function isBinary(filename, buffer) {
    const textVerdict = isText(filename, buffer);
    // Loose equality on purpose: treat both `null` and `undefined` as "no verdict".
    return textVerdict == null ? null : !textVerdict;
}
|
/**
 * Classify a buffer as `'utf8'` or `'binary'`.
 *
 * Without `opts.chunkBegin`, up to three chunks are inspected in order: one at the start,
 * one ending at the middle, and one ending at the end of the buffer. Inspection stops at
 * the first chunk that is not classified as `'utf8'`.
 * History has shown that inspection at all three locations is necessary.
 *
 * With `opts.chunkBegin`, only the chunk starting there is inspected. Its edges are first
 * adjusted so that they do not cut through a multibyte UTF-8 character. The chunk is
 * `'binary'` if the adjustment fails, if it would run past the end of the buffer, or if
 * the decoded text contains a char code of 8 or below or the replacement character (65533).
 *
 * @param buffer The buffer to inspect
 * @param opts Optional settings
 * @param opts.chunkLength Number of bytes per inspected chunk; defaults to `24`
 * @param opts.chunkBegin Index at which a single chunk to inspect begins; when omitted,
 * the start, middle and end of the buffer are sampled
 * @returns `null` if `buffer` was not provided, otherwise either `'utf8'` or `'binary'`.
 */
export function getEncoding(buffer, opts) {
    if (!buffer) {
        return null;
    }

    const UTF8 = 'utf8';
    const BINARY = 'binary';
    const chunkLength = opts?.chunkLength ?? 24;
    const requestedBegin = opts?.chunkBegin;

    // An explicit chunk position was supplied: inspect exactly that chunk.
    if (requestedBegin != null) {
        // Snap the start back to a character boundary.
        const alignedBegin = getChunkBegin(buffer, requestedBegin);
        if (alignedBegin === -1) {
            return BINARY;
        }

        // Extend the end so that a trailing multibyte character is not cut off.
        const proposedEnd = Math.min(buffer.length, alignedBegin + chunkLength);
        const alignedEnd = getChunkEnd(buffer, proposedEnd);
        if (alignedEnd > buffer.length) {
            return BINARY;
        }

        const decoded = buffer.toString(UTF8, alignedBegin, alignedEnd);
        let position = 0;
        while (position < decoded.length) {
            const unit = decoded.charCodeAt(position);
            // 8 and below are control characters (e.g. backspace, null, eof, etc.)
            // 65533 is the unknown character
            if (unit <= 8 || unit === 65533) {
                return BINARY;
            }
            position += 1;
        }
        return UTF8;
    }

    // No position supplied: sample the start, then the middle, then the end.
    const startVerdict = getEncoding(buffer, { chunkLength, chunkBegin: 0 });
    if (startVerdict !== UTF8) {
        return startVerdict;
    }

    const middleBegin = Math.max(0, Math.floor(buffer.length / 2) - chunkLength);
    const middleVerdict = getEncoding(buffer, {
        chunkLength,
        chunkBegin: middleBegin,
    });
    if (middleVerdict !== UTF8) {
        return middleVerdict;
    }

    const endBegin = Math.max(0, buffer.length - chunkLength);
    return getEncoding(buffer, { chunkLength, chunkBegin: endBegin });
}
|
114 | // ====================================
|
115 | // The functions below are created to handle multibyte utf8 characters.
|
116 | // To understand how the encoding works, check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding
|
117 | // @todo add documentation for these
|
/**
 * Move a proposed chunk start back to the lead byte of the UTF-8 character it falls inside.
 *
 * @param buf The bytes being inspected
 * @param chunkBegin The proposed index of the first byte of the chunk
 * @returns `0` when `chunkBegin` is `0`; `chunkBegin` unchanged when the byte there is not a
 * continuation byte (`10xxxxxx`); otherwise the index, one to three bytes earlier, of a
 * lead byte whose sequence is long enough to reach `chunkBegin`; `-1` when no such lead
 * byte is found.
 */
function getChunkBegin(buf, chunkBegin) {
    // The very beginning is always a valid boundary.
    if (chunkBegin === 0) {
        return 0;
    }
    // Not in the middle of a multibyte character: nothing to adjust.
    if (!isLaterByteOfUtf8(buf[chunkBegin])) {
        return chunkBegin;
    }

    // Probe three, then two, then one byte back. A lead byte only qualifies when its
    // sequence is longer than the distance stepped back, i.e. it reaches `chunkBegin`.
    for (let distance = 3; distance >= 1; distance -= 1) {
        const candidate = chunkBegin - distance;
        if (candidate >= 0) {
            const lead = buf[candidate];
            let sequenceLength = 0;
            if (isFirstByteOf4ByteChar(lead)) {
                sequenceLength = 4;
            } else if (isFirstByteOf3ByteChar(lead)) {
                sequenceLength = 3;
            } else if (isFirstByteOf2ByteChar(lead)) {
                sequenceLength = 2;
            }
            if (sequenceLength > distance) {
                return candidate;
            }
        }
    }

    // A continuation byte with no matching lead byte before it.
    return -1;
}
|
/**
 * Push a proposed chunk end forward so that it does not cut a multibyte UTF-8 character.
 *
 * The last three bytes before `chunkEnd` are examined for a lead byte whose sequence would
 * extend to or beyond `chunkEnd`; if one is found, the end is advanced by the number of
 * bytes still missing from that sequence.
 *
 * @param buf The bytes being inspected
 * @param chunkEnd The proposed exclusive end index of the chunk
 * @returns `chunkEnd` unchanged when it equals `buf.length` or no adjustment is needed,
 * otherwise `chunkEnd` advanced by one to three bytes. The result is not clamped, so it
 * may exceed `buf.length`.
 */
function getChunkEnd(buf, chunkEnd) {
    // The very end is always a valid boundary.
    if (chunkEnd === buf.length) {
        return chunkEnd;
    }

    // Probe three, then two, then one byte back from the proposed end. A lead byte found
    // `distance` bytes back already has `distance` bytes inside the chunk.
    for (let distance = 3; distance >= 1; distance -= 1) {
        const index = chunkEnd - distance;
        if (index >= 0) {
            const lead = buf[index];
            let sequenceLength = 0;
            if (isFirstByteOf4ByteChar(lead)) {
                sequenceLength = 4;
            } else if (isFirstByteOf3ByteChar(lead)) {
                sequenceLength = 3;
            } else if (isFirstByteOf2ByteChar(lead)) {
                sequenceLength = 2;
            }
            if (sequenceLength > distance) {
                return chunkEnd + (sequenceLength - distance);
            }
        }
    }

    return chunkEnd;
}
|
/**
 * Test whether a byte has the bit pattern `11110xxx`, the lead byte of a four-byte
 * UTF-8 sequence.
 * @param byte The byte value to test
 * @returns `true` when the bits above the lowest three are exactly `11110`
 */
function isFirstByteOf4ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >>> 3;
    return prefix === 0b11110;
}
|
/**
 * Test whether a byte has the bit pattern `1110xxxx`, the lead byte of a three-byte
 * UTF-8 sequence.
 * @param byte The byte value to test
 * @returns `true` when the bits above the lowest four are exactly `1110`
 */
function isFirstByteOf3ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >>> 4;
    return prefix === 0b1110;
}
|
/**
 * Test whether a byte has the bit pattern `110xxxxx`, the lead byte of a two-byte
 * UTF-8 sequence.
 * @param byte The byte value to test
 * @returns `true` when the bits above the lowest five are exactly `110`
 */
function isFirstByteOf2ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >>> 5;
    return prefix === 0b110;
}
|
/**
 * Test whether a byte has the bit pattern `10xxxxxx`, a continuation (non-leading) byte
 * of a multibyte UTF-8 sequence.
 * @param byte The byte value to test
 * @returns `true` when the bits above the lowest six are exactly `10`
 */
function isLaterByteOfUtf8(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >>> 6;
    return prefix === 0b10;
}
|