// Source: UNPKG file viewer listing (7.34 kB, JavaScript).
1/* eslint no-use-before-define:0 */
2import * as pathUtil from 'path';
3import textExtensions from 'textextensions';
4import binaryExtensions from 'binaryextensions';
/**
 * Determine if the filename and/or buffer is text.
 * Determined by extension checks first (if filename is available), otherwise if unknown extension or no filename, will perform a slower buffer encoding detection.
 * This order is done, as extension checks are quicker, and also because encoding checks cannot guarantee accuracy for chars between utf8 and utf16.
 * The extension checks are performed using the resources https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions
 * @param {string} [filename] The filename for the file/buffer if available
 * @param {Buffer} [buffer] The buffer for the file if available
 * @returns `true` for text and `false` for binary. Will be `null` when no decision could be made: no `buffer` was provided and `filename` was either missing or had no recognised extension.
 */
export function isText(filename, buffer) {
  // Test extensions
  if (filename) {
    // Split the basename on dots and walk the segments from last to first,
    // so the final extension is the first one checked (e.g. `gz` before `tar`).
    // The leading name segment is checked as well.
    const parts = pathUtil.basename(filename).split('.').reverse();
    for (const extension of parts) {
      if (textExtensions.includes(extension)) {
        return true;
      }
      if (binaryExtensions.includes(extension)) {
        return false;
      }
    }
  }

  // Fallback to encoding if extension check was not enough
  if (buffer) {
    return getEncoding(buffer) === 'utf8';
  }

  // Neither the extensions nor a buffer could settle it
  return null;
}
/**
 * Determine if the filename and/or buffer is binary.
 * This is the negation of {@link isText}: the same extension checks are tried first, then the slower buffer encoding detection.
 * The extension checks are performed using the resources https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions
 * @param {string} [filename] The filename for the file/buffer if available
 * @param {Buffer} [buffer] The buffer for the file if available
 * @returns `true` for binary and `false` for text. Will be `null` whenever `isText` could not decide and returned `null`.
 */
export function isBinary(filename, buffer) {
  const text = isText(filename, buffer);
  // `== null` deliberately matches both null and undefined, so "unknown" is passed through rather than negated to `true`
  return text == null ? null : !text;
}
/**
 * Get the encoding of a buffer.
 * Checks the start, middle, and end of the buffer for characters that are unrecognized within UTF8 encoding,
 * and stops at the first location that does not look like text.
 * History has shown that inspection at all three locations is necessary.
 * @param {Buffer} buffer The buffer to inspect
 * @param {object} [opts] Sampling options
 * @param {number} [opts.chunkLength=24] How many bytes each inspected chunk should cover
 * @param {number} [opts.chunkBegin] Byte offset of the single chunk to inspect; when omitted, the start, middle, and end are all inspected
 * @returns Will be `null` if `buffer` was not provided. Otherwise will be either `'utf8'` or `'binary'`
 */
export function getEncoding(buffer, opts) {
  // Check
  if (!buffer) {
    return null;
  }

  // Prepare
  const textEncoding = 'utf8';
  const binaryEncoding = 'binary';
  const chunkLength = opts?.chunkLength ?? 24;

  // No explicit offset: inspect the start, middle, and end in turn
  if (opts?.chunkBegin == null) {
    const offsets = [
      0,
      Math.max(0, Math.floor(buffer.length / 2) - chunkLength),
      Math.max(0, buffer.length - chunkLength),
    ];
    let encoding = textEncoding;
    for (const chunkBegin of offsets) {
      encoding = getEncoding(buffer, { chunkLength, chunkBegin });
      if (encoding !== textEncoding) {
        // One non-text chunk is enough, skip the remaining locations
        break;
      }
    }
    return encoding;
  }

  // Extract: adjust the chunk boundaries so they do not fall inside a multibyte character
  const chunkBegin = getChunkBegin(buffer, opts.chunkBegin);
  if (chunkBegin === -1) {
    // The chunk starts on a continuation byte that has no lead byte before it
    return binaryEncoding;
  }
  const chunkEnd = getChunkEnd(
    buffer,
    Math.min(buffer.length, chunkBegin + chunkLength),
  );
  if (chunkEnd > buffer.length) {
    // The last character of the chunk needs more bytes than the buffer holds
    return binaryEncoding;
  }
  const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd);

  // Detect encoding
  for (let i = 0; i < contentChunkUTF8.length; ++i) {
    const charCode = contentChunkUTF8.charCodeAt(i);
    // 65533 is the unknown character
    // 8 and below are control characters (e.g. backspace, null, eof, etc.)
    if (charCode === 65533 || charCode <= 8) {
      return binaryEncoding;
    }
  }

  // Return
  return textEncoding;
}
// ====================================
// The functions below handle multibyte UTF-8 characters, so that an inspected chunk does not start or end partway through a character.
// To understand how the encoding works, see this article: https://en.wikipedia.org/wiki/UTF-8#Encoding
// @todo add documentation for these
/**
 * Move a chunk's start offset back to the first byte of the UTF-8 character it falls inside.
 * If `buf[chunkBegin]` is a continuation byte (10xxxxxx), up to three preceding bytes are searched,
 * farthest first, for a lead byte whose character is long enough to reach `chunkBegin`.
 * @param {Buffer} buf The buffer being inspected
 * @param {number} chunkBegin The proposed start offset
 * @returns {number} `chunkBegin` unchanged if it is `0` or not a continuation byte, the offset of the
 * lead byte if one was found, or `-1` if the continuation byte has no matching lead byte
 */
function getChunkBegin(buf, chunkBegin) {
  // If it's the beginning, just return.
  if (chunkBegin === 0) {
    return 0;
  }
  // Not a continuation byte, so the offset already sits on a character boundary
  if (!isLaterByteOfUtf8(buf[chunkBegin])) {
    return chunkBegin;
  }

  // Three bytes back: only a 4-byte character reaches this far
  let begin = chunkBegin - 3;
  if (begin >= 0 && isFirstByteOf4ByteChar(buf[begin])) {
    return begin;
  }

  // Two bytes back: a 4-byte or 3-byte character
  begin = chunkBegin - 2;
  if (
    begin >= 0 &&
    (isFirstByteOf4ByteChar(buf[begin]) || isFirstByteOf3ByteChar(buf[begin]))
  ) {
    return begin;
  }

  // One byte back: a 4-byte, 3-byte, or 2-byte character
  begin = chunkBegin - 1;
  if (
    begin >= 0 &&
    (isFirstByteOf4ByteChar(buf[begin]) ||
      isFirstByteOf3ByteChar(buf[begin]) ||
      isFirstByteOf2ByteChar(buf[begin]))
  ) {
    return begin;
  }

  // A continuation byte without a lead byte in range
  return -1;
}
/**
 * Move a chunk's exclusive end offset forward so the chunk does not end partway through a UTF-8 character.
 * The last three bytes of the chunk are checked, farthest first, for a lead byte whose character
 * extends past `chunkEnd`; if one is found, the end is pushed forward by the number of missing bytes.
 * The result is not clamped, so it can exceed `buf.length`.
 * @param {Buffer} buf The buffer being inspected
 * @param {number} chunkEnd The proposed exclusive end offset
 * @returns {number} The adjusted exclusive end offset, or `chunkEnd` unchanged if no adjustment applies
 */
function getChunkEnd(buf, chunkEnd) {
  // If it's the end, just return.
  if (chunkEnd === buf.length) {
    return chunkEnd;
  }

  // Third-last byte: a 4-byte character is missing its final byte
  let index = chunkEnd - 3;
  if (index >= 0 && isFirstByteOf4ByteChar(buf[index])) {
    return chunkEnd + 1;
  }

  // Second-last byte: a 4-byte character is missing two bytes, a 3-byte character one
  index = chunkEnd - 2;
  if (index >= 0) {
    if (isFirstByteOf4ByteChar(buf[index])) {
      return chunkEnd + 2;
    }
    if (isFirstByteOf3ByteChar(buf[index])) {
      return chunkEnd + 1;
    }
  }

  // Last byte: every continuation byte of the character is still missing
  index = chunkEnd - 1;
  if (index >= 0) {
    if (isFirstByteOf4ByteChar(buf[index])) {
      return chunkEnd + 3;
    }
    if (isFirstByteOf3ByteChar(buf[index])) {
      return chunkEnd + 2;
    }
    if (isFirstByteOf2ByteChar(buf[index])) {
      return chunkEnd + 1;
    }
  }

  // No lead byte found that extends past the end
  return chunkEnd;
}
/**
 * Check whether a byte is the lead byte of a 4-byte UTF-8 character, i.e. matches the bit pattern 11110xxx.
 * @param {number} byte The byte value to test
 * @returns {boolean} `true` if the top five bits are 11110
 */
function isFirstByteOf4ByteChar(byte) {
  // eslint-disable-next-line no-bitwise
  return byte >> 3 === 0b11110;
}
/**
 * Check whether a byte is the lead byte of a 3-byte UTF-8 character, i.e. matches the bit pattern 1110xxxx.
 * @param {number} byte The byte value to test
 * @returns {boolean} `true` if the top four bits are 1110
 */
function isFirstByteOf3ByteChar(byte) {
  // eslint-disable-next-line no-bitwise
  return byte >> 4 === 0b1110;
}
/**
 * Check whether a byte is the lead byte of a 2-byte UTF-8 character, i.e. matches the bit pattern 110xxxxx.
 * @param {number} byte The byte value to test
 * @returns {boolean} `true` if the top three bits are 110
 */
function isFirstByteOf2ByteChar(byte) {
  // eslint-disable-next-line no-bitwise
  return byte >> 5 === 0b110;
}
/**
 * Check whether a byte is a continuation (non-leading) byte of a multibyte UTF-8 character, i.e. matches the bit pattern 10xxxxxx.
 * @param {number} byte The byte value to test
 * @returns {boolean} `true` if the top two bits are 10
 */
function isLaterByteOfUtf8(byte) {
  // eslint-disable-next-line no-bitwise
  return byte >> 6 === 0b10;
}