// UNPKG - 7.56 kB - JavaScript - View Raw
/* eslint no-use-before-define:0 */
import * as pathUtil from 'path';
import textExtensions from 'textextensions';
import binaryExtensions from 'binaryextensions';
/**
 * Determine if the filename and/or buffer is text.
 * Determined by extension checks first (if filename is available), otherwise if unknown extension or no filename, will perform a slower buffer encoding detection.
 * This order is done, as extension checks are quicker, and also because encoding checks cannot guarantee accuracy for chars between utf8 and utf16.
 * The extension checks are performed using the resources https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions
 * @param filename The filename for the file/buffer if available
 * @param buffer The buffer for the file if available
 * @returns `true` or `false` when an extension or the buffer decides the result. `null` when no extension matched (or no `filename` was given) and no `buffer` was provided.
 */
export function isText(filename, buffer) {
  // Test extensions
  if (filename) {
    // Split the basename on dots and walk the segments from last to first,
    // so for `name.tar.gz` the segment `gz` is checked before `tar`.
    const parts = pathUtil.basename(filename).split('.').reverse();
    for (const extension of parts) {
      if (textExtensions.includes(extension)) {
        return true;
      }
      if (binaryExtensions.includes(extension)) {
        return false;
      }
    }
  }

  // Fallback to encoding if extension check was not enough
  if (buffer) {
    return getEncoding(buffer) === 'utf8';
  }

  // Nothing matched and no buffer was provided
  return null;
}
/**
 * Determine if the filename and/or buffer is binary.
 * This is the logical inverse of `isText(filename, buffer)`: the same extension-first, buffer-second detection is used.
 * @param filename The filename for the file/buffer if available
 * @param buffer The buffer for the file if available
 * @returns `null` when `isText` could not decide (it returned `null` or `undefined`). Otherwise a boolean that is the negation of the `isText` result.
 */
export function isBinary(filename, buffer) {
  const text = isText(filename, buffer);
  // `== null` deliberately matches both `null` and `undefined`
  return text == null ? null : !text;
}
/**
 * Get the encoding of a buffer.
 * Checks the start, middle, and end of the buffer for characters that are unrecognized within UTF8 encoding.
 * History has shown that inspection at all three locations is necessary.
 * @param buffer The buffer to inspect. It is read via `length`, index access and `toString(encoding, start, end)`.
 * @param opts Optional settings; may be omitted or `null`.
 * @param opts.chunkLength How many bytes each inspected chunk spans. Defaults to `24`.
 * @param opts.chunkBegin Offset of the single chunk to inspect. When omitted or `null`, the start, middle and end chunks are inspected in turn.
 * @returns Will be `null` if `buffer` was not provided. Otherwise will be either `'utf8'` or `'binary'`
 */
export function getEncoding(buffer, opts) {
  // Check
  if (!buffer) {
    return null;
  }

  // Prepare
  const textEncoding = 'utf8';
  const binaryEncoding = 'binary';
  const requestedLength = opts == null ? undefined : opts.chunkLength;
  const requestedBegin = opts == null ? undefined : opts.chunkBegin;
  const chunkLength = requestedLength == null ? 24 : requestedLength;

  // Discover: no explicit offset was requested, so probe start, middle and end
  if (requestedBegin == null) {
    const probeOffsets = [
      0,
      Math.max(0, Math.floor(buffer.length / 2) - chunkLength),
      Math.max(0, buffer.length - chunkLength),
    ];
    let encoding = textEncoding;
    for (const chunkBegin of probeOffsets) {
      encoding = getEncoding(buffer, { chunkLength, chunkBegin });
      // Stop at the first chunk that is not text; later chunks cannot change the verdict
      if (encoding !== textEncoding) {
        break;
      }
    }
    return encoding;
  }

  // Extract: move the chunk start back onto a character boundary if needed
  const chunkBegin = getChunkBegin(buffer, requestedBegin);
  if (chunkBegin === -1) {
    return binaryEncoding;
  }

  // Extend the chunk end so a multibyte character is not cut in half
  const chunkEnd = getChunkEnd(
    buffer,
    Math.min(buffer.length, chunkBegin + chunkLength),
  );
  if (chunkEnd > buffer.length) {
    // The final character would need bytes beyond the end of the buffer
    return binaryEncoding;
  }

  const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd);

  // Detect encoding
  for (let i = 0; i < contentChunkUTF8.length; ++i) {
    const charCode = contentChunkUTF8.charCodeAt(i);
    // 8 and below are control characters (e.g. backspace, null, eof, etc.)
    // 65533 is the unknown character
    if (charCode === 65533 || charCode <= 8) {
      return binaryEncoding;
    }
  }

  // Return
  return textEncoding;
}
// ====================================
// The functions below handle multibyte UTF-8 characters.
// To understand how the encoding works, see this article: https://en.wikipedia.org/wiki/UTF-8#Encoding
// @todo add documentation for these
/**
 * Align a chunk start offset with the first byte of a UTF-8 character.
 * If `buf[chunkBegin]` is a continuation byte (`10xxxxxx`), the up to three preceding bytes are searched for a lead byte whose character is long enough to reach `chunkBegin`.
 * @param buf The bytes being inspected (indexable by offset).
 * @param chunkBegin The proposed start offset.
 * @returns The offset to start decoding from, or `-1` when `chunkBegin` is a continuation byte and no fitting lead byte precedes it.
 */
function getChunkBegin(buf, chunkBegin) {
  // If it's the beginning, just return.
  if (chunkBegin === 0) {
    return 0;
  }

  // Not a continuation byte, so this offset is used as-is.
  if (!isLaterByteOfUtf8(buf[chunkBegin])) {
    return chunkBegin;
  }

  // Three bytes back: only a 4-byte character reaches this far forward.
  let begin = chunkBegin - 3;
  if (begin >= 0 && isFirstByteOf4ByteChar(buf[begin])) {
    return begin;
  }

  // Two bytes back: a 4-byte or 3-byte character.
  begin = chunkBegin - 2;
  if (
    begin >= 0 &&
    (isFirstByteOf4ByteChar(buf[begin]) || isFirstByteOf3ByteChar(buf[begin]))
  ) {
    return begin;
  }

  // One byte back: any multibyte lead byte.
  begin = chunkBegin - 1;
  if (
    begin >= 0 &&
    (isFirstByteOf4ByteChar(buf[begin]) ||
      isFirstByteOf3ByteChar(buf[begin]) ||
      isFirstByteOf2ByteChar(buf[begin]))
  ) {
    return begin;
  }

  // A continuation byte with no lead byte in range
  return -1;
}
/**
 * Extend a chunk end offset so that a multibyte UTF-8 character is not cut in half.
 * The three bytes just before `chunkEnd` are checked for a lead byte whose character would run past `chunkEnd`; if one is found, the end is pushed forward by the number of missing bytes.
 * @param buf The bytes being inspected (indexable by offset, with a `length`).
 * @param chunkEnd The proposed end offset (the caller passes it as the `end` argument of `toString`).
 * @returns The adjusted end offset. This may exceed `buf.length`; the caller is expected to check for that.
 */
function getChunkEnd(buf, chunkEnd) {
  // If it's the end, just return.
  if (chunkEnd === buf.length) {
    return chunkEnd;
  }

  // Lead byte three back: a 4-byte character is missing its last byte.
  let index = chunkEnd - 3;
  if (index >= 0 && isFirstByteOf4ByteChar(buf[index])) {
    return chunkEnd + 1;
  }

  // Lead byte two back: a 4-byte character is missing 2 bytes, a 3-byte character 1.
  index = chunkEnd - 2;
  if (index >= 0) {
    if (isFirstByteOf4ByteChar(buf[index])) {
      return chunkEnd + 2;
    }
    if (isFirstByteOf3ByteChar(buf[index])) {
      return chunkEnd + 1;
    }
  }

  // Lead byte is the last byte of the chunk: every continuation byte is missing.
  index = chunkEnd - 1;
  if (index >= 0) {
    if (isFirstByteOf4ByteChar(buf[index])) {
      return chunkEnd + 3;
    }
    if (isFirstByteOf3ByteChar(buf[index])) {
      return chunkEnd + 2;
    }
    if (isFirstByteOf2ByteChar(buf[index])) {
      return chunkEnd + 1;
    }
  }

  // No character straddles the boundary
  return chunkEnd;
}
/**
 * Check whether a byte is the lead byte of a 4-byte UTF-8 character (bit pattern `11110xxx`).
 * @param byte The byte value to test; `undefined` (an out-of-range read) yields `false`.
 * @returns `true` when the top five bits are `11110`.
 */
function isFirstByteOf4ByteChar(byte) {
  // eslint-disable-next-line no-bitwise
  return (byte >> 3) === 0b11110;
}
/**
 * Check whether a byte is the lead byte of a 3-byte UTF-8 character (bit pattern `1110xxxx`).
 * @param byte The byte value to test; `undefined` (an out-of-range read) yields `false`.
 * @returns `true` when the top four bits are `1110`.
 */
function isFirstByteOf3ByteChar(byte) {
  // eslint-disable-next-line no-bitwise
  return (byte >> 4) === 0b1110;
}
/**
 * Check whether a byte is the lead byte of a 2-byte UTF-8 character (bit pattern `110xxxxx`).
 * @param byte The byte value to test; `undefined` (an out-of-range read) yields `false`.
 * @returns `true` when the top three bits are `110`.
 */
function isFirstByteOf2ByteChar(byte) {
  // eslint-disable-next-line no-bitwise
  return (byte >> 5) === 0b110;
}
/**
 * Check whether a byte is a continuation (non-leading) byte of a multibyte UTF-8 character (bit pattern `10xxxxxx`).
 * @param byte The byte value to test; `undefined` (an out-of-range read) yields `false`.
 * @returns `true` when the top two bits are `10`.
 */
function isLaterByteOfUtf8(byte) {
  // eslint-disable-next-line no-bitwise
  return (byte >> 6) === 0b10;
}