UNPKG

istextorbinary/edition-es2019-esm/index.js

Version:

7.56 kBJavaScriptView Raw

1/* eslint no-use-before-define:0 */
2import * as pathUtil from 'path';
3import textExtensions from 'textextensions';
4import binaryExtensions from 'binaryextensions';
/**
 * Determine if the filename and/or buffer is text.
 * Determined by extension checks first (if filename is available), otherwise if unknown extension or no filename, will perform a slower buffer encoding detection.
 * This order is done, as extension checks are quicker, and also because encoding checks cannot guarantee accuracy for chars between utf8 and utf16.
 * The extension checks are performed using the resources https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions
 *
 * Every dot-separated segment of the basename is tried, last segment first, and the first
 * segment found in either list decides the result. The comparison is an exact string match.
 * @param filename The filename for the file/buffer if available
 * @param buffer The buffer for the file if available
 * @returns `true` or `false` when an extension or the buffer decided the result. `null` when no segment of the filename was recognised (or no filename was given) and no `buffer` was provided.
 */
export function isText(filename, buffer) {
    // Test extensions
    if (filename) {
        // Last segment first, so that `archive.txt.gz` is decided by `gz`, not `txt`
        const segments = pathUtil.basename(filename).split('.').reverse();
        for (const segment of segments) {
            if (textExtensions.includes(segment)) {
                return true;
            }
            if (binaryExtensions.includes(segment)) {
                return false;
            }
        }
    }
    // Fallback to encoding if extension check was not enough
    if (buffer) {
        return getEncoding(buffer) === 'utf8';
    }
    // Nothing left to decide with
    return null;
}
/**
 * Determine if the filename and/or buffer is binary.
 * This is the logical inverse of `isText`, called with the same arguments: extension checks
 * run first (if filename is available), and the slower buffer encoding detection is only the fallback.
 * The extension checks are performed using the resources https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions
 * @param filename The filename for the file/buffer if available
 * @param buffer The buffer for the file if available
 * @returns `null` when `isText` could not decide (it returned `null` or `undefined`). Otherwise a boolean that is the negation of the `isText` result.
 */
export function isBinary(filename, buffer) {
    const text = isText(filename, buffer);
    // Keep "undecided" distinct from "binary": do not negate a missing answer into `true`
    return text == null ? null : !text;
}
/**
 * Get the encoding of a buffer.
 * Checks the start, middle, and end of the buffer for characters that are unrecognized within UTF8 encoding.
 * History has shown that inspection at all three locations is necessary.
 *
 * A chunk is reported as binary when, decoded as UTF8, it contains the replacement
 * character (65533) or a control character with code 8 or below. A text file that
 * legitimately contains U+FFFD inside an inspected chunk is therefore reported as binary.
 * @param buffer The buffer to inspect
 * @param opts Optional settings
 * @param opts.chunkLength Number of bytes to inspect per chunk, defaults to `24`
 * @param opts.chunkBegin Byte offset of a single chunk to inspect. When provided, only that chunk is checked; when omitted, the start, middle, and end chunks are checked in that order.
 * @returns Will be `null` if `buffer` was not provided. Otherwise will be either `'utf8'` or `'binary'`
 */
export function getEncoding(buffer, opts) {
    // Check
    if (!buffer) {
        return null;
    }
    // Prepare
    const textEncoding = 'utf8';
    const binaryEncoding = 'binary';
    const options = opts == null ? {} : opts;
    const requestedLength = options.chunkLength;
    const requestedBegin = options.chunkBegin;
    const chunkLength = requestedLength == null ? 24 : requestedLength;
    // Discover: no explicit offset means "scan the three representative chunks"
    if (requestedBegin == null) {
        const startBegin = 0;
        // The middle chunk is the one that ends at the midpoint of the buffer
        const middleBegin = Math.max(0, Math.floor(buffer.length / 2) - chunkLength);
        const endBegin = Math.max(0, buffer.length - chunkLength);
        for (const chunkBegin of [startBegin, middleBegin, endBegin]) {
            const encoding = getEncoding(buffer, { chunkLength, chunkBegin });
            // Stop at the first chunk that is not text
            if (encoding !== textEncoding) {
                return encoding;
            }
        }
        return textEncoding;
    }
    // Extract: widen the chunk so that it does not cut a multibyte character in half
    const chunkBegin = getChunkBegin(buffer, requestedBegin);
    if (chunkBegin === -1) {
        // The chunk starts on a continuation byte that has no lead byte before it
        return binaryEncoding;
    }
    const chunkEnd = getChunkEnd(buffer, Math.min(buffer.length, chunkBegin + chunkLength));
    if (chunkEnd > buffer.length) {
        // The last character of the chunk needs more bytes than the buffer holds
        return binaryEncoding;
    }
    const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd);
    // Detect encoding
    for (let i = 0; i < contentChunkUTF8.length; ++i) {
        const charCode = contentChunkUTF8.charCodeAt(i);
        // 8 and below are control characters (e.g. backspace, null, eof, etc.)
        // 65533 is the unknown character
        if (charCode === 65533 || charCode <= 8) {
            return binaryEncoding;
        }
    }
    // Return
    return textEncoding;
}
// ====================================
// The functions below are created to handle multibyte utf8 characters.
// To understand how the encoding works, check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding
// @todo add documentation for these
/**
 * Move a chunk's start offset back to the first byte of the character it lands in.
 * If the byte at `chunkBegin` is a utf8 continuation byte (10xxxxxx), up to three preceding
 * bytes are searched for a lead byte whose character is long enough to include `chunkBegin`.
 * @param buf The buffer being inspected
 * @param chunkBegin The proposed start offset of the chunk
 * @returns `chunkBegin` itself when it is `0` or does not point at a continuation byte, the offset of the owning lead byte when one is found, or `-1` when no such lead byte exists.
 */
function getChunkBegin(buf, chunkBegin) {
    // If it's the beginning, just return.
    if (chunkBegin === 0) {
        return 0;
    }
    if (!isLaterByteOfUtf8(buf[chunkBegin])) {
        return chunkBegin;
    }
    // Look 3, then 2, then 1 byte(s) back. A lead byte `distance` bytes back only owns
    // the byte at `chunkBegin` if its character is longer than `distance` bytes.
    for (let distance = 3; distance >= 1; --distance) {
        const begin = chunkBegin - distance;
        if (begin < 0) {
            continue;
        }
        const candidate = buf[begin];
        if (isFirstByteOf4ByteChar(candidate) ||
            (distance <= 2 && isFirstByteOf3ByteChar(candidate)) ||
            (distance === 1 && isFirstByteOf2ByteChar(candidate))) {
            return begin;
        }
    }
    return -1;
}
/**
 * Move a chunk's (exclusive) end offset forward so the chunk does not stop partway through a multibyte utf8 character.
 * The last three bytes before `chunkEnd` are searched for a lead byte whose character
 * extends past `chunkEnd`; if one is found the end is pushed to the end of that character.
 * @param buf The buffer being inspected
 * @param chunkEnd The proposed exclusive end offset of the chunk
 * @returns The adjusted end offset. It is not clamped, so it can be greater than `buf.length` when the final character is missing bytes.
 */
function getChunkEnd(buf, chunkEnd) {
    // If it's the end, just return.
    if (chunkEnd === buf.length) {
        return chunkEnd;
    }
    // Look 3, then 2, then 1 byte(s) back. A lead byte `distance` bytes back has
    // (character length - distance) bytes still lying beyond `chunkEnd`.
    for (let distance = 3; distance >= 1; --distance) {
        const index = chunkEnd - distance;
        if (index < 0) {
            continue;
        }
        const candidate = buf[index];
        if (isFirstByteOf4ByteChar(candidate)) {
            return chunkEnd + (4 - distance);
        }
        if (distance <= 2 && isFirstByteOf3ByteChar(candidate)) {
            return chunkEnd + (3 - distance);
        }
        if (distance === 1 && isFirstByteOf2ByteChar(candidate)) {
            return chunkEnd + 1;
        }
    }
    return chunkEnd;
}
/**
 * Check whether a byte has the bit pattern 11110xxx, the lead byte of a 4-byte utf8 character.
 * @param byte The byte value to test
 * @returns `true` when the top five bits are 11110, otherwise `false` (including for `undefined`)
 */
function isFirstByteOf4ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    return byte >> 3 === 30; // 11110xxx?
}
/**
 * Check whether a byte has the bit pattern 1110xxxx, the lead byte of a 3-byte utf8 character.
 * @param byte The byte value to test
 * @returns `true` when the top four bits are 1110, otherwise `false` (including for `undefined`)
 */
function isFirstByteOf3ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    return byte >> 4 === 14; // 1110xxxx?
}
/**
 * Check whether a byte has the bit pattern 110xxxxx, the lead byte of a 2-byte utf8 character.
 * @param byte The byte value to test
 * @returns `true` when the top three bits are 110, otherwise `false` (including for `undefined`)
 */
function isFirstByteOf2ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    return byte >> 5 === 6; // 110xxxxx?
}
/**
 * Check whether a byte has the bit pattern 10xxxxxx, a continuation (non-first) byte of a multibyte utf8 character.
 * @param byte The byte value to test
 * @returns `true` when the top two bits are 10, otherwise `false` (including for `undefined`)
 */
function isLaterByteOfUtf8(byte) {
    // eslint-disable-next-line no-bitwise
    return byte >> 6 === 2; // 10xxxxxx?
}

1	`/* eslint no-use-before-define:0 */`
2	`import * as pathUtil from 'path';`
3	`import textExtensions from 'textextensions';`
4	`import binaryExtensions from 'binaryextensions';`
5	`/**`
6	`* Determine if the filename and/or buffer is text.`
7	`* Determined by extension checks first (if filename is available), otherwise if unknown extension or no filename, will perform a slower buffer encoding detection.`
8	`* This order is done, as extension checks are quicker, and also because encoding checks cannot guarantee accuracy for chars between utf8 and utf16.`
9	`* The extension checks are performed using the resources https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions`
10	`* @param filename The filename for the file/buffer if available`
11	`* @param buffer The buffer for the file if available`
12	* @returns Will be `null` if neither `filename` nor `buffer` were provided. Otherwise will be a boolean value with the detection result.
13	`*/`
14	`export function isText(filename, buffer) {`
15	`// Test extensions`
16	`if (filename) {`
17	`// Extract filename`
18	`const parts = pathUtil.basename(filename).split('.').reverse();`
19	`// Cycle extensions`
20	`for (const extension of parts) {`
21	`if (textExtensions.indexOf(extension) !== -1) {`
22	`return true;`
23	`}`
24	`if (binaryExtensions.indexOf(extension) !== -1) {`
25	`return false;`
26	`}`
27	`}`
28	`}`
29	`// Fallback to encoding if extension check was not enough`
30	`if (buffer) {`
31	`return getEncoding(buffer) === 'utf8';`
32	`}`
33	`// No buffer was provided`
34	`return null;`
35	`}`
36	`/**`
37	`* Determine if the filename and/or buffer is binary.`
38	`* Determined by extension checks first (if filename is available), otherwise if unknown extension or no filename, will perform a slower buffer encoding detection.`
39	`* This order is done, as extension checks are quicker, and also because encoding checks cannot guarantee accuracy for chars between utf8 and utf16.`
40	`* The extension checks are performed using the resources https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions`
41	`* @param filename The filename for the file/buffer if available`
42	`* @param buffer The buffer for the file if available`
43	* @returns Will be `null` if neither `filename` nor `buffer` were provided. Otherwise will be a boolean value with the detection result.
44	`*/`
45	`export function isBinary(filename, buffer) {`
46	`const text = isText(filename, buffer);`
47	`if (text == null)`
48	`return null;`
49	`return !text;`
50	`}`
51	`/**`
52	`* Get the encoding of a buffer.`
53	`* Checks the start, middle, and end of the buffer for characters that are unrecognized within UTF8 encoding.`
54	`* History has shown that inspection at all three locations is necessary.`
55	* @returns Will be `null` if `buffer` was not provided. Otherwise will be either `'utf8'` or `'binary'`
56	`*/`
57	`export function getEncoding(buffer, opts) {`
58	`var _a, _b;`
59	`// Check`
60	`if (!buffer)`
61	`return null;`
62	`// Prepare`
63	`const textEncoding = 'utf8';`
64	`const binaryEncoding = 'binary';`
65	`const chunkLength = (_a = opts === null \|\| opts === void 0 ? void 0 : opts.chunkLength) !== null && _a !== void 0 ? _a : 24;`
66	`let chunkBegin = (_b = opts === null \|\| opts === void 0 ? void 0 : opts.chunkBegin) !== null && _b !== void 0 ? _b : 0;`
67	`// Discover`
68	`if ((opts === null \|\| opts === void 0 ? void 0 : opts.chunkBegin) == null) {`
69	`// Start`
70	`let encoding = getEncoding(buffer, { chunkLength, chunkBegin });`
71	`if (encoding === textEncoding) {`
72	`// Middle`
73	`chunkBegin = Math.max(0, Math.floor(buffer.length / 2) - chunkLength);`
74	`encoding = getEncoding(buffer, {`
75	`chunkLength,`
76	`chunkBegin,`
77	`});`
78	`if (encoding === textEncoding) {`
79	`// End`
80	`chunkBegin = Math.max(0, buffer.length - chunkLength);`
81	`encoding = getEncoding(buffer, {`
82	`chunkLength,`
83	`chunkBegin,`
84	`});`
85	`}`
86	`}`
87	`// Return`
88	`return encoding;`
89	`}`
90	`else {`
91	`// Extract`
92	`chunkBegin = getChunkBegin(buffer, chunkBegin);`
93	`if (chunkBegin === -1) {`
94	`return binaryEncoding;`
95	`}`
96	`const chunkEnd = getChunkEnd(buffer, Math.min(buffer.length, chunkBegin + chunkLength));`
97	`if (chunkEnd > buffer.length) {`
98	`return binaryEncoding;`
99	`}`
100	`const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd);`
101	`// Detect encoding`
102	`for (let i = 0; i < contentChunkUTF8.length; ++i) {`
103	`const charCode = contentChunkUTF8.charCodeAt(i);`
104	`if (charCode === 65533 \|\| charCode <= 8) {`
105	`// 8 and below are control characters (e.g. backspace, null, eof, etc.)`
106	`// 65533 is the unknown character`
107	`// console.log(charCode, contentChunkUTF8[i])`
108	`return binaryEncoding;`
109	`}`
110	`}`
111	`// Return`
112	`return textEncoding;`
113	`}`
114	`}`
115	`// ====================================`
116	`// The functions below are created to handle multibyte utf8 characters.`
117	`// To understand how the encoding works, check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding`
118	`// @todo add documentation for these`
119	`function getChunkBegin(buf, chunkBegin) {`
120	`// If it's the beginning, just return.`
121	`if (chunkBegin === 0) {`
122	`return 0;`
123	`}`
124	`if (!isLaterByteOfUtf8(buf[chunkBegin])) {`
125	`return chunkBegin;`
126	`}`
127	`let begin = chunkBegin - 3;`
128	`if (begin >= 0) {`
129	`if (isFirstByteOf4ByteChar(buf[begin])) {`
130	`return begin;`
131	`}`
132	`}`
133	`begin = chunkBegin - 2;`
134	`if (begin >= 0) {`
135	`if (isFirstByteOf4ByteChar(buf[begin]) \|\|`
136	`isFirstByteOf3ByteChar(buf[begin])) {`
137	`return begin;`
138	`}`
139	`}`
140	`begin = chunkBegin - 1;`
141	`if (begin >= 0) {`
142	`// Is it a 4-byte, 3-byte utf8 character?`
143	`if (isFirstByteOf4ByteChar(buf[begin]) \|\|`
144	`isFirstByteOf3ByteChar(buf[begin]) \|\|`
145	`isFirstByteOf2ByteChar(buf[begin])) {`
146	`return begin;`
147	`}`
148	`}`
149	`return -1;`
150	`}`
151	`function getChunkEnd(buf, chunkEnd) {`
152	`// If it's the end, just return.`
153	`if (chunkEnd === buf.length) {`
154	`return chunkEnd;`
155	`}`
156	`let index = chunkEnd - 3;`
157	`if (index >= 0) {`
158	`if (isFirstByteOf4ByteChar(buf[index])) {`
159	`return chunkEnd + 1;`
160	`}`
161	`}`
162	`index = chunkEnd - 2;`
163	`if (index >= 0) {`
164	`if (isFirstByteOf4ByteChar(buf[index])) {`
165	`return chunkEnd + 2;`
166	`}`
167	`if (isFirstByteOf3ByteChar(buf[index])) {`
168	`return chunkEnd + 1;`
169	`}`
170	`}`
171	`index = chunkEnd - 1;`
172	`if (index >= 0) {`
173	`if (isFirstByteOf4ByteChar(buf[index])) {`
174	`return chunkEnd + 3;`
175	`}`
176	`if (isFirstByteOf3ByteChar(buf[index])) {`
177	`return chunkEnd + 2;`
178	`}`
179	`if (isFirstByteOf2ByteChar(buf[index])) {`
180	`return chunkEnd + 1;`
181	`}`
182	`}`
183	`return chunkEnd;`
184	`}`
185	`function isFirstByteOf4ByteChar(byte) {`
186	`// eslint-disable-next-line no-bitwise`
187	`return byte >> 3 === 30; // 11110xxx?`
188	`}`
189	`function isFirstByteOf3ByteChar(byte) {`
190	`// eslint-disable-next-line no-bitwise`
191	`return byte >> 4 === 14; // 1110xxxx?`
192	`}`
193	`function isFirstByteOf2ByteChar(byte) {`
194	`// eslint-disable-next-line no-bitwise`
195	`return byte >> 5 === 6; // 110xxxxx?`
196	`}`
197	`function isLaterByteOfUtf8(byte) {`
198	`// eslint-disable-next-line no-bitwise`
199	`return byte >> 6 === 2; // 10xxxxxx?`
200	`}`