UNPKG

istextorbinary/edition-browsers/index.js

Version:

7.34 kBJavaScriptView Raw

1/* eslint no-use-before-define:0 */
2import * as pathUtil from 'path';
3import textExtensions from 'textextensions';
4import binaryExtensions from 'binaryextensions';
/**
 * Determine if the filename and/or buffer is text.
 * Determined by extension checks first (if filename is available), otherwise if unknown extension or no filename, will perform a slower buffer encoding detection.
 * This order is done, as extension checks are quicker, and also because encoding checks cannot guarantee accuracy for chars between utf8 and utf16.
 * The extension checks are performed using the resources https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions
 * @param filename The filename for the file/buffer if available
 * @param buffer The buffer for the file if available
 * @returns `true` or `false` when a known extension, or failing that the buffer, decides the result. Will be `null` when no known extension was found and no `buffer` was provided (this includes the case where neither argument was provided).
 */
export function isText(filename, buffer) {
    // Test extensions
    if (filename) {
        // Every dot separated part of the base name is tried, last part first,
        // so `archive.tar.gz` is checked as `gz`, then `tar`, then `archive`
        const parts = pathUtil.basename(filename).split('.').reverse();
        for (const extension of parts) {
            if (textExtensions.includes(extension)) {
                return true;
            }
            if (binaryExtensions.includes(extension)) {
                return false;
            }
        }
    }
    // Fallback to encoding if extension check was not enough
    if (buffer) {
        return getEncoding(buffer) === 'utf8';
    }
    // No known extension and no buffer, so nothing is left to decide with
    return null;
}
/**
 * Determine if the filename and/or buffer is binary.
 * This is the negation of the result of `isText` for the same arguments, so the same order applies: extension checks first (if filename is available), then the slower buffer encoding detection.
 * The extension checks are performed using the resources https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions
 * @param filename The filename for the file/buffer if available
 * @param buffer The buffer for the file if available
 * @returns Will be `null` whenever `isText` could not decide (it returned `null` or `undefined`). Otherwise will be a boolean value with the detection result.
 */
export function isBinary(filename, buffer) {
    const text = isText(filename, buffer);
    // An undecided result is passed through as `null` rather than negated into `true`
    if (text == null) {
        return null;
    }
    return !text;
}
/**
 * Get the encoding of a buffer.
 * Checks the start, middle, and end of the buffer for characters that are unrecognized within UTF8 encoding.
 * History has shown that inspection at all three locations is necessary.
 * @param buffer The bytes to inspect, either a `Buffer` or a plain `Uint8Array`
 * @param opts Optional settings
 * @param opts.chunkLength How many bytes to inspect at each location, defaults to `24`
 * @param opts.chunkBegin When provided, only the single chunk starting at this byte offset is inspected, instead of the start, middle, and end
 * @returns Will be `null` if `buffer` was not provided. Otherwise will be either `'utf8'` or `'binary'`
 */
export function getEncoding(buffer, opts) {
    // Check
    if (!buffer) {
        return null;
    }
    // Prepare
    const textEncoding = 'utf8';
    const binaryEncoding = 'binary';
    const chunkLength = opts?.chunkLength ?? 24;
    // Discover
    if (opts?.chunkBegin == null) {
        // Start, middle, end
        const sampleOffsets = [
            0,
            Math.max(0, Math.floor(buffer.length / 2) - chunkLength),
            Math.max(0, buffer.length - chunkLength),
        ];
        let encoding = textEncoding;
        for (const chunkBegin of sampleOffsets) {
            encoding = getEncoding(buffer, { chunkLength, chunkBegin });
            // The first chunk that is not text settles the result
            if (encoding !== textEncoding) {
                break;
            }
        }
        return encoding;
    }
    // Extract, moving the boundaries so the chunk neither starts nor ends inside a multibyte character
    const chunkBegin = getChunkBegin(buffer, opts.chunkBegin);
    if (chunkBegin === -1) {
        // The chunk starts on a continuation byte that has no first byte in front of it
        return binaryEncoding;
    }
    const chunkEnd = getChunkEnd(buffer, Math.min(buffer.length, chunkBegin + chunkLength));
    if (chunkEnd > buffer.length) {
        // The last character needs more bytes than the buffer contains
        return binaryEncoding;
    }
    // A plain Uint8Array inherits the generic array `toString`, which ignores its arguments and returns
    // comma separated numbers that would always look like text, so such input is decoded explicitly.
    // `ignoreBOM: true` keeps a leading byte order mark in the output, as `Buffer#toString` does.
    const isPlainUint8Array = buffer instanceof Uint8Array &&
        buffer.toString === Uint8Array.prototype.toString;
    const contentChunkUTF8 = isPlainUint8Array
        ? new TextDecoder('utf-8', { ignoreBOM: true }).decode(buffer.subarray(chunkBegin, chunkEnd))
        : buffer.toString(textEncoding, chunkBegin, chunkEnd);
    // Detect encoding
    for (let i = 0; i < contentChunkUTF8.length; ++i) {
        const charCode = contentChunkUTF8.charCodeAt(i);
        // 8 and below are control characters (e.g. backspace, null, eof, etc.)
        // 65533 is the unknown character
        if (charCode === 65533 || charCode <= 8) {
            return binaryEncoding;
        }
    }
    // Return
    return textEncoding;
}
// ====================================
// The functions below handle multibyte utf8 characters that would otherwise be cut in half at the edge of a chunk.
// To understand how the encoding works, check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding
// @todo add documentation for these
/**
 * Move the beginning of a chunk back to the first byte of the multibyte utf8 character it falls inside of.
 * @param buf The bytes the chunk is taken from
 * @param chunkBegin The desired byte offset for the beginning of the chunk
 * @returns `chunkBegin` itself if it is `0` or does not point at a continuation byte. Otherwise the offset, at most 3 bytes earlier, of a first byte whose character is long enough to reach `chunkBegin`, or `-1` if there is no such byte.
 */
function getChunkBegin(buf, chunkBegin) {
    // The beginning of the buffer, and anything that is not a continuation byte, is already a boundary
    if (chunkBegin === 0 || !isLaterByteOfUtf8(buf[chunkBegin])) {
        return chunkBegin;
    }
    // Look back from the farthest candidate to the nearest.
    // A first byte `distance` bytes back only counts if its character has more than `distance` bytes:
    // 3 back must start a 4-byte character, 2 back a 3 or 4-byte one, 1 back a 2, 3 or 4-byte one.
    for (let distance = 3; distance >= 1; --distance) {
        const begin = chunkBegin - distance;
        if (begin < 0) {
            continue;
        }
        const firstByte = buf[begin];
        const reachesChunkBegin = isFirstByteOf4ByteChar(firstByte) ||
            (distance <= 2 && isFirstByteOf3ByteChar(firstByte)) ||
            (distance === 1 && isFirstByteOf2ByteChar(firstByte));
        if (reachesChunkBegin) {
            return begin;
        }
    }
    // A continuation byte without a first byte in front of it
    return -1;
}
/**
 * Move the end of a chunk forward so that it does not cut a multibyte utf8 character in half.
 * @param buf The bytes the chunk is taken from
 * @param chunkEnd The desired byte offset for the end of the chunk, the byte at this offset is not part of the chunk
 * @returns `chunkEnd` itself if it is the end of `buf` or none of the 3 bytes before it starts a character that continues past it. Otherwise `chunkEnd` plus the number of bytes that character still needs, which can be greater than `buf.length` when the character is incomplete.
 */
function getChunkEnd(buf, chunkEnd) {
    // If it's the end, just return.
    if (chunkEnd === buf.length) {
        return chunkEnd;
    }
    // Look back from the farthest candidate to the nearest for a first byte
    // whose character has more bytes than fit in front of `chunkEnd`
    for (let distance = 3; distance >= 1; --distance) {
        const index = chunkEnd - distance;
        if (index < 0) {
            continue;
        }
        const firstByte = buf[index];
        // Total byte count of the character this byte starts, 0 if it does not start a multibyte character
        let charLength = 0;
        if (isFirstByteOf4ByteChar(firstByte)) {
            charLength = 4;
        }
        else if (isFirstByteOf3ByteChar(firstByte)) {
            charLength = 3;
        }
        else if (isFirstByteOf2ByteChar(firstByte)) {
            charLength = 2;
        }
        if (charLength > distance) {
            // `distance` bytes are already inside the chunk, extend by the remainder
            return chunkEnd + (charLength - distance);
        }
    }
    return chunkEnd;
}
/**
 * Check if a byte is the first byte of a 4-byte utf8 character.
 * @param byte The byte value to check
 * @returns `true` if the byte has the bit pattern `11110xxx`, otherwise `false`
 */
function isFirstByteOf4ByteChar(byte) {
    // Dropping the 3 payload bits leaves only the marker bits to compare
    // eslint-disable-next-line no-bitwise
    return byte >> 3 === 0b11110;
}
/**
 * Check if a byte is the first byte of a 3-byte utf8 character.
 * @param byte The byte value to check
 * @returns `true` if the byte has the bit pattern `1110xxxx`, otherwise `false`
 */
function isFirstByteOf3ByteChar(byte) {
    // Dropping the 4 payload bits leaves only the marker bits to compare
    // eslint-disable-next-line no-bitwise
    return byte >> 4 === 0b1110;
}
/**
 * Check if a byte is the first byte of a 2-byte utf8 character.
 * @param byte The byte value to check
 * @returns `true` if the byte has the bit pattern `110xxxxx`, otherwise `false`
 */
function isFirstByteOf2ByteChar(byte) {
    // Dropping the 5 payload bits leaves only the marker bits to compare
    // eslint-disable-next-line no-bitwise
    return byte >> 5 === 0b110;
}
/**
 * Check if a byte is a continuation byte, that is any byte of a multibyte utf8 character other than its first.
 * @param byte The byte value to check
 * @returns `true` if the byte has the bit pattern `10xxxxxx`, otherwise `false`
 */
function isLaterByteOfUtf8(byte) {
    // Dropping the 6 payload bits leaves only the marker bits to compare
    // eslint-disable-next-line no-bitwise
    return byte >> 6 === 0b10;
}

1	`/* eslint no-use-before-define:0 */`
2	`import * as pathUtil from 'path';`
3	`import textExtensions from 'textextensions';`
4	`import binaryExtensions from 'binaryextensions';`
5	`/**`
6	`* Determine if the filename and/or buffer is text.`
7	`* Determined by extension checks first (if filename is available), otherwise if unknown extension or no filename, will perform a slower buffer encoding detection.`
8	`* This order is done, as extension checks are quicker, and also because encoding checks cannot guarantee accuracy for chars between utf8 and utf16.`
9	`* The extension checks are performed using the resources https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions`
10	`* @param filename The filename for the file/buffer if available`
11	`* @param buffer The buffer for the file if available`
12	* @returns Will be `null` if neither `filename` nor `buffer` were provided. Otherwise will be a boolean value with the detection result.
13	`*/`
14	`export function isText(filename, buffer) {`
15	`// Test extensions`
16	`if (filename) {`
17	`// Extract filename`
18	`const parts = pathUtil.basename(filename).split('.').reverse();`
19	`// Cycle extensions`
20	`for (const extension of parts) {`
21	`if (textExtensions.indexOf(extension) !== -1) {`
22	`return true;`
23	`}`
24	`if (binaryExtensions.indexOf(extension) !== -1) {`
25	`return false;`
26	`}`
27	`}`
28	`}`
29	`// Fallback to encoding if extension check was not enough`
30	`if (buffer) {`
31	`return getEncoding(buffer) === 'utf8';`
32	`}`
33	`// No buffer was provided`
34	`return null;`
35	`}`
36	`/**`
37	`* Determine if the filename and/or buffer is binary.`
38	`* Determined by extension checks first (if filename is available), otherwise if unknown extension or no filename, will perform a slower buffer encoding detection.`
39	`* This order is done, as extension checks are quicker, and also because encoding checks cannot guarantee accuracy for chars between utf8 and utf16.`
40	`* The extension checks are performed using the resources https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions`
41	`* @param filename The filename for the file/buffer if available`
42	`* @param buffer The buffer for the file if available`
43	* @returns Will be `null` if neither `filename` nor `buffer` were provided. Otherwise will be a boolean value with the detection result.
44	`*/`
45	`export function isBinary(filename, buffer) {`
46	`const text = isText(filename, buffer);`
47	`if (text == null)`
48	`return null;`
49	`return !text;`
50	`}`
51	`/**`
52	`* Get the encoding of a buffer.`
53	`* Checks the start, middle, and end of the buffer for characters that are unrecognized within UTF8 encoding.`
54	`* History has shown that inspection at all three locations is necessary.`
55	* @returns Will be `null` if `buffer` was not provided. Otherwise will be either `'utf8'` or `'binary'`
56	`*/`
57	`export function getEncoding(buffer, opts) {`
58	`// Check`
59	`if (!buffer)`
60	`return null;`
61	`// Prepare`
62	`const textEncoding = 'utf8';`
63	`const binaryEncoding = 'binary';`
64	`const chunkLength = opts?.chunkLength ?? 24;`
65	`let chunkBegin = opts?.chunkBegin ?? 0;`
66	`// Discover`
67	`if (opts?.chunkBegin == null) {`
68	`// Start`
69	`let encoding = getEncoding(buffer, { chunkLength, chunkBegin });`
70	`if (encoding === textEncoding) {`
71	`// Middle`
72	`chunkBegin = Math.max(0, Math.floor(buffer.length / 2) - chunkLength);`
73	`encoding = getEncoding(buffer, {`
74	`chunkLength,`
75	`chunkBegin,`
76	`});`
77	`if (encoding === textEncoding) {`
78	`// End`
79	`chunkBegin = Math.max(0, buffer.length - chunkLength);`
80	`encoding = getEncoding(buffer, {`
81	`chunkLength,`
82	`chunkBegin,`
83	`});`
84	`}`
85	`}`
86	`// Return`
87	`return encoding;`
88	`}`
89	`else {`
90	`// Extract`
91	`chunkBegin = getChunkBegin(buffer, chunkBegin);`
92	`if (chunkBegin === -1) {`
93	`return binaryEncoding;`
94	`}`
95	`const chunkEnd = getChunkEnd(buffer, Math.min(buffer.length, chunkBegin + chunkLength));`
96	`if (chunkEnd > buffer.length) {`
97	`return binaryEncoding;`
98	`}`
99	`const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd);`
100	`// Detect encoding`
101	`for (let i = 0; i < contentChunkUTF8.length; ++i) {`
102	`const charCode = contentChunkUTF8.charCodeAt(i);`
103	`if (charCode === 65533 \|\| charCode <= 8) {`
104	`// 8 and below are control characters (e.g. backspace, null, eof, etc.)`
105	`// 65533 is the unknown character`
106	`// console.log(charCode, contentChunkUTF8[i])`
107	`return binaryEncoding;`
108	`}`
109	`}`
110	`// Return`
111	`return textEncoding;`
112	`}`
113	`}`
114	`// ====================================`
115	`// The functions below are created to handle multibyte utf8 characters.`
116	`// To understand how the encoding works, check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding`
117	`// @todo add documentation for these`
118	`function getChunkBegin(buf, chunkBegin) {`
119	`// If it's the beginning, just return.`
120	`if (chunkBegin === 0) {`
121	`return 0;`
122	`}`
123	`if (!isLaterByteOfUtf8(buf[chunkBegin])) {`
124	`return chunkBegin;`
125	`}`
126	`let begin = chunkBegin - 3;`
127	`if (begin >= 0) {`
128	`if (isFirstByteOf4ByteChar(buf[begin])) {`
129	`return begin;`
130	`}`
131	`}`
132	`begin = chunkBegin - 2;`
133	`if (begin >= 0) {`
134	`if (isFirstByteOf4ByteChar(buf[begin]) \|\|`
135	`isFirstByteOf3ByteChar(buf[begin])) {`
136	`return begin;`
137	`}`
138	`}`
139	`begin = chunkBegin - 1;`
140	`if (begin >= 0) {`
141	`// Is it a 4-byte, 3-byte utf8 character?`
142	`if (isFirstByteOf4ByteChar(buf[begin]) \|\|`
143	`isFirstByteOf3ByteChar(buf[begin]) \|\|`
144	`isFirstByteOf2ByteChar(buf[begin])) {`
145	`return begin;`
146	`}`
147	`}`
148	`return -1;`
149	`}`
150	`function getChunkEnd(buf, chunkEnd) {`
151	`// If it's the end, just return.`
152	`if (chunkEnd === buf.length) {`
153	`return chunkEnd;`
154	`}`
155	`let index = chunkEnd - 3;`
156	`if (index >= 0) {`
157	`if (isFirstByteOf4ByteChar(buf[index])) {`
158	`return chunkEnd + 1;`
159	`}`
160	`}`
161	`index = chunkEnd - 2;`
162	`if (index >= 0) {`
163	`if (isFirstByteOf4ByteChar(buf[index])) {`
164	`return chunkEnd + 2;`
165	`}`
166	`if (isFirstByteOf3ByteChar(buf[index])) {`
167	`return chunkEnd + 1;`
168	`}`
169	`}`
170	`index = chunkEnd - 1;`
171	`if (index >= 0) {`
172	`if (isFirstByteOf4ByteChar(buf[index])) {`
173	`return chunkEnd + 3;`
174	`}`
175	`if (isFirstByteOf3ByteChar(buf[index])) {`
176	`return chunkEnd + 2;`
177	`}`
178	`if (isFirstByteOf2ByteChar(buf[index])) {`
179	`return chunkEnd + 1;`
180	`}`
181	`}`
182	`return chunkEnd;`
183	`}`
184	`function isFirstByteOf4ByteChar(byte) {`
185	`// eslint-disable-next-line no-bitwise`
186	`return byte >> 3 === 30; // 11110xxx?`
187	`}`
188	`function isFirstByteOf3ByteChar(byte) {`
189	`// eslint-disable-next-line no-bitwise`
190	`return byte >> 4 === 14; // 1110xxxx?`
191	`}`
192	`function isFirstByteOf2ByteChar(byte) {`
193	`// eslint-disable-next-line no-bitwise`
194	`return byte >> 5 === 6; // 110xxxxx?`
195	`}`
196	`function isLaterByteOfUtf8(byte) {`
197	`// eslint-disable-next-line no-bitwise`
198	`return byte >> 6 === 2; // 10xxxxxx?`
199	`}`