1 |
|
2 |
|
3 |
|
4 | import type Buffer from 'buffer'
|
5 | import * as pathUtil from 'path'
|
6 | import textExtensions from 'textextensions'
|
7 | import binaryExtensions from 'binaryextensions'
|
8 |
|
/** Options controlling which part of a buffer `getEncoding` samples. */
export interface EncodingOpts {
  /**
   * Number of bytes to sample per chunk.
   * `getEncoding` falls back to 24 when this is omitted.
   */
  chunkLength?: number

  /**
   * Byte offset at which the sampled chunk starts.
   * When omitted, `getEncoding` samples the start, the middle and the end
   * of the buffer instead of a single chunk.
   */
  chunkBegin?: number
}
|
16 |
|
17 |
|
18 |
|
19 |
|
20 |
|
21 |
|
22 |
|
23 |
|
24 |
|
25 |
|
/**
 * Decide whether a file is text, first from its name and then from its content.
 *
 * The basename is split on `.` and every segment is checked, last segment
 * first, against the known text and binary extension lists; the first segment
 * found in either list decides the result. Only when no segment is recognised
 * (or no filename is given) is the buffer inspected.
 *
 * @param filename Path or name of the file; ignored when empty or nullish.
 * @param buffer File content, consulted only when the filename is inconclusive.
 * @returns `true` for text, `false` for binary, or `null` when neither the
 * filename nor a buffer could decide it.
 */
export function isText(
  filename?: string | null,
  buffer?: Buffer | null
): boolean | null {
  if (filename) {
    const segments = pathUtil.basename(filename).split('.')
    // Visit the right-most segment first so the last known extension wins.
    segments.reverse()

    for (const segment of segments) {
      if (textExtensions.includes(segment)) return true
      if (binaryExtensions.includes(segment)) return false
    }
  }

  // The name was inconclusive or absent: fall back to sniffing the content.
  return buffer ? getEncoding(buffer) === 'utf8' : null
}
|
54 |
|
55 |
|
56 |
|
57 |
|
58 |
|
59 |
|
60 |
|
61 |
|
62 |
|
63 |
|
/**
 * Decide whether a file is binary; the logical inverse of `isText`.
 *
 * @param filename Path or name of the file, forwarded to `isText`.
 * @param buffer File content, forwarded to `isText`.
 * @returns `true` for binary, `false` for text, or `null` when `isText`
 * could not decide.
 */
export function isBinary(filename?: string | null, buffer?: Buffer | null) {
  const verdict = isText(filename, buffer)
  // Keep "undetermined" as null rather than negating it into `true`.
  return verdict == null ? null : !verdict
}
|
69 |
|
70 |
|
71 |
|
72 |
|
73 |
|
74 |
|
75 |
|
/**
 * Classify a buffer as `'utf8'` text or `'binary'` by sampling its bytes.
 *
 * With an explicit `opts.chunkBegin`, exactly one chunk of `opts.chunkLength`
 * bytes (default 24) is examined, widened to UTF-8 character boundaries.
 * Without it, the start, the middle and the end of the buffer are examined in
 * that order and the first non-text verdict is returned.
 *
 * @param buffer Content to inspect.
 * @param opts Optional chunk size and chunk offset.
 * @returns `'utf8'`, `'binary'`, or `null` when no buffer was supplied.
 */
export function getEncoding(
  buffer: Buffer | null,
  opts?: EncodingOpts
): 'utf8' | 'binary' | null {
  if (!buffer) return null

  const chunkLength = opts?.chunkLength ?? 24
  const requestedBegin = opts?.chunkBegin

  if (requestedBegin != null) {
    // Single-chunk mode: align the window to whole UTF-8 characters first.
    const begin = getChunkBegin(buffer, requestedBegin)
    if (begin === -1) return 'binary'

    const end = getChunkEnd(
      buffer,
      Math.min(buffer.length, begin + chunkLength)
    )
    // The last character would need bytes past the end of the buffer.
    if (end > buffer.length) return 'binary'

    const decoded = buffer.toString('utf8', begin, end)
    for (let i = 0; i < decoded.length; ++i) {
      const code = decoded.charCodeAt(i)
      // 65533 is U+FFFD (the replacement character); codes 0-8 are control
      // characters up to backspace. Either one marks the chunk as binary.
      if (code === 65533 || code <= 8) return 'binary'
    }
    return 'utf8'
  }

  // No explicit offset: probe start, middle and end, stopping at the first
  // chunk that is not text.
  const offsets = [
    0,
    Math.max(0, Math.floor(buffer.length / 2) - chunkLength),
    Math.max(0, buffer.length - chunkLength),
  ]

  let verdict: 'utf8' | 'binary' | null = 'utf8'
  for (const chunkBegin of offsets) {
    verdict = getEncoding(buffer, { chunkLength, chunkBegin })
    if (verdict !== 'utf8') break
  }
  return verdict
}
|
145 |
|
146 |
|
147 |
|
148 |
|
149 |
|
150 |
|
/**
 * Move a chunk start backwards so it lands on the first byte of a UTF-8
 * character instead of in the middle of one.
 *
 * @param buf Buffer being sampled.
 * @param chunkBegin Proposed start offset.
 * @returns The aligned start offset, or `-1` when `chunkBegin` points at a
 * continuation byte and no matching lead byte is found within the 3 bytes
 * before it.
 */
function getChunkBegin(buf: Buffer, chunkBegin: number) {
  // Offset 0, or any byte that is not a continuation byte, is already aligned.
  if (chunkBegin === 0 || !isLaterByteOfUtf8(buf[chunkBegin])) {
    return chunkBegin
  }

  // Look back 3, 2, then 1 byte. A lead byte found `back` bytes earlier is
  // only accepted if its sequence is long enough to reach `chunkBegin`:
  // 3 back needs a 4-byte lead, 2 back a 3- or 4-byte lead, 1 back any lead.
  for (let back = 3; back >= 1; --back) {
    const candidate = chunkBegin - back
    if (candidate < 0) continue

    const lead = buf[candidate]
    const coversChunkBegin =
      isFirstByteOf4ByteChar(lead) ||
      (back <= 2 && isFirstByteOf3ByteChar(lead)) ||
      (back <= 1 && isFirstByteOf2ByteChar(lead))

    if (coversChunkBegin) return candidate
  }

  return -1
}
|
195 |
|
/**
 * Move a chunk end (exclusive) forwards so it does not cut a multi-byte UTF-8
 * character in half.
 *
 * @param buf Buffer being sampled.
 * @param chunkEnd Proposed exclusive end offset.
 * @returns The end offset extended past the character that straddles it, or
 * `chunkEnd` unchanged when nothing straddles it. The result may exceed
 * `buf.length`; no bounds check is done here.
 */
function getChunkEnd(buf: Buffer, chunkEnd: number) {
  // Ending exactly at the end of the buffer: there is nothing to extend into.
  if (chunkEnd === buf.length) return chunkEnd

  // Inspect the 3 bytes before the cut, farthest first. A lead byte of a
  // `width`-byte character sitting `back` bytes before the cut needs
  // `width - back` more bytes after it.
  for (let back = 3; back >= 1; --back) {
    const index = chunkEnd - back
    if (index < 0) continue

    const byte = buf[index]
    if (isFirstByteOf4ByteChar(byte)) return chunkEnd + (4 - back)
    if (back <= 2 && isFirstByteOf3ByteChar(byte)) return chunkEnd + (3 - back)
    if (back <= 1 && isFirstByteOf2ByteChar(byte)) return chunkEnd + (2 - back)
  }

  return chunkEnd
}
|
240 |
|
/** True when `byte` matches 11110xxx, the lead byte of a 4-byte UTF-8 sequence. */
function isFirstByteOf4ByteChar(byte: number) {
  // Drop the 3 payload bits and compare what remains with the 11110 marker.
  const marker = byte >> 3
  return marker === 0b11110
}
|
245 |
|
/** True when `byte` matches 1110xxxx, the lead byte of a 3-byte UTF-8 sequence. */
function isFirstByteOf3ByteChar(byte: number) {
  // Drop the 4 payload bits and compare what remains with the 1110 marker.
  const marker = byte >> 4
  return marker === 0b1110
}
|
250 |
|
/** True when `byte` matches 110xxxxx, the lead byte of a 2-byte UTF-8 sequence. */
function isFirstByteOf2ByteChar(byte: number) {
  // Drop the 5 payload bits and compare what remains with the 110 marker.
  const marker = byte >> 5
  return marker === 0b110
}
|
255 |
|
/** True when `byte` matches 10xxxxxx, a continuation byte of a multi-byte UTF-8 sequence. */
function isLaterByteOfUtf8(byte: number) {
  // Drop the 6 payload bits and compare what remains with the 10 marker.
  const marker = byte >> 6
  return marker === 0b10
}
|