1 | "use strict";
|
2 | var Buffer = require("safer-buffer").Buffer;
|
3 |
|
4 |
|
5 |
|
6 |
|
7 |
|
// Expose the codec constructor on the module exports under the `_dbcs` key.
exports._dbcs = DBCSCodec;

// Negative marker values. Table entries that are >= 0 are used as character
// codes; negative entries are interpreted according to the markers below.
var UNASSIGNED = -1;     // Entry has no mapping.
var GB18030_CODE = -2;   // Entry is resolved through the GB18030 range tables.
var SEQ_START = -10;     // Entries <= SEQ_START (and > NODE_START) index a sequence list: idx = SEQ_START - entry.
var NODE_START = -1000;  // Entries <= NODE_START reference another table: idx = NODE_START - entry.
var DEF_CHAR = -1;       // Key under which an encode-sequence node stores its fallback code.

// Template table of 0x100 unassigned entries; users copy it with slice(0).
var UNASSIGNED_NODE = new Array(0x100);
for (var i = 0; i < UNASSIGNED_NODE.length; i++) {
    UNASSIGNED_NODE[i] = UNASSIGNED;
}
|
19 |
|
20 |
|
21 |
|
// Codec constructor: builds the decoding trie and the encoding tables from a
// chunked mapping table.
//
// codecOptions:
//   encodingName   - name used in error messages.
//   table          - function returning an array of chunks (see _addDecodeChunk).
//   gb18030        - optional function returning the GB18030 range data; the
//                    encoder/decoder read its `uChars` and `gbChars` arrays.
//   encodeSkipVals - optional array of codes (numbers or {from, to} ranges)
//                    that must not be produced by the encoder.
//   encodeAdd      - optional object mapping a character to the code it
//                    should additionally encode to.
// iconv: object providing `defaultCharUnicode` and `defaultCharSingleByte`.
//
// Throws Error when codecOptions or codecOptions.table is missing, or when the
// GB18030 nodes conflict with entries already present in the table.
function DBCSCodec(codecOptions, iconv) {
    // Validate first: reading a property of a missing codecOptions would raise
    // a TypeError and hide the intended error message.
    if (!codecOptions)
        throw new Error("DBCS codec is called without the data.");
    this.encodingName = codecOptions.encodingName;
    if (!codecOptions.table)
        throw new Error("Encoding '" + this.encodingName + "' has no data.");

    // Load the mapping table.
    var mappingTable = codecOptions.table();

    // Decode tables: a trie of 256-entry nodes; node 0 is the root and is
    // indexed by the first byte of a code.
    this.decodeTables = [];
    this.decodeTables[0] = UNASSIGNED_NODE.slice(0);

    // Lists of characters for codes that decode to more than one character.
    this.decodeTableSeq = [];

    // Fill the decode tables chunk by chunk.
    for (var i = 0; i < mappingTable.length; i++)
        this._addDecodeChunk(mappingTable[i]);

    // Add the nodes needed to recognise 4-byte GB18030 codes.
    if (typeof codecOptions.gb18030 === 'function') {
        this.gb18030 = codecOptions.gb18030();

        // Shared nodes used wherever the table has nothing more specific for
        // the third and fourth byte.
        var commonThirdByteNodeIdx = this.decodeTables.length;
        this.decodeTables.push(UNASSIGNED_NODE.slice(0));

        var commonFourthByteNodeIdx = this.decodeTables.length;
        this.decodeTables.push(UNASSIGNED_NODE.slice(0));

        // Walk every byte1 (0x81-0xFE) / byte2 (0x30-0x39) / byte3 (0x81-0xFE)
        // / byte4 (0x30-0x39) path, linking unassigned entries to the shared
        // nodes and marking unassigned fourth bytes as GB18030_CODE.
        var firstByteNode = this.decodeTables[0];
        for (var i = 0x81; i <= 0xFE; i++) {
            var secondByteNode = this.decodeTables[NODE_START - firstByteNode[i]];
            for (var j = 0x30; j <= 0x39; j++) {
                if (secondByteNode[j] === UNASSIGNED) {
                    secondByteNode[j] = NODE_START - commonThirdByteNodeIdx;
                } else if (secondByteNode[j] > NODE_START) {
                    // A character (or other non-node value) already sits here.
                    throw new Error("gb18030 decode tables conflict at byte 2");
                }

                var thirdByteNode = this.decodeTables[NODE_START - secondByteNode[j]];
                for (var k = 0x81; k <= 0xFE; k++) {
                    if (thirdByteNode[k] === UNASSIGNED) {
                        thirdByteNode[k] = NODE_START - commonFourthByteNodeIdx;
                    } else if (thirdByteNode[k] === NODE_START - commonFourthByteNodeIdx) {
                        // Already linked to the shared node, which has been filled.
                        continue;
                    } else if (thirdByteNode[k] > NODE_START) {
                        throw new Error("gb18030 decode tables conflict at byte 3");
                    }

                    var fourthByteNode = this.decodeTables[NODE_START - thirdByteNode[k]];
                    for (var l = 0x30; l <= 0x39; l++) {
                        if (fourthByteNode[l] === UNASSIGNED)
                            fourthByteNode[l] = GB18030_CODE;
                    }
                }
            }
        }
    }

    this.defaultCharUnicode = iconv.defaultCharUnicode;

    // Encode tables: encodeTable[uCode >> 8][uCode & 0xFF] holds the code for
    // a character, UNASSIGNED, or (when <= SEQ_START) a reference into
    // encodeTableSeq.
    this.encodeTable = [];

    // Nodes describing multi-character sequences; keys are character codes,
    // values are codes or nested nodes, DEF_CHAR holds the fallback code.
    this.encodeTableSeq = [];

    // Collect the codes that must be skipped when building the encode table.
    var skipEncodeChars = {};
    if (codecOptions.encodeSkipVals)
        for (var i = 0; i < codecOptions.encodeSkipVals.length; i++) {
            var val = codecOptions.encodeSkipVals[i];
            if (typeof val === 'number')
                skipEncodeChars[val] = true;
            else
                for (var j = val.from; j <= val.to; j++)
                    skipEncodeChars[j] = true;
        }

    // Derive the encode table from the decode trie.
    this._fillEncodeTable(0, 0, skipEncodeChars);

    // Add explicit extra encode pairs.
    if (codecOptions.encodeAdd) {
        for (var uChar in codecOptions.encodeAdd)
            if (Object.prototype.hasOwnProperty.call(codecOptions.encodeAdd, uChar))
                this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]);
    }

    // Code written for unencodable characters: the configured default
    // character, else the encoding of '?', else the literal byte for '?'.
    this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)];
    // The table is indexed by character code, not by the character itself.
    if (this.defCharSB === UNASSIGNED) this.defCharSB = this.encodeTable[0]["?".charCodeAt(0)];
    if (this.defCharSB === UNASSIGNED) this.defCharSB = "?".charCodeAt(0);
}
|
138 |
|
// Constructors used to create encoder and decoder objects for this codec.
DBCSCodec.prototype.encoder = DBCSEncoder;
DBCSCodec.prototype.decoder = DBCSDecoder;

// Returns the trie node (256-entry table) in which the last byte of `addr`
// is looked up, creating intermediate nodes for the leading bytes as needed.
// Throws Error when a leading byte is already occupied by a non-node entry.
DBCSCodec.prototype._getDecodeTrieNode = function(addr) {
    // Split the address into bytes, least significant first. A copy is
    // shifted so that `addr` is still intact for the error message below.
    var bytes = [];
    for (var rest = addr; rest > 0; rest >>>= 8)
        bytes.push(rest & 0xFF);
    if (bytes.length == 0)
        bytes.push(0);

    // Descend from the root over every byte except the last one.
    var node = this.decodeTables[0];
    for (var i = bytes.length-1; i > 0; i--) {
        var val = node[bytes[i]];

        if (val == UNASSIGNED) {
            // Create a new node and link it from the current one.
            node[bytes[i]] = NODE_START - this.decodeTables.length;
            this.decodeTables.push(node = UNASSIGNED_NODE.slice(0));
        }
        else if (val <= NODE_START) {
            // Follow the existing link.
            node = this.decodeTables[NODE_START - val];
        }
        else
            throw new Error("Overwrite byte in " + this.encodingName + ", addr: " + addr.toString(16));
    }
    return node;
};
|
166 |
|
167 |
|
// Writes one mapping-table chunk into the decode trie.
// chunk[0] is the hexadecimal address of the first code; each following
// element is either a string of characters assigned to consecutive codes or
// a number N meaning "N more consecutive characters after the last one".
// Throws Error on an unpaired lead surrogate, an element of another type, or
// when the write position ends up beyond 0xFF.
DBCSCodec.prototype._addDecodeChunk = function(chunk) {
    var startAddr = parseInt(chunk[0], 16);

    // Node for the last byte of the address; entries are written from the
    // low byte of the address onwards.
    var table = this._getDecodeTrieNode(startAddr);
    var pos = startAddr & 0xFF;

    for (var partIdx = 1; partIdx < chunk.length; partIdx++) {
        var part = chunk[partIdx];
        var partType = typeof part;

        if (partType === "number") {
            // Continue counting up from the previously written entry.
            var nextCode = table[pos - 1] + 1;
            for (var n = 0; n < part; n++)
                table[pos++] = nextCode++;
            continue;
        }

        if (partType !== "string")
            throw new Error("Incorrect type '" + typeof part + "' given in " + this.encodingName + " at chunk " + chunk[0]);

        var p = 0;
        while (p < part.length) {
            var unit = part.charCodeAt(p++);

            if (0xD800 <= unit && unit < 0xDC00) {
                // Lead surrogate: combine with the following trail surrogate.
                var trail = part.charCodeAt(p++);
                if (!(0xDC00 <= trail && trail < 0xE000))
                    throw new Error("Incorrect surrogate pair in " + this.encodingName + " at chunk " + chunk[0]);
                table[pos++] = 0x10000 + (unit - 0xD800) * 0x400 + (trail - 0xDC00);
            }
            else if (0x0FF0 < unit && unit <= 0x0FFF) {
                // Marker for a character sequence: 0xFFF means 2 characters
                // follow, 0xFFE means 3, and so on.
                var seqLen = 0xFFF - unit + 2;
                var seq = [];
                while (seq.length < seqLen)
                    seq.push(part.charCodeAt(p++));

                table[pos++] = SEQ_START - this.decodeTableSeq.length;
                this.decodeTableSeq.push(seq);
            }
            else {
                // Plain character.
                table[pos++] = unit;
            }
        }
    }

    if (pos > 0xFF)
        throw new Error("Incorrect chunk in " + this.encodingName + " at addr " + chunk[0] + ": too long" + pos);
};
|
213 |
|
214 |
|
// Returns the 256-entry encode bucket that covers `uCode` (selected by
// uCode >> 8), creating it as a copy of UNASSIGNED_NODE on first use.
DBCSCodec.prototype._getEncodeBucket = function(uCode) {
    var bucketIdx = uCode >> 8;
    var bucket = this.encodeTable[bucketIdx];
    if (bucket === undefined) {
        bucket = UNASSIGNED_NODE.slice(0);
        this.encodeTable[bucketIdx] = bucket;
    }
    return bucket;
};
|
221 |
|
// Registers `dbcsCode` as the encoding of the single character `uCode`.
// An entry that already holds a code is left untouched; if the character
// starts a sequence, the code becomes that sequence node's fallback.
DBCSCodec.prototype._setEncodeChar = function(uCode, dbcsCode) {
    var bucket = this._getEncodeBucket(uCode);
    var slot = uCode & 0xFF;
    var current = bucket[slot];

    if (current <= SEQ_START) {
        // Sequence reference: store the code under DEF_CHAR in its node.
        var seqNode = this.encodeTableSeq[SEQ_START - current];
        seqNode[DEF_CHAR] = dbcsCode;
    }
    else if (current == UNASSIGNED) {
        bucket[slot] = dbcsCode;
    }
};
|
230 |
|
// Registers `dbcsCode` as the encoding of the character sequence `seq`
// (an array of character codes with at least two elements).
// The first character selects (or creates) a sequence node referenced from
// the encode bucket; the remaining characters form a path of nested nodes
// whose last key maps to the code.
DBCSCodec.prototype._setEncodeSequence = function(seq, dbcsCode) {
    // Get the root of the character tree according to the first character.
    var uCode = seq[0];
    var bucket = this._getEncodeBucket(uCode);
    var low = uCode & 0xFF;

    var node;
    if (bucket[low] <= SEQ_START) {
        // A sequence node for this first character already exists.
        node = this.encodeTableSeq[SEQ_START-bucket[low]];
    }
    else {
        // Create one; a code already assigned to the single character is
        // kept as the node's fallback.
        node = {};
        if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low];
        bucket[low] = SEQ_START - this.encodeTableSeq.length;
        this.encodeTableSeq.push(node);
    }

    // Walk the middle characters, allocating nested nodes as needed.
    for (var j = 1; j < seq.length-1; j++) {
        // Advance to the current character; without this every level would
        // be keyed by the first character of the sequence.
        uCode = seq[j];
        var oldVal = node[uCode];
        if (typeof oldVal === 'object')
            node = oldVal;
        else {
            node = node[uCode] = {};
            // A code stored for the shorter sequence becomes the fallback.
            if (oldVal !== undefined)
                node[DEF_CHAR] = oldVal;
        }
    }

    // Store the code under the last character. If a longer sequence already
    // hangs off this key, keep its subtree and record the code as fallback.
    uCode = seq[seq.length-1];
    if (typeof node[uCode] === 'object')
        node[uCode][DEF_CHAR] = dbcsCode;
    else
        node[uCode] = dbcsCode;
};
|
267 |
|
// Recursively walks decode-trie node `nodeIdx` and registers the reverse
// (character -> code) mappings. `prefix` is the code formed by the bytes
// leading to this node, already shifted left by one byte. Codes listed in
// `skipEncodeChars` are ignored. Returns true if anything was registered
// from this node or its descendants.
DBCSCodec.prototype._fillEncodeTable = function(nodeIdx, prefix, skipEncodeChars) {
    var trieNode = this.decodeTables[nodeIdx];
    var foundAny = false;
    // Child nodes already found to contribute nothing; not walked again.
    var knownEmptyChildren = {};

    for (var byteVal = 0; byteVal < 0x100; byteVal++) {
        var entry = trieNode[byteVal];
        var code = prefix + byteVal;

        if (skipEncodeChars[code])
            continue;

        if (entry >= 0) {
            // Single character.
            this._setEncodeChar(entry, code);
            foundAny = true;
            continue;
        }

        if (entry <= NODE_START) {
            // Link to a child node.
            var childIdx = NODE_START - entry;
            if (knownEmptyChildren[childIdx])
                continue;

            // ">>> 0" keeps the shifted prefix non-negative.
            var childPrefix = (code << 8) >>> 0;
            if (this._fillEncodeTable(childIdx, childPrefix, skipEncodeChars))
                foundAny = true;
            else
                knownEmptyChildren[childIdx] = true;
            continue;
        }

        if (entry <= SEQ_START) {
            // Sequence of characters.
            this._setEncodeSequence(this.decodeTableSeq[SEQ_START - entry], code);
            foundAny = true;
        }
    }

    return foundAny;
};
|
297 |
|
298 |
|
299 |
|
300 |
|
301 |
|
// Encoder object. `options` is not used; `codec` supplies the lookup tables.
function DBCSEncoder(options, codec) {
    // Lookup data taken from the codec.
    this.encodeTable = codec.encodeTable;
    this.encodeTableSeq = codec.encodeTableSeq;
    this.defaultCharSingleByte = codec.defCharSB;
    this.gb18030 = codec.gb18030;

    // State carried between write() calls.
    this.leadSurrogate = -1;   // -1 when no lead surrogate is pending.
    this.seqObj = undefined;   // Sequence node being matched, if any.
}
|
313 |
|
// Encodes `str` and returns the resulting bytes as a Buffer.
// A trailing lead surrogate or partially matched character sequence is kept
// in this.leadSurrogate / this.seqObj and completed by the next write() or
// flushed by end(). Characters without a mapping are written as
// this.defaultCharSingleByte.
DBCSEncoder.prototype.write = function(str) {
    // Every input unit yields at most one code of up to 3 bytes (4 with
    // GB18030). State carried over from the previous call (a pending lead
    // surrogate and/or a pending sequence) can yield up to two further codes,
    // so two extra slots are reserved; sizing by str.length alone would
    // silently drop the overflowing bytes.
    var maxBytesPerCode = this.gb18030 ? 4 : 3;
    var newBuf = Buffer.alloc((str.length + 2) * maxBytesPerCode),
        leadSurrogate = this.leadSurrogate,
        seqObj = this.seqObj, nextChar = -1,
        i = 0, j = 0,
        uCode;

    while (true) {
        // 0. Get the next character: a re-queued one, or the next input unit.
        if (nextChar === -1) {
            if (i == str.length) break;
            uCode = str.charCodeAt(i++);
        }
        else {
            uCode = nextChar;
            nextChar = -1;
        }

        // 1. Handle surrogates.
        if (0xD800 <= uCode && uCode < 0xE000) {
            if (uCode < 0xDC00) { // Lead surrogate.
                if (leadSurrogate === -1) {
                    leadSurrogate = uCode;
                    continue;
                } else {
                    // Two leads in a row: the earlier one is unpaired and is
                    // reported as unassigned; the new one stays pending.
                    leadSurrogate = uCode;
                    uCode = UNASSIGNED;
                }
            } else { // Trail surrogate.
                if (leadSurrogate !== -1) {
                    uCode = 0x10000 + (leadSurrogate - 0xD800) * 0x400 + (uCode - 0xDC00);
                    leadSurrogate = -1;
                } else {
                    // Trail surrogate without a lead.
                    uCode = UNASSIGNED;
                }
            }
        }
        else if (leadSurrogate !== -1) {
            // Lead surrogate followed by a non-surrogate: report the lead as
            // unassigned and process the current character again afterwards.
            nextChar = uCode; uCode = UNASSIGNED;
            leadSurrogate = -1;
        }

        // 2. Convert uCode to dbcsCode.
        var dbcsCode = UNASSIGNED;
        if (seqObj !== undefined && uCode != UNASSIGNED) { // Inside a sequence.
            var resCode = seqObj[uCode];
            if (typeof resCode === 'object') { // Sequence continues.
                seqObj = resCode;
                continue;

            } else if (typeof resCode == 'number') { // Sequence finished.
                dbcsCode = resCode;

            } else if (resCode == undefined) { // Character does not continue the sequence.
                // Emit the fallback code of the matched prefix, if there is
                // one, and process the current character again afterwards.
                resCode = seqObj[DEF_CHAR];
                if (resCode !== undefined) {
                    dbcsCode = resCode;
                    nextChar = uCode;

                } else {
                    // No fallback: dbcsCode stays UNASSIGNED and the default
                    // character is written below.
                }
            }
            seqObj = undefined;
        }
        else if (uCode >= 0) { // Regular character.
            var subtable = this.encodeTable[uCode >> 8];
            if (subtable !== undefined)
                dbcsCode = subtable[uCode & 0xFF];

            if (dbcsCode <= SEQ_START) { // Start of a sequence.
                seqObj = this.encodeTableSeq[SEQ_START-dbcsCode];
                continue;
            }

            if (dbcsCode == UNASSIGNED && this.gb18030) {
                // Use the GB18030 range tables to produce a 4-byte code.
                var idx = findIdx(this.gb18030.uChars, uCode);
                if (idx != -1) {
                    dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]);
                    newBuf[j++] = 0x81 + Math.floor(dbcsCode / 12600); dbcsCode = dbcsCode % 12600;
                    newBuf[j++] = 0x30 + Math.floor(dbcsCode / 1260); dbcsCode = dbcsCode % 1260;
                    newBuf[j++] = 0x81 + Math.floor(dbcsCode / 10); dbcsCode = dbcsCode % 10;
                    newBuf[j++] = 0x30 + dbcsCode;
                    continue;
                }
            }
        }

        // 3. Write dbcsCode, most significant byte first.
        if (dbcsCode === UNASSIGNED)
            dbcsCode = this.defaultCharSingleByte;

        if (dbcsCode < 0x100) {
            newBuf[j++] = dbcsCode;
        }
        else if (dbcsCode < 0x10000) {
            newBuf[j++] = dbcsCode >> 8;
            newBuf[j++] = dbcsCode & 0xFF;
        }
        else if (dbcsCode < 0x1000000) {
            newBuf[j++] = dbcsCode >> 16;
            newBuf[j++] = (dbcsCode >> 8) & 0xFF;
            newBuf[j++] = dbcsCode & 0xFF;
        } else {
            newBuf[j++] = dbcsCode >>> 24;
            newBuf[j++] = (dbcsCode >>> 16) & 0xFF;
            newBuf[j++] = (dbcsCode >>> 8) & 0xFF;
            newBuf[j++] = dbcsCode & 0xFF;
        }
    }

    // Save state for the next call.
    this.seqObj = seqObj;
    this.leadSurrogate = leadSurrogate;
    return newBuf.slice(0, j);
};
|
438 |
|
// Flushes state left over from write(). Returns undefined when nothing is
// pending; otherwise returns a Buffer holding the fallback code of a
// partially matched sequence (if it has one) followed by the default
// character for a pending lead surrogate.
DBCSEncoder.prototype.end = function() {
    if (this.leadSurrogate === -1 && this.seqObj === undefined)
        return; // Nothing pending.

    var newBuf = Buffer.alloc(10), j = 0;

    if (this.seqObj) { // In the middle of a sequence.
        var dbcsCode = this.seqObj[DEF_CHAR];
        if (dbcsCode !== undefined) {
            // Write the fallback code, most significant byte first, using
            // the same width rules as write() so that 3- and 4-byte codes
            // keep their leading bytes.
            if (dbcsCode < 0x100) {
                newBuf[j++] = dbcsCode;
            }
            else if (dbcsCode < 0x10000) {
                newBuf[j++] = dbcsCode >> 8;
                newBuf[j++] = dbcsCode & 0xFF;
            }
            else if (dbcsCode < 0x1000000) {
                newBuf[j++] = dbcsCode >> 16;
                newBuf[j++] = (dbcsCode >> 8) & 0xFF;
                newBuf[j++] = dbcsCode & 0xFF;
            }
            else {
                newBuf[j++] = dbcsCode >>> 24;
                newBuf[j++] = (dbcsCode >>> 16) & 0xFF;
                newBuf[j++] = (dbcsCode >>> 8) & 0xFF;
                newBuf[j++] = dbcsCode & 0xFF;
            }
        } else {
            // The matched prefix has no fallback code; nothing is written.
        }
        this.seqObj = undefined;
    }

    if (this.leadSurrogate !== -1) {
        // Unpaired lead surrogate at the end of input.
        newBuf[j++] = this.defaultCharSingleByte;
        this.leadSurrogate = -1;
    }

    return newBuf.slice(0, j);
};

// Make the module-level findIdx helper reachable through encoder instances.
DBCSEncoder.prototype.findIdx = findIdx;
|
472 |
|
473 |
|
474 |
|
475 |
|
// Decoder object. `options` is not used; `codec` supplies the lookup tables.
function DBCSDecoder(options, codec) {
    // Lookup data taken from the codec.
    this.decodeTables = codec.decodeTables;
    this.decodeTableSeq = codec.decodeTableSeq;
    this.defaultCharUnicode = codec.defaultCharUnicode;
    this.gb18030 = codec.gb18030;

    // State carried between write() calls.
    this.nodeIdx = 0;      // Current trie node; 0 is the root.
    this.prevBytes = [];   // Bytes of a code that has not been completed yet.
}
|
487 |
|
// Decodes the bytes in `buf` (a Buffer or array of byte values) and returns
// the resulting string. Bytes of a code that is not complete at the end of
// `buf` are kept in this.prevBytes (with the trie position in this.nodeIdx)
// and are continued by the next write() or flushed by end(). Byte sequences
// without a mapping produce this.defaultCharUnicode.
// Throws Error if a decode table contains a value that cannot be interpreted.
DBCSDecoder.prototype.write = function(buf) {
    var prevBytes = this.prevBytes,
        prevOffset = prevBytes.length,
        // Two output bytes per UTF-16 unit. Bytes carried over from the
        // previous call are counted too: when an invalid sequence spans the
        // chunk boundary they are re-parsed and produce output of their own,
        // which would otherwise overflow the buffer and be silently dropped.
        newBuf = Buffer.alloc((buf.length + prevOffset) * 2),
        nodeIdx = this.nodeIdx,
        seqStart = -prevOffset, // Index of the first byte of the code being parsed; negative indexes refer to prevBytes.
        uCode;

    for (var i = 0, j = 0; i < buf.length; i++) {
        var curByte = (i >= 0) ? buf[i] : prevBytes[i + prevOffset];

        // Look the byte up in the current trie node.
        uCode = this.decodeTables[nodeIdx][curByte];

        if (uCode >= 0) {
            // Normal character, use it as is.
        }
        else if (uCode === UNASSIGNED) { // Unknown byte sequence.
            uCode = this.defaultCharUnicode.charCodeAt(0);
            // Skip one byte and parse again from the byte after the start of
            // the failed code ('i' is incremented by the for loop).
            i = seqStart;
        }
        else if (uCode === GB18030_CODE) {
            // Compute the linear pointer from the four bytes of the code;
            // earlier bytes may live in prevBytes.
            var ptr;
            if (i >= 3) {
                ptr = (buf[i-3]-0x81)*12600 + (buf[i-2]-0x30)*1260 + (buf[i-1]-0x81)*10 + (curByte-0x30);
            } else {
                ptr = (prevBytes[i-3+prevOffset]-0x81)*12600 +
                      (((i-2 >= 0) ? buf[i-2] : prevBytes[i-2+prevOffset])-0x30)*1260 +
                      (((i-1 >= 0) ? buf[i-1] : prevBytes[i-1+prevOffset])-0x81)*10 +
                      (curByte-0x30);
            }
            var idx = findIdx(this.gb18030.gbChars, ptr);
            uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx];
        }
        else if (uCode <= NODE_START) { // Go to the next trie node.
            nodeIdx = NODE_START - uCode;
            continue;
        }
        else if (uCode <= SEQ_START) { // Output a sequence of characters.
            var seq = this.decodeTableSeq[SEQ_START - uCode];
            for (var k = 0; k < seq.length - 1; k++) {
                uCode = seq[k];
                newBuf[j++] = uCode & 0xFF;
                newBuf[j++] = uCode >> 8;
            }
            // The last character is written by the common code below.
            uCode = seq[seq.length-1];
        }
        else
            throw new Error("iconv-lite internal error: invalid decoding table value " + uCode + " at " + nodeIdx + "/" + curByte);

        // Write the character as little-endian UTF-16, using a surrogate
        // pair for code points above 0xFFFF.
        if (uCode >= 0x10000) {
            uCode -= 0x10000;
            var uCodeLead = 0xD800 | (uCode >> 10);
            newBuf[j++] = uCodeLead & 0xFF;
            newBuf[j++] = uCodeLead >> 8;

            uCode = 0xDC00 | (uCode & 0x3FF);
        }
        newBuf[j++] = uCode & 0xFF;
        newBuf[j++] = uCode >> 8;

        // Back to the root; the next code starts at the following byte.
        nodeIdx = 0; seqStart = i+1;
    }

    // Save the trie position and the bytes of the unfinished code.
    this.nodeIdx = nodeIdx;
    this.prevBytes = (seqStart >= 0)
        ? Array.prototype.slice.call(buf, seqStart)
        : prevBytes.slice(seqStart + prevOffset).concat(Array.prototype.slice.call(buf));

    return newBuf.slice(0, j).toString('ucs2');
};
|
560 |
|
// Flushes bytes left over from write() and returns the resulting string.
// While bytes remain, the first one is replaced by this.defaultCharUnicode
// and the rest are parsed again through write(), which may leave bytes
// pending once more. State is reset before returning.
DBCSDecoder.prototype.end = function() {
    var output = '';
    var pending = this.prevBytes;

    while (pending.length > 0) {
        // Give up on the first pending byte.
        output += this.defaultCharUnicode;
        var remaining = pending.slice(1);

        // Re-parse the remaining bytes from a clean state.
        this.prevBytes = [];
        this.nodeIdx = 0;
        if (remaining.length > 0)
            output += this.write(remaining);

        pending = this.prevBytes;
    }

    this.prevBytes = [];
    this.nodeIdx = 0;
    return output;
};
|
581 |
|
582 |
|
// Binary search in the ascending array `table`: returns the index of the
// last element that is <= val, or -1 if there is no such element (the table
// is empty or its first element is already greater than val).
function findIdx(table, val) {
    // An empty table has no valid index; without this check 0 was returned.
    if (table.length === 0 || table[0] > val)
        return -1;

    // Invariant: table[l] <= val, and r is past the end or table[r] > val.
    var l = 0, r = table.length;
    while (l < r-1) {
        var mid = l + ((r-l+1) >> 1);
        if (table[mid] <= val)
            l = mid;
        else
            r = mid;
    }
    return l;
}
|
597 |
|