"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.init = exports.DictTokenizer = exports.DEFAULT_MAX_CHUNK_COUNT_MIN = exports.DEFAULT_MAX_CHUNK_COUNT = void 0;
const mod_1 = require("../mod");
const index_1 = require("../util/index");
const CHS_NAMES_1 = require("../mod/CHS_NAMES");
const const_1 = require("../mod/const");
/**
 * Initial value of `DictTokenizer#MAX_CHUNK_COUNT`: the recursion budget
 * `getChunks` uses before it stops enumerating alternatives.
 */
exports.DEFAULT_MAX_CHUNK_COUNT = 40;
/**
 * Initial value of `DictTokenizer#DEFAULT_MAX_CHUNK_COUNT_MIN`: the floor
 * below which the shrinking chunk budget is never lowered. It is also the
 * threshold the `maxChunkCount` / `minChunkCount` options must exceed to be
 * accepted by `DictTokenizer#_cache`.
 */
exports.DEFAULT_MAX_CHUNK_COUNT_MIN = 30;
/**
 * Returns true when `key` is an own property of `dict`.
 *
 * The dictionaries in this module are used as string-keyed lookup tables.
 * The `in` operator also walks the prototype chain, so on an ordinary object
 * a word such as "constructor" or "toString" would be reported as present.
 * Still throws a TypeError for a null/undefined table, like `in` does.
 *
 * @param {object} dict lookup table
 * @param {string} key  word to look up
 * @return {boolean}
 */
function hasOwn(dict, key) {
    return Object.prototype.hasOwnProperty.call(dict, key);
}
/**
 * Matches text that starts with one unit repeated six or more times
 * (e.g. "啊啊啊啊啊啊…"). Group 1 is the whole run, group 2 the repeated unit.
 */
const REPEATED_RUN = /^((.+)\2{5,})/;
/**
 * Dictionary-based word recognition module.
 *
 * @author Lao Lei (老雷) <leizongmin@gmail.com>
 */
class DictTokenizer extends mod_1.SubSModuleTokenizer {
    constructor(...args) {
        super(...args);
        /**
         * Guards against unsegmented text taking too long to analyse or
         * exceeding what the process can handle. Higher values are more
         * accurate, but processing time grows multiplicatively and may exceed
         * available memory; smaller values are faster.
         *
         * FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory
         *
         * @type {number}
         */
        this.MAX_CHUNK_COUNT = exports.DEFAULT_MAX_CHUNK_COUNT;
        /**
         * Floor for the decreasing chunk budget: `getChunks` lowers its
         * working MAX_CHUNK_COUNT step by step on long unsegmented text, but
         * never below this value.
         *
         * @type {number}
         */
        this.DEFAULT_MAX_CHUNK_COUNT_MIN = exports.DEFAULT_MAX_CHUNK_COUNT_MIN;
    }
    /**
     * Caches the dictionaries and POS-tag table from `this.segment` and
     * applies the `maxChunkCount` / `minChunkCount` options.
     *
     * Either option is honoured only when it is a number strictly greater
     * than the module-level DEFAULT_MAX_CHUNK_COUNT_MIN.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('TABLE');
        this._TABLE2 = this.segment.getDict('TABLE2');
        this._POSTAG = this.segment.POSTAG;
        const { maxChunkCount, minChunkCount } = this.segment.options;
        if (typeof maxChunkCount === 'number' && maxChunkCount > exports.DEFAULT_MAX_CHUNK_COUNT_MIN) {
            this.MAX_CHUNK_COUNT = maxChunkCount;
        }
        if (typeof minChunkCount === 'number' && minChunkCount > exports.DEFAULT_MAX_CHUNK_COUNT_MIN) {
            this.DEFAULT_MAX_CHUNK_COUNT_MIN = minChunkCount;
        }
    }
    /**
     * Segments the tokens that have not been recognised yet.
     *
     * Tokens with `p > 0` are passed through unchanged. Every other token is
     * matched against the dictionary and replaced by the recognised words,
     * with the unmatched text between and after them emitted as separate
     * tokens. Iteration stops at the first falsy entry of `words`.
     *
     * @param {array} words token array
     * @return {array} new token array
     */
    split(words) {
        const TABLE = this._TABLE;
        const ret = [];
        for (let i = 0; i < words.length; i++) {
            const word = words[i];
            if (!word) {
                break;
            }
            if (word.p > 0) {
                ret.push(word);
                continue;
            }
            // Only unrecognised tokens are matched against the dictionary.
            const wordinfo = this.matchWord(word.w, 0, words[i - 1]);
            if (wordinfo.length < 1) {
                ret.push(word);
                continue;
            }
            // Emit the recognised words, plus the raw text in the gaps.
            let lastc = 0;
            for (const bw of wordinfo) {
                if (bw.c > lastc) {
                    // NOTE(review): this gap token is a bare object while the
                    // trailing gap below goes through createRawToken; kept
                    // as-is because createRawToken is defined elsewhere.
                    ret.push({
                        w: word.w.slice(lastc, bw.c),
                    });
                }
                ret.push(this.createRawToken({
                    w: bw.w,
                    f: bw.f,
                }, TABLE[bw.w]));
                lastc = bw.c + bw.w.length;
            }
            // `lastc` now points just past the last recognised word.
            if (lastc < word.w.length) {
                ret.push(this.createRawToken({
                    w: word.w.slice(lastc),
                }));
            }
        }
        return ret;
    }
    // =================================================================
    /**
     * Collects every dictionary word occurring in `text` from `cur` onwards
     * and lets `filterWord` pick the best combination.
     *
     * The keys of TABLE2 are used as candidate word lengths; each bucket is
     * probed with the substring of that length at every position.
     *
     * @param {string} text text to scan
     * @param {number} cur start offset; values for which isNaN() is true fall back to 0
     * @param {object} preword previous token
     * @return {array} entries of the form {w: 'word', c: start offset, f: frequency}
     */
    matchWord(text, cur, preword) {
        const TABLE2 = this._TABLE2;
        // Own enumerable keys only; converted once instead of per position.
        const lengthKeys = Object.keys(TABLE2);
        const ret = [];
        let start = isNaN(cur) ? 0 : Number(cur);
        // Find every word that could occur.
        while (start < text.length) {
            for (const key of lengthKeys) {
                const bucket = TABLE2[key];
                const w = text.slice(start, start + Number(key));
                // Own-property check so that prototype members such as
                // "constructor" are never taken for dictionary entries.
                if (hasOwn(bucket, w)) {
                    ret.push({
                        w: w,
                        c: start,
                        f: bucket[w].f,
                    });
                }
            }
            start++;
        }
        return this.filterWord(ret, preword, text);
    }
    /**
     * Chooses the most plausible sequence of words.
     *
     * Uses an MMSG-like algorithm: every candidate segmentation ("chunk")
     * returned by `getChunks` is rated on
     *   x - number of words (fewer is better);
     *   a - average word frequency (higher is better);
     *   b - mean squared deviation of word length from the average word
     *       length (lower is better);
     *   c - number of unrecognised words (fewer is better);
     *   d - grammar score, e.g. numeral followed by measure word earns a
     *       bonus; a chunk without any verb is penalised;
     * and the chunk ranked best by `getTops` wins.
     *
     * Side effect: `p` is set on chunk words found in TABLE.
     *
     * @param {array} words candidate words from matchWord
     * @param {object} preword previous token
     * @param {string} text text of the section being segmented
     * @return {array} the winning chunk without its unrecognised words;
     *                 empty when no chunk could be built
     */
    filterWord(words, preword, text) {
        const TABLE = this._TABLE;
        const POSTAG = this._POSTAG;
        // Group the words by start position, then enumerate segmentations.
        const wordpos = this.getPosInfo(words, text);
        const chunks = this.getChunks(wordpos, 0, text);
        if (chunks.length === 0) {
            // Nothing to rank (e.g. empty text); getTops needs >= 1 entry.
            return [];
        }
        const assess = []; // rating table, one entry per chunk
        for (let i = 0; i < chunks.length; i++) {
            const chunk = chunks[i];
            const score = {
                x: chunk.length,
                a: 0,
                b: 0,
                c: 0,
                d: 0,
                index: i,
            };
            assess[i] = score;
            // Average word length.
            const sp = text.length / chunk.length;
            // Whether the chunk contains a verb.
            let hasVerb = false;
            // Previous word; seeded from the token preceding this text.
            let prew = preword ? this.createRawToken(preword) : null;
            for (let j = 0; j < chunk.length; j++) {
                const w = chunk[j];
                if (hasOwn(TABLE, w.w)) {
                    w.p = TABLE[w.w].p;
                    score.a += w.f; // total frequency
                    if (j === 0 && !preword && (w.p & POSTAG.D_V)) {
                        // The very first word also counts as "contains a verb".
                        hasVerb = true;
                    }
                    // ================ grammar checks ===================
                    if (prew) {
                        // numeral + measure word (unit) / date-time word
                        if ((prew.p & POSTAG.A_M)
                            && ((w.p & POSTAG.A_Q) || hasOwn(const_1.DATETIME, w.w))) {
                            score.d++;
                        }
                        // current word is a verb
                        if (w.p & POSTAG.D_V) {
                            hasVerb = true;
                            // adverb + verb
                            if (prew.p & POSTAG.D_D) {
                                score.d++;
                            }
                        }
                        // place name, organisation name or adjective followed
                        // by noun, person, place, other proper noun or organisation
                        if (((prew.p & POSTAG.A_NS)
                            || (prew.p & POSTAG.A_NT)
                            || (prew.p & POSTAG.D_A))
                            && ((w.p & POSTAG.D_N)
                                || (w.p & POSTAG.A_NR)
                                || (w.p & POSTAG.A_NS)
                                || (w.p & POSTAG.A_NZ)
                                || (w.p & POSTAG.A_NT))) {
                            score.d++;
                        }
                        // locality word + numeral / numeral-measure word
                        if ((prew.p & POSTAG.D_F)
                            && ((w.p & POSTAG.A_M) || (w.p & POSTAG.D_MQ))) {
                            score.d++;
                        }
                        // family name + noun / other proper noun
                        if ((hasOwn(CHS_NAMES_1.FAMILY_NAME_1, prew.w)
                            || hasOwn(CHS_NAMES_1.FAMILY_NAME_2, prew.w))
                            && ((w.p & POSTAG.D_N) || (w.p & POSTAG.A_NZ))) {
                            score.d++;
                        }
                        // place name / location + locality word
                        if (index_1.hexAndAny(prew.p, POSTAG.D_S, POSTAG.A_NS) && index_1.hexAndAny(w.p, POSTAG.D_F)) {
                            score.d += 0.5;
                        }
                        // Look ahead at the next word.
                        const nextw = chunk[j + 1];
                        if (nextw) {
                            if (hasOwn(TABLE, nextw.w)) {
                                nextw.p = TABLE[nextw.w].p;
                            }
                            // Cleared once one of the mutually exclusive
                            // bonuses below has been granted.
                            let pending = true;
                            // "的" / "之" followed by a noun-like word or verb
                            if ((w.w === '的' || w.w === '之')
                                && nextw.p && ((nextw.p & POSTAG.D_N)
                                || (nextw.p & POSTAG.D_V)
                                || (nextw.p & POSTAG.A_NR)
                                || (nextw.p & POSTAG.A_NS)
                                || (nextw.p & POSTAG.A_NZ)
                                || (nextw.p & POSTAG.A_NT))) {
                                score.d += 1.5;
                                pending = false;
                            }
                            // conjunction whose neighbours share a part of speech
                            else if (prew.p && (w.p & POSTAG.D_C)) {
                                const shared = prew.p & nextw.p;
                                if (prew.p === nextw.p) {
                                    score.d++;
                                    pending = false;
                                }
                                else if (shared) {
                                    score.d += 0.25;
                                    pending = false;
                                    if (shared & POSTAG.D_N) {
                                        score.d += 0.75;
                                    }
                                }
                            }
                            // pronoun + preposition
                            // (example: 在感動的重逢中有余在的話就太過閃耀)
                            if (pending && (w.p & POSTAG.D_R) && (nextw.p & POSTAG.D_P)) {
                                score.d += 1;
                                pending = false;
                            }
                            // preposition + person name longer than one character
                            if (pending && nextw.p && (w.p & POSTAG.D_P)) {
                                if ((nextw.p & POSTAG.A_NR) && (nextw.w.length > 1)) {
                                    score.d++;
                                    if (prew.w === '的') {
                                        // "的" + preposition + person name
                                        score.d += 1;
                                        pending = false;
                                    }
                                }
                            }
                            // noun + preposition + noun/verb, or
                            // pronoun + preposition + pronoun
                            if (pending && (w.p & POSTAG.D_P) && index_1.hexAndAny(prew.p, POSTAG.D_N) && index_1.hexAndAny(nextw.p, POSTAG.D_N, POSTAG.D_V)) {
                                score.d++;
                                pending = false;
                            }
                            else if (pending && (w.p & POSTAG.D_P) && index_1.hexAndAny(prew.p, POSTAG.D_R) && index_1.hexAndAny(nextw.p, POSTAG.D_R)) {
                                score.d += 0.5;
                                pending = false;
                            }
                            // FIXME: brute-force handling of "三天后"
                            if (nextw.w === '后' && (w.p & POSTAG.D_T) && index_1.hexAndAny(prew.p, POSTAG.D_MQ, POSTAG.A_M)) {
                                score.d++;
                            }
                            // FIXME: 到湖中間后手終於能休息了
                            else if ((nextw.w === '后' || nextw.w === '後')
                                && index_1.hexAndAny(w.p, POSTAG.D_F)) {
                                score.d++;
                            }
                            if ((w.w === '后' || w.w === '後')
                                && index_1.hexAndAny(prew.p, POSTAG.D_F)
                                && index_1.hexAndAny(nextw.p, POSTAG.D_N)) {
                                score.d++;
                            }
                        }
                        else if ((w.p & POSTAG.D_F) && index_1.hexAndAny(prew.p, POSTAG.D_N)) {
                            // Last word of the chunk: noun + locality word
                            // (example: 她把荷包蛋摆在像是印度烤饼的面包上)
                            score.d += 1;
                        }
                    }
                    // ===========================================
                }
                else {
                    // number of unrecognised words
                    score.c++;
                }
                // squared deviation from the average word length
                score.b += Math.pow(sp - w.w.length, 2);
                prew = w;
            }
            // Penalise chunks that contain no verb at all.
            if (hasVerb === false) {
                score.d -= 0.5;
            }
            score.a = score.a / chunk.length;
            score.b = score.b / chunk.length;
        }
        // Rank the chunks and drop the winner's unrecognised words.
        const top = this.getTops(assess);
        return chunks[top].filter((word) => hasOwn(TABLE, word.w));
    }
    /**
     * Ranks the rated chunks and returns the index of the best one.
     *
     * Baselines are taken over all entries (max `a`, min `b`, max `c`,
     * min `d`, max `x`); each entry then scores
     *   (top.x - x) * 1.5 + [a >= top.a] + [b <= top.b] + (top.c - c)
     *   + (d < 0 ? top.d + d : d - top.d).
     * On equal scores a later entry replaces the current best only when it
     * wins more of the comparisons "fewer unrecognised words", "higher
     * average frequency" and "fewer words" than it loses.
     *
     * Side effect: stores the computed score on each entry as `score`.
     *
     * @param {array} assess non-empty array of {x, a, b, c, d} ratings
     * @return {number} index into `assess` of the best entry
     */
    getTops(assess) {
        const first = assess[0];
        // Baseline of every criterion.
        const top = {
            x: first.x,
            a: first.a,
            b: first.b,
            c: first.c,
            d: first.d,
        };
        for (let i = 1; i < assess.length; i++) {
            const ass = assess[i];
            if (ass.a > top.a) {
                top.a = ass.a; // highest average frequency
            }
            if (ass.b < top.b) {
                top.b = ass.b; // lowest length deviation
            }
            if (ass.c > top.c) {
                top.c = ass.c; // highest unrecognised-word count
            }
            if (ass.d < top.d) {
                top.d = ass.d; // lowest grammar score
            }
            if (ass.x > top.x) {
                top.x = ass.x; // highest word count
            }
        }
        // Score every entry against the baselines.
        const tops = [];
        for (let i = 0; i < assess.length; i++) {
            const ass = assess[i];
            let score = 0;
            // word count: fewer is better
            score += (top.x - ass.x) * 1.5;
            // average frequency: higher is better
            if (ass.a >= top.a) {
                score += 1;
            }
            // length deviation: lower is better
            if (ass.b <= top.b) {
                score += 1;
            }
            // unrecognised words: fewer is better
            score += (top.c - ass.c);
            // grammar fit: higher is better
            score += (ass.d < 0 ? top.d + ass.d : ass.d - top.d) * 1;
            tops[i] = score;
            ass.score = score;
        }
        // Pick the highest score. A numeric index loop keeps the returned
        // index a number (for...in would yield string keys).
        let curri = 0;
        let maxs = tops[0];
        for (let i = 0; i < tops.length; i++) {
            const s = tops[i];
            if (s > maxs) {
                curri = i;
                maxs = s;
            }
            else if (s === maxs) {
                // Tie: decide on unrecognised-word count, average frequency
                // and word count; keep the current best if still undecided.
                const candidate = assess[i];
                const best = assess[curri];
                let wins = 0;
                let losses = 0;
                if (candidate.c < best.c) {
                    wins++;
                }
                else if (candidate.c !== best.c) {
                    losses++;
                }
                if (candidate.a > best.a) {
                    wins++;
                }
                else if (candidate.a !== best.a) {
                    losses++;
                }
                if (candidate.x < best.x) {
                    wins++;
                }
                else if (candidate.x !== best.x) {
                    losses++;
                }
                if (wins > losses) {
                    curri = i;
                    maxs = s;
                }
            }
        }
        return curri;
    }
    /**
     * Groups the candidate words by their start position.
     *
     * Positions of `text` at which no candidate starts are filled with a
     * single-character placeholder word of frequency 0.
     *
     * @param {array} words candidate words ({w, c, f})
     * @param {string} text text being segmented
     * @return {object} map from start offset to the words starting there
     */
    getPosInfo(words, text) {
        const wordpos = {};
        // Group the words by start position.
        for (const word of words) {
            if (!wordpos[word.c]) {
                wordpos[word.c] = [];
            }
            wordpos[word.c].push(word);
        }
        // Fill the gaps with single characters of the text.
        for (let i = 0; i < text.length; i++) {
            if (!wordpos[i]) {
                wordpos[i] = [{ w: text.charAt(i), c: i, f: 0 }];
            }
        }
        return wordpos;
    }
    /**
     * Enumerates the candidate segmentations ("chunks") starting at `pos`.
     *
     * To keep long unsegmented text tractable the recursion has a budget:
     * the top-level call starts from `this.MAX_CHUNK_COUNT`; nested calls
     * lower it by one per level, never below the instance or module minimum.
     * Once the recursion depth exceeds the budget the remainder is no longer
     * branched; the first word at each following position is taken instead.
     *
     * @param {{[p: number]: Segment.IWord[]}} wordpos words grouped by start offset
     * @param {number} pos current offset
     * @param {string} text remaining text, i.e. the text from `pos` onwards
     * @param {number} total_count recursion depth so far (0 for the initial call)
     * @param {number} MAX_CHUNK_COUNT budget handed down by the calling level
     * @returns {Segment.IWord[][]}
     */
    getChunks(wordpos, pos, text, total_count = 0, MAX_CHUNK_COUNT) {
        let budget = MAX_CHUNK_COUNT;
        if (total_count === 0) {
            budget = this.MAX_CHUNK_COUNT;
            // The budget only shrinks when the text is at least as long as
            // it: bumping it past this.MAX_CHUNK_COUNT disables the
            // decrement below for all nested calls.
            if (text.length < budget) {
                budget += 1;
            }
        }
        else if (budget <= this.MAX_CHUNK_COUNT) {
            budget = Math.max(budget - 1, this.DEFAULT_MAX_CHUNK_COUNT_MIN, exports.DEFAULT_MAX_CHUNK_COUNT_MIN);
        }
        // Collapse a leading run of one repeated unit into a single word
        // instead of branching on every repetition,
        // e.g. 啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊
        const m = text.match(REPEATED_RUN);
        if (m) {
            const run = m[1];
            const rest = text.slice(run.length);
            const word = {
                w: run,
                c: pos,
                f: 0,
            };
            if (rest === '') {
                return [[word]];
            }
            // The depth is deliberately not increased for this step.
            return this
                .getChunks(wordpos, pos + run.length, rest, total_count, budget)
                .map((ws) => [word].concat(ws));
        }
        const depth = total_count + 1;
        const words = wordpos[pos] || [];
        const ret = [];
        for (const word of words) {
            const nextcur = word.c + word.w.length;
            if (!wordpos[nextcur]) {
                // Nothing starts after this word: the chunk ends here.
                ret.push([word]);
            }
            else if (depth > budget) {
                // Budget exhausted: follow only the first word at each
                // position instead of exploring every alternative.
                const tail = [word];
                let j = nextcur;
                while (j in wordpos) {
                    const next = wordpos[j][0];
                    if (!next) {
                        break;
                    }
                    tail.push(next);
                    j += next.w.length;
                }
                ret.push(tail);
            }
            else {
                const rest = text.slice(word.w.length);
                for (const ws of this.getChunks(wordpos, nextcur, rest, depth, budget)) {
                    ret.push([word].concat(ws));
                }
            }
        }
        return ret;
    }
}
// Public surface of the module: the tokenizer class (as a named and as the
// default export) and `init`, the class's static `init` bound to the class
// so that it can be called detached.
exports.DictTokenizer = DictTokenizer;
const { init } = DictTokenizer;
exports.init = init.bind(DictTokenizer);
exports.default = DictTokenizer;
643 | //# sourceMappingURL=data:application/json;base64,{"version":3,"file":"DictTokenizer.js","sourceRoot":"","sources":["DictTokenizer.ts"],"names":[],"mappings":"AAAA,YAAY,CAAC;;;AAEb,gCAA8E;AAI9E,yCAAiD;AACjD,gDAAsH;AAGtH,wCAAwC;AAG3B,QAAA,uBAAuB,GAAG,EAAE,CAAC;AAC7B,QAAA,2BAA2B,GAAG,EAAE,CAAC;AAE9C;;;;GAIG;AACH,MAAa,aAAc,SAAQ,yBAAmB;IAAtD;;QAGC;;;;;;;;;WASG;QACH,oBAAe,GAAG,+BAAuB,CAAC;QAC1C;;;WAGG;QACH,gCAA2B,GAAG,mCAA2B,CAAC;IAs0B3D,CAAC;IAj0BA,MAAM;QAEL,KAAK,CAAC,MAAM,EAAE,CAAC;QACf,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAC5C,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QAC9C,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC;QAEnC,IAAI,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,aAAa,IAAI,QAAQ,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,aAAa,GAAG,mCAA2B,EAC7H;YACC,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC;SAC1D;QAED,IAAI,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,aAAa,IAAI,QAAQ,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,aAAa,GAAG,mCAA2B,EAC7H;YACC,IAAI,CAAC,2BAA2B,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC;SACtE;IACF,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,KAAc;QAEnB,eAAe;QACf,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC;QAC1B,8BAA8B;QAE9B,MAAM,IAAI,GAAG,IAAI,CAAC;QAElB,IAAI,GAAG,GAAY,EAAE,CAAC;QACtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,EAAE,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAC1C;YACC,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,EACd;gBACC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACf,SAAS;aACT;YAED,cAAc;YACd,IAAI,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC,EAAE,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YACvD,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EACvB;gBACC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACf,SAAS;aACT;YAED,YAAY;YACZ,IAAI,KAAK,GAAG,CAAC,CAAC;YAEd,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,EAAE;gBAEhC,IAAI,EAAE,CAAC,CAAC,GAAG,KAAK,EAChB;oBACC,GAAG,CAAC,IAAI,CAAC;wBACR,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC;qBACrC,CAAC,CAAC;iBACH;gBAED,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAC;oBAC5B,CAAC,EAAE,EAAE,CAAC,CAAC;oBACP,CAAC,EA
AE,EAAE,CAAC,CAAC;iBACP,EAAE,KAAK,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;gBAEhB,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBAEb;;;;;;;kBAOE;gBACF,KAAK,GAAG,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;YAC5B,CAAC,CAAC,CAAC;YAEH,IAAI,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YAC7C,IAAI,QAAQ,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,MAAM,EAClD;gBACC,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAC;oBAC5B,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC;iBAChD,CAAC,CAAC;gBAEH,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;aACb;SACD;QAED,KAAK,GAAG,SAAS,CAAC;QAElB,OAAO,GAAG,CAAC;IACZ,CAAC;IAED,oEAAoE;IAEpE;;;;;;;OAOG;IACO,SAAS,CAAC,IAAY,EAAE,GAAW,EAAE,OAAc;QAE5D,IAAI,KAAK,CAAC,GAAG,CAAC;YAAE,GAAG,GAAG,CAAC,CAAC;QACxB,IAAI,GAAG,GAAY,EAAE,CAAC;QACtB,IAAI,CAAC,GAAG,KAAK,CAAC;QAEd,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC;QAE5B,YAAY;QACZ,OAAO,GAAG,GAAG,IAAI,CAAC,MAAM,EACxB;YACC,KAAK,IAAI,CAAC,IAAI,MAAM,EACpB;gBACC,IAAI,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,CAAkB,CAAC,CAAC;gBAC7C,IAAI,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,EAClB;oBACC,GAAG,CAAC,IAAI,CAAC;wBACR,CAAC,EAAE,CAAC;wBACJ,CAAC,EAAE,GAAG;wBACN,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;qBACjB,CAAC,CAAC;iBACH;aACD;YACD,GAAG,EAAE,CAAC;SACN;QAED,OAAO,IAAI,CAAC,UAAU,CAAC,GAAG,EAAE,OAAO,EAAE,IAAI,CAAC,CAAC;IAC5C,CAAC;IAED;;;;;;;OAOG;IACO,UAAU,CAAC,KAAc,EAAE,OAAc,EAAE,IAAY;QAEhE,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC;QAC1B,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC;QAC5B,IAAI,GAAG,GAAY,EAAE,CAAC;QAEtB,WAAW;QACX,IAAI,OAAO,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAC3C,iBAAiB;QAEjB;;;;;;;;;WASG;QACH,IAAI,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,CAAC,EAAE,IAAI,CAAC,CAAC;QAC9C,gBAAgB;QAChB,IAAI,MAAM,GAAsB,EAAE,CAAC,CAAE,MAAM;QAE3C,sBAAsB;QAEtB,YAAY;QACZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,KAAc,EAAE,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EACtD;YACC,MAAM,CAAC,CAAC,CAAC,GAAG;gBACX,CAAC,EAAE,KAAK,CAAC,MAAM;gBACf,CAAC,EAAE,CAAC;gBACJ,CAAC,EAAE,CAAC;gBACJ,CAAC,EAAE,CAAC;gBACJ,CAAC,EAAE,CAAC;gBAEJ
,KAAK,EAAE,CAAC;aACR,CAAC;YACF,QAAQ;YACR,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;YACpC,cAAc;YACd,IAAI,OAAO,GAAG,KAAK,CAAC,CAAE,SAAS;YAE/B,QAAQ;YACR,IAAI,IAAW,CAAC;YAEhB,IAAI,OAAO,EACX;gBACC;;;;;;;kBAOE;gBAEF,IAAI,GAAG,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC;aAEpC;iBAED;gBACC,IAAI,GAAG,IAAI,CAAC;aACZ;YACD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAQ,EAAE,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAC3C;gBACC,IAAI,CAAC,CAAC,CAAC,IAAI,KAAK,EAChB;oBACC,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;oBACnB,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAG,MAAM;oBAE5B,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC,EAC7C;wBACC;;2BAEG;wBACH,OAAO,GAAG,IAAI,CAAC;qBACf;oBAED,8CAA8C;oBAC9C,IAAI,IAAI,EACR;wBACC,2BAA2B;wBAC3B,IACC,CAAC,IAAI,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC;;gCAErB,CACC,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC;uCACf,CAAC,CAAC,CAAC,IAAI,gBAAQ,CAClB,EAEF;4BACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;yBACd;wBAED,WAAW;wBACX,IAAI,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,EACpB;4BACC,OAAO,GAAG,IAAI,CAAC;4BACf,iBAAiB;4BACjB,gCAAgC;4BAChC,gBAAgB;4BAEhB;;;;;;8BAME;4BAEF,kBAAkB;4BAClB,IAAI,IAAI,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,EACvB;gCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;6BACd;yBACD;wBACD,qCAAqC;wBACrC,IAAI,CACF,CAAC,IAAI,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC;+BACnB,CAAC,IAAI,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC;+BACtB,CAAC,IAAI,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC,CACxB;4BACD,CACC,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC;mCACf,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC;mCACnB,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC;mCACnB,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC;mCACnB,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,CACtB,EACF;4BACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;yBACd;wBACD,oBAAoB;wBACpB,IACC,CAAC,IAAI,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC;;gCAErB,CACC,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC;uCACf,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,CACtB,EACF;4BACC,iBAAiB;4BACjB,MAAM,CAAC,CAAC
,CAAC,CAAC,CAAC,EAAE,CAAC;yBACd;wBACD,iBAAiB;wBACjB,IACC,CACC,IAAI,CAAC,CAAC,IAAI,yBAAa;+BACpB,IAAI,CAAC,CAAC,IAAI,yBAAa,CAC1B;4BACD,CACC,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC;mCACf,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,CACtB,EACF;4BACC,iBAAiB;4BACjB,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;yBACd;wBAED;;2BAEG;wBACH,IAAI,iBAAS,CAAC,IAAI,CAAC,CAAC,EACjB,MAAM,CAAC,GAAG,EACV,MAAM,CAAC,IAAI,CACb,IAAI,iBAAS,CAAC,CAAC,CAAC,CAAC,EACf,MAAM,CAAC,GAAG,CACZ,EACD;4BACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC;yBACnB;wBAED,SAAS;wBACT,IAAI,KAAK,GAAG,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;wBACzB,IAAI,KAAK,EACT;4BACC,IAAI,KAAK,CAAC,CAAC,IAAI,KAAK,EACpB;gCACC,KAAK,CAAC,CAAC,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;6BAC3B;4BAED,IAAI,QAAQ,GAAY,IAAI,CAAC;4BAE7B;;+BAEG;4BACH,IACC,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC;mCACzB,KAAK,CAAC,CAAC,IAAI,CACb,CAAC,KAAK,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC;mCACnB,CAAC,KAAK,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC;mCACtB,CAAC,KAAK,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC;mCACvB,CAAC,KAAK,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC;mCACvB,CAAC,KAAK,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC;mCACvB,CAAC,KAAK,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,CAC1B,EACF;gCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC;gCACnB,QAAQ,GAAG,KAAK,CAAC;6BACjB;4BACD;;+BAEG;iCACE,IAAI,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC,EACrC;gCACC,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC;gCAEzB,IAAI,IAAI,CAAC,CAAC,KAAK,KAAK,CAAC,CAAC,EACtB;oCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;oCACd,QAAQ,GAAG,KAAK,CAAC;iCACjB;qCACI,IAAI,CAAC,EACV;oCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;oCACpB,QAAQ,GAAG,KAAK,CAAC;oCAEjB,IAAI,CAAC,GAAG,MAAM,CAAC,GAAG,EAClB;wCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;qCACpB;iCACD;6BACD;4BAED;;+BAEG;4BACH,IAAI,QAAQ,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC,EAC5D;gCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;gCACjB,QAAQ,GAAG,KAAK,CAA
C;6BACjB;4BAED,IAAI,QAAQ,IAAI,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC,EAC7C;gCACC,IAAI,KAAK,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,IAAI,CAC5B,KAAK,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAClB,EACD;oCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;oCAEd,IAAI,IAAI,CAAC,CAAC,KAAK,GAAG,EAClB;wCACC;;2CAEG;wCACH,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;wCACjB,QAAQ,GAAG,KAAK,CAAC;qCACjB;iCACD;6BACD;4BAED,IAAI,QAAQ,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC,IAAI,iBAAS,CAAC,IAAI,CAAC,CAAC,EACrD,MAAM,CAAC,GAAG,CACV,IAAI,iBAAS,CAAC,KAAK,CAAC,CAAC,EACrB,MAAM,CAAC,GAAG,EACV,MAAM,CAAC,GAAG,CACV,EACD;gCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;gCACd,QAAQ,GAAG,KAAK,CAAC;6BACjB;iCACI,IAAI,QAAQ,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC,IAAI,iBAAS,CAAC,IAAI,CAAC,CAAC,EAC1D,MAAM,CAAC,GAAG,CACV,IAAI,iBAAS,CAAC,KAAK,CAAC,CAAC,EACrB,MAAM,CAAC,GAAG,CACV,EACD;gCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC;gCACnB,QAAQ,GAAG,KAAK,CAAC;6BACjB;4BAED,sBAAsB;4BACtB,IAAI,KAAK,CAAC,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,IAAI,iBAAS,CAAC,IAAI,CAAC,CAAC,EAC1D,MAAM,CAAC,IAAI,EACX,MAAM,CAAC,GAAG,CACV,EACD;gCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;6BACd;4BACD,sBAAsB;iCACjB,IACJ,CACC,KAAK,CAAC,CAAC,KAAK,GAAG;mCACZ,KAAK,CAAC,CAAC,KAAK,GAAG,CAClB;mCACE,iBAAS,CAAC,CAAC,CAAC,CAAC,EAChB,MAAM,CAAC,GAAG,CACT,EAEF;gCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;6BACd;4BAED,IACC,CACC,CAAC,CAAC,CAAC,KAAK,GAAG;mCACR,CAAC,CAAC,CAAC,KAAK,GAAG,CACd;mCACE,iBAAS,CAAC,IAAI,CAAC,CAAC,EACnB,MAAM,CAAC,GAAG,CACT;mCACE,iBAAS,CAAC,KAAK,CAAC,CAAC,EACpB,MAAM,CAAC,GAAG,CACT,EAEF;gCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;6BACd;yBACD;6BAED;4BACC,IAAI,QAAQ,GAAY,IAAI,CAAC;4BAE7B;;+BAEG;4BACH,IAAI,QAAQ,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC,IAAI,iBAAS,CAAC,IAAI,CAAC,CAAC,EACrD,MAAM,CAAC,GAAG,CACV,EACD;gCACC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;gCACjB,QAAQ,GAAG,KAAK,CAAC;6BACjB;yBACD;qBACD;oBACD,8CAA8C;iBAC9C;qBAED;oBACC,UAAU;oBACV,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAA
C;iBACd;gBACD,MAAM;gBACN,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;gBAC5C,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;aAChB;YAED,iBAAiB;YACjB,IAAI,OAAO,KAAK,KAAK;gBAAE,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC;YAE1C,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;YACzC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;SACzC;QAED,sBAAsB;QAEtB,OAAO;QACP,IAAI,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;QAC/B,IAAI,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC;QAE5B,IAAI,KAAK,EACT;YACC,sBAAsB;YACtB,sCAAsC;YACtC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC;iBAChC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,EAAE,GAAG,OAAO,EAAE,CAAC,EAAE,KAAK,EAAE,MAAM,CAAC,CAAsB,CAAC,EAAE,KAAK,EAAE,CAAA,CAAC,CAAC,CAAC,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC;YACrG,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC;YACvD,mBAAmB;YACnB,yBAAyB;SACzB;QAED,WAAW;QACX,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,IAAW,EAAE,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EACrD;YACC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,KAAK,CAAC,EACtB;gBACC,SAAS,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;aACzB;SACD;QACD,GAAG,GAAG,SAAS,CAAC;QAEhB,YAAY;QACZ,MAAM,GAAG,SAAS,CAAC;QACnB,MAAM,GAAG,SAAS,CAAC;QACnB,SAAS,GAAG,SAAS,CAAC;QACtB,GAAG,GAAG,SAAS,CAAC;QAChB,OAAO,GAAG,SAAS,CAAC;QAEpB,aAAa;QACb,OAAO,GAAG,CAAC;IACZ,CAAC;IAED;;;;;OAKG;IACH,OAAO,CAAC,MAAyB;QAEhC,gBAAgB;QAChB,SAAS;QACT,IAAI,GAAG,GAAe;YACrB,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;YACd,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;YACd,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;YACd,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;YACd,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;SACd,CAAC;QAEF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,GAAe,EAAE,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EACrD;YACC,IAAI,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;gBAAE,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAE,UAAU;YAC7C,IAAI,GAAG,CAAC,CAAC,GAAG,
GAAG,CAAC,CAAC;gBAAE,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAE,SAAS;YAC5C,IAAI,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;gBAAE,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAE,UAAU;YAC7C,IAAI,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;gBAAE,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAE,UAAU;YAC7C,IAAI,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;gBAAE,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAE,UAAU;SAC7C;QACD,aAAa;QAEb,OAAO;QACP,IAAI,IAAI,GAAa,EAAE,CAAC;QACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,GAAe,EAAE,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EACrD;YACC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;YACZ,WAAW;YACX,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC;YACjC,YAAY;YACZ,IAAI,GAAG,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC;gBAAE,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YACjC,YAAY;YACZ,IAAI,GAAG,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC;gBAAE,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YACjC,YAAY;YACZ,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAA,iBAAiB;YAC5C,gBAAgB;YAChB,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;YAE3D,GAAG,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YAEpB,8BAA8B;SAC9B;QACD,yBAAyB;QAEzB,oBAAoB;QACpB,sBAAsB;QAEtB,0BAA0B;QAC1B,MAAM,UAAU,GAAG,KAAK,CAAC;QAEzB,SAAS;QACT,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;QACnB,KAAK,IAAI,CAAC,IAAI,IAAI,EAClB;YACC,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YAChB,IAAI,CAAC,GAAG,IAAI,EACZ;gBACC,KAAK,GAAG,CAAkB,CAAC;gBAC3B,IAAI,GAAG,CAAC,CAAC;aACT;iBACI,IAAI,CAAC,KAAK,IAAI,EACnB;gBACC;;;;mBAIG;gBACH,IAAI,CAAC,GAAG,CAAC,CAAC;gBACV,IAAI,CAAC,GAAG,CAAC,CAAC;gBACV,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EACjC;oBACC,CAAC,EAAE,CAAC;iBACJ;qBACI,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EACxC;oBACC,CAAC,EAAE,CAAC;iBACJ;gBACD,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EACjC;oBACC,CAAC,EAAE,CAAC
;iBACJ;qBACI,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EACxC;oBACC,CAAC,EAAE,CAAC;iBACJ;gBACD,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EACjC;oBACC,CAAC,EAAE,CAAC;iBACJ;qBACI,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EACxC;oBACC,CAAC,EAAE,CAAC;iBACJ;gBACD,IAAI,CAAC,GAAG,CAAC,EACT;oBACC,KAAK,GAAG,CAAkB,CAAC;oBAC3B,IAAI,GAAG,CAAC,CAAC;iBACT;aACD;YACD,+BAA+B;SAC/B;QACD,kDAAkD;QAElD,MAAM,GAAG,SAAS,CAAC;QACnB,GAAG,GAAG,SAAS,CAAC;QAEhB,OAAO,KAAK,CAAC;IACd,CAAC;IAED;;;;;;OAMG;IACH,UAAU,CAAC,KAAc,EAAE,IAAY;QAItC,IAAI,OAAO,GAAG,EAAE,CAAC;QACjB,WAAW;QACX,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,EAAE,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAC1C;YACC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EACpB;gBACC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC;aACrB;YACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;SAC3B;QACD,kBAAkB;QAClB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EACpC;YACC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,EACf;gBACC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;aACjD;SACD;QAED,OAAO,OAAO,CAAC;IAChB,CAAC;IAED;;;;;;;;OAQG;IACH,SAAS,CAAC,OAET,EAAE,GAAW,EAAE,IAAa,EAAE,WAAW,GAAG,CAAC,EAAE,eAAwB;QAGvE;;;WAGG;QACH,IAAI,WAAW,KAAK,CAAC,EACrB;YACC,eAAe,GAAG,IAAI,CAAC,eAAe,CAAC;YAEvC;;eAEG;YACH,IAAI,IAAI,CAAC,MAAM,GAAG,eAAe,EACjC;gBACC,eAAe,IAAI,CAAC,CAAC;aACrB;SACD;aACI,IAAI,eAAe,IAAI,IAAI,CAAC,eAAe,EAChD;YACC,eAAe,GAAG,IAAI,CAAC,GAAG,CAAC,eAAe,GAAG,CAAC,EAAE,IAAI,CAAC,2BAA2B,EAAE,mCAA2B,CAAC,CAAA;SAC9G;aAED;YACC,4GAA4G;SAC5G;QAED;;;;WAIG;QACH,IAAI,CAAmB,CAAC;QACxB,IAAI,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,eAAe,CAAC,EACnC;YACC,IAAI,EAAE,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;YACpC,IAAI,EAAE,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;YAEjC,IAAI,IAAI,GAAG;gBACV,CAAC,EAAE,EAAE;gBACL,CAAC,EAAE,GAAG;gBACN,CAAC,EAAE,CAAC;aACK,CAAC;YAEX,I
AAI,IAAI,GAAc,EAAE,CAAC;YAEzB,IAAI,EAAE,KAAK,EAAE,EACb;gBACC,IAAI,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,GAAG,GAAG,EAAE,CAAC,MAAM,EAAE,EAAE,EAAE,WAAW,EAAE,eAAe,CAAC,CAAC;gBAExF,KAAK,IAAI,EAAE,IAAI,MAAM,EACrB;oBACC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;iBAC7B;aACD;iBAED;gBACC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;aAClB;YAEJ,0BAA0B;YAC1B,EAAE;YACF,sBAAsB;YACtB,EAAE;YACF,2CAA2C;YAExC,OAAO,IAAI,CAAC;SACZ;QAED,WAAW,EAAE,CAAC;QAEd,IAAI,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;QAE/B,sCAAsC;QAExC,WAAW;QACX,iBAAiB;QACjB,2CAA2C;QAC3C,UAAU;QACV,WAAW;QACX,OAAO;QAEL,wBAAwB;QACxB,gBAAgB;QAChB,oBAAoB;QAEpB,IAAI,GAAG,GAAc,EAAE,CAAC;QACxB,KAAK,IAAI,IAAI,IAAI,KAAK,EACtB;YACC,cAAc;YACd,IAAI,OAAO,GAAG,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC;YACrC;;eAEG;YACH,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,EACrB;gBACC,GAAG,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;aACjB;iBACI,IAAI,WAAW,GAAG,eAAe,EACtC;gBACC,eAAe;gBAEnB,uCAAuC;gBACvC,6BAA6B;gBAEzB,IAAI,EAAE,GAAY,CAAC,IAAI,CAAC,CAAC;gBAEzB,IAAI,CAAC,GAAG,OAAO,CAAC;gBAChB,OAAO,CAAC,IAAI,OAAO,EACnB;oBACC,IAAI,EAAE,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;oBAEvB,IAAI,EAAE,EACN;wBACC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;wBAEZ,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;qBACjB;yBAED;wBACC,MAAM;qBACN;iBACD;gBAED,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;aACb;iBAED;gBACC,IAAI,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;gBAElC,IAAI,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,OAAO,EAAE,CAAC,EAAE,WAAW,EAAE,eAAe,CAAE,CAAC;gBAChF,KAAK,IAAI,EAAE,IAAI,MAAM,EACrB;oBACC,GAAG,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;iBAC5B;gBAED,MAAM,GAAG,IAAI,CAAC;aACd;SACD;QAED,KAAK,GAAG,SAAS,CAAC;QAClB,OAAO,GAAG,SAAS,CAAC;QACpB,CAAC,GAAG,SAAS,CAAC;QAEd,OAAO,GAAG,CAAC;IACZ,CAAC;CACD;AAx1BD,sCAw1BC;AAkDY,QAAA,IAAI,GAAG,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,CAAuC,CAAC;AAEjG,kBAAe,aAAa,CAAC","sourcesContent":["'use strict';\n\nimport { SubSModule, SubSModuleTokenizer, ISubTokenizerCreate } from '../mod';\n// 
@ts-ignore\nimport { UString } from 'uni-string';\nimport { ITableDictRow } from '../table/dict';\nimport { hexAndAny, toHex } from '../util/index';\nimport CHS_NAMES, { FAMILY_NAME_1, FAMILY_NAME_2, SINGLE_NAME, DOUBLE_NAME_1, DOUBLE_NAME_2 } from '../mod/CHS_NAMES';\nimport Segment, { IDICT, IWord, IDICT2 } from '../Segment';\nimport { debug } from '../util';\nimport { DATETIME } from '../mod/const';\nimport IPOSTAG from '../POSTAG';\n\nexport const DEFAULT_MAX_CHUNK_COUNT = 40;\nexport const DEFAULT_MAX_CHUNK_COUNT_MIN = 30;\n\n/**\n * 字典识别模块\n *\n * @author 老雷<leizongmin@gmail.com>\n */\nexport class DictTokenizer extends SubSModuleTokenizer\n{\n\n\t/**\n\t * 防止因無分段導致分析過久甚至超過處理負荷\n\t * 越高越精準但是處理時間會加倍成長甚至超過記憶體能處理的程度\n\t *\n\t * 數字越小越快\n\t *\n\t * FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory\n\t *\n\t * @type {number}\n\t */\n\tMAX_CHUNK_COUNT = DEFAULT_MAX_CHUNK_COUNT;\n\t/**\n\t *\n\t * 追加新模式使 MAX_CHUNK_COUNT 遞減來防止無分段長段落的總處理次數過高 由 DEFAULT_MAX_CHUNK_COUNT_MIN 來限制最小值\n\t */\n\tDEFAULT_MAX_CHUNK_COUNT_MIN = DEFAULT_MAX_CHUNK_COUNT_MIN;\n\n\tprotected _TABLE: IDICT<IWord>;\n\tprotected _TABLE2: IDICT2<IWord>;\n\n\t_cache()\n\t{\n\t\tsuper._cache();\n\t\tthis._TABLE = this.segment.getDict('TABLE');\n\t\tthis._TABLE2 = this.segment.getDict('TABLE2');\n\t\tthis._POSTAG = this.segment.POSTAG;\n\n\t\tif (typeof this.segment.options.maxChunkCount == 'number' && this.segment.options.maxChunkCount > DEFAULT_MAX_CHUNK_COUNT_MIN)\n\t\t{\n\t\t\tthis.MAX_CHUNK_COUNT = this.segment.options.maxChunkCount;\n\t\t}\n\n\t\tif (typeof this.segment.options.minChunkCount == 'number' && this.segment.options.minChunkCount > DEFAULT_MAX_CHUNK_COUNT_MIN)\n\t\t{\n\t\t\tthis.DEFAULT_MAX_CHUNK_COUNT_MIN = this.segment.options.minChunkCount;\n\t\t}\n\t}\n\n\t/**\n\t * 对未识别的单词进行分词\n\t *\n\t * @param {array} words 单词数组\n\t * @return {array}\n\t */\n\tsplit(words: IWord[]): IWord[]\n\t{\n\t\t//debug(words);\n\t\tconst TABLE = this._TABLE;\n\t\t//const POSTAG 
= this._POSTAG;\n\n\t\tconst self = this;\n\n\t\tlet ret: IWord[] = [];\n\t\tfor (let i = 0, word; word = words[i]; i++)\n\t\t{\n\t\t\tif (word.p > 0)\n\t\t\t{\n\t\t\t\tret.push(word);\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// 仅对未识别的词进行匹配\n\t\t\tlet wordinfo = this.matchWord(word.w, 0, words[i - 1]);\n\t\t\tif (wordinfo.length < 1)\n\t\t\t{\n\t\t\t\tret.push(word);\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// 分离出已识别的单词\n\t\t\tlet lastc = 0;\n\n\t\t\twordinfo.forEach(function (bw, ui)\n\t\t\t{\n\t\t\t\tif (bw.c > lastc)\n\t\t\t\t{\n\t\t\t\t\tret.push({\n\t\t\t\t\t\tw: word.w.substr(lastc, bw.c - lastc),\n\t\t\t\t\t});\n\t\t\t\t}\n\n\t\t\t\tlet cw = self.createRawToken({\n\t\t\t\t\tw: bw.w,\n\t\t\t\t\tf: bw.f,\n\t\t\t\t}, TABLE[bw.w]);\n\n\t\t\t\tret.push(cw);\n\n\t\t\t\t/*\n\t\t\t\tret.push({\n\t\t\t\t\tw: bw.w,\n\t\t\t\t\tp: ww.p,\n\t\t\t\t\tf: bw.f,\n\t\t\t\t\ts: ww.s,\n\t\t\t\t});\n\t\t\t\t*/\n\t\t\t\tlastc = bw.c + bw.w.length;\n\t\t\t});\n\n\t\t\tlet lastword = wordinfo[wordinfo.length - 1];\n\t\t\tif (lastword.c + lastword.w.length < word.w.length)\n\t\t\t{\n\t\t\t\tlet cw = self.createRawToken({\n\t\t\t\t\tw: word.w.substr(lastword.c + lastword.w.length),\n\t\t\t\t});\n\n\t\t\t\tret.push(cw);\n\t\t\t}\n\t\t}\n\n\t\twords = undefined;\n\n\t\treturn ret;\n\t}\n\n\t// =================================================================\n\n\t/**\n\t * 匹配单词，返回相关信息\n\t *\n\t * @param {string} text 文本\n\t * @param {int} cur 开始位置\n\t * @param {object} preword 上一个单词\n\t * @return {array}  返回格式   {w: '单词', c: 开始位置}\n\t */\n\tprotected matchWord(text: string, cur: number, preword: IWord)\n\t{\n\t\tif (isNaN(cur)) cur = 0;\n\t\tlet ret: IWord[] = [];\n\t\tlet s = false;\n\n\t\tconst TABLE2 = this._TABLE2;\n\n\t\t// 匹配可能出现的单词\n\t\twhile (cur < text.length)\n\t\t{\n\t\t\tfor (let i in TABLE2)\n\t\t\t{\n\t\t\t\tlet w = text.substr(cur, i as any as number);\n\t\t\t\tif (w in TABLE2[i])\n\t\t\t\t{\n\t\t\t\t\tret.push({\n\t\t\t\t\t\tw: w,\n\t\t\t\t\t\tc: cur,\n\t\t\t\t\t\tf: 
TABLE2[i][w].f,\n\t\t\t\t\t});\n\t\t\t\t}\n\t\t\t}\n\t\t\tcur++;\n\t\t}\n\n\t\treturn this.filterWord(ret, preword, text);\n\t}\n\n\t/**\n\t * 选择最有可能匹配的单词\n\t *\n\t * @param {array} words 单词信息数组\n\t * @param {object} preword 上一个单词\n\t * @param {string} text 本节要分词的文本\n\t * @return {array}\n\t */\n\tprotected filterWord(words: IWord[], preword: IWord, text: string)\n\t{\n\t\tconst TABLE = this._TABLE;\n\t\tconst POSTAG = this._POSTAG;\n\t\tlet ret: IWord[] = [];\n\n\t\t// 将单词按位置分组\n\t\tlet wordpos = this.getPosInfo(words, text);\n\t\t//debug(wordpos);\n\n\t\t/**\n\t\t * 使用类似于MMSG的分词算法\n\t\t * 找出所有分词可能，主要根据一下几项来评价：\n\t\t * x、词数量最少；\n\t\t * a、词平均频率最大；\n\t\t * b、每个词长度标准差最小；\n\t\t * c、未识别词最少；\n\t\t * d、符合语法结构项：如两个连续的动词减分，数词后面跟量词加分；\n\t\t * 取以上几项综合排名最最好的\n\t\t */\n\t\tlet chunks = this.getChunks(wordpos, 0, text);\n\t\t//debug(chunks);\n\t\tlet assess: Array<IAssessRow> = [];  // 评价表\n\n\t\t//console.log(chunks);\n\n\t\t// 对各个分支就行评估\n\t\tfor (let i = 0, chunk: IWord[]; chunk = chunks[i]; i++)\n\t\t{\n\t\t\tassess[i] = {\n\t\t\t\tx: chunk.length,\n\t\t\t\ta: 0,\n\t\t\t\tb: 0,\n\t\t\t\tc: 0,\n\t\t\t\td: 0,\n\n\t\t\t\tindex: i,\n\t\t\t};\n\t\t\t// 词平均长度\n\t\t\tlet sp = text.length / chunk.length;\n\t\t\t// 句子经常包含的语法结构\n\t\t\tlet has_D_V = false;  // 是否包含动词\n\n\t\t\t// 遍历各个词\n\t\t\tlet prew: IWord;\n\n\t\t\tif (preword)\n\t\t\t{\n\t\t\t\t/*\n\t\t\t\tprew = {\n\t\t\t\t\tw: preword.w,\n\t\t\t\t\tp: preword.p,\n\t\t\t\t\tf: preword.f,\n\t\t\t\t\ts: preword.s,\n\t\t\t\t}\n\t\t\t\t*/\n\n\t\t\t\tprew = this.createRawToken(preword);\n\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\tprew = null;\n\t\t\t}\n\t\t\tfor (let j = 0, w: IWord; w = chunk[j]; j++)\n\t\t\t{\n\t\t\t\tif (w.w in TABLE)\n\t\t\t\t{\n\t\t\t\t\tw.p = TABLE[w.w].p;\n\t\t\t\t\tassess[i].a += w.f;   // 总词频\n\n\t\t\t\t\tif (j === 0 && !preword && (w.p & POSTAG.D_V))\n\t\t\t\t\t{\n\t\t\t\t\t\t/**\n\t\t\t\t\t\t * 將第一個字也計算進去是否包含動詞\n\t\t\t\t\t\t */\n\t\t\t\t\t\thas_D_V = true;\n\t\t\t\t\t}\n\n\t\t\t\t\t// ================ 检查语法结构 
===================\n\t\t\t\t\tif (prew)\n\t\t\t\t\t{\n\t\t\t\t\t\t// 如果上一个词是数词且当前词是量词（单位），则加分\n\t\t\t\t\t\tif (\n\t\t\t\t\t\t\t(prew.p & POSTAG.A_M)\n\t\t\t\t\t\t\t&&\n\t\t\t\t\t\t\t(\n\t\t\t\t\t\t\t\t(w.p & POSTAG.A_Q)\n\t\t\t\t\t\t\t\t|| w.w in DATETIME\n\t\t\t\t\t\t\t)\n\t\t\t\t\t\t)\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tassess[i].d++;\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\t// 如果当前词是动词\n\t\t\t\t\t\tif (w.p & POSTAG.D_V)\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\thas_D_V = true;\n\t\t\t\t\t\t\t// 如果是连续的两个动词，则减分\n\t\t\t\t\t\t\t//if ((prew.p & POSTAG.D_V) > 0)\n\t\t\t\t\t\t\t//assess[i].d--;\n\n\t\t\t\t\t\t\t/*\n\t\t\t\t\t\t\t// 如果是 形容词 + 动词，则加分\n\t\t\t\t\t\t\tif ((prew.p & POSTAG.D_A))\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tassess[i].d++;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t*/\n\n\t\t\t\t\t\t\t// 如果是 副词 + 动词，则加分\n\t\t\t\t\t\t\tif (prew.p & POSTAG.D_D)\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tassess[i].d++;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t\t// 如果是地区名、机构名或形容词，后面跟地区、机构、代词、名词等，则加分\n\t\t\t\t\t\tif ((\n\t\t\t\t\t\t\t\t(prew.p & POSTAG.A_NS)\n\t\t\t\t\t\t\t\t|| (prew.p & POSTAG.A_NT)\n\t\t\t\t\t\t\t\t|| (prew.p & POSTAG.D_A)\n\t\t\t\t\t\t\t) &&\n\t\t\t\t\t\t\t(\n\t\t\t\t\t\t\t\t(w.p & POSTAG.D_N)\n\t\t\t\t\t\t\t\t|| (w.p & POSTAG.A_NR)\n\t\t\t\t\t\t\t\t|| (w.p & POSTAG.A_NS)\n\t\t\t\t\t\t\t\t|| (w.p & POSTAG.A_NZ)\n\t\t\t\t\t\t\t\t|| (w.p & POSTAG.A_NT)\n\t\t\t\t\t\t\t))\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tassess[i].d++;\n\t\t\t\t\t\t}\n\t\t\t\t\t\t// 如果是 方位词 + 数量词，则加分\n\t\t\t\t\t\tif (\n\t\t\t\t\t\t\t(prew.p & POSTAG.D_F)\n\t\t\t\t\t\t\t&&\n\t\t\t\t\t\t\t(\n\t\t\t\t\t\t\t\t(w.p & POSTAG.A_M)\n\t\t\t\t\t\t\t\t|| (w.p & POSTAG.D_MQ)\n\t\t\t\t\t\t\t))\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t//debug(prew, w);\n\t\t\t\t\t\t\tassess[i].d++;\n\t\t\t\t\t\t}\n\t\t\t\t\t\t// 如果是 姓 + 名词，则加分\n\t\t\t\t\t\tif (\n\t\t\t\t\t\t\t(\n\t\t\t\t\t\t\t\tprew.w in FAMILY_NAME_1\n\t\t\t\t\t\t\t\t|| prew.w in FAMILY_NAME_2\n\t\t\t\t\t\t\t) &&\n\t\t\t\t\t\t\t(\n\t\t\t\t\t\t\t\t(w.p & POSTAG.D_N)\n\t\t\t\t\t\t\t\t|| (w.p & 
POSTAG.A_NZ)\n\t\t\t\t\t\t\t))\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t//debug(prew, w);\n\t\t\t\t\t\t\tassess[i].d++;\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\t/**\n\t\t\t\t\t\t * 地名/处所 + 方位\n\t\t\t\t\t\t */\n\t\t\t\t\t\tif (hexAndAny(prew.p\n\t\t\t\t\t\t\t, POSTAG.D_S\n\t\t\t\t\t\t\t, POSTAG.A_NS,\n\t\t\t\t\t\t) && hexAndAny(w.p\n\t\t\t\t\t\t\t, POSTAG.D_F,\n\t\t\t\t\t\t))\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tassess[i].d += 0.5;\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\t// 探测下一个词\n\t\t\t\t\t\tlet nextw = chunk[j + 1];\n\t\t\t\t\t\tif (nextw)\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tif (nextw.w in TABLE)\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tnextw.p = TABLE[nextw.w].p;\n\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\tlet _temp_ok: boolean = true;\n\n\t\t\t\t\t\t\t/**\n\t\t\t\t\t\t\t * 如果当前是“的”+ 名词，则加分\n\t\t\t\t\t\t\t */\n\t\t\t\t\t\t\tif (\n\t\t\t\t\t\t\t\t(w.w === '的' || w.w === '之')\n\t\t\t\t\t\t\t\t&& nextw.p && (\n\t\t\t\t\t\t\t\t\t(nextw.p & POSTAG.D_N)\n\t\t\t\t\t\t\t\t\t|| (nextw.p & POSTAG.D_V)\n\t\t\t\t\t\t\t\t\t|| (nextw.p & POSTAG.A_NR)\n\t\t\t\t\t\t\t\t\t|| (nextw.p & POSTAG.A_NS)\n\t\t\t\t\t\t\t\t\t|| (nextw.p & POSTAG.A_NZ)\n\t\t\t\t\t\t\t\t\t|| (nextw.p & POSTAG.A_NT)\n\t\t\t\t\t\t\t\t))\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tassess[i].d += 1.5;\n\t\t\t\t\t\t\t\t_temp_ok = false;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t/**\n\t\t\t\t\t\t\t * 如果是连词，前后两个词词性相同则加分\n\t\t\t\t\t\t\t */\n\t\t\t\t\t\t\telse if (prew.p && (w.p & POSTAG.D_C))\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tlet p = prew.p & nextw.p;\n\n\t\t\t\t\t\t\t\tif (prew.p === nextw.p)\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\tassess[i].d++;\n\t\t\t\t\t\t\t\t\t_temp_ok = false;\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t\telse if (p)\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\tassess[i].d += 0.25;\n\t\t\t\t\t\t\t\t\t_temp_ok = false;\n\n\t\t\t\t\t\t\t\t\tif (p & POSTAG.D_N)\n\t\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\tassess[i].d += 0.75;\n\t\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\t/**\n\t\t\t\t\t\t\t * 在感動的重逢中有余在的話就太過閃耀\n\t\t\t\t\t\t\t */\n\t\t\t\t\t\t\tif (_temp_ok && (w.p & 
POSTAG.D_R) && (nextw.p & POSTAG.D_P))\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tassess[i].d += 1;\n\t\t\t\t\t\t\t\t_temp_ok = false;\n\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\tif (_temp_ok && nextw.p && (w.p & POSTAG.D_P))\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tif (nextw.p & POSTAG.A_NR && (\n\t\t\t\t\t\t\t\t\tnextw.w.length > 1\n\t\t\t\t\t\t\t\t))\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\tassess[i].d++;\n\n\t\t\t\t\t\t\t\t\tif (prew.w === '的')\n\t\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\t/**\n\t\t\t\t\t\t\t\t\t\t * 的 + 介詞 + 人名\n\t\t\t\t\t\t\t\t\t\t */\n\t\t\t\t\t\t\t\t\t\tassess[i].d += 1;\n\t\t\t\t\t\t\t\t\t\t_temp_ok = false;\n\t\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\tif (_temp_ok && (w.p & POSTAG.D_P) && hexAndAny(prew.p,\n\t\t\t\t\t\t\t\tPOSTAG.D_N,\n\t\t\t\t\t\t\t) && hexAndAny(nextw.p,\n\t\t\t\t\t\t\t\tPOSTAG.D_N,\n\t\t\t\t\t\t\t\tPOSTAG.D_V,\n\t\t\t\t\t\t\t))\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tassess[i].d++;\n\t\t\t\t\t\t\t\t_temp_ok = false;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\telse if (_temp_ok && (w.p & POSTAG.D_P) && hexAndAny(prew.p,\n\t\t\t\t\t\t\t\tPOSTAG.D_R,\n\t\t\t\t\t\t\t) && hexAndAny(nextw.p,\n\t\t\t\t\t\t\t\tPOSTAG.D_R,\n\t\t\t\t\t\t\t))\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tassess[i].d += 0.5;\n\t\t\t\t\t\t\t\t_temp_ok = false;\n\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\t// @FIXME 暴力解決 三天后 的問題\n\t\t\t\t\t\t\tif (nextw.w === '后' && w.p & POSTAG.D_T && hexAndAny(prew.p,\n\t\t\t\t\t\t\t\tPOSTAG.D_MQ,\n\t\t\t\t\t\t\t\tPOSTAG.A_M,\n\t\t\t\t\t\t\t))\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tassess[i].d++;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t// @FIXME 到湖中間后手終於能休息了\n\t\t\t\t\t\t\telse if (\n\t\t\t\t\t\t\t\t(\n\t\t\t\t\t\t\t\t\tnextw.w === '后'\n\t\t\t\t\t\t\t\t\t|| nextw.w === '後'\n\t\t\t\t\t\t\t\t)\n\t\t\t\t\t\t\t\t&& hexAndAny(w.p,\n\t\t\t\t\t\t\t\tPOSTAG.D_F,\n\t\t\t\t\t\t\t\t)\n\t\t\t\t\t\t\t)\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tassess[i].d++;\n\t\t\t\t\t\t\t}\n\n\t\t\t\t\t\t\tif (\n\t\t\t\t\t\t\t\t(\n\t\t\t\t\t\t\t\t\tw.w === '后'\n\t\t\t\t\t\t\t\t\t|| w.w === 
'後'\n\t\t\t\t\t\t\t\t)\n\t\t\t\t\t\t\t\t&& hexAndAny(prew.p,\n\t\t\t\t\t\t\t\tPOSTAG.D_F,\n\t\t\t\t\t\t\t\t)\n\t\t\t\t\t\t\t\t&& hexAndAny(nextw.p,\n\t\t\t\t\t\t\t\tPOSTAG.D_N,\n\t\t\t\t\t\t\t\t)\n\t\t\t\t\t\t\t)\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tassess[i].d++;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t\telse\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlet _temp_ok: boolean = true;\n\n\t\t\t\t\t\t\t/**\n\t\t\t\t\t\t\t * 她把荷包蛋摆在像是印度烤饼的面包上\n\t\t\t\t\t\t\t */\n\t\t\t\t\t\t\tif (_temp_ok && (w.p & POSTAG.D_F) && hexAndAny(prew.p,\n\t\t\t\t\t\t\t\tPOSTAG.D_N,\n\t\t\t\t\t\t\t))\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tassess[i].d += 1;\n\t\t\t\t\t\t\t\t_temp_ok = false;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t\t// ===========================================\n\t\t\t\t}\n\t\t\t\telse\n\t\t\t\t{\n\t\t\t\t\t// 未识别的词数量\n\t\t\t\t\tassess[i].c++;\n\t\t\t\t}\n\t\t\t\t// 标准差\n\t\t\t\tassess[i].b += Math.pow(sp - w.w.length, 2);\n\t\t\t\tprew = chunk[j];\n\t\t\t}\n\n\t\t\t// 如果句子中包含了至少一个动词\n\t\t\tif (has_D_V === false) assess[i].d -= 0.5;\n\n\t\t\tassess[i].a = assess[i].a / chunk.length;\n\t\t\tassess[i].b = assess[i].b / chunk.length;\n\t\t}\n\n\t\t//console.dir(assess);\n\n\t\t// 计算排名\n\t\tlet top = this.getTops(assess);\n\t\tlet currchunk = chunks[top];\n\n\t\tif (false)\n\t\t{\n\t\t\t//console.log(assess);\n\t\t\t//console.log(Object.entries(chunks));\n\t\t\tconsole.dir(Object.entries(chunks)\n\t\t\t\t.map(([i, chunk]) => { return { i, asses: assess[i as unknown as number], chunk } }), { depth: 5 });\n\t\t\tconsole.dir({ i: top, asses: assess[top], currchunk });\n\t\t\t//console.log(top);\n\t\t\t//console.log(currchunk);\n\t\t}\n\n\t\t// 剔除不能识别的词\n\t\tfor (let i = 0, word: IWord; word = currchunk[i]; i++)\n\t\t{\n\t\t\tif (!(word.w in TABLE))\n\t\t\t{\n\t\t\t\tcurrchunk.splice(i--, 1);\n\t\t\t}\n\t\t}\n\t\tret = currchunk;\n\n\t\t// 試圖主動清除記憶體\n\t\tassess = undefined;\n\t\tchunks = undefined;\n\t\tcurrchunk = undefined;\n\t\ttop = undefined;\n\t\twordpos = 
undefined;\n\n\t\t//debug(ret);\n\t\treturn ret;\n\t}\n\n\t/**\n\t * 评价排名\n\t *\n\t * @param {object} assess\n\t * @return {object}\n\t */\n\tgetTops(assess: Array<IAssessRow>)\n\t{\n\t\t//debug(assess);\n\t\t// 取各项最大值\n\t\tlet top: IAssessRow = {\n\t\t\tx: assess[0].x,\n\t\t\ta: assess[0].a,\n\t\t\tb: assess[0].b,\n\t\t\tc: assess[0].c,\n\t\t\td: assess[0].d,\n\t\t};\n\n\t\tfor (let i = 1, ass: IAssessRow; ass = assess[i]; i++)\n\t\t{\n\t\t\tif (ass.a > top.a) top.a = ass.a;  // 取最大平均词频\n\t\t\tif (ass.b < top.b) top.b = ass.b;  // 取最小标准差\n\t\t\tif (ass.c > top.c) top.c = ass.c;  // 取最大未识别词\n\t\t\tif (ass.d < top.d) top.d = ass.d;  // 取最小语法分数\n\t\t\tif (ass.x > top.x) top.x = ass.x;  // 取最大单词数量\n\t\t}\n\t\t//debug(top);\n\n\t\t// 评估排名\n\t\tlet tops: number[] = [];\n\t\tfor (let i = 0, ass: IAssessRow; ass = assess[i]; i++)\n\t\t{\n\t\t\ttops[i] = 0;\n\t\t\t// 词数量，越小越好\n\t\t\ttops[i] += (top.x - ass.x) * 1.5;\n\t\t\t// 词总频率，越大越好\n\t\t\tif (ass.a >= top.a) tops[i] += 1;\n\t\t\t// 词标准差，越小越好\n\t\t\tif (ass.b <= top.b) tops[i] += 1;\n\t\t\t// 未识别词，越小越好\n\t\t\ttops[i] += (top.c - ass.c);//debug(tops[i]);\n\t\t\t// 符合语法结构程度，越大越好\n\t\t\ttops[i] += (ass.d < 0 ? 
top.d + ass.d : ass.d - top.d) * 1;\n\n\t\t\tass.score = tops[i];\n\n\t\t\t//debug(tops[i]);debug('---');\n\t\t}\n\t\t//debug(tops.join('  '));\n\n\t\t//console.log(tops);\n\t\t//console.log(assess);\n\n\t\t//const old_method = true;\n\t\tconst old_method = false;\n\n\t\t// 取分数最高的\n\t\tlet curri = 0;\n\t\tlet maxs = tops[0];\n\t\tfor (let i in tops)\n\t\t{\n\t\t\tlet s = tops[i];\n\t\t\tif (s > maxs)\n\t\t\t{\n\t\t\t\tcurri = i as any as number;\n\t\t\t\tmaxs = s;\n\t\t\t}\n\t\t\telse if (s === maxs)\n\t\t\t{\n\t\t\t\t/**\n\t\t\t\t * 如果分数相同，则根据词长度、未识别词个数和平均频率来选择\n\t\t\t\t *\n\t\t\t\t * 如果依然同分，則保持不變\n\t\t\t\t */\n\t\t\t\tlet a = 0;\n\t\t\t\tlet b = 0;\n\t\t\t\tif (assess[i].c < assess[curri].c)\n\t\t\t\t{\n\t\t\t\t\ta++;\n\t\t\t\t}\n\t\t\t\telse if (assess[i].c !== assess[curri].c)\n\t\t\t\t{\n\t\t\t\t\tb++;\n\t\t\t\t}\n\t\t\t\tif (assess[i].a > assess[curri].a)\n\t\t\t\t{\n\t\t\t\t\ta++;\n\t\t\t\t}\n\t\t\t\telse if (assess[i].a !== assess[curri].a)\n\t\t\t\t{\n\t\t\t\t\tb++;\n\t\t\t\t}\n\t\t\t\tif (assess[i].x < assess[curri].x)\n\t\t\t\t{\n\t\t\t\t\ta++;\n\t\t\t\t}\n\t\t\t\telse if (assess[i].x !== assess[curri].x)\n\t\t\t\t{\n\t\t\t\t\tb++;\n\t\t\t\t}\n\t\t\t\tif (a > b)\n\t\t\t\t{\n\t\t\t\t\tcurri = i as any as number;\n\t\t\t\t\tmaxs = s;\n\t\t\t\t}\n\t\t\t}\n\t\t\t//debug({ i, s, maxs, curri });\n\t\t}\n\t\t//debug('max: i=' + curri + ', s=' + tops[curri]);\n\n\t\tassess = undefined;\n\t\ttop = undefined;\n\n\t\treturn curri;\n\t}\n\n\t/**\n\t * 将单词按照位置排列\n\t *\n\t * @param {array} words\n\t * @param {string} text\n\t * @return {object}\n\t */\n\tgetPosInfo(words: IWord[], text: string): {\n\t\t[index: number]: IWord[];\n\t}\n\t{\n\t\tlet wordpos = {};\n\t\t// 将单词按位置分组\n\t\tfor (let i = 0, word; word = words[i]; i++)\n\t\t{\n\t\t\tif (!wordpos[word.c])\n\t\t\t{\n\t\t\t\twordpos[word.c] = [];\n\t\t\t}\n\t\t\twordpos[word.c].push(word);\n\t\t}\n\t\t// 按单字分割文本，填补空缺的位置\n\t\tfor (let i = 0; i < text.length; i++)\n\t\t{\n\t\t\tif 
(!wordpos[i])\n\t\t\t{\n\t\t\t\twordpos[i] = [{ w: text.charAt(i), c: i, f: 0 }];\n\t\t\t}\n\t\t}\n\n\t\treturn wordpos;\n\t}\n\n\t/**\n\t * 取所有分支\n\t *\n\t * @param {{[p: number]: Segment.IWord[]}} wordpos\n\t * @param {number} pos 当前位置\n\t * @param {string} text 本节要分词的文本\n\t * @param {number} total_count\n\t * @returns {Segment.IWord[][]}\n\t */\n\tgetChunks(wordpos: {\n\t\t[index: number]: IWord[];\n\t}, pos: number, text?: string, total_count = 0, MAX_CHUNK_COUNT?: number): IWord[][]\n\t{\n\n\t\t/**\n\t\t *\n\t\t * 追加新模式使 MAX_CHUNK_COUNT 遞減來防止無分段長段落的總處理次數過高 由 DEFAULT_MAX_CHUNK_COUNT_MIN 來限制最小值\n\t\t */\n\t\tif (total_count === 0)\n\t\t{\n\t\t\tMAX_CHUNK_COUNT = this.MAX_CHUNK_COUNT;\n\n\t\t\t/**\n\t\t\t * 只有當目前文字長度大於 MAX_CHUNK_COUNT 時才遞減\n\t\t\t */\n\t\t\tif (text.length < MAX_CHUNK_COUNT)\n\t\t\t{\n\t\t\t\tMAX_CHUNK_COUNT += 1;\n\t\t\t}\n\t\t}\n\t\telse if (MAX_CHUNK_COUNT <= this.MAX_CHUNK_COUNT)\n\t\t{\n\t\t\tMAX_CHUNK_COUNT = Math.max(MAX_CHUNK_COUNT - 1, this.DEFAULT_MAX_CHUNK_COUNT_MIN, DEFAULT_MAX_CHUNK_COUNT_MIN)\n\t\t}\n\t\telse\n\t\t{\n\t\t\t//MAX_CHUNK_COUNT = Math.max(MAX_CHUNK_COUNT, this.DEFAULT_MAX_CHUNK_COUNT_MIN, DEFAULT_MAX_CHUNK_COUNT_MIN)\n\t\t}\n\n\t\t/**\n\t\t * 忽略連字\n\t\t *\n\t\t * 例如: 啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊\n\t\t */\n\t\tlet m: RegExpMatchArray;\n\t\tif (m = text.match(/^((.+)\\2{5,})/))\n\t\t{\n\t\t\tlet s1 = text.slice(0, m[1].length);\n\t\t\tlet s2 = text.slice(m[1].length);\n\n\t\t\tlet word = {\n\t\t\t\tw: s1,\n\t\t\t\tc: pos,\n\t\t\t\tf: 0,\n\t\t\t} as IWord;\n\n\t\t\tlet _ret: IWord[][] = [];\n\n\t\t\tif (s2 !== '')\n\t\t\t{\n\t\t\t\tlet chunks = this.getChunks(wordpos, pos + s1.length, s2, total_count, MAX_CHUNK_COUNT);\n\n\t\t\t\tfor (let ws of chunks)\n\t\t\t\t{\n\t\t\t\t\t_ret.push([word].concat(ws));\n\t\t\t\t}\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\t_ret.push([word]);\n\t\t\t}\n\n//\t\t\tconsole.dir(wordpos);\n//\n//\t\t\tconsole.dir(ret);\n//\n//\t\t\tconsole.dir([pos, text, total_count]);\n\n\t\t\treturn 
_ret;\n\t\t}\n\n\t\ttotal_count++;\n\n\t\tlet words = wordpos[pos] || [];\n\n\t\t//debug(total_count, MAX_CHUNK_COUNT);\n\n//\t\tdebug({\n//\t\t\ttotal_count,\n//\t\t\tMAX_CHUNK_COUNT: this.MAX_CHUNK_COUNT,\n//\t\t\ttext,\n//\t\t\twords,\n//\t\t});\n\n\t\t// debug('getChunks: ');\n\t\t// debug(words);\n\t\t//throw new Error();\n\n\t\tlet ret: IWord[][] = [];\n\t\tfor (let word of words)\n\t\t{\n\t\t\t//debug(word);\n\t\t\tlet nextcur = word.c + word.w.length;\n\t\t\t/**\n\t\t\t * @FIXME\n\t\t\t */\n\t\t\tif (!wordpos[nextcur])\n\t\t\t{\n\t\t\t\tret.push([word]);\n\t\t\t}\n\t\t\telse if (total_count > MAX_CHUNK_COUNT)\n\t\t\t{\n\t\t\t\t// do something\n\n//\t\t\t\tconsole.log(444, words.slice(i));\n//\t\t\t\tconsole.log(333, word);\n\n\t\t\t\tlet w1: IWord[] = [word];\n\n\t\t\t\tlet j = nextcur;\n\t\t\t\twhile (j in wordpos)\n\t\t\t\t{\n\t\t\t\t\tlet w2 = wordpos[j][0];\n\n\t\t\t\t\tif (w2)\n\t\t\t\t\t{\n\t\t\t\t\t\tw1.push(w2);\n\n\t\t\t\t\t\tj += w2.w.length;\n\t\t\t\t\t}\n\t\t\t\t\telse\n\t\t\t\t\t{\n\t\t\t\t\t\tbreak;\n\t\t\t\t\t}\n\t\t\t\t}\n\n\t\t\t\tret.push(w1);\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\tlet t = text.slice(word.w.length);\n\n\t\t\t\tlet chunks = this.getChunks(wordpos, nextcur, t, total_count, MAX_CHUNK_COUNT );\n\t\t\t\tfor (let ws of chunks)\n\t\t\t\t{\n\t\t\t\t\tret.push([word].concat(ws));\n\t\t\t\t}\n\n\t\t\t\tchunks = null;\n\t\t\t}\n\t\t}\n\n\t\twords = undefined;\n\t\twordpos = undefined;\n\t\tm = undefined;\n\n\t\treturn ret;\n\t}\n}\n\nexport namespace DictTokenizer\n{\n\t/**\n\t * 使用类似于MMSG的分词算法\n\t * 找出所有分词可能，主要根据一下几项来评价：\n\t *\n\t * x、词数量最少；\n\t * a、词平均频率最大；\n\t * b、每个词长度标准差最小；\n\t * c、未识别词最少；\n\t * d、符合语法结构项：如两个连续的动词减分，数词后面跟量词加分；\n\t *\n\t * 取以上几项综合排名最最好的\n\t */\n\texport type IAssessRow = {\n\t\t/**\n\t\t * 词数量，越小越好\n\t\t */\n\t\tx: number,\n\t\t/**\n\t\t * 词总频率，越大越好\n\t\t */\n\t\ta: number,\n\t\t/**\n\t\t * 词标准差，越小越好\n\t\t * 每个词长度标准差最小\n\t\t */\n\t\tb: number,\n\t\t/**\n\t\t * 未识别词，越小越好\n\t\t */\n\t\tc: number,\n\t\t/**\n\t\t * 
符合语法结构程度，越大越好\n\t\t * 符合语法结构项：如两个连续的动词减分，数词后面跟量词加分\n\t\t */\n\t\td: number,\n\n\t\t/**\n\t\t * 結算評分(自動計算)\n\t\t */\n\t\tscore?: number,\n\t\treadonly index?: number,\n\t};\n}\n\nexport import IAssessRow = DictTokenizer.IAssessRow;\n\nexport const init = DictTokenizer.init.bind(DictTokenizer) as ISubTokenizerCreate<DictTokenizer>;\n\nexport default DictTokenizer;\n"]} |
\ | No newline at end of file |