1 | ;
|
2 | Object.defineProperty(exports, "__esModule", { value: true });
|
3 | exports.init = exports.DictOptimizer = void 0;
|
4 | const mod_1 = require("../mod");
|
// Matches a word made up solely of the compass-direction characters
// 東 / 东 (east), 西 (west), 南 (south) and 北 (north), one or more of them.
const DIRECTIONS_REGEXP = /^[東西南北东]+$/;
|
/**
 * Dictionary-based optimizer.
 *
 * Walks a list of word tokens and merges adjacent tokens that together form
 * a single word: dictionary hits, numeral / measure-word combinations and
 * compound compass directions.
 *
 * @author 老雷<leizongmin@gmail.com>
 */
class DictOptimizer extends mod_1.SubSModuleOptimizer {
    constructor(...args) {
        super(...args);
        this.name = 'DictOptimizer';
    }

    /**
     * Caches the dictionary table and the part-of-speech flag table taken
     * from the owning segment so the hot loop can read them directly.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('TABLE');
        this._POSTAG = this.segment.POSTAG;
    }

    /**
     * Decides whether two adjacent tokens may be merged into the dictionary
     * word `nw`.
     *
     * The part-of-speech flags must be compatible (see the branches below)
     * AND the combined word must exist in the dictionary.
     *
     * @param {object} w1 left token (`w` text, `p` part-of-speech bit flags)
     * @param {object} w2 right token
     * @param {object} options
     * @param {object} options.POSTAG part-of-speech flag table
     * @param {object} options.TABLE dictionary table (not read here; kept for overriding subclasses)
     * @param {string} options.nw concatenation of `w1.w` and `w2.w`
     * @param {number} options.i index of `w1` (not read here; kept for overriding subclasses)
     * @param {object} [options.nw_cache] already looked-up dictionary entry for `nw`, if any
     * @param {boolean} [options.nw_cache_exists] whether that lookup has been done and succeeded
     * @return {boolean} truthy when the two tokens should be merged
     */
    isMergeable(w1, w2, { POSTAG, TABLE, nw, i, nw_cache, nw_cache_exists, }) {
        let cached = { nw_cache, nw_cache_exists };
        let compatible = false;

        // Loose equality is kept on purpose: it treats a null and an
        // undefined `p` as equal, exactly like the historical check.
        if (w1.p == w2.p) {
            // Original rule: identical part of speech.
            compatible = true;
        }
        else if (w1.p & w2.p) {
            // A single entry can carry several parts of speech, so sharing
            // at least one flag is enough.
            compatible = true;
        }
        else if (w1.p && typeof w2.p === 'undefined') {
            // Allows combinations such as 幾 + % where the right token has
            // no part of speech at all.
            compatible = true;
        }
        else if ((w1.p & POSTAG.D_D) && (w2.p & POSTAG.D_V)) {
            // D_D + D_V is only accepted when the dictionary entry for the
            // combined word itself carries one of those two flags.
            cached = this._getWordCache(nw, cached.nw_cache, cached.nw_cache_exists);
            const entry = cached.nw_cache;
            compatible = Boolean(entry && ((entry.p & POSTAG.D_D) || (entry.p & POSTAG.D_V)));
        }

        return compatible
            && this._getWordCache(nw, cached.nw_cache, cached.nw_cache_exists).nw_cache_exists;
    }

    /**
     * Memoized dictionary lookup for `nw`.
     *
     * When `nw_cache_exists` is still `undefined` the dictionary has not been
     * consulted yet: the entry is read from the table (unless one was passed
     * in) and the existence flag is derived from it. Otherwise the arguments
     * are returned unchanged.
     *
     * @param {string} nw word to look up
     * @param {object} [nw_cache] previously fetched entry
     * @param {boolean} [nw_cache_exists] previously computed existence flag
     * @return {{nw: string, nw_cache: object, nw_cache_exists: boolean}}
     */
    _getWordCache(nw, nw_cache, nw_cache_exists) {
        let entry = nw_cache;
        let exists = nw_cache_exists;
        if (typeof exists === 'undefined') {
            entry = entry || this._TABLE[nw];
            exists = !!entry;
        }
        return {
            nw,
            nw_cache: entry,
            nw_cache_exists: exists,
        };
    }

    /**
     * Dictionary optimization: merges adjacent tokens that form one word.
     *
     * The scan is run twice, because numerals combined during the first pass
     * can form new combinations that are only recognised on a rescan.
     *
     * @param {Array} words token list; modified through `sliceToken` and returned
     * @param {boolean} [is_not_first] when strictly `true` the list is returned
     *        after a single pass; any other value triggers one extra pass
     * @return {Array} the same `words` list
     */
    doOptimize(words, is_not_first) {
        const TABLE = this._TABLE;
        const POSTAG = this._POSTAG;

        // Hoisted patterns (none is global/sticky, so sharing them is safe).
        const ordinalPrefix = /^第/;
        // \uFF10-\uFF19 are the full-width digits 0-9.
        const digitRun = /^[\d\uFF10-\uFF19]+$/;
        const digitRunOrDecimal = /^(?:(?:[\d\uFF10-\uFF19]+)?(?:\.[\d\uFF10-\uFF19]+)|(?:[\d\uFF10-\uFF19]+))$/;
        const vagueMagnitude = /^[數数幾几][百千萬十億兆万亿]$/;
        const vagueQuantifier = /^[數数幾几]$/;

        // Part of speech of a dictionary entry, or undefined when there is none.
        const posOf = (entry) => (entry == null ? undefined : entry.p);

        let i = 0;
        // Index of the last token. NOTE(review): the bookkeeping below assumes
        // sliceToken replaces `count` tokens at `i` with one token - confirm
        // against the base class.
        let ie = words.length - 1;

        // Replaces `count` tokens starting at `i` with `token`, tags the merge
        // with this optimizer's rule id and keeps `ie` in sync.
        const replace = (count, token, ruleId, flag) => {
            this.sliceToken(words, i, count, token, flag, {
                [this.name]: ruleId,
            });
            ie -= count - 1;
        };

        while (i < ie) {
            const w1 = words[i];
            const w2 = words[i + 1];
            const w3 = words[i + 2]; // may be undefined near the end
            const nw = w1.w + w2.w;

            // Lazily fetched dictionary entry for `nw`, shared by all rules
            // of this iteration.
            let cached = { nw_cache: undefined, nw_cache_exists: undefined };
            const lookup = () => {
                cached = this._getWordCache(nw, cached.nw_cache, cached.nw_cache_exists);
                return cached.nw_cache;
            };

            // Rule 1: adjective + particle = adjective, e.g. 不同 + 的 = 不同的
            if (w1.w !== '了'
                && (w1.p & POSTAG.D_A)
                && (w2.p & POSTAG.D_U)) {
                const entry = lookup();
                if (!entry || (entry.p & POSTAG.D_A)) {
                    let p = POSTAG.D_A;
                    let f;
                    if (entry) {
                        // Here the entry is known to carry D_A: reuse its data.
                        p = entry.p;
                        f = entry.f;
                    }
                    else if (w1.p & POSTAG.BAD) {
                        // Propagate the BAD flag of the left token.
                        p = POSTAG.D_A | POSTAG.BAD;
                    }
                    replace(2, {
                        w: nw,
                        p,
                        f,
                        m: [w1, w2],
                    }, 1);
                    continue;
                }
            }

            // Rule 7: adjective + noun = noun (only when the dictionary says so)
            if ((w1.p & POSTAG.D_A)
                && (w2.p & POSTAG.D_N)) {
                const entry = lookup();
                if (cached.nw_cache_exists && (entry.p & POSTAG.D_N)) {
                    replace(2, {
                        w: nw,
                        p: entry.p,
                        f: entry.f,
                        m: [w1, w2],
                    }, 7);
                    continue;
                }
            }

            // Rule 2: two tokens that form a dictionary word with compatible
            // parts of speech.
            if (this.isMergeable(w1, w2, {
                nw,
                POSTAG,
                TABLE,
                i,
                nw_cache: cached.nw_cache,
                nw_cache_exists: cached.nw_cache_exists,
            })) {
                const entry = lookup();
                replace(2, {
                    w: nw,
                    p: entry.p,
                    f: entry.f,
                    m: [w1, w2],
                }, 2);
                continue;
            }

            // ---- numeral combinations ----
            if (w1.p & POSTAG.A_M) {
                // Rule 3: numeral + numeral (unless the second starts with
                // the ordinal prefix 第), or a percentage such as 10%.
                // \uFF05 is the full-width percent sign.
                if (((w2.p & POSTAG.A_M) && !ordinalPrefix.test(w2.w))
                    || w2.w === '%'
                    || w2.w === '\uFF05') {
                    replace(2, {
                        w: nw,
                        p: POSTAG.A_M,
                        m: [w1, w2],
                    }, 3);
                    continue;
                }

                // Rule 4: numeral + measure word, e.g. 100个
                if (w2.p & POSTAG.A_Q) {
                    const p = this._mergeWordHowManyProp(POSTAG.D_MQ, w2.p, posOf(lookup()));
                    replace(2, {
                        w: nw,
                        p,
                        m: [w1, w2],
                    }, 4);
                    continue;
                }

                if (w3 && (w3.p & POSTAG.A_M)) {
                    // Rule 5: decimals ("3 . 14", 十五点三) and fractions
                    // (numeral + 分之 + numeral, e.g. 五十分之一).
                    if (w2.w === '.'
                        || w2.w === '点'
                        || w2.w === '點'
                        || w2.w === '分之') {
                        replace(3, {
                            w: nw + w3.w,
                            p: POSTAG.A_M,
                            m: [w1, w2, w3],
                        }, 5);
                        continue;
                    }

                    // Rule 6: thousands separator, e.g. 59,000 in
                    // `最多容納59,000個人`.
                    if (w2.w === ','
                        && digitRun.test(w1.w)
                        && digitRunOrDecimal.test(w3.w)) {
                        replace(3, {
                            w: nw + w3.w,
                            p: POSTAG.A_M,
                            m: [w1, w2, w3],
                        }, 6);
                        continue;
                    }
                }
            }

            // Rule 9: vague quantity + measure word, e.g. 數百 + 遍
            if (vagueMagnitude.test(w1.w) && (w2.p & POSTAG.A_Q)) {
                const p = this._mergeWordHowManyProp(POSTAG.D_MQ, w2.p, posOf(lookup()));
                replace(2, {
                    w: nw,
                    p,
                    m: [w1, w2],
                }, 9);
                continue;
            }

            // Rule 9 (three tokens): 數/幾 + numeral + measure word,
            // e.g. 已經看過數百遍的動畫。
            if (vagueQuantifier.test(w1.w)
                && (w2.p & POSTAG.A_M)
                && w3 != null
                && (w3.p & POSTAG.A_Q)) {
                const phrase = nw + w3.w;
                const entry = this._TABLE[phrase];
                // Only merge when the dictionary has no tagged entry for the phrase.
                if (!posOf(entry)) {
                    const p = this._mergeWordHowManyProp(POSTAG.D_MQ, w3.p, posOf(entry));
                    replace(3, {
                        w: phrase,
                        p,
                        m: [w1, w2, w3],
                    }, 9);
                    continue;
                }
            }

            // Rule 6 (decimal continuation): fixes the "十五点五八" case where
            // 十五点 was already merged as numeral + measure word and the
            // fractional digits follow as separate numerals.
            if ((w1.p & POSTAG.D_MQ)
                && (w1.w.endsWith('點') || w1.w.endsWith('点'))
                && (w2.p & POSTAG.A_M)) {
                let count = 2;
                let tail = '';
                // `j <= ie` so that the very last token is absorbed as well;
                // the former `j < ie` bound left a trailing digit unmerged.
                for (let j = i + 2; j <= ie; j++) {
                    const extra = words[j];
                    if ((extra.p & POSTAG.A_M) > 0) {
                        tail += extra.w;
                        count++;
                    }
                    else {
                        break;
                    }
                }
                replace(count, {
                    w: nw + tail,
                    p: POSTAG.D_MQ,
                    m: [w1, w2, tail],
                }, 6);
                continue;
            }

            // Rule 8: merge compound compass directions (東南西北).
            if (DIRECTIONS_REGEXP.test(w1.w) && DIRECTIONS_REGEXP.test(w2.w)) {
                const merged = this.createToken({
                    p: POSTAG.D_F,
                    ...lookup(),
                    w: nw,
                    m: [w1, w2],
                });
                // Make sure D_F is set even when the dictionary entry's own
                // `p` replaced the default above.
                merged.p = merged.p | POSTAG.D_F;
                replace(2, merged, 8, true);
                continue;
            }

            // Nothing merged at this position: move on to the next token.
            i++;
        }

        // Freshly combined numerals may form new combinations, so scan once more.
        return is_not_first === true ? words : this.doOptimize(words, true);
    }

    /**
     * Computes the part of speech of a merged "numeral + measure word" token.
     *
     * @param {number} p base flags for the merged token
     * @param {number} p2 flags of the measure word being merged
     * @param {number} [p3] flags of the dictionary entry for the merged word, if any
     * @return {number} `p3 | D_MQ` when `p3` is truthy; otherwise `p` plus
     *         whichever of D_T, D_N and D_V are present in `p2`
     */
    _mergeWordHowManyProp(p, p2, p3) {
        const POSTAG = this._POSTAG;
        if (p3) {
            return p3 | POSTAG.D_MQ;
        }
        let merged = p;
        for (const flag of [POSTAG.D_T, POSTAG.D_N, POSTAG.D_V]) {
            if (p2 & flag) {
                merged = merged | flag;
            }
        }
        return merged;
    }
}
|
// CommonJS exports: the class is exposed both by name and as the default export.
exports.DictOptimizer = DictOptimizer;
// `init` is DictOptimizer.init bound to the class so it can be called detached.
// NOTE(review): `init` is not defined in this file - presumably a static
// inherited from SubSModuleOptimizer; confirm.
exports.init = DictOptimizer.init.bind(DictOptimizer);
exports.default = DictOptimizer;
|
403 | //# sourceMappingURL=DictOptimizer.js.map |
\ | No newline at end of file |