UNPKG

14.3 kBJavaScriptView Raw
'use strict';
// CommonJS interop marker: flags this module's exports as originating from an ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the named exports; the real values are assigned at the end of the file.
exports.init = exports.DictOptimizer = void 0;
// Project-local module that provides the SubSModuleOptimizer base class used below.
const mod_1 = require("../mod");
// Matches a word made up solely of compass-direction characters
// (east / west / south / north, including the simplified form of "east").
// No `g`/`y` flag, so `.test()` keeps no state between calls.
const DIRECTIONS_REGEXP = /^[東西南北东]+$/;
/**
 * Dictionary optimization module.
 *
 * Walks an already-segmented word list and merges adjacent words into a single
 * token when the dictionary table or a handful of numeral / direction rules
 * say they belong together.
 *
 * @author 老雷<leizongmin@gmail.com>
 */
class DictOptimizer extends mod_1.SubSModuleOptimizer {
    constructor(...args) {
        super(...args);
        this.name = 'DictOptimizer';
    }

    /**
     * Stores the dictionary table and the POS-tag constants on the instance
     * so the hot loop in `doOptimize` does not have to look them up again.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('TABLE');
        this._POSTAG = this.segment.POSTAG;
    }

    /**
     * Decides whether two adjacent words may be replaced by their concatenation.
     *
     * A merge is allowed only when one of the POS-tag rules below matches AND
     * the concatenated word `nw` has an entry in the dictionary table.
     *
     * @param {object} w1 left word (`p` is read as a POS-tag bit mask)
     * @param {object} w2 right word
     * @param {object} options
     * @param {object} options.POSTAG POS-tag constants
     * @param {object} options.TABLE dictionary table (not read here; kept for interface compatibility)
     * @param {string} options.nw concatenation of `w1.w` and `w2.w`
     * @param {number} options.i index of `w1` (not read here; kept for interface compatibility)
     * @param {object} [options.nw_cache] dictionary entry of `nw`, when already looked up
     * @param {boolean} [options.nw_cache_exists] whether that lookup found an entry
     * @return {boolean|undefined} truthy when the pair may be merged; `undefined` when no rule matched
     */
    isMergeable(w1, w2, { POSTAG, TABLE, nw, i, nw_cache, nw_cache_exists, }) {
        let bool;
        // Original rule: both words carry the same tag value.
        // Loose equality is kept on purpose so the comparison behaves exactly as before.
        if (w1.p == w2.p) {
            bool = true;
        }
        // Not certain this is bug-free, but the original rule no longer suffices
        // because a single entry can carry several POS tags: any shared bit is enough.
        else if (w1.p & w2.p) {
            bool = true;
        }
        // Allow e.g. 幾 + % (the right-hand word has no POS tag at all).
        else if (w1.p && typeof w2.p === 'undefined') {
            bool = true;
        }
        // D_D + D_V: only when the dictionary entry of the merged word is itself D_D or D_V.
        else if ((w1.p & POSTAG.D_D) && (w2.p & POSTAG.D_V)) {
            ({
                nw_cache,
                nw_cache_exists,
            } = this._getWordCache(nw, nw_cache, nw_cache_exists));
            const mw = nw_cache;
            if (mw && (mw.p & POSTAG.D_D || mw.p & POSTAG.D_V)) {
                bool = true;
            }
        }
        return bool
            && this._getWordCache(nw, nw_cache, nw_cache_exists).nw_cache_exists;
    }

    /**
     * Looks up `nw` in the dictionary table unless a previous lookup result is passed in.
     *
     * The lookup is skipped whenever `nw_cache_exists` is not `undefined`, so callers
     * can thread the returned pair through several rule checks for the same word.
     *
     * NOTE(review): this is a plain property lookup; if the table is an ordinary object,
     * inherited keys such as "constructor" would count as hits — confirm how the
     * dictionary object is created.
     *
     * @param {string} nw word to look up
     * @param {object} [nw_cache] previously fetched entry
     * @param {boolean} [nw_cache_exists] previously computed existence flag
     * @return {{nw: string, nw_cache: object, nw_cache_exists: boolean}}
     */
    _getWordCache(nw, nw_cache, nw_cache_exists) {
        if (typeof nw_cache_exists === 'undefined') {
            const TABLE = this._TABLE;
            nw_cache = nw_cache || TABLE[nw];
            nw_cache_exists = !!nw_cache;
        }
        return {
            nw,
            nw_cache,
            nw_cache_exists,
        };
    }

    /**
     * Dictionary optimization.
     *
     * Merges are applied in place on `words`. The last argument passed to
     * `sliceToken` is `{ [this.name]: <rule number> }` — presumably a tag that
     * records which rule produced the merge (confirm against `sliceToken`).
     *
     * @param {array} words word array (modified in place)
     * @param {bool} is_not_first `true` when this call is the internal second pass
     * @return {array} the same `words` array
     */
    doOptimize(words, is_not_first = false) {
        const TABLE = this._TABLE;
        const POSTAG = this._POSTAG;

        // Loop-invariant patterns, hoisted out of the scan loop. None of them is
        // global/sticky, so `.test()` keeps no state between calls.
        const ORDINAL_PREFIX = /^第/;
        // ASCII digits plus full-width digits (U+FF10-U+FF19).
        const INTEGER = /^[\d\uFF10-\uFF19]+$/;
        const INTEGER_OR_DECIMAL = /^(?:(?:[\d\uFF10-\uFF19]+)?(?:\.[\d\uFF10-\uFF19]+)|(?:[\d\uFF10-\uFF19]+))$/;
        const APPROX_MAGNITUDE = /^[數数幾几][百千萬十億兆万亿]$/;
        const APPROX_PREFIX = /^[數数幾几]$/;

        // Merge two adjacent words that can form a single word.
        let i = 0;
        // `ie` is the index of the LAST word; it shrinks whenever words are merged.
        let ie = words.length - 1;
        while (i < ie) {
            const w1 = words[i];
            const w2 = words[i + 1];
            // May be undefined near the end of the list. `words` is never mutated
            // in an iteration without an immediate `continue`, so reading it once is safe.
            const w3 = words[i + 2];

            const nw = w1.w + w2.w;
            // Dictionary lookup of `nw`, filled lazily and shared by the rules below.
            let nw_cache;
            let nw_cache_exists;

            /**
             * adjective + particle = adjective, e.g. 不同 + 的 = 不同的
             */
            if (w1.w !== '了'
                && (w1.p & POSTAG.D_A)
                && (w2.p & POSTAG.D_U)) {
                let p = POSTAG.D_A;
                let f;
                ({
                    nw_cache,
                    nw_cache_exists,
                } = this._getWordCache(nw, nw_cache, nw_cache_exists));
                const mw = nw_cache;
                // Merge when the dictionary does not know the word, or knows it as an adjective.
                if (!mw || (mw.p & POSTAG.D_A)) {
                    if (mw && (mw.p & POSTAG.D_A)) {
                        p = mw.p;
                        f = mw.f;
                    }
                    else if (w1.p & POSTAG.BAD) {
                        // Carry the BAD flag of the left word over to the merged token.
                        p = POSTAG.D_A + POSTAG.BAD;
                    }
                    this.sliceToken(words, i, 2, {
                        w: nw,
                        p,
                        f,
                        m: [w1, w2],
                    }, undefined, {
                        [this.name]: 1,
                    });
                    ie--;
                    continue;
                }
            }

            /**
             * adjective + noun = noun (only when the dictionary lists the result as a noun)
             */
            if ((w1.p & POSTAG.D_A)
                && (w2.p & POSTAG.D_N)) {
                ({
                    nw_cache,
                    nw_cache_exists,
                } = this._getWordCache(nw, nw_cache, nw_cache_exists));
                if (nw_cache_exists) {
                    const mw = nw_cache;
                    if (mw.p & POSTAG.D_N) {
                        this.sliceToken(words, i, 2, {
                            w: nw,
                            p: mw.p,
                            f: mw.f,
                            m: [w1, w2],
                        }, undefined, {
                            [this.name]: 7,
                        });
                        ie--;
                        continue;
                    }
                }
            }

            // The two words form a dictionary word and their POS tags are compatible.
            if (this.isMergeable(w1, w2, {
                nw,
                POSTAG,
                TABLE,
                i,
                nw_cache,
                nw_cache_exists,
            })) {
                ({
                    nw_cache,
                    nw_cache_exists,
                } = this._getWordCache(nw, nw_cache, nw_cache_exists));
                // isMergeable only succeeds when the entry exists, so `mw` is defined here.
                const mw = nw_cache;
                this.sliceToken(words, i, 2, {
                    w: nw,
                    p: mw.p,
                    f: mw.f,
                    m: [w1, w2],
                }, undefined, {
                    [this.name]: 2,
                });
                ie--;
                continue;
            }

            // ============================================
            // Numeral combinations
            if (w1.p & POSTAG.A_M) {
                // Percentages such as 10% (ASCII or full-width percent sign), or the next
                // word is also a numeral (but not an ordinal starting with 第): merge.
                if (((w2.p & POSTAG.A_M) && !ORDINAL_PREFIX.test(w2.w))
                    || w2.w === '%'
                    || w2.w === '\uFF05') {
                    this.sliceToken(words, i, 2, {
                        w: w1.w + w2.w,
                        p: POSTAG.A_M,
                        m: [w1, w2],
                    }, undefined, {
                        [this.name]: 3,
                    });
                    ie--;
                    continue;
                }

                // numeral + quantifier: merge into a numeral-quantifier, e.g. 100个
                if (w2.p & POSTAG.A_Q) {
                    ({
                        nw_cache,
                        nw_cache_exists,
                    } = this._getWordCache(nw, nw_cache, nw_cache_exists));
                    const p = this._mergeWordHowManyProp(POSTAG.D_MQ, w2.p, nw_cache == null ? undefined : nw_cache.p);
                    this.sliceToken(words, i, 2, {
                        w: nw,
                        p,
                        m: [w1, w2],
                    }, undefined, {
                        [this.name]: 4,
                    });
                    ie--;
                    continue;
                }

                if (w3 && (w3.p & POSTAG.A_M)) {
                    // Numbers with a decimal point, e.g. "3 . 14" or "十五点三";
                    // numeral + "分之" + numeral, e.g. "五十分之一".
                    if (w2.w === '.'
                        || w2.w === '点'
                        || w2.w === '點'
                        || w2.w === '分之') {
                        this.sliceToken(words, i, 3, {
                            w: w1.w + w2.w + w3.w,
                            p: POSTAG.A_M,
                            m: [w1, w2, w3],
                        }, undefined, {
                            [this.name]: 5,
                        });
                        ie -= 2;
                        continue;
                    }

                    /**
                     * Thousands separator, to support
                     * `最多容納59,000個人,或5.9萬人,再多就不行了.這是環評的結論.`
                     */
                    if (w2.w === ',') {
                        if (INTEGER.test(w1.w) && INTEGER_OR_DECIMAL.test(w3.w)) {
                            this.sliceToken(words, i, 3, {
                                w: w1.w + w2.w + w3.w,
                                p: POSTAG.A_M,
                                m: [w1, w2, w3],
                            }, undefined, {
                                [this.name]: 6,
                            });
                            ie -= 2;
                            continue;
                        }
                    }
                }
            }

            // "several" + magnitude followed by a quantifier, e.g. 數百 + 遍
            if (APPROX_MAGNITUDE.test(w1.w) && (w2.p & POSTAG.A_Q)) {
                ({
                    nw_cache,
                    nw_cache_exists,
                } = this._getWordCache(nw, nw_cache, nw_cache_exists));
                const p = this._mergeWordHowManyProp(POSTAG.D_MQ, w2.p, nw_cache == null ? undefined : nw_cache.p);
                this.sliceToken(words, i, 2, {
                    w: nw,
                    p,
                    m: [w1, w2],
                }, undefined, {
                    [this.name]: 9,
                });
                ie--;
                continue;
            }

            // "several" + numeral + quantifier as three separate words
            if (APPROX_PREFIX.test(w1.w)
                && (w2.p & POSTAG.A_M)
                && ((w3 == null ? undefined : w3.p) & POSTAG.A_Q)) {
                const nw3 = w1.w + w2.w + w3.w;
                const entry = TABLE[nw3];
                const entryP = entry == null ? undefined : entry.p;
                /**
                 * e.g. 已經看過數百遍的動畫。
                 * Only merge when the dictionary has no tagged entry for the combined word.
                 */
                if (!entryP) {
                    const p = this._mergeWordHowManyProp(POSTAG.D_MQ, w3.p, entryP);
                    this.sliceToken(words, i, 3, {
                        w: nw3,
                        p,
                        m: [w1, w2, w3],
                    }, undefined, {
                        [this.name]: 9,
                    });
                    ie -= 2;
                    continue;
                }
            }

            // Fix for the "十五点五八" case: a numeral-quantifier ending in 點/点
            // followed by a run of numerals is a single decimal number.
            if ((w1.p & POSTAG.D_MQ)
                && ['點', '点'].includes(w1.w.slice(-1))
                && (w2.p & POSTAG.A_M)) {
                let i2 = 2;
                let w4w = '';
                // `ie` is the index of the last word, so the bound is inclusive;
                // otherwise a digit at the very end of the list would never be absorbed.
                for (let j = i + i2; j <= ie; j++) {
                    const digit = words[j];
                    if ((digit.p & POSTAG.A_M) > 0) {
                        w4w += digit.w;
                        i2++;
                    }
                    else {
                        break;
                    }
                }
                this.sliceToken(words, i, i2, {
                    w: w1.w + w2.w + w4w,
                    p: POSTAG.D_MQ,
                    // The absorbed trailing digits are recorded as one concatenated string.
                    m: [w1, w2, w4w],
                }, undefined, {
                    [this.name]: 6,
                });
                ie -= i2 - 1;
                continue;
            }

            /**
             * Merge compass directions (東南西北)
             */
            if (DIRECTIONS_REGEXP.test(w1.w)) {
                if (DIRECTIONS_REGEXP.test(w2.w)) {
                    ({
                        nw_cache,
                        nw_cache_exists,
                    } = this._getWordCache(nw, nw_cache, nw_cache_exists));
                    // Start from the dictionary entry when there is one, then force
                    // the merged text / history and make sure the D_F bit is set.
                    const mw = this.createToken({
                        p: POSTAG.D_F,
                        ...nw_cache,
                        w: nw,
                        m: [w1, w2],
                    });
                    mw.p = mw.p | POSTAG.D_F;
                    this.sliceToken(words, i, 2, mw, true, {
                        [this.name]: 8,
                    });
                    ie--;
                    continue;
                }
            }

            // Nothing merged: move on to the next word.
            i++;
        }
        // Freshly combined numbers may enable combinations that the first pass
        // could not see, so scan exactly one more time.
        return is_not_first === true ? words : this.doOptimize(words, true);
    }

    /**
     * numeral + quantifier: computes the POS tag of the merged token.
     *
     * @param {number} p base tag for the merged token
     * @param {number} p2 tag of the quantifier word
     * @param {number} [p3] tag of the dictionary entry of the merged word, if any
     * @return {number} `p3 | D_MQ` when `p3` is truthy; otherwise `p` plus whichever
     *   of the D_T / D_N / D_V bits are set on `p2`
     */
    _mergeWordHowManyProp(p, p2, p3) {
        const POSTAG = this._POSTAG;
        if (p3) {
            p = p3 | POSTAG.D_MQ;
        }
        else {
            if (p2 & POSTAG.D_T) {
                p = p | POSTAG.D_T;
            }
            if (p2 & POSTAG.D_N) {
                p = p | POSTAG.D_N;
            }
            if (p2 & POSTAG.D_V) {
                p = p | POSTAG.D_V;
            }
        }
        return p;
    }
}
// Named export of the optimizer class.
exports.DictOptimizer = DictOptimizer;
// `init` is a static method that DictOptimizer does not define itself in this file
// (presumably inherited from the base class); it is bound so it can be called
// without an explicit receiver.
exports.init = DictOptimizer.init.bind(DictOptimizer);
// Default export, same class as the named export above.
exports.default = DictOptimizer;
403//# sourceMappingURL=DictOptimizer.js.map
\No newline at end of file