// Captured from the UNPKG file viewer (37.3 kB, JavaScript, "View Raw").
'use strict';
// Tag the exports object with `__esModule: true` (the marker transpiled ES
// modules carry; presumably consumed by import-interop helpers).
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the named export slots; the real values are assigned after the
// class definition further down in this file.
exports.init = exports.DictOptimizer = void 0;
const mod_1 = require("../mod");
/**
 * Matches a non-empty string made up exclusively of the compass-direction
 * characters 東, 西, 南, 北 and 东 (any mix, any length).
 */
const DIRECTIONS_REGEXP = /^[東西南北东]+$/;
/**
 * Dictionary optimizer module.
 *
 * Walks a token list from left to right and merges neighbouring tokens into
 * a single token when one of the rules in `doOptimize` applies (dictionary
 * hits, numeral combinations, compass directions, ...).
 *
 * @author 老雷<leizongmin@gmail.com>
 */
class DictOptimizer extends mod_1.SubSModuleOptimizer {
    constructor(...args) {
        super(...args);
        this.name = 'DictOptimizer';
    }

    /**
     * Refreshes the references this optimizer reads on every pass: the
     * `TABLE` dictionary and the part-of-speech flag table of the segmenter.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('TABLE');
        this._POSTAG = this.segment.POSTAG;
    }

    /**
     * Decides whether two adjacent tokens may be fused into the dictionary
     * word `nw`.
     *
     * The tags are considered compatible when any of the following holds:
     *  - both tokens carry the same tag value (the original rule);
     *  - the tags share at least one flag (a token may carry several tags);
     *  - the first token is tagged and the second has no tag at all
     *    (allows e.g. 幾 + %);
     *  - the first is flagged `D_D`, the second `D_V`, and the dictionary
     *    entry for `nw` is itself flagged `D_D` or `D_V`.
     * In every case `nw` must additionally exist in the dictionary.
     *
     * @param {object} w1 left token (reads `p`)
     * @param {object} w2 right token (reads `p`)
     * @param {object} options
     * @param {object} options.POSTAG part-of-speech flag table
     * @param {string} options.nw concatenated text of both tokens
     * @param {object} [options.nw_cache] already fetched dictionary entry for `nw`
     * @param {boolean} [options.nw_cache_exists] whether that lookup was already done
     *   (`TABLE` and `i` may also be passed by callers; they are not used here)
     * @return {boolean} true when the pair may be merged
     */
    isMergeable(w1, w2, { POSTAG, nw, nw_cache, nw_cache_exists }) {
        const p1 = w1.p;
        const p2 = w2.p;
        let compatible = false;
        // `==` is deliberate for the first test so that a null/undefined pair
        // keeps comparing equal. The shared-flag test used to be narrowed to
        // nouns (D_N); that restriction was switched off upstream, so any
        // common flag qualifies.
        if (p1 == p2 || (p1 & p2) || (p1 && typeof p2 === 'undefined')) {
            compatible = true;
        }
        else if ((p1 & POSTAG.D_D) && (p2 & POSTAG.D_V)) {
            ({ nw_cache, nw_cache_exists } = this._getWordCache(nw, nw_cache, nw_cache_exists));
            compatible = Boolean(nw_cache
                && ((nw_cache.p & POSTAG.D_D) || (nw_cache.p & POSTAG.D_V)));
        }
        return compatible
            && this._getWordCache(nw, nw_cache, nw_cache_exists).nw_cache_exists;
    }

    /**
     * Memoised dictionary lookup for a candidate word.
     *
     * When `nw_cache_exists` is `undefined` the lookup has not happened yet:
     * the entry is taken from `nw_cache` if one was supplied, otherwise from
     * the `TABLE` dictionary. Only own properties of the table count, so
     * inherited members such as `constructor` or `toString` are never
     * mistaken for dictionary entries (assumes the table stores its entries
     * as own properties).
     *
     * @param {string} nw candidate word
     * @param {object} [nw_cache] previously fetched entry
     * @param {boolean} [nw_cache_exists] result of a previous lookup
     * @return {{nw: string, nw_cache: object, nw_cache_exists: boolean}}
     */
    _getWordCache(nw, nw_cache, nw_cache_exists) {
        if (typeof nw_cache_exists === 'undefined') {
            if (!nw_cache) {
                const table = this._TABLE;
                nw_cache = Object.prototype.hasOwnProperty.call(table, nw)
                    ? table[nw]
                    : undefined;
            }
            nw_cache_exists = !!nw_cache;
        }
        return {
            nw,
            nw_cache,
            nw_cache_exists,
        };
    }

    /**
     * Dictionary optimization: merges adjacent tokens in place.
     *
     * Every merge is tagged with `{ [this.name]: ruleId }` as debug
     * information (note that rule id 6 is used by two different rules).
     * Unless `is_not_first` is strictly `true`, the list is scanned a second
     * time, because numerals produced by a merge may form new combinations.
     *
     * @param {array} words token list (modified in place through `sliceToken`)
     * @param {boolean} [is_not_first] pass `true` to run a single pass only
     *   (upstream describes it as "called by the manager")
     * @return {array} the same `words` array
     */
    doOptimize(words, is_not_first) {
        const TABLE = this._TABLE;
        const POSTAG = this._POSTAG;
        // Thousands-separated numbers: digits before the comma, digits or a
        // decimal after it. Half- and full-width digits are both accepted.
        const GROUP_HEAD = /^[\d０-９]+$/;
        const GROUP_TAIL = /^(?:(?:[\d０-９]+)?(?:\.[\d０-９]+)|(?:[\d０-９]+))$/;
        const DECIMAL_MARKS = ['點', '点'];
        const NUMERAL_JOINERS = ['.', '点', '點', '分之'];

        let i = 0;
        // Index of the last token; kept in sync while merges shrink `words`.
        let ie = words.length - 1;

        // Replaces `count` tokens starting at `i` with `token`.
        const merge = (count, token, ruleId) => {
            this.sliceToken(words, i, count, token, undefined, {
                [this.name]: ruleId,
            });
            ie -= count - 1;
        };

        while (i < ie) {
            const w1 = words[i];
            const w2 = words[i + 1];
            const nw = w1.w + w2.w;
            let nw_cache;
            let nw_cache_exists;
            // Looks `nw` up at most once per position, whichever rule asks first.
            const lookup = () => {
                ({
                    nw_cache,
                    nw_cache_exists,
                } = this._getWordCache(nw, nw_cache, nw_cache_exists));
                return nw_cache;
            };

            /**
             * adjective + particle = adjective, e.g. 不同 + 的 = 不同的
             */
            if (w1.w !== '了'
                && (w1.p & POSTAG.D_A)
                && (w2.p & POSTAG.D_U)) {
                const mw = lookup();
                if (!mw || (mw.p & POSTAG.D_A)) {
                    let p = POSTAG.D_A;
                    let f;
                    if (mw) {
                        // Known adjective: reuse the dictionary's tag and frequency.
                        p = mw.p;
                        f = mw.f;
                    }
                    else if (w1.p & POSTAG.BAD) {
                        p = POSTAG.D_A + POSTAG.BAD;
                    }
                    merge(2, {
                        w: nw,
                        p,
                        f,
                        m: [w1, w2],
                    }, 1);
                    continue;
                }
            }

            /**
             * adjective + noun = noun (only when the dictionary knows the result as a noun)
             */
            if ((w1.p & POSTAG.D_A)
                && (w2.p & POSTAG.D_N)) {
                const mw = lookup();
                if (nw_cache_exists && (mw.p & POSTAG.D_N)) {
                    merge(2, {
                        w: nw,
                        p: mw.p,
                        f: mw.f,
                        m: [w1, w2],
                    }, 7);
                    continue;
                }
            }

            // Two tokens that together form a dictionary word with compatible tags.
            if (this.isMergeable(w1, w2, {
                nw,
                POSTAG,
                TABLE,
                i,
                nw_cache,
                nw_cache_exists,
            })) {
                const mw = lookup();
                merge(2, {
                    w: nw,
                    p: mw.p,
                    f: mw.f,
                    m: [w1, w2],
                }, 2);
                continue;
            }

            // ---- numeral combinations ----
            if (w1.p & POSTAG.A_M) {
                // Numeral followed by another numeral (not starting with 第),
                // or by a half-/full-width percent sign, e.g. 10%.
                if (((w2.p & POSTAG.A_M) && !/^第/.test(w2.w))
                    || w2.w === '%'
                    || w2.w === '％') {
                    merge(2, {
                        w: nw,
                        p: POSTAG.A_M,
                        m: [w1, w2],
                    }, 3);
                    continue;
                }

                // numeral + measure word, e.g. 100个
                if (w2.p & POSTAG.A_Q) {
                    const entry = lookup();
                    let p = POSTAG.D_MQ;
                    if (entry) {
                        p = entry.p | POSTAG.D_MQ;
                    }
                    else {
                        // Carry the time/noun/verb flags of the measure word over.
                        for (const flag of [POSTAG.D_T, POSTAG.D_N, POSTAG.D_V]) {
                            if (w2.p & flag) {
                                p = p | flag;
                            }
                        }
                    }
                    merge(2, {
                        w: nw,
                        p,
                        m: [w1, w2],
                    }, 4);
                    continue;
                }

                const w3 = words[i + 2];
                if (w3 && (w3.p & POSTAG.A_M)) {
                    // Decimals such as "3 . 14" / 十五点三, and fractions
                    // written numeral + 分之 + numeral, e.g. 五十分之一.
                    if (NUMERAL_JOINERS.includes(w2.w)) {
                        merge(3, {
                            w: w1.w + w2.w + w3.w,
                            p: POSTAG.A_M,
                            m: [w1, w2, w3],
                        }, 5);
                        continue;
                    }
                    /**
                     * Thousands separators, e.g. `最多容納59,000個人,或5.9萬人`
                     */
                    if (w2.w === ',' && GROUP_HEAD.test(w1.w) && GROUP_TAIL.test(w3.w)) {
                        merge(3, {
                            w: w1.w + w2.w + w3.w,
                            p: POSTAG.A_M,
                            m: [w1, w2, w3],
                        }, 6);
                        continue;
                    }
                }
            }

            // Fix for the 十五点五八 case: a quantity token ending in 點/点
            // absorbs every numeral that directly follows it.
            if ((w1.p & POSTAG.D_MQ)
                && DECIMAL_MARKS.includes(w1.w.slice(-1))
                && (w2.p & POSTAG.A_M)) {
                const absorbed = [];
                // `ie` is the last valid index, so it must be included in the scan.
                for (let j = i + 2; j <= ie; j++) {
                    const next = words[j];
                    if ((next.p & POSTAG.A_M) > 0) {
                        absorbed.push(next);
                    }
                    else {
                        break;
                    }
                }
                merge(2 + absorbed.length, {
                    w: w1.w + w2.w + absorbed.map((token) => token.w).join(''),
                    p: POSTAG.D_MQ,
                    m: [w1, w2, ...absorbed],
                }, 6);
                continue;
            }

            /**
             * Merge compass directions (東南西北)
             */
            if (DIRECTIONS_REGEXP.test(w1.w) && DIRECTIONS_REGEXP.test(w2.w)) {
                const entry = lookup();
                const mw = this.createToken({
                    p: POSTAG.D_F,
                    ...entry,
                    w: nw,
                    m: [w1, w2],
                });
                // Whatever tag the dictionary supplied, the D_F flag is always set.
                mw.p = mw.p | POSTAG.D_F;
                this.sliceToken(words, i, 2, mw, true, {
                    [this.name]: 8,
                });
                ie--;
                continue;
            }

            // Nothing matched at this position: move on to the next token.
            i++;
        }
        // Merged numerals can enable further merges, so scan once more.
        return is_not_first === true ? words : this.doOptimize(words, true);
    }
}
// CommonJS exports: the class itself (named and default) and its static
// `init` method bound to the class so it can be called detached.
exports.DictOptimizer = DictOptimizer;
exports.init = DictOptimizer.init.bind(DictOptimizer);
exports.default = DictOptimizer;
//# sourceMappingURL=data:application/json;base64,