UNPKG

9.25 kBJavaScriptView Raw
1/**
2 * 人名优化模块
3 *
4 * @author 老雷<leizongmin@gmail.com>
5 * @version 0.1
6 */
7'use strict';
8Object.defineProperty(exports, "__esModule", { value: true });
9exports.init = exports.ChsNameOptimizer = void 0;
10const mod_1 = require("../mod");
11const CHS_NAMES_1 = require("../mod/CHS_NAMES");
12/**
13 * @todo 支援 XX氏
14 */
15class ChsNameOptimizer extends mod_1.SubSModuleOptimizer {
16 constructor() {
17 super(...arguments);
18 this.name = 'ChsNameOptimizer';
19 }
20 _cache() {
21 super._cache();
22 this._TABLE = this.segment.getDict('TABLE');
23 this._BLACKLIST = this.segment.getDict("BLACKLIST_FOR_OPTIMIZER" /* BLACKLIST_FOR_OPTIMIZER */) || {};
24 }
25 isMergeable2(...words) {
26 let nw = words.join('');
27 if (!this._BLACKLIST[nw]) {
28 return true;
29 }
30 return null;
31 }
32 isMergeable(word, nextword) {
33 if (word && nextword) {
34 let nw = word.w + nextword.w;
35 /**
36 * 不合併存在於 BLACKLIST 內的字詞
37 */
38 if (!this._BLACKLIST[nw]) {
39 return true;
40 /*
41 return {
42 word,
43 nextword,
44 nw,
45 bool: true,
46 }
47 */
48 }
49 }
50 return null;
51 }
52 /**
53 * 只有新詞屬於人名或未知詞時才會合併
54 */
55 validUnknownNewWord(ws, cb) {
56 var _a;
57 let nw = typeof ws === 'string' ? ws : ws.join('');
58 let ew = this._TABLE[nw];
59 if (!(ew === null || ew === void 0 ? void 0 : ew.p) || ew.p & this._POSTAG.A_NR) {
60 let ret = (_a = cb === null || cb === void 0 ? void 0 : cb(nw, ew, ws)) !== null && _a !== void 0 ? _a : true;
61 if (ret) {
62 return typeof ret === 'object' ? ret : (ew !== null && ew !== void 0 ? ew : true);
63 }
64 }
65 }
66 /**
67 * 对可能是人名的单词进行优化
68 *
69 * @param {array} words 单词数组
70 * @return {array}
71 */
72 doOptimize(words) {
73 //debug(words);
74 const POSTAG = this._POSTAG;
75 let i = 0;
76 /* 第一遍扫描 */
77 while (i < words.length) {
78 let word = words[i];
79 let nextword = words[i + 1];
80 if (this.isMergeable(word, nextword) && this.validUnknownNewWord(word.w + nextword.w)) {
81 let nw = word.w + nextword.w;
82 //debug(nextword);
83 // 如果为 "小|老" + 姓
84 if (nextword && (word.w == '小' || word.w == '老') &&
85 (nextword.w in CHS_NAMES_1.default.FAMILY_NAME_1 || nextword.w in CHS_NAMES_1.default.FAMILY_NAME_2)) {
86 /*
87 words.splice(i, 2, {
88 w: word.w + nextword.w,
89 p: POSTAG.A_NR,
90 m: [word, nextword],
91 });
92 */
93 this.sliceToken(words, i, 2, {
94 w: nw,
95 p: POSTAG.A_NR,
96 m: [word, nextword],
97 }, undefined, {
98 [this.name]: 1,
99 });
100 i++;
101 continue;
102 }
103 // 如果是 姓 + 名(2字以内)
104 if ((word.w in CHS_NAMES_1.default.FAMILY_NAME_1 || word.w in CHS_NAMES_1.default.FAMILY_NAME_2) &&
105 ((nextword.p & POSTAG.A_NR) > 0 && nextword.w.length <= 2)) {
106 /*
107 words.splice(i, 2, {
108 w: word.w + nextword.w,
109 p: POSTAG.A_NR,
110 m: [word, nextword],
111 });
112 */
113 this.sliceToken(words, i, 2, {
114 w: nw,
115 p: POSTAG.A_NR,
116 m: [word, nextword],
117 }, undefined, {
118 [this.name]: 2,
119 });
120 i++;
121 continue;
122 }
123 // 如果相邻两个均为单字且至少有一个字是未识别的,则尝试判断其是否为人名
124 if (!word.p || !nextword.p) {
125 if ((word.w in CHS_NAMES_1.default.SINGLE_NAME && word.w == nextword.w) ||
126 (word.w in CHS_NAMES_1.default.DOUBLE_NAME_1 && nextword.w in CHS_NAMES_1.default.DOUBLE_NAME_2)) {
127 /*
128 words.splice(i, 2, {
129 w: word.w + nextword.w,
130 p: POSTAG.A_NR,
131 m: [word, nextword],
132 });
133 */
134 this.sliceToken(words, i, 2, {
135 w: nw,
136 p: POSTAG.A_NR,
137 m: [word, nextword],
138 }, undefined, {
139 [this.name]: 3,
140 });
141 // 如果上一个单词可能是一个姓,则合并
142 let preword = words[i - 1];
143 if (preword
144 && (preword.w in CHS_NAMES_1.default.FAMILY_NAME_1 || preword.w in CHS_NAMES_1.default.FAMILY_NAME_2)
145 && this.isMergeable2(preword.w, word.w, nextword.w)) {
146 let nw = preword.w + word.w + nextword.w;
147 /*
148 words.splice(i - 1, 2, {
149 w: preword.w + word.w + nextword.w,
150 p: POSTAG.A_NR,
151 m: [preword, word, nextword],
152 });
153 */
154 this.sliceToken(words, i - 1, 2, {
155 w: nw,
156 p: POSTAG.A_NR,
157 m: [preword, word, nextword],
158 }, undefined, {
159 [this.name]: 4,
160 });
161 }
162 else {
163 i++;
164 }
165 continue;
166 }
167 }
168 // 如果为 无歧义的姓 + 名(2字以内) 且其中一个未未识别词
169 if ((word.w in CHS_NAMES_1.default.FAMILY_NAME_1 || word.w in CHS_NAMES_1.default.FAMILY_NAME_2)
170 && (!word.p || !nextword.p)
171 /**
172 * 防止將標點符號當作名字的BUG
173 */
174 && !(word.p & POSTAG.D_W || nextword.p & POSTAG.D_W)) {
175 //debug(word, nextword);
176 /*
177 words.splice(i, 2, {
178 w: word.w + nextword.w,
179 p: POSTAG.A_NR,
180 m: [word, nextword],
181 });
182 */
183 this.sliceToken(words, i, 2, {
184 w: nw,
185 p: POSTAG.A_NR,
186 m: [word, nextword],
187 }, undefined, {
188 [this.name]: 5,
189 });
190 }
191 }
192 // 移到下一个单词
193 i++;
194 }
195 /* 第二遍扫描 */
196 i = 0;
197 while (i < words.length) {
198 let word = words[i];
199 let nextword = words[i + 1];
200 if (this.isMergeable(word, nextword)) {
201 // 如果为 姓 + 单字名
202 if ((word.w in CHS_NAMES_1.default.FAMILY_NAME_1 || word.w in CHS_NAMES_1.default.FAMILY_NAME_2)
203 &&
204 nextword.w in CHS_NAMES_1.default.SINGLE_NAME) {
205 /*
206 words.splice(i, 2, {
207 w: word.w + nextword.w,
208 p: POSTAG.A_NR,
209 m: [word, nextword],
210 });
211 */
212 let nw = word.w + nextword.w;
213 let ew = this._TABLE[nw];
214 /**
215 * 更改為只有新詞屬於人名或未知詞時才會合併
216 */
217 if (!ew || !ew.p || ew.p & POSTAG.A_NR) {
218 this.sliceToken(words, i, 2, {
219 w: nw,
220 p: POSTAG.A_NR,
221 m: [word, nextword],
222 }, undefined, {
223 [this.name]: 6,
224 exists_word: ew,
225 });
226 i++;
227 continue;
228 }
229 }
230 }
231 // 移到下一个单词
232 i++;
233 }
234 return words;
235 }
236}
237exports.ChsNameOptimizer = ChsNameOptimizer;
238exports.init = ChsNameOptimizer.init.bind(ChsNameOptimizer);
239exports.default = ChsNameOptimizer;
240//# sourceMappingURL=ChsNameOptimizer.js.map
\No newline at end of file