1 | /**
|
2 | * 人名优化模块
|
3 | *
|
4 | * @author 老雷<leizongmin@gmail.com>
|
5 | * @version 0.1
|
6 | */
|
7 | ;
|
8 | Object.defineProperty(exports, "__esModule", { value: true });
|
9 | exports.init = exports.ChsNameOptimizer = void 0;
|
10 | const mod_1 = require("../mod");
|
11 | const CHS_NAMES_1 = require("../mod/CHS_NAMES");
|
/**
 * Reports whether `key` is an own property of the lookup table `table`.
 *
 * The `in` operator also matches inherited members, so on a plain-object
 * table a token such as "constructor" or "toString" would count as an entry.
 * Checking own properties only keeps such tokens out of the name rules while
 * giving the same answer for every real entry.
 *
 * @param {object} table lookup table keyed by word text
 * @param {string} key word text to look up
 * @return {boolean}
 */
function hasOwnEntry(table, key) {
    return Object.prototype.hasOwnProperty.call(table, key);
}
/**
 * Reports whether `text` is listed in either surname table
 * (`FAMILY_NAME_1` or `FAMILY_NAME_2` of the CHS_NAMES module).
 *
 * @param {string} text word text to look up
 * @return {boolean}
 */
function isFamilyName(text) {
    const names = CHS_NAMES_1.default;
    return hasOwnEntry(names.FAMILY_NAME_1, text) || hasOwnEntry(names.FAMILY_NAME_2, text);
}
/**
 * Optimizer that merges adjacent tokens which look like a Chinese personal
 * name into a single token tagged `POSTAG.A_NR`.
 *
 * @todo support the "XX氏" form
 */
class ChsNameOptimizer extends mod_1.SubSModuleOptimizer {
    constructor(...args) {
        super(...args);
        this.name = 'ChsNameOptimizer';
    }
    /**
     * Runs the base class cache step, then caches the `TABLE` dictionary and
     * the optimizer blacklist. The blacklist falls back to an empty object
     * when the segmenter has none.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('TABLE');
        this._BLACKLIST = this.segment.getDict("BLACKLIST_FOR_OPTIMIZER" /* BLACKLIST_FOR_OPTIMIZER */) || {};
    }
    /**
     * Checks the concatenation of the given strings against the blacklist.
     *
     * @param {...string} words text fragments, joined without a separator
     * @return {true|null} `true` when the joined text is not blacklisted, otherwise `null`
     */
    isMergeable2(...words) {
        return this._BLACKLIST[words.join('')] ? null : true;
    }
    /**
     * Checks whether two adjacent tokens may be merged: both must exist and
     * their combined text must not be in the blacklist.
     *
     * @param {object} word current token (its `w` field is the text)
     * @param {object} nextword following token
     * @return {true|null} `true` when mergeable, otherwise `null`
     */
    isMergeable(word, nextword) {
        if (word && nextword && !this._BLACKLIST[word.w + nextword.w]) {
            return true;
        }
        return null;
    }
    /**
     * Reports whether a dictionary entry is missing, has no part-of-speech
     * flags, or carries the person-name flag `A_NR`.
     *
     * @param {object} [entry] value looked up in the `TABLE` dictionary
     * @return {boolean}
     */
    _isUnknownOrName(entry) {
        return !entry || !entry.p || Boolean(entry.p & this._POSTAG.A_NR);
    }
    /**
     * Replaces the two tokens starting at `index` with one person-name token.
     * The replacement is delegated to the inherited `sliceToken` helper; the
     * last argument tags the merge with the rule number under this
     * optimizer's name.
     *
     * @param {array} words token array being optimized
     * @param {number} index position of the first token to replace
     * @param {string} text text of the merged token
     * @param {array} parts original tokens stored on the merged token as `m`
     * @param {number} ruleId number identifying the rule that fired
     * @param {object} [extra] additional debug attributes
     */
    _mergeAsName(words, index, text, parts, ruleId, extra) {
        this.sliceToken(words, index, 2, {
            w: text,
            p: this._POSTAG.A_NR,
            m: parts,
        }, undefined, Object.assign({ [this.name]: ruleId }, extra));
    }
    /**
     * Accepts a candidate merged word only when the dictionary either does
     * not know it or lists it as a person name.
     *
     * @param {string|string[]} ws candidate word, or its fragments to be joined
     * @param {function} [cb] optional veto/override, called as `cb(word, entry, ws)`;
     *   a `null`/`undefined` result counts as acceptance
     * @return {*} `undefined` when rejected; otherwise the callback's object
     *   result, else the dictionary entry, else `true`
     */
    validUnknownNewWord(ws, cb) {
        const nw = typeof ws === 'string' ? ws : ws.join('');
        const ew = this._TABLE[nw];
        if (!this._isUnknownOrName(ew)) {
            return undefined;
        }
        const answer = cb == null ? undefined : cb(nw, ew, ws);
        const ret = answer == null ? true : answer;
        if (!ret) {
            return undefined;
        }
        if (typeof ret === 'object') {
            return ret;
        }
        return ew == null ? true : ew;
    }
    /**
     * Merges tokens that are likely to be personal names.
     *
     * @param {array} words token array; tokens expose `w` (text) and `p` (part-of-speech bit flags)
     * @return {array} the same `words` array
     */
    doOptimize(words) {
        const POSTAG = this._POSTAG;
        const names = CHS_NAMES_1.default;
        let i = 0;
        /* First pass */
        while (i < words.length) {
            const word = words[i];
            const nextword = words[i + 1];
            if (this.isMergeable(word, nextword) && this.validUnknownNewWord(word.w + nextword.w)) {
                const nw = word.w + nextword.w;
                const pair = [word, nextword];
                // Rule 1: "小" or "老" followed by a surname
                if ((word.w === '小' || word.w === '老') && isFamilyName(nextword.w)) {
                    this._mergeAsName(words, i, nw, pair, 1);
                    i++;
                    continue;
                }
                // Rule 2: surname + given name (at most 2 characters) already tagged as a name
                if (isFamilyName(word.w)
                    && (nextword.p & POSTAG.A_NR) > 0 && nextword.w.length <= 2) {
                    this._mergeAsName(words, i, nw, pair, 2);
                    i++;
                    continue;
                }
                // Rule 3: at least one of the two tokens is unrecognised and together
                // they match the given-name tables (a repeated single-name character,
                // or a first + second character of a double name)
                if ((!word.p || !nextword.p)
                    && ((hasOwnEntry(names.SINGLE_NAME, word.w) && word.w === nextword.w)
                        || (hasOwnEntry(names.DOUBLE_NAME_1, word.w) && hasOwnEntry(names.DOUBLE_NAME_2, nextword.w)))) {
                    this._mergeAsName(words, i, nw, pair, 3);
                    // Rule 4: if the preceding token may be a surname, fold it in as well
                    const preword = words[i - 1];
                    if (preword
                        && isFamilyName(preword.w)
                        && this.isMergeable2(preword.w, word.w, nextword.w)) {
                        // The merged token now sits at i - 1, so index i already
                        // points at the next unprocessed token: do not advance.
                        this._mergeAsName(words, i - 1, preword.w + word.w + nextword.w, [preword, word, nextword], 4);
                    }
                    else {
                        i++;
                    }
                    continue;
                }
                // Rule 5: surname + following token where one of them is unrecognised;
                // punctuation (D_W) is excluded so it is never taken for a name
                if (isFamilyName(word.w)
                    && (!word.p || !nextword.p)
                    && !(word.p & POSTAG.D_W || nextword.p & POSTAG.D_W)) {
                    this._mergeAsName(words, i, nw, pair, 5);
                }
            }
            // Move on to the next token
            i++;
        }
        /* Second pass */
        for (i = 0; i < words.length; i++) {
            const word = words[i];
            const nextword = words[i + 1];
            // Rule 6: surname + single-character given name
            if (this.isMergeable(word, nextword)
                && isFamilyName(word.w)
                && hasOwnEntry(names.SINGLE_NAME, nextword.w)) {
                const nw = word.w + nextword.w;
                const ew = this._TABLE[nw];
                // Only merge when the combined word is unknown or a person name
                if (this._isUnknownOrName(ew)) {
                    this._mergeAsName(words, i, nw, [word, nextword], 6, { exists_word: ew });
                }
            }
        }
        return words;
    }
}
|
// CommonJS exports: the class is available as a named export.
exports.ChsNameOptimizer = ChsNameOptimizer;
// `init` is not defined in the class body above, so it is presumably an
// inherited static; it is bound to the class so it keeps its receiver when
// called as a bare function.
exports.init = ChsNameOptimizer.init.bind(ChsNameOptimizer);
// The class is also the module's default export.
exports.default = ChsNameOptimizer;
|
240 | //# sourceMappingURL=ChsNameOptimizer.js.map |
\ | No newline at end of file |