UNPKG

4.26 kBJavaScriptView Raw
'use strict';
// Mark the CommonJS export object as a transpiled ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the named exports; they are assigned at the bottom of the file.
exports.init = exports.ChsNameTokenizer = void 0;
/**
 * Chinese personal-name recognition module.
 *
 * @author 老雷<leizongmin@gmail.com>
 */
const CHS_NAMES_1 = require("../mod/CHS_NAMES");
const mod_1 = require("../mod");
/**
 * Read a given name (one or two characters) from `text` starting at `start`.
 *
 * Two forms are accepted, checked in this order:
 *  1. a two-character given name whose first character is a key of
 *     DOUBLE_NAME_1 and whose second character is a key of DOUBLE_NAME_2;
 *  2. a one-character given name that is a key of SINGLE_NAME, extended by
 *     the following character only when that character repeats it.
 *
 * @param {string} text text being scanned
 * @param {number} start index of the first given-name character
 * @return {string|null} the matched given name, or null when nothing matches
 */
function matchGivenNameAt(text, start) {
    // charAt() yields '' past the end of the text, which simply fails the lookups.
    const first = text.charAt(start);
    const second = text.charAt(start + 1);
    if (first in CHS_NAMES_1.DOUBLE_NAME_1 && second in CHS_NAMES_1.DOUBLE_NAME_2) {
        return first + second;
    }
    if (first in CHS_NAMES_1.SINGLE_NAME) {
        return first === second ? first + second : first;
    }
    return null;
}
/**
 * Tokenizer that looks for Chinese personal names inside words that carry no
 * part-of-speech tag yet and splits them out as separate tokens.
 */
class ChsNameTokenizer extends mod_1.SubSModuleTokenizer {
    constructor() {
        super(...arguments);
        this.name = 'ChsNameTokenizer';
    }
    /**
     * Cache the 'TABLE' dictionary and the POSTAG table of the owning segment
     * on the instance, after running the parent class's own caching.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('TABLE');
        this._POSTAG = this.segment.POSTAG;
    }
    /**
     * Split personal names out of the words that have not been recognized yet.
     *
     * Words with a truthy `p` are passed through unchanged. Every other word
     * is scanned with matchName(); each name found becomes its own token
     * tagged POSTAG.A_NR, and the text before, between and after the names is
     * emitted as tokens without `p`.
     *
     * @param {array} words array of word tokens ({w: text, p: tag})
     * @return {array} new token array
     */
    split(words) {
        const POSTAG = this._POSTAG;
        const ret = [];
        // As in the original loop, iteration stops at the first falsy entry.
        for (let i = 0, word; (word = words[i]); i++) {
            if (word.p) {
                // Only untagged (unrecognized) words are examined.
                ret.push(word);
                continue;
            }
            const nameinfo = this.matchName(word.w);
            if (nameinfo.length < 1) {
                ret.push(word);
                continue;
            }
            // Index just past the last character already emitted for this word.
            let lastc = 0;
            for (const found of nameinfo) {
                if (found.c > lastc) {
                    // Text between the previous name (or word start) and this name.
                    ret.push(this.debugToken({
                        w: word.w.slice(lastc, found.c),
                    }, {
                        [this.name]: false,
                    }, true));
                }
                ret.push(this.debugToken({
                    w: found.w,
                    p: POSTAG.A_NR,
                }, {
                    [this.name]: true,
                }, true));
                lastc = found.c + found.w.length;
            }
            // Whatever follows the last name.
            if (lastc < word.w.length) {
                ret.push(this.debugToken({
                    w: word.w.slice(lastc),
                }, {
                    [this.name]: false,
                }, true));
            }
        }
        return ret;
    }
    /**
     * Find the personal names contained in a text, scanning left to right.
     *
     * At each position a two-character (compound) surname from FAMILY_NAME_2
     * is tried first, then a one-character surname from FAMILY_NAME_1; either
     * must be followed by a given name (see matchGivenNameAt). After a match
     * the scan resumes right behind the name, so results never overlap.
     *
     * @param {string} text text to scan
     * @param {int} cur start position (values for which isNaN() is true are treated as 0)
     * @return {array} list of {w: 'name', c: start index}
     */
    matchName(text, cur = 0) {
        // Global isNaN() is kept on purpose: it also resets non-numeric values to 0.
        let pos = isNaN(cur) ? 0 : cur;
        const ret = [];
        while (pos < text.length) {
            let name = null;
            // Compound (two-character) surname.
            const f2 = text.slice(pos, pos + 2);
            if (f2 in CHS_NAMES_1.FAMILY_NAME_2) {
                const given = matchGivenNameAt(text, pos + 2);
                if (given !== null) {
                    name = f2 + given;
                }
            }
            // Single-character surname, tried only when no compound surname matched.
            const f1 = text.charAt(pos);
            if (name === null && f1 in CHS_NAMES_1.FAMILY_NAME_1) {
                const given = matchGivenNameAt(text, pos + 1);
                if (given !== null) {
                    name = f1 + given;
                }
            }
            if (name === null) {
                pos++;
            }
            else {
                ret.push({ w: name, c: pos });
                pos += name.length;
            }
        }
        return ret;
    }
}
exports.ChsNameTokenizer = ChsNameTokenizer;
// ======================================================================
// Sample inputs for manually exercising matchName:
// debug(matchName('刘德华和李娜娜、司马光、上官飞飞'));
// debug(matchName('李克'));
// Expose the class's static init, bound so it can be called detached from the class.
exports.init = ChsNameTokenizer.init.bind(ChsNameTokenizer);
exports.default = ChsNameTokenizer;
//# sourceMappingURL=ChsNameTokenizer.js.map
\No newline at end of file