UNPKG

14.3 kBJavaScriptView Raw
1'use strict';
2Object.defineProperty(exports, "__esModule", { value: true });
3exports.init = exports.ChsNameTokenizer = void 0;
4/**
5 * 中文人名识别模块
6 *
7 * @author 老雷<leizongmin@gmail.com>
8 */
9const CHS_NAMES_1 = require("../mod/CHS_NAMES");
10const mod_1 = require("../mod");
/**
 * Tokenizer sub-module that looks for Chinese personal names inside words
 * that carry no part-of-speech tag yet, and splits those words into
 * name / non-name pieces.
 */
class ChsNameTokenizer extends mod_1.SubSModuleTokenizer {
    constructor() {
        super(...arguments);
        // Also used as the key of the attribute object handed to debugToken().
        this.name = 'ChsNameTokenizer';
    }

    /**
     * Runs the parent's caching step, then stores the 'TABLE' dictionary and
     * the POSTAG table of the owning segment on this instance.
     * `_TABLE` is not read anywhere else in this class.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('TABLE');
        this._POSTAG = this.segment.POSTAG;
    }

    /**
     * Splits every untagged word around the personal names found in it.
     *
     * Words whose `p` is truthy, and words in which matchName() finds
     * nothing, are passed through untouched. For the others, each name
     * becomes a token tagged with `POSTAG.A_NR`, and the text before,
     * between and after the names becomes untagged tokens. All new tokens
     * are created through `this.debugToken(...)`.
     *
     * Iteration stops at the first falsy entry of `words`.
     *
     * @param {array} words list of word objects (`w` = text, `p` = tag)
     * @return {array} new list with names separated out
     */
    split(words) {
        const postag = this._POSTAG;
        const out = [];

        // Wraps a freshly built token with debugToken() and appends it; the
        // flag records whether this module recognised the piece as a name.
        const emit = (data, isName) => out.push(this.debugToken(data, {
            [this.name]: isName,
        }, true));

        for (let idx = 0; ; idx++) {
            const token = words[idx];
            if (!token) {
                break;
            }

            // Only untagged words are scanned for names.
            const matches = token.p ? [] : this.matchName(token.w);
            if (matches.length < 1) {
                out.push(token);
                continue;
            }

            // Walk the matches left to right, emitting the gap in front of
            // each name and then the name itself.
            let consumed = 0;
            for (let m = 0; ; m++) {
                const hit = matches[m];
                if (!hit) {
                    break;
                }
                if (hit.c > consumed) {
                    emit({
                        w: token.w.substr(consumed, hit.c - consumed),
                    }, false);
                }
                emit({
                    w: hit.w,
                    p: postag.A_NR
                }, true);
                consumed = hit.c + hit.w.length;
            }

            // Whatever follows the last name is emitted as an untagged token.
            const tail = matches[matches.length - 1];
            const tailStart = tail.c + tail.w.length;
            if (tailStart < token.w.length) {
                emit({
                    w: token.w.substr(tailStart),
                }, false);
            }
        }

        return out;
    }

    /**
     * Scans `text` from position `cur` and collects the personal names it
     * contains.
     *
     * At each position a two-character family name (FAMILY_NAME_2) is tried
     * first, then a one-character family name (FAMILY_NAME_1). The family
     * name must be followed either by a two-character given name
     * (DOUBLE_NAME_1 + DOUBLE_NAME_2) or by a one-character given name
     * (SINGLE_NAME), which is taken twice when the next character repeats
     * it. After a match the scan resumes right behind the name; otherwise it
     * advances by one character.
     *
     * @param {string} text text to scan
     * @param {int} cur start position (defaults to 0; NaN is treated as 0)
     * @return {array} entries of the form {w: 'name', c: start position}
     */
    matchName(text, cur = 0) {
        if (isNaN(cur)) {
            cur = 0;
        }

        // Returns the given-name part formed by the two characters after a
        // family name, or null when they do not form a known given name.
        const givenName = (first, second) => {
            if (first in CHS_NAMES_1.DOUBLE_NAME_1 && second in CHS_NAMES_1.DOUBLE_NAME_2) {
                return first + second;
            }
            if (first in CHS_NAMES_1.SINGLE_NAME) {
                // A repeated single character counts as part of the name.
                return first == second ? first + second : first;
            }
            return null;
        };

        const found = [];
        while (cur < text.length) {
            let name = null;

            // Two-character family name.
            const compound = text.substr(cur, 2);
            if (compound in CHS_NAMES_1.FAMILY_NAME_2) {
                const given = givenName(text.charAt(cur + 2), text.charAt(cur + 3));
                if (given !== null) {
                    name = compound + given;
                }
            }

            // One-character family name, tried only if the above failed.
            const single = text.charAt(cur);
            if (name === null && single in CHS_NAMES_1.FAMILY_NAME_1) {
                const given = givenName(text.charAt(cur + 1), text.charAt(cur + 2));
                if (given !== null) {
                    name = single + given;
                }
            }

            if (name !== null) {
                found.push({ w: name, c: cur });
                cur += name.length;
            }
            else {
                cur++;
            }
        }
        return found;
    }
}
// Named export of the tokenizer class.
120exports.ChsNameTokenizer = ChsNameTokenizer;
121// ======================================================================
// Commented-out manual checks of matchName, kept for reference.
122// debug(matchName('刘德华和李娜娜、司马光、上官飞飞'));
123// debug(matchName('李克'));
// `init` is the class's static `init` bound to the class itself, so it keeps
// its `this` when called as a standalone function.
124exports.init = ChsNameTokenizer.init.bind(ChsNameTokenizer);
// The class is also the module's default export.
125exports.default = ChsNameTokenizer;
126//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"ChsNameTokenizer.js","sourceRoot":"","sources":["ChsNameTokenizer.ts"],"names":[],"mappings":"AAAA,YAAY,CAAC;;;AAEb;;;;GAIG;AAEH,gDAAsH;AAEtH,gCAAyD;AAKzD,MAAa,gBAAiB,SAAQ,yBAAmB;IAAzD;;QAKC,SAAI,GAAG,kBAAkB,CAAC;IA+H3B,CAAC;IA7HA,MAAM;QAEL,KAAK,CAAC,MAAM,EAAE,CAAC;QAEf,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAC5C,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC;IACpC,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,KAAc;QAEnB,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC;QAC5B,IAAI,GAAG,GAAY,EAAE,CAAC;QAEtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,EAAE,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAC1C;YACC,IAAI,IAAI,CAAC,CAAC,EACV;gBACC,cAAc;gBACd,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACf,SAAS;aACT;YACD,IAAI,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACtC,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EACvB;gBACC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACf,SAAS;aACT;YACD,QAAQ;YACR,IAAI,KAAK,GAAG,CAAC,CAAC;YACd,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,GAAG,GAAG,QAAQ,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,EAC9C;gBACC,IAAI,GAAG,CAAC,CAAC,GAAG,KAAK,EACjB;oBACC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC;wBACxB,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,GAAG,KAAK,CAAC;qBACtC,EAAE;wBACF,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,KAAK;qBAClB,EAAE,IAAI,CAAC,CAAC,CAAC;iBACV;gBAED,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC;oBACxB,CAAC,EAAE,GAAG,CAAC,CAAC;oBACR,CAAC,EAAE,MAAM,CAAC,IAAI;iBACd,EAAE;oBACF,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,IAAI;iBACjB,EAAE,IAAI,CAAC,CAAC,CAAC;gBACV,KAAK,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;aAC7B;YACD,IAAI,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YAC1C,IAAI,KAAK,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,MAAM,EAC5C;gBACC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC;oBACxB,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC;iBAC1C,EAAE;oBACF,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,KAAK;iBAClB,EAAE,IAAI,CAAC,CAAC,CAAC;aACV;S
ACD;QACD,OAAO,GAAG,CAAC;IACZ,CAAC;IAED;;;;;;OAMG;IACH,SAAS,CAAC,IAAY,EAAE,MAAc,CAAC;QAEtC,IAAI,KAAK,CAAC,GAAG,CAAC;YAAE,GAAG,GAAG,CAAC,CAAC;QACxB,IAAI,GAAG,GAAY,EAAE,CAAC;QACtB,OAAO,GAAG,GAAG,IAAI,CAAC,MAAM,EACxB;YACC,gDAAgD;YAChD,IAAI,IAAI,GAAW,IAAI,CAAC;YACxB,KAAK;YACL,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;YAC7B,IAAI,EAAE,IAAI,yBAAa,EACvB;gBACC,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC;gBAC9B,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC;gBAC9B,IAAI,EAAE,IAAI,yBAAa,IAAI,EAAE,IAAI,yBAAa,EAC9C;oBACC,IAAI,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;iBACpB;qBACI,IAAI,EAAE,IAAI,uBAAW,EAC1B;oBACC,IAAI,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;iBACtC;aACD;YACD,KAAK;YACL,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YAC1B,IAAI,IAAI,KAAK,IAAI,IAAI,EAAE,IAAI,yBAAa,EACxC;gBACC,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC;gBAC9B,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC;gBAC9B,IAAI,EAAE,IAAI,yBAAa,IAAI,EAAE,IAAI,yBAAa,EAC9C;oBACC,IAAI,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;iBACpB;qBACI,IAAI,EAAE,IAAI,uBAAW,EAC1B;oBACC,IAAI,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;iBACtC;aACD;YACD,WAAW;YACX,IAAI,IAAI,KAAK,IAAI,EACjB;gBACC,GAAG,EAAE,CAAC;aACN;iBAED;gBACC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC;gBAC9B,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC;aACnB;SACD;QACD,OAAO,GAAG,CAAC;IACZ,CAAC;CACD;AApID,4CAoIC;AAED,yEAAyE;AAEzE,wCAAwC;AACxC,0BAA0B;AAEb,QAAA,IAAI,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAiC,CAAC;AAEjG,kBAAe,gBAAgB,CAAC","sourcesContent":["'use strict';\n\n/**\n * 中文人名识别模块\n *\n * @author 老雷<leizongmin@gmail.com>\n */\n\nimport CHS_NAMES, { FAMILY_NAME_1, FAMILY_NAME_2, SINGLE_NAME, DOUBLE_NAME_1, DOUBLE_NAME_2 } from '../mod/CHS_NAMES';\nimport { debug } from '../util';\nimport { SubSModule, SubSModuleTokenizer } from '../mod';\nimport { Segment, IWord, IDICT, 
IDICT_SYNONYM } from '../Segment';\n// @ts-ignore\nimport { UString } from 'uni-string';\n\nexport class ChsNameTokenizer extends SubSModuleTokenizer\n{\n\n\tprotected _TABLE: IDICT<IWord>;\n\n\tname = 'ChsNameTokenizer';\n\n\t_cache()\n\t{\n\t\tsuper._cache();\n\n\t\tthis._TABLE = this.segment.getDict('TABLE');\n\t\tthis._POSTAG = this.segment.POSTAG;\n\t}\n\n\t/**\n\t * 对未识别的单词进行分词\n\t *\n\t * @param {array} words 单词数组\n\t * @return {array}\n\t */\n\tsplit(words: IWord[]): IWord[]\n\t{\n\t\tconst POSTAG = this._POSTAG;\n\t\tlet ret: IWord[] = [];\n\n\t\tfor (let i = 0, word; word = words[i]; i++)\n\t\t{\n\t\t\tif (word.p)\n\t\t\t{\n\t\t\t\t// 仅对未识别的词进行匹配\n\t\t\t\tret.push(word);\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tlet nameinfo = this.matchName(word.w);\n\t\t\tif (nameinfo.length < 1)\n\t\t\t{\n\t\t\t\tret.push(word);\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\t// 分离出人名\n\t\t\tlet lastc = 0;\n\t\t\tfor (let ui = 0, url; url = nameinfo[ui]; ui++)\n\t\t\t{\n\t\t\t\tif (url.c > lastc)\n\t\t\t\t{\n\t\t\t\t\tret.push(this.debugToken({\n\t\t\t\t\t\tw: word.w.substr(lastc, url.c - lastc),\n\t\t\t\t\t}, {\n\t\t\t\t\t\t[this.name]: false,\n\t\t\t\t\t}, true));\n\t\t\t\t}\n\n\t\t\t\tret.push(this.debugToken({\n\t\t\t\t\tw: url.w,\n\t\t\t\t\tp: POSTAG.A_NR\n\t\t\t\t}, {\n\t\t\t\t\t[this.name]: true,\n\t\t\t\t}, true));\n\t\t\t\tlastc = url.c + url.w.length;\n\t\t\t}\n\t\t\tlet lastn = nameinfo[nameinfo.length - 1];\n\t\t\tif (lastn.c + lastn.w.length < word.w.length)\n\t\t\t{\n\t\t\t\tret.push(this.debugToken({\n\t\t\t\t\tw: word.w.substr(lastn.c + lastn.w.length),\n\t\t\t\t}, {\n\t\t\t\t\t[this.name]: false,\n\t\t\t\t}, true));\n\t\t\t}\n\t\t}\n\t\treturn ret;\n\t}\n\n\t/**\n\t * 匹配包含的人名，并返回相关信息\n\t *\n\t * @param {string} text 文本\n\t * @param {int} cur 开始位置\n\t * @return {array}  返回格式   {w: '人名', c: 开始位置}\n\t */\n\tmatchName(text: string, cur: number = 0): IWord[]\n\t{\n\t\tif (isNaN(cur)) cur = 0;\n\t\tlet ret: IWord[] = [];\n\t\twhile (cur < 
text.length)\n\t\t{\n\t\t\t//debug('cur=' + cur + ', ' + text.charAt(cur));\n\t\t\tlet name: string = null;\n\t\t\t// 复姓\n\t\t\tlet f2 = text.substr(cur, 2);\n\t\t\tif (f2 in FAMILY_NAME_2)\n\t\t\t{\n\t\t\t\tlet n1 = text.charAt(cur + 2);\n\t\t\t\tlet n2 = text.charAt(cur + 3);\n\t\t\t\tif (n1 in DOUBLE_NAME_1 && n2 in DOUBLE_NAME_2)\n\t\t\t\t{\n\t\t\t\t\tname = f2 + n1 + n2;\n\t\t\t\t}\n\t\t\t\telse if (n1 in SINGLE_NAME)\n\t\t\t\t{\n\t\t\t\t\tname = f2 + n1 + (n1 == n2 ? n2 : '');\n\t\t\t\t}\n\t\t\t}\n\t\t\t// 单姓\n\t\t\tlet f1 = text.charAt(cur);\n\t\t\tif (name === null && f1 in FAMILY_NAME_1)\n\t\t\t{\n\t\t\t\tlet n1 = text.charAt(cur + 1);\n\t\t\t\tlet n2 = text.charAt(cur + 2);\n\t\t\t\tif (n1 in DOUBLE_NAME_1 && n2 in DOUBLE_NAME_2)\n\t\t\t\t{\n\t\t\t\t\tname = f1 + n1 + n2;\n\t\t\t\t}\n\t\t\t\telse if (n1 in SINGLE_NAME)\n\t\t\t\t{\n\t\t\t\t\tname = f1 + n1 + (n1 == n2 ? n2 : '');\n\t\t\t\t}\n\t\t\t}\n\t\t\t// 检查是否匹配成功\n\t\t\tif (name === null)\n\t\t\t{\n\t\t\t\tcur++;\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\tret.push({ w: name, c: cur });\n\t\t\t\tcur += name.length;\n\t\t\t}\n\t\t}\n\t\treturn ret;\n\t}\n}\n\n// ======================================================================\n\n// debug(matchName('刘德华和李娜娜、司马光、上官飞飞'));\n// debug(matchName('李克'));\n\nexport const init = ChsNameTokenizer.init.bind(ChsNameTokenizer) as typeof ChsNameTokenizer.init;\n\nexport default ChsNameTokenizer;\n"]}
\No newline at end of file