Source map for ChsNameTokenizer.js (JSON, 7.53 kB)
1{"version":3,"file":"ChsNameTokenizer.js","sourceRoot":"","sources":["ChsNameTokenizer.ts"],"names":[],"mappings":"AAAA,YAAY,CAAC;;;AAEb;;;;GAIG;AAEH,gDAAsH;AAEtH,gCAAyD;AAKzD,MAAa,gBAAiB,SAAQ,yBAAmB;IAAzD;;QAKC,SAAI,GAAG,kBAAkB,CAAC;IA+H3B,CAAC;IA7HA,MAAM;QAEL,KAAK,CAAC,MAAM,EAAE,CAAC;QAEf,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAC5C,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC;IACpC,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,KAAc;QAEnB,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC;QAC5B,IAAI,GAAG,GAAY,EAAE,CAAC;QAEtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,EAAE,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAC1C;YACC,IAAI,IAAI,CAAC,CAAC,EACV;gBACC,cAAc;gBACd,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACf,SAAS;aACT;YACD,IAAI,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACtC,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EACvB;gBACC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACf,SAAS;aACT;YACD,QAAQ;YACR,IAAI,KAAK,GAAG,CAAC,CAAC;YACd,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,GAAG,GAAG,QAAQ,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,EAC9C;gBACC,IAAI,GAAG,CAAC,CAAC,GAAG,KAAK,EACjB;oBACC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC;wBACxB,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,GAAG,KAAK,CAAC;qBACtC,EAAE;wBACF,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,KAAK;qBAClB,EAAE,IAAI,CAAC,CAAC,CAAC;iBACV;gBAED,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC;oBACxB,CAAC,EAAE,GAAG,CAAC,CAAC;oBACR,CAAC,EAAE,MAAM,CAAC,IAAI;iBACd,EAAE;oBACF,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,IAAI;iBACjB,EAAE,IAAI,CAAC,CAAC,CAAC;gBACV,KAAK,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;aAC7B;YACD,IAAI,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YAC1C,IAAI,KAAK,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,MAAM,EAC5C;gBACC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC;oBACxB,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC;iBAC1C,EAAE;oBACF,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,KAAK;iBAClB,EAAE,IAAI,CAAC,CAAC,CAAC;aACV;SACD;QACD,OAAO,GAAG,CAAC;IACZ,CAAC;IAED;;;;;;OAMG;IACH,SAAS,CAAC,IAAY,EAAE,MAAc,CAAC;QAEtC,IAAI,KAAK,CAAC,GAAG,CAAC;YAAE,GAAG,GAAG,CAAC,CAAC;QACxB,IAAI,GAAG,GAAY,EAAE,CAAC;QACtB,OAAO,GAAG,GAAG,IAAI,CAAC,MAAM,EACxB;YACC,gDAAgD;YAChD,IAAI,IAAI,GAAW,IAAI,CAAC;YACxB,KAAK;YACL,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;YAC7B,IAAI,EAAE,IAAI,yBAAa,EACvB;gBACC,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC;gBAC9B,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC;gBAC9B,IAAI,EAAE,IAAI,yBAAa,IAAI,EAAE,IAAI,yBAAa,EAC9C;oBACC,IAAI,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;iBACpB;qBACI,IAAI,EAAE,IAAI,uBAAW,EAC1B;oBACC,IAAI,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;iBACtC;aACD;YACD,KAAK;YACL,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YAC1B,IAAI,IAAI,KAAK,IAAI,IAAI,EAAE,IAAI,yBAAa,EACxC;gBACC,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC;gBAC9B,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC;gBAC9B,IAAI,EAAE,IAAI,yBAAa,IAAI,EAAE,IAAI,yBAAa,EAC9C;oBACC,IAAI,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;iBACpB;qBACI,IAAI,EAAE,IAAI,uBAAW,EAC1B;oBACC,IAAI,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;iBACtC;aACD;YACD,WAAW;YACX,IAAI,IAAI,KAAK,IAAI,EACjB;gBACC,GAAG,EAAE,CAAC;aACN;iBAED;gBACC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC;gBAC9B,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC;aACnB;SACD;QACD,OAAO,GAAG,CAAC;IACZ,CAAC;CACD;AApID,4CAoIC;AAED,yEAAyE;AAEzE,wCAAwC;AACxC,0BAA0B;AAEb,QAAA,IAAI,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,
gBAAgB,CAAiC,CAAC;AAEjG,kBAAe,gBAAgB,CAAC","sourcesContent":["'use strict';\n\n/**\n * 中文人名识别模块\n *\n * @author 老雷<leizongmin@gmail.com>\n */\n\nimport CHS_NAMES, { FAMILY_NAME_1, FAMILY_NAME_2, SINGLE_NAME, DOUBLE_NAME_1, DOUBLE_NAME_2 } from '../mod/CHS_NAMES';\nimport { debug } from '../util';\nimport { SubSModule, SubSModuleTokenizer } from '../mod';\nimport { Segment, IWord, IDICT, IDICT_SYNONYM } from '../Segment';\n// @ts-ignore\nimport { UString } from 'uni-string';\n\nexport class ChsNameTokenizer extends SubSModuleTokenizer\n{\n\n\tprotected _TABLE: IDICT<IWord>;\n\n\tname = 'ChsNameTokenizer';\n\n\t_cache()\n\t{\n\t\tsuper._cache();\n\n\t\tthis._TABLE = this.segment.getDict('TABLE');\n\t\tthis._POSTAG = this.segment.POSTAG;\n\t}\n\n\t/**\n\t * 对未识别的单词进行分词\n\t *\n\t * @param {array} words 单词数组\n\t * @return {array}\n\t */\n\tsplit(words: IWord[]): IWord[]\n\t{\n\t\tconst POSTAG = this._POSTAG;\n\t\tlet ret: IWord[] = [];\n\n\t\tfor (let i = 0, word; word = words[i]; i++)\n\t\t{\n\t\t\tif (word.p)\n\t\t\t{\n\t\t\t\t// 仅对未识别的词进行匹配\n\t\t\t\tret.push(word);\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tlet nameinfo = this.matchName(word.w);\n\t\t\tif (nameinfo.length < 1)\n\t\t\t{\n\t\t\t\tret.push(word);\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\t// 分离出人名\n\t\t\tlet lastc = 0;\n\t\t\tfor (let ui = 0, url; url = nameinfo[ui]; ui++)\n\t\t\t{\n\t\t\t\tif (url.c > lastc)\n\t\t\t\t{\n\t\t\t\t\tret.push(this.debugToken({\n\t\t\t\t\t\tw: word.w.substr(lastc, url.c - lastc),\n\t\t\t\t\t}, {\n\t\t\t\t\t\t[this.name]: false,\n\t\t\t\t\t}, true));\n\t\t\t\t}\n\n\t\t\t\tret.push(this.debugToken({\n\t\t\t\t\tw: url.w,\n\t\t\t\t\tp: POSTAG.A_NR\n\t\t\t\t}, {\n\t\t\t\t\t[this.name]: true,\n\t\t\t\t}, true));\n\t\t\t\tlastc = url.c + url.w.length;\n\t\t\t}\n\t\t\tlet lastn = nameinfo[nameinfo.length - 1];\n\t\t\tif (lastn.c + lastn.w.length < word.w.length)\n\t\t\t{\n\t\t\t\tret.push(this.debugToken({\n\t\t\t\t\tw: word.w.substr(lastn.c + lastn.w.length),\n\t\t\t\t}, {\n\t\t\t\t\t[this.name]: false,\n\t\t\t\t}, true));\n\t\t\t}\n\t\t}\n\t\treturn ret;\n\t}\n\n\t/**\n\t * 匹配包含的人名,并返回相关信息\n\t *\n\t * @param {string} text 文本\n\t * @param {int} cur 开始位置\n\t * @return {array} 返回格式 {w: '人名', c: 开始位置}\n\t */\n\tmatchName(text: string, cur: number = 0): IWord[]\n\t{\n\t\tif (isNaN(cur)) cur = 0;\n\t\tlet ret: IWord[] = [];\n\t\twhile (cur < text.length)\n\t\t{\n\t\t\t//debug('cur=' + cur + ', ' + text.charAt(cur));\n\t\t\tlet name: string = null;\n\t\t\t// 复姓\n\t\t\tlet f2 = text.substr(cur, 2);\n\t\t\tif (f2 in FAMILY_NAME_2)\n\t\t\t{\n\t\t\t\tlet n1 = text.charAt(cur + 2);\n\t\t\t\tlet n2 = text.charAt(cur + 3);\n\t\t\t\tif (n1 in DOUBLE_NAME_1 && n2 in DOUBLE_NAME_2)\n\t\t\t\t{\n\t\t\t\t\tname = f2 + n1 + n2;\n\t\t\t\t}\n\t\t\t\telse if (n1 in SINGLE_NAME)\n\t\t\t\t{\n\t\t\t\t\tname = f2 + n1 + (n1 == n2 ? n2 : '');\n\t\t\t\t}\n\t\t\t}\n\t\t\t// 单姓\n\t\t\tlet f1 = text.charAt(cur);\n\t\t\tif (name === null && f1 in FAMILY_NAME_1)\n\t\t\t{\n\t\t\t\tlet n1 = text.charAt(cur + 1);\n\t\t\t\tlet n2 = text.charAt(cur + 2);\n\t\t\t\tif (n1 in DOUBLE_NAME_1 && n2 in DOUBLE_NAME_2)\n\t\t\t\t{\n\t\t\t\t\tname = f1 + n1 + n2;\n\t\t\t\t}\n\t\t\t\telse if (n1 in SINGLE_NAME)\n\t\t\t\t{\n\t\t\t\t\tname = f1 + n1 + (n1 == n2 ? 
n2 : '');\n\t\t\t\t}\n\t\t\t}\n\t\t\t// 检查是否匹配成功\n\t\t\tif (name === null)\n\t\t\t{\n\t\t\t\tcur++;\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\tret.push({ w: name, c: cur });\n\t\t\t\tcur += name.length;\n\t\t\t}\n\t\t}\n\t\treturn ret;\n\t}\n}\n\n// ======================================================================\n\n// debug(matchName('刘德华和李娜娜、司马光、上官飞飞'));\n// debug(matchName('李克'));\n\nexport const init = ChsNameTokenizer.init.bind(ChsNameTokenizer) as typeof ChsNameTokenizer.init;\n\nexport default ChsNameTokenizer;\n"]}
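The scan in matchName is easiest to see in isolation. Below is a minimal standalone sketch of the same matching rules; the dictionaries are tiny hypothetical stand-ins for the FAMILY_NAME_1 / FAMILY_NAME_2 / SINGLE_NAME / DOUBLE_NAME_1 / DOUBLE_NAME_2 tables imported from ../mod/CHS_NAMES, and only the lookup logic mirrors the module.

// Standalone sketch; the entries below are toy data, not the real CHS_NAMES tables.
type Dict = Record<string, number>;

const FAMILY_NAME_1: Dict = { '刘': 1, '李': 1, '司': 1 };   // single-character surnames
const FAMILY_NAME_2: Dict = { '司马': 1, '上官': 1 };         // two-character (compound) surnames
const SINGLE_NAME: Dict = { '光': 1, '娜': 1, '飞': 1 };      // single-character given names
const DOUBLE_NAME_1: Dict = { '德': 1, '飞': 1 };             // first character of a two-character given name
const DOUBLE_NAME_2: Dict = { '华': 1, '飞': 1 };             // second character of a two-character given name

// Same scan as ChsNameTokenizer#matchName: at each position try a compound
// surname first, then a single surname; on failure advance by one character.
function matchNameSketch(text: string): { w: string; c: number }[]
{
	const ret: { w: string; c: number }[] = [];
	let cur = 0;
	while (cur < text.length)
	{
		let name: string | null = null;
		const f2 = text.substr(cur, 2);
		if (f2 in FAMILY_NAME_2)
		{
			const n1 = text.charAt(cur + 2);
			const n2 = text.charAt(cur + 3);
			if (n1 in DOUBLE_NAME_1 && n2 in DOUBLE_NAME_2) name = f2 + n1 + n2;
			else if (n1 in SINGLE_NAME) name = f2 + n1 + (n1 == n2 ? n2 : '');
		}
		const f1 = text.charAt(cur);
		if (name === null && f1 in FAMILY_NAME_1)
		{
			const n1 = text.charAt(cur + 1);
			const n2 = text.charAt(cur + 2);
			if (n1 in DOUBLE_NAME_1 && n2 in DOUBLE_NAME_2) name = f1 + n1 + n2;
			else if (n1 in SINGLE_NAME) name = f1 + n1 + (n1 == n2 ? n2 : '');
		}
		if (name === null) { cur++; }
		else { ret.push({ w: name, c: cur }); cur += name.length; }
	}
	return ret;
}

// With the toy dictionaries above this finds 刘德华, 李娜娜, 司马光 and 上官飞飞:
console.log(matchNameSketch('刘德华和李娜娜、司马光、上官飞飞'));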
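For context on how the class is consumed: a minimal usage sketch, assuming a node-segment-style Segment with use() and doSegment(); the import paths and the registration call are assumptions, not taken from this file.

// Usage sketch — assumes a node-segment-style API (Segment#use / Segment#doSegment);
// import paths are hypothetical and depend on how the surrounding package is laid out.
import { Segment } from '../Segment';
import ChsNameTokenizer from './ChsNameTokenizer';

const segment = new Segment();
// register this tokenizer so that unrecognized runs of text are scanned for
// personal names and tagged as A_NR by ChsNameTokenizer#split
segment.use(ChsNameTokenizer);

const tokens = segment.doSegment('刘德华和李娜娜来了');
console.log(tokens);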