UNPKG

14.6 kBSource Map (JSON)View Raw
1{"version":3,"file":"ForeignTokenizer.js","sourceRoot":"","sources":["ForeignTokenizer.ts"],"names":[],"mappings":"AAAA,YAAY,CAAC;;;AAEb;;;;GAIG;AACH,gCAA8E;AAE9E,yCAA2C;AAK3C,MAAa,gBAAiB,SAAQ,yBAAmB;IAAzD;;QAGC,SAAI,GAAG,kBAAkB,CAAC;IA4U3B,CAAC;IAjUA,MAAM;QAEL,KAAK,CAAC,MAAM,EAAE,CAAC;QACf,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAE5C,IAAI,GAAG,GAAG;YACT,uCAAuC;YACvC,0CAA0C;YAC1C,+BAA+B;YAC/B,kBAAkB;YAClB,oDAAoD;YACpD,kBAAkB;SAClB,CAAC;QAEF,IAAI,CAAC,eAAe,GAAG,IAAI,MAAM,CAAC,GAAG,GAAE,KAAK,CAAC;YAC5C,kBAAkB;SAClB,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,EAAE,IAAI,CAAC,CAAC;QAE5B,IAAI,CAAC,eAAe,GAAG,IAAI,MAAM,CAAC,GAAG,GAAE,KAAK,CAAC,GAAG,CAAC,GAAG,GAAG,EAAE,IAAI,CAAC,CAAC;QAE/D,SAAS,KAAK,CAAC,GAA2B;YAEzC,OAAO,GAAG,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;gBAE/B,IAAI,CAAC,YAAY,MAAM,EACvB;oBACC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;iBACjB;qBAED;oBACC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;iBACV;gBAED,OAAO,CAAC,CAAC;YACV,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QACjB,CAAC;IACF,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,KAAc;QAEnB,qDAAqD;QACrD,OAAO,IAAI,CAAC,YAAY,CAAC,KAAK,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC;QAEpD;;;;;;;;;;;;;;;;;UAiBE;IACH,CAAC;IAED;;;;OAIG;IACH,aAAa,CAAC,IAAY,EAAE,GAAY;QAEvC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC;QAE1B,gCAAgC;QAEhC,IAAI,GAAG,GAAY,EAAE,CAAC;QACtB,IAAI,IAAI,GAAG,IAAI,CAAC;QAEhB,IAAI,EAAE,GAAG,IAAI;aACX,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,CAC5B;QAED,KAAK,IAAI,CAAC,IAAI,EAAE,EAChB;YACC,IAAI,CAAC,KAAK,EAAE,EACZ;gBACC,IAAI,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAChC;oBACC,IAAI,EAAE,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;oBAElB,IAAI,EAAE,EACN;wBACC,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAC;4BAC5B,CAAC;yBACD,EAAE,EAAE,EAAE;4BACN,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;yBACd,CAAC,CAAC;wBAEH,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;wBACb,SAAS;qBACT;oBAED;;;uBAGG;oBACH,IAAI,GAAG,GAAG,CAAC;yBACT,KAAK,CAAC,aAAa,CAAC,CACrB;oBAED,KAAK,IAAI,CAAC,IAAI,GAAG,EACjB;wBACC,IAAI,CAAC,KAAK,EAAE,EACZ;4BACC,SAAS;yBACT;wBAED,IAAI,QAAQ,GAAG,CAAC,CAAC;wBAEjB,IAAI,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;wBACxB,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,IAAI,KAAK;4BAAE,CAAC,IAAI,KAAK,CAAC;wBAEzC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EACtB;4BACC,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;yBACtB,CAAA,4BAA4B;6BACxB,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,GAAG,CAAC,EACtD;4BACC,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC;yBACvB;6BAED;4BACC,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;yBACtB;wBAED,IAAI,QAAQ,KAAK,MAAM,CAAC,IAAI,EAC5B;4BACC,IAAI,EAAE,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;4BAElB,IAAI,EAAE,EACN;gCACC,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAC;oCAC5B,CAAC;iCACD,EAAE,EAAE,EAAE;oCACN,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;iCACd,CAAC,CAAC;gCAEH,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gCACb,SAAS;6BACT;yBACD;wBAED,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC;4BACxB,CAAC,EAAE,CAAC;4BACJ,CAAC,EAAE,QAAQ,IAAI,SAAS;yBACxB,EAAE;4BACF,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;yBACd,EAAE,IAAI,CAAC,CAAC,CAAC;qBACV;iBACD;qBAED;oBACC,GAAG,CAAC,IAAI,CAAC;wBACR,CAAC;qBACD,CAAC,CAAC;iBACH;aACD;SACD;QAED,mCAAmC;QAEnC,mBAAmB;QAEnB,OAAO,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC;IACrC,CAAC;IAED;;;;;;OAMG;IACH,YAAY,CAAC,IAAY,EAAE,GAAY;QAEtC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC;QAE1B,+BAA+B;QAE/B,IAAI,KAAK,CAAC,GAAG,CAAC;YAAE,GAAG,GAAG,CAAC,CAAC;QACxB,IAAI,GAAG,GAAG,EAAE,CAAC;QAEb,gBAAgB;QAChB,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,IAAI,QAAQ,GAAG,CAAC,CAAC;QACjB,IAAI,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAC3B,UAAU;QACV,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,IAAI,KAAK;YAAE,CAAC,IAAI,KAAK,CAAC;QACzC,4BAA4B;QAC5B,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EACtB;YACC,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;SACtB,CAAA,4BAA4B;aACxB,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,GAAG,CAAC,EACtD;YACC,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC;SACvB;aAED;YACC,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;SACtB;QAED,IAAI,CAAS,CAAC;QAEd,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAChC;YACC,IAAI,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YAC3B,UAAU;YACV,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,IAAI,KAAK;gBAAE,CAAC,IAAI,KAAK,CAAC;YACzC,4BAA4B;YAC5B,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EACtB;gBACC,IAAI,QAAQ,KAAK,MAAM,CAAC,GAAG,EAC3B;oBACC,IAAI,EAAE,GAAG,IAAI,CAAC,kBAAkB,CAAC;wBAChC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC;qBACpC,EAAE,QAAQ,EAAE;wBACZ,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;qBACd,CAAC,CAAC;oBACH,6DAA6D;oBAE7D,+CAA+C;oBAC/C,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACb,OAAO,GAAG,CAAC,CAAC;iBACZ;gBACD,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;aACtB;iBACI,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,GAAG,CAAC,EACtD;gBACC,4BAA4B;gBAC5B,IAAI,QAAQ,KAAK,MAAM,CAAC,IAAI,EAC5B;oBACC,6DAA6D;oBAE7D,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAC;wBAC5B,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC;qBACpC,EAAE;wBACF,CAAC,EAAE,QAAQ;qBACX,EAAE;wBACF,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;qBACd,CAAC,CAAC;oBAEH,+CAA+C;oBAC/C,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACb,OAAO,GAAG,CAAC,CAAC;iBACZ;gBACD,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC;aACvB;iBAED;gBACC,KAAK;gBACL,IAAI,QAAQ,KAAK,MAAM,CAAC,GAAG,EAC3B;oBACC,IAAI,EAAE,GAAG,IAAI,CAAC,kBAAkB,CAAC;wBAChC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC;wBACpC,CAAC,EAAE,QAAQ;qBACX,EAAE,SAAS,EAAE;wBACb,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;qBACd,CAAC,CAAC;oBAEH,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACb,OAAO,GAAG,CAAC,CAAC;iBACZ;gBACD,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;aACtB;SACD;QACD,OAAO;QACP,6DAA6D;QAE7D,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAQ;YACnC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC;SACpC,CAAC,CAAC;QAEH,IAAI,QAAQ,KAAK,MAAM,CAAC,GAAG;YAAE,EAAE,CAAC,CAAC,GAAG,QAAQ,CAAC;QAC7C,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAEb,kCAAkC;QAElC,aAAa;QACb,OAAO,GAAG,CAAC;IACZ,CAAC;IAED,kBAAkB,CAAC,IAAW,EAAE,QAAiB,EAAE,IAAqB;QAEvE,IAAI,EAAE,GAAG,IAAI,CAAC,WAAW,CAAQ,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC;QAEnD,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAE3B,IAAI,EAAE,EACN;YACC,kBAAU,CAAC,EAAE,EAAE;gBACd,OAAO,EAAE,EAAE;aACX,CAAC,CAAC;YAEH,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;SACnB;QAED,IAAI,QAAQ,IAAI,QAAQ,KAAK,IAAI,CAAC,OAAO,CAAC,GAAG,EAC7C;YACC,EAAE,CAAC,CAAC,GAAG,QAAQ,GAAG,EAAE,CAAC,CAAC,CAAC;SACvB;QAED,OAAO,EAAE,CAAC;IACX,CAAC;CACD;AA/UD,4CA+UC;AAEY,QAAA,IAAI,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAA0C,CAAC;AAE1G,kBAAe,gBAAgB,CAAC","sourcesContent":["'use strict';\n\n/**\n * 外文字符、数字识别模块\n *\n * @author 老雷<leizongmin@gmail.com>\n */\nimport { SubSModule, SubSModuleTokenizer, ISubTokenizerCreate } from '../mod';\nimport { Segment, IWord } from '../Segment';\nimport { debugToken } from '../util/debug';\nimport UString from 'uni-string';\nimport { debug } from '../util';\nimport { IWordDebugInfo } from '../util/index';\n\nexport class ForeignTokenizer extends SubSModuleTokenizer\n{\n\n\tname = 'ForeignTokenizer';\n\n\t/**\n\t * 分詞用(包含中文)\n\t */\n\t_REGEXP_SPLIT_1: RegExp;\n\t/**\n\t * 分詞用(不包含中文的全詞符合)\n\t */\n\t_REGEXP_SPLIT_2: RegExp;\n\n\t_cache()\n\t{\n\t\tsuper._cache();\n\t\tthis._TABLE = this.segment.getDict('TABLE');\n\n\t\tlet arr = [\n\t\t\t/[\\d0-9]+(?:,[\\d0-9]+)?(?:\\.[\\d0-9]+)?/,\n\t\t\t/[\\w0-9A-Za-z\\u0100-\\u017F\\u00A1-\\u00FF]+/,\n\t\t\t/[\\u0600-\\u06FF\\u0750-\\u077F]+/,\n\t\t\t/[\\u0400-\\u04FF]+/,\n\t\t\t// https://unicode-table.com/cn/blocks/greek-coptic/\n\t\t\t/[\\u0370-\\u03FF]+/,\n\t\t];\n\n\t\tthis._REGEXP_SPLIT_1 = new RegExp('(' +_join([\n\t\t\t/[\\u4E00-\\u9FFF]+/,\n\t\t].concat(arr)) + ')', 'iu');\n\n\t\tthis._REGEXP_SPLIT_2 = new RegExp('(' +_join(arr) + ')', 'iu');\n\n\t\tfunction _join(arr: Array<string | RegExp>)\n\t\t{\n\t\t\treturn arr.reduce(function (a, b)\n\t\t\t{\n\t\t\t\tif (b instanceof RegExp)\n\t\t\t\t{\n\t\t\t\t\ta.push(b.source);\n\t\t\t\t}\n\t\t\t\telse\n\t\t\t\t{\n\t\t\t\t\ta.push(b);\n\t\t\t\t}\n\n\t\t\t\treturn a;\n\t\t\t}, []).join('|')\n\t\t}\n\t}\n\n\t/**\n\t * 对未识别的单词进行分词\n\t *\n\t * @param {array} words 单词数组\n\t * @return {array}\n\t */\n\tsplit(words: IWord[]): IWord[]\n\t{\n\t\t//return this._splitUnknow(words, this.splitForeign);\n\t\treturn this._splitUnknow(words, this.splitForeign2);\n\n\t\t/*\n\t\tconst POSTAG = this.segment.POSTAG;\n\n\t\tlet ret = [];\n\t\tfor (let i = 0, word; word = words[i]; i++)\n\t\t{\n\t\t\tif (word.p)\n\t\t\t{\n\t\t\t\tret.push(word);\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\t// 仅对未识别的词进行匹配\n\t\t\t\tret = ret.concat(this.splitForeign(word.w));\n\t\t\t}\n\t\t}\n\t\treturn ret;\n\t\t*/\n\t}\n\n\t/**\n\t * 支援更多外文判定(但可能會降低效率)\n\t *\n\t * 並且避免誤切割 例如 latīna Русский\n\t */\n\tsplitForeign2(text: string, cur?: number): IWord[]\n\t{\n\t\tconst POSTAG = this.segment.POSTAG;\n\t\tconst TABLE = this._TABLE;\n\n\t\t//console.time('splitForeign2');\n\n\t\tlet ret: IWord[] = [];\n\t\tlet self = this;\n\n\t\tlet ls = text\n\t\t\t.split(this._REGEXP_SPLIT_1)\n\t\t;\n\n\t\tfor (let w of ls)\n\t\t{\n\t\t\tif (w !== '')\n\t\t\t{\n\t\t\t\tif (this._REGEXP_SPLIT_2.test(w))\n\t\t\t\t{\n\t\t\t\t\tlet cw = TABLE[w];\n\n\t\t\t\t\tif (cw)\n\t\t\t\t\t{\n\t\t\t\t\t\tlet nw = this.createRawToken({\n\t\t\t\t\t\t\tw,\n\t\t\t\t\t\t}, cw, {\n\t\t\t\t\t\t\t[this.name]: 1,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\tret.push(nw);\n\t\t\t\t\t\tcontinue;\n\t\t\t\t\t}\n\n\t\t\t\t\t/**\n\t\t\t\t\t * 當分詞不存在於字典中時\n\t\t\t\t\t * 則再度分詞一次\n\t\t\t\t\t */\n\t\t\t\t\tlet ls2 = w\n\t\t\t\t\t\t.split(/([\\d+0-9]+)/)\n\t\t\t\t\t;\n\n\t\t\t\t\tfor (let w of ls2)\n\t\t\t\t\t{\n\t\t\t\t\t\tif (w === '')\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tcontinue;\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tlet lasttype = 0;\n\n\t\t\t\t\t\tlet c = w.charCodeAt(0);\n\t\t\t\t\t\tif (c >= 65296 && c <= 65370) c -= 65248;\n\n\t\t\t\t\t\tif (c >= 48 && c <= 57)\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlasttype = POSTAG.A_M;\n\t\t\t\t\t\t}// 字母 lasttype = POSTAG.A_NX\n\t\t\t\t\t\telse if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122))\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlasttype = POSTAG.A_NX;\n\t\t\t\t\t\t}\n\t\t\t\t\t\telse\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlasttype = POSTAG.UNK;\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tif (lasttype === POSTAG.A_NX)\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlet cw = TABLE[w];\n\n\t\t\t\t\t\t\tif (cw)\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tlet nw = this.createRawToken({\n\t\t\t\t\t\t\t\t\tw,\n\t\t\t\t\t\t\t\t}, cw, {\n\t\t\t\t\t\t\t\t\t[this.name]: 2,\n\t\t\t\t\t\t\t\t});\n\n\t\t\t\t\t\t\t\tret.push(nw);\n\t\t\t\t\t\t\t\tcontinue;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tret.push(self.debugToken({\n\t\t\t\t\t\t\tw: w,\n\t\t\t\t\t\t\tp: lasttype || undefined,\n\t\t\t\t\t\t}, {\n\t\t\t\t\t\t\t[self.name]: 3,\n\t\t\t\t\t\t}, true));\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\telse\n\t\t\t\t{\n\t\t\t\t\tret.push({\n\t\t\t\t\t\tw,\n\t\t\t\t\t});\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\t//console.timeEnd('splitForeign2');\n\n\t\t//console.log(ret);\n\n\t\treturn ret.length ? ret : undefined;\n\t}\n\n\t/**\n\t * 匹配包含的英文字符和数字,并分割\n\t *\n\t * @param {string} text 文本\n\t * @param {int} cur 开始位置\n\t * @return {array} 返回格式 {w: '单词', c: 开始位置}\n\t */\n\tsplitForeign(text: string, cur?: number): IWord[]\n\t{\n\t\tconst POSTAG = this.segment.POSTAG;\n\t\tconst TABLE = this._TABLE;\n\n\t\t//console.time('splitForeign');\n\n\t\tif (isNaN(cur)) cur = 0;\n\t\tlet ret = [];\n\n\t\t// 取第一个字符的ASCII码\n\t\tlet lastcur = 0;\n\t\tlet lasttype = 0;\n\t\tlet c = text.charCodeAt(0);\n\t\t// 全角数字或字母\n\t\tif (c >= 65296 && c <= 65370) c -= 65248;\n\t\t// 数字 lasttype = POSTAG.A_M\n\t\tif (c >= 48 && c <= 57)\n\t\t{\n\t\t\tlasttype = POSTAG.A_M;\n\t\t}// 字母 lasttype = POSTAG.A_NX\n\t\telse if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122))\n\t\t{\n\t\t\tlasttype = POSTAG.A_NX;\n\t\t}\n\t\telse\n\t\t{\n\t\t\tlasttype = POSTAG.UNK;\n\t\t}\n\n\t\tlet i: number;\n\n\t\tfor (i = 1; i < text.length; i++)\n\t\t{\n\t\t\tlet c = text.charCodeAt(i);\n\t\t\t// 全角数字或字母\n\t\t\tif (c >= 65296 && c <= 65370) c -= 65248;\n\t\t\t// 数字 lasttype = POSTAG.A_M\n\t\t\tif (c >= 48 && c <= 57)\n\t\t\t{\n\t\t\t\tif (lasttype !== POSTAG.A_M)\n\t\t\t\t{\n\t\t\t\t\tlet nw = this.createForeignToken({\n\t\t\t\t\t\tw: text.substr(lastcur, i - lastcur),\n\t\t\t\t\t}, lasttype, {\n\t\t\t\t\t\t[this.name]: 1,\n\t\t\t\t\t});\n\t\t\t\t\t//let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;\n\n\t\t\t\t\t//if (lasttype !== POSTAG.UNK) nw.p = lasttype;\n\t\t\t\t\tret.push(nw);\n\t\t\t\t\tlastcur = i;\n\t\t\t\t}\n\t\t\t\tlasttype = POSTAG.A_M;\n\t\t\t}\n\t\t\telse if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122))\n\t\t\t{\n\t\t\t\t// 字母 lasttype = POSTAG.A_NX\n\t\t\t\tif (lasttype !== POSTAG.A_NX)\n\t\t\t\t{\n\t\t\t\t\t//let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;\n\n\t\t\t\t\tlet nw = this.createRawToken({\n\t\t\t\t\t\tw: text.substr(lastcur, i - lastcur),\n\t\t\t\t\t}, {\n\t\t\t\t\t\tp: lasttype\n\t\t\t\t\t}, {\n\t\t\t\t\t\t[this.name]: 2,\n\t\t\t\t\t});\n\n\t\t\t\t\t//if (lasttype !== POSTAG.UNK) nw.p = lasttype;\n\t\t\t\t\tret.push(nw);\n\t\t\t\t\tlastcur = i;\n\t\t\t\t}\n\t\t\t\tlasttype = POSTAG.A_NX;\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\t// 其他\n\t\t\t\tif (lasttype !== POSTAG.UNK)\n\t\t\t\t{\n\t\t\t\t\tlet nw = this.createForeignToken({\n\t\t\t\t\t\tw: text.substr(lastcur, i - lastcur),\n\t\t\t\t\t\tp: lasttype\n\t\t\t\t\t}, undefined, {\n\t\t\t\t\t\t[this.name]: 3,\n\t\t\t\t\t});\n\n\t\t\t\t\tret.push(nw);\n\t\t\t\t\tlastcur = i;\n\t\t\t\t}\n\t\t\t\tlasttype = POSTAG.UNK;\n\t\t\t}\n\t\t}\n\t\t// 剩余部分\n\t\t//let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;\n\n\t\tlet nw = this.createRawToken<IWord>({\n\t\t\tw: text.substr(lastcur, i - lastcur),\n\t\t});\n\n\t\tif (lasttype !== POSTAG.UNK) nw.p = lasttype;\n\t\tret.push(nw);\n\n\t\t//console.timeEnd('splitForeign');\n\n\t\t//debug(ret);\n\t\treturn ret;\n\t}\n\n\tcreateForeignToken(word: IWord, lasttype?: number, attr?: IWordDebugInfo)\n\t{\n\t\tlet nw = this.createToken<IWord>(word, true, attr);\n\n\t\tlet ow = this._TABLE[nw.w];\n\n\t\tif (ow)\n\t\t{\n\t\t\tdebugToken(nw, {\n\t\t\t\t_source: ow,\n\t\t\t});\n\n\t\t\tnw.p = nw.p | ow.p;\n\t\t}\n\n\t\tif (lasttype && lasttype !== this._POSTAG.UNK)\n\t\t{\n\t\t\tnw.p = lasttype | nw.p;\n\t\t}\n\n\t\treturn nw;\n\t}\n}\n\nexport const init = ForeignTokenizer.init.bind(ForeignTokenizer) as ISubTokenizerCreate<ForeignTokenizer>;\n\nexport default ForeignTokenizer;\n\n//debug(splitForeign('ad222经济核算123非'));\n"]}
\No newline at end of file