UNPKG

28.6 kBJavaScriptView Raw
'use strict';
// Tag the exports object with the `__esModule` marker (conventionally read by
// ES-module interop helpers to recognise transpiled modules).
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare both named exports as undefined; the real values are assigned
// at the bottom of the file once the class has been defined.
exports.ForeignTokenizer = void 0;
exports.init = void 0;
4/**
5 * 外文字符、数字识别模块
6 *
7 * @author 老雷<leizongmin@gmail.com>
8 */
9const mod_1 = require("../mod");
10const debug_1 = require("../util/debug");
/**
 * Distance between a full-width ASCII form (U+FF10 '０' .. U+FF5A 'ｚ') and its
 * half-width counterpart (0xFEE0 === 65248).
 */
const FULLWIDTH_OFFSET = 0xFEE0;

/**
 * Splits a foreign run on digit runs (half- or full-width); the capture group
 * keeps the digit runs in the result of `String.prototype.split`.
 */
const DIGIT_RUN = /([\d\uFF10-\uFF19]+)/;

/**
 * Classify a UTF-16 code unit as digit, ASCII letter or "other".
 *
 * Full-width digits/letters are folded onto their half-width forms first.
 *
 * @param {number} code   result of `String.prototype.charCodeAt` (may be NaN)
 * @param {object} POSTAG part-of-speech table providing `A_M`, `A_NX` and `UNK`
 * @return {number} `POSTAG.A_M` for digits, `POSTAG.A_NX` for A-Z / a-z,
 *                  `POSTAG.UNK` for anything else
 */
function classifyCharCode(code, POSTAG) {
    let c = code;
    if (c >= 0xFF10 && c <= 0xFF5A) {
        c -= FULLWIDTH_OFFSET;
    }
    if (c >= 48 && c <= 57) {
        return POSTAG.A_M;
    }
    if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122)) {
        return POSTAG.A_NX;
    }
    return POSTAG.UNK;
}

/**
 * Look a word up in a dictionary table, ignoring inherited properties.
 *
 * Words come straight from user text, so when the table is a plain object a
 * bare `table[word]` would resolve words such as "constructor" or "toString"
 * to members of `Object.prototype` and treat them as dictionary entries.
 *
 * @param {object} table dictionary keyed by word
 * @param {string} word  word to look up
 * @return {*} the table's own entry for `word`, or undefined
 */
function lookupEntry(table, word) {
    return Object.prototype.hasOwnProperty.call(table, word) ? table[word] : undefined;
}

/**
 * Build a regex alternation ("a|b|c") from a list of patterns.
 *
 * @param {Array<string|RegExp>} patterns RegExp objects contribute their `source`
 * @return {string}
 */
function joinPatternSources(patterns) {
    return patterns
        .map((pattern) => (pattern instanceof RegExp ? pattern.source : pattern))
        .join('|');
}

/**
 * Tokenizer for foreign-script words and numbers.
 *
 * Relies on members inherited from `SubSModuleTokenizer` (`segment`, `_POSTAG`,
 * `_splitUnknow`, `createToken`, `createRawToken`, `debugToken`) whose
 * implementations are not part of this file.
 */
class ForeignTokenizer extends mod_1.SubSModuleTokenizer {
    constructor(...args) {
        super(...args);
        this.name = 'ForeignTokenizer';
    }

    /**
     * Cache the 'TABLE' dictionary and compile the two splitting expressions:
     *
     * - `_REGEXP_SPLIT_1`: CJK ideographs plus every foreign pattern; used to cut
     *   the text into runs (the capture group keeps the runs in the split result).
     * - `_REGEXP_SPLIT_2`: foreign patterns only; used to tell a foreign run from
     *   everything else.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('TABLE');
        const foreignPatterns = [
            // Numbers, optionally with one ",ddd" group and one ".ddd" fraction;
            // \uFF10-\uFF19 are the full-width digits.
            /[\d\uFF10-\uFF19]+(?:,[\d\uFF10-\uFF19]+)?(?:\.[\d\uFF10-\uFF19]+)?/,
            // Word characters, full-width digits/letters, Latin Extended-A and
            // the U+00A1-U+00FF part of Latin-1.
            /[\w\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\u0100-\u017F\u00A1-\u00FF]+/,
            // Arabic and Arabic Supplement.
            /[\u0600-\u06FF\u0750-\u077F]+/,
            // Cyrillic.
            /[\u0400-\u04FF]+/,
            // Greek and Coptic: https://unicode-table.com/cn/blocks/greek-coptic/
            /[\u0370-\u03FF]+/,
        ];
        this._REGEXP_SPLIT_1 = new RegExp(
            `(${joinPatternSources([/[\u4E00-\u9FFF]+/, ...foreignPatterns])})`,
            'iu',
        );
        this._REGEXP_SPLIT_2 = new RegExp(`(${joinPatternSources(foreignPatterns)})`, 'iu');
    }

    /**
     * Tokenize the words that have not been recognised yet.
     *
     * Delegates to the inherited `_splitUnknow`, handing it `splitForeign2` as
     * the splitter.
     *
     * @param {array} words word array
     * @return {array}
     */
    split(words) {
        return this._splitUnknow(words, this.splitForeign2);
    }

    /**
     * Split text into foreign-script runs, supporting more scripts than
     * `splitForeign` (possibly at some cost in speed) and avoiding wrong cuts
     * inside words such as "latīna" or "Русский".
     *
     * A foreign run found in the dictionary is emitted whole. Otherwise it is
     * split once more on digit runs, and each part is emitted either from the
     * dictionary (letter-initial parts only) or tagged by its first character.
     * Non-foreign runs are emitted as bare `{ w }` objects.
     *
     * @param {string} text text to split
     * @param {number} [cur] unused
     * @return {array|undefined} tokens, or undefined when nothing was produced
     */
    splitForeign2(text, cur) {
        const POSTAG = this.segment.POSTAG;
        const TABLE = this._TABLE;
        const ret = [];
        for (const piece of text.split(this._REGEXP_SPLIT_1)) {
            if (piece === '') {
                continue;
            }
            if (!this._REGEXP_SPLIT_2.test(piece)) {
                ret.push({ w: piece });
                continue;
            }
            const wholeEntry = lookupEntry(TABLE, piece);
            if (wholeEntry) {
                ret.push(this.createRawToken({ w: piece }, wholeEntry, { [this.name]: 1 }));
                continue;
            }
            // Not in the dictionary as a whole: split once more on digit runs.
            for (const part of piece.split(DIGIT_RUN)) {
                if (part === '') {
                    continue;
                }
                const type = classifyCharCode(part.charCodeAt(0), POSTAG);
                if (type === POSTAG.A_NX) {
                    const entry = lookupEntry(TABLE, part);
                    if (entry) {
                        ret.push(this.createRawToken({ w: part }, entry, { [this.name]: 2 }));
                        continue;
                    }
                }
                ret.push(this.debugToken({
                    w: part,
                    // A falsy tag is dropped rather than stored.
                    p: type || undefined,
                }, {
                    [this.name]: 3,
                }, true));
            }
        }
        return ret.length ? ret : undefined;
    }

    /**
     * Split text wherever the character kind (digit / ASCII letter / other)
     * changes, folding full-width digits and letters onto their ASCII forms.
     *
     * @param {string} text text to split
     * @param {number} [cur] start position; accepted for compatibility but unused
     * @return {array} one token per run; the final run is tagged only when it is
     *                 not of the "other" kind
     */
    splitForeign(text, cur) {
        const POSTAG = this.segment.POSTAG;
        const ret = [];
        let lastcur = 0;
        // Kind of the run currently being collected, seeded from the first character.
        let lasttype = classifyCharCode(text.charCodeAt(0), POSTAG);
        let i;
        for (i = 1; i < text.length; i++) {
            const type = classifyCharCode(text.charCodeAt(i), POSTAG);
            if (type !== lasttype) {
                const w = text.slice(lastcur, i);
                let nw;
                if (type === POSTAG.A_M) {
                    // A digit run starts here.
                    nw = this.createForeignToken({ w }, lasttype, { [this.name]: 1 });
                }
                else if (type === POSTAG.A_NX) {
                    // A letter run starts here.
                    nw = this.createRawToken({ w }, { p: lasttype }, { [this.name]: 2 });
                }
                else {
                    // A run of other characters starts here.
                    nw = this.createForeignToken({ w, p: lasttype }, undefined, { [this.name]: 3 });
                }
                ret.push(nw);
                lastcur = i;
            }
            lasttype = type;
        }
        // Remainder.
        const tail = this.createRawToken({
            w: text.slice(lastcur, i),
        });
        if (lasttype !== POSTAG.UNK) {
            tail.p = lasttype;
        }
        ret.push(tail);
        return ret;
    }

    /**
     * Create a token via the inherited `createToken`, then OR in the `p` flags
     * of the word's dictionary entry (if any) and of `lasttype` (if set and not
     * `UNK`).
     *
     * @param {object} word token seed, at least `{ w }`
     * @param {number} [lasttype] part-of-speech flag to merge in
     * @param {object} [attr] debug attributes forwarded to `createToken`
     * @return {object} the created token
     */
    createForeignToken(word, lasttype, attr) {
        const nw = this.createToken(word, true, attr);
        const ow = lookupEntry(this._TABLE, nw.w);
        if (ow) {
            debug_1.debugToken(nw, {
                _source: ow,
            });
            nw.p = nw.p | ow.p;
        }
        if (lasttype && lasttype !== this._POSTAG.UNK) {
            nw.p = lasttype | nw.p;
        }
        return nw;
    }
}
// Expose the tokenizer class as both a named and the default export.
exports.ForeignTokenizer = ForeignTokenizer;
exports.default = ForeignTokenizer;
// `init` is the class's static `init`, bound so it can be called detached.
exports.init = ForeignTokenizer.init.bind(ForeignTokenizer);
265//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"ForeignTokenizer.js","sourceRoot":"","sources":["ForeignTokenizer.ts"],"names":[],"mappings":"AAAA,YAAY,CAAC;;;AAEb;;;;GAIG;AACH,gCAA8E;AAE9E,yCAA2C;AAK3C,MAAa,gBAAiB,SAAQ,yBAAmB;IAAzD;;QAGC,SAAI,GAAG,kBAAkB,CAAC;IA4U3B,CAAC;IAjUA,MAAM;QAEL,KAAK,CAAC,MAAM,EAAE,CAAC;QACf,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAE5C,IAAI,GAAG,GAAG;YACT,uCAAuC;YACvC,0CAA0C;YAC1C,+BAA+B;YAC/B,kBAAkB;YAClB,oDAAoD;YACpD,kBAAkB;SAClB,CAAC;QAEF,IAAI,CAAC,eAAe,GAAG,IAAI,MAAM,CAAC,GAAG,GAAE,KAAK,CAAC;YAC5C,kBAAkB;SAClB,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,EAAE,IAAI,CAAC,CAAC;QAE5B,IAAI,CAAC,eAAe,GAAG,IAAI,MAAM,CAAC,GAAG,GAAE,KAAK,CAAC,GAAG,CAAC,GAAG,GAAG,EAAE,IAAI,CAAC,CAAC;QAE/D,SAAS,KAAK,CAAC,GAA2B;YAEzC,OAAO,GAAG,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;gBAE/B,IAAI,CAAC,YAAY,MAAM,EACvB;oBACC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;iBACjB;qBAED;oBACC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;iBACV;gBAED,OAAO,CAAC,CAAC;YACV,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QACjB,CAAC;IACF,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,KAAc;QAEnB,qDAAqD;QACrD,OAAO,IAAI,CAAC,YAAY,CAAC,KAAK,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC;QAEpD;;;;;;;;;;;;;;;;;UAiBE;IACH,CAAC;IAED;;;;OAIG;IACH,aAAa,CAAC,IAAY,EAAE,GAAY;QAEvC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC;QAE1B,gCAAgC;QAEhC,IAAI,GAAG,GAAY,EAAE,CAAC;QACtB,IAAI,IAAI,GAAG,IAAI,CAAC;QAEhB,IAAI,EAAE,GAAG,IAAI;aACX,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,CAC5B;QAED,KAAK,IAAI,CAAC,IAAI,EAAE,EAChB;YACC,IAAI,CAAC,KAAK,EAAE,EACZ;gBACC,IAAI,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAChC;oBACC,IAAI,EAAE,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;oBAElB,IAAI,EAAE,EACN;wBACC,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAC;4BAC5B,CAAC;yBACD,EAAE,EAAE,EAAE;4BACN,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;yBACd,CAAC,CAAC;wBAEH,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;wBACb,SAAS;qBACT;oBAED;;;uBAGG;oBACH,IAAI,GAAG,GAAG,CAAC;yBACT,KAAK,CAAC,aAAa,CAAC,CACrB;oBAED,KAAK,IAAI,CAAC,IAAI,GAAG,EACjB;wBACC
,IAAI,CAAC,KAAK,EAAE,EACZ;4BACC,SAAS;yBACT;wBAED,IAAI,QAAQ,GAAG,CAAC,CAAC;wBAEjB,IAAI,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;wBACxB,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,IAAI,KAAK;4BAAE,CAAC,IAAI,KAAK,CAAC;wBAEzC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EACtB;4BACC,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;yBACtB,CAAA,4BAA4B;6BACxB,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,GAAG,CAAC,EACtD;4BACC,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC;yBACvB;6BAED;4BACC,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;yBACtB;wBAED,IAAI,QAAQ,KAAK,MAAM,CAAC,IAAI,EAC5B;4BACC,IAAI,EAAE,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;4BAElB,IAAI,EAAE,EACN;gCACC,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAC;oCAC5B,CAAC;iCACD,EAAE,EAAE,EAAE;oCACN,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;iCACd,CAAC,CAAC;gCAEH,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gCACb,SAAS;6BACT;yBACD;wBAED,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC;4BACxB,CAAC,EAAE,CAAC;4BACJ,CAAC,EAAE,QAAQ,IAAI,SAAS;yBACxB,EAAE;4BACF,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;yBACd,EAAE,IAAI,CAAC,CAAC,CAAC;qBACV;iBACD;qBAED;oBACC,GAAG,CAAC,IAAI,CAAC;wBACR,CAAC;qBACD,CAAC,CAAC;iBACH;aACD;SACD;QAED,mCAAmC;QAEnC,mBAAmB;QAEnB,OAAO,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC;IACrC,CAAC;IAED;;;;;;OAMG;IACH,YAAY,CAAC,IAAY,EAAE,GAAY;QAEtC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC;QAE1B,+BAA+B;QAE/B,IAAI,KAAK,CAAC,GAAG,CAAC;YAAE,GAAG,GAAG,CAAC,CAAC;QACxB,IAAI,GAAG,GAAG,EAAE,CAAC;QAEb,gBAAgB;QAChB,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,IAAI,QAAQ,GAAG,CAAC,CAAC;QACjB,IAAI,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAC3B,UAAU;QACV,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,IAAI,KAAK;YAAE,CAAC,IAAI,KAAK,CAAC;QACzC,4BAA4B;QAC5B,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EACtB;YACC,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;SACtB,CAAA,4BAA4B;aACxB,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,GAAG,CAAC,EACtD;YACC,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC;SACvB;aAED;YACC,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;SACtB;QAED,IAAI,CAAS,CAAC;QAEd,KAAK,CAAC,G
AAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAChC;YACC,IAAI,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YAC3B,UAAU;YACV,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,IAAI,KAAK;gBAAE,CAAC,IAAI,KAAK,CAAC;YACzC,4BAA4B;YAC5B,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EACtB;gBACC,IAAI,QAAQ,KAAK,MAAM,CAAC,GAAG,EAC3B;oBACC,IAAI,EAAE,GAAG,IAAI,CAAC,kBAAkB,CAAC;wBAChC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC;qBACpC,EAAE,QAAQ,EAAE;wBACZ,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;qBACd,CAAC,CAAC;oBACH,6DAA6D;oBAE7D,+CAA+C;oBAC/C,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACb,OAAO,GAAG,CAAC,CAAC;iBACZ;gBACD,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;aACtB;iBACI,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,GAAG,CAAC,EACtD;gBACC,4BAA4B;gBAC5B,IAAI,QAAQ,KAAK,MAAM,CAAC,IAAI,EAC5B;oBACC,6DAA6D;oBAE7D,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAC;wBAC5B,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC;qBACpC,EAAE;wBACF,CAAC,EAAE,QAAQ;qBACX,EAAE;wBACF,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;qBACd,CAAC,CAAC;oBAEH,+CAA+C;oBAC/C,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACb,OAAO,GAAG,CAAC,CAAC;iBACZ;gBACD,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC;aACvB;iBAED;gBACC,KAAK;gBACL,IAAI,QAAQ,KAAK,MAAM,CAAC,GAAG,EAC3B;oBACC,IAAI,EAAE,GAAG,IAAI,CAAC,kBAAkB,CAAC;wBAChC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC;wBACpC,CAAC,EAAE,QAAQ;qBACX,EAAE,SAAS,EAAE;wBACb,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;qBACd,CAAC,CAAC;oBAEH,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACb,OAAO,GAAG,CAAC,CAAC;iBACZ;gBACD,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;aACtB;SACD;QACD,OAAO;QACP,6DAA6D;QAE7D,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAQ;YACnC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC;SACpC,CAAC,CAAC;QAEH,IAAI,QAAQ,KAAK,MAAM,CAAC,GAAG;YAAE,EAAE,CAAC,CAAC,GAAG,QAAQ,CAAC;QAC7C,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAEb,kCAAkC;QAElC,aAAa;QACb,OAAO,GAAG,CAAC;IACZ,CAAC;IAED,kBAAkB,CAAC,IAAW,EAAE,QAAiB,EAAE,IAAqB;QAEvE,IAAI,EAAE,GAAG,IAAI,CAAC,WAAW,CAAQ,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC;QAEnD,IAAI,EAAE,GAAG,IAAI,
CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAE3B,IAAI,EAAE,EACN;YACC,kBAAU,CAAC,EAAE,EAAE;gBACd,OAAO,EAAE,EAAE;aACX,CAAC,CAAC;YAEH,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;SACnB;QAED,IAAI,QAAQ,IAAI,QAAQ,KAAK,IAAI,CAAC,OAAO,CAAC,GAAG,EAC7C;YACC,EAAE,CAAC,CAAC,GAAG,QAAQ,GAAG,EAAE,CAAC,CAAC,CAAC;SACvB;QAED,OAAO,EAAE,CAAC;IACX,CAAC;CACD;AA/UD,4CA+UC;AAEY,QAAA,IAAI,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAA0C,CAAC;AAE1G,kBAAe,gBAAgB,CAAC","sourcesContent":["'use strict';\n\n/**\n * 外文字符、数字识别模块\n *\n * @author 老雷<leizongmin@gmail.com>\n */\nimport { SubSModule, SubSModuleTokenizer, ISubTokenizerCreate } from '../mod';\nimport { Segment, IWord } from '../Segment';\nimport { debugToken } from '../util/debug';\nimport UString from 'uni-string';\nimport { debug } from '../util';\nimport { IWordDebugInfo } from '../util/index';\n\nexport class ForeignTokenizer extends SubSModuleTokenizer\n{\n\n\tname = 'ForeignTokenizer';\n\n\t/**\n\t * 分詞用(包含中文)\n\t */\n\t_REGEXP_SPLIT_1: RegExp;\n\t/**\n\t * 分詞用(不包含中文的全詞符合)\n\t */\n\t_REGEXP_SPLIT_2: RegExp;\n\n\t_cache()\n\t{\n\t\tsuper._cache();\n\t\tthis._TABLE = this.segment.getDict('TABLE');\n\n\t\tlet arr = [\n\t\t\t/[\\d０-９]+(?:,[\\d０-９]+)?(?:\\.[\\d０-９]+)?/,\n\t\t\t/[\\w０-９Ａ-Ｚａ-ｚ\\u0100-\\u017F\\u00A1-\\u00FF]+/,\n\t\t\t/[\\u0600-\\u06FF\\u0750-\\u077F]+/,\n\t\t\t/[\\u0400-\\u04FF]+/,\n\t\t\t// https://unicode-table.com/cn/blocks/greek-coptic/\n\t\t\t/[\\u0370-\\u03FF]+/,\n\t\t];\n\n\t\tthis._REGEXP_SPLIT_1 = new RegExp('(' +_join([\n\t\t\t/[\\u4E00-\\u9FFF]+/,\n\t\t].concat(arr)) + ')', 'iu');\n\n\t\tthis._REGEXP_SPLIT_2 = new RegExp('(' +_join(arr) + ')', 'iu');\n\n\t\tfunction _join(arr: Array<string | RegExp>)\n\t\t{\n\t\t\treturn arr.reduce(function (a, b)\n\t\t\t{\n\t\t\t\tif (b instanceof RegExp)\n\t\t\t\t{\n\t\t\t\t\ta.push(b.source);\n\t\t\t\t}\n\t\t\t\telse\n\t\t\t\t{\n\t\t\t\t\ta.push(b);\n\t\t\t\t}\n\n\t\t\t\treturn a;\n\t\t\t}, []).join('|')\n\t\t}\n\t}\n\n\t/**\n\t * 对未识别的单词进行分词\n\t *\n\t * @param 
{array} words 单词数组\n\t * @return {array}\n\t */\n\tsplit(words: IWord[]): IWord[]\n\t{\n\t\t//return this._splitUnknow(words, this.splitForeign);\n\t\treturn this._splitUnknow(words, this.splitForeign2);\n\n\t\t/*\n\t\tconst POSTAG = this.segment.POSTAG;\n\n\t\tlet ret = [];\n\t\tfor (let i = 0, word; word = words[i]; i++)\n\t\t{\n\t\t\tif (word.p)\n\t\t\t{\n\t\t\t\tret.push(word);\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\t// 仅对未识别的词进行匹配\n\t\t\t\tret = ret.concat(this.splitForeign(word.w));\n\t\t\t}\n\t\t}\n\t\treturn ret;\n\t\t*/\n\t}\n\n\t/**\n\t * 支援更多外文判定(但可能會降低效率)\n\t *\n\t * 並且避免誤切割 例如 latīna Русский\n\t */\n\tsplitForeign2(text: string, cur?: number): IWord[]\n\t{\n\t\tconst POSTAG = this.segment.POSTAG;\n\t\tconst TABLE = this._TABLE;\n\n\t\t//console.time('splitForeign2');\n\n\t\tlet ret: IWord[] = [];\n\t\tlet self = this;\n\n\t\tlet ls = text\n\t\t\t.split(this._REGEXP_SPLIT_1)\n\t\t;\n\n\t\tfor (let w of ls)\n\t\t{\n\t\t\tif (w !== '')\n\t\t\t{\n\t\t\t\tif (this._REGEXP_SPLIT_2.test(w))\n\t\t\t\t{\n\t\t\t\t\tlet cw = TABLE[w];\n\n\t\t\t\t\tif (cw)\n\t\t\t\t\t{\n\t\t\t\t\t\tlet nw = this.createRawToken({\n\t\t\t\t\t\t\tw,\n\t\t\t\t\t\t}, cw, {\n\t\t\t\t\t\t\t[this.name]: 1,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\tret.push(nw);\n\t\t\t\t\t\tcontinue;\n\t\t\t\t\t}\n\n\t\t\t\t\t/**\n\t\t\t\t\t * 當分詞不存在於字典中時\n\t\t\t\t\t * 則再度分詞一次\n\t\t\t\t\t */\n\t\t\t\t\tlet ls2 = w\n\t\t\t\t\t\t.split(/([\\d+０-９]+)/)\n\t\t\t\t\t;\n\n\t\t\t\t\tfor (let w of ls2)\n\t\t\t\t\t{\n\t\t\t\t\t\tif (w === '')\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tcontinue;\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tlet lasttype = 0;\n\n\t\t\t\t\t\tlet c = w.charCodeAt(0);\n\t\t\t\t\t\tif (c >= 65296 && c <= 65370) c -= 65248;\n\n\t\t\t\t\t\tif (c >= 48 && c <= 57)\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlasttype = POSTAG.A_M;\n\t\t\t\t\t\t}// 字母 lasttype = POSTAG.A_NX\n\t\t\t\t\t\telse if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122))\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlasttype = 
POSTAG.A_NX;\n\t\t\t\t\t\t}\n\t\t\t\t\t\telse\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlasttype = POSTAG.UNK;\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tif (lasttype === POSTAG.A_NX)\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlet cw = TABLE[w];\n\n\t\t\t\t\t\t\tif (cw)\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tlet nw = this.createRawToken({\n\t\t\t\t\t\t\t\t\tw,\n\t\t\t\t\t\t\t\t}, cw, {\n\t\t\t\t\t\t\t\t\t[this.name]: 2,\n\t\t\t\t\t\t\t\t});\n\n\t\t\t\t\t\t\t\tret.push(nw);\n\t\t\t\t\t\t\t\tcontinue;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tret.push(self.debugToken({\n\t\t\t\t\t\t\tw: w,\n\t\t\t\t\t\t\tp: lasttype || undefined,\n\t\t\t\t\t\t}, {\n\t\t\t\t\t\t\t[self.name]: 3,\n\t\t\t\t\t\t}, true));\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\telse\n\t\t\t\t{\n\t\t\t\t\tret.push({\n\t\t\t\t\t\tw,\n\t\t\t\t\t});\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\t//console.timeEnd('splitForeign2');\n\n\t\t//console.log(ret);\n\n\t\treturn ret.length ? ret : undefined;\n\t}\n\n\t/**\n\t * 匹配包含的英文字符和数字，并分割\n\t *\n\t * @param {string} text 文本\n\t * @param {int} cur 开始位置\n\t * @return {array}  返回格式   {w: '单词', c: 开始位置}\n\t */\n\tsplitForeign(text: string, cur?: number): IWord[]\n\t{\n\t\tconst POSTAG = this.segment.POSTAG;\n\t\tconst TABLE = this._TABLE;\n\n\t\t//console.time('splitForeign');\n\n\t\tif (isNaN(cur)) cur = 0;\n\t\tlet ret = [];\n\n\t\t// 取第一个字符的ASCII码\n\t\tlet lastcur = 0;\n\t\tlet lasttype = 0;\n\t\tlet c = text.charCodeAt(0);\n\t\t// 全角数字或字母\n\t\tif (c >= 65296 && c <= 65370) c -= 65248;\n\t\t// 数字  lasttype = POSTAG.A_M\n\t\tif (c >= 48 && c <= 57)\n\t\t{\n\t\t\tlasttype = POSTAG.A_M;\n\t\t}// 字母 lasttype = POSTAG.A_NX\n\t\telse if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122))\n\t\t{\n\t\t\tlasttype = POSTAG.A_NX;\n\t\t}\n\t\telse\n\t\t{\n\t\t\tlasttype = POSTAG.UNK;\n\t\t}\n\n\t\tlet i: number;\n\n\t\tfor (i = 1; i < text.length; i++)\n\t\t{\n\t\t\tlet c = text.charCodeAt(i);\n\t\t\t// 全角数字或字母\n\t\t\tif (c >= 65296 && c <= 65370) c -= 65248;\n\t\t\t// 数字  lasttype = POSTAG.A_M\n\t\t\tif (c >= 48 && c <= 
57)\n\t\t\t{\n\t\t\t\tif (lasttype !== POSTAG.A_M)\n\t\t\t\t{\n\t\t\t\t\tlet nw = this.createForeignToken({\n\t\t\t\t\t\tw: text.substr(lastcur, i - lastcur),\n\t\t\t\t\t}, lasttype, {\n\t\t\t\t\t\t[this.name]: 1,\n\t\t\t\t\t});\n\t\t\t\t\t//let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;\n\n\t\t\t\t\t//if (lasttype !== POSTAG.UNK) nw.p = lasttype;\n\t\t\t\t\tret.push(nw);\n\t\t\t\t\tlastcur = i;\n\t\t\t\t}\n\t\t\t\tlasttype = POSTAG.A_M;\n\t\t\t}\n\t\t\telse if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122))\n\t\t\t{\n\t\t\t\t// 字母 lasttype = POSTAG.A_NX\n\t\t\t\tif (lasttype !== POSTAG.A_NX)\n\t\t\t\t{\n\t\t\t\t\t//let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;\n\n\t\t\t\t\tlet nw = this.createRawToken({\n\t\t\t\t\t\tw: text.substr(lastcur, i - lastcur),\n\t\t\t\t\t}, {\n\t\t\t\t\t\tp: lasttype\n\t\t\t\t\t}, {\n\t\t\t\t\t\t[this.name]: 2,\n\t\t\t\t\t});\n\n\t\t\t\t\t//if (lasttype !== POSTAG.UNK) nw.p = lasttype;\n\t\t\t\t\tret.push(nw);\n\t\t\t\t\tlastcur = i;\n\t\t\t\t}\n\t\t\t\tlasttype = POSTAG.A_NX;\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\t// 其他\n\t\t\t\tif (lasttype !== POSTAG.UNK)\n\t\t\t\t{\n\t\t\t\t\tlet nw = this.createForeignToken({\n\t\t\t\t\t\tw: text.substr(lastcur, i - lastcur),\n\t\t\t\t\t\tp: lasttype\n\t\t\t\t\t}, undefined, {\n\t\t\t\t\t\t[this.name]: 3,\n\t\t\t\t\t});\n\n\t\t\t\t\tret.push(nw);\n\t\t\t\t\tlastcur = i;\n\t\t\t\t}\n\t\t\t\tlasttype = POSTAG.UNK;\n\t\t\t}\n\t\t}\n\t\t// 剩余部分\n\t\t//let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;\n\n\t\tlet nw = this.createRawToken<IWord>({\n\t\t\tw: text.substr(lastcur, i - lastcur),\n\t\t});\n\n\t\tif (lasttype !== POSTAG.UNK) nw.p = lasttype;\n\t\tret.push(nw);\n\n\t\t//console.timeEnd('splitForeign');\n\n\t\t//debug(ret);\n\t\treturn ret;\n\t}\n\n\tcreateForeignToken(word: IWord, lasttype?: number, attr?: IWordDebugInfo)\n\t{\n\t\tlet nw = this.createToken<IWord>(word, true, attr);\n\n\t\tlet ow = this._TABLE[nw.w];\n\n\t\tif (ow)\n\t\t{\n\t\t\tdebugToken(nw, 
{\n\t\t\t\t_source: ow,\n\t\t\t});\n\n\t\t\tnw.p = nw.p | ow.p;\n\t\t}\n\n\t\tif (lasttype && lasttype !== this._POSTAG.UNK)\n\t\t{\n\t\t\tnw.p = lasttype | nw.p;\n\t\t}\n\n\t\treturn nw;\n\t}\n}\n\nexport const init = ForeignTokenizer.init.bind(ForeignTokenizer) as ISubTokenizerCreate<ForeignTokenizer>;\n\nexport default ForeignTokenizer;\n\n//debug(splitForeign('ad222经济核算123非'));\n"]}
\No newline at end of file