"use strict";
// Flag the CommonJS exports object with `__esModule` (set by the compiler for
// modules that were authored with ES `import`/`export` syntax).
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the named exports up front; the real values are assigned at the
// bottom of the file, once the class exists.
exports.init = exports.ForeignTokenizer = void 0;
|
4 | /**
|
5 | * 外文字符、数字识别模块
|
6 | *
|
7 | * @author 老雷<leizongmin@gmail.com>
|
8 | */
|
9 | const mod_1 = require("../mod");
|
10 | const debug_1 = require("../util/debug");
|
/**
 * Splits on runs of ASCII or fullwidth (U+FF10-U+FF19) digits. The capture
 * group makes `String.prototype.split` keep the digit runs in its result.
 */
const FOREIGN_DIGIT_RUN = /([\d\uFF10-\uFF19]+)/;

/** Character kinds distinguished while scanning text. */
const FOREIGN_CHAR_OTHER = 0;
const FOREIGN_CHAR_DIGIT = 1;
const FOREIGN_CHAR_LETTER = 2;

/**
 * Classify a UTF-16 code unit as digit, basic Latin letter, or other.
 *
 * Fullwidth forms in U+FF10-U+FF5A are first folded onto their ASCII
 * counterparts (offset 0xFEE0), so fullwidth digits and letters classify the
 * same way as the ASCII ones.
 *
 * @param {number} charCode result of `String.prototype.charCodeAt`; `NaN`
 *   (empty string) classifies as "other"
 * @returns {number} one of the `FOREIGN_CHAR_*` constants
 */
function classifyForeignCharCode(charCode) {
    const code = (charCode >= 0xFF10 && charCode <= 0xFF5A)
        ? charCode - 0xFEE0
        : charCode;
    if (code >= 0x30 && code <= 0x39) {
        return FOREIGN_CHAR_DIGIT;
    }
    if ((code >= 0x41 && code <= 0x5A) || (code >= 0x61 && code <= 0x7A)) {
        return FOREIGN_CHAR_LETTER;
    }
    return FOREIGN_CHAR_OTHER;
}

/**
 * Map a character kind to the part-of-speech tag used for it.
 *
 * @param {number} kind one of the `FOREIGN_CHAR_*` constants
 * @param {object} POSTAG tag table providing `A_M`, `A_NX` and `UNK`
 * @returns {number} `POSTAG.A_M` for digits, `POSTAG.A_NX` for letters,
 *   `POSTAG.UNK` otherwise
 */
function foreignCharKindToPostag(kind, POSTAG) {
    if (kind === FOREIGN_CHAR_DIGIT) {
        return POSTAG.A_M;
    }
    if (kind === FOREIGN_CHAR_LETTER) {
        return POSTAG.A_NX;
    }
    return POSTAG.UNK;
}

/**
 * Look up `word` in a dictionary object, ignoring inherited properties.
 *
 * A plain `table[word]` would return members of `Object.prototype` for
 * ordinary text such as "constructor" or "toString", which would then be
 * mistaken for dictionary entries.
 * NOTE(review): assumes dictionary entries are own properties of the table -
 * confirm against the dictionary loader.
 *
 * @param {object} table dictionary keyed by word
 * @param {string} word key to look up
 * @returns {*} the entry, or `undefined` when the table has no own entry
 */
function lookupForeignDictEntry(table, word) {
    return Object.prototype.hasOwnProperty.call(table, word) ? table[word] : undefined;
}

/**
 * Tokenizer that recognises digits and foreign-script (non-Chinese) words
 * inside text that earlier stages left unrecognised.
 */
class ForeignTokenizer extends mod_1.SubSModuleTokenizer {
    constructor(...args) {
        super(...args);
        this.name = 'ForeignTokenizer';
    }

    /**
     * Cache the word dictionary and build the two splitting expressions.
     *
     * - `_REGEXP_SPLIT_1` captures runs of CJK ideographs as well as every
     *   foreign/digit pattern, and is used to cut the text into segments.
     * - `_REGEXP_SPLIT_2` contains only the foreign/digit patterns and is
     *   used to decide whether a segment needs foreign handling.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('TABLE');

        // Fullwidth digits/letters are written as escapes so the patterns
        // survive any re-encoding or normalisation of this file.
        const foreignPatterns = [
            // Numbers, with an optional ",ddd" group and an optional fraction.
            /[\d\uFF10-\uFF19]+(?:,[\d\uFF10-\uFF19]+)?(?:\.[\d\uFF10-\uFF19]+)?/,
            // Word characters, fullwidth digits/letters, Latin Extended-A and
            // the upper part of Latin-1.
            /[\w\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\u0100-\u017F\u00A1-\u00FF]+/,
            // Arabic and Arabic Supplement.
            /[\u0600-\u06FF\u0750-\u077F]+/,
            // Cyrillic.
            /[\u0400-\u04FF]+/,
            // Greek and Coptic - https://unicode-table.com/cn/blocks/greek-coptic/
            /[\u0370-\u03FF]+/,
        ];

        // Join pattern sources (or plain strings) into a single alternation.
        const toAlternation = (patterns) => patterns
            .map((pattern) => (pattern instanceof RegExp ? pattern.source : pattern))
            .join('|');

        this._REGEXP_SPLIT_1 = new RegExp(
            `(${toAlternation([/[\u4E00-\u9FFF]+/, ...foreignPatterns])})`,
            'iu',
        );
        this._REGEXP_SPLIT_2 = new RegExp(`(${toAlternation(foreignPatterns)})`, 'iu');
    }

    /**
     * Segment the words that have not been recognised yet.
     *
     * Delegates to the inherited `_splitUnknow`, handing it `splitForeign2`
     * as the splitter (`splitForeign` is the older, ASCII-only alternative).
     * The method is passed unbound, exactly as before - presumably
     * `_splitUnknow` invokes it with this tokenizer as `this`; verify there.
     *
     * @param {Array} words word array
     * @return {Array}
     */
    split(words) {
        return this._splitUnknow(words, this.splitForeign2);
    }

    /**
     * Split text with support for more foreign scripts (possibly at some cost
     * in speed), avoiding wrong cuts inside words such as "latīna" or
     * "Русский".
     *
     * @param {string} text text to split
     * @param {number} [cur] unused; kept for signature compatibility
     * @return {Array|undefined} the tokens, or `undefined` when the text
     *   produced no non-empty segment
     */
    splitForeign2(text, cur) {
        const POSTAG = this.segment.POSTAG;
        const TABLE = this._TABLE;
        const ret = [];

        for (const segment of text.split(this._REGEXP_SPLIT_1)) {
            if (segment === '') {
                continue;
            }

            if (!this._REGEXP_SPLIT_2.test(segment)) {
                // No foreign/digit content: pass the segment on untagged.
                ret.push({
                    w: segment,
                });
                continue;
            }

            const segmentEntry = lookupForeignDictEntry(TABLE, segment);
            if (segmentEntry) {
                ret.push(this.createRawToken({
                    w: segment,
                }, segmentEntry, {
                    [this.name]: 1,
                }));
                continue;
            }

            // The segment is not in the dictionary: split it once more,
            // separating digit runs from everything else.
            for (const piece of segment.split(FOREIGN_DIGIT_RUN)) {
                if (piece === '') {
                    continue;
                }

                // Only the first character decides the tag of the piece.
                const kind = classifyForeignCharCode(piece.charCodeAt(0));
                const pieceType = foreignCharKindToPostag(kind, POSTAG);

                if (pieceType === POSTAG.A_NX) {
                    const pieceEntry = lookupForeignDictEntry(TABLE, piece);
                    if (pieceEntry) {
                        ret.push(this.createRawToken({
                            w: piece,
                        }, pieceEntry, {
                            [this.name]: 2,
                        }));
                        continue;
                    }
                }

                ret.push(this.debugToken({
                    w: piece,
                    // A falsy tag is stored as `undefined` rather than as-is.
                    p: pieceType || undefined,
                }, {
                    [this.name]: 3,
                }, true));
            }
        }

        return ret.length ? ret : undefined;
    }

    /**
     * Find the ASCII/fullwidth letters and digits contained in the text and
     * split on every change between digit, letter and other characters.
     *
     * @param {string} text text to split
     * @param {number} [cur] start position; unused, kept for signature
     *   compatibility
     * @return {Array} tokens carrying the word in `w` and, where one was
     *   determined, the tag in `p`; always contains at least one token
     */
    splitForeign(text, cur) {
        const POSTAG = this.segment.POSTAG;
        const ret = [];

        let lastcur = 0;
        let lasttype = foreignCharKindToPostag(
            classifyForeignCharCode(text.charCodeAt(0)),
            POSTAG,
        );

        for (let i = 1; i < text.length; i++) {
            const kind = classifyForeignCharCode(text.charCodeAt(i));
            const type = foreignCharKindToPostag(kind, POSTAG);

            if (lasttype !== type) {
                // The character class changed: emit the run that just ended.
                // Which factory is used depends on the kind of the character
                // that starts the next run.
                const w = text.slice(lastcur, i);
                let nw;
                if (kind === FOREIGN_CHAR_DIGIT) {
                    nw = this.createForeignToken({
                        w,
                    }, lasttype, {
                        [this.name]: 1,
                    });
                }
                else if (kind === FOREIGN_CHAR_LETTER) {
                    nw = this.createRawToken({
                        w,
                    }, {
                        p: lasttype
                    }, {
                        [this.name]: 2,
                    });
                }
                else {
                    nw = this.createForeignToken({
                        w,
                        p: lasttype
                    }, undefined, {
                        [this.name]: 3,
                    });
                }
                ret.push(nw);
                lastcur = i;
            }
            lasttype = type;
        }

        // Remainder of the text after the last change.
        const tail = this.createRawToken({
            w: text.slice(lastcur),
        });
        if (lasttype !== POSTAG.UNK) {
            tail.p = lasttype;
        }
        ret.push(tail);

        return ret;
    }

    /**
     * Create a token via the inherited `createToken` and merge tag bits into
     * its `p` field.
     *
     * @param {object} word token data; `w` is the word text
     * @param {number} [lasttype] tag to OR into the token unless it is falsy
     *   or equal to `this._POSTAG.UNK`
     * @param {object} [attr] debug attributes forwarded to `createToken`
     * @return {object} the created token
     */
    createForeignToken(word, lasttype, attr) {
        const nw = this.createToken(word, true, attr);
        const ow = lookupForeignDictEntry(this._TABLE, nw.w);

        if (ow) {
            // Record the dictionary entry the tag bits were taken from.
            debug_1.debugToken(nw, {
                _source: ow,
            });
            nw.p = nw.p | ow.p;
        }

        if (lasttype && lasttype !== this._POSTAG.UNK) {
            nw.p = lasttype | nw.p;
        }

        return nw;
    }
}
// Expose the class by name and as the default export, together with `init`
// bound to the class so it can be called detached from it.
exports.ForeignTokenizer = ForeignTokenizer;
exports.init = ForeignTokenizer.init.bind(ForeignTokenizer);
exports.default = ForeignTokenizer;
|
265 | //# sourceMappingURL=data:application/json;base64,{"version":3,"file":"ForeignTokenizer.js","sourceRoot":"","sources":["ForeignTokenizer.ts"],"names":[],"mappings":"AAAA,YAAY,CAAC;;;AAEb;;;;GAIG;AACH,gCAA8E;AAE9E,yCAA2C;AAK3C,MAAa,gBAAiB,SAAQ,yBAAmB;IAAzD;;QAGC,SAAI,GAAG,kBAAkB,CAAC;IA4U3B,CAAC;IAjUA,MAAM;QAEL,KAAK,CAAC,MAAM,EAAE,CAAC;QACf,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAE5C,IAAI,GAAG,GAAG;YACT,uCAAuC;YACvC,0CAA0C;YAC1C,+BAA+B;YAC/B,kBAAkB;YAClB,oDAAoD;YACpD,kBAAkB;SAClB,CAAC;QAEF,IAAI,CAAC,eAAe,GAAG,IAAI,MAAM,CAAC,GAAG,GAAE,KAAK,CAAC;YAC5C,kBAAkB;SAClB,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,EAAE,IAAI,CAAC,CAAC;QAE5B,IAAI,CAAC,eAAe,GAAG,IAAI,MAAM,CAAC,GAAG,GAAE,KAAK,CAAC,GAAG,CAAC,GAAG,GAAG,EAAE,IAAI,CAAC,CAAC;QAE/D,SAAS,KAAK,CAAC,GAA2B;YAEzC,OAAO,GAAG,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;gBAE/B,IAAI,CAAC,YAAY,MAAM,EACvB;oBACC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;iBACjB;qBAED;oBACC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;iBACV;gBAED,OAAO,CAAC,CAAC;YACV,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QACjB,CAAC;IACF,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,KAAc;QAEnB,qDAAqD;QACrD,OAAO,IAAI,CAAC,YAAY,CAAC,KAAK,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC;QAEpD;;;;;;;;;;;;;;;;;UAiBE;IACH,CAAC;IAED;;;;OAIG;IACH,aAAa,CAAC,IAAY,EAAE,GAAY;QAEvC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC;QAE1B,gCAAgC;QAEhC,IAAI,GAAG,GAAY,EAAE,CAAC;QACtB,IAAI,IAAI,GAAG,IAAI,CAAC;QAEhB,IAAI,EAAE,GAAG,IAAI;aACX,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,CAC5B;QAED,KAAK,IAAI,CAAC,IAAI,EAAE,EAChB;YACC,IAAI,CAAC,KAAK,EAAE,EACZ;gBACC,IAAI,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAChC;oBACC,IAAI,EAAE,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;oBAElB,IAAI,EAAE,EACN;wBACC,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAC;4BAC5B,CAAC;yBACD,EAAE,EAAE,EAAE;4BACN,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;yBACd,CAAC,CAAC;wBAEH,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;wBACb,SAAS;qBACT;oBAED;;;uBAGG;oBACH,IAAI,GAAG,GAAG,CAAC;yBACT,KAAK,CAAC,aAAa,CAAC,CACrB;oBAED,KAAK,IAAI,CAAC,IAAI,GAAG,EACjB;wB
ACC,IAAI,CAAC,KAAK,EAAE,EACZ;4BACC,SAAS;yBACT;wBAED,IAAI,QAAQ,GAAG,CAAC,CAAC;wBAEjB,IAAI,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;wBACxB,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,IAAI,KAAK;4BAAE,CAAC,IAAI,KAAK,CAAC;wBAEzC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EACtB;4BACC,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;yBACtB,CAAA,4BAA4B;6BACxB,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,GAAG,CAAC,EACtD;4BACC,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC;yBACvB;6BAED;4BACC,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;yBACtB;wBAED,IAAI,QAAQ,KAAK,MAAM,CAAC,IAAI,EAC5B;4BACC,IAAI,EAAE,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;4BAElB,IAAI,EAAE,EACN;gCACC,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAC;oCAC5B,CAAC;iCACD,EAAE,EAAE,EAAE;oCACN,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;iCACd,CAAC,CAAC;gCAEH,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gCACb,SAAS;6BACT;yBACD;wBAED,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC;4BACxB,CAAC,EAAE,CAAC;4BACJ,CAAC,EAAE,QAAQ,IAAI,SAAS;yBACxB,EAAE;4BACF,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;yBACd,EAAE,IAAI,CAAC,CAAC,CAAC;qBACV;iBACD;qBAED;oBACC,GAAG,CAAC,IAAI,CAAC;wBACR,CAAC;qBACD,CAAC,CAAC;iBACH;aACD;SACD;QAED,mCAAmC;QAEnC,mBAAmB;QAEnB,OAAO,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC;IACrC,CAAC;IAED;;;;;;OAMG;IACH,YAAY,CAAC,IAAY,EAAE,GAAY;QAEtC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC;QAE1B,+BAA+B;QAE/B,IAAI,KAAK,CAAC,GAAG,CAAC;YAAE,GAAG,GAAG,CAAC,CAAC;QACxB,IAAI,GAAG,GAAG,EAAE,CAAC;QAEb,gBAAgB;QAChB,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,IAAI,QAAQ,GAAG,CAAC,CAAC;QACjB,IAAI,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAC3B,UAAU;QACV,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,IAAI,KAAK;YAAE,CAAC,IAAI,KAAK,CAAC;QACzC,4BAA4B;QAC5B,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EACtB;YACC,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;SACtB,CAAA,4BAA4B;aACxB,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,GAAG,CAAC,EACtD;YACC,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC;SACvB;aAED;YACC,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;SACtB;QAED,IAAI,CAAS,CAAC;QAEd,KAAK,CAA
C,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAChC;YACC,IAAI,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YAC3B,UAAU;YACV,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,IAAI,KAAK;gBAAE,CAAC,IAAI,KAAK,CAAC;YACzC,4BAA4B;YAC5B,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EACtB;gBACC,IAAI,QAAQ,KAAK,MAAM,CAAC,GAAG,EAC3B;oBACC,IAAI,EAAE,GAAG,IAAI,CAAC,kBAAkB,CAAC;wBAChC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC;qBACpC,EAAE,QAAQ,EAAE;wBACZ,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;qBACd,CAAC,CAAC;oBACH,6DAA6D;oBAE7D,+CAA+C;oBAC/C,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACb,OAAO,GAAG,CAAC,CAAC;iBACZ;gBACD,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;aACtB;iBACI,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,GAAG,CAAC,EACtD;gBACC,4BAA4B;gBAC5B,IAAI,QAAQ,KAAK,MAAM,CAAC,IAAI,EAC5B;oBACC,6DAA6D;oBAE7D,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAC;wBAC5B,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC;qBACpC,EAAE;wBACF,CAAC,EAAE,QAAQ;qBACX,EAAE;wBACF,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;qBACd,CAAC,CAAC;oBAEH,+CAA+C;oBAC/C,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACb,OAAO,GAAG,CAAC,CAAC;iBACZ;gBACD,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC;aACvB;iBAED;gBACC,KAAK;gBACL,IAAI,QAAQ,KAAK,MAAM,CAAC,GAAG,EAC3B;oBACC,IAAI,EAAE,GAAG,IAAI,CAAC,kBAAkB,CAAC;wBAChC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC;wBACpC,CAAC,EAAE,QAAQ;qBACX,EAAE,SAAS,EAAE;wBACb,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;qBACd,CAAC,CAAC;oBAEH,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACb,OAAO,GAAG,CAAC,CAAC;iBACZ;gBACD,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC;aACtB;SACD;QACD,OAAO;QACP,6DAA6D;QAE7D,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAQ;YACnC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC;SACpC,CAAC,CAAC;QAEH,IAAI,QAAQ,KAAK,MAAM,CAAC,GAAG;YAAE,EAAE,CAAC,CAAC,GAAG,QAAQ,CAAC;QAC7C,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAEb,kCAAkC;QAElC,aAAa;QACb,OAAO,GAAG,CAAC;IACZ,CAAC;IAED,kBAAkB,CAAC,IAAW,EAAE,QAAiB,EAAE,IAAqB;QAEvE,IAAI,EAAE,GAAG,IAAI,CAAC,WAAW,CAAQ,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC;QAEnD,IAAI,EAAE,GAAG,IA
AI,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAE3B,IAAI,EAAE,EACN;YACC,kBAAU,CAAC,EAAE,EAAE;gBACd,OAAO,EAAE,EAAE;aACX,CAAC,CAAC;YAEH,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;SACnB;QAED,IAAI,QAAQ,IAAI,QAAQ,KAAK,IAAI,CAAC,OAAO,CAAC,GAAG,EAC7C;YACC,EAAE,CAAC,CAAC,GAAG,QAAQ,GAAG,EAAE,CAAC,CAAC,CAAC;SACvB;QAED,OAAO,EAAE,CAAC;IACX,CAAC;CACD;AA/UD,4CA+UC;AAEY,QAAA,IAAI,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAA0C,CAAC;AAE1G,kBAAe,gBAAgB,CAAC","sourcesContent":["'use strict';\n\n/**\n * 外文字符、数字识别模块\n *\n * @author 老雷<leizongmin@gmail.com>\n */\nimport { SubSModule, SubSModuleTokenizer, ISubTokenizerCreate } from '../mod';\nimport { Segment, IWord } from '../Segment';\nimport { debugToken } from '../util/debug';\nimport UString from 'uni-string';\nimport { debug } from '../util';\nimport { IWordDebugInfo } from '../util/index';\n\nexport class ForeignTokenizer extends SubSModuleTokenizer\n{\n\n\tname = 'ForeignTokenizer';\n\n\t/**\n\t * 分詞用(包含中文)\n\t */\n\t_REGEXP_SPLIT_1: RegExp;\n\t/**\n\t * 分詞用(不包含中文的全詞符合)\n\t */\n\t_REGEXP_SPLIT_2: RegExp;\n\n\t_cache()\n\t{\n\t\tsuper._cache();\n\t\tthis._TABLE = this.segment.getDict('TABLE');\n\n\t\tlet arr = [\n\t\t\t/[\\d０-９]+(?:,[\\d０-９]+)?(?:\\.[\\d０-９]+)?/,\n\t\t\t/[\\w０-９Ａ-Ｚａ-ｚ\\u0100-\\u017F\\u00A1-\\u00FF]+/,\n\t\t\t/[\\u0600-\\u06FF\\u0750-\\u077F]+/,\n\t\t\t/[\\u0400-\\u04FF]+/,\n\t\t\t// https://unicode-table.com/cn/blocks/greek-coptic/\n\t\t\t/[\\u0370-\\u03FF]+/,\n\t\t];\n\n\t\tthis._REGEXP_SPLIT_1 = new RegExp('(' +_join([\n\t\t\t/[\\u4E00-\\u9FFF]+/,\n\t\t].concat(arr)) + ')', 'iu');\n\n\t\tthis._REGEXP_SPLIT_2 = new RegExp('(' +_join(arr) + ')', 'iu');\n\n\t\tfunction _join(arr: Array<string | RegExp>)\n\t\t{\n\t\t\treturn arr.reduce(function (a, b)\n\t\t\t{\n\t\t\t\tif (b instanceof RegExp)\n\t\t\t\t{\n\t\t\t\t\ta.push(b.source);\n\t\t\t\t}\n\t\t\t\telse\n\t\t\t\t{\n\t\t\t\t\ta.push(b);\n\t\t\t\t}\n\n\t\t\t\treturn a;\n\t\t\t}, []).join('|')\n\t\t}\n\t}\n\n\t/**\n\t * 对未识别的单词进行分词\n\t *\n\t * 
@param {array} words 单词数组\n\t * @return {array}\n\t */\n\tsplit(words: IWord[]): IWord[]\n\t{\n\t\t//return this._splitUnknow(words, this.splitForeign);\n\t\treturn this._splitUnknow(words, this.splitForeign2);\n\n\t\t/*\n\t\tconst POSTAG = this.segment.POSTAG;\n\n\t\tlet ret = [];\n\t\tfor (let i = 0, word; word = words[i]; i++)\n\t\t{\n\t\t\tif (word.p)\n\t\t\t{\n\t\t\t\tret.push(word);\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\t// 仅对未识别的词进行匹配\n\t\t\t\tret = ret.concat(this.splitForeign(word.w));\n\t\t\t}\n\t\t}\n\t\treturn ret;\n\t\t*/\n\t}\n\n\t/**\n\t * 支援更多外文判定(但可能會降低效率)\n\t *\n\t * 並且避免誤切割 例如 latīna Русский\n\t */\n\tsplitForeign2(text: string, cur?: number): IWord[]\n\t{\n\t\tconst POSTAG = this.segment.POSTAG;\n\t\tconst TABLE = this._TABLE;\n\n\t\t//console.time('splitForeign2');\n\n\t\tlet ret: IWord[] = [];\n\t\tlet self = this;\n\n\t\tlet ls = text\n\t\t\t.split(this._REGEXP_SPLIT_1)\n\t\t;\n\n\t\tfor (let w of ls)\n\t\t{\n\t\t\tif (w !== '')\n\t\t\t{\n\t\t\t\tif (this._REGEXP_SPLIT_2.test(w))\n\t\t\t\t{\n\t\t\t\t\tlet cw = TABLE[w];\n\n\t\t\t\t\tif (cw)\n\t\t\t\t\t{\n\t\t\t\t\t\tlet nw = this.createRawToken({\n\t\t\t\t\t\t\tw,\n\t\t\t\t\t\t}, cw, {\n\t\t\t\t\t\t\t[this.name]: 1,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\tret.push(nw);\n\t\t\t\t\t\tcontinue;\n\t\t\t\t\t}\n\n\t\t\t\t\t/**\n\t\t\t\t\t * 當分詞不存在於字典中時\n\t\t\t\t\t * 則再度分詞一次\n\t\t\t\t\t */\n\t\t\t\t\tlet ls2 = w\n\t\t\t\t\t\t.split(/([\\d+０-９]+)/)\n\t\t\t\t\t;\n\n\t\t\t\t\tfor (let w of ls2)\n\t\t\t\t\t{\n\t\t\t\t\t\tif (w === '')\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tcontinue;\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tlet lasttype = 0;\n\n\t\t\t\t\t\tlet c = w.charCodeAt(0);\n\t\t\t\t\t\tif (c >= 65296 && c <= 65370) c -= 65248;\n\n\t\t\t\t\t\tif (c >= 48 && c <= 57)\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlasttype = POSTAG.A_M;\n\t\t\t\t\t\t}// 字母 lasttype = POSTAG.A_NX\n\t\t\t\t\t\telse if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122))\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlasttype = 
POSTAG.A_NX;\n\t\t\t\t\t\t}\n\t\t\t\t\t\telse\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlasttype = POSTAG.UNK;\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tif (lasttype === POSTAG.A_NX)\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\tlet cw = TABLE[w];\n\n\t\t\t\t\t\t\tif (cw)\n\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\tlet nw = this.createRawToken({\n\t\t\t\t\t\t\t\t\tw,\n\t\t\t\t\t\t\t\t}, cw, {\n\t\t\t\t\t\t\t\t\t[this.name]: 2,\n\t\t\t\t\t\t\t\t});\n\n\t\t\t\t\t\t\t\tret.push(nw);\n\t\t\t\t\t\t\t\tcontinue;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\n\t\t\t\t\t\tret.push(self.debugToken({\n\t\t\t\t\t\t\tw: w,\n\t\t\t\t\t\t\tp: lasttype || undefined,\n\t\t\t\t\t\t}, {\n\t\t\t\t\t\t\t[self.name]: 3,\n\t\t\t\t\t\t}, true));\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\telse\n\t\t\t\t{\n\t\t\t\t\tret.push({\n\t\t\t\t\t\tw,\n\t\t\t\t\t});\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\t//console.timeEnd('splitForeign2');\n\n\t\t//console.log(ret);\n\n\t\treturn ret.length ? ret : undefined;\n\t}\n\n\t/**\n\t * 匹配包含的英文字符和数字，并分割\n\t *\n\t * @param {string} text 文本\n\t * @param {int} cur 开始位置\n\t * @return {array}  返回格式   {w: '单词', c: 开始位置}\n\t */\n\tsplitForeign(text: string, cur?: number): IWord[]\n\t{\n\t\tconst POSTAG = this.segment.POSTAG;\n\t\tconst TABLE = this._TABLE;\n\n\t\t//console.time('splitForeign');\n\n\t\tif (isNaN(cur)) cur = 0;\n\t\tlet ret = [];\n\n\t\t// 取第一个字符的ASCII码\n\t\tlet lastcur = 0;\n\t\tlet lasttype = 0;\n\t\tlet c = text.charCodeAt(0);\n\t\t// 全角数字或字母\n\t\tif (c >= 65296 && c <= 65370) c -= 65248;\n\t\t// 数字  lasttype = POSTAG.A_M\n\t\tif (c >= 48 && c <= 57)\n\t\t{\n\t\t\tlasttype = POSTAG.A_M;\n\t\t}// 字母 lasttype = POSTAG.A_NX\n\t\telse if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122))\n\t\t{\n\t\t\tlasttype = POSTAG.A_NX;\n\t\t}\n\t\telse\n\t\t{\n\t\t\tlasttype = POSTAG.UNK;\n\t\t}\n\n\t\tlet i: number;\n\n\t\tfor (i = 1; i < text.length; i++)\n\t\t{\n\t\t\tlet c = text.charCodeAt(i);\n\t\t\t// 全角数字或字母\n\t\t\tif (c >= 65296 && c <= 65370) c -= 65248;\n\t\t\t// 数字  lasttype = POSTAG.A_M\n\t\t\tif (c >= 48 && c <= 
57)\n\t\t\t{\n\t\t\t\tif (lasttype !== POSTAG.A_M)\n\t\t\t\t{\n\t\t\t\t\tlet nw = this.createForeignToken({\n\t\t\t\t\t\tw: text.substr(lastcur, i - lastcur),\n\t\t\t\t\t}, lasttype, {\n\t\t\t\t\t\t[this.name]: 1,\n\t\t\t\t\t});\n\t\t\t\t\t//let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;\n\n\t\t\t\t\t//if (lasttype !== POSTAG.UNK) nw.p = lasttype;\n\t\t\t\t\tret.push(nw);\n\t\t\t\t\tlastcur = i;\n\t\t\t\t}\n\t\t\t\tlasttype = POSTAG.A_M;\n\t\t\t}\n\t\t\telse if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122))\n\t\t\t{\n\t\t\t\t// 字母 lasttype = POSTAG.A_NX\n\t\t\t\tif (lasttype !== POSTAG.A_NX)\n\t\t\t\t{\n\t\t\t\t\t//let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;\n\n\t\t\t\t\tlet nw = this.createRawToken({\n\t\t\t\t\t\tw: text.substr(lastcur, i - lastcur),\n\t\t\t\t\t}, {\n\t\t\t\t\t\tp: lasttype\n\t\t\t\t\t}, {\n\t\t\t\t\t\t[this.name]: 2,\n\t\t\t\t\t});\n\n\t\t\t\t\t//if (lasttype !== POSTAG.UNK) nw.p = lasttype;\n\t\t\t\t\tret.push(nw);\n\t\t\t\t\tlastcur = i;\n\t\t\t\t}\n\t\t\t\tlasttype = POSTAG.A_NX;\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\t// 其他\n\t\t\t\tif (lasttype !== POSTAG.UNK)\n\t\t\t\t{\n\t\t\t\t\tlet nw = this.createForeignToken({\n\t\t\t\t\t\tw: text.substr(lastcur, i - lastcur),\n\t\t\t\t\t\tp: lasttype\n\t\t\t\t\t}, undefined, {\n\t\t\t\t\t\t[this.name]: 3,\n\t\t\t\t\t});\n\n\t\t\t\t\tret.push(nw);\n\t\t\t\t\tlastcur = i;\n\t\t\t\t}\n\t\t\t\tlasttype = POSTAG.UNK;\n\t\t\t}\n\t\t}\n\t\t// 剩余部分\n\t\t//let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;\n\n\t\tlet nw = this.createRawToken<IWord>({\n\t\t\tw: text.substr(lastcur, i - lastcur),\n\t\t});\n\n\t\tif (lasttype !== POSTAG.UNK) nw.p = lasttype;\n\t\tret.push(nw);\n\n\t\t//console.timeEnd('splitForeign');\n\n\t\t//debug(ret);\n\t\treturn ret;\n\t}\n\n\tcreateForeignToken(word: IWord, lasttype?: number, attr?: IWordDebugInfo)\n\t{\n\t\tlet nw = this.createToken<IWord>(word, true, attr);\n\n\t\tlet ow = this._TABLE[nw.w];\n\n\t\tif (ow)\n\t\t{\n\t\t\tdebugToken(nw, 
{\n\t\t\t\t_source: ow,\n\t\t\t});\n\n\t\t\tnw.p = nw.p | ow.p;\n\t\t}\n\n\t\tif (lasttype && lasttype !== this._POSTAG.UNK)\n\t\t{\n\t\t\tnw.p = lasttype | nw.p;\n\t\t}\n\n\t\treturn nw;\n\t}\n}\n\nexport const init = ForeignTokenizer.init.bind(ForeignTokenizer) as ISubTokenizerCreate<ForeignTokenizer>;\n\nexport default ForeignTokenizer;\n\n//debug(splitForeign('ad222经济核算123非'));\n"]} |
\ | No newline at end of file |