UNPKG

9.03 kBJavaScriptView Raw
1'use strict';
2Object.defineProperty(exports, "__esModule", { value: true });
3exports.init = exports.ForeignTokenizer = void 0;
/**
 * 外文字符、数字识别模块
 * (Foreign-script character and number recognition module.)
 *
 * @author 老雷<leizongmin@gmail.com>
 */
// Provides SubSModuleTokenizer, the base class ForeignTokenizer extends.
const mod_1 = require("../mod");
// Provides debugToken(), used by createForeignToken to record the dictionary source.
const debug_1 = require("../util/debug");
/**
 * Splits on runs of digits inside an already-isolated foreign chunk.
 * The capture group keeps the digit runs in the `split()` result.
 * NOTE(review): `+` sits inside the character class, so a literal plus sign is
 * treated as part of a digit run — confirm this is intended.
 */
const DIGIT_RUN_SPLIT = /([\d+0-9]+)/;

/**
 * Join pattern sources into one alternation string.
 *
 * @param {Array<RegExp|string>} patterns RegExp objects (their `source` is used) or raw strings
 * @return {string} the sources joined with `|`
 */
function joinPatternSources(patterns) {
    return patterns
        .map((pattern) => (pattern instanceof RegExp ? pattern.source : pattern))
        .join('|');
}

/**
 * Dictionary lookup that ignores members inherited from Object.prototype.
 *
 * A plain `table[word]` returns a function for text such as "constructor" or
 * "toString" when the table is an ordinary object, which would be mistaken
 * for a dictionary hit.
 *
 * @param {object} table dictionary keyed by word
 * @param {string} word key to look up
 * @return {*} the table's own entry, or undefined when the table has no own entry
 */
function lookupOwnEntry(table, word) {
    return Object.prototype.hasOwnProperty.call(table, word) ? table[word] : undefined;
}

/**
 * Map a full-width digit/letter code unit (U+FF10..U+FF5A) onto its ASCII
 * counterpart; any other value (including NaN) is returned unchanged.
 *
 * @param {number} code UTF-16 code unit
 * @return {number}
 */
function toHalfWidthCode(code) {
    return (code >= 0xFF10 && code <= 0xFF5A) ? code - 0xFEE0 : code;
}

/** True for the ASCII digits 0-9. */
function isAsciiDigitCode(code) {
    return code >= 48 && code <= 57;
}

/** True for the ASCII letters A-Z and a-z. */
function isAsciiLetterCode(code) {
    return (code >= 65 && code <= 90) || (code >= 97 && code <= 122);
}

/**
 * Classify one code unit (after full-width folding) as a POSTAG value.
 *
 * @param {number} code UTF-16 code unit (NaN for an empty string)
 * @param {object} POSTAG part-of-speech table providing A_M, A_NX and UNK
 * @return {*} POSTAG.A_M for digits, POSTAG.A_NX for ASCII letters, otherwise POSTAG.UNK
 */
function classifyCharCode(code, POSTAG) {
    const folded = toHalfWidthCode(code);
    if (isAsciiDigitCode(folded)) {
        return POSTAG.A_M;
    }
    if (isAsciiLetterCode(folded)) {
        return POSTAG.A_NX;
    }
    return POSTAG.UNK;
}

/**
 * Tokenizer that recognises foreign-script words and numbers inside words
 * that have not been identified yet.
 */
class ForeignTokenizer extends mod_1.SubSModuleTokenizer {
    constructor(...args) {
        super(...args);
        this.name = 'ForeignTokenizer';
    }

    /**
     * Cache the 'TABLE' dictionary and build the two splitting expressions.
     *
     * `_REGEXP_SPLIT_2` matches a foreign chunk: a number (with optional
     * `,group` and `.fraction`), a run of word characters including the
     * U+00A1-U+00FF and U+0100-U+017F ranges, or a run from the U+0600-U+06FF /
     * U+0750-U+077F, U+0400-U+04FF or U+0370-U+03FF ranges.
     * `_REGEXP_SPLIT_1` is the same alternation preceded by runs from
     * U+4E00-U+9FFF, so those runs are isolated first when splitting.
     * Both expressions are wrapped in one capture group so `String#split`
     * keeps the matched chunks.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('TABLE');
        const foreignPatterns = [
            /[\d0-9]+(?:,[\d0-9]+)?(?:\.[\d0-9]+)?/,
            /[\w0-9A-Za-z\u0100-\u017F\u00A1-\u00FF]+/,
            /[\u0600-\u06FF\u0750-\u077F]+/,
            /[\u0400-\u04FF]+/,
            // https://unicode-table.com/cn/blocks/greek-coptic/
            /[\u0370-\u03FF]+/,
        ];
        const hanPattern = /[\u4E00-\u9FFF]+/;
        this._REGEXP_SPLIT_1 = new RegExp('(' + joinPatternSources([hanPattern].concat(foreignPatterns)) + ')', 'iu');
        this._REGEXP_SPLIT_2 = new RegExp('(' + joinPatternSources(foreignPatterns) + ')', 'iu');
    }

    /**
     * Split the words that have not been recognised yet.
     *
     * Delegates to the inherited `_splitUnknow` with `splitForeign2` as the
     * per-word splitter; `splitForeign` is the older alternative and is not
     * used here. The earlier inline version of this method kept words that
     * already had a `p` tag and only split `word.w` of the others —
     * presumably `_splitUnknow` does the same and invokes the callback with
     * this tokenizer as `this`; confirm in the base class.
     *
     * @param {array} words word array
     * @return {array}
     */
    split(words) {
        return this._splitUnknow(words, this.splitForeign2);
    }

    /**
     * Splitter that supports more foreign scripts (possibly at some cost in
     * speed) and avoids cutting words such as "latīna" or "Русский" apart.
     *
     * The text is first cut with `_REGEXP_SPLIT_1`. A chunk that is not a
     * foreign chunk is emitted as a bare `{ w }`. A foreign chunk found in
     * the dictionary becomes one token; otherwise it is cut again around
     * digit runs and each part is tagged from its first character.
     *
     * @param {string} text text to split
     * @param {int} cur start position (accepted for interface compatibility; not used)
     * @return {array|undefined} the tokens, or undefined when nothing was produced
     */
    splitForeign2(text, cur) {
        const POSTAG = this.segment.POSTAG;
        const TABLE = this._TABLE;
        const ret = [];
        for (const chunk of text.split(this._REGEXP_SPLIT_1)) {
            if (chunk === '') {
                continue;
            }
            if (!this._REGEXP_SPLIT_2.test(chunk)) {
                ret.push({
                    w: chunk,
                });
                continue;
            }
            const chunkEntry = lookupOwnEntry(TABLE, chunk);
            if (chunkEntry) {
                ret.push(this.createRawToken({
                    w: chunk,
                }, chunkEntry, {
                    [this.name]: 1,
                }));
                continue;
            }
            // The whole chunk is not in the dictionary: split it once more
            // around digit runs and handle each part on its own.
            for (const part of chunk.split(DIGIT_RUN_SPLIT)) {
                if (part === '') {
                    continue;
                }
                const type = classifyCharCode(part.charCodeAt(0), POSTAG);
                if (type === POSTAG.A_NX) {
                    const partEntry = lookupOwnEntry(TABLE, part);
                    if (partEntry) {
                        ret.push(this.createRawToken({
                            w: part,
                        }, partEntry, {
                            [this.name]: 2,
                        }));
                        continue;
                    }
                }
                ret.push(this.debugToken({
                    w: part,
                    // a falsy tag is reported as "no tag"
                    p: type || undefined,
                }, {
                    [this.name]: 3,
                }, true));
            }
        }
        return ret.length ? ret : undefined;
    }

    /**
     * Find the ASCII letters and digits contained in the text and split on
     * every change between digit, letter and other characters. Full-width
     * digits and letters are classified like their ASCII counterparts.
     *
     * @param {string} text text to split
     * @param {int} cur start position (accepted for interface compatibility; not used)
     * @return {array} tokens in text order; the last one always holds the remaining tail
     */
    splitForeign(text, cur) {
        const POSTAG = this.segment.POSTAG;
        const ret = [];
        let lastcur = 0;
        // Type of the run currently being collected, seeded from the first character.
        let lasttype = classifyCharCode(text.charCodeAt(0), POSTAG);
        for (let i = 1; i < text.length; i++) {
            const code = toHalfWidthCode(text.charCodeAt(i));
            if (isAsciiDigitCode(code)) {
                // digit: close the previous run unless it was numeric too
                if (lasttype !== POSTAG.A_M) {
                    ret.push(this.createForeignToken({
                        w: text.slice(lastcur, i),
                    }, lasttype, {
                        [this.name]: 1,
                    }));
                    lastcur = i;
                }
                lasttype = POSTAG.A_M;
            }
            else if (isAsciiLetterCode(code)) {
                // letter: close the previous run unless it was alphabetic too
                if (lasttype !== POSTAG.A_NX) {
                    ret.push(this.createRawToken({
                        w: text.slice(lastcur, i),
                    }, {
                        p: lasttype
                    }, {
                        [this.name]: 2,
                    }));
                    lastcur = i;
                }
                lasttype = POSTAG.A_NX;
            }
            else {
                // anything else: close the previous digit/letter run
                if (lasttype !== POSTAG.UNK) {
                    ret.push(this.createForeignToken({
                        w: text.slice(lastcur, i),
                        p: lasttype
                    }, undefined, {
                        [this.name]: 3,
                    }));
                    lastcur = i;
                }
                lasttype = POSTAG.UNK;
            }
        }
        // remaining tail
        const tail = this.createRawToken({
            w: text.slice(lastcur),
        });
        if (lasttype !== POSTAG.UNK) {
            tail.p = lasttype;
        }
        ret.push(tail);
        return ret;
    }

    /**
     * Create a token and merge part-of-speech bits into it.
     *
     * When the token's word has its own dictionary entry, the entry is
     * recorded through `debug_1.debugToken` and its `p` is OR-ed into the
     * token. A truthy `lasttype` other than UNK is OR-ed in as well.
     * NOTE(review): UNK is read from `this._POSTAG` here while the other
     * methods read `this.segment.POSTAG` — presumably the base class keeps
     * them identical; confirm.
     *
     * @param {object} word token seed, e.g. `{ w }` or `{ w, p }`
     * @param {*} lasttype POSTAG value of the run the word came from (may be undefined)
     * @param {object} attr debug attributes handed to `createToken`
     * @return {object} the created token
     */
    createForeignToken(word, lasttype, attr) {
        const nw = this.createToken(word, true, attr);
        const ow = lookupOwnEntry(this._TABLE, nw.w);
        if (ow) {
            debug_1.debugToken(nw, {
                _source: ow,
            });
            nw.p = nw.p | ow.p;
        }
        if (lasttype && lasttype !== this._POSTAG.UNK) {
            nw.p = lasttype | nw.p;
        }
        return nw;
    }
}
// Named export of the tokenizer class.
exports.ForeignTokenizer = ForeignTokenizer;
// `init` is the class's static `init` (not defined in this class; presumably
// inherited from the base), exported pre-bound so it can be called as a plain function.
exports.init = ForeignTokenizer.init.bind(ForeignTokenizer);
// The default export is the same class.
exports.default = ForeignTokenizer;
265//# sourceMappingURL=ForeignTokenizer.js.map
\No newline at end of file