// Flag this CommonJS module as transpiled from an ES module so that
// interop helpers treat `exports.default` as the default export.
Object.defineProperty(exports, "__esModule", { value: true });
// Create the named export slots up front (as undefined); the real values
// are assigned at the bottom of the file once the class has been defined.
exports.ForeignTokenizer = void 0;
exports.init = void 0;
|
4 | /**
|
5 | * 外文字符、数字识别模块
|
6 | *
|
7 | * @author 老雷<leizongmin@gmail.com>
|
8 | */
|
9 | const mod_1 = require("../mod");
|
10 | const debug_1 = require("../util/debug");
|
/**
 * Tokenizer sub-module that splits not-yet-recognised text into runs of
 * foreign-script characters (Latin, Arabic, Cyrillic, Greek), numbers and
 * everything else, tagging digit and ASCII-letter runs with a part of speech.
 *
 * Full-width digits and letters (U+FF10..U+FF5A) are treated like their
 * ASCII counterparts throughout.
 */
class ForeignTokenizer extends mod_1.SubSModuleTokenizer {
    constructor(...args) {
        super(...args);
        // Also used as the key of the debug attribute attached to created tokens.
        this.name = 'ForeignTokenizer';
    }

    /**
     * Caches the `TABLE` dictionary and builds the two splitting regexes.
     *
     * `_REGEXP_SPLIT_1` cuts a text into CJK-ideograph runs, number runs and
     * foreign-script runs; `_REGEXP_SPLIT_2` is the same without the CJK
     * alternative and is used to decide whether a piece is foreign/numeric.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('TABLE');

        const foreignPatterns = [
            // Number: digits, optionally one ",digits" group and one ".digits" part.
            /[\d\uFF10-\uFF19]+(?:,[\d\uFF10-\uFF19]+)?(?:\.[\d\uFF10-\uFF19]+)?/,
            // Word characters, full-width digits/letters, Latin-1 Supplement
            // (U+00A1..U+00FF) and Latin Extended-A (U+0100..U+017F).
            /[\w\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\u0100-\u017F\u00A1-\u00FF]+/,
            // Arabic and Arabic Supplement.
            /[\u0600-\u06FF\u0750-\u077F]+/,
            // Cyrillic.
            /[\u0400-\u04FF]+/,
            // Greek and Coptic: https://unicode-table.com/cn/blocks/greek-coptic/
            /[\u0370-\u03FF]+/,
        ];

        // Join regexes (or plain pattern strings) into one alternation source.
        const joinSources = (patterns) => patterns
            .map((pattern) => (pattern instanceof RegExp ? pattern.source : pattern))
            .join('|');

        // The single capture group makes String#split keep the matched runs.
        this._REGEXP_SPLIT_1 = new RegExp(
            `(${joinSources([/[\u4E00-\u9FFF]+/].concat(foreignPatterns))})`,
            'iu',
        );
        this._REGEXP_SPLIT_2 = new RegExp(`(${joinSources(foreignPatterns)})`, 'iu');
    }

    /**
     * Looks up a word in the cached `TABLE` dictionary.
     *
     * Only own properties count, so words such as "constructor", "toString"
     * or "__proto__" are not mistaken for dictionary entries through
     * `Object.prototype`.
     *
     * @param {string} word
     * @return {object|undefined} the dictionary entry, or undefined when absent
     */
    _lookupTable(word) {
        const table = this._TABLE;
        return Object.prototype.hasOwnProperty.call(table, word) ? table[word] : undefined;
    }

    /**
     * Classifies a UTF-16 code unit as an ASCII digit, an ASCII letter or
     * anything else, after folding full-width forms onto ASCII.
     *
     * @param {number} code result of `String#charCodeAt` (NaN for an empty string)
     * @return {'digit'|'letter'|'other'}
     */
    _charKind(code) {
        // U+FF10..U+FF5A sits exactly 0xFEE0 above the ASCII range '0'..'z'.
        const ascii = (code >= 0xFF10 && code <= 0xFF5A) ? code - 0xFEE0 : code;
        if (ascii >= 48 && ascii <= 57) {
            return 'digit';
        }
        if ((ascii >= 65 && ascii <= 90) || (ascii >= 97 && ascii <= 122)) {
            return 'letter';
        }
        return 'other';
    }

    /**
     * Tokenizes the words that have not been recognised yet.
     *
     * @param {array} words array of word tokens
     * @return {array}
     */
    split(words) {
        // NOTE(review): `splitForeign2` is passed unbound; the inherited
        // `_splitUnknow` presumably invokes it with this tokenizer as `this`
        // -- confirm against the base class.
        return this._splitUnknow(words, this.splitForeign2);
    }

    /**
     * Splits a text into foreign-script / numeric tokens and plain remainders.
     *
     * Recognises more scripts than `splitForeign` (possibly at some cost in
     * speed) and avoids wrongly cutting words such as "latīna" or "Русский".
     *
     * @param {string} text text to split
     * @param {number} [cur] accepted for signature compatibility; not used here
     * @return {array|undefined} the tokens, or undefined when nothing was produced
     */
    splitForeign2(text, cur) {
        const POSTAG = this.segment.POSTAG;
        const tokens = [];
        // Runs of digits; the "+" inside the class is a literal plus sign.
        const digitRun = /([\d+\uFF10-\uFF19]+)/;

        for (const piece of text.split(this._REGEXP_SPLIT_1)) {
            if (piece === '') {
                continue;
            }
            if (!this._REGEXP_SPLIT_2.test(piece)) {
                // Neither foreign nor numeric (e.g. a CJK run): pass through untagged.
                tokens.push({ w: piece });
                continue;
            }

            const pieceEntry = this._lookupTable(piece);
            if (pieceEntry) {
                tokens.push(this.createRawToken({ w: piece }, pieceEntry, { [this.name]: 1 }));
                continue;
            }

            // The whole piece is not in the dictionary: split it once more,
            // separating digit runs from the rest.
            for (const part of piece.split(digitRun)) {
                if (part === '') {
                    continue;
                }

                // The first character alone decides the part-of-speech tag.
                const kind = this._charKind(part.charCodeAt(0));
                let type = POSTAG.UNK;
                if (kind === 'digit') {
                    type = POSTAG.A_M;
                }
                else if (kind === 'letter') {
                    type = POSTAG.A_NX;
                }

                if (type === POSTAG.A_NX) {
                    const partEntry = this._lookupTable(part);
                    if (partEntry) {
                        tokens.push(this.createRawToken({ w: part }, partEntry, { [this.name]: 2 }));
                        continue;
                    }
                }

                tokens.push(this.debugToken({
                    w: part,
                    // A falsy tag is stored as undefined rather than as its value.
                    p: type || undefined,
                }, {
                    [this.name]: 3,
                }, true));
            }
        }

        return tokens.length ? tokens : undefined;
    }

    /**
     * Splits a text into maximal runs of digits, ASCII letters and other
     * characters (full-width digits/letters count as their ASCII forms).
     *
     * @param {string} text text to split
     * @param {number} [cur] start position; accepted for signature compatibility, not used
     * @return {array} one token per run, in text order
     */
    splitForeign(text, cur) {
        const POSTAG = this.segment.POSTAG;
        const tokens = [];

        // Start of the current run and the tag of the run being scanned.
        let runStart = 0;
        let runType = POSTAG.UNK;
        const firstKind = this._charKind(text.charCodeAt(0));
        if (firstKind === 'digit') {
            runType = POSTAG.A_M;
        }
        else if (firstKind === 'letter') {
            runType = POSTAG.A_NX;
        }

        for (let i = 1; i < text.length; i++) {
            const kind = this._charKind(text.charCodeAt(i));
            if (kind === 'digit') {
                if (runType !== POSTAG.A_M) {
                    // A digit run starts here: close the previous run.
                    tokens.push(this.createForeignToken({
                        w: text.slice(runStart, i),
                    }, runType, {
                        [this.name]: 1,
                    }));
                    runStart = i;
                }
                runType = POSTAG.A_M;
            }
            else if (kind === 'letter') {
                if (runType !== POSTAG.A_NX) {
                    // A letter run starts here: close the previous run.
                    tokens.push(this.createRawToken({
                        w: text.slice(runStart, i),
                    }, {
                        p: runType,
                    }, {
                        [this.name]: 2,
                    }));
                    runStart = i;
                }
                runType = POSTAG.A_NX;
            }
            else {
                if (runType !== POSTAG.UNK) {
                    // A run of other characters starts here: close the previous run.
                    tokens.push(this.createForeignToken({
                        w: text.slice(runStart, i),
                        p: runType,
                    }, undefined, {
                        [this.name]: 3,
                    }));
                    runStart = i;
                }
                runType = POSTAG.UNK;
            }
        }

        // Remaining tail of the text.
        const tail = this.createRawToken({
            w: text.slice(runStart),
        });
        if (runType !== POSTAG.UNK) {
            tail.p = runType;
        }
        tokens.push(tail);

        return tokens;
    }

    /**
     * Creates a token and merges in part-of-speech bits from the dictionary
     * entry for the same word (if any) and from `lasttype`.
     *
     * @param {object} word token seed, at least `{ w }`
     * @param {number} [lasttype] tag to OR in; ignored when falsy or equal to `UNK`
     * @param {object} [attr] debug attributes forwarded to `createToken`
     * @return {object} the created token
     */
    createForeignToken(word, lasttype, attr) {
        const nw = this.createToken(word, true, attr);
        const ow = this._lookupTable(nw.w);
        if (ow) {
            debug_1.debugToken(nw, {
                _source: ow,
            });
            nw.p = nw.p | ow.p;
        }
        if (lasttype && lasttype !== this._POSTAG.UNK) {
            nw.p = lasttype | nw.p;
        }
        return nw;
    }
}
|
// Public API: the class itself (named and default export) plus its static
// `init` factory, bound so it can be called without the class as receiver.
exports.ForeignTokenizer = ForeignTokenizer;
const boundInit = ForeignTokenizer.init.bind(ForeignTokenizer);
exports.init = boundInit;
exports.default = ForeignTokenizer;
|
265 | //# sourceMappingURL=ForeignTokenizer.js.map |
\ | No newline at end of file |