UNPKG

novel-segment/lib/submod/ForeignTokenizer.js

Version:

9.03 kB · JavaScript · View Raw

'use strict';
Object.defineProperty(exports, "__esModule", { value: true });
exports.init = exports.ForeignTokenizer = void 0;
/**
* 外文字符、数字识别模块
*
* @author 老雷<leizongmin@gmail.com>
*/
const mod_1 = require("../mod");
const debug_1 = require("../util/debug");
class ForeignTokenizer extends mod_1.SubSModuleTokenizer {
  constructor() {
      super(...arguments);
      this.name = 'ForeignTokenizer';
  }
  _cache() {
      super._cache();
      this._TABLE = this.segment.getDict('TABLE');
      let arr = [
          /[\d０-９]+(?:,[\d０-９]+)?(?:\.[\d０-９]+)?/,
          /[\w０-９Ａ-Ｚａ-ｚ\u0100-\u017F\u00A1-\u00FF]+/,
          /[\u0600-\u06FF\u0750-\u077F]+/,
          /[\u0400-\u04FF]+/,
          // https://unicode-table.com/cn/blocks/greek-coptic/
          /[\u0370-\u03FF]+/,
      ];
      this._REGEXP_SPLIT_1 = new RegExp('(' + _join([
          /[\u4E00-\u9FFF]+/,
      ].concat(arr)) + ')', 'iu');
      this._REGEXP_SPLIT_2 = new RegExp('(' + _join(arr) + ')', 'iu');
      function _join(arr) {
          return arr.reduce(function (a, b) {
              if (b instanceof RegExp) {
                  a.push(b.source);
              }
              else {
                  a.push(b);
              }
              return a;
          }, []).join('|');
      }
  }
  /**
   * 对未识别的单词进行分词
   *
   * @param {array} words 单词数组
   * @return {array}
   */
  split(words) {
      //return this._splitUnknow(words, this.splitForeign);
      return this._splitUnknow(words, this.splitForeign2);
      /*
      const POSTAG = this.segment.POSTAG;
54
      let ret = [];
      for (let i = 0, word; word = words[i]; i++)
      {
          if (word.p)
          {
              ret.push(word);
          }
          else
          {
              // 仅对未识别的词进行匹配
              ret = ret.concat(this.splitForeign(word.w));
          }
      }
      return ret;
      */
  }
  /**
   * 支援更多外文判定(但可能會降低效率)
   *
   * 並且避免誤切割 例如 latīna Русский
   */
  splitForeign2(text, cur) {
      const POSTAG = this.segment.POSTAG;
      const TABLE = this._TABLE;
      //console.time('splitForeign2');
      let ret = [];
      let self = this;
      let ls = text
          .split(this._REGEXP_SPLIT_1);
      for (let w of ls) {
          if (w !== '') {
              if (this._REGEXP_SPLIT_2.test(w)) {
                  let cw = TABLE[w];
                  if (cw) {
                      let nw = this.createRawToken({
                          w,
                      }, cw, {
                          [this.name]: 1,
                      });
                      ret.push(nw);
                      continue;
                  }
                  /**
                   * 當分詞不存在於字典中時
                   * 則再度分詞一次
                   */
                  let ls2 = w
                      .split(/([\d+０-９]+)/);
                  for (let w of ls2) {
                      if (w === '') {
                          continue;
                      }
                      let lasttype = 0;
                      let c = w.charCodeAt(0);
                      if (c >= 65296 && c <= 65370)
                          c -= 65248;
                      if (c >= 48 && c <= 57) {
                          lasttype = POSTAG.A_M;
                      } // 字母 lasttype = POSTAG.A_NX
                      else if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122)) {
                          lasttype = POSTAG.A_NX;
                      }
                      else {
                          lasttype = POSTAG.UNK;
                      }
                      if (lasttype === POSTAG.A_NX) {
                          let cw = TABLE[w];
                          if (cw) {
                              let nw = this.createRawToken({
                                  w,
                              }, cw, {
                                  [this.name]: 2,
                              });
                              ret.push(nw);
                              continue;
                          }
                      }
                      ret.push(self.debugToken({
                          w: w,
                          p: lasttype || undefined,
                      }, {
                          [self.name]: 3,
                      }, true));
                  }
              }
              else {
                  ret.push({
                      w,
                  });
              }
          }
      }
      //console.timeEnd('splitForeign2');
      //console.log(ret);
      return ret.length ? ret : undefined;
  }
  /**
   * 匹配包含的英文字符和数字，并分割
   *
   * @param {string} text 文本
   * @param {int} cur 开始位置
   * @return {array}  返回格式   {w: '单词', c: 开始位置}
   */
  splitForeign(text, cur) {
      const POSTAG = this.segment.POSTAG;
      const TABLE = this._TABLE;
      //console.time('splitForeign');
      if (isNaN(cur))
          cur = 0;
      let ret = [];
      // 取第一个字符的ASCII码
      let lastcur = 0;
      let lasttype = 0;
      let c = text.charCodeAt(0);
      // 全角数字或字母
      if (c >= 65296 && c <= 65370)
          c -= 65248;
      // 数字  lasttype = POSTAG.A_M
      if (c >= 48 && c <= 57) {
          lasttype = POSTAG.A_M;
      } // 字母 lasttype = POSTAG.A_NX
      else if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122)) {
          lasttype = POSTAG.A_NX;
      }
      else {
          lasttype = POSTAG.UNK;
      }
      let i;
      for (i = 1; i < text.length; i++) {
          let c = text.charCodeAt(i);
          // 全角数字或字母
          if (c >= 65296 && c <= 65370)
              c -= 65248;
          // 数字  lasttype = POSTAG.A_M
          if (c >= 48 && c <= 57) {
              if (lasttype !== POSTAG.A_M) {
                  let nw = this.createForeignToken({
                      w: text.substr(lastcur, i - lastcur),
                  }, lasttype, {
                      [this.name]: 1,
                  });
                  //let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;
                  //if (lasttype !== POSTAG.UNK) nw.p = lasttype;
                  ret.push(nw);
                  lastcur = i;
              }
              lasttype = POSTAG.A_M;
          }
          else if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122)) {
              // 字母 lasttype = POSTAG.A_NX
              if (lasttype !== POSTAG.A_NX) {
                  //let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;
                  let nw = this.createRawToken({
                      w: text.substr(lastcur, i - lastcur),
                  }, {
                      p: lasttype
                  }, {
                      [this.name]: 2,
                  });
                  //if (lasttype !== POSTAG.UNK) nw.p = lasttype;
                  ret.push(nw);
                  lastcur = i;
              }
              lasttype = POSTAG.A_NX;
          }
          else {
              // 其他
              if (lasttype !== POSTAG.UNK) {
                  let nw = this.createForeignToken({
                      w: text.substr(lastcur, i - lastcur),
                      p: lasttype
                  }, undefined, {
                      [this.name]: 3,
                  });
                  ret.push(nw);
                  lastcur = i;
              }
              lasttype = POSTAG.UNK;
          }
      }
      // 剩余部分
      //let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;
      let nw = this.createRawToken({
          w: text.substr(lastcur, i - lastcur),
      });
      if (lasttype !== POSTAG.UNK)
          nw.p = lasttype;
      ret.push(nw);
      //console.timeEnd('splitForeign');
      //debug(ret);
      return ret;
  }
  createForeignToken(word, lasttype, attr) {
      let nw = this.createToken(word, true, attr);
      let ow = this._TABLE[nw.w];
      if (ow) {
          debug_1.debugToken(nw, {
              _source: ow,
          });
          nw.p = nw.p | ow.p;
      }
      if (lasttype && lasttype !== this._POSTAG.UNK) {
          nw.p = lasttype | nw.p;
      }
      return nw;
  }
}
exports.ForeignTokenizer = ForeignTokenizer;
exports.init = ForeignTokenizer.init.bind(ForeignTokenizer);
exports.default = ForeignTokenizer;
//# sourceMappingURL=ForeignTokenizer.js.map
\No newline at end of file

1	`'use strict';`
2	`Object.defineProperty(exports, "__esModule", { value: true });`
3	`exports.init = exports.ForeignTokenizer = void 0;`
4	`/**`
5	`* 外文字符、数字识别模块`
6	`*`
7	`* @author 老雷<leizongmin@gmail.com>`
8	`*/`
9	`const mod_1 = require("../mod");`
10	`const debug_1 = require("../util/debug");`
11	`class ForeignTokenizer extends mod_1.SubSModuleTokenizer {`
12	`constructor() {`
13	`super(...arguments);`
14	`this.name = 'ForeignTokenizer';`
15	`}`
16	`_cache() {`
17	`super._cache();`
18	`this._TABLE = this.segment.getDict('TABLE');`
19	`let arr = [`
20	`/[\d０-９]+(?:,[\d０-９]+)?(?:\.[\d０-９]+)?/,`
21	`/[\w０-９Ａ-Ｚａ-ｚ\u0100-\u017F\u00A1-\u00FF]+/,`
22	`/[\u0600-\u06FF\u0750-\u077F]+/,`
23	`/[\u0400-\u04FF]+/,`
24	`// https://unicode-table.com/cn/blocks/greek-coptic/`
25	`/[\u0370-\u03FF]+/,`
26	`];`
27	`this._REGEXP_SPLIT_1 = new RegExp('(' + _join([`
28	`/[\u4E00-\u9FFF]+/,`
29	`].concat(arr)) + ')', 'iu');`
30	`this._REGEXP_SPLIT_2 = new RegExp('(' + _join(arr) + ')', 'iu');`
31	`function _join(arr) {`
32	`return arr.reduce(function (a, b) {`
33	`if (b instanceof RegExp) {`
34	`a.push(b.source);`
35	`}`
36	`else {`
37	`a.push(b);`
38	`}`
39	`return a;`
40	`}, []).join('\|');`
41	`}`
42	`}`
43	`/**`
44	`* 对未识别的单词进行分词`
45	`*`
46	`* @param {array} words 单词数组`
47	`* @return {array}`
48	`*/`
49	`split(words) {`
50	`//return this._splitUnknow(words, this.splitForeign);`
51	`return this._splitUnknow(words, this.splitForeign2);`
52	`/*`
53	`const POSTAG = this.segment.POSTAG;`
54
55	`let ret = [];`
56	`for (let i = 0, word; word = words[i]; i++)`
57	`{`
58	`if (word.p)`
59	`{`
60	`ret.push(word);`
61	`}`
62	`else`
63	`{`
64	`// 仅对未识别的词进行匹配`
65	`ret = ret.concat(this.splitForeign(word.w));`
66	`}`
67	`}`
68	`return ret;`
69	`*/`
70	`}`
71	`/**`
72	`* 支援更多外文判定(但可能會降低效率)`
73	`*`
74	`* 並且避免誤切割例如 latīna Русский`
75	`*/`
76	`splitForeign2(text, cur) {`
77	`const POSTAG = this.segment.POSTAG;`
78	`const TABLE = this._TABLE;`
79	`//console.time('splitForeign2');`
80	`let ret = [];`
81	`let self = this;`
82	`let ls = text`
83	`.split(this._REGEXP_SPLIT_1);`
84	`for (let w of ls) {`
85	`if (w !== '') {`
86	`if (this._REGEXP_SPLIT_2.test(w)) {`
87	`let cw = TABLE[w];`
88	`if (cw) {`
89	`let nw = this.createRawToken({`
90	`w,`
91	`}, cw, {`
92	`[this.name]: 1,`
93	`});`
94	`ret.push(nw);`
95	`continue;`
96	`}`
97	`/**`
98	`* 當分詞不存在於字典中時`
99	`* 則再度分詞一次`
100	`*/`
101	`let ls2 = w`
102	`.split(/([\d+０-９]+)/);`
103	`for (let w of ls2) {`
104	`if (w === '') {`
105	`continue;`
106	`}`
107	`let lasttype = 0;`
108	`let c = w.charCodeAt(0);`
109	`if (c >= 65296 && c <= 65370)`
110	`c -= 65248;`
111	`if (c >= 48 && c <= 57) {`
112	`lasttype = POSTAG.A_M;`
113	`} // 字母 lasttype = POSTAG.A_NX`
114	`else if ((c >= 65 && c <= 90) \|\| (c >= 97 && c <= 122)) {`
115	`lasttype = POSTAG.A_NX;`
116	`}`
117	`else {`
118	`lasttype = POSTAG.UNK;`
119	`}`
120	`if (lasttype === POSTAG.A_NX) {`
121	`let cw = TABLE[w];`
122	`if (cw) {`
123	`let nw = this.createRawToken({`
124	`w,`
125	`}, cw, {`
126	`[this.name]: 2,`
127	`});`
128	`ret.push(nw);`
129	`continue;`
130	`}`
131	`}`
132	`ret.push(self.debugToken({`
133	`w: w,`
134	`p: lasttype \|\| undefined,`
135	`}, {`
136	`[self.name]: 3,`
137	`}, true));`
138	`}`
139	`}`
140	`else {`
141	`ret.push({`
142	`w,`
143	`});`
144	`}`
145	`}`
146	`}`
147	`//console.timeEnd('splitForeign2');`
148	`//console.log(ret);`
149	`return ret.length ? ret : undefined;`
150	`}`
151	`/**`
152	`* 匹配包含的英文字符和数字，并分割`
153	`*`
154	`* @param {string} text 文本`
155	`* @param {int} cur 开始位置`
156	`* @return {array} 返回格式 {w: '单词', c: 开始位置}`
157	`*/`
158	`splitForeign(text, cur) {`
159	`const POSTAG = this.segment.POSTAG;`
160	`const TABLE = this._TABLE;`
161	`//console.time('splitForeign');`
162	`if (isNaN(cur))`
163	`cur = 0;`
164	`let ret = [];`
165	`// 取第一个字符的ASCII码`
166	`let lastcur = 0;`
167	`let lasttype = 0;`
168	`let c = text.charCodeAt(0);`
169	`// 全角数字或字母`
170	`if (c >= 65296 && c <= 65370)`
171	`c -= 65248;`
172	`// 数字 lasttype = POSTAG.A_M`
173	`if (c >= 48 && c <= 57) {`
174	`lasttype = POSTAG.A_M;`
175	`} // 字母 lasttype = POSTAG.A_NX`
176	`else if ((c >= 65 && c <= 90) \|\| (c >= 97 && c <= 122)) {`
177	`lasttype = POSTAG.A_NX;`
178	`}`
179	`else {`
180	`lasttype = POSTAG.UNK;`
181	`}`
182	`let i;`
183	`for (i = 1; i < text.length; i++) {`
184	`let c = text.charCodeAt(i);`
185	`// 全角数字或字母`
186	`if (c >= 65296 && c <= 65370)`
187	`c -= 65248;`
188	`// 数字 lasttype = POSTAG.A_M`
189	`if (c >= 48 && c <= 57) {`
190	`if (lasttype !== POSTAG.A_M) {`
191	`let nw = this.createForeignToken({`
192	`w: text.substr(lastcur, i - lastcur),`
193	`}, lasttype, {`
194	`[this.name]: 1,`
195	`});`
196	`//let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;`
197	`//if (lasttype !== POSTAG.UNK) nw.p = lasttype;`
198	`ret.push(nw);`
199	`lastcur = i;`
200	`}`
201	`lasttype = POSTAG.A_M;`
202	`}`
203	`else if ((c >= 65 && c <= 90) \|\| (c >= 97 && c <= 122)) {`
204	`// 字母 lasttype = POSTAG.A_NX`
205	`if (lasttype !== POSTAG.A_NX) {`
206	`//let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;`
207	`let nw = this.createRawToken({`
208	`w: text.substr(lastcur, i - lastcur),`
209	`}, {`
210	`p: lasttype`
211	`}, {`
212	`[this.name]: 2,`
213	`});`
214	`//if (lasttype !== POSTAG.UNK) nw.p = lasttype;`
215	`ret.push(nw);`
216	`lastcur = i;`
217	`}`
218	`lasttype = POSTAG.A_NX;`
219	`}`
220	`else {`
221	`// 其他`
222	`if (lasttype !== POSTAG.UNK) {`
223	`let nw = this.createForeignToken({`
224	`w: text.substr(lastcur, i - lastcur),`
225	`p: lasttype`
226	`}, undefined, {`
227	`[this.name]: 3,`
228	`});`
229	`ret.push(nw);`
230	`lastcur = i;`
231	`}`
232	`lasttype = POSTAG.UNK;`
233	`}`
234	`}`
235	`// 剩余部分`
236	`//let nw = { w: text.substr(lastcur, i - lastcur) } as IWord;`
237	`let nw = this.createRawToken({`
238	`w: text.substr(lastcur, i - lastcur),`
239	`});`
240	`if (lasttype !== POSTAG.UNK)`
241	`nw.p = lasttype;`
242	`ret.push(nw);`
243	`//console.timeEnd('splitForeign');`
244	`//debug(ret);`
245	`return ret;`
246	`}`
247	`createForeignToken(word, lasttype, attr) {`
248	`let nw = this.createToken(word, true, attr);`
249	`let ow = this._TABLE[nw.w];`
250	`if (ow) {`
251	`debug_1.debugToken(nw, {`
252	`_source: ow,`
253	`});`
254	`nw.p = nw.p \| ow.p;`
255	`}`
256	`if (lasttype && lasttype !== this._POSTAG.UNK) {`
257	`nw.p = lasttype \| nw.p;`
258	`}`
259	`return nw;`
260	`}`
261	`}`
262	`exports.ForeignTokenizer = ForeignTokenizer;`
263	`exports.init = ForeignTokenizer.init.bind(ForeignTokenizer);`
264	`exports.default = ForeignTokenizer;`
265	`//# sourceMappingURL=ForeignTokenizer.js.map`
\	No newline at end of file