// Retrieved via UNPKG (3.31 kB, JavaScript, raw view).
'use strict';
// Mark the module as a transpiled ES module and pre-declare its named exports;
// the real values are assigned at the bottom of the file.
Object.defineProperty(exports, "__esModule", { value: true });
exports.init = exports.WildcardTokenizer = void 0;
/**
 * Wildcard recognition module.
 *
 * @author 老雷<leizongmin@gmail.com>
 */
const mod_1 = require("../mod");
/**
 * Tokenizer sub-module that recognises wildcard words inside text that the
 * earlier stages left unidentified.
 *
 * It reads two dictionaries from `this.segment`:
 *  - `WILDCARD`:  lower-cased word -> entry object whose `p` field is copied
 *                 onto the produced token (presumably a part-of-speech tag;
 *                 TODO confirm against the dictionary loader).
 *  - `WILDCARD2`: word length -> bucket object keyed by lower-cased word.
 */
class WildcardTokenizer extends mod_1.SubSModuleTokenizer {
    constructor(...args) {
        super(...args);
        this.name = 'WildcardTokenizer';
    }

    /**
     * Refresh the cached dictionary references after the base class has
     * refreshed its own caches.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('WILDCARD');
        this._TABLE2 = this.segment.getDict('WILDCARD2');
    }

    /**
     * Segment the words that have not been recognised yet.
     *
     * @param {array} words Array of word tokens.
     * @return {array}
     */
    split(words) {
        // NOTE(review): the method is handed over unbound; `_splitUnknow` is
        // presumably responsible for invoking it with the tokenizer as `this`.
        return this._splitUnknow(words, this.splitWildcard);
    }

    /**
     * Build a token for a recognised wildcard word.
     *
     * @param {object} word Token data (`{w, p}`).
     * @param {*} lasttype Unused; kept for interface compatibility.
     * @param {*} attr Forwarded to `createToken` as its third argument.
     * @return {object} The value returned by `createToken`.
     */
    createWildcardToken(word, lasttype, attr) {
        return this.createToken(word, true, attr);
    }

    /**
     * Split `text` into recognised wildcard tokens and the plain text
     * fragments that lie between them.
     *
     * @param {string} text Text to split.
     * @param {int} cur Unused; kept for interface compatibility.
     * @return {array|undefined} Ordered list of tokens, or `undefined` when
     *   no wildcard word occurs in `text`.
     */
    splitWildcard(text, cur) {
        const TABLE = this._TABLE;
        const wordinfo = this.matchWord(text);
        if (!wordinfo.length) {
            return undefined;
        }
        const ret = [];
        // Index just past the previously emitted word.
        let lastc = 0;
        for (const bw of wordinfo) {
            // Emit the unrecognised text preceding this match, if any.
            if (bw.c > lastc) {
                ret.push({
                    w: text.slice(lastc, bw.c),
                });
            }
            // A word can be listed in WILDCARD2 without a matching WILDCARD
            // entry; fall back to an undefined `p` instead of throwing, and
            // ignore keys that only exist on the prototype chain.
            const key = bw.w.toLowerCase();
            const entry = TABLE != null && Object.prototype.hasOwnProperty.call(TABLE, key)
                ? TABLE[key]
                : undefined;
            ret.push(this.createWildcardToken({
                w: bw.w,
                p: entry ? entry.p : undefined,
            }));
            lastc = bw.c + bw.w.length;
        }
        // Emit whatever follows the last match.
        if (lastc < text.length) {
            ret.push({
                w: text.slice(lastc),
            });
        }
        return ret;
    }

    /**
     * Scan `text` for dictionary words and return their positions.
     *
     * At every position the longest word found in the length-bucketed
     * `WILDCARD2` dictionary wins; matching is case-insensitive, but the
     * returned word keeps the casing of `text`.
     *
     * @param {string} text Text to scan.
     * @param {int} cur Start position; anything that is not a number is
     *   treated as 0.
     * @return {array} Items of the form `{w: 'word', c: startPosition}`.
     */
    matchWord(text, cur) {
        const TABLE = this._TABLE2;
        // Coerce once so a numeric string cannot turn `pos += n` into string
        // concatenation; non-numeric input (including undefined) becomes 0.
        const start = Number(cur);
        let pos = Number.isNaN(start) ? 0 : start;
        const ret = [];
        const lowertext = text.toLowerCase();

        // Resolve the usable length buckets once. Non-positive or non-integer
        // lengths are dropped: they could produce an empty match, which would
        // never advance `pos`.
        const buckets = [];
        for (const key of (TABLE ? Object.keys(TABLE) : [])) {
            const len = Number(key);
            const words = TABLE[key];
            if (Number.isInteger(len) && len > 0 && words != null) {
                buckets.push({ len, words });
            }
        }

        while (pos < text.length) {
            let stopword = null;
            for (const { len, words } of buckets) {
                const candidate = lowertext.slice(pos, pos + len);
                // Own-property check so input such as "constructor" does not
                // match members inherited from Object.prototype.
                if (!Object.prototype.hasOwnProperty.call(words, candidate)) {
                    continue;
                }
                const w = text.slice(pos, pos + len);
                // Keep the longest candidate explicitly rather than relying
                // on key enumeration order.
                if (stopword === null || w.length >= stopword.w.length) {
                    stopword = { w, c: pos };
                }
            }
            if (stopword !== null) {
                ret.push(stopword);
                pos += stopword.w.length;
            }
            else {
                pos++;
            }
        }
        return ret;
    }
}
exports.WildcardTokenizer = WildcardTokenizer;
// `init` is not defined in this file; presumably it is a static inherited from
// the base tokenizer. It is bound so it keeps working when called detached
// from the class.
exports.init = WildcardTokenizer.init.bind(WildcardTokenizer);
exports.default = WildcardTokenizer;
//# sourceMappingURL=WildcardTokenizer.js.map