"use strict";
// CommonJS interop prelude emitted for a transpiled ES module: flag the
// module as such, then pre-declare the named exports. Their real values are
// assigned after the class definition further down in this file.
Object.defineProperty(exports, "__esModule", { value: true });
exports.WildcardTokenizer = undefined;
exports.init = undefined;
|
4 | /**
|
5 | * 通配符识别模块
|
6 | *
|
7 | * @author 老雷<leizongmin@gmail.com>
|
8 | */
|
9 | const mod_1 = require("../mod");
|
/**
 * Tokenizer that looks for wildcard dictionary entries inside words.
 *
 * Two dictionaries are read from the segment:
 *  - `WILDCARD`:  lower-cased word -> entry; only the entry's `p` is used here.
 *  - `WILDCARD2`: bucket key -> { lower-cased word: ... }. The bucket key is
 *    used as the candidate substring length when scanning text.
 */
class WildcardTokenizer extends mod_1.SubSModuleTokenizer {
    constructor() {
        super(...arguments);
        this.name = 'WildcardTokenizer';
    }

    /**
     * Runs the parent cache step, then stores references to the two wildcard
     * dictionaries on the instance.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('WILDCARD');
        this._TABLE2 = this.segment.getDict('WILDCARD2');
    }

    /**
     * Split the given words using the wildcard dictionaries.
     *
     * @param {array} words word array
     * @return {array} whatever the inherited `_splitUnknow` returns
     */
    split(words) {
        // The callback is handed over unbound although it reads `this._TABLE`;
        // presumably `_splitUnknow` invokes it with this tokenizer as `this`
        // -- TODO confirm against the base class.
        return this._splitUnknow(words, this.splitWildcard);
    }

    /**
     * Wrap a matched word into a token through the inherited `createToken`.
     *
     * @param {object} word     word object, e.g. `{w, p}`
     * @param {number} [lasttype] accepted for signature compatibility; unused
     * @param {object} [attr]   forwarded unchanged to `createToken`
     * @return {object} the value returned by `createToken(word, true, attr)`
     */
    createWildcardToken(word, lasttype, attr) {
        return this.createToken(word, true, attr);
    }

    /**
     * Cut `text` into wildcard tokens and the plain fragments between them.
     *
     * @param {string} text  text to split
     * @param {number} [cur] accepted for signature compatibility; unused, the
     *                       scan always starts at the beginning of `text`
     * @return {array|undefined} fragments in text order, or `undefined` when
     *                           `text` contains no wildcard
     */
    splitWildcard(text, cur) {
        const TABLE = this._TABLE;
        const matches = this.matchWord(text);
        if (!matches.length) {
            return undefined;
        }

        const ret = [];
        let consumed = 0; // index just past the last emitted fragment
        for (const match of matches) {
            if (match.c > consumed) {
                // Unmatched text sitting in front of this wildcard.
                ret.push({ w: text.slice(consumed, match.c) });
            }
            // Assumes every word found through WILDCARD2 also has an entry in
            // WILDCARD; a missing entry makes the `.p` read throw a TypeError.
            ret.push(this.createWildcardToken({
                w: match.w,
                p: TABLE[match.w.toLowerCase()].p,
            }));
            consumed = match.c + match.w.length;
        }
        if (consumed < text.length) {
            // Unmatched tail after the last wildcard.
            ret.push({ w: text.slice(consumed) });
        }
        return ret;
    }

    /**
     * Scan `text` for dictionary words, preferring the longest candidate at
     * each position.
     *
     * @param {string} text  text to scan
     * @param {number} [cur] start position; anything that is not a positive
     *                       number starts the scan at 0
     * @return {array} matches in text order, each `{w: matched text, c: start index}`
     */
    matchWord(text, cur) {
        const TABLE = this._TABLE2;
        const hasOwn = Object.prototype.hasOwnProperty;

        // Collect the length buckets once, longest first, so "longest match
        // wins" is explicit instead of relying on property enumeration order.
        // Keys that are not positive integers are skipped: a zero-length match
        // would never advance the scan.
        const buckets = [];
        for (const key in TABLE) {
            const size = Number(key);
            if (hasOwn.call(TABLE, key) && Number.isInteger(size) && size > 0) {
                buckets.push({ size, words: TABLE[key] });
            }
        }
        buckets.sort((a, b) => b.size - a.size);

        const lowertext = text.toLowerCase();
        const ret = [];
        let pos = typeof cur === 'number' && cur > 0 ? cur : 0;
        while (pos < text.length) {
            // Own-property test only: the `in` operator would also accept
            // inherited keys such as "constructor" on a plain-object bucket.
            const hit = buckets.find((bucket) =>
                hasOwn.call(bucket.words, lowertext.slice(pos, pos + bucket.size)));
            if (hit === undefined) {
                pos++;
                continue;
            }
            const word = text.slice(pos, pos + hit.size);
            ret.push({ w: word, c: pos });
            pos += word.length;
        }
        return ret;
    }
}
|
// Publish the tokenizer as a named export and as the default export, along
// with `init`: the class's own `init` (not defined in this class body, so
// presumably inherited from the base class) bound to the class itself.
exports.WildcardTokenizer = WildcardTokenizer;
const boundInit = WildcardTokenizer.init.bind(WildcardTokenizer);
exports.init = boundInit;
exports.default = WildcardTokenizer;
|
106 | //# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiV2lsZGNhcmRUb2tlbml6ZXIuanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyJXaWxkY2FyZFRva2VuaXplci50cyJdLCJuYW1lcyI6W10sIm1hcHBpbmdzIjoiQUFBQSxZQUFZLENBQUM7OztBQUViOzs7O0dBSUc7QUFDSCxnQ0FBOEU7QUFPOUUsTUFBYSxpQkFBa0IsU0FBUSx5QkFBbUI7SUFBMUQ7O1FBR0MsU0FBSSxHQUFHLG1CQUFtQixDQUFDO0lBMkg1QixDQUFDO0lBdEhBLE1BQU07UUFFTCxLQUFLLENBQUMsTUFBTSxFQUFFLENBQUM7UUFDZixJQUFJLENBQUMsTUFBTSxHQUFHLElBQUksQ0FBQyxPQUFPLENBQUMsT0FBTyxDQUFDLFVBQVUsQ0FBQyxDQUFDO1FBQy9DLElBQUksQ0FBQyxPQUFPLEdBQUcsSUFBSSxDQUFDLE9BQU8sQ0FBQyxPQUFPLENBQUMsV0FBVyxDQUFDLENBQUM7SUFDbEQsQ0FBQztJQUVEOzs7OztPQUtHO0lBQ0gsS0FBSyxDQUFDLEtBQWM7UUFFbkIscURBQXFEO1FBQ3JELE9BQU8sSUFBSSxDQUFDLFlBQVksQ0FBQyxLQUFLLEVBQUUsSUFBSSxDQUFDLGFBQWEsQ0FBQyxDQUFDO0lBQ3JELENBQUM7SUFFRCxtQkFBbUIsQ0FBQyxJQUFXLEVBQUUsUUFBaUIsRUFBRSxJQUFxQjtRQUV4RSxJQUFJLEVBQUUsR0FBRyxJQUFJLENBQUMsV0FBVyxDQUFRLElBQUksRUFBRSxJQUFJLEVBQUUsSUFBSSxDQUFDLENBQUM7UUFFbkQsT0FBTyxFQUFFLENBQUM7SUFDWCxDQUFDO0lBRUQsYUFBYSxDQUFDLElBQVksRUFBRSxHQUFZO1FBRXZDLDhCQUE4QjtRQUM5QixNQUFNLEtBQUssR0FBRyxJQUFJLENBQUMsTUFBTSxDQUFDO1FBRTFCLElBQUksR0FBRyxHQUFZLEVBQUUsQ0FBQztRQUN0QixJQUFJLElBQUksR0FBRyxJQUFJLENBQUM7UUFFaEIsWUFBWTtRQUNaLElBQUksUUFBUSxHQUFHLElBQUksQ0FBQyxTQUFTLENBQUMsSUFBSSxDQUFDLENBQUM7UUFDcEMsSUFBSSxRQUFRLENBQUMsTUFBTSxFQUNuQjtZQUNDLElBQUksS0FBSyxHQUFHLENBQUMsQ0FBQztZQUNkLEtBQUssSUFBSSxFQUFFLEdBQUcsQ0FBQyxFQUFFLEVBQUUsRUFBRSxFQUFFLEdBQUcsUUFBUSxDQUFDLEVBQUUsQ0FBQyxFQUFFLEVBQUUsRUFBRSxFQUM1QztnQkFDQyxJQUFJLEVBQUUsQ0FBQyxDQUFDLEdBQUcsS0FBSyxFQUNoQjtvQkFDQyxHQUFHLENBQUMsSUFBSSxDQUFDO3dCQUNSLENBQUMsRUFBRSxJQUFJLENBQUMsTUFBTSxDQUFDLEtBQUssRUFBRSxFQUFFLENBQUMsQ0FBQyxHQUFHLEtBQUssQ0FBQztxQkFDbkMsQ0FBQyxDQUFDO2lCQUNIO2dCQUVELElBQUksRUFBRSxHQUFHLElBQUksQ0FBQyxtQkFBbUIsQ0FBQztvQkFDakMsQ0FBQyxFQUFFLEVBQUUsQ0FBQyxDQUFDO29CQUNQLENBQUMsRUFBRSxLQUFLLENBQUMsRUFBRSxDQUFDLENBQUMsQ0FBQyxXQUFXLEVBQUUsQ0FBQyxDQUFDLENBQUM7aUJBQzlCLENBQUMsQ0FBQztnQkFFSCxHQUFHLENBQUMsSUFBSSxDQUFDLEVBQUUsQ0FBQyxDQUFDO2dCQUViLEtBQUssR0FB
RyxFQUFFLENBQUMsQ0FBQyxHQUFHLEVBQUUsQ0FBQyxDQUFDLENBQUMsTUFBTSxDQUFDO2FBQzNCO1lBRUQsSUFBSSxRQUFRLEdBQUcsUUFBUSxDQUFDLFFBQVEsQ0FBQyxNQUFNLEdBQUcsQ0FBQyxDQUFDLENBQUM7WUFDN0MsSUFBSSxRQUFRLENBQUMsQ0FBQyxHQUFHLFFBQVEsQ0FBQyxDQUFDLENBQUMsTUFBTSxHQUFHLElBQUksQ0FBQyxNQUFNLEVBQ2hEO2dCQUNDLEdBQUcsQ0FBQyxJQUFJLENBQUM7b0JBQ1IsQ0FBQyxFQUFFLElBQUksQ0FBQyxNQUFNLENBQUMsUUFBUSxDQUFDLENBQUMsR0FBRyxRQUFRLENBQUMsQ0FBQyxDQUFDLE1BQU0sQ0FBQztpQkFDOUMsQ0FBQyxDQUFDO2FBQ0g7U0FDRDtRQUVELE9BQU8sR0FBRyxDQUFDLE1BQU0sQ0FBQyxDQUFDLENBQUMsR0FBRyxDQUFDLENBQUMsQ0FBQyxTQUFTLENBQUM7SUFDckMsQ0FBQztJQUVEOzs7Ozs7T0FNRztJQUNILFNBQVMsQ0FBQyxJQUFZLEVBQUUsR0FBWTtRQUVuQyw4QkFBOEI7UUFDOUIsTUFBTSxLQUFLLEdBQUcsSUFBSSxDQUFDLE9BQU8sQ0FBQztRQUUzQixJQUFJLEtBQUssQ0FBQyxHQUFHLENBQUM7WUFBRSxHQUFHLEdBQUcsQ0FBQyxDQUFDO1FBRXhCLElBQUksR0FBRyxHQUFZLEVBQUUsQ0FBQztRQUN0QixrQkFBa0I7UUFFbEIsSUFBSSxDQUFDLEdBQUcsS0FBSyxDQUFDO1FBRWQscUJBQXFCO1FBQ3JCLElBQUksU0FBUyxHQUFHLElBQUksQ0FBQyxXQUFXLEVBQUUsQ0FBQztRQUVuQyxPQUFPLEdBQUcsR0FBRyxJQUFJLENBQUMsTUFBTSxFQUN4QjtZQUNDLElBQUksUUFBUSxHQUFVLElBQUksQ0FBQztZQUMzQixLQUFLLElBQUksQ0FBQyxJQUFJLEtBQUssRUFDbkI7Z0JBQ0MsSUFBSSxTQUFTLENBQUMsTUFBTSxDQUFDLEdBQUcsRUFBRSxDQUFRLENBQUMsSUFBSSxLQUFLLENBQUMsQ0FBQyxDQUFDLEVBQy9DO29CQUNDLFFBQVEsR0FBRzt3QkFDVixDQUFDLEVBQUUsSUFBSSxDQUFDLE1BQU0sQ0FBQyxHQUFHLEVBQUUsQ0FBUSxDQUFDO3dCQUM3QixDQUFDLEVBQUUsR0FBRztxQkFDTixDQUFDO2lCQUNGO2FBQ0Q7WUFDRCxJQUFJLFFBQVEsS0FBSyxJQUFJLEVBQ3JCO2dCQUNDLEdBQUcsQ0FBQyxJQUFJLENBQUMsUUFBUSxDQUFDLENBQUM7Z0JBQ25CLEdBQUcsSUFBSSxRQUFRLENBQUMsQ0FBQyxDQUFDLE1BQU0sQ0FBQzthQUN6QjtpQkFFRDtnQkFDQyxHQUFHLEVBQUUsQ0FBQzthQUNOO1NBQ0Q7UUFDRCxPQUFPLEdBQUcsQ0FBQztJQUNaLENBQUM7Q0FFRDtBQTlIRCw4Q0E4SEM7QUFFWSxRQUFBLElBQUksR0FBRyxpQkFBaUIsQ0FBQyxJQUFJLENBQUMsSUFBSSxDQUFDLGlCQUFpQixDQUEyQyxDQUFDO0FBRTdHLGtCQUFlLGlCQUFpQixDQUFDIiwic291cmNlc0NvbnRlbnQiOlsiJ3VzZSBzdHJpY3QnO1xuXG4vKipcbiAqIOmAmumFjeespuivhuWIq+aooeWdl1xuICpcbiAqIEBhdXRob3Ig6ICB6Zu3PGxlaXpvbmdtaW5AZ21haWwuY29tPlxuICovXG5pbXBvcnQgeyBTdWJTTW9kdWxlLCBTdWJTTW9kdWxlVG9rZW5pemVyLCBJU3ViVG9rZW5pemVyQ3Jl
YXRlIH0gZnJvbSAnLi4vbW9kJztcbmltcG9ydCB7IFNlZ21lbnQsIElXb3JkLCBJRElDVCwgSURJQ1QyIH0gZnJvbSAnLi4vU2VnbWVudCc7XG5pbXBvcnQgeyBkZWJ1Z1Rva2VuIH0gZnJvbSAnLi4vdXRpbC9kZWJ1Zyc7XG5pbXBvcnQgVVN0cmluZyBmcm9tICd1bmktc3RyaW5nJztcbmltcG9ydCB7IGRlYnVnIH0gZnJvbSAnLi4vdXRpbCc7XG5pbXBvcnQgeyBJV29yZERlYnVnSW5mbyB9IGZyb20gJy4uL3V0aWwvaW5kZXgnO1xuXG5leHBvcnQgY2xhc3MgV2lsZGNhcmRUb2tlbml6ZXIgZXh0ZW5kcyBTdWJTTW9kdWxlVG9rZW5pemVyXG57XG5cblx0bmFtZSA9ICdXaWxkY2FyZFRva2VuaXplcic7XG5cblx0cHJvdGVjdGVkIF9UQUJMRTogSURJQ1Q8SVdvcmQ+O1xuXHRwcm90ZWN0ZWQgX1RBQkxFMjogSURJQ1QyPElXb3JkPjtcblxuXHRfY2FjaGUoKVxuXHR7XG5cdFx0c3VwZXIuX2NhY2hlKCk7XG5cdFx0dGhpcy5fVEFCTEUgPSB0aGlzLnNlZ21lbnQuZ2V0RGljdCgnV0lMRENBUkQnKTtcblx0XHR0aGlzLl9UQUJMRTIgPSB0aGlzLnNlZ21lbnQuZ2V0RGljdCgnV0lMRENBUkQyJyk7XG5cdH1cblxuXHQvKipcblx0ICog5a+55pyq6K+G5Yir55qE5Y2V6K+N6L+b6KGM5YiG6K+NXG5cdCAqXG5cdCAqIEBwYXJhbSB7YXJyYXl9IHdvcmRzIOWNleivjeaVsOe7hFxuXHQgKiBAcmV0dXJuIHthcnJheX1cblx0ICovXG5cdHNwbGl0KHdvcmRzOiBJV29yZFtdKTogSVdvcmRbXVxuXHR7XG5cdFx0Ly9yZXR1cm4gdGhpcy5fc3BsaXRVbmtub3cod29yZHMsIHRoaXMuc3BsaXRGb3JlaWduKTtcblx0XHRyZXR1cm4gdGhpcy5fc3BsaXRVbmtub3cod29yZHMsIHRoaXMuc3BsaXRXaWxkY2FyZCk7XG5cdH1cblxuXHRjcmVhdGVXaWxkY2FyZFRva2VuKHdvcmQ6IElXb3JkLCBsYXN0dHlwZT86IG51bWJlciwgYXR0cj86IElXb3JkRGVidWdJbmZvKVxuXHR7XG5cdFx0bGV0IG53ID0gdGhpcy5jcmVhdGVUb2tlbjxJV29yZD4od29yZCwgdHJ1ZSwgYXR0cik7XG5cblx0XHRyZXR1cm4gbnc7XG5cdH1cblxuXHRzcGxpdFdpbGRjYXJkKHRleHQ6IHN0cmluZywgY3VyPzogbnVtYmVyKTogSVdvcmRbXVxuXHR7XG5cdFx0Ly9jb25zdCBQT1NUQUcgPSB0aGlzLl9QT1NUQUc7XG5cdFx0Y29uc3QgVEFCTEUgPSB0aGlzLl9UQUJMRTtcblxuXHRcdGxldCByZXQ6IElXb3JkW10gPSBbXTtcblx0XHRsZXQgc2VsZiA9IHRoaXM7XG5cblx0XHQvLyDliIbnprvlh7rlt7Lor4bliKvnmoTljZXor41cblx0XHRsZXQgd29yZGluZm8gPSBzZWxmLm1hdGNoV29yZCh0ZXh0KTtcblx0XHRpZiAod29yZGluZm8ubGVuZ3RoKVxuXHRcdHtcblx0XHRcdGxldCBsYXN0YyA9IDA7XG5cdFx0XHRmb3IgKGxldCB1aSA9IDAsIGJ3OyBidyA9IHdvcmRpbmZvW3VpXTsgdWkrKylcblx0XHRcdHtcblx0XHRcdFx0aWYgKGJ3LmMgPiBsYXN0Yylcblx0XHRcdFx0e1xuXHRcdFx0XHRcdHJldC5wdXNoKHtcblx0XHRcdFx0XHRcdHc6IHRleHQuc3Vic3RyKGxhc3RjLCBidy5jIC0g
bGFzdGMpLFxuXHRcdFx0XHRcdH0pO1xuXHRcdFx0XHR9XG5cblx0XHRcdFx0bGV0IG53ID0gc2VsZi5jcmVhdGVXaWxkY2FyZFRva2VuKHtcblx0XHRcdFx0XHR3OiBidy53LFxuXHRcdFx0XHRcdHA6IFRBQkxFW2J3LncudG9Mb3dlckNhc2UoKV0ucCxcblx0XHRcdFx0fSk7XG5cblx0XHRcdFx0cmV0LnB1c2gobncpO1xuXG5cdFx0XHRcdGxhc3RjID0gYncuYyArIGJ3LncubGVuZ3RoO1xuXHRcdFx0fVxuXG5cdFx0XHRsZXQgbGFzdHdvcmQgPSB3b3JkaW5mb1t3b3JkaW5mby5sZW5ndGggLSAxXTtcblx0XHRcdGlmIChsYXN0d29yZC5jICsgbGFzdHdvcmQudy5sZW5ndGggPCB0ZXh0Lmxlbmd0aClcblx0XHRcdHtcblx0XHRcdFx0cmV0LnB1c2goe1xuXHRcdFx0XHRcdHc6IHRleHQuc3Vic3RyKGxhc3R3b3JkLmMgKyBsYXN0d29yZC53Lmxlbmd0aCksXG5cdFx0XHRcdH0pO1xuXHRcdFx0fVxuXHRcdH1cblxuXHRcdHJldHVybiByZXQubGVuZ3RoID8gcmV0IDogdW5kZWZpbmVkO1xuXHR9XG5cblx0LyoqXG5cdCAqIOWMuemFjeWNleivje+8jOi/lOWbnuebuOWFs+S/oeaBr1xuXHQgKlxuXHQgKiBAcGFyYW0ge3N0cmluZ30gdGV4dCDmlofmnKxcblx0ICogQHBhcmFtIHtpbnR9IGN1ciDlvIDlp4vkvY3nva5cblx0ICogQHJldHVybiB7YXJyYXl9ICDov5Tlm57moLzlvI8gICB7dzogJ+WNleivjScsIGM6IOW8gOWni+S9jee9rn1cblx0ICovXG5cdG1hdGNoV29yZCh0ZXh0OiBzdHJpbmcsIGN1cj86IG51bWJlcilcblx0e1xuXHRcdC8vY29uc3QgUE9TVEFHID0gdGhpcy5fUE9TVEFHO1xuXHRcdGNvbnN0IFRBQkxFID0gdGhpcy5fVEFCTEUyO1xuXG5cdFx0aWYgKGlzTmFOKGN1cikpIGN1ciA9IDA7XG5cblx0XHRsZXQgcmV0OiBJV29yZFtdID0gW107XG5cdFx0Ly9sZXQgc2VsZiA9IHRoaXM7XG5cblx0XHRsZXQgcyA9IGZhbHNlO1xuXG5cdFx0Ly8g5Yy56YWN5Y+v6IO95Ye6546w55qE5Y2V6K+N77yM5Y+W6ZW/5bqm5pyA5aSn55qE6YKj5LiqXG5cdFx0bGV0IGxvd2VydGV4dCA9IHRleHQudG9Mb3dlckNhc2UoKTtcblxuXHRcdHdoaWxlIChjdXIgPCB0ZXh0Lmxlbmd0aClcblx0XHR7XG5cdFx0XHRsZXQgc3RvcHdvcmQ6IElXb3JkID0gbnVsbDtcblx0XHRcdGZvciAobGV0IGkgaW4gVEFCTEUpXG5cdFx0XHR7XG5cdFx0XHRcdGlmIChsb3dlcnRleHQuc3Vic3RyKGN1ciwgaSBhcyBhbnkpIGluIFRBQkxFW2ldKVxuXHRcdFx0XHR7XG5cdFx0XHRcdFx0c3RvcHdvcmQgPSB7XG5cdFx0XHRcdFx0XHR3OiB0ZXh0LnN1YnN0cihjdXIsIGkgYXMgYW55KSxcblx0XHRcdFx0XHRcdGM6IGN1cixcblx0XHRcdFx0XHR9O1xuXHRcdFx0XHR9XG5cdFx0XHR9XG5cdFx0XHRpZiAoc3RvcHdvcmQgIT09IG51bGwpXG5cdFx0XHR7XG5cdFx0XHRcdHJldC5wdXNoKHN0b3B3b3JkKTtcblx0XHRcdFx0Y3VyICs9IHN0b3B3b3JkLncubGVuZ3RoO1xuXHRcdFx0fVxuXHRcdFx0ZWxzZVxuXHRcdFx0e1xuXHRcdFx0XHRjdXIrKztcblx0XHRcdH1c
blx0XHR9XG5cdFx0cmV0dXJuIHJldDtcblx0fVxuXG59XG5cbmV4cG9ydCBjb25zdCBpbml0ID0gV2lsZGNhcmRUb2tlbml6ZXIuaW5pdC5iaW5kKFdpbGRjYXJkVG9rZW5pemVyKSBhcyBJU3ViVG9rZW5pemVyQ3JlYXRlPFdpbGRjYXJkVG9rZW5pemVyPjtcblxuZXhwb3J0IGRlZmF1bHQgV2lsZGNhcmRUb2tlbml6ZXI7XG4iXX0= |
\ | No newline at end of file |