"use strict";
// Restored directive: the TypeScript source embedded in this file's source map
// starts with 'use strict', but only the trailing `;` had survived here.

// Define a non-enumerable, read-only `__esModule` flag on the exports object
// (the marker transpiled ES modules conventionally carry for CommonJS interop).
Object.defineProperty(exports, "__esModule", { value: true });

// Pre-declare the named exports as undefined; they receive their real values
// after the class definition further down in this file.
exports.init = exports.PunctuationTokenizer = void 0;
|
4 | /**
|
5 | * 标点符号识别模块
|
6 | *
|
7 | * @author 老雷<leizongmin@gmail.com>
|
8 | */
|
9 | const mod_1 = require("../mod");
|
10 | const STOPWORD_1 = require("../mod/data/STOPWORD");
|
/**
 * Tokenizer stage that separates punctuation marks out of words that have not
 * been recognised yet.
 *
 * The lookup table `STOPWORD2` is read as an object whose keys are candidate
 * lengths and whose values are buckets keyed by the punctuation string itself
 * (see `matchStopword`). `_STOPWORD` and `STOPWORD` are exposed on the
 * instance but are not read by the methods of this class.
 */
class PunctuationTokenizer extends mod_1.SubSModuleTokenizer {
    constructor() {
        super(...arguments);
        this.name = 'PunctuationTokenizer';
        this._STOPWORD = STOPWORD_1._STOPWORD;
        this.STOPWORD = STOPWORD_1.STOPWORD;
        this.STOPWORD2 = STOPWORD_1.STOPWORD2;
    }

    /**
     * Split unrecognised words around the punctuation they contain.
     *
     * Words whose `p` is greater than 0 are passed through untouched, as are
     * words in which no punctuation is found. Every other word is replaced by
     * a sequence of tokens: plain `{ w }` objects for the text between
     * punctuation marks and tagged tokens for the marks themselves.
     *
     * Iteration stops at the first falsy entry of `words`.
     *
     * @param {array} words array of word objects (`w` = text, `p` = tag)
     * @return {array} new array of word objects
     */
    split(words) {
        // NOTE(review): `_POSTAG` and `debugToken` are not defined in this
        // class; presumably they are inherited from SubSModuleTokenizer.
        const POSTAG = this._POSTAG;
        const ret = [];
        let word;
        for (let i = 0; (word = words[i]); i++) {
            // Only words that are still unrecognised are inspected.
            if (word.p > 0) {
                ret.push(word);
                continue;
            }
            const stopinfo = this.matchStopword(word.w);
            if (stopinfo.length < 1) {
                ret.push(word);
                continue;
            }
            // `lastc` is the index just past the previously emitted mark.
            let lastc = 0;
            for (const sw of stopinfo) {
                // Text sitting between two marks stays an untagged word.
                if (sw.c > lastc) {
                    ret.push({
                        w: word.w.substr(lastc, sw.c - lastc)
                    });
                }
                // The mark itself is tagged with POSTAG.D_W and annotated
                // with this tokenizer's name through debugToken.
                ret.push(this.debugToken({
                    w: sw.w,
                    p: POSTAG.D_W
                }, {
                    [this.name]: true,
                }, true));
                lastc = sw.c + sw.w.length;
            }
            // Whatever follows the last mark is emitted as a trailing word.
            if (lastc < word.w.length) {
                ret.push({
                    w: word.w.substr(lastc)
                });
            }
        }
        return ret;
    }

    /**
     * Find the punctuation marks contained in `text`.
     *
     * At every position the length buckets of `STOPWORD2` are tried in key
     * order and the first bucket that owns the candidate substring wins; the
     * scan then resumes right after that match, or one character further
     * when nothing matched.
     *
     * Only own properties of a bucket count as punctuation, so inherited
     * names such as "toString" are never reported. Empty candidates are
     * skipped because a zero-length match could never advance the cursor.
     *
     * @param {string} text text to scan
     * @param {int} cur start position; anything that is NaN after numeric
     *                  coercion (including undefined) is treated as 0
     * @return {array} list of `{ w: matched mark, c: start position }`
     */
    matchStopword(text, cur) {
        const STOPWORD2 = this.STOPWORD2;
        const hasOwn = Object.prototype.hasOwnProperty;
        // Global isNaN on purpose: it also catches an omitted argument.
        if (isNaN(cur)) {
            cur = 0;
        }
        const ret = [];
        // Own enumerable keys only; each key doubles as the candidate length.
        const lengths = Object.keys(STOPWORD2);
        while (cur < text.length) {
            let step = 1;
            for (const len of lengths) {
                const w = text.substr(cur, len);
                if (w.length > 0 && hasOwn.call(STOPWORD2[len], w)) {
                    ret.push({ w: w, c: cur });
                    step = w.length;
                    break;
                }
            }
            cur += step;
        }
        return ret;
    }
}
|
// Named export of the tokenizer class.
exports.PunctuationTokenizer = PunctuationTokenizer;

// `init` is the class's static `init`, bound to the class so that it can be
// called as a standalone function without losing its `this`.
exports.init = PunctuationTokenizer.init.bind(PunctuationTokenizer);

// Default export: the same class.
exports.default = PunctuationTokenizer;
|
98 | //# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiUHVuY3R1YXRpb25Ub2tlbml6ZXIuanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyJQdW5jdHVhdGlvblRva2VuaXplci50cyJdLCJuYW1lcyI6W10sIm1hcHBpbmdzIjoiQUFBQSxZQUFZLENBQUM7OztBQUViOzs7O0dBSUc7QUFFSCxnQ0FBNkM7QUFJN0MsbURBQXNFO0FBRXRFLE1BQWEsb0JBQXFCLFNBQVEseUJBQW1CO0lBQTdEOztRQUVDLFNBQUksR0FBRyxzQkFBc0IsQ0FBQztRQUV2QixjQUFTLEdBQUcsb0JBQVMsQ0FBQztRQUN0QixhQUFRLEdBQUcsbUJBQVEsQ0FBQztRQUNwQixjQUFTLEdBQUcsb0JBQVMsQ0FBQztJQTRGOUIsQ0FBQztJQTFGQTs7Ozs7T0FLRztJQUNILEtBQUssQ0FBQyxLQUFjO1FBRW5CLE1BQU0sTUFBTSxHQUFHLElBQUksQ0FBQyxPQUFPLENBQUM7UUFDNUIsTUFBTSxJQUFJLEdBQUcsSUFBSSxDQUFDO1FBRWxCLElBQUksR0FBRyxHQUFHLEVBQUUsQ0FBQztRQUNiLEtBQUssSUFBSSxDQUFDLEdBQUcsQ0FBQyxFQUFFLElBQUksRUFBRSxJQUFJLEdBQUcsS0FBSyxDQUFDLENBQUMsQ0FBQyxFQUFFLENBQUMsRUFBRSxFQUMxQztZQUNDLElBQUksSUFBSSxDQUFDLENBQUMsR0FBRyxDQUFDLEVBQ2Q7Z0JBQ0MsR0FBRyxDQUFDLElBQUksQ0FBQyxJQUFJLENBQUMsQ0FBQztnQkFDZixTQUFTO2FBQ1Q7WUFDRCxjQUFjO1lBQ2QsSUFBSSxRQUFRLEdBQUcsSUFBSSxDQUFDLGFBQWEsQ0FBQyxJQUFJLENBQUMsQ0FBQyxDQUFDLENBQUM7WUFDMUMsSUFBSSxRQUFRLENBQUMsTUFBTSxHQUFHLENBQUMsRUFDdkI7Z0JBQ0MsR0FBRyxDQUFDLElBQUksQ0FBQyxJQUFJLENBQUMsQ0FBQztnQkFDZixTQUFTO2FBQ1Q7WUFDRCxVQUFVO1lBQ1YsSUFBSSxLQUFLLEdBQUcsQ0FBQyxDQUFDO1lBQ2QsS0FBSyxJQUFJLEVBQUUsR0FBRyxDQUFDLEVBQUUsRUFBRSxFQUFFLEVBQUUsR0FBRyxRQUFRLENBQUMsRUFBRSxDQUFDLEVBQUUsRUFBRSxFQUFFLEVBQzVDO2dCQUNDLElBQUksRUFBRSxDQUFDLENBQUMsR0FBRyxLQUFLLEVBQ2hCO29CQUNDLEdBQUcsQ0FBQyxJQUFJLENBQUM7d0JBQ1IsQ0FBQyxFQUFFLElBQUksQ0FBQyxDQUFDLENBQUMsTUFBTSxDQUFDLEtBQUssRUFBRSxFQUFFLENBQUMsQ0FBQyxHQUFHLEtBQUssQ0FBQztxQkFDckMsQ0FBQyxDQUFDO2lCQUNIO2dCQUVELEdBQUcsQ0FBQyxJQUFJLENBQUMsSUFBSSxDQUFDLFVBQVUsQ0FBQztvQkFDeEIsQ0FBQyxFQUFFLEVBQUUsQ0FBQyxDQUFDO29CQUNQLENBQUMsRUFBRSxNQUFNLENBQUMsR0FBRztpQkFDYixFQUFFO29CQUNGLENBQUMsSUFBSSxDQUFDLElBQUksQ0FBQyxFQUFFLElBQUk7aUJBQ2pCLEVBQUUsSUFBSSxDQUFDLENBQUMsQ0FBQztnQkFFVixLQUFLLEdBQUcsRUFBRSxDQUFDLENBQUMsR0FBRyxFQUFFLENBQUMsQ0FBQyxDQUFDLE1BQU0sQ0FBQzthQUMzQjtZQUNELElBQUksTUFBTSxHQUFHLFFBQVEsQ0FBQyxRQUFRLENBQUMsT
UFBTSxHQUFHLENBQUMsQ0FBQyxDQUFDO1lBQzNDLElBQUksTUFBTSxDQUFDLENBQUMsR0FBRyxNQUFNLENBQUMsQ0FBQyxDQUFDLE1BQU0sR0FBRyxJQUFJLENBQUMsQ0FBQyxDQUFDLE1BQU0sRUFDOUM7Z0JBQ0MsR0FBRyxDQUFDLElBQUksQ0FBQztvQkFDUixDQUFDLEVBQUUsSUFBSSxDQUFDLENBQUMsQ0FBQyxNQUFNLENBQUMsTUFBTSxDQUFDLENBQUMsR0FBRyxNQUFNLENBQUMsQ0FBQyxDQUFDLE1BQU0sQ0FBQztpQkFDNUMsQ0FBQyxDQUFDO2FBQ0g7U0FDRDtRQUNELE9BQU8sR0FBRyxDQUFDO0lBQ1osQ0FBQztJQUVEOzs7Ozs7T0FNRztJQUNILGFBQWEsQ0FBQyxJQUFZLEVBQUUsR0FBWTtRQUV2QyxNQUFNLFNBQVMsR0FBRyxJQUFJLENBQUMsU0FBUyxDQUFDO1FBRWpDLElBQUksS0FBSyxDQUFDLEdBQUcsQ0FBQztZQUFFLEdBQUcsR0FBRyxDQUFDLENBQUM7UUFDeEIsSUFBSSxHQUFHLEdBQUcsRUFBRSxDQUFDO1FBQ2IsSUFBSSxPQUFPLEdBQUcsS0FBSyxDQUFDO1FBQ3BCLE9BQU8sR0FBRyxHQUFHLElBQUksQ0FBQyxNQUFNLEVBQ3hCO1lBQ0MsSUFBSSxDQUFDLENBQUM7WUFDTixLQUFLLElBQUksQ0FBQyxJQUFJLFNBQVMsRUFDdkI7Z0JBQ0MsQ0FBQyxHQUFHLElBQUksQ0FBQyxNQUFNLENBQUMsR0FBRyxFQUFFLENBQWtCLENBQUMsQ0FBQztnQkFDekMsSUFBSSxDQUFDLElBQUksU0FBUyxDQUFDLENBQUMsQ0FBQyxFQUNyQjtvQkFDQyxHQUFHLENBQUMsSUFBSSxDQUFDLEVBQUUsQ0FBQyxFQUFFLENBQUMsRUFBRSxDQUFDLEVBQUUsR0FBRyxFQUFFLENBQUMsQ0FBQztvQkFDM0IsT0FBTyxHQUFHLElBQUksQ0FBQztvQkFDZixNQUFNO2lCQUNOO2FBQ0Q7WUFDRCxHQUFHLElBQUksT0FBTyxLQUFLLEtBQUssQ0FBQyxDQUFDLENBQUMsQ0FBQyxDQUFDLENBQUMsQ0FBQyxDQUFDLENBQUMsTUFBTSxDQUFDO1lBQ3hDLE9BQU8sR0FBRyxLQUFLLENBQUM7U0FDaEI7UUFFRCxPQUFPLEdBQUcsQ0FBQztJQUNaLENBQUM7Q0FDRDtBQWxHRCxvREFrR0M7QUFFRCxvQkFBb0I7QUFFUCxRQUFBLElBQUksR0FBRyxvQkFBb0IsQ0FBQyxJQUFJLENBQUMsSUFBSSxDQUFDLG9CQUFvQixDQUFxQyxDQUFDO0FBRTdHLGtCQUFlLG9CQUFvQixDQUFDIiwic291cmNlc0NvbnRlbnQiOlsiJ3VzZSBzdHJpY3QnO1xuXG4vKipcbiAqIOagh+eCueespuWPt+ivhuWIq+aooeWdl1xuICpcbiAqIEBhdXRob3Ig6ICB6Zu3PGxlaXpvbmdtaW5AZ21haWwuY29tPlxuICovXG5cbmltcG9ydCB7IFN1YlNNb2R1bGVUb2tlbml6ZXIgfSBmcm9tICcuLi9tb2QnO1xuaW1wb3J0IFNlZ21lbnQsIHsgSVdvcmQgfSBmcm9tICcuLi9TZWdtZW50JztcbmltcG9ydCB7IGRlYnVnIH0gZnJvbSAnLi4vdXRpbCc7XG5pbXBvcnQgVVN0cmluZyBmcm9tICd1bmktc3RyaW5nJztcbmltcG9ydCB7IF9TVE9QV09SRCwgU1RPUFdPUkQsIFNUT1BXT1JEMiB9IGZyb20gJy4uL21vZC9kYXRhL1NUT1BXT1JEJztcblxuZXhwb3J0IGNsYXNzIFB1bmN0dWF0aW9uVG9rZW5pemVyI
GV4dGVuZHMgU3ViU01vZHVsZVRva2VuaXplclxue1xuXHRuYW1lID0gJ1B1bmN0dWF0aW9uVG9rZW5pemVyJztcblxuXHRwdWJsaWMgX1NUT1BXT1JEID0gX1NUT1BXT1JEO1xuXHRwdWJsaWMgU1RPUFdPUkQgPSBTVE9QV09SRDtcblx0cHVibGljIFNUT1BXT1JEMiA9IFNUT1BXT1JEMjtcblxuXHQvKipcblx0ICog5a+55pyq6K+G5Yir55qE5Y2V6K+N6L+b6KGM5YiG6K+NXG5cdCAqXG5cdCAqIEBwYXJhbSB7YXJyYXl9IHdvcmRzIOWNleivjeaVsOe7hFxuXHQgKiBAcmV0dXJuIHthcnJheX1cblx0ICovXG5cdHNwbGl0KHdvcmRzOiBJV29yZFtdKTogSVdvcmRbXVxuXHR7XG5cdFx0Y29uc3QgUE9TVEFHID0gdGhpcy5fUE9TVEFHO1xuXHRcdGNvbnN0IHNlbGYgPSB0aGlzO1xuXG5cdFx0bGV0IHJldCA9IFtdO1xuXHRcdGZvciAobGV0IGkgPSAwLCB3b3JkOyB3b3JkID0gd29yZHNbaV07IGkrKylcblx0XHR7XG5cdFx0XHRpZiAod29yZC5wID4gMClcblx0XHRcdHtcblx0XHRcdFx0cmV0LnB1c2god29yZCk7XG5cdFx0XHRcdGNvbnRpbnVlO1xuXHRcdFx0fVxuXHRcdFx0Ly8g5LuF5a+55pyq6K+G5Yir55qE6K+N6L+b6KGM5Yy56YWNXG5cdFx0XHRsZXQgc3RvcGluZm8gPSBzZWxmLm1hdGNoU3RvcHdvcmQod29yZC53KTtcblx0XHRcdGlmIChzdG9waW5mby5sZW5ndGggPCAxKVxuXHRcdFx0e1xuXHRcdFx0XHRyZXQucHVzaCh3b3JkKTtcblx0XHRcdFx0Y29udGludWU7XG5cdFx0XHR9XG5cdFx0XHQvLyDliIbnprvlh7rmoIfngrnnrKblj7dcblx0XHRcdGxldCBsYXN0YyA9IDA7XG5cdFx0XHRmb3IgKGxldCB1aSA9IDAsIHN3OyBzdyA9IHN0b3BpbmZvW3VpXTsgdWkrKylcblx0XHRcdHtcblx0XHRcdFx0aWYgKHN3LmMgPiBsYXN0Yylcblx0XHRcdFx0e1xuXHRcdFx0XHRcdHJldC5wdXNoKHtcblx0XHRcdFx0XHRcdHc6IHdvcmQudy5zdWJzdHIobGFzdGMsIHN3LmMgLSBsYXN0Yylcblx0XHRcdFx0XHR9KTtcblx0XHRcdFx0fVxuXG5cdFx0XHRcdHJldC5wdXNoKHNlbGYuZGVidWdUb2tlbih7XG5cdFx0XHRcdFx0dzogc3cudyxcblx0XHRcdFx0XHRwOiBQT1NUQUcuRF9XXG5cdFx0XHRcdH0sIHtcblx0XHRcdFx0XHRbc2VsZi5uYW1lXTogdHJ1ZSxcblx0XHRcdFx0fSwgdHJ1ZSkpO1xuXG5cdFx0XHRcdGxhc3RjID0gc3cuYyArIHN3LncubGVuZ3RoO1xuXHRcdFx0fVxuXHRcdFx0bGV0IGxhc3RzdyA9IHN0b3BpbmZvW3N0b3BpbmZvLmxlbmd0aCAtIDFdO1xuXHRcdFx0aWYgKGxhc3Rzdy5jICsgbGFzdHN3LncubGVuZ3RoIDwgd29yZC53Lmxlbmd0aClcblx0XHRcdHtcblx0XHRcdFx0cmV0LnB1c2goe1xuXHRcdFx0XHRcdHc6IHdvcmQudy5zdWJzdHIobGFzdHN3LmMgKyBsYXN0c3cudy5sZW5ndGgpXG5cdFx0XHRcdH0pO1xuXHRcdFx0fVxuXHRcdH1cblx0XHRyZXR1cm4gcmV0O1xuXHR9XG5cblx0LyoqXG5cdCAqIOWMuemFjeWMheWQq+eahOagh+eCueespuWPt++8jOi/lOWbnuebuOWFs+S/oeaBr1xuXHQgKlxuXHQgK
iBAcGFyYW0ge3N0cmluZ30gdGV4dCDmlofmnKxcblx0ICogQHBhcmFtIHtpbnR9IGN1ciDlvIDlp4vkvY3nva5cblx0ICogQHJldHVybiB7YXJyYXl9ICDov5Tlm57moLzlvI8gICB7dzogJ+e9keWdgCcsIGM6IOW8gOWni+S9jee9rn1cblx0ICovXG5cdG1hdGNoU3RvcHdvcmQodGV4dDogc3RyaW5nLCBjdXI/OiBudW1iZXIpOiBJV29yZFtdXG5cdHtcblx0XHRjb25zdCBTVE9QV09SRDIgPSB0aGlzLlNUT1BXT1JEMjtcblxuXHRcdGlmIChpc05hTihjdXIpKSBjdXIgPSAwO1xuXHRcdGxldCByZXQgPSBbXTtcblx0XHRsZXQgaXNNYXRjaCA9IGZhbHNlO1xuXHRcdHdoaWxlIChjdXIgPCB0ZXh0Lmxlbmd0aClcblx0XHR7XG5cdFx0XHRsZXQgdztcblx0XHRcdGZvciAobGV0IGkgaW4gU1RPUFdPUkQyKVxuXHRcdFx0e1xuXHRcdFx0XHR3ID0gdGV4dC5zdWJzdHIoY3VyLCBpIGFzIGFueSBhcyBudW1iZXIpO1xuXHRcdFx0XHRpZiAodyBpbiBTVE9QV09SRDJbaV0pXG5cdFx0XHRcdHtcblx0XHRcdFx0XHRyZXQucHVzaCh7IHc6IHcsIGM6IGN1ciB9KTtcblx0XHRcdFx0XHRpc01hdGNoID0gdHJ1ZTtcblx0XHRcdFx0XHRicmVhaztcblx0XHRcdFx0fVxuXHRcdFx0fVxuXHRcdFx0Y3VyICs9IGlzTWF0Y2ggPT09IGZhbHNlID8gMSA6IHcubGVuZ3RoO1xuXHRcdFx0aXNNYXRjaCA9IGZhbHNlO1xuXHRcdH1cblxuXHRcdHJldHVybiByZXQ7XG5cdH1cbn1cblxuLy8gZGVidWcoU1RPUFdPUkQyKTtcblxuZXhwb3J0IGNvbnN0IGluaXQgPSBQdW5jdHVhdGlvblRva2VuaXplci5pbml0LmJpbmQoUHVuY3R1YXRpb25Ub2tlbml6ZXIpIGFzIHR5cGVvZiBQdW5jdHVhdGlvblRva2VuaXplci5pbml0O1xuXG5leHBvcnQgZGVmYXVsdCBQdW5jdHVhdGlvblRva2VuaXplcjtcbiJdfQ== |
\ | No newline at end of file |