UNPKG

27.8 kBJavaScriptView Raw
/**
 * Word segmenter interface (分词器接口)
 *
 * @author 老雷<leizongmin@gmail.com>
 */
6'use strict';
7Object.defineProperty(exports, "__esModule", { value: true });
8exports.SegmentCore = void 0;
9const POSTAG_1 = require("../POSTAG");
10const index_1 = require("../mod/index");
11const stringify_1 = require("./methods/stringify");
12const split_1 = require("./methods/split");
13const indexOf_1 = require("./methods/indexOf");
14const convertSynonym_1 = require("./methods/convertSynonym");
15const listModules_1 = require("./methods/listModules");
16const _get_text_1 = require("./methods/_get_text");
17const getOptionsDoSegment_1 = require("./methods/getOptionsDoSegment");
18const useModules_1 = require("./methods/useModules");
19const doSegment_1 = require("./methods/doSegment");
/**
 * Core word-segmenter interface.
 *
 * Holds the section-splitting rules, the dictionaries, and the lists of
 * tokenizer/optimizer modules, and exposes `doSegment` together with thin
 * wrappers around the helper functions under `./methods`.
 */
class SegmentCore {
    /**
     * @param {Object} [options] Segmenter options. A shallow copy is kept on
     *   `this.options`. If `options.db` is set, each of its entries is
     *   registered in `this.db` under the entry's `type`, and `db` is then
     *   removed from the stored options.
     */
    constructor(options = {}) {
        /**
         * Section splitter.
         *
         * Segmentation works by analysing the context around each piece of
         * text, so the way the input is cut into sections influences the
         * result.
         *
         * A `RegExp`, or any object that provides
         * `[Symbol.split](input: string, limit?: number) => string[]`.
         *
         * @type {Segment.ISPLIT}
         */
        this.SPLIT = /([\r\n]+|^[ \s]+|[ \s]+$|[ \s]{2,})/gm;
        /**
         * After splitting, a section matching this filter is not analysed;
         * it is emitted as-is.
         * A `RegExp`, or any object that provides
         * `.test(input: string) => boolean`.
         *
         * @type {Segment.ISPLIT_FILTER}
         */
        this.SPLIT_FILTER = /^([\r\n]+)$/g;
        /**
         * Part-of-speech tags.
         * @type {POSTAG}
         */
        this.POSTAG = POSTAG_1.default;
        /**
         * Dictionary tables, keyed by dictionary type.
         * @type {{}}
         */
        this.DICT = {};
        this.modules = {
            /**
             * Tokenizer modules.
             */
            tokenizer: [],
            /**
             * Optimizer modules.
             */
            optimizer: [],
        };
        this.db = {};
        // Copy so that removing `db` below never mutates the caller's object.
        this.options = Object.assign({}, options);
        this.tokenizer = new index_1.Tokenizer(this);
        this.optimizer = new index_1.Optimizer(this);
        if (this.options.db) {
            this.options.db.forEach((data) => {
                this.db[data.type] = data;
            });
        }
        delete this.options.db;
    }

    /**
     * Return the dictionary database registered under `type`.
     *
     * NOTE(review): `autocreate` and `libTableDict` are accepted but unused in
     * this implementation — presumably they exist for overriding subclasses;
     * confirm against callers.
     *
     * @param {String} type key into `this.db`
     * @param {*} [autocreate] unused here
     * @param {*} [libTableDict] unused here
     * @return {*} the registered entry, or `undefined` if there is none
     */
    getDictDatabase(type, autocreate, libTableDict) {
        // @ts-ignore
        return this.db[type];
    }

    /**
     * Register module(s) by delegating to the `useModules` helper.
     *
     * @param {*} mod module(s) to register
     * @param {...*} argv extra arguments forwarded to the helper
     * @return {SegmentCore} this instance, for chaining
     */
    use(mod, ...argv) {
        useModules_1.useModules(this, mod, ...argv);
        return this;
    }

    /**
     * Return the dictionary table stored in `this.DICT` under `type`.
     *
     * @param {String} type
     * @return {*}
     */
    getDict(type) {
        return this.DICT[type];
    }

    /**
     * Resolve per-call segmentation options by passing them, together with
     * `this.options.optionsDoSegment`, to the `getOptionsDoSegment` helper.
     *
     * @param {Object} options
     * @return {Object}
     */
    getOptionsDoSegment(options) {
        return getOptionsDoSegment_1.getOptionsDoSegment(options, this.options.optionsDoSegment);
    }

    /**
     * Normalise the input through the `_get_text` helper.
     *
     * @param {*} text
     * @return {*} whatever the helper returns (`doSegment` calls `.split` on it)
     */
    _get_text(text) {
        return _get_text_1._get_text(text);
    }

    /**
     * Add a word to, or remove a word from, the blacklist database.
     *
     * When adding, the word is also removed from the `TABLE` database.
     *
     * @param {String} word
     * @param {Boolean} [remove] truthy to remove the word from the blacklist
     * @return {SegmentCore} this instance, for chaining
     */
    addBlacklist(word, remove) {
        const BLACKLIST = this.getDictDatabase("BLACKLIST" /* BLACKLIST */);
        const TABLE = this.getDictDatabase("TABLE" /* TABLE */);
        if (remove) {
            BLACKLIST.remove(word);
        }
        else {
            BLACKLIST.add(word);
            TABLE.remove(word);
        }
        return this;
    }

    /**
     * Remove from `TABLE` every key whose `BLACKLIST` dictionary value is
     * truthy.
     *
     * @return {SegmentCore} this instance, for chaining
     */
    doBlacklist() {
        const BLACKLIST = this.getDict("BLACKLIST" /* BLACKLIST */);
        const TABLE = this.getDictDatabase("TABLE" /* TABLE */);
        for (const [key, bool] of Object.entries(BLACKLIST)) {
            if (bool) {
                TABLE.remove(key);
            }
        }
        return this;
    }

    /**
     * List the registered modules by delegating to the `listModules` helper
     * with the resolved options.
     *
     * @param {Object} [options]
     * @return {*} helper result; `doSegment` reads its `enable` property
     */
    listModules(options = {}) {
        const resolved = this.getOptionsDoSegment(options);
        return listModules_1.listModules(this.modules, resolved);
    }

    /**
     * Segment `text`.
     *
     * The text is cut into sections with `this.SPLIT`. Sections matching
     * `this.SPLIT_FILTER` are emitted untouched as `{ w: section }`; every
     * other non-empty section is tokenized and then optimized. Optional
     * post-processing steps are applied according to `options`.
     *
     * @param {*} text input passed through `_get_text`
     * @param {Object} [options] recognised flags: `stripPunctuation`,
     *   `convertSynonym`, `stripStopword`, `stripSpace`, `simple`
     * @return {Array} word list (the result of `_doSegmentSimple` when
     *   `options.simple` is set)
     */
    doSegment(text, options = {}) {
        const opts = this.getOptionsDoSegment(options);
        const sections = this._get_text(text)
            // @ts-ignore
            .split(this.SPLIT);
        const mods = this.listModules(opts).enable;
        // Cut the text into sections and segment them one by one.
        let ret = [];
        for (const section of sections) {
            const filter = this.SPLIT_FILTER;
            // A /g (or /y) RegExp keeps `lastIndex` between `.test()` calls;
            // reset it so each section is judged independently of the last.
            if (filter instanceof RegExp) {
                filter.lastIndex = 0;
            }
            if (filter.test(section)) {
                ret.push({ w: section });
                continue;
            }
            if (section.length > 0) {
                // Tokenize.
                let sret = this.tokenizer.split(section, mods.tokenizer);
                // Optimize.
                sret = this.optimizer.doOptimize(sret, mods.optimizer);
                // Append this section's result.
                if (sret.length > 0) {
                    ret = ret.concat(sret);
                }
            }
        }
        // Strip punctuation.
        if (opts.stripPunctuation) {
            ret = doSegment_1._doSegmentStripPOSTAG(ret, POSTAG_1.default.D_W);
        }
        if (opts.convertSynonym) {
            ret = this.convertSynonym(ret);
        }
        // Strip stopwords.
        if (opts.stripStopword) {
            ret = doSegment_1._doSegmentStripStopword(ret, this.getDict('STOPWORD'));
        }
        if (opts.stripSpace) {
            ret = doSegment_1._doSegmentStripSpace(ret);
        }
        // Return only the word content.
        if (opts.simple) {
            ret = doSegment_1._doSegmentSimple(ret);
        }
        return ret;
    }

    /**
     * Run the `convertSynonym` helper over a word list, supplying the
     * `SYNONYM` and `TABLE` dictionaries and this instance's `POSTAG`.
     *
     * @param {Array} ret word list
     * @param {*} [showcount] forwarded to the helper unchanged
     * @return {*} helper result
     */
    convertSynonym(ret, showcount) {
        return convertSynonym_1.convertSynonym(ret, {
            showcount,
            DICT_SYNONYM: this.getDict('SYNONYM'),
            DICT_TABLE: this.getDict('TABLE'),
            POSTAG: this.POSTAG,
        });
    }

    /**
     * Join an array of words into a string.
     *
     * @param {Array} words word array
     * @return {String}
     */
    stringify(words, ...argv) {
        return stringify_1.stringify(words, ...argv);
    }

    /**
     * Join an array of words into a string.
     *
     * @param {Array} words word array
     * @return {String}
     */
    static stringify(words, ...argv) {
        return stringify_1.stringify(words, ...argv);
    }

    /**
     * Split a word array on a given word or part of speech.
     *
     * @param {Array} words word array
     * @param {Number|String} s word or part of speech to split on
     * @return {Array}
     */
    split(words, s, ...argv) {
        return split_1.split(words, s, ...argv);
    }

    /**
     * Find the position of a word or part of speech in a word array.
     *
     * @param {Array} words word array
     * @param {Number|String} s word or part of speech to look for
     * @param {Number} cur start position
     * @return {Number} -1 when not found
     */
    indexOf(words, s, cur, ...argv) {
        // `s` must be forwarded; omitting it would shift `cur` into the
        // search-target position of the helper.
        return indexOf_1.indexOf(words, s, cur, ...argv);
    }
}
// Publish SegmentCore as both a named export and the module's default export.
exports.SegmentCore = SegmentCore;
exports.default = SegmentCore;
224//# sourceMappingURL=data:application/json;base64,
\No newline at end of file