UNPKG

9.98 kBJavaScriptView Raw
1/**
2 * 分词器接口
3 *
4 * @author 老雷<leizongmin@gmail.com>
5 */
6'use strict';
7Object.defineProperty(exports, "__esModule", { value: true });
8exports.Segment = void 0;
9const path = require("path");
10const get_1 = require("./fs/get");
11const blacklist_1 = require("./table/blacklist");
12const dict_1 = require("./table/dict");
13const loader_1 = require("./loader");
14const stopword_1 = require("./table/stopword");
15const synonym_1 = require("./table/synonym");
16const segment_dict_1 = require("segment-dict");
17const project_config_1 = require("../project.config");
18const core_1 = require("./segment/core");
19const defaults_1 = require("./segment/defaults");
20const index_1 = require("./defaults/index");
21const useModules2_1 = require("./segment/methods/useModules2");
/**
 * Word-segmenter front end.
 *
 * Extends the core segmenter with helpers that locate dictionary files on
 * disk, load them into per-type table objects (`this.db[type]`) and expose the
 * raw tables through `this.DICT[type]`.
 */
class Segment extends core_1.default {
    /**
     * Return the table object registered under `type`, creating it on demand.
     *
     * A table is only created when `autocreate` is truthy or the segmenter is
     * already marked as initialised (`this.inited`); otherwise whatever is
     * currently stored in `this.db[type]` (possibly `undefined`) is returned.
     *
     * @param {String} type dictionary type key, e.g. 'TABLE', 'SYNONYM'
     * @param {Boolean} [autocreate] create the table when it does not exist yet
     * @param {Function} [libTableDict] constructor to use instead of the
     *     default one selected from `type`
     * @return {Object|undefined} the table object stored in `this.db[type]`
     */
    getDictDatabase(type, autocreate, libTableDict) {
        if ((autocreate || this.inited) && !this.db[type]) {
            let TableClass = libTableDict;
            if (!TableClass) {
                // No explicit constructor: pick the default one for this type.
                if (type === synonym_1.default.type) {
                    TableClass = synonym_1.default;
                }
                else if (type === stopword_1.TableDictStopword.type) {
                    TableClass = stopword_1.TableDictStopword;
                }
                else if (type === blacklist_1.default.type
                    || type === "BLACKLIST_FOR_OPTIMIZER" /* BLACKLIST_FOR_OPTIMIZER */
                    || type === "BLACKLIST_FOR_SYNONYM" /* BLACKLIST_FOR_SYNONYM */) {
                    TableClass = blacklist_1.default;
                }
                else {
                    TableClass = dict_1.TableDict;
                }
            }
            // Hand over any raw table already present in this.DICT[type].
            this.db[type] = new TableClass(type, this.options, {
                TABLE: this.DICT[type],
            });
        }
        return this.db[type];
    }

    /**
     * Register one or more modules through `useModules` and mark the
     * segmenter as initialised.
     *
     * @param {*} mod module(s) to register; forwarded unchanged
     * @param {...*} argv extra arguments forwarded to `useModules`
     * @return {Segment} this
     */
    use(mod, ...argv) {
        useModules2_1.useModules(this, mod, ...argv);
        this.inited = true;
        return this;
    }

    /**
     * Resolve a dictionary name to a file path.
     *
     * Search directories, in order: '' (the name as given), the project
     * dictionary root, `pathPlus`, then the bundled `segment` dictionary
     * directory. Extensions tried, in order: none, `extPlus`, '.utf8', '.txt'.
     *
     * @param {String} name file name, or a glob pattern when it contains '*'
     * @param {String[]} [pathPlus] additional directories to search
     * @param {String[]} [extPlus] additional file extensions to try
     * @return {String|String[]} a single path, or the list of matches when
     *     `name` contains '*'
     * @throws {Error} when nothing matches
     */
    _resolveDictFilename(name, pathPlus = [], extPlus = []) {
        const options = {
            paths: [
                '',
                project_config_1.default.dict_root,
                ...pathPlus,
                path.resolve(segment_dict_1.default.DICT_ROOT, 'segment'),
            ],
            extensions: [
                '',
                ...extPlus,
                '.utf8',
                '.txt',
            ],
            onlyFile: true,
        };
        if (name.includes('*')) {
            const matches = get_1.searchGlobSync(name, options);
            if (!matches || !matches.length) {
                throw new Error(`Cannot find dict glob file "${name}".`);
            }
            return matches;
        }
        const filename = get_1.searchFirstSync(name, options);
        if (!filename) {
            throw new Error(`Cannot find dict file "${name}".`);
        }
        return filename;
    }

    /**
     * Load a dictionary file.
     *
     * @param {String} name dictionary file name (or glob pattern)
     * @param {String} [type] dictionary type; any falsy value means 'TABLE'
     * @param {Boolean} [convert_to_lower] lower-case every word before adding it
     * @param {Boolean} [skipExists] forwarded as the second argument of the
     *     table's `add`
     * @return {Segment} this
     */
    loadDict(name, type, convert_to_lower, skipExists) {
        const filename = this._resolveDictFilename(name);
        if (Array.isArray(filename)) {
            // A glob matched several files: load each of them in turn.
            filename.forEach(file => this.loadDict(file, type, convert_to_lower, skipExists));
            return this;
        }
        const dictType = type || 'TABLE';
        const db = this.getDictDatabase(dictType, true);
        // Expose the table object's raw tables under DICT[type] and DICT[type + '2'].
        this.DICT[dictType] = db.TABLE;
        this.DICT[`${dictType}2`] = db.TABLE2;
        const rows = loader_1.default.SegmentDictLoader.loadSync(filename);
        rows.forEach((row) => {
            if (convert_to_lower) {
                row[0] = row[0].toLowerCase();
            }
            db.add(row, skipExists);
        });
        this.inited = true;
        return this;
    }

    /**
     * Load a synonym dictionary.
     *
     * The bundled `synonym` dictionary directory is searched in addition to
     * the default locations.
     *
     * @param {String} name dictionary file name (or glob pattern)
     * @param {Boolean} [skipExists] forwarded as the second argument of the
     *     table's `add`
     * @return {Segment} this
     */
    loadSynonymDict(name, skipExists) {
        const filename = this._resolveDictFilename(name, [
            path.resolve(segment_dict_1.default.DICT_ROOT, 'synonym'),
        ]);
        if (Array.isArray(filename)) {
            filename.forEach(file => this.loadSynonymDict(file, skipExists));
            return this;
        }
        const type = 'SYNONYM';
        const db = this.getDictDatabase(type, true);
        this.DICT[type] = db.TABLE;
        const rows = loader_1.default.SegmentSynonymLoader.loadSync(filename);
        rows.forEach((blocks) => {
            db.add(blocks, skipExists);
        });
        this.inited = true;
        return this;
    }

    /**
     * Shared implementation of the three blacklist loaders: read the file
     * with the 'line' loader and add every entry it yields to the table
     * registered under `type`.
     *
     * @param {String} name dictionary file name (or glob pattern)
     * @param {String} type blacklist type key
     * @return {Segment} this
     */
    _loadBlacklistDict(name, type) {
        const filename = this._resolveDictFilename(name, [
            path.resolve(segment_dict_1.default.DICT_ROOT, 'blacklist'),
        ]);
        if (Array.isArray(filename)) {
            filename.forEach(file => this._loadBlacklistDict(file, type));
            return this;
        }
        const db = this.getDictDatabase(type, true);
        this.DICT[type] = db.TABLE;
        const lines = loader_1.default.SegmentDict
            .requireLoaderModule('line')
            .loadSync(filename, {
            // Each line is passed through trim(); how the loader treats an
            // empty result is up to the loader.
            filter(line) {
                return line.trim();
            },
        });
        lines.forEach(word => db.add(word));
        this.inited = true;
        return this;
    }

    /**
     * Dictionary blacklist: entries listed in this dictionary are removed
     * from the main dictionary.
     *
     * @param {String} name dictionary file name (or glob pattern)
     * @return {Segment} this
     */
    loadBlacklistDict(name) {
        return this._loadBlacklistDict(name, "BLACKLIST" /* BLACKLIST */);
    }

    /**
     * Optimizer blacklist: prevents some optimizers from combining the words
     * listed in this dictionary (for example automatic person-name
     * combination).
     *
     * @param {String} name dictionary file name (or glob pattern)
     * @return {Segment} this
     */
    loadBlacklistOptimizerDict(name) {
        return this._loadBlacklistDict(name, "BLACKLIST_FOR_OPTIMIZER" /* BLACKLIST_FOR_OPTIMIZER */);
    }

    /**
     * Conversion blacklist: words listed in this dictionary are ignored when
     * words are converted dynamically.
     *
     * @param {String} name dictionary file name (or glob pattern)
     * @return {Segment} this
     */
    loadBlacklistSynonymDict(name) {
        return this._loadBlacklistDict(name, "BLACKLIST_FOR_SYNONYM" /* BLACKLIST_FOR_SYNONYM */);
    }

    /**
     * Load a stop-word dictionary.
     *
     * The bundled `stopword` dictionary directory is searched in addition to
     * the default locations.
     *
     * @param {String} name dictionary file name (or glob pattern)
     * @return {Segment} this
     */
    loadStopwordDict(name) {
        const filename = this._resolveDictFilename(name, [
            path.resolve(segment_dict_1.default.DICT_ROOT, 'stopword'),
        ]);
        if (Array.isArray(filename)) {
            filename.forEach(file => this.loadStopwordDict(file));
            return this;
        }
        const type = "STOPWORD" /* STOPWORD */;
        const db = this.getDictDatabase(type, true);
        this.DICT[type] = db.TABLE;
        const lines = loader_1.default.SegmentDict
            .requireLoaderModule('line')
            .loadSync(filename, {
            filter(line) {
                return line.trim();
            },
        });
        lines.forEach(word => db.add(word));
        this.inited = true;
        return this;
    }

    /**
     * Apply the default setup via the `useDefault` helper and mark the
     * segmenter as initialised.
     *
     * @param {...*} argv arguments forwarded to the helper
     * @return {Segment} this
     */
    useDefault(...argv) {
        index_1.useDefault(this, ...argv);
        this.inited = true;
        return this;
    }

    /**
     * This function only needs to run once and normally does not have to be
     * called manually.
     *
     * On the first call of an uninitialised segmenter it applies the default
     * setup, but only when no tokenizer module has been registered.
     *
     * @param {Object} [options] forwarded to `useDefault`
     * @return {Segment} this
     */
    autoInit(options) {
        if (!this.inited) {
            // Set the flag first so nested calls cannot re-enter this branch.
            this.inited = true;
            if (!this.modules.tokenizer.length) {
                this.useDefault(options);
            }
        }
        return this;
    }

    /**
     * Add a word to the blacklist (also removing it from the main table), or
     * take it off the blacklist again.
     *
     * @param {String} word the word to (un)blacklist
     * @param {Boolean} [remove] when truthy the word is removed from the
     *     blacklist; the main table is not touched in that case
     * @return {Segment} this
     */
    addBlacklist(word, remove) {
        this.autoInit(this.options);
        const BLACKLIST = this.getDictDatabase("BLACKLIST" /* BLACKLIST */);
        const TABLE = this.getDictDatabase("TABLE" /* TABLE */);
        if (remove) {
            BLACKLIST.remove(word);
        }
        else {
            BLACKLIST.add(word);
            TABLE.remove(word);
        }
        return this;
    }

    /**
     * Remove from the main table every key whose blacklist value is truthy.
     *
     * @return {Segment} this
     */
    doBlacklist() {
        this.autoInit(this.options);
        // getDict is not defined in this class; the fallback below covers the
        // case where it yields undefined/null (presumably when no blacklist was
        // loaded), which would otherwise make Object.entries throw a TypeError.
        const BLACKLIST = this.getDict("BLACKLIST" /* BLACKLIST */);
        const TABLE = this.getDictDatabase("TABLE" /* TABLE */);
        for (const [key, enabled] of Object.entries(BLACKLIST || {})) {
            if (enabled) {
                TABLE.remove(key);
            }
        }
        return this;
    }

    /**
     * Segment `text`, making sure the segmenter is initialised first.
     *
     * @param {*} text input forwarded to the parent implementation
     * @param {Object} [options] per-call options forwarded to the parent
     * @return {*} whatever the parent `doSegment` returns
     */
    doSegment(text, options = {}) {
        this.autoInit(this.options);
        return super.doSegment(text, options);
    }
}
298exports.Segment = Segment;
299Segment.defaultOptionsDoSegment = defaults_1.defaultOptionsDoSegment;
300exports.default = Segment;
301//# sourceMappingURL=Segment.js.map
\No newline at end of file