UNPKG

37.2 kBJavaScriptView Raw
1/**
2 * 分词器接口
3 *
4 * @author 老雷<leizongmin@gmail.com>
5 */
6'use strict';
7Object.defineProperty(exports, "__esModule", { value: true });
8const path = require("path");
9const get_1 = require("./fs/get");
10const blacklist_1 = require("./table/blacklist");
11const dict_1 = require("./table/dict");
12const loader_1 = require("./loader");
13const stopword_1 = require("./table/stopword");
14const synonym_1 = require("./table/synonym");
15const segment_dict_1 = require("segment-dict");
16const project_config_1 = require("../project.config");
17const core_1 = require("./segment/core");
18const defaults_1 = require("./segment/defaults");
19const index_1 = require("./defaults/index");
20const useModules2_1 = require("./segment/methods/useModules2");
/**
 * Read a one-entry-per-line dictionary file through the 'line' loader module.
 *
 * A fresh options object is built on every call; its `filter` callback
 * returns the trimmed text of the line it is given.
 * NOTE(review): how the loader uses the filter's return value (e.g. whether
 * empty results are dropped) is decided by the loader module - confirm there.
 *
 * @param {String} filename resolved dictionary file path
 * @return {Array} whatever the line loader's `loadSync` returns
 */
function loadTrimmedLines(filename) {
    return loader_1.default.SegmentDict
        .requireLoaderModule('line')
        .loadSync(filename, {
            filter(line) {
                return line.trim();
            },
        });
}
/**
 * Segmenter front end.
 *
 * Extends the core segmenter with dictionary loading (main, synonym,
 * stopword and blacklist dictionaries), module registration and lazy
 * initialisation before the first segmentation.
 */
class Segment extends core_1.default {
    /**
     * Return the table object registered under `type`, creating it on demand.
     *
     * A table is only created when `autocreate` is truthy or the segmenter is
     * already marked as initialised, and only if none exists yet for `type`.
     *
     * @param {String} type dictionary type key (e.g. 'TABLE', 'SYNONYM')
     * @param {Boolean} [autocreate] create the table even before initialisation
     * @param {Function} [libTableDict] table class to instantiate; when omitted
     *        a class is chosen from `type`
     * @return {Object|undefined} the table for `type`, or undefined when it
     *         does not exist and was not created
     */
    getDictDatabase(type, autocreate, libTableDict) {
        if ((autocreate || this.inited) && !this.db[type]) {
            let TableClass = libTableDict;
            if (!TableClass) {
                if (type === synonym_1.default.type) {
                    TableClass = synonym_1.default;
                }
                else if (type === stopword_1.TableDictStopword.type) {
                    TableClass = stopword_1.TableDictStopword;
                }
                else if (type === blacklist_1.default.type
                    || type === "BLACKLIST_FOR_OPTIMIZER"
                    || type === "BLACKLIST_FOR_SYNONYM") {
                    // All three blacklist flavours share one table class.
                    TableClass = blacklist_1.default;
                }
                else {
                    TableClass = dict_1.TableDict;
                }
            }
            // Seed the new table with whatever is already stored in DICT for
            // this type (may be undefined).
            this.db[type] = new TableClass(type, this.options, {
                TABLE: this.DICT[type],
            });
        }
        return this.db[type];
    }
    /**
     * Register one or more modules via `useModules` and mark the segmenter as
     * initialised.
     *
     * @param {*} mod module(s) to register; forwarded unchanged
     * @param {...*} argv extra arguments forwarded unchanged
     * @return {Segment} this, for chaining
     */
    use(mod, ...argv) {
        useModules2_1.useModules(this, mod, ...argv);
        this.inited = true;
        return this;
    }
    /**
     * Resolve a dictionary name to a file path, or to a list of paths when the
     * name contains a `*` glob character.
     *
     * Search paths, in order: '' (the name as given), the project dict root,
     * `pathPlus`, then the 'segment' folder of the segment-dict package.
     * Extensions tried, in order: '' (none), `extPlus`, '.utf8', '.txt'.
     *
     * @param {String} name dictionary file name or glob pattern
     * @param {String[]} [pathPlus] additional directories to search
     * @param {String[]} [extPlus] additional extensions to try
     * @return {String|String[]} one path, or an array of paths for a glob
     * @throws {Error} when nothing matches
     */
    _resolveDictFilename(name, pathPlus = [], extPlus = []) {
        const options = {
            paths: [
                '',
                project_config_1.default.dict_root,
                ...pathPlus,
                path.resolve(segment_dict_1.default.DICT_ROOT, 'segment'),
            ],
            extensions: [
                '',
                ...extPlus,
                '.utf8',
                '.txt',
            ],
            onlyFile: true,
        };
        if (name.includes('*')) {
            const matches = get_1.searchGlobSync(name, options);
            if (!matches || !matches.length) {
                throw new Error(`Cannot find dict glob file "${name}".`);
            }
            return matches;
        }
        const filename = get_1.searchFirstSync(name, options);
        if (!filename) {
            throw new Error(`Cannot find dict file "${name}".`);
        }
        return filename;
    }
    /**
     * Load a dictionary file into the table for `type`.
     *
     * @param {String} name dictionary file name (or glob pattern; every match
     *        is loaded with the same arguments)
     * @param {String} [type] table type; defaults to 'TABLE'
     * @param {Boolean} [convert_to_lower] lower-case the first field of every
     *        row before adding it
     * @param {Boolean} [skipExists] forwarded to the table's `add`
     *        (presumably: keep entries that already exist - confirm in the
     *        table implementation)
     * @return {Segment} this, for chaining
     */
    loadDict(name, type, convert_to_lower, skipExists) {
        const filename = this._resolveDictFilename(name);
        if (Array.isArray(filename)) {
            filename.forEach(v => this.loadDict(v, type, convert_to_lower, skipExists));
            return this;
        }
        const dictType = type || 'TABLE';
        const db = this.getDictDatabase(dictType, true);
        // Publish the table's storage objects on DICT under `type` / `type2`.
        this.DICT[dictType] = db.TABLE;
        this.DICT[dictType + '2'] = db.TABLE2;
        const rows = loader_1.default.SegmentDictLoader.loadSync(filename);
        rows.forEach((row) => {
            if (convert_to_lower) {
                row[0] = row[0].toLowerCase();
            }
            db.add(row, skipExists);
        });
        this.inited = true;
        return this;
    }
    /**
     * Load a synonym dictionary into the 'SYNONYM' table.
     *
     * The 'synonym' folder of the segment-dict package is added to the
     * search paths.
     *
     * @param {String} name dictionary file name or glob pattern
     * @param {Boolean} [skipExists] forwarded to the table's `add`
     * @return {Segment} this, for chaining
     */
    loadSynonymDict(name, skipExists) {
        const filename = this._resolveDictFilename(name, [
            path.resolve(segment_dict_1.default.DICT_ROOT, 'synonym'),
        ]);
        if (Array.isArray(filename)) {
            filename.forEach(v => this.loadSynonymDict(v, skipExists));
            return this;
        }
        const type = 'SYNONYM';
        const db = this.getDictDatabase(type, true);
        this.DICT[type] = db.TABLE;
        const rows = loader_1.default.SegmentSynonymLoader.loadSync(filename);
        rows.forEach((blocks) => {
            db.add(blocks, skipExists);
        });
        this.inited = true;
        return this;
    }
    /**
     * Shared implementation of the three blacklist loaders.
     *
     * The 'blacklist' folder of the segment-dict package is added to the
     * search paths; every line of the file is added to the table for `type`.
     *
     * @param {String} name dictionary file name or glob pattern
     * @param {String} type blacklist table type
     * @return {Segment} this, for chaining
     */
    _loadBlacklistDict(name, type) {
        const filename = this._resolveDictFilename(name, [
            path.resolve(segment_dict_1.default.DICT_ROOT, 'blacklist'),
        ]);
        if (Array.isArray(filename)) {
            filename.forEach(v => this._loadBlacklistDict(v, type));
            return this;
        }
        const db = this.getDictDatabase(type, true);
        this.DICT[type] = db.TABLE;
        loadTrimmedLines(filename).forEach(v => db.add(v));
        this.inited = true;
        return this;
    }
    /**
     * Dictionary blacklist: entries listed here are meant to be deleted from
     * the main dictionary (see `doBlacklist`).
     *
     * @param {String} name dictionary file name or glob pattern
     * @return {Segment} this, for chaining
     */
    loadBlacklistDict(name) {
        return this._loadBlacklistDict(name, "BLACKLIST" /* BLACKLIST */);
    }
    /**
     * Optimizer blacklist: prevents some optimizers from combining the words
     * listed here (for example automatic person-name combination).
     *
     * @param {String} name dictionary file name or glob pattern
     * @return {Segment} this, for chaining
     */
    loadBlacklistOptimizerDict(name) {
        return this._loadBlacklistDict(name, "BLACKLIST_FOR_OPTIMIZER" /* BLACKLIST_FOR_OPTIMIZER */);
    }
    /**
     * Conversion blacklist: words listed here are ignored when words are
     * converted dynamically.
     *
     * @param {String} name dictionary file name or glob pattern
     * @return {Segment} this, for chaining
     */
    loadBlacklistSynonymDict(name) {
        return this._loadBlacklistDict(name, "BLACKLIST_FOR_SYNONYM" /* BLACKLIST_FOR_SYNONYM */);
    }
    /**
     * Load a stopword dictionary into the 'STOPWORD' table.
     *
     * The 'stopword' folder of the segment-dict package is added to the
     * search paths.
     *
     * @param {String} name dictionary file name or glob pattern
     * @return {Segment} this, for chaining
     */
    loadStopwordDict(name) {
        const filename = this._resolveDictFilename(name, [
            path.resolve(segment_dict_1.default.DICT_ROOT, 'stopword'),
        ]);
        if (Array.isArray(filename)) {
            filename.forEach(v => this.loadStopwordDict(v));
            return this;
        }
        const type = "STOPWORD" /* STOPWORD */;
        const db = this.getDictDatabase(type, true);
        this.DICT[type] = db.TABLE;
        loadTrimmedLines(filename).forEach(v => db.add(v));
        this.inited = true;
        return this;
    }
    /**
     * Apply the default setup via the defaults module and mark the segmenter
     * as initialised.
     *
     * @param {...*} argv arguments forwarded unchanged
     * @return {Segment} this, for chaining
     */
    useDefault(...argv) {
        index_1.useDefault(this, ...argv);
        this.inited = true;
        return this;
    }
    /**
     * Lazy initialisation. Only needs to run once and normally does not have
     * to be called manually.
     *
     * When the segmenter is not initialised yet it is marked as initialised,
     * and the default setup is applied if no tokenizer module is registered.
     *
     * @param {Object} [options] forwarded to `useDefault`
     * @return {Segment} this, for chaining
     */
    autoInit(options) {
        if (!this.inited) {
            this.inited = true;
            if (!this.modules.tokenizer.length) {
                this.useDefault(options);
            }
        }
        return this;
    }
    /**
     * Add a word to the blacklist (also removing it from the main table), or
     * take it off the blacklist again.
     *
     * @param {String} word the word to (un)blacklist
     * @param {Boolean} [remove] when truthy the word is removed from the
     *        blacklist instead of added; the main table is left untouched
     * @return {Segment} this, for chaining
     */
    addBlacklist(word, remove) {
        this.autoInit(this.options);
        const blacklist = this.getDictDatabase("BLACKLIST" /* BLACKLIST */);
        const table = this.getDictDatabase("TABLE" /* TABLE */);
        if (remove) {
            blacklist.remove(word);
        }
        else {
            blacklist.add(word);
            table.remove(word);
        }
        return this;
    }
    /**
     * Remove from the main table every key that is enabled (truthy) in the
     * BLACKLIST dictionary.
     *
     * @return {Segment} this, for chaining
     */
    doBlacklist() {
        this.autoInit(this.options);
        const blacklist = this.getDict("BLACKLIST" /* BLACKLIST */);
        const table = this.getDictDatabase("TABLE" /* TABLE */);
        // `getDict` is inherited; when no blacklist dictionary is available it
        // may hand back nothing, and Object.entries(undefined) would throw.
        if (blacklist) {
            for (const [word, enabled] of Object.entries(blacklist)) {
                if (enabled) {
                    table.remove(word);
                }
            }
        }
        return this;
    }
    /**
     * Segment `text`, making sure the segmenter is initialised first.
     *
     * @param {*} text input forwarded to the core implementation
     * @param {Object} [options] per-call options forwarded to the core
     *        implementation
     * @return {*} the result of the core `doSegment`
     */
    doSegment(text, options = {}) {
        this.autoInit(this.options);
        return super.doSegment(text, options);
    }
}
// Attach the default doSegment options to the class as a static member, then
// publish the class as both the named `Segment` export and the default export.
Segment.defaultOptionsDoSegment = defaults_1.defaultOptionsDoSegment;
exports.default = exports.Segment = Segment;
300//# sourceMappingURL=data:application/json;base64,
\No newline at end of file