UNPKG

37.2 kBJavaScriptView Raw
1/**
2 * 分词器接口
3 *
4 * @author 老雷<leizongmin@gmail.com>
5 */
6'use strict';
7Object.defineProperty(exports, "__esModule", { value: true });
8const path = require("path");
9const get_1 = require("./fs/get");
10const blacklist_1 = require("./table/blacklist");
11const dict_1 = require("./table/dict");
12const loader_1 = require("./loader");
13const stopword_1 = require("./table/stopword");
14const synonym_1 = require("./table/synonym");
15const segment_dict_1 = require("segment-dict");
16const project_config_1 = require("../project.config");
17const core_1 = require("./segment/core");
18const defaults_1 = require("./segment/defaults");
19const index_1 = require("./defaults/index");
20const useModules2_1 = require("./segment/methods/useModules2");
/**
 * Word segmenter front end.
 *
 * Extends the core segmenter (`core_1.default`) with dictionary loading
 * (main, synonym, stopword and blacklist tables), module registration and
 * lazy default initialisation.
 */
class Segment extends core_1.default {
    /**
     * Return the table object stored under `type`, creating it on demand.
     *
     * A table is only created when `autocreate` is truthy or the segmenter is
     * already marked as initialised, and only if none exists yet for `type`.
     *
     * @param {String} type dictionary type key, e.g. 'TABLE', 'SYNONYM', 'STOPWORD', 'BLACKLIST'
     * @param {Boolean} [autocreate] create the table even before initialisation
     * @param {Function} [libTableDict] table class to instantiate instead of the default for `type`
     * @return {Object|undefined} the table for `type`, or undefined when none exists and none was created
     */
    getDictDatabase(type, autocreate, libTableDict) {
        if ((autocreate || this.inited) && !this.db[type]) {
            // Pick the table class that matches the dictionary type; anything
            // unrecognised falls back to the generic word table.
            let DefaultTable;
            if (type === synonym_1.default.type) {
                DefaultTable = synonym_1.default;
            }
            else if (type === stopword_1.TableDictStopword.type) {
                DefaultTable = stopword_1.TableDictStopword;
            }
            else if (type === blacklist_1.default.type
                || type === "BLACKLIST_FOR_OPTIMIZER"
                || type === "BLACKLIST_FOR_SYNONYM") {
                DefaultTable = blacklist_1.default;
            }
            else {
                DefaultTable = dict_1.TableDict;
            }
            // An explicitly supplied class always wins over the default.
            const TableClass = libTableDict || DefaultTable;
            this.db[type] = new TableClass(type, this.options, {
                // Seed the new table with whatever is already stored for this type.
                TABLE: this.DICT[type],
            });
        }
        return this.db[type];
    }

    /**
     * Register one or more segmentation modules and mark the segmenter as
     * initialised. Registration itself is delegated to `useModules`.
     *
     * @param {String|Array|Object} mod module name(s) or module object(s)
     * @param {...*} argv extra arguments forwarded to `useModules`
     * @return {Segment} this
     */
    use(mod, ...argv) {
        useModules2_1.useModules(this, mod, ...argv);
        this.inited = true;
        return this;
    }

    /**
     * Resolve a dictionary name to a file path.
     *
     * Search directories, in order: '' (the name as given), the project
     * dictionary root, every entry of `pathPlus`, then the `segment` folder
     * of the bundled dictionary package. Tried extensions, in order: none,
     * every entry of `extPlus`, '.utf8', '.txt'.
     *
     * @param {String} name dictionary file name; a name containing '*' is treated as a glob
     * @param {String[]} [pathPlus] additional directories to search
     * @param {String[]} [extPlus] additional file extensions to try
     * @return {String|String[]} a single file path, or the list of matches for a glob
     * @throws {Error} when nothing matches
     */
    _resolveDictFilename(name, pathPlus = [], extPlus = []) {
        const options = {
            paths: [
                '',
                project_config_1.default.dict_root,
                ...pathPlus,
                path.resolve(segment_dict_1.default.DICT_ROOT, 'segment'),
            ],
            extensions: [
                '',
                ...extPlus,
                '.utf8',
                '.txt',
            ],
            onlyFile: true,
        };
        if (name.includes('*')) {
            const matches = get_1.searchGlobSync(name, options);
            if (!matches || !matches.length) {
                throw new Error(`Cannot find dict glob file "${name}".`);
            }
            return matches;
        }
        const filename = get_1.searchFirstSync(name, options);
        if (!filename) {
            throw new Error(`Cannot find dict file "${name}".`);
        }
        return filename;
    }

    /**
     * Load a dictionary file into the table for `type`.
     *
     * When `name` resolves to several files (glob), each one is loaded in turn.
     *
     * @param {String} name dictionary file name
     * @param {String} [type] dictionary type; defaults to 'TABLE'
     * @param {Boolean} [convert_to_lower] lower-case every word before adding it
     * @param {Boolean} [skipExists] forwarded as the second argument of the table's `add`
     * @return {Segment} this
     */
    loadDict(name, type, convert_to_lower, skipExists) {
        const filename = this._resolveDictFilename(name);
        if (Array.isArray(filename)) {
            filename.forEach((file) => this.loadDict(file, type, convert_to_lower, skipExists));
            return this;
        }
        const dictType = type || 'TABLE';
        const db = this.getDictDatabase(dictType, true);
        // Expose the table's storage through this.DICT under `type` and `type + '2'`.
        this.DICT[dictType] = db.TABLE;
        this.DICT[`${dictType}2`] = db.TABLE2;
        const rows = loader_1.default.SegmentDictLoader.loadSync(filename);
        rows.forEach((row) => {
            if (convert_to_lower) {
                // row[0] is the word itself.
                row[0] = row[0].toLowerCase();
            }
            db.add(row, skipExists);
        });
        this.inited = true;
        return this;
    }

    /**
     * Load a synonym dictionary file into the 'SYNONYM' table.
     *
     * The `synonym` folder of the bundled dictionary package is added to the
     * search path.
     *
     * @param {String} name dictionary file name
     * @param {Boolean} [skipExists] forwarded as the second argument of the table's `add`
     * @return {Segment} this
     */
    loadSynonymDict(name, skipExists) {
        const filename = this._resolveDictFilename(name, [
            path.resolve(segment_dict_1.default.DICT_ROOT, 'synonym'),
        ]);
        if (Array.isArray(filename)) {
            filename.forEach((file) => this.loadSynonymDict(file, skipExists));
            return this;
        }
        const type = 'SYNONYM';
        const db = this.getDictDatabase(type, true);
        this.DICT[type] = db.TABLE;
        const rows = loader_1.default.SegmentSynonymLoader.loadSync(filename);
        rows.forEach((blocks) => {
            db.add(blocks, skipExists);
        });
        this.inited = true;
        return this;
    }

    /**
     * Load a line-based blacklist file into the table for `type`.
     *
     * The `blacklist` folder of the bundled dictionary package is added to
     * the search path. Each line is trimmed by the loader filter before it is
     * added.
     *
     * @param {String} name dictionary file name
     * @param {String} type 'BLACKLIST', 'BLACKLIST_FOR_OPTIMIZER' or 'BLACKLIST_FOR_SYNONYM'
     * @return {Segment} this
     */
    _loadBlacklistDict(name, type) {
        const filename = this._resolveDictFilename(name, [
            path.resolve(segment_dict_1.default.DICT_ROOT, 'blacklist'),
        ]);
        if (Array.isArray(filename)) {
            filename.forEach((file) => this._loadBlacklistDict(file, type));
            return this;
        }
        const db = this.getDictDatabase(type, true);
        this.DICT[type] = db.TABLE;
        const lines = loader_1.default.SegmentDict
            .requireLoaderModule('line')
            .loadSync(filename, {
                filter(line) {
                    return line.trim();
                },
            });
        lines.forEach((word) => db.add(word));
        this.inited = true;
        return this;
    }

    /**
     * Dictionary blacklist: entries listed here are meant to be removed from
     * the main dictionary.
     *
     * @param {String} name dictionary file name
     * @return {Segment} this
     */
    loadBlacklistDict(name) {
        return this._loadBlacklistDict(name, "BLACKLIST");
    }

    /**
     * Optimizer blacklist: keeps some optimizers from combining the words
     * listed here (for example automatic person-name merging).
     *
     * @param {String} name dictionary file name
     * @return {Segment} this
     */
    loadBlacklistOptimizerDict(name) {
        return this._loadBlacklistDict(name, "BLACKLIST_FOR_OPTIMIZER");
    }

    /**
     * Conversion blacklist: words listed here are ignored when words are
     * converted dynamically.
     *
     * @param {String} name dictionary file name
     * @return {Segment} this
     */
    loadBlacklistSynonymDict(name) {
        return this._loadBlacklistDict(name, "BLACKLIST_FOR_SYNONYM");
    }

    /**
     * Load a line-based stopword dictionary file into the 'STOPWORD' table.
     *
     * The `stopword` folder of the bundled dictionary package is added to the
     * search path. Each line is trimmed by the loader filter before it is
     * added.
     *
     * @param {String} name dictionary file name
     * @return {Segment} this
     */
    loadStopwordDict(name) {
        const filename = this._resolveDictFilename(name, [
            path.resolve(segment_dict_1.default.DICT_ROOT, 'stopword'),
        ]);
        if (Array.isArray(filename)) {
            filename.forEach((file) => this.loadStopwordDict(file));
            return this;
        }
        const type = "STOPWORD";
        const db = this.getDictDatabase(type, true);
        this.DICT[type] = db.TABLE;
        const lines = loader_1.default.SegmentDict
            .requireLoaderModule('line')
            .loadSync(filename, {
                filter(line) {
                    return line.trim();
                },
            });
        lines.forEach((word) => db.add(word));
        this.inited = true;
        return this;
    }

    /**
     * Apply the default setup by delegating to `useDefault` from
     * `./defaults/index`, then mark the segmenter as initialised.
     *
     * @param {...*} argv arguments forwarded to `useDefault`
     * @return {Segment} this
     */
    useDefault(...argv) {
        index_1.useDefault(this, ...argv);
        this.inited = true;
        return this;
    }

    /**
     * One-time lazy initialisation; normally there is no need to call this
     * by hand.
     *
     * Does nothing once the segmenter is marked as initialised. Otherwise it
     * sets that mark and, if no tokenizer module is registered, calls
     * `useDefault(options)`.
     *
     * @param {Object} [options] forwarded to `useDefault`
     * @return {Segment} this
     */
    autoInit(options) {
        if (!this.inited) {
            // Set the flag first, before useDefault runs.
            this.inited = true;
            if (!this.modules.tokenizer.length) {
                this.useDefault(options);
            }
        }
        return this;
    }

    /**
     * Add a word to the blacklist (also removing it from the main table), or
     * take it off the blacklist again.
     *
     * @param {String} word the word to add or remove
     * @param {Boolean} [remove] when truthy, remove `word` from the blacklist instead of adding it
     * @return {Segment} this
     */
    addBlacklist(word, remove) {
        this.autoInit(this.options);
        // Both tables are fetched up front; after autoInit the segmenter is
        // initialised, so getDictDatabase creates them if they are missing.
        const blacklist = this.getDictDatabase("BLACKLIST");
        const table = this.getDictDatabase("TABLE");
        if (remove) {
            blacklist.remove(word);
        }
        else {
            blacklist.add(word);
            table.remove(word);
        }
        return this;
    }

    /**
     * Remove from the main table every key that is enabled (truthy value) in
     * the blacklist dictionary.
     *
     * @return {Segment} this
     */
    doBlacklist() {
        this.autoInit(this.options);
        // NOTE(review): getDict is defined in the base class; presumably it
        // yields nothing when no blacklist was ever loaded. Object.entries
        // throws on undefined/null, so treat that case as an empty blacklist.
        const blacklist = this.getDict("BLACKLIST") || {};
        const table = this.getDictDatabase("TABLE");
        for (const [key, enabled] of Object.entries(blacklist)) {
            if (enabled) {
                table.remove(key);
            }
        }
        return this;
    }

    /**
     * Segment `text`, running the lazy initialisation first so a segmenter
     * that was never configured still works.
     *
     * @param {String|Buffer} text input text
     * @param {Object} [options] segmentation options, passed to the base implementation
     * @return {Array} whatever the base class `doSegment` returns
     */
    doSegment(text, options = {}) {
        this.autoInit(this.options);
        return super.doSegment(text, options);
    }
}
exports.Segment = Segment;
Segment.defaultOptionsDoSegment = defaults_1.defaultOptionsDoSegment;
exports.default = Segment;
300//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"Segment.js","sourceRoot":"","sources":["Segment.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,YAAY,CAAC;;AAEb,6BAA6B;AAC7B,kCAA2D;AAE3D,iDAAmD;AAEnD,uCAAwE;AAExE,qCAA8B;AAE9B,+CAAqD;AACrD,6CAA+C;AAC/C,+CAAuC;AAKvC,sDAA8C;AAkB9C,yCAAyC;AAGzC,iDAA6D;AAC7D,4CAAkE;AAClE,+DAA2D;AAE3D;;GAEG;AACH,MAAa,OAAQ,SAAQ,cAAW;IAiCvC,eAAe,CAAC,IAAY,EAAE,UAAoB,EAAE,YAAa;QAEhE,IAAI,CAAC,UAAU,IAAI,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,EACjD;YACC,IAAI,IAAI,IAAI,iBAAgB,CAAC,IAAI,EACjC;gBACC,YAAY,GAAG,YAAY,IAAI,iBAAgB,CAAC;aAChD;iBACI,IAAI,IAAI,IAAI,4BAAiB,CAAC,IAAI,EACvC;gBACC,YAAY,GAAG,YAAY,IAAI,4BAAiB,CAAC;aACjD;iBACI,IAAI,IAAI,IAAI,mBAAkB,CAAC,IAAI,IAAI,IAAI,2DAA4C,IAAI,IAAI,uDAA0C,EAC9I;gBACC,YAAY,GAAG,YAAY,IAAI,mBAAkB,CAAC;aAClD;iBAED;gBACC,YAAY,GAAG,YAAY,IAAI,gBAAS,CAAC;aACzC;YAED,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,YAAY,CAAC,IAAI,EAAE,IAAI,CAAC,OAAO,EAAE;gBACpD,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC;aACtB,CAAC,CAAC;SACH;QAED,OAAO,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC;IACtB,CAAC;IAaD,GAAG,CAAC,GAAG,EAAE,GAAG,IAAI;QAEf,wBAAU,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC;QAE/B,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;QAEnB,OAAO,IAAI,CAAC;IACb,CAAC;IAED,oBAAoB,CAAC,IAAY,EAAE,WAAqB,EAAE,EAAE,UAAoB,EAAE;QAEjF,IAAI,OAAO,GAAG;YACb,KAAK,EAAE;gBACN,EAAE;gBACF,wBAAa,CAAC,SAAS;gBAEvB,GAAG,QAAQ;gBACX,IAAI,CAAC,OAAO,CAAC,sBAAW,CAAC,SAAS,EAAE,SAAS,CAAC;aAC9C;YACD,UAAU,EAAE;gBACX,EAAE;gBACF,GAAG,OAAO;gBACV,OAAO;gBACP,MAAM;aACN;YAED,QAAQ,EAAE,IAAI;SACd,CAAC;QAEF,IAAI,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,EAC3B;YACC,IAAI,EAAE,GAAG,oBAAc,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;YAEvC,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,CAAC,MAAM,EACrB;gBACC,MAAM,KAAK,CAAC,+BAA+B,IAAI,IAAI,CAAC,CAAC;aACrD;YAED,OAAO,EAAE,CAAC;SACV;QAED,IAAI,QAAQ,GAAG,qBAAe,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QAE9C,IAAI,CAAC,QAAQ,EACb;YACC,uCAAuC;YAEvC,MAAM,KAAK,CAAC,0BAA0B,IAAI,IAAI,CAAC,CAAC;SAChD;QAED,OAAO,QAAQ,CAAC;IACjB,CAAC;IAED;;;;;;;OAOG;IACH,QAAQ,CAAC,IAAY,EA
AE,IAAa,EAAE,gBAA0B,EAAE,UAAoB;QAErF,IAAI,QAAQ,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC,CAAC;QAE/C,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,EAC3B;YACC,IAAI,IAAI,GAAG,IAAI,CAAC;YAEhB,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,EAAE,IAAI,EAAE,gBAAgB,EAAE,UAAU,CAAC,CAAC,CAAC;YAE5E,wBAAwB;YAExB,OAAO,IAAI,CAAC;SACZ;QAED,IAAI,CAAC,IAAI;YAAE,IAAI,GAAG,OAAO,CAAC,CAAK,WAAW;QAE1C,MAAM,EAAE,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAE5C,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC;QACzC,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC;QAEjD;;;;;;UAME;QACF,OAAO;QACP,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;QAE3B,IAAI,IAAI,GAAG,gBAAM,CAAC,iBAAiB,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;QAEvD,IAAI,CAAC,OAAO,CAAC,UAAU,IAAI;YAE1B,IAAI,gBAAgB,EACpB;gBACC,IAAI,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;aAChC;YAED,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;YAEzB;;;;;;;;;;;cAWE;QACH,CAAC,CAAC,CAAC;QAEH,IAAI,GAAG,SAAS,CAAC;QAEjB,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;QAEnB,OAAO,IAAI,CAAC;IACb,CAAC;IAED;;;;OAIG;IACH,eAAe,CAAC,IAAY,EAAE,UAAoB;QAEjD,IAAI,QAAQ,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE;YAC9C,IAAI,CAAC,OAAO,CAAC,sBAAW,CAAC,SAAS,EAAE,SAAS,CAAC;SAC9C,CAAC,CAAC;QAEH,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,EAC3B;YACC,IAAI,IAAI,GAAG,IAAI,CAAC;YAEhB,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC,CAAC;YAE3D,OAAO,IAAI,CAAC;SACZ;QAED,IAAI,IAAI,GAAG,SAAS,CAAC;QAErB,MAAM,EAAE,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAE5C,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC;QAEzC;;;;;;UAME;QAEF,IAAI,IAAI,GAAG,gBAAM,CAAC,oBAAoB,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;QAE1D,IAAI,CAAC,OAAO,CAAC,UAAU,MAAgB;YAEtC,EAAE,CAAC,GAAG,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;YAE3B;;;;;;;;cAQE;QACH,CAAC,CAAC,CAAC;QAEH,qBAAqB;QAErB,IAAI,GAAG,SAAS,CAAC;QAEjB,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;QAEnB,OAAO,IAAI,CAAC;IACb,CAAC;IAES,kBAAkB,CAAC,IAAY,EAAE,IAAsB;QAEhE,IAAI,QAAQ,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAA
E;YAC9C,IAAI,CAAC,OAAO,CAAC,sBAAW,CAAC,SAAS,EAAE,WAAW,CAAC;SAChD,CAAC,CAAC;QAEH,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,EAC3B;YACC,IAAI,IAAI,GAAG,IAAI,CAAC;YAEhB,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;YAExD,OAAO,IAAI,CAAC;SACZ;QAED,MAAM,EAAE,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAE5C,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC;QAEzC,IAAI,IAAI,GAAG,gBAAM,CAAC,WAAW;aAC3B,mBAAmB,CAAC,MAAM,CAAC;aAC3B,QAAQ,CAAC,QAAQ,EAAE;YACnB,MAAM,CAAC,IAAY;gBAElB,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC;YACpB,CAAC;SACD,CAAC,CACF;QAED,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAE7B,IAAI,GAAG,SAAS,CAAC;QAEjB,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;QAEnB,OAAO,IAAI,CAAC;IACb,CAAC;IAED;;OAEG;IACH,iBAAiB,CAAC,IAAY;QAE7B,OAAO,IAAI,CAAC,kBAAkB,CAAC,IAAI,8BAA6B,CAAA;IACjE,CAAC;IAED;;;OAGG;IACH,0BAA0B,CAAC,IAAY;QAEtC,OAAO,IAAI,CAAC,kBAAkB,CAAC,IAAI,0DAA2C,CAAA;IAC/E,CAAC;IAED;;OAEG;IACH,wBAAwB,CAAC,IAAY;QAEpC,OAAO,IAAI,CAAC,kBAAkB,CAAC,IAAI,sDAAyC,CAAA;IAC7E,CAAC;IAED;;;;OAIG;IACH,gBAAgB,CAAC,IAAY;QAE5B,IAAI,QAAQ,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE;YAC9C,IAAI,CAAC,OAAO,CAAC,sBAAW,CAAC,SAAS,EAAE,UAAU,CAAC;SAC/C,CAAC,CAAC;QAEH,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,EAC3B;YACC,IAAI,IAAI,GAAG,IAAI,CAAC;YAEhB,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC,CAAC,CAAC,CAAC;YAEhD,OAAO,IAAI,CAAC;SACZ;QAED,MAAM,IAAI,4BAA4B,CAAC;QAEvC,MAAM,EAAE,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAE5C,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC;QAEzC,IAAI,IAAI,GAAG,gBAAM,CAAC,WAAW;aAC3B,mBAAmB,CAAC,MAAM,CAAC;aAC3B,QAAQ,CAAC,QAAQ,EAAE;YACnB,MAAM,CAAC,IAAY;gBAElB,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC;YACpB,CAAC;SACD,CAAC,CACF;QAED,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAE7B,IAAI,GAAG,SAAS,CAAC;QAEjB,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;QAEnB,OAAO,IAAI,CAAC;IACb,CAAC;IASD,UAAU,CAAC,GAAG,IAAI;QAEjB,kBAAU,CAAC,IAAI,EAAE,GAAG,IAAI,CAAC,CAAC;QAE
1B,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;QAEnB,OAAO,IAAI,CAAC;IACb,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,OAA4B;QAEpC,IAAI,CAAC,IAAI,CAAC,MAAM,EAChB;YACC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;YAEnB,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,EAClC;gBACC,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC;aACzB;SACD;QAED,OAAO,IAAI,CAAC;IACb,CAAC;IAED,YAAY,CAAC,IAAY,EAAE,MAAgB;QAE1C,IAAI,EAAE,GAAG,IAAI,CAAC;QAEd,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAE5B,MAAM,SAAS,GAAG,EAAE,CAAC,eAAe,6BAA4B,CAAC;QACjE,MAAM,KAAK,GAAG,EAAE,CAAC,eAAe,qBAAwB,CAAC;QAEzD,IAAI,IAAI,GAAG,CAAC,MAAM,CAAC;QAEnB,IAAI,IAAI,EACR;YACC,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACpB,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;SACnB;aAED;YACC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;SACtB;QAED,OAAO,IAAI,CAAA;IACZ,CAAC;IAED;;OAEG;IACH,WAAW;QAEV,IAAI,EAAE,GAAG,IAAI,CAAC;QAEd,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAE5B,MAAM,SAAS,GAAG,EAAE,CAAC,OAAO,6BAA4B,CAAC;QACzD,MAAM,KAAK,GAAG,EAAE,CAAC,eAAe,qBAAwB,CAAC;QAEzD,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC;aACvB,OAAO,CAAC,UAAU,CAAC,GAAG,EAAE,IAAI,CAAC;YAE7B,IAAI,IAAI,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;QAC1B,CAAC,CAAC,CACF;QAED,OAAO,IAAI,CAAA;IACZ,CAAC;IAiBD,SAAS,CAAC,IAAI,EAAE,UAA6B,EAAE;QAE9C,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAE5B,OAAO,KAAK,CAAC,SAAS,CAAC,IAAI,EAAE,OAAO,CAAQ,CAAA;IAC7C,CAAC;;AAldF,0BAodC;AAjdO,+BAAuB,GAAsB,kCAAuB,CAAC;AA0f7E,kBAAe,OAAO,CAAC","sourcesContent":["/**\n * 分词器接口\n *\n * @author 老雷<leizongmin@gmail.com>\n */\n\n'use strict';\n\nimport * as path from 'path';\nimport { searchFirstSync, searchGlobSync } from './fs/get';\nimport POSTAG from './POSTAG';\nimport TableDictBlacklist from './table/blacklist';\nimport AbstractTableDictCore from './table/core';\nimport { IOptions as IOptionsTableDict, TableDict } from './table/dict';\n\nimport Loader from './loader';\nimport { crlf } from 'crlf-normalize';\nimport { TableDictStopword } from './table/stopword';\nimport TableDictSynonym from './table/synonym';\nimport SegmentDict from 'segment-dict';\nimport { ISubOptimizer, 
ISubTokenizer, Optimizer, Tokenizer } from './mod';\nimport { debugToken } from './util/debug';\nimport { IWordDebug } from './util/index';\n\nimport ProjectConfig from '../project.config';\n\nimport deepmerge from 'deepmerge-plus/core';\nimport { EnumDictDatabase } from './const';\nimport { ENUM_SUBMODS, ENUM_SUBMODS_NAME, ENUM_SUBMODS_OTHER } from './mod/index';\n\nimport {\n\tIDICT,\n\tIDICT2,\n\tIDICT_BLACKLIST,\n\tIDICT_STOPWORD,\n\tIDICT_SYNONYM,\n\tIOptionsDoSegment,\n\tIOptionsSegment,\n\tISPLIT,\n\tISPLIT_FILTER,\n\tIWord,\n} from './segment/types';\nimport SegmentCore from './segment/core';\nimport { _isIgnoreModules } from './segment/methods/useModules';\nimport { ITSOverwrite } from 'ts-type';\nimport { defaultOptionsDoSegment } from './segment/defaults';\nimport { IUseDefaultOptions, useDefault } from './defaults/index';\nimport { useModules } from './segment/methods/useModules2';\n\n/**\n * 创建分词器接口\n */\nexport class Segment extends SegmentCore\n{\n\n\tstatic defaultOptionsDoSegment: IOptionsDoSegment = defaultOptionsDoSegment;\n\n\tgetDictDatabase<R extends TableDictSynonym>(type: EnumDictDatabase.SYNONYM,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDict>(type: EnumDictDatabase.TABLE,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDictStopword>(type: EnumDictDatabase.STOPWORD,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDictBlacklist>(type: EnumDictDatabase.BLACKLIST,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDictBlacklist>(type: EnumDictDatabase.BLACKLIST_FOR_OPTIMIZER,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDictBlacklist>(type: EnumDictDatabase.BLACKLIST_FOR_SYNONYM,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { 
new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends AbstractTableDictCore<any>>(type: string | EnumDictDatabase,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase(type: string, autocreate?: boolean, libTableDict?)\n\t{\n\t\tif ((autocreate || this.inited) && !this.db[type])\n\t\t{\n\t\t\tif (type == TableDictSynonym.type)\n\t\t\t{\n\t\t\t\tlibTableDict = libTableDict || TableDictSynonym;\n\t\t\t}\n\t\t\telse if (type == TableDictStopword.type)\n\t\t\t{\n\t\t\t\tlibTableDict = libTableDict || TableDictStopword;\n\t\t\t}\n\t\t\telse if (type == TableDictBlacklist.type || type == EnumDictDatabase.BLACKLIST_FOR_OPTIMIZER || type == EnumDictDatabase.BLACKLIST_FOR_SYNONYM)\n\t\t\t{\n\t\t\t\tlibTableDict = libTableDict || TableDictBlacklist;\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\tlibTableDict = libTableDict || TableDict;\n\t\t\t}\n\n\t\t\tthis.db[type] = new libTableDict(type, this.options, {\n\t\t\t\tTABLE: this.DICT[type],\n\t\t\t});\n\t\t}\n\n\t\treturn this.db[type];\n\t}\n\n\t/**\n\t * 载入分词模块\n\t *\n\t * @param {String|Array|Object} module 模块名称(数组)或模块对象\n\t * @return {Segment}\n\t */\n\tuse(mod: ISubOptimizer, ...argv)\n\tuse(mod: ISubTokenizer, ...argv)\n\tuse(mod: Array<ISubTokenizer | ISubOptimizer | string>, ...argv)\n\tuse(mod: string, ...argv)\n\tuse(mod, ...argv)\n\tuse(mod, ...argv)\n\t{\n\t\tuseModules(this, mod, ...argv);\n\n\t\tthis.inited = true;\n\n\t\treturn this;\n\t}\n\n\t_resolveDictFilename(name: string, pathPlus: string[] = [], extPlus: string[] = []): string | string[]\n\t{\n\t\tlet options = {\n\t\t\tpaths: [\n\t\t\t\t'',\n\t\t\t\tProjectConfig.dict_root,\n\n\t\t\t\t...pathPlus,\n\t\t\t\tpath.resolve(SegmentDict.DICT_ROOT, 'segment'),\n\t\t\t],\n\t\t\textensions: [\n\t\t\t\t'',\n\t\t\t\t...extPlus,\n\t\t\t\t'.utf8',\n\t\t\t\t'.txt',\n\t\t\t],\n\n\t\t\tonlyFile: true,\n\t\t};\n\n\t\tif (name.indexOf('*') != -1)\n\t\t{\n\t\t\tlet ls = searchGlobSync(name, options);\n\n\t\t\tif (!ls || 
!ls.length)\n\t\t\t{\n\t\t\t\tthrow Error(`Cannot find dict glob file \"${name}\".`);\n\t\t\t}\n\n\t\t\treturn ls;\n\t\t}\n\n\t\tlet filename = searchFirstSync(name, options);\n\n\t\tif (!filename)\n\t\t{\n\t\t\t//console.log(name, pathPlus, extPlus);\n\n\t\t\tthrow Error(`Cannot find dict file \"${name}\".`);\n\t\t}\n\n\t\treturn filename;\n\t}\n\n\t/**\n\t * 载入字典文件\n\t *\n\t * @param {String} name 字典文件名\n\t * @param {String} type 类型\n\t * @param {Boolean} convert_to_lower 是否全部转换为小写\n\t * @return {Segment}\n\t */\n\tloadDict(name: string, type?: string, convert_to_lower?: boolean, skipExists?: boolean)\n\t{\n\t\tlet filename = this._resolveDictFilename(name);\n\n\t\tif (Array.isArray(filename))\n\t\t{\n\t\t\tlet self = this;\n\n\t\t\tfilename.forEach(v => this.loadDict(v, type, convert_to_lower, skipExists));\n\n\t\t\t//console.log(filename);\n\n\t\t\treturn this;\n\t\t}\n\n\t\tif (!type) type = 'TABLE';     // 默认为TABLE\n\n\t\tconst db = this.getDictDatabase(type, true);\n\n\t\tconst TABLE = this.DICT[type] = db.TABLE;\n\t\tconst TABLE2 = this.DICT[type + '2'] = db.TABLE2;\n\n\t\t/*\n\t\t// 初始化词典\n\t\tif (!this.DICT[type]) this.DICT[type] = {};\n\t\tif (!this.DICT[type + '2']) this.DICT[type + '2'] = {};\n\t\tlet TABLE = this.DICT[type];        // 词典表  '词' => {属性}\n\t\tlet TABLE2 = this.DICT[type + '2']; // 词典表  '长度' => '词' => 属性\n\t\t*/\n\t\t// 导入数据\n\t\tconst POSTAG = this.POSTAG;\n\n\t\tlet data = Loader.SegmentDictLoader.loadSync(filename);\n\n\t\tdata.forEach(function (data)\n\t\t{\n\t\t\tif (convert_to_lower)\n\t\t\t{\n\t\t\t\tdata[0] = data[0].toLowerCase();\n\t\t\t}\n\n\t\t\tdb.add(data, skipExists);\n\n\t\t\t/*\n\t\t\tlet [w, p, f] = data;\n\n\t\t\tif (w.length == 0)\n\t\t\t{\n\t\t\t\tthrow new Error()\n\t\t\t}\n\n\t\t\tTABLE[w] = { p, f, };\n\t\t\tif (!TABLE2[w.length]) TABLE2[w.length] = {};\n\t\t\tTABLE2[w.length][w] = TABLE[w];\n\t\t\t*/\n\t\t});\n\n\t\tdata = undefined;\n\n\t\tthis.inited = true;\n\n\t\treturn this;\n\t}\n\n\t/**\n\t * 载入同义词词典\n\t 
*\n\t * @param {String} name 字典文件名\n\t */\n\tloadSynonymDict(name: string, skipExists?: boolean)\n\t{\n\t\tlet filename = this._resolveDictFilename(name, [\n\t\t\tpath.resolve(SegmentDict.DICT_ROOT, 'synonym'),\n\t\t]);\n\n\t\tif (Array.isArray(filename))\n\t\t{\n\t\t\tlet self = this;\n\n\t\t\tfilename.forEach(v => this.loadSynonymDict(v, skipExists));\n\n\t\t\treturn this;\n\t\t}\n\n\t\tlet type = 'SYNONYM';\n\n\t\tconst db = this.getDictDatabase(type, true);\n\n\t\tconst TABLE = this.DICT[type] = db.TABLE;\n\n\t\t/*\n\t\t// 初始化词典\n\t\tif (!this.DICT[type]) this.DICT[type] = {};\n\t\t// 词典表  '同义词' => '标准词'\n\t\tlet TABLE = this.DICT[type] as IDICT_SYNONYM;\n\t\t// 导入数据\n\t\t*/\n\n\t\tlet data = Loader.SegmentSynonymLoader.loadSync(filename);\n\n\t\tdata.forEach(function (blocks: string[])\n\t\t{\n\t\t\tdb.add(blocks, skipExists);\n\n\t\t\t/*\n\t\t\tlet [n1, n2] = blocks;\n\n\t\t\tTABLE[n1] = n2;\n\t\t\tif (TABLE[n2] === n1)\n\t\t\t{\n\t\t\t\tdelete TABLE[n2];\n\t\t\t}\n\t\t\t*/\n\t\t});\n\n\t\t//console.log(TABLE);\n\n\t\tdata = undefined;\n\n\t\tthis.inited = true;\n\n\t\treturn this;\n\t}\n\n\tprotected _loadBlacklistDict(name: string, type: EnumDictDatabase)\n\t{\n\t\tlet filename = this._resolveDictFilename(name, [\n\t\t\tpath.resolve(SegmentDict.DICT_ROOT, 'blacklist'),\n\t\t]);\n\n\t\tif (Array.isArray(filename))\n\t\t{\n\t\t\tlet self = this;\n\n\t\t\tfilename.forEach(v => this._loadBlacklistDict(v, type));\n\n\t\t\treturn this;\n\t\t}\n\n\t\tconst db = this.getDictDatabase(type, true);\n\n\t\tconst TABLE = this.DICT[type] = db.TABLE;\n\n\t\tlet data = Loader.SegmentDict\n\t\t\t.requireLoaderModule('line')\n\t\t\t.loadSync(filename, {\n\t\t\t\tfilter(line: string)\n\t\t\t\t{\n\t\t\t\t\treturn line.trim();\n\t\t\t\t},\n\t\t\t})\n\t\t;\n\n\t\tdata.forEach(v => db.add(v));\n\n\t\tdata = undefined;\n\n\t\tthis.inited = true;\n\n\t\treturn this;\n\t}\n\n\t/**\n\t * 字典黑名單 在主字典內刪除此字典內有的條目\n\t */\n\tloadBlacklistDict(name: string)\n\t{\n\t\treturn 
this._loadBlacklistDict(name, EnumDictDatabase.BLACKLIST)\n\t}\n\n\t/**\n\t * 優化器黑名單 會防止部分優化器去組合此字典內的詞\n\t * 例如 人名 自動組合之類\n\t */\n\tloadBlacklistOptimizerDict(name: string)\n\t{\n\t\treturn this._loadBlacklistDict(name, EnumDictDatabase.BLACKLIST_FOR_OPTIMIZER)\n\t}\n\n\t/**\n\t * 轉換黑名單 動態轉換字詞時會忽略此字典內的詞\n\t */\n\tloadBlacklistSynonymDict(name: string)\n\t{\n\t\treturn this._loadBlacklistDict(name, EnumDictDatabase.BLACKLIST_FOR_SYNONYM)\n\t}\n\n\t/**\n\t * 载入停止符词典\n\t *\n\t * @param {String} name 字典文件名\n\t */\n\tloadStopwordDict(name: string)\n\t{\n\t\tlet filename = this._resolveDictFilename(name, [\n\t\t\tpath.resolve(SegmentDict.DICT_ROOT, 'stopword'),\n\t\t]);\n\n\t\tif (Array.isArray(filename))\n\t\t{\n\t\t\tlet self = this;\n\n\t\t\tfilename.forEach(v => this.loadStopwordDict(v));\n\n\t\t\treturn this;\n\t\t}\n\n\t\tconst type = EnumDictDatabase.STOPWORD;\n\n\t\tconst db = this.getDictDatabase(type, true);\n\n\t\tconst TABLE = this.DICT[type] = db.TABLE;\n\n\t\tlet data = Loader.SegmentDict\n\t\t\t.requireLoaderModule('line')\n\t\t\t.loadSync(filename, {\n\t\t\t\tfilter(line: string)\n\t\t\t\t{\n\t\t\t\t\treturn line.trim();\n\t\t\t\t},\n\t\t\t})\n\t\t;\n\n\t\tdata.forEach(v => db.add(v));\n\n\t\tdata = undefined;\n\n\t\tthis.inited = true;\n\n\t\treturn this;\n\t}\n\n\t/**\n\t * 使用默认的识别模块和字典文件\n\t * 在使用預設值的情況下，不需要主動呼叫此函數\n\t *\n\t * @return {Segment}\n\t */\n\tuseDefault(options?: IUseDefaultOptions, ...argv)\n\tuseDefault(...argv)\n\t{\n\t\tuseDefault(this, ...argv);\n\n\t\tthis.inited = true;\n\n\t\treturn this;\n\t}\n\n\t/**\n\t * 此函數只需執行一次，並且一般狀況下不需要手動呼叫\n\t */\n\tautoInit(options?: IUseDefaultOptions)\n\t{\n\t\tif (!this.inited)\n\t\t{\n\t\t\tthis.inited = true;\n\n\t\t\tif (!this.modules.tokenizer.length)\n\t\t\t{\n\t\t\t\tthis.useDefault(options);\n\t\t\t}\n\t\t}\n\n\t\treturn this;\n\t}\n\n\taddBlacklist(word: string, remove?: boolean)\n\t{\n\t\tlet me = this;\n\n\t\tthis.autoInit(this.options);\n\n\t\tconst BLACKLIST = 
me.getDictDatabase(EnumDictDatabase.BLACKLIST);\n\t\tconst TABLE = me.getDictDatabase(EnumDictDatabase.TABLE);\n\n\t\tlet bool = !remove;\n\n\t\tif (bool)\n\t\t{\n\t\t\tBLACKLIST.add(word);\n\t\t\tTABLE.remove(word);\n\t\t}\n\t\telse\n\t\t{\n\t\t\tBLACKLIST.remove(word)\n\t\t}\n\n\t\treturn this\n\t}\n\n\t/**\n\t * remove key in TABLE by BLACKLIST\n\t */\n\tdoBlacklist()\n\t{\n\t\tlet me = this;\n\n\t\tthis.autoInit(this.options);\n\n\t\tconst BLACKLIST = me.getDict(EnumDictDatabase.BLACKLIST);\n\t\tconst TABLE = me.getDictDatabase(EnumDictDatabase.TABLE);\n\n\t\tObject.entries(BLACKLIST)\n\t\t\t.forEach(function ([key, bool])\n\t\t\t{\n\t\t\t\tbool && TABLE.remove(key)\n\t\t\t})\n\t\t;\n\n\t\treturn this\n\t}\n\n\t/**\n\t * 开始分词\n\t *\n\t * @param {String} text 文本\n\t * @param {Object} options 选项\n\t *   - {Boolean} simple 是否仅返回单词内容\n\t *   - {Boolean} stripPunctuation 去除标点符号\n\t *   - {Boolean} convertSynonym 转换同义词\n\t *   - {Boolean} stripStopword 去除停止符\n\t * @return {Array}\n\t */\n\tdoSegment(text: string | Buffer, options: ITSOverwrite<IOptionsDoSegment, {\n\t\tsimple: true,\n\t}>): string[]\n\tdoSegment(text: string | Buffer, options?: IOptionsDoSegment): IWord[]\n\tdoSegment(text, options: IOptionsDoSegment = {})\n\t{\n\t\tthis.autoInit(this.options);\n\n\t\treturn super.doSegment(text, options) as any\n\t}\n\n}\n\nexport declare namespace Segment\n{\n\texport {\n\t\t// @ts-ignore\n\t\tIDICT,\n\t\t// @ts-ignore\n\t\tIDICT2,\n\t\t// @ts-ignore\n\t\tIDICT_BLACKLIST,\n\t\t// @ts-ignore\n\t\tIDICT_STOPWORD,\n\t\t// @ts-ignore\n\t\tIDICT_SYNONYM,\n\t\t// @ts-ignore\n\t\tIOptionsDoSegment,\n\t\t// @ts-ignore\n\t\tIOptionsSegment,\n\t\t// @ts-ignore\n\t\tISPLIT,\n\t\t// @ts-ignore\n\t\tISPLIT_FILTER,\n\t\t// @ts-ignore\n\t\tIWord,\n\t}\n}\n\nexport {\n\tIDICT,\n\tIDICT2,\n\tIDICT_BLACKLIST,\n\tIDICT_STOPWORD,\n\tIDICT_SYNONYM,\n\tIOptionsDoSegment,\n\tIOptionsSegment,\n\tISPLIT,\n\tISPLIT_FILTER,\n\tIWord,\n}\n\nexport default Segment;\n"]}
\No newline at end of file