1 | /**
|
2 | * 分词器接口
|
3 | *
|
4 | * @author 老雷<leizongmin@gmail.com>
|
5 | */
|
6 | ;
|
7 | Object.defineProperty(exports, "__esModule", { value: true });
|
8 | const path = require("path");
|
9 | const get_1 = require("./fs/get");
|
10 | const blacklist_1 = require("./table/blacklist");
|
11 | const dict_1 = require("./table/dict");
|
12 | const loader_1 = require("./loader");
|
13 | const stopword_1 = require("./table/stopword");
|
14 | const synonym_1 = require("./table/synonym");
|
15 | const segment_dict_1 = require("segment-dict");
|
16 | const project_config_1 = require("../project.config");
|
17 | const core_1 = require("./segment/core");
|
18 | const defaults_1 = require("./segment/defaults");
|
19 | const index_1 = require("./defaults/index");
|
20 | const useModules2_1 = require("./segment/methods/useModules2");
|
/**
 * Segmenter front end.
 *
 * Builds on the core segmenter (`core_1.default`) and adds dictionary
 * database lookup, dictionary file loading, module registration and lazy
 * default initialisation.
 */
class Segment extends core_1.default {
    /**
     * Return the dictionary database stored under `type`, creating it when allowed.
     *
     * A database is created only when `this.db[type]` is empty and either
     * `autocreate` is truthy or `this.inited` is already set. A new database
     * is constructed with `this.DICT[type]` as its initial `TABLE`.
     *
     * @param {String} type dictionary type key
     * @param {Boolean} [autocreate] create the database even if the instance is not initialised yet
     * @param {Function} [libTableDict] constructor to use instead of the default one chosen for `type`
     * @return {Object|undefined} the database, or `undefined` when none exists and creation was not allowed
     */
    getDictDatabase(type, autocreate, libTableDict) {
        if ((autocreate || this.inited) && !this.db[type]) {
            let TableClass = libTableDict;
            if (!TableClass) {
                // `==` is kept from the original so dispatch cannot change for any caller-supplied `type`.
                if (type == synonym_1.default.type) {
                    TableClass = synonym_1.default;
                }
                else if (type == stopword_1.TableDictStopword.type) {
                    TableClass = stopword_1.TableDictStopword;
                }
                else if (type == blacklist_1.default.type || type == "BLACKLIST_FOR_OPTIMIZER" /* BLACKLIST_FOR_OPTIMIZER */ || type == "BLACKLIST_FOR_SYNONYM" /* BLACKLIST_FOR_SYNONYM */) {
                    TableClass = blacklist_1.default;
                }
                else {
                    TableClass = dict_1.TableDict;
                }
            }
            this.db[type] = new TableClass(type, this.options, {
                TABLE: this.DICT[type],
            });
        }
        return this.db[type];
    }
    /**
     * Load one or more segmentation modules and mark the instance as initialised.
     *
     * @param {String|Array|Object} mod module name(s) or module object(s), forwarded to `useModules`
     * @param {...*} argv extra arguments forwarded to `useModules`
     * @return {Segment} this instance, for chaining
     */
    use(mod, ...argv) {
        useModules2_1.useModules(this, mod, ...argv);
        this.inited = true;
        return this;
    }
    /**
     * Resolve a dictionary name to a file path, or to a list of paths for glob patterns.
     *
     * Search directories, in order: `''`, the project dictionary root, every
     * entry of `pathPlus`, then the `segment` folder under
     * `segment_dict_1.default.DICT_ROOT`.
     * Extensions tried, in order: `''`, every entry of `extPlus`, `.utf8`, `.txt`.
     *
     * @param {String} name dictionary file name; a name containing `*` is treated as a glob
     * @param {String[]} [pathPlus] additional directories to search
     * @param {String[]} [extPlus] additional file extensions to try
     * @return {String|String[]} a single file name, or the list of glob matches
     * @throws {Error} when nothing matches `name`
     */
    _resolveDictFilename(name, pathPlus = [], extPlus = []) {
        const options = {
            paths: [
                '',
                project_config_1.default.dict_root,
                ...pathPlus,
                path.resolve(segment_dict_1.default.DICT_ROOT, 'segment'),
            ],
            extensions: [
                '',
                ...extPlus,
                '.utf8',
                '.txt',
            ],
            onlyFile: true,
        };
        if (name.includes('*')) {
            const matches = get_1.searchGlobSync(name, options);
            if (!matches || !matches.length) {
                throw new Error(`Cannot find dict glob file "${name}".`);
            }
            return matches;
        }
        const filename = get_1.searchFirstSync(name, options);
        if (!filename) {
            throw new Error(`Cannot find dict file "${name}".`);
        }
        return filename;
    }
    /**
     * Load a dictionary file.
     *
     * When `name` resolves to several files (glob), each file is loaded in
     * turn with the same arguments.
     *
     * @param {String} name dictionary file name
     * @param {String} [type] dictionary type; defaults to `'TABLE'`
     * @param {Boolean} [convert_to_lower] lower-case the first field of every row before adding it
     * @param {Boolean} [skipExists] forwarded unchanged to `db.add` (presumably keeps existing entries - confirm against the table class)
     * @return {Segment} this instance, for chaining
     * @throws {Error} when the dictionary file cannot be found
     */
    loadDict(name, type, convert_to_lower, skipExists) {
        const filename = this._resolveDictFilename(name);
        if (Array.isArray(filename)) {
            filename.forEach(v => this.loadDict(v, type, convert_to_lower, skipExists));
            return this;
        }
        const dictType = type || 'TABLE';
        const db = this.getDictDatabase(dictType, true);
        // Expose the database's tables through this.DICT under `<type>` and `<type>2`.
        this.DICT[dictType] = db.TABLE;
        this.DICT[dictType + '2'] = db.TABLE2;
        const rows = loader_1.default.SegmentDictLoader.loadSync(filename);
        rows.forEach(function (row) {
            if (convert_to_lower) {
                row[0] = row[0].toLowerCase();
            }
            db.add(row, skipExists);
        });
        this.inited = true;
        return this;
    }
    /**
     * Load a synonym dictionary.
     *
     * The `synonym` folder under `segment_dict_1.default.DICT_ROOT` is added
     * to the search path. Glob names load every matching file.
     *
     * @param {String} name dictionary file name
     * @param {Boolean} [skipExists] forwarded unchanged to `db.add`
     * @return {Segment} this instance, for chaining
     * @throws {Error} when the dictionary file cannot be found
     */
    loadSynonymDict(name, skipExists) {
        const filename = this._resolveDictFilename(name, [
            path.resolve(segment_dict_1.default.DICT_ROOT, 'synonym'),
        ]);
        if (Array.isArray(filename)) {
            filename.forEach(v => this.loadSynonymDict(v, skipExists));
            return this;
        }
        const type = 'SYNONYM';
        const db = this.getDictDatabase(type, true);
        this.DICT[type] = db.TABLE;
        const rows = loader_1.default.SegmentSynonymLoader.loadSync(filename);
        rows.forEach(function (blocks) {
            db.add(blocks, skipExists);
        });
        this.inited = true;
        return this;
    }
    /**
     * Load a line-based blacklist file into the database identified by `type`.
     *
     * The `blacklist` folder under `segment_dict_1.default.DICT_ROOT` is added
     * to the search path. Every line is passed through `trim()` by the
     * loader's `filter` hook before being added.
     *
     * @param {String} name dictionary file name
     * @param {String} type blacklist database type
     * @return {Segment} this instance, for chaining
     * @throws {Error} when the dictionary file cannot be found
     */
    _loadBlacklistDict(name, type) {
        const filename = this._resolveDictFilename(name, [
            path.resolve(segment_dict_1.default.DICT_ROOT, 'blacklist'),
        ]);
        if (Array.isArray(filename)) {
            filename.forEach(v => this._loadBlacklistDict(v, type));
            return this;
        }
        const db = this.getDictDatabase(type, true);
        this.DICT[type] = db.TABLE;
        const lines = loader_1.default.SegmentDict
            .requireLoaderModule('line')
            .loadSync(filename, {
            filter(line) {
                return line.trim();
            },
        });
        lines.forEach(v => db.add(v));
        this.inited = true;
        return this;
    }
    /**
     * Dictionary blacklist: entries listed here are removed from the main dictionary.
     *
     * @param {String} name dictionary file name
     * @return {Segment} this instance, for chaining
     */
    loadBlacklistDict(name) {
        return this._loadBlacklistDict(name, "BLACKLIST" /* BLACKLIST */);
    }
    /**
     * Optimizer blacklist: keeps some optimizers from combining the words
     * listed here (for example automatic person-name combination).
     *
     * @param {String} name dictionary file name
     * @return {Segment} this instance, for chaining
     */
    loadBlacklistOptimizerDict(name) {
        return this._loadBlacklistDict(name, "BLACKLIST_FOR_OPTIMIZER" /* BLACKLIST_FOR_OPTIMIZER */);
    }
    /**
     * Conversion blacklist: words listed here are ignored when words are
     * converted dynamically.
     *
     * @param {String} name dictionary file name
     * @return {Segment} this instance, for chaining
     */
    loadBlacklistSynonymDict(name) {
        return this._loadBlacklistDict(name, "BLACKLIST_FOR_SYNONYM" /* BLACKLIST_FOR_SYNONYM */);
    }
    /**
     * Load a stopword dictionary.
     *
     * The `stopword` folder under `segment_dict_1.default.DICT_ROOT` is added
     * to the search path. Every line is passed through `trim()` by the
     * loader's `filter` hook before being added.
     *
     * @param {String} name dictionary file name
     * @return {Segment} this instance, for chaining
     * @throws {Error} when the dictionary file cannot be found
     */
    loadStopwordDict(name) {
        const filename = this._resolveDictFilename(name, [
            path.resolve(segment_dict_1.default.DICT_ROOT, 'stopword'),
        ]);
        if (Array.isArray(filename)) {
            filename.forEach(v => this.loadStopwordDict(v));
            return this;
        }
        const type = "STOPWORD" /* STOPWORD */;
        const db = this.getDictDatabase(type, true);
        this.DICT[type] = db.TABLE;
        const lines = loader_1.default.SegmentDict
            .requireLoaderModule('line')
            .loadSync(filename, {
            filter(line) {
                return line.trim();
            },
        });
        lines.forEach(v => db.add(v));
        this.inited = true;
        return this;
    }
    /**
     * Apply the default modules and dictionaries via `index_1.useDefault`
     * and mark the instance as initialised.
     *
     * @param {...*} argv arguments forwarded to `useDefault` (the first one is the options object)
     * @return {Segment} this instance, for chaining
     */
    useDefault(...argv) {
        index_1.useDefault(this, ...argv);
        this.inited = true;
        return this;
    }
    /**
     * One-time lazy initialisation; normally there is no need to call it by hand.
     *
     * Does nothing once `this.inited` is set. Otherwise it sets the flag and,
     * if no tokenizer module has been registered, applies the defaults.
     *
     * @param {Object} [options] options forwarded to `useDefault`
     * @return {Segment} this instance, for chaining
     */
    autoInit(options) {
        if (!this.inited) {
            // Set the flag first so the guard above is already closed while useDefault runs.
            this.inited = true;
            if (!this.modules.tokenizer.length) {
                this.useDefault(options);
            }
        }
        return this;
    }
    /**
     * Add a word to the blacklist, or take it off again.
     *
     * Adding a word also removes it from the main `TABLE` dictionary; taking
     * it off the blacklist does not put it back into `TABLE`.
     *
     * @param {String} word the word to blacklist
     * @param {Boolean} [remove] when truthy, remove `word` from the blacklist instead of adding it
     * @return {Segment} this instance, for chaining
     */
    addBlacklist(word, remove) {
        this.autoInit(this.options);
        const blacklistDb = this.getDictDatabase("BLACKLIST" /* BLACKLIST */);
        const tableDb = this.getDictDatabase("TABLE" /* TABLE */);
        if (remove) {
            blacklistDb.remove(word);
        }
        else {
            blacklistDb.add(word);
            tableDb.remove(word);
        }
        return this;
    }
    /**
     * Remove from `TABLE` every key whose blacklist entry is truthy.
     *
     * @return {Segment} this instance, for chaining
     */
    doBlacklist() {
        this.autoInit(this.options);
        const blacklist = this.getDict("BLACKLIST" /* BLACKLIST */);
        const tableDb = this.getDictDatabase("TABLE" /* TABLE */);
        // getDict is inherited and not visible here; fall back to an empty object so a
        // missing blacklist is a no-op instead of a TypeError from Object.entries.
        Object.entries(blacklist || {})
            .forEach(function ([key, bool]) {
            bool && tableDb.remove(key);
        });
        return this;
    }
    /**
     * Segment `text`, initialising the instance with defaults first if needed.
     *
     * @param {String|Buffer} text text to segment
     * @param {Object} [options] options passed through to the core `doSegment`
     * @return {Array} whatever the core `doSegment` returns
     */
    doSegment(text, options = {}) {
        this.autoInit(this.options);
        return super.doSegment(text, options);
    }
}
|
// Attach the default doSegment options to the class as a static member.
Segment.defaultOptionsDoSegment = defaults_1.defaultOptionsDoSegment;
// Publish the class as both the named export and the default export
// (`exports.Segment` is assigned first, then `exports.default`).
exports.default = exports.Segment = Segment;
|
300 | //# sourceMappingURL=data:application/json;base64,{"version":3,"file":"Segment.js","sourceRoot":"","sources":["Segment.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,YAAY,CAAC;;AAEb,6BAA6B;AAC7B,kCAA2D;AAE3D,iDAAmD;AAEnD,uCAAwE;AAExE,qCAA8B;AAE9B,+CAAqD;AACrD,6CAA+C;AAC/C,+CAAuC;AAKvC,sDAA8C;AAkB9C,yCAAyC;AAGzC,iDAA6D;AAC7D,4CAAkE;AAClE,+DAA2D;AAE3D;;GAEG;AACH,MAAa,OAAQ,SAAQ,cAAW;IAiCvC,eAAe,CAAC,IAAY,EAAE,UAAoB,EAAE,YAAa;QAEhE,IAAI,CAAC,UAAU,IAAI,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,EACjD;YACC,IAAI,IAAI,IAAI,iBAAgB,CAAC,IAAI,EACjC;gBACC,YAAY,GAAG,YAAY,IAAI,iBAAgB,CAAC;aAChD;iBACI,IAAI,IAAI,IAAI,4BAAiB,CAAC,IAAI,EACvC;gBACC,YAAY,GAAG,YAAY,IAAI,4BAAiB,CAAC;aACjD;iBACI,IAAI,IAAI,IAAI,mBAAkB,CAAC,IAAI,IAAI,IAAI,2DAA4C,IAAI,IAAI,uDAA0C,EAC9I;gBACC,YAAY,GAAG,YAAY,IAAI,mBAAkB,CAAC;aAClD;iBAED;gBACC,YAAY,GAAG,YAAY,IAAI,gBAAS,CAAC;aACzC;YAED,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,YAAY,CAAC,IAAI,EAAE,IAAI,CAAC,OAAO,EAAE;gBACpD,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC;aACtB,CAAC,CAAC;SACH;QAED,OAAO,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC;IACtB,CAAC;IAaD,GAAG,CAAC,GAAG,EAAE,GAAG,IAAI;QAEf,wBAAU,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC;QAE/B,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;QAEnB,OAAO,IAAI,CAAC;IACb,CAAC;IAED,oBAAoB,CAAC,IAAY,EAAE,WAAqB,EAAE,EAAE,UAAoB,EAAE;QAEjF,IAAI,OAAO,GAAG;YACb,KAAK,EAAE;gBACN,EAAE;gBACF,wBAAa,CAAC,SAAS;gBAEvB,GAAG,QAAQ;gBACX,IAAI,CAAC,OAAO,CAAC,sBAAW,CAAC,SAAS,EAAE,SAAS,CAAC;aAC9C;YACD,UAAU,EAAE;gBACX,EAAE;gBACF,GAAG,OAAO;gBACV,OAAO;gBACP,MAAM;aACN;YAED,QAAQ,EAAE,IAAI;SACd,CAAC;QAEF,IAAI,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,EAC3B;YACC,IAAI,EAAE,GAAG,oBAAc,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;YAEvC,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,CAAC,MAAM,EACrB;gBACC,MAAM,KAAK,CAAC,+BAA+B,IAAI,IAAI,CAAC,CAAC;aACrD;YAED,OAAO,EAAE,CAAC;SACV;QAED,IAAI,QAAQ,GAAG,qBAAe,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QAE9C,IAAI,CAAC,QAAQ,EACb;YACC,uCAAuC;YAEvC,MAAM,KAAK,CAAC,0BAA0B,IAAI,IAAI,CAAC,CAAC;SAChD;QAED,OAAO,QAAQ,CAAC;IACjB,CAAC;IAED;;;;;;;OAOG;IACH,QAAQ,CAAC,IAAY
,EAAE,IAAa,EAAE,gBAA0B,EAAE,UAAoB;QAErF,IAAI,QAAQ,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC,CAAC;QAE/C,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,EAC3B;YACC,IAAI,IAAI,GAAG,IAAI,CAAC;YAEhB,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,EAAE,IAAI,EAAE,gBAAgB,EAAE,UAAU,CAAC,CAAC,CAAC;YAE5E,wBAAwB;YAExB,OAAO,IAAI,CAAC;SACZ;QAED,IAAI,CAAC,IAAI;YAAE,IAAI,GAAG,OAAO,CAAC,CAAK,WAAW;QAE1C,MAAM,EAAE,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAE5C,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC;QACzC,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC;QAEjD;;;;;;UAME;QACF,OAAO;QACP,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;QAE3B,IAAI,IAAI,GAAG,gBAAM,CAAC,iBAAiB,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;QAEvD,IAAI,CAAC,OAAO,CAAC,UAAU,IAAI;YAE1B,IAAI,gBAAgB,EACpB;gBACC,IAAI,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;aAChC;YAED,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;YAEzB;;;;;;;;;;;cAWE;QACH,CAAC,CAAC,CAAC;QAEH,IAAI,GAAG,SAAS,CAAC;QAEjB,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;QAEnB,OAAO,IAAI,CAAC;IACb,CAAC;IAED;;;;OAIG;IACH,eAAe,CAAC,IAAY,EAAE,UAAoB;QAEjD,IAAI,QAAQ,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE;YAC9C,IAAI,CAAC,OAAO,CAAC,sBAAW,CAAC,SAAS,EAAE,SAAS,CAAC;SAC9C,CAAC,CAAC;QAEH,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,EAC3B;YACC,IAAI,IAAI,GAAG,IAAI,CAAC;YAEhB,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC,CAAC;YAE3D,OAAO,IAAI,CAAC;SACZ;QAED,IAAI,IAAI,GAAG,SAAS,CAAC;QAErB,MAAM,EAAE,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAE5C,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC;QAEzC;;;;;;UAME;QAEF,IAAI,IAAI,GAAG,gBAAM,CAAC,oBAAoB,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;QAE1D,IAAI,CAAC,OAAO,CAAC,UAAU,MAAgB;YAEtC,EAAE,CAAC,GAAG,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;YAE3B;;;;;;;;cAQE;QACH,CAAC,CAAC,CAAC;QAEH,qBAAqB;QAErB,IAAI,GAAG,SAAS,CAAC;QAEjB,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;QAEnB,OAAO,IAAI,CAAC;IACb,CAAC;IAES,kBAAkB,CAAC,IAAY,EAAE,IAAsB;QAEhE,IAAI,QAAQ,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,
EAAE;YAC9C,IAAI,CAAC,OAAO,CAAC,sBAAW,CAAC,SAAS,EAAE,WAAW,CAAC;SAChD,CAAC,CAAC;QAEH,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,EAC3B;YACC,IAAI,IAAI,GAAG,IAAI,CAAC;YAEhB,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;YAExD,OAAO,IAAI,CAAC;SACZ;QAED,MAAM,EAAE,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAE5C,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC;QAEzC,IAAI,IAAI,GAAG,gBAAM,CAAC,WAAW;aAC3B,mBAAmB,CAAC,MAAM,CAAC;aAC3B,QAAQ,CAAC,QAAQ,EAAE;YACnB,MAAM,CAAC,IAAY;gBAElB,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC;YACpB,CAAC;SACD,CAAC,CACF;QAED,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAE7B,IAAI,GAAG,SAAS,CAAC;QAEjB,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;QAEnB,OAAO,IAAI,CAAC;IACb,CAAC;IAED;;OAEG;IACH,iBAAiB,CAAC,IAAY;QAE7B,OAAO,IAAI,CAAC,kBAAkB,CAAC,IAAI,8BAA6B,CAAA;IACjE,CAAC;IAED;;;OAGG;IACH,0BAA0B,CAAC,IAAY;QAEtC,OAAO,IAAI,CAAC,kBAAkB,CAAC,IAAI,0DAA2C,CAAA;IAC/E,CAAC;IAED;;OAEG;IACH,wBAAwB,CAAC,IAAY;QAEpC,OAAO,IAAI,CAAC,kBAAkB,CAAC,IAAI,sDAAyC,CAAA;IAC7E,CAAC;IAED;;;;OAIG;IACH,gBAAgB,CAAC,IAAY;QAE5B,IAAI,QAAQ,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE;YAC9C,IAAI,CAAC,OAAO,CAAC,sBAAW,CAAC,SAAS,EAAE,UAAU,CAAC;SAC/C,CAAC,CAAC;QAEH,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,EAC3B;YACC,IAAI,IAAI,GAAG,IAAI,CAAC;YAEhB,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC,CAAC,CAAC,CAAC;YAEhD,OAAO,IAAI,CAAC;SACZ;QAED,MAAM,IAAI,4BAA4B,CAAC;QAEvC,MAAM,EAAE,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAE5C,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC;QAEzC,IAAI,IAAI,GAAG,gBAAM,CAAC,WAAW;aAC3B,mBAAmB,CAAC,MAAM,CAAC;aAC3B,QAAQ,CAAC,QAAQ,EAAE;YACnB,MAAM,CAAC,IAAY;gBAElB,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC;YACpB,CAAC;SACD,CAAC,CACF;QAED,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAE7B,IAAI,GAAG,SAAS,CAAC;QAEjB,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;QAEnB,OAAO,IAAI,CAAC;IACb,CAAC;IASD,UAAU,CAAC,GAAG,IAAI;QAEjB,kBAAU,CAAC,IAAI,EAAE,GAAG,IAAI,CAAC,CAAC;
QAE1B,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;QAEnB,OAAO,IAAI,CAAC;IACb,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,OAA4B;QAEpC,IAAI,CAAC,IAAI,CAAC,MAAM,EAChB;YACC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;YAEnB,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,EAClC;gBACC,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC;aACzB;SACD;QAED,OAAO,IAAI,CAAC;IACb,CAAC;IAED,YAAY,CAAC,IAAY,EAAE,MAAgB;QAE1C,IAAI,EAAE,GAAG,IAAI,CAAC;QAEd,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAE5B,MAAM,SAAS,GAAG,EAAE,CAAC,eAAe,6BAA4B,CAAC;QACjE,MAAM,KAAK,GAAG,EAAE,CAAC,eAAe,qBAAwB,CAAC;QAEzD,IAAI,IAAI,GAAG,CAAC,MAAM,CAAC;QAEnB,IAAI,IAAI,EACR;YACC,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACpB,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;SACnB;aAED;YACC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;SACtB;QAED,OAAO,IAAI,CAAA;IACZ,CAAC;IAED;;OAEG;IACH,WAAW;QAEV,IAAI,EAAE,GAAG,IAAI,CAAC;QAEd,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAE5B,MAAM,SAAS,GAAG,EAAE,CAAC,OAAO,6BAA4B,CAAC;QACzD,MAAM,KAAK,GAAG,EAAE,CAAC,eAAe,qBAAwB,CAAC;QAEzD,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC;aACvB,OAAO,CAAC,UAAU,CAAC,GAAG,EAAE,IAAI,CAAC;YAE7B,IAAI,IAAI,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;QAC1B,CAAC,CAAC,CACF;QAED,OAAO,IAAI,CAAA;IACZ,CAAC;IAiBD,SAAS,CAAC,IAAI,EAAE,UAA6B,EAAE;QAE9C,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAE5B,OAAO,KAAK,CAAC,SAAS,CAAC,IAAI,EAAE,OAAO,CAAQ,CAAA;IAC7C,CAAC;;AAldF,0BAodC;AAjdO,+BAAuB,GAAsB,kCAAuB,CAAC;AA0f7E,kBAAe,OAAO,CAAC","sourcesContent":["/**\n * 分词器接口\n *\n * @author 老雷<leizongmin@gmail.com>\n */\n\n'use strict';\n\nimport * as path from 'path';\nimport { searchFirstSync, searchGlobSync } from './fs/get';\nimport POSTAG from './POSTAG';\nimport TableDictBlacklist from './table/blacklist';\nimport AbstractTableDictCore from './table/core';\nimport { IOptions as IOptionsTableDict, TableDict } from './table/dict';\n\nimport Loader from './loader';\nimport { crlf } from 'crlf-normalize';\nimport { TableDictStopword } from './table/stopword';\nimport TableDictSynonym from './table/synonym';\nimport SegmentDict from 'segment-dict';\nimport { ISubOptimizer, 
ISubTokenizer, Optimizer, Tokenizer } from './mod';\nimport { debugToken } from './util/debug';\nimport { IWordDebug } from './util/index';\n\nimport ProjectConfig from '../project.config';\n\nimport deepmerge from 'deepmerge-plus/core';\nimport { EnumDictDatabase } from './const';\nimport { ENUM_SUBMODS, ENUM_SUBMODS_NAME, ENUM_SUBMODS_OTHER } from './mod/index';\n\nimport {\n\tIDICT,\n\tIDICT2,\n\tIDICT_BLACKLIST,\n\tIDICT_STOPWORD,\n\tIDICT_SYNONYM,\n\tIOptionsDoSegment,\n\tIOptionsSegment,\n\tISPLIT,\n\tISPLIT_FILTER,\n\tIWord,\n} from './segment/types';\nimport SegmentCore from './segment/core';\nimport { _isIgnoreModules } from './segment/methods/useModules';\nimport { ITSOverwrite } from 'ts-type';\nimport { defaultOptionsDoSegment } from './segment/defaults';\nimport { IUseDefaultOptions, useDefault } from './defaults/index';\nimport { useModules } from './segment/methods/useModules2';\n\n/**\n * 创建分词器接口\n */\nexport class Segment extends SegmentCore\n{\n\n\tstatic defaultOptionsDoSegment: IOptionsDoSegment = defaultOptionsDoSegment;\n\n\tgetDictDatabase<R extends TableDictSynonym>(type: EnumDictDatabase.SYNONYM,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDict>(type: EnumDictDatabase.TABLE,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDictStopword>(type: EnumDictDatabase.STOPWORD,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDictBlacklist>(type: EnumDictDatabase.BLACKLIST,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDictBlacklist>(type: EnumDictDatabase.BLACKLIST_FOR_OPTIMIZER,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDictBlacklist>(type: EnumDictDatabase.BLACKLIST_FOR_SYNONYM,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { 
new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends AbstractTableDictCore<any>>(type: string | EnumDictDatabase,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase(type: string, autocreate?: boolean, libTableDict?)\n\t{\n\t\tif ((autocreate || this.inited) && !this.db[type])\n\t\t{\n\t\t\tif (type == TableDictSynonym.type)\n\t\t\t{\n\t\t\t\tlibTableDict = libTableDict || TableDictSynonym;\n\t\t\t}\n\t\t\telse if (type == TableDictStopword.type)\n\t\t\t{\n\t\t\t\tlibTableDict = libTableDict || TableDictStopword;\n\t\t\t}\n\t\t\telse if (type == TableDictBlacklist.type || type == EnumDictDatabase.BLACKLIST_FOR_OPTIMIZER || type == EnumDictDatabase.BLACKLIST_FOR_SYNONYM)\n\t\t\t{\n\t\t\t\tlibTableDict = libTableDict || TableDictBlacklist;\n\t\t\t}\n\t\t\telse\n\t\t\t{\n\t\t\t\tlibTableDict = libTableDict || TableDict;\n\t\t\t}\n\n\t\t\tthis.db[type] = new libTableDict(type, this.options, {\n\t\t\t\tTABLE: this.DICT[type],\n\t\t\t});\n\t\t}\n\n\t\treturn this.db[type];\n\t}\n\n\t/**\n\t * 载入分词模块\n\t *\n\t * @param {String|Array|Object} module 模块名称(数组)或模块对象\n\t * @return {Segment}\n\t */\n\tuse(mod: ISubOptimizer, ...argv)\n\tuse(mod: ISubTokenizer, ...argv)\n\tuse(mod: Array<ISubTokenizer | ISubOptimizer | string>, ...argv)\n\tuse(mod: string, ...argv)\n\tuse(mod, ...argv)\n\tuse(mod, ...argv)\n\t{\n\t\tuseModules(this, mod, ...argv);\n\n\t\tthis.inited = true;\n\n\t\treturn this;\n\t}\n\n\t_resolveDictFilename(name: string, pathPlus: string[] = [], extPlus: string[] = []): string | string[]\n\t{\n\t\tlet options = {\n\t\t\tpaths: [\n\t\t\t\t'',\n\t\t\t\tProjectConfig.dict_root,\n\n\t\t\t\t...pathPlus,\n\t\t\t\tpath.resolve(SegmentDict.DICT_ROOT, 'segment'),\n\t\t\t],\n\t\t\textensions: [\n\t\t\t\t'',\n\t\t\t\t...extPlus,\n\t\t\t\t'.utf8',\n\t\t\t\t'.txt',\n\t\t\t],\n\n\t\t\tonlyFile: true,\n\t\t};\n\n\t\tif (name.indexOf('*') != -1)\n\t\t{\n\t\t\tlet ls = searchGlobSync(name, options);\n\n\t\t\tif (!ls || 
!ls.length)\n\t\t\t{\n\t\t\t\tthrow Error(`Cannot find dict glob file \"${name}\".`);\n\t\t\t}\n\n\t\t\treturn ls;\n\t\t}\n\n\t\tlet filename = searchFirstSync(name, options);\n\n\t\tif (!filename)\n\t\t{\n\t\t\t//console.log(name, pathPlus, extPlus);\n\n\t\t\tthrow Error(`Cannot find dict file \"${name}\".`);\n\t\t}\n\n\t\treturn filename;\n\t}\n\n\t/**\n\t * 载入字典文件\n\t *\n\t * @param {String} name 字典文件名\n\t * @param {String} type 类型\n\t * @param {Boolean} convert_to_lower 是否全部转换为小写\n\t * @return {Segment}\n\t */\n\tloadDict(name: string, type?: string, convert_to_lower?: boolean, skipExists?: boolean)\n\t{\n\t\tlet filename = this._resolveDictFilename(name);\n\n\t\tif (Array.isArray(filename))\n\t\t{\n\t\t\tlet self = this;\n\n\t\t\tfilename.forEach(v => this.loadDict(v, type, convert_to_lower, skipExists));\n\n\t\t\t//console.log(filename);\n\n\t\t\treturn this;\n\t\t}\n\n\t\tif (!type) type = 'TABLE';     // 默认为TABLE\n\n\t\tconst db = this.getDictDatabase(type, true);\n\n\t\tconst TABLE = this.DICT[type] = db.TABLE;\n\t\tconst TABLE2 = this.DICT[type + '2'] = db.TABLE2;\n\n\t\t/*\n\t\t// 初始化词典\n\t\tif (!this.DICT[type]) this.DICT[type] = {};\n\t\tif (!this.DICT[type + '2']) this.DICT[type + '2'] = {};\n\t\tlet TABLE = this.DICT[type];        // 词典表  '词' => {属性}\n\t\tlet TABLE2 = this.DICT[type + '2']; // 词典表  '长度' => '词' => 属性\n\t\t*/\n\t\t// 导入数据\n\t\tconst POSTAG = this.POSTAG;\n\n\t\tlet data = Loader.SegmentDictLoader.loadSync(filename);\n\n\t\tdata.forEach(function (data)\n\t\t{\n\t\t\tif (convert_to_lower)\n\t\t\t{\n\t\t\t\tdata[0] = data[0].toLowerCase();\n\t\t\t}\n\n\t\t\tdb.add(data, skipExists);\n\n\t\t\t/*\n\t\t\tlet [w, p, f] = data;\n\n\t\t\tif (w.length == 0)\n\t\t\t{\n\t\t\t\tthrow new Error()\n\t\t\t}\n\n\t\t\tTABLE[w] = { p, f, };\n\t\t\tif (!TABLE2[w.length]) TABLE2[w.length] = {};\n\t\t\tTABLE2[w.length][w] = TABLE[w];\n\t\t\t*/\n\t\t});\n\n\t\tdata = undefined;\n\n\t\tthis.inited = true;\n\n\t\treturn this;\n\t}\n\n\t/**\n\t * 载入同义词词典\n\t 
*\n\t * @param {String} name 字典文件名\n\t */\n\tloadSynonymDict(name: string, skipExists?: boolean)\n\t{\n\t\tlet filename = this._resolveDictFilename(name, [\n\t\t\tpath.resolve(SegmentDict.DICT_ROOT, 'synonym'),\n\t\t]);\n\n\t\tif (Array.isArray(filename))\n\t\t{\n\t\t\tlet self = this;\n\n\t\t\tfilename.forEach(v => this.loadSynonymDict(v, skipExists));\n\n\t\t\treturn this;\n\t\t}\n\n\t\tlet type = 'SYNONYM';\n\n\t\tconst db = this.getDictDatabase(type, true);\n\n\t\tconst TABLE = this.DICT[type] = db.TABLE;\n\n\t\t/*\n\t\t// 初始化词典\n\t\tif (!this.DICT[type]) this.DICT[type] = {};\n\t\t// 词典表  '同义词' => '标准词'\n\t\tlet TABLE = this.DICT[type] as IDICT_SYNONYM;\n\t\t// 导入数据\n\t\t*/\n\n\t\tlet data = Loader.SegmentSynonymLoader.loadSync(filename);\n\n\t\tdata.forEach(function (blocks: string[])\n\t\t{\n\t\t\tdb.add(blocks, skipExists);\n\n\t\t\t/*\n\t\t\tlet [n1, n2] = blocks;\n\n\t\t\tTABLE[n1] = n2;\n\t\t\tif (TABLE[n2] === n1)\n\t\t\t{\n\t\t\t\tdelete TABLE[n2];\n\t\t\t}\n\t\t\t*/\n\t\t});\n\n\t\t//console.log(TABLE);\n\n\t\tdata = undefined;\n\n\t\tthis.inited = true;\n\n\t\treturn this;\n\t}\n\n\tprotected _loadBlacklistDict(name: string, type: EnumDictDatabase)\n\t{\n\t\tlet filename = this._resolveDictFilename(name, [\n\t\t\tpath.resolve(SegmentDict.DICT_ROOT, 'blacklist'),\n\t\t]);\n\n\t\tif (Array.isArray(filename))\n\t\t{\n\t\t\tlet self = this;\n\n\t\t\tfilename.forEach(v => this._loadBlacklistDict(v, type));\n\n\t\t\treturn this;\n\t\t}\n\n\t\tconst db = this.getDictDatabase(type, true);\n\n\t\tconst TABLE = this.DICT[type] = db.TABLE;\n\n\t\tlet data = Loader.SegmentDict\n\t\t\t.requireLoaderModule('line')\n\t\t\t.loadSync(filename, {\n\t\t\t\tfilter(line: string)\n\t\t\t\t{\n\t\t\t\t\treturn line.trim();\n\t\t\t\t},\n\t\t\t})\n\t\t;\n\n\t\tdata.forEach(v => db.add(v));\n\n\t\tdata = undefined;\n\n\t\tthis.inited = true;\n\n\t\treturn this;\n\t}\n\n\t/**\n\t * 字典黑名單 在主字典內刪除此字典內有的條目\n\t */\n\tloadBlacklistDict(name: string)\n\t{\n\t\treturn 
this._loadBlacklistDict(name, EnumDictDatabase.BLACKLIST)\n\t}\n\n\t/**\n\t * 優化器黑名單 會防止部分優化器去組合此字典內的詞\n\t * 例如 人名 自動組合之類\n\t */\n\tloadBlacklistOptimizerDict(name: string)\n\t{\n\t\treturn this._loadBlacklistDict(name, EnumDictDatabase.BLACKLIST_FOR_OPTIMIZER)\n\t}\n\n\t/**\n\t * 轉換黑名單 動態轉換字詞時會忽略此字典內的詞\n\t */\n\tloadBlacklistSynonymDict(name: string)\n\t{\n\t\treturn this._loadBlacklistDict(name, EnumDictDatabase.BLACKLIST_FOR_SYNONYM)\n\t}\n\n\t/**\n\t * 载入停止符词典\n\t *\n\t * @param {String} name 字典文件名\n\t */\n\tloadStopwordDict(name: string)\n\t{\n\t\tlet filename = this._resolveDictFilename(name, [\n\t\t\tpath.resolve(SegmentDict.DICT_ROOT, 'stopword'),\n\t\t]);\n\n\t\tif (Array.isArray(filename))\n\t\t{\n\t\t\tlet self = this;\n\n\t\t\tfilename.forEach(v => this.loadStopwordDict(v));\n\n\t\t\treturn this;\n\t\t}\n\n\t\tconst type = EnumDictDatabase.STOPWORD;\n\n\t\tconst db = this.getDictDatabase(type, true);\n\n\t\tconst TABLE = this.DICT[type] = db.TABLE;\n\n\t\tlet data = Loader.SegmentDict\n\t\t\t.requireLoaderModule('line')\n\t\t\t.loadSync(filename, {\n\t\t\t\tfilter(line: string)\n\t\t\t\t{\n\t\t\t\t\treturn line.trim();\n\t\t\t\t},\n\t\t\t})\n\t\t;\n\n\t\tdata.forEach(v => db.add(v));\n\n\t\tdata = undefined;\n\n\t\tthis.inited = true;\n\n\t\treturn this;\n\t}\n\n\t/**\n\t * 使用默认的识别模块和字典文件\n\t * 在使用預設值的情況下，不需要主動呼叫此函數\n\t *\n\t * @return {Segment}\n\t */\n\tuseDefault(options?: IUseDefaultOptions, ...argv)\n\tuseDefault(...argv)\n\t{\n\t\tuseDefault(this, ...argv);\n\n\t\tthis.inited = true;\n\n\t\treturn this;\n\t}\n\n\t/**\n\t * 此函數只需執行一次，並且一般狀況下不需要手動呼叫\n\t */\n\tautoInit(options?: IUseDefaultOptions)\n\t{\n\t\tif (!this.inited)\n\t\t{\n\t\t\tthis.inited = true;\n\n\t\t\tif (!this.modules.tokenizer.length)\n\t\t\t{\n\t\t\t\tthis.useDefault(options);\n\t\t\t}\n\t\t}\n\n\t\treturn this;\n\t}\n\n\taddBlacklist(word: string, remove?: boolean)\n\t{\n\t\tlet me = this;\n\n\t\tthis.autoInit(this.options);\n\n\t\tconst BLACKLIST = 
me.getDictDatabase(EnumDictDatabase.BLACKLIST);\n\t\tconst TABLE = me.getDictDatabase(EnumDictDatabase.TABLE);\n\n\t\tlet bool = !remove;\n\n\t\tif (bool)\n\t\t{\n\t\t\tBLACKLIST.add(word);\n\t\t\tTABLE.remove(word);\n\t\t}\n\t\telse\n\t\t{\n\t\t\tBLACKLIST.remove(word)\n\t\t}\n\n\t\treturn this\n\t}\n\n\t/**\n\t * remove key in TABLE by BLACKLIST\n\t */\n\tdoBlacklist()\n\t{\n\t\tlet me = this;\n\n\t\tthis.autoInit(this.options);\n\n\t\tconst BLACKLIST = me.getDict(EnumDictDatabase.BLACKLIST);\n\t\tconst TABLE = me.getDictDatabase(EnumDictDatabase.TABLE);\n\n\t\tObject.entries(BLACKLIST)\n\t\t\t.forEach(function ([key, bool])\n\t\t\t{\n\t\t\t\tbool && TABLE.remove(key)\n\t\t\t})\n\t\t;\n\n\t\treturn this\n\t}\n\n\t/**\n\t * 开始分词\n\t *\n\t * @param {String} text 文本\n\t * @param {Object} options 选项\n\t *   - {Boolean} simple 是否仅返回单词内容\n\t *   - {Boolean} stripPunctuation 去除标点符号\n\t *   - {Boolean} convertSynonym 转换同义词\n\t *   - {Boolean} stripStopword 去除停止符\n\t * @return {Array}\n\t */\n\tdoSegment(text: string | Buffer, options: ITSOverwrite<IOptionsDoSegment, {\n\t\tsimple: true,\n\t}>): string[]\n\tdoSegment(text: string | Buffer, options?: IOptionsDoSegment): IWord[]\n\tdoSegment(text, options: IOptionsDoSegment = {})\n\t{\n\t\tthis.autoInit(this.options);\n\n\t\treturn super.doSegment(text, options) as any\n\t}\n\n}\n\nexport declare namespace Segment\n{\n\texport {\n\t\t// @ts-ignore\n\t\tIDICT,\n\t\t// @ts-ignore\n\t\tIDICT2,\n\t\t// @ts-ignore\n\t\tIDICT_BLACKLIST,\n\t\t// @ts-ignore\n\t\tIDICT_STOPWORD,\n\t\t// @ts-ignore\n\t\tIDICT_SYNONYM,\n\t\t// @ts-ignore\n\t\tIOptionsDoSegment,\n\t\t// @ts-ignore\n\t\tIOptionsSegment,\n\t\t// @ts-ignore\n\t\tISPLIT,\n\t\t// @ts-ignore\n\t\tISPLIT_FILTER,\n\t\t// @ts-ignore\n\t\tIWord,\n\t}\n}\n\nexport {\n\tIDICT,\n\tIDICT2,\n\tIDICT_BLACKLIST,\n\tIDICT_STOPWORD,\n\tIDICT_SYNONYM,\n\tIOptionsDoSegment,\n\tIOptionsSegment,\n\tISPLIT,\n\tISPLIT_FILTER,\n\tIWord,\n}\n\nexport default Segment;\n"]} |
\ | No newline at end of file |