UNPKG

27.8 kBJavaScriptView Raw
/**
 * Word segmenter (tokenizer) interface
 *
 * @author 老雷<leizongmin@gmail.com>
 */
'use strict';
// Flag the CommonJS export object with `__esModule`.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the named export; the real value is assigned after the class definition.
exports.SegmentCore = void 0;
// Part-of-speech tag table (exposed on instances as `POSTAG`).
const POSTAG_1 = require("../POSTAG");
// Provides the `Tokenizer` and `Optimizer` drivers instantiated by the constructor.
const index_1 = require("../mod/index");
// Helper implementations that the class methods delegate to.
const stringify_1 = require("./methods/stringify");
const split_1 = require("./methods/split");
const indexOf_1 = require("./methods/indexOf");
const convertSynonym_1 = require("./methods/convertSynonym");
const listModules_1 = require("./methods/listModules");
const _get_text_1 = require("./methods/_get_text");
const getOptionsDoSegment_1 = require("./methods/getOptionsDoSegment");
const useModules_1 = require("./methods/useModules");
const doSegment_1 = require("./methods/doSegment");
/**
 * Core word-segmenter interface.
 *
 * Keeps the dictionary tables (`DICT`), the dictionary databases (`db`) and the
 * registered tokenizer / optimizer sub-modules (`modules`), and exposes
 * `doSegment()` together with a few helpers that operate on word arrays.
 */
class SegmentCore {
    /**
     * @param {Object} [options] segmenter options; shallow-copied into `this.options`.
     *   When `options.db` is present, each entry is registered in `this.db` under
     *   its `type`, and `db` is then removed from the stored options.
     */
    constructor(options = {}) {
        /**
         * Section splitter.
         *
         * Segmentation analyses each piece of text using its surrounding context,
         * so the way the input is cut into sections influences the result.
         *
         * A `RegExp`, or any object that implements
         * `[Symbol.split](input: string, limit?: number) => string[]`.
         *
         * @type {Segment.ISPLIT}
         */
        this.SPLIT = /([\r\n]+|^[ \s]+|[ \s]+$|[ \s]{2,})/gm;
        /**
         * After splitting, a section matching this filter is not analysed;
         * `doSegment()` emits it unchanged as `{ w: section }`.
         *
         * A `RegExp`, or any object that implements `test(input: string) => boolean`.
         *
         * @type {Segment.ISPLIT_FILTER}
         */
        this.SPLIT_FILTER = /^([\r\n]+)$/g;
        /**
         * Part-of-speech tags.
         * @type {POSTAG}
         */
        this.POSTAG = POSTAG_1.default;
        /**
         * Dictionary tables, keyed by dictionary type.
         * @type {{}}
         */
        this.DICT = {};
        this.modules = {
            /**
             * Tokenizer sub-modules.
             */
            tokenizer: [],
            /**
             * Optimizer sub-modules.
             */
            optimizer: [],
        };
        // Dictionary databases, keyed by their `type`.
        this.db = {};
        // Work on a shallow copy so the caller's options object is never mutated.
        this.options = Object.assign({}, options);
        this.tokenizer = new index_1.Tokenizer(this);
        this.optimizer = new index_1.Optimizer(this);
        if (this.options.db) {
            this.options.db.forEach((data) => {
                this.db[data.type] = data;
            });
        }
        // The databases now live in `this.db`; do not keep them in the options as well.
        delete this.options.db;
    }
    /**
     * Look up a registered dictionary database.
     *
     * @param {String} type key of the database in `this.db`
     * @param {Boolean} [autocreate] not used by this implementation
     * @param {Function} [libTableDict] not used by this implementation
     * @return {Object|undefined} the database, or `undefined` when nothing is registered under `type`
     */
    getDictDatabase(type, autocreate, libTableDict) {
        return this.db[type];
    }
    /**
     * Load segmentation module(s); the work is delegated to the `useModules` helper.
     *
     * @param {String|Array|Object} mod module name(s) or module object
     * @param {...*} argv extra arguments forwarded to `useModules`
     * @return {SegmentCore} this instance, for chaining
     */
    use(mod, ...argv) {
        useModules_1.useModules(this, mod, ...argv);
        return this;
    }
    /**
     * Get a dictionary table.
     *
     * @param {String} type dictionary type
     * @return {Object|undefined} the entry of `this.DICT` stored under `type`
     */
    getDict(type) {
        return this.DICT[type];
    }
    /**
     * Resolve the options for a `doSegment()` call by handing the per-call options
     * and the instance-level `options.optionsDoSegment` to the `getOptionsDoSegment` helper.
     *
     * @param {Object} [options] per-call options
     * @return {Object} whatever the helper returns
     */
    getOptionsDoSegment(options) {
        return getOptionsDoSegment_1.getOptionsDoSegment(options, this.options.optionsDoSegment);
    }
    /**
     * Normalise the input of `doSegment()` through the `_get_text` helper.
     *
     * @param {String|Buffer} text
     * @return {String} presumably the text as a string; `doSegment()` calls `.split()` on the result
     */
    _get_text(text) {
        return _get_text_1._get_text(text);
    }
    /**
     * Add a word to the blacklist database (and remove it from the TABLE database),
     * or, when `remove` is truthy, take the word off the blacklist again.
     *
     * @param {String} word
     * @param {Boolean} [remove] truthy to remove `word` from the blacklist instead of adding it
     * @return {SegmentCore} this instance, for chaining
     */
    addBlacklist(word, remove) {
        const BLACKLIST = this.getDictDatabase("BLACKLIST" /* BLACKLIST */);
        const TABLE = this.getDictDatabase("TABLE" /* TABLE */);
        if (remove) {
            BLACKLIST.remove(word);
        }
        else {
            BLACKLIST.add(word);
            TABLE.remove(word);
        }
        return this;
    }
    /**
     * Remove from the TABLE database every key that has a truthy value in the
     * BLACKLIST dictionary table.
     *
     * @return {SegmentCore} this instance, for chaining
     */
    doBlacklist() {
        const BLACKLIST = this.getDict("BLACKLIST" /* BLACKLIST */);
        const TABLE = this.getDictDatabase("TABLE" /* TABLE */);
        for (const [key, bool] of Object.entries(BLACKLIST)) {
            if (bool) {
                TABLE.remove(key);
            }
        }
        return this;
    }
    /**
     * List the registered modules for the given (resolved) options, as computed
     * by the `listModules` helper.
     *
     * @param {Object} [options] `doSegment` options
     * @return {Object} helper result; `doSegment()` reads its `enable.tokenizer` and `enable.optimizer`
     */
    listModules(options = {}) {
        options = this.getOptionsDoSegment(options);
        return listModules_1.listModules(this.modules, options);
    }
    /**
     * Segment a text into words.
     *
     * @param {String|Buffer} text text to segment
     * @param {Object} [options]
     *   - {Boolean} simple            return only the word content
     *   - {Boolean} stripPunctuation  remove punctuation
     *   - {Boolean} convertSynonym    convert synonyms
     *   - {Boolean} stripStopword     remove stop words
     *   - {Boolean} stripSpace        remove whitespace entries
     * @return {Array} the segmented words
     */
    doSegment(text, options = {}) {
        const me = this;
        options = me.getOptionsDoSegment(options);
        const sections = me._get_text(text).split(me.SPLIT);
        // Drop the local reference to the input once it has been split.
        text = undefined;
        const mods = me.listModules(options).enable;
        const filter = me.SPLIT_FILTER;
        // Segment the text section by section.
        let ret = [];
        for (const section of sections) {
            // A global/sticky RegExp remembers `lastIndex` between `test()` calls,
            // which would make it miss a match right after a previous match.
            if (filter instanceof RegExp) {
                filter.lastIndex = 0;
            }
            if (filter.test(section)) {
                // Filtered sections are kept verbatim and never analysed.
                ret.push({ w: section });
                continue;
            }
            if (section.length > 0) {
                // Tokenize
                let sret = me.tokenizer.split(section, mods.tokenizer);
                // Optimize
                sret = me.optimizer.doOptimize(sret, mods.optimizer);
                // Append this section's words to the overall result
                if (sret.length > 0) {
                    ret = ret.concat(sret);
                }
            }
        }
        // Remove punctuation
        if (options.stripPunctuation) {
            ret = doSegment_1._doSegmentStripPOSTAG(ret, POSTAG_1.default.D_W);
        }
        if (options.convertSynonym) {
            ret = this.convertSynonym(ret);
        }
        // Remove stop words
        if (options.stripStopword) {
            ret = doSegment_1._doSegmentStripStopword(ret, me.getDict('STOPWORD'));
        }
        if (options.stripSpace) {
            ret = doSegment_1._doSegmentStripSpace(ret);
        }
        // Return only the word content
        if (options.simple) {
            ret = doSegment_1._doSegmentSimple(ret);
        }
        return ret;
    }
    /**
     * Convert synonyms in a word array using the SYNONYM and TABLE dictionary
     * tables; the work is delegated to the `convertSynonym` helper.
     *
     * @param {Array} ret word array
     * @param {Boolean} [showcount] forwarded to the helper as `showcount`
     * @return {Array|Object} whatever the helper returns
     */
    convertSynonym(ret, showcount) {
        return convertSynonym_1.convertSynonym(ret, {
            showcount,
            DICT_SYNONYM: this.getDict('SYNONYM'),
            DICT_TABLE: this.getDict('TABLE'),
            POSTAG: this.POSTAG,
        });
    }
    /**
     * Join a word array into a string.
     *
     * @param {Array} words word array
     * @param {...*} argv extra arguments forwarded to the `stringify` helper
     * @return {String}
     */
    stringify(words, ...argv) {
        return stringify_1.stringify(words, ...argv);
    }
    /**
     * Join a word array into a string (static variant of the instance method).
     *
     * @param {Array} words word array
     * @param {...*} argv extra arguments forwarded to the `stringify` helper
     * @return {String}
     */
    static stringify(words, ...argv) {
        return stringify_1.stringify(words, ...argv);
    }
    /**
     * Split a word array on a given word or part-of-speech tag.
     *
     * @param {Array} words word array
     * @param {Number|String} s word or part-of-speech tag to split on
     * @param {...*} argv extra arguments forwarded to the `split` helper
     * @return {Array}
     */
    split(words, s, ...argv) {
        return split_1.split(words, s, ...argv);
    }
    /**
     * Find the position of a word or part-of-speech tag in a word array.
     *
     * @param {Array} words word array
     * @param {Number|String} s word or part-of-speech tag to look for
     * @param {Number} cur start position
     * @param {...*} argv extra arguments forwarded to the `indexOf` helper
     * @return {Number} the position, or -1 when not found
     */
    indexOf(words, s, cur, ...argv) {
        // `s` must be forwarded: without it the helper receives `cur` as the search target.
        return indexOf_1.indexOf(words, s, cur, ...argv);
    }
}
// Expose the class both as a named export and as the default export.
exports.SegmentCore = SegmentCore;
exports.default = SegmentCore;
224//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"core.js","sourceRoot":"","sources":["core.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,YAAY,CAAC;;;AAKb,sCAA+B;AAU/B,wCAAkF;AASlF,mDAAgD;AAChD,2CAAwC;AACxC,+CAA4C;AAC5C,6DAAwF;AACxF,uDAAoD;AACpD,mDAAgD;AAChD,uEAAoE;AACpE,qDAAkD;AAClD,mDAK6B;AAI7B;;GAEG;AACH,MAAa,WAAW;IA+DvB,YAAY,UAA2B,EAAE;QA5DzC;;;;;;;;;WASG;QACH,UAAK,GAAW,uCAAiD,CAAC;QAElE;;;;;WAKG;QACH,iBAAY,GAAkB,cAA+B,CAAC;QAE9D;;;WAGG;QACH,WAAM,GAAG,gBAAM,CAAC;QAChB;;;WAGG;QACH,SAAI,GAKA,EAAE,CAAC;QACP,YAAO,GAAG;YACT;;eAEG;YACH,SAAS,EAAE,EAAE;YACb;;eAEG;YACH,SAAS,EAAE,EAAE;SAIb,CAAC;QAKF,OAAE,GAEE,EAAE,CAAC;QAEP,YAAO,GAAoB,EAAE,CAAC;QAM7B,MAAM,IAAI,GAAG,IAAI,CAAC;QAElB,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,EAAE,EAAE,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;QAExD,IAAI,CAAC,SAAS,GAAG,IAAI,iBAAS,CAAC,IAAW,CAAC,CAAC;QAC5C,IAAI,CAAC,SAAS,GAAG,IAAI,iBAAS,CAAC,IAAW,CAAC,CAAC;QAE5C,IAAI,IAAI,CAAC,OAAO,CAAC,EAAE,EACnB;YACC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,UAAU,IAAI;gBAErC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;YAC3B,CAAC,CAAC,CAAC;SACH;QAED,OAAO,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;IACxB,CAAC;IAuBD,eAAe,CAAC,IAAY,EAAE,UAAoB,EAAE,YAAa;QAEhE,aAAa;QACb,OAAO,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC;IACtB,CAAC;IAWD,GAAG,CAAC,GAAG,EAAE,GAAG,IAAI;QAEf,uBAAU,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC;QAE/B,OAAO,IAAI,CAAA;IACZ,CAAC;IAgBD,OAAO,CAAC,IAAI;QAEX,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACxB,CAAC;IAED,mBAAmB,CAA8B,OAAW;QAE3D,OAAO,yCAAmB,CAAI,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAA;IACtE,CAAC;IAES,SAAS,CAAC,IAAqB;QAExC,OAAO,qBAAS,CAAC,IAAI,CAAC,CAAA;IACvB,CAAC;IAED,YAAY,CAAC,IAAY,EAAE,MAAgB;QAE1C,IAAI,EAAE,GAAG,IAAI,CAAC;QAEd,MAAM,SAAS,GAAG,EAAE,CAAC,eAAe,6BAA4B,CAAC;QACjE,MAAM,KAAK,GAAG,EAAE,CAAC,eAAe,qBAAwB,CAAC;QAEzD,IAAI,IAAI,GAAG,CAAC,MAAM,CAAC;QAEnB,IAAI,IAAI,EACR;YACC,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACpB,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;SACnB;aAED;YACC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;SACtB;QAED,OAAO,IAAI,CAAA;IACZ
,CAAC;IAED;;OAEG;IACH,WAAW;QAEV,IAAI,EAAE,GAAG,IAAI,CAAC;QAEd,MAAM,SAAS,GAAG,EAAE,CAAC,OAAO,6BAA4B,CAAC;QACzD,MAAM,KAAK,GAAG,EAAE,CAAC,eAAe,qBAAwB,CAAC;QAEzD,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC;aACvB,OAAO,CAAC,UAAU,CAAC,GAAG,EAAE,IAAI,CAAC;YAE7B,IAAI,IAAI,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;QAC1B,CAAC,CAAC,CACF;QAED,OAAO,IAAI,CAAA;IACZ,CAAC;IAED,WAAW,CAAC,UAA6B,EAAE;QAE1C,OAAO,GAAG,IAAI,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC;QAE5C,OAAO,yBAAW,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;IAC3C,CAAC;IAiBD,SAAS,CAAC,IAAI,EAAE,UAA6B,EAAE;QAE9C,MAAM,EAAE,GAAG,IAAI,CAAC;QAEhB,OAAO,GAAG,EAAE,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC;QAE1C,uBAAuB;QAEvB,IAAI,SAAS,GAAG,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC;YACjC,aAAa;aACZ,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAClB;QACD,IAAI,GAAG,SAAS,CAAC;QAEjB,MAAM,IAAI,GAAG,EAAE,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;QAE5C,sBAAsB;QACtB,IAAI,GAAG,GAAG,SAAS,CAAC,MAAM,CAAC,UAAU,GAAG,EAAE,OAAO;YAEhD,uBAAuB;YAEvB,IAAI,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,OAAO,CAAC,EACjC;gBACC,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC;gBAEjC,aAAa;gBACb,OAAO,GAAG,EAAE,CAAC;aACb;YAED,2BAA2B;YAC3B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EACtB;gBACC,KAAK;gBACL,IAAI,IAAI,GAAG,EAAE,CAAC,SAAS,CAAC,KAAK,CAAC,OAAO,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;gBAEvD,KAAK;gBACL,IAAI,GAAG,EAAE,CAAC,SAAS,CAAC,UAAU,CAAC,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;gBAErD,SAAS;gBACT,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EACnB;oBACC,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;iBACvB;aACD;YAED,OAAO,GAAG,CAAC;QACZ,CAAC,EAAE,EAAE,CAAC,CAAC;QAEP,SAAS;QACT,IAAI,OAAO,CAAC,gBAAgB,EAC5B;YACC,GAAG,GAAG,iCAAqB,CAAC,GAAG,EAAE,gBAAM,CAAC,GAAG,CAAC,CAAA;SAC5C;QAED,IAAI,OAAO,CAAC,cAAc,EAC1B;YACC,GAAG,GAAG,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;SAC/B;QAED,QAAQ;QACR,IAAI,OAAO,CAAC,aAAa,EACzB;YACC,GAAG,GAAG,mCAAuB,CAAC,GAAG,EAAE,EAAE,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAA;SAC1D;QAED,IAAI,OAAO,CAAC,UAAU,EACtB;YACC,GAAG,GAAG,gCAAoB,CAAC,GAAG,CAAC,CAAA;SAC/B;QAED,UAAU;QACV,IAAI,OAAO,CAAC,MAAM,EAClB;YACC,GAAG,GAAG,4BAAgB,CAAC,GAAG,CAAC,CAAA;SAC3B;QAED,OAAO,GAAG,CAAC;IACZ,CAAC
;IAaD,cAAc,CAAC,GAAiB,EAAE,SAAmB;QAEpD,OAAO,+BAAc,CAAC,GAAG,EAAE;YAC1B,SAAS;YACT,YAAY,EAAE,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC;YACrC,UAAU,EAAE,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC;YACjC,MAAM,EAAE,IAAI,CAAC,MAAM;SACnB,CAAgD,CAAC;IACnD,CAAC;IAED;;;;;OAKG;IACH,SAAS,CAAC,KAA4B,EAAE,GAAG,IAAI;QAE9C,OAAO,qBAAS,CAAC,KAAK,EAAE,GAAG,IAAI,CAAC,CAAC;IAClC,CAAC;IAED;;;;;OAKG;IACH,MAAM,CAAC,SAAS,CAAC,KAA4B,EAAE,GAAG,IAAI;QAErD,OAAO,qBAAS,CAAC,KAAK,EAAE,GAAG,IAAI,CAAC,CAAA;IACjC,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,KAAc,EAAE,CAAkB,EAAE,GAAG,IAAI;QAEhD,OAAO,aAAK,CAAC,KAAK,EAAE,CAAC,EAAE,GAAG,IAAI,CAAC,CAAA;IAChC,CAAC;IAED;;;;;;;OAOG;IACH,OAAO,CAAC,KAAc,EAAE,CAAkB,EAAE,GAAY,EAAE,GAAG,IAAI;QAEhE,OAAO,iBAAO,CAAC,KAAK,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,CAAA;IACpC,CAAC;CAED;AA9WD,kCA8WC;AAID,kBAAe,WAAW,CAAC","sourcesContent":["/**\n * 分词器接口\n *\n * @author 老雷<leizongmin@gmail.com>\n */\n\n'use strict';\n\n// @ts-ignore\n// @ts-ignore\nimport path = require('path');\nimport POSTAG from '../POSTAG';\nimport TableDictBlacklist from '../table/blacklist';\nimport AbstractTableDictCore from '../table/core';\nimport { IOptions as IOptionsTableDict, TableDict } from '../table/dict';\n\nimport Loader from '../loader';\nimport { crlf } from 'crlf-normalize';\nimport { TableDictStopword } from '../table/stopword';\nimport TableDictSynonym from '../table/synonym';\nimport SegmentDict from 'segment-dict';\nimport { ISubOptimizer, ISubTokenizer, Optimizer, Tokenizer } from '../mod/index';\nimport { debugToken } from '../util/debug';\nimport { IWordDebug } from '../util/index';\n\nimport deepmerge from 'deepmerge-plus/core';\nimport { EnumDictDatabase } from '../const';\nimport { ENUM_SUBMODS, ENUM_SUBMODS_NAME, ENUM_SUBMODS_OTHER } from '../mod/index';\nimport { defaultOptionsDoSegment } from './defaults';\nimport { IDICT, IDICT2, IDICT_BLACKLIST, IDICT_STOPWORD, IDICT_SYNONYM, IOptionsDoSegment, IOptionsSegment, ISPLIT, ISPLIT_FILTER, IWord } from './types';\nimport { stringify } from './methods/stringify';\nimport { 
split } from './methods/split';\nimport { indexOf } from './methods/indexOf';\nimport { convertSynonym, IConvertSynonymWithShowcount } from './methods/convertSynonym';\nimport { listModules } from './methods/listModules';\nimport { _get_text } from './methods/_get_text';\nimport { getOptionsDoSegment } from './methods/getOptionsDoSegment';\nimport { useModules } from './methods/useModules';\nimport {\n\t_doSegmentSimple,\n\t_doSegmentStripPOSTAG,\n\t_doSegmentStripSpace,\n\t_doSegmentStripStopword,\n} from './methods/doSegment';\nimport { ITSOverwrite } from 'ts-type';\n\n\n/**\n * 创建分词器接口\n */\nexport class SegmentCore\n{\n\n\t/**\n\t * 分段\n\t *\n\t * 由於 segment 是利用對內容的前後文分析來進行分詞\n\t * 所以如何切割段落對於結果就會產生不同影響\n\t *\n\t * `RegExp` or 具有 `.[Symbol.split](input: string, limit?: number) => string[]` 的物件\n\t *\n\t * @type {Segment.ISPLIT}\n\t */\n\tSPLIT: ISPLIT = /([\\r\\n]+|^[　\\s]+|[　\\s]+$|[　\\s]{2,})/gm as ISPLIT;\n\n\t/**\n\t * 分段之後 如果符合以下條件 則直接忽略分析\n\t * `RegExp` or 具有 `.test(input: string) => boolean` 的物件\n\t *\n\t * @type {Segment.ISPLIT_FILTER}\n\t */\n\tSPLIT_FILTER: ISPLIT_FILTER = /^([\\r\\n]+)$/g as ISPLIT_FILTER;\n\n\t/**\n\t * 词性\n\t * @type {POSTAG}\n\t */\n\tPOSTAG = POSTAG;\n\t/**\n\t * 词典表\n\t * @type {{}}\n\t */\n\tDICT: {\n\t\tSTOPWORD?: IDICT_STOPWORD,\n\t\tSYNONYM?: IDICT_SYNONYM,\n\n\t\t[key: string]: IDICT,\n\t} = {};\n\tmodules = {\n\t\t/**\n\t\t * 分词模块\n\t\t */\n\t\ttokenizer: [],\n\t\t/**\n\t\t * 优化模块\n\t\t */\n\t\toptimizer: [],\n\t} as {\n\t\ttokenizer: ISubTokenizer[],\n\t\toptimizer: ISubOptimizer[],\n\t};\n\n\ttokenizer: Tokenizer;\n\toptimizer: Optimizer;\n\n\tdb: {\n\t\t[key: string]: TableDict,\n\t} = {};\n\n\toptions: IOptionsSegment = {};\n\n\tinited?: boolean;\n\n\tconstructor(options: IOptionsSegment = {})\n\t{\n\t\tconst self = this;\n\n\t\tthis.options = Object.assign({}, this.options, options);\n\n\t\tthis.tokenizer = new Tokenizer(this as any);\n\t\tthis.optimizer = new Optimizer(this as any);\n\n\t\tif 
(this.options.db)\n\t\t{\n\t\t\tthis.options.db.forEach(function (data)\n\t\t\t{\n\t\t\t\tself.db[data.type] = data;\n\t\t\t});\n\t\t}\n\n\t\tdelete this.options.db;\n\t}\n\n\tgetDictDatabase<R extends TableDictSynonym>(type: EnumDictDatabase.SYNONYM,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDict>(type: EnumDictDatabase.TABLE, autocreate?: boolean, libTableDict?: { new(...argv): R }): R\n\tgetDictDatabase<R extends TableDictStopword>(type: EnumDictDatabase.STOPWORD,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDictBlacklist>(type: EnumDictDatabase.BLACKLIST,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends TableDictBlacklist>(type: EnumDictDatabase.BLACKLIST_FOR_OPTIMIZER,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase<R extends AbstractTableDictCore<any>>(type: string | EnumDictDatabase,\n\t\tautocreate?: boolean,\n\t\tlibTableDict?: { new(...argv): R },\n\t): R\n\tgetDictDatabase(type: string, autocreate?: boolean, libTableDict?)\n\t{\n\t\t// @ts-ignore\n\t\treturn this.db[type];\n\t}\n\n\t/**\n\t * 载入分词模块\n\t *\n\t * @param {String|Array|Object} module 模块名称(数组)或模块对象\n\t * @return {Segment}\n\t */\n\tuse(mod: ISubOptimizer, ...argv): this\n\tuse(mod: ISubTokenizer, ...argv): this\n\tuse(mod, ...argv): this\n\tuse(mod, ...argv)\n\t{\n\t\tuseModules(this, mod, ...argv);\n\n\t\treturn this\n\t}\n\n\t/**\n\t * 取词典表\n\t *\n\t * @param {String} type 类型\n\t * @return {object}\n\t */\n\tgetDict(type: EnumDictDatabase.STOPWORD): IDICT_STOPWORD\n\tgetDict(type: EnumDictDatabase.SYNONYM): IDICT_SYNONYM\n\tgetDict(type: EnumDictDatabase.TABLE): IDICT<IWord>\n\tgetDict(type: EnumDictDatabase.BLACKLIST): IDICT_BLACKLIST\n\tgetDict(type: EnumDictDatabase.BLACKLIST_FOR_OPTIMIZER): IDICT_BLACKLIST\n\tgetDict(type: 'TABLE2'): 
IDICT2<IWord>\n\tgetDict(type: EnumDictDatabase): IDICT\n\tgetDict(type): IDICT\n\tgetDict(type)\n\t{\n\t\treturn this.DICT[type];\n\t}\n\n\tgetOptionsDoSegment<T extends IOptionsDoSegment>(options?: T): T\n\t{\n\t\treturn getOptionsDoSegment<T>(options, this.options.optionsDoSegment)\n\t}\n\n\tprotected _get_text(text: string | Buffer): string\n\t{\n\t\treturn _get_text(text)\n\t}\n\n\taddBlacklist(word: string, remove?: boolean)\n\t{\n\t\tlet me = this;\n\n\t\tconst BLACKLIST = me.getDictDatabase(EnumDictDatabase.BLACKLIST);\n\t\tconst TABLE = me.getDictDatabase(EnumDictDatabase.TABLE);\n\n\t\tlet bool = !remove;\n\n\t\tif (bool)\n\t\t{\n\t\t\tBLACKLIST.add(word);\n\t\t\tTABLE.remove(word);\n\t\t}\n\t\telse\n\t\t{\n\t\t\tBLACKLIST.remove(word)\n\t\t}\n\n\t\treturn this\n\t}\n\n\t/**\n\t * remove key in TABLE by BLACKLIST\n\t */\n\tdoBlacklist()\n\t{\n\t\tlet me = this;\n\n\t\tconst BLACKLIST = me.getDict(EnumDictDatabase.BLACKLIST);\n\t\tconst TABLE = me.getDictDatabase(EnumDictDatabase.TABLE);\n\n\t\tObject.entries(BLACKLIST)\n\t\t\t.forEach(function ([key, bool])\n\t\t\t{\n\t\t\t\tbool && TABLE.remove(key)\n\t\t\t})\n\t\t;\n\n\t\treturn this\n\t}\n\n\tlistModules(options: IOptionsDoSegment = {})\n\t{\n\t\toptions = this.getOptionsDoSegment(options);\n\n\t\treturn listModules(this.modules, options);\n\t}\n\n\t/**\n\t * 开始分词\n\t *\n\t * @param {String} text 文本\n\t * @param {Object} options 选项\n\t *   - {Boolean} simple 是否仅返回单词内容\n\t *   - {Boolean} stripPunctuation 去除标点符号\n\t *   - {Boolean} convertSynonym 转换同义词\n\t *   - {Boolean} stripStopword 去除停止符\n\t * @return {Array}\n\t */\n\tdoSegment(text: string | Buffer, options: ITSOverwrite<IOptionsDoSegment, {\n\t\tsimple: true,\n\t}>): string[]\n\tdoSegment(text: string | Buffer, options?: IOptionsDoSegment): IWord[]\n\tdoSegment(text, options: IOptionsDoSegment = {})\n\t{\n\t\tconst me = this;\n\n\t\toptions = me.getOptionsDoSegment(options);\n\n\t\t//console.dir(options);\n\n\t\tlet text_list = 
me._get_text(text)\n\t\t\t// @ts-ignore\n\t\t\t.split(this.SPLIT)\n\t\t;\n\t\ttext = undefined;\n\n\t\tconst mods = me.listModules(options).enable;\n\n\t\t// 将文本按照换行符分割成多段，并逐一分词\n\t\tlet ret = text_list.reduce(function (ret, section)\n\t\t{\n\t\t\t//console.dir(section);\n\n\t\t\tif (me.SPLIT_FILTER.test(section))\n\t\t\t{\n\t\t\t\tret = ret.concat({ w: section });\n\n\t\t\t\t// @ts-ignore\n\t\t\t\tsection = [];\n\t\t\t}\n\n\t\t\t//section = section.trim();\n\t\t\tif (section.length > 0)\n\t\t\t{\n\t\t\t\t// 分词\n\t\t\t\tlet sret = me.tokenizer.split(section, mods.tokenizer);\n\n\t\t\t\t// 优化\n\t\t\t\tsret = me.optimizer.doOptimize(sret, mods.optimizer);\n\n\t\t\t\t// 连接分词结果\n\t\t\t\tif (sret.length > 0)\n\t\t\t\t{\n\t\t\t\t\tret = ret.concat(sret);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\treturn ret;\n\t\t}, []);\n\n\t\t// 去除标点符号\n\t\tif (options.stripPunctuation)\n\t\t{\n\t\t\tret = _doSegmentStripPOSTAG(ret, POSTAG.D_W)\n\t\t}\n\n\t\tif (options.convertSynonym)\n\t\t{\n\t\t\tret = this.convertSynonym(ret);\n\t\t}\n\n\t\t// 去除停止符\n\t\tif (options.stripStopword)\n\t\t{\n\t\t\tret = _doSegmentStripStopword(ret, me.getDict('STOPWORD'))\n\t\t}\n\n\t\tif (options.stripSpace)\n\t\t{\n\t\t\tret = _doSegmentStripSpace(ret)\n\t\t}\n\n\t\t// 仅返回单词内容\n\t\tif (options.simple)\n\t\t{\n\t\t\tret = _doSegmentSimple(ret)\n\t\t}\n\n\t\treturn ret;\n\t}\n\n\t/**\n\t * 转换同义词\n\t */\n\tconvertSynonym(ret: IWordDebug[], showcount: true): {\n\t\tcount: number,\n\t\tlist: IWordDebug[],\n\t}\n\t/**\n\t * 转换同义词\n\t */\n\tconvertSynonym(ret: IWordDebug[], showcount?: boolean): IWordDebug[]\n\tconvertSynonym(ret: IWordDebug[], showcount?: boolean)\n\t{\n\t\treturn convertSynonym(ret, {\n\t\t\tshowcount,\n\t\t\tDICT_SYNONYM: this.getDict('SYNONYM'),\n\t\t\tDICT_TABLE: this.getDict('TABLE'),\n\t\t\tPOSTAG: this.POSTAG,\n\t\t}) as IWordDebug[] | IConvertSynonymWithShowcount;\n\t}\n\n\t/**\n\t * 将单词数组连接成字符串\n\t *\n\t * @param {Array} words 单词数组\n\t * @return {String}\n\t */\n\tstringify(words: Array<IWord 
| string>, ...argv): string\n\t{\n\t\treturn stringify(words, ...argv);\n\t}\n\n\t/**\n\t * 将单词数组连接成字符串\n\t *\n\t * @param {Array} words 单词数组\n\t * @return {String}\n\t */\n\tstatic stringify(words: Array<IWord | string>, ...argv): string\n\t{\n\t\treturn stringify(words, ...argv)\n\t}\n\n\t/**\n\t * 根据某个单词或词性来分割单词数组\n\t *\n\t * @param {Array} words 单词数组\n\t * @param {Number|String} s 用于分割的单词或词性\n\t * @return {Array}\n\t */\n\tsplit(words: IWord[], s: string | number, ...argv): IWord[]\n\t{\n\t\treturn split(words, s, ...argv)\n\t}\n\n\t/**\n\t * 在单词数组中查找某一个单词或词性所在的位置\n\t *\n\t * @param {Array} words 单词数组\n\t * @param {Number|String} s 要查找的单词或词性\n\t * @param {Number} cur 开始位置\n\t * @return {Number} 找不到，返回-1\n\t */\n\tindexOf(words: IWord[], s: string | number, cur?: number, ...argv)\n\t{\n\t\treturn indexOf(words, cur, ...argv)\n\t}\n\n}\n\nexport { IDICT, IDICT2, IDICT_BLACKLIST, IDICT_STOPWORD, IDICT_SYNONYM, IOptionsDoSegment, IOptionsSegment, ISPLIT, ISPLIT_FILTER, IWord }\n\nexport default SegmentCore;\n"]}
\No newline at end of file