// Source: UNPKG raw view (6.34 kB, TypeScript)
1/**
2 * 分词器接口
3 *
4 * @author 老雷<leizongmin@gmail.com>
5 */
6/// <reference types="node" />
7import POSTAG from '../POSTAG';
8import TableDictBlacklist from '../table/blacklist';
9import AbstractTableDictCore from '../table/core';
10import { TableDict } from '../table/dict';
11import { TableDictStopword } from '../table/stopword';
12import TableDictSynonym from '../table/synonym';
13import { ISubOptimizer, ISubTokenizer, Optimizer, Tokenizer } from '../mod/index';
14import { IWordDebug } from '../util/index';
15import { EnumDictDatabase } from '../const';
16import { IDICT, IDICT2, IDICT_BLACKLIST, IDICT_STOPWORD, IDICT_SYNONYM, IOptionsDoSegment, IOptionsSegment, ISPLIT, ISPLIT_FILTER, IWord } from './types';
17import { ITSOverwrite } from 'ts-type';
18/**
19 * 创建分词器接口
20 */
21export declare class SegmentCore {
22 /**
23 * 分段
24 *
25 * 由於 segment 是利用對內容的前後文分析來進行分詞
26 * 所以如何切割段落對於結果就會產生不同影響
27 *
28 * `RegExp` or 具有 `.[Symbol.split](input: string, limit?: number) => string[]` 的物件
29 *
30 * @type {Segment.ISPLIT}
31 */
32 SPLIT: ISPLIT;
33 /**
34 * 分段之後 如果符合以下條件 則直接忽略分析
35 * `RegExp` or 具有 `.test(input: string) => boolean` 的物件
36 *
37 * @type {Segment.ISPLIT_FILTER}
38 */
39 SPLIT_FILTER: ISPLIT_FILTER;
40 /**
41 * 词性
42 * @type {POSTAG}
43 */
44 POSTAG: typeof POSTAG;
45 /**
46 * 词典表
47 * @type {{}}
48 */
49 DICT: {
50 STOPWORD?: IDICT_STOPWORD;
51 SYNONYM?: IDICT_SYNONYM;
52 [key: string]: IDICT;
53 };
54 modules: {
55 tokenizer: ISubTokenizer[];
56 optimizer: ISubOptimizer[];
57 };
58 tokenizer: Tokenizer;
59 optimizer: Optimizer;
60 db: {
61 [key: string]: TableDict;
62 };
63 options: IOptionsSegment;
64 inited?: boolean;
65 constructor(options?: IOptionsSegment);
66 getDictDatabase<R extends TableDictSynonym>(type: EnumDictDatabase.SYNONYM, autocreate?: boolean, libTableDict?: {
67 new (...argv: any[]): R;
68 }): R;
69 getDictDatabase<R extends TableDict>(type: EnumDictDatabase.TABLE, autocreate?: boolean, libTableDict?: {
70 new (...argv: any[]): R;
71 }): R;
72 getDictDatabase<R extends TableDictStopword>(type: EnumDictDatabase.STOPWORD, autocreate?: boolean, libTableDict?: {
73 new (...argv: any[]): R;
74 }): R;
75 getDictDatabase<R extends TableDictBlacklist>(type: EnumDictDatabase.BLACKLIST, autocreate?: boolean, libTableDict?: {
76 new (...argv: any[]): R;
77 }): R;
78 getDictDatabase<R extends TableDictBlacklist>(type: EnumDictDatabase.BLACKLIST_FOR_OPTIMIZER, autocreate?: boolean, libTableDict?: {
79 new (...argv: any[]): R;
80 }): R;
81 getDictDatabase<R extends AbstractTableDictCore<any>>(type: string | EnumDictDatabase, autocreate?: boolean, libTableDict?: {
82 new (...argv: any[]): R;
83 }): R;
84 /**
85 * 载入分词模块
86 *
87 * @param {String|Array|Object} module 模块名称(数组)或模块对象
88 * @return {Segment}
89 */
90 use(mod: ISubOptimizer, ...argv: any[]): this;
91 use(mod: ISubTokenizer, ...argv: any[]): this;
92 use(mod: any, ...argv: any[]): this;
93 /**
94 * 取词典表
95 *
96 * @param {String} type 类型
97 * @return {object}
98 */
99 getDict(type: EnumDictDatabase.STOPWORD): IDICT_STOPWORD;
100 getDict(type: EnumDictDatabase.SYNONYM): IDICT_SYNONYM;
101 getDict(type: EnumDictDatabase.TABLE): IDICT<IWord>;
102 getDict(type: EnumDictDatabase.BLACKLIST): IDICT_BLACKLIST;
103 getDict(type: EnumDictDatabase.BLACKLIST_FOR_OPTIMIZER): IDICT_BLACKLIST;
104 getDict(type: 'TABLE2'): IDICT2<IWord>;
105 getDict(type: EnumDictDatabase): IDICT;
106 getDict(type: any): IDICT;
107 getOptionsDoSegment<T extends IOptionsDoSegment>(options?: T): T;
108 protected _get_text(text: string | Buffer): string;
109 addBlacklist(word: string, remove?: boolean): this;
110 /**
111 * remove key in TABLE by BLACKLIST
112 */
113 doBlacklist(): this;
114 listModules(options?: IOptionsDoSegment): {
115 enable: {
116 tokenizer: ISubTokenizer[];
117 optimizer: ISubOptimizer[];
118 };
119 disable: {
120 tokenizer: ISubTokenizer[];
121 optimizer: ISubOptimizer[];
122 };
123 };
124 /**
125 * 开始分词
126 *
127 * @param {String} text 文本
128 * @param {Object} options 选项
129 * - {Boolean} simple 是否仅返回单词内容
130 * - {Boolean} stripPunctuation 去除标点符号
131 * - {Boolean} convertSynonym 转换同义词
132 * - {Boolean} stripStopword 去除停止符
133 * @return {Array}
134 */
135 doSegment(text: string | Buffer, options: ITSOverwrite<IOptionsDoSegment, {
136 simple: true;
137 }>): string[];
138 doSegment(text: string | Buffer, options?: IOptionsDoSegment): IWord[];
139 /**
140 * 转换同义词
141 */
142 convertSynonym(ret: IWordDebug[], showcount: true): {
143 count: number;
144 list: IWordDebug[];
145 };
146 /**
147 * 转换同义词
148 */
149 convertSynonym(ret: IWordDebug[], showcount?: boolean): IWordDebug[];
150 /**
151 * 将单词数组连接成字符串
152 *
153 * @param {Array} words 单词数组
154 * @return {String}
155 */
156 stringify(words: Array<IWord | string>, ...argv: any[]): string;
157 /**
158 * 将单词数组连接成字符串
159 *
160 * @param {Array} words 单词数组
161 * @return {String}
162 */
163 static stringify(words: Array<IWord | string>, ...argv: any[]): string;
164 /**
165 * 根据某个单词或词性来分割单词数组
166 *
167 * @param {Array} words 单词数组
168 * @param {Number|String} s 用于分割的单词或词性
169 * @return {Array}
170 */
171 split(words: IWord[], s: string | number, ...argv: any[]): IWord[];
172 /**
173 * 在单词数组中查找某一个单词或词性所在的位置
174 *
175 * @param {Array} words 单词数组
176 * @param {Number|String} s 要查找的单词或词性
177 * @param {Number} cur 开始位置
178 * @return {Number} 找不到,返回-1
179 */
180 indexOf(words: IWord[], s: string | number, cur?: number, ...argv: any[]): number;
181}
182export { IDICT, IDICT2, IDICT_BLACKLIST, IDICT_STOPWORD, IDICT_SYNONYM, IOptionsDoSegment, IOptionsSegment, ISPLIT, ISPLIT_FILTER, IWord };
183export default SegmentCore;