// Type definitions for the compromise natural-language library ('nlp').
1export as namespace nlp
2
3declare interface nlp<D extends object, W extends object> {
4 /** normal usage */
5 (text: string): nlp.ExtendedDocument<D, W>
6 /** tozenize string */
7 tokenize(text: string): nlp.ExtendedDocument<D, W>
8 /** mix in a compromise-plugin */
9 extend<P>(
10 plugin: P
11 ): nlp<
12 P extends nlp.Plugin<infer PD, infer PW> ? { [k in keyof (PD & D)]: (PD & D)[k] } : { [k in keyof D]: D[k] },
13 P extends nlp.Plugin<infer PD, infer PW> ? { [k in keyof (PW & W)]: (PW & W)[k] } : { [k in keyof W]: W[k] }
14 >
15
16 /** re-generate a Doc object from .json() results */
17 load(json: any): nlp.ExtendedDocument<D, W>
18 /** log our decision-making for debugging */
19 verbose(bool: boolean): nlp.ExtendedDocument<D, W>
20 /** current semver version of the library */
21 version: nlp.ExtendedDocument<D, W>
22}
23
24declare function nlp(text: string): nlp.DefaultDocument
25declare function nlp<D extends object, W extends object>(text: string): nlp.ExtendedDocument<D, W>
26
27// possible values to .json()
28declare interface JsonOptions {
29 /** a perfect copy of the input text */
30 text?: boolean
31 /** normalized whitespace, case, unicode, punctuation */
32 normal?: boolean
33 /** lowercase, trimmed, contractions expanded. */
34 reduced?: boolean
35 /** cleanup whitespace */
36 trim?: boolean
37 /** character-position where this begins */
38 offset?: boolean
39 /** frequency of this match in the document */
40 count?: boolean
41 /** remove duplicate results*/
42 unique?: boolean
43 /** starting term # in document */
44 index?: boolean
45 /** options for each term */
46 terms?: {
47 text?: boolean
48 normal?: boolean
49 clean?: boolean
50 implicit?: boolean
51 tags?: boolean
52 whitespace?: boolean
53 id?: boolean
54 offset?: boolean
55 bestTag?: boolean
56 }
57}
58
59// Constructor
60declare module nlp {
61 export function tokenize(text: string): DefaultDocument
62 /** mix in a compromise-plugin */
63 export function extend<P>(
64 plugin: P
65 ): nlp<P extends Plugin<infer D, infer W> ? D : {}, P extends Plugin<infer D, infer W> ? W : {}>
66 /** re-generate a Doc object from .json() results */
67 export function load(json: any): DefaultDocument
68 /** log our decision-making for debugging */
69 export function verbose(bool: boolean): DefaultDocument
70 /** current semver version of the library */
71 export const version: number
72
73 type Plugin<D extends object, W extends object> = (
74 Doc: Document<World & W> & D & { prototype: D },
75 world: World & W
76 ) => void
77
78 type ExtendedWorld<W extends object> = nlp.World & W
79 type ExtendedDocument<D extends object, W extends object> = {
80 [k in keyof (nlp.Document<ExtendedWorld<W>> & D)]: (nlp.Document<ExtendedWorld<W>> & D)[k]
81 }
82 type DefaultDocument = {
83 [k in keyof nlp.Document]: nlp.Document[k]
84 }
85
86 class Document<W extends World = World> {
87 // Utils
88 /** return the whole original document ('zoom out') */
89 all(): Document<W>
90 /** is this document empty? */
91 found: boolean
92 /** return the previous result */
93 parent(): Document<W>
94 /** return all of the previous results */
95 parents(): Document<W>[]
96 /** (re)run the part-of-speech tagger on this document */
97 tagger(): Document<W>
98 /** count the # of terms in each match */
99 wordCount(): number
100 /** count the # of characters of each match */
101 length(): number
102 /** deep-copy the document, so that no references remain */
103 clone(shallow?: boolean): Document<W>
104 /** freeze the current state of the document, for speed-purposes */
105 cache(options?: object): Document<W>
106 /** un-freezes the current state of the document, so it may be transformed */
107 uncache(options?: object): Document<W>
108 /** the current world */
109 world: W
110
111 // Accessors
112 /** use only the first result(s) */
113 first(n?: number): Document<W>
114 /** use only the last result(s) */
115 last(n?: number): Document<W>
116 /** grab a subset of the results */
117 slice(start: number, end?: number): Document<W>
118 /** use only the nth result */
119 eq(n: number): Document<W>
120 /** get the first word in each match */
121 firstTerm(): Document<W>
122 /** get the end word in each match */
123 lastTerm(): Document<W>
124 /** return a flat list of all Term objects in match */
125 termList(): any
126
127 // Match
128 /** return a new Doc, with this one as a parent */
129 match(match: string | Document<W>): Document<W>
130 /** return all results except for this */
131 not(match: string | Document<W>): Document<W>
132 /** return only the first match */
133 matchOne(match: string | Document<W>): Document<W>
134 /** return each current phrase, only if it contains this match */
135 if(match: string | Document<W>): Document<W>
136 /** Filter-out any current phrases that have this match */
137 ifNo(match: string | Document<W>): Document<W>
138 /** Return a boolean if this match exists */
139 has(match: string | Document<W>): boolean
140 /** search through earlier terms, in the sentence */
141 lookBehind(match: string | Document<W>): Document<W>
142 /** search through following terms, in the sentence */
143 lookAhead(match: string | Document<W>): Document<W>
144 /** return the terms before each match */
145 before(match: string | Document<W>): Document<W>
146 /** return the terms after each match */
147 after(match: string | Document<W>): Document<W>
148 /** quick find for an array of string matches */
149 lookup(matches: string[]): Document<W>
150
151 // Case
152 /** turn every letter of every term to lower-cse */
153 toLowerCase(): Document<W>
154 /** turn every letter of every term to upper case */
155 toUpperCase(): Document<W>
156 /** upper-case the first letter of each term */
157 toTitleCase(): Document<W>
158 /** remove whitespace and title-case each term */
159 toCamelCase(): Document<W>
160
161 // Whitespace
162 /** add this punctuation or whitespace before each match */
163 pre(str: string, concat: boolean): Document<W>
164 /** add this punctuation or whitespace after each match */
165 post(str: string, concat: boolean): Document<W>
166 /** remove start and end whitespace */
167 trim(): Document<W>
168 /** connect words with hyphen, and remove whitespace */
169 hyphenate(): Document<W>
170 /** remove hyphens between words, and set whitespace */
171 dehyphenate(): Document<W>
172
173 // Tag
174 /** Give all terms the given tag */
175 tag(tag: string, reason?: string): Document<W>
176 /** Only apply tag to terms if it is consistent with current tags */
177 tagSafe(tag: string, reason?: string): Document<W>
178 /** Remove this term from the given terms */
179 unTag(tag: string, reason?: string): Document<W>
180 /** return only the terms that can be this tag */
181 canBe(tag: string): Document<W>
182
183 // Loops
184 /** run each phrase through a function, and create a new document */
185 map(fn: Function): Document<W> | []
186 /** run a function on each phrase, as an individual document */
187 forEach(fn: Function): Document<W>
188 /** return only the phrases that return true */
189 filter(fn: Function): Document<W>
190 /** return a document with only the first phrase that matches */
191 find(fn: Function): Document<W> | undefined
192 /** return true or false if there is one matching phrase */
193 some(fn: Function): Document<W>
194 /** sample a subset of the results */
195 random(n?: number): Document<W>
196
197 // Insert
198 /** substitute-in new content */
199 replaceWith(text: string | Function, keepTags?: boolean | object, keepCase?: boolean): Document<W>
200 /** search and replace match with new content */
201 replace(match: string, text?: string | Function, keepTags?: boolean | object, keepCase?: boolean): Document<W>
202 /** fully remove these terms from the document */
203 delete(match: string): Document<W>
204 /** add these new terms to the end (insertAfter) */
205 append(text: string): Document<W>
206 /** add these new terms to the front (insertBefore) */
207 prepend(text: string): Document<W>
208 /** add these new things to the end */
209 concat(text: string): Document<W>
210
211 // transform
212 /**re-arrange the order of the matches (in place) */
213 sort(method?: string | Function): Document<W>
214 /**reverse the order of the matches, but not the words */
215 reverse(): Document<W>
216 /** clean-up the document, in various ways */
217 normalize(options?: string | object): string
218 /** remove any duplicate matches */
219 unique(): Document<W>
220 /** return a Document with three parts for every match ('splitOn') */
221 split(match?: string): Document<W>
222 /** separate everything after the match as a new phrase */
223 splitBefore(match?: string): Document<W>
224 /** separate everything before the word, as a new phrase */
225 splitAfter(match?: string): Document<W>
226 /** split a document into labeled sections */
227 segment(regs: object, options?: object): Document<W>
228 /** make all phrases into one phrase */
229 join(str?: string): Document<W>
230
231 // Output
232 /** return the document as text */
233 text(options?: string | object): string
234 /** pull out desired metadata from the document */
235 json(options?: JsonOptions | string): any
236 /** some named output formats */
237 out(format?: 'text' | 'normal' | 'offset' | 'terms'): string
238 out(format: 'array'): string[]
239 out(format: 'tags' | 'terms'): Array<{ normal: string; text: string; tags: string[] }>
240 out(format: 'json'): Array<{ normal: string; text: string; tags: () => void }>[]
241 out(format: 'debug'): Text
242 out(format: 'topk'): Array<{ normal: string; count: number; percent: number }>
243 /** pretty-print the current document and its tags */
244 debug(): Document<W>
245 /** store a parsed document for later use */
246 export(): any
247
248 // Selections
249 /** split-up results by each individual term */
250 terms(n?: number): Document<W>
251 /** split-up results into multi-term phrases */
252 clauses(n?: number): Document<W>
253 /** return all terms connected with a hyphen or dash like `'wash-out'`*/
254 hyphenated(n?: number): Document<W>
255 /** add quoation marks around each match */
256 toQuoations(start?: string, end?: string): Document<W>
257 /** add brackets around each match */
258 toParentheses(start?: string, end?: string): Document<W>
259 /** return things like `'(939) 555-0113'` */
260 phoneNumbers(n?: number): Document<W>
261 /** return things like `'#nlp'` */
262 hashTags(n?: number): Document<W>
263 /** return things like `'hi@compromise.cool'` */
264 emails(n?: number): Document<W>
265 /** return things like `:)` */
266 emoticons(n?: number): Document<W>
267 /** return athings like `💋` */
268 emoji(n?: number): Document<W>
269 /** return things like `'@nlp_compromise'`*/
270 atMentions(n?: number): Document<W>
271 /** return things like `'compromise.cool'` */
272 urls(n?: number): Document<W>
273 /** return things like `'quickly'` */
274 adverbs(n?: number): Document<W>
275 /** return things like `'he'` */
276 pronouns(n?: number): Document<W>
277 /** return things like `'but'`*/
278 conjunctions(n?: number): Document<W>
279 /** return things like `'of'`*/
280 prepositions(n?: number): Document<W>
281 /** return person names like `'John A. Smith'`*/
282 people(n?: number): Document<W>
283 /** return location names like `'Paris, France'`*/
284 places(n?: number): Document<W>
285 /** return companies and org names like `'Google Inc.'`*/
286 organizations(n?: number): Document<W>
287 /** return people, places, and organizations */
288 topics(n?: number): Document<W>
289
290 // Subsets
291 /** alias for .all(), until plugin overloading */
292 sentences(): Document<W>
293 /** return things like `'Mrs.'`*/
294 abbreviations(n?: number): Abbreviations<W>
295 /** return any multi-word terms, like "didn't" */
296 contractions(n?: number): Contractions<W>
297 /** contract words that can combine, like "did not" */
298 contract(): Document<W>
299 /** return anything inside (parentheses) */
300 parentheses(n?: number): Parentheses<W>
301 /** return things like "Spencer's" */
302 possessives(n?: number): Possessives<W>
303 /** return any terms inside 'quotation marks' */
304 quotations(n?: number): Quotations<W>
305 /** return things like `'FBI'` */
306 acronyms(n?: number): Acronyms<W>
307 /** return things like `'eats, shoots, and leaves'` */
308 lists(n?: number): Lists<W>
309 /** return any subsequent terms tagged as a Noun */
310 nouns(n?: number): Nouns<W>
311 /** return any subsequent terms tagged as a Verb */
312 verbs(n?: number): Verbs<W>
313 }
314
315 // Nouns class
316 interface Nouns<W extends World = World> extends ExtendedDocument<{}, W> {
317 /** get any adjectives describing this noun*/
318 adjectives(): Document<W>
319 /** return only plural nouns */
320 isPlural(): Document<W>
321 /** return only nouns that _can be_ inflected as plural */
322 hasPlural(): Document<W>
323 /** 'football captain' → 'football captains' */
324 toPlural(setArticle?: boolean): Document<W>
325 /** 'turnovers' → 'turnover' */
326 toSingular(setArticle?: boolean): Document<W>
327 /** add a `'s` to the end, in a safe manner. */
328 toPossessive(): Document<W>
329 }
330
331 // Verbs class
332 interface Verbs<W extends World = World> extends Document<W> {
333 /** return the adverbs describing this verb */
334 adverbs(): Document<W>
335 /** return only plural nouns */
336 isPlural(): Document<W>
337 /** return only singular nouns */
338 isSingular(): Document<W>
339 /** return all forms of these verbs */
340 conjugate(): Document<W>
341 /** 'will go' → 'went' */
342 toPastTense(): Document<W>
343 /** 'walked' → 'walks' */
344 toPresentTense(): Document<W>
345 /** 'walked' → 'will walk' */
346 toFutureTense(): Document<W>
347 /** 'walks' → 'walk' */
348 toInfinitive(): Document<W>
349 /** 'walks' → 'walking' */
350 toGerund(): Document<W>
351 /** return verbs with 'not' */
352 isNegative(): Document<W>
353 /** only verbs without 'not'*/
354 isPositive(): Document<W>
355 /** 'went' → 'did not go'*/
356 toNegative(): Document<W>
357 /** "didn't study" → 'studied' */
358 toPositive(): Document<W>
359 }
360
361 interface Abbreviations<W extends World = World> extends Document<W> {
362 /** */
363 stripPeriods(): Document<W>
364 /** */
365 addPeriods(): Document<W>
366 }
367
368 interface Acronyms<W extends World = World> extends Document<W> {
369 /** */
370 stripPeriods(): Document<W>
371 /** */
372 addPeriods(): Document<W>
373 }
374
375 interface Contractions<W extends World = World> extends Document<W> {
376 /** */
377 expand(): Document<W>
378 }
379
380 interface Parentheses<W extends World = World> extends Document<W> {
381 /** */
382 unwrap(): Document<W>
383 }
384
385 interface Possessives<W extends World = World> extends Document<W> {
386 /** */
387 strip(): Document<W>
388 }
389
390 interface Quotations<W extends World = World> extends Document<W> {
391 /** */
392 unwrap(): Document<W>
393 }
394
395 interface Lists<W extends World = World> extends Document<W> {
396 /** */
397 conjunctions(): Document<W>
398 /** */
399 parts(): Document<W>
400 /** */
401 items(): Document<W>
402 /** */
403 add(): Document<W>
404 /** */
405 remove(): Document<W>
406 /** */
407 hasOxfordComma(): Document<W>
408 }
409
410 class World {}
411}
412
413export default nlp