UNPKG

14.8 kBJavaScriptView Raw
1import split from 'graphemesplit';
2
3const whitespaceRegex = /^\s+$/;
4const nonWordRegex = /^[`~!@#$%^&*()\-=_+{}[\]\|\\;':",./<>?]+$/;
5const sortKind = {
6 insertOrder: "insertOrder",
7 bestMatch: "bestMatch"
8}; // the default options, which will be used for any unset option
9
10const defaultOptions = {
11 keySelector: s => s,
12 threshold: .6,
13 ignoreCase: true,
14 ignoreSymbols: true,
15 normalizeWhitespace: true,
16 returnMatchData: false,
17 useDamerau: true,
18 useSellers: true,
19 sortBy: sortKind.bestMatch
20};
21
22const noop = () => {};
23
24const arrayWrap = item => item instanceof Array ? item : [item]; // return normalized string, with map included
25
26
27function normalize(string, options) {
28 const lower = options.ignoreCase ? string.toLocaleLowerCase() : string; // track transformations
29
30 const normal = [];
31 const map = [];
32 let lastWasWhitespace = true;
33 let length = 0;
34
35 for (const grapheme of split(lower)) {
36 whitespaceRegex.lastIndex = 0;
37 nonWordRegex.lastIndex = 0;
38
39 if (options.normalizeWhitespace && whitespaceRegex.test(grapheme)) {
40 if (!lastWasWhitespace) {
41 normal.push(" ");
42 map.push(length);
43 lastWasWhitespace = true;
44 }
45 } else if (!(options.ignoreSymbols && nonWordRegex.test(grapheme))) {
46 normal.push(grapheme.normalize());
47 map.push(length);
48 lastWasWhitespace = false;
49 }
50
51 length += grapheme.length;
52 } // add the end of the string
53
54
55 map.push(string.length);
56
57 while (normal[normal.length - 1] === " ") {
58 normal.pop();
59 map.pop();
60 }
61
62 return {
63 original: string,
64 normal,
65 map
66 };
67} // translates a match to the original string
68
69
70function denormalizeMatchPosition(match, map) {
71 return {
72 index: map[match.start],
73 length: map[match.end + 1] - map[match.start]
74 };
75} // walks back up the matrix to find the match index and length
76
77
78function walkBack(rows, scoreIndex) {
79 if (scoreIndex === 0) {
80 return {
81 index: 0,
82 length: 0
83 };
84 }
85
86 let start = scoreIndex;
87
88 for (let i = rows.length - 2; i > 0 && start > 1; i--) {
89 const row = rows[i];
90 start = row[start] < row[start - 1] ? start : start - 1;
91 }
92
93 return {
94 start: start - 1,
95 end: scoreIndex - 1
96 };
97} // walkback is a noop for non-sellers, but should still return an object
98
99
100function noopWalkback() {
101 return {
102 start: 0,
103 end: 0
104 };
105}
106
107const levUpdateScore = () => true;
108
109const sellersUpdateScore = (cur, min) => cur < min;
110
111function getLevScore(rows, length) {
112 const lastRow = rows[rows.length - 1];
113 const lastCell = lastRow[length - 1];
114 const scoreLength = Math.max(rows.length, length);
115 return {
116 score: 1 - lastCell / (scoreLength - 1),
117 scoreIndex: length - 1
118 };
119}
120
121function getSellersScore(rows, length) {
122 // search term was empty string, return perfect score
123 if (rows.length === 1) {
124 return {
125 score: 1,
126 scoreIndex: 0
127 };
128 }
129
130 const lastRow = rows[rows.length - 1];
131 let minValue = lastRow[0];
132 let minIndex = 0;
133
134 for (let i = 1; i < length; i++) {
135 const val = lastRow[i];
136
137 if (val < minValue) {
138 minValue = val;
139 minIndex = i;
140 }
141 }
142
143 return {
144 score: 1 - minValue / (rows.length - 1),
145 scoreIndex: minIndex
146 };
147}
148
149function initLevRows(rowCount, columnCount) {
150 const rows = new Array(rowCount);
151
152 for (let i = 0; i < rowCount; i++) {
153 rows[i] = new Array(columnCount);
154 rows[i][0] = i;
155 }
156
157 for (let i = 0; i < columnCount; i++) {
158 rows[0][i] = i;
159 }
160
161 return rows;
162}
163
164function initSellersRows(rowCount, columnCount) {
165 const rows = new Array(rowCount);
166 rows[0] = new Array(columnCount).fill(0);
167
168 for (let i = 1; i < rowCount; i++) {
169 rows[i] = new Array(columnCount);
170 rows[i][0] = i;
171 }
172
173 return rows;
174} // the content of the innermost loop of levenshtein
175
176
177function levCore(term, candidate, rows, i, j) {
178 const rowA = rows[i];
179 const rowB = rows[i + 1];
180 const cost = term[i] === candidate[j] ? 0 : 1;
181 let m;
182 let min = rowB[j] + 1; // insertion
183
184 if ((m = rowA[j + 1] + 1) < min) min = m; // deletion
185
186 if ((m = rowA[j] + cost) < min) min = m; // substitution
187
188 rowB[j + 1] = min;
189} // runtime complexity: O(mn) where m and n are the lengths of term and candidate, respectively
190// Note: this method only runs on a single column
191
192
193function levenshtein(term, candidate, rows, j) {
194 for (let i = 0; i < term.length; i++) {
195 levCore(term, candidate, rows, i, j);
196 }
197} // has all the runtime characteristics of the above, but punishes transpositions less,
198// resulting in better tolerance to those types of typos
199// Note: this method only runs on a single column
200
201
202function damerauLevenshtein(term, candidate, rows, j) {
203 // if j === 0, we can't check for transpositions,
204 // so use normal levenshtein instead
205 if (j === 0) {
206 levenshtein(term, candidate, rows, j);
207 return;
208 } // for i === 0, we also can't check for transpositions, so calculate
209 // the first row using normal levenshtein as well
210
211
212 if (term.length > 0) {
213 levCore(term, candidate, rows, 0, j);
214 }
215
216 for (let i = 1; i < term.length; i++) {
217 const rowA = rows[i - 1];
218 const rowB = rows[i];
219 const rowC = rows[i + 1];
220 const cost = term[i] === candidate[j] ? 0 : 1;
221 let m; // insertion
222
223 let min = rowC[j] + 1; // deletion
224
225 if ((m = rowB[j + 1] + 1) < min) min = m; // substitution
226
227 if ((m = rowB[j] + cost) < min) min = m; // transposition
228
229 if (term[i] === candidate[j - 1] && term[i - 1] === candidate[j] && (m = rowA[j - 1] + cost) < min) min = m;
230 rowC[j + 1] = min;
231 }
232} // method for creating a trie from search candidates
233// using a trie can significantly improve search time
234
235
236function trieInsert(trie, string, item) {
237 let walker = trie;
238
239 for (let i = 0; i < string.length; i++) {
240 const char = string[i]; // add child node if not already present
241
242 if (walker.children[char] == null) {
243 walker.children[char] = {
244 children: {},
245 candidates: [],
246 depth: 0
247 };
248 } // log max depth of this subtree
249
250
251 walker.depth = Math.max(walker.depth, string.length - i); // step into child node
252
253 walker = walker.children[char];
254 }
255
256 walker.candidates.push(item);
257} // transforms a list of candidates into objects with normalized search keys,
258// and inserts them into a trie
259// the keySelector is used to pick strings from an object to search by
260
261
262function createSearchTrie(trie, index, items, options) {
263 for (const item of items) {
264 const candidates = arrayWrap(options.keySelector(item)).map((key, keyIndex) => ({
265 index,
266 keyIndex,
267 item,
268 normalized: normalize(key, options)
269 }));
270 index++;
271
272 for (const candidate of candidates) {
273 trieInsert(trie, candidate.normalized.normal, candidate);
274 }
275 }
276} // scored item comparator
277
278
279function compareItemsBestScore(a, b) {
280 // highest priority is raw levenshtein score
281 const scoreDiff = b.score - a.score;
282
283 if (scoreDiff !== 0) {
284 return scoreDiff;
285 } // ties are broken by earlier match positions
286
287
288 const matchPosDiff = a.match.start - b.match.start;
289
290 if (matchPosDiff !== 0) {
291 return matchPosDiff;
292 } // prioritize earlier keys
293
294
295 const keyIndexDiff = a.keyIndex - b.keyIndex;
296
297 if (keyIndexDiff !== 0) {
298 return keyIndexDiff;
299 } // lastly, break ties by preferring the closer length match
300
301
302 const lengthDiff = a.lengthDiff - b.lengthDiff;
303
304 if (lengthDiff !== 0) {
305 return lengthDiff;
306 } // if all else fails, resort to insertion order
307
308
309 return compareItemsInsertOrder(a, b);
310}
311
312function compareItemsInsertOrder(a, b) {
313 return a.index - b.index;
314}
315
316function getCompareFunc(sortBy) {
317 switch (sortBy) {
318 case sortKind.bestMatch:
319 return compareItemsBestScore;
320
321 case sortKind.insertOrder:
322 return compareItemsInsertOrder;
323
324 default:
325 throw new Error(`unknown sortBy method ${sortBy}`);
326 }
327} // dedupes and adds results to the results list/map
328
329
330function addResult(results, resultMap, candidate, score, match, lengthDiff, compareItems) {
331 const scoredItem = {
332 item: candidate.item,
333 normalized: candidate.normalized,
334 score,
335 match,
336 index: candidate.index,
337 keyIndex: candidate.keyIndex,
338 lengthDiff
339 };
340
341 if (resultMap[candidate.index] == null) {
342 resultMap[candidate.index] = results.length;
343 results.push(scoredItem);
344 } else if (compareItems(scoredItem, results[resultMap[candidate.index]]) < 0) {
345 results[resultMap[candidate.index]] = scoredItem;
346 }
347}
348
349const getLevLength = Math.max;
350
351const getSellersLength = termLength => termLength; // skip any subtrees for which it is impossible to score >= threshold
352
353
354function levShouldContinue(node, pos, term, threshold, sValue) {
355 // earliest point (length) at which sValue could return to 0
356 const p1 = pos + sValue; // point (length) at which string lengths would match
357
358 const p2 = Math.min(term.length, pos + node.depth + 1); // the best score possible is the string which minimizes the value
359 // max(sValue, strLenDiff), which is always halfway between p1 and p2
360
361 const length = Math.ceil((p1 + p2) / 2);
362 const bestPossibleValue = length - p2;
363 return 1 - bestPossibleValue / length >= threshold;
364}
365
366function sellersShouldContinue(node, _, term, threshold, sValue, lastValue) {
367 const bestPossibleValue = Math.min(sValue, lastValue - (node.depth + 1));
368 return 1 - bestPossibleValue / term.length >= threshold;
369} // (pseudo) recursively walk the trie
370
371
372function searchRecurse(trie, term, scoreMethods, rows, results, resultMap, options) {
373 const stack = [];
374
375 for (const key in trie.children) {
376 const node = trie.children[key];
377 stack.push([node, 1, key, 0, term.length]);
378 }
379
380 const acc = new Array(trie.depth);
381
382 while (stack.length !== 0) {
383 const [node, len, char, si, sv] = stack.pop();
384 acc[len - 1] = char; // build rows
385
386 scoreMethods.score(term, acc, rows, len - 1); // track best score and position
387
388 const lastIndex = len;
389 const lastValue = rows[rows.length - 1][lastIndex];
390 let sIndex = si,
391 sValue = sv;
392
393 if (scoreMethods.shouldUpdateScore(lastValue, sv)) {
394 sIndex = lastIndex;
395 sValue = lastValue;
396 } // insert results
397
398
399 if (node.candidates.length > 0) {
400 const length = scoreMethods.getLength(term.length, len);
401 const score = 1 - sValue / length;
402
403 if (score >= options.threshold) {
404 const match = walkBack(rows, sIndex);
405 const lengthDiff = Math.abs(len - term.length);
406
407 for (const candidate of node.candidates) {
408 addResult(results, resultMap, candidate, score, match, lengthDiff, scoreMethods.compareItems);
409 }
410 }
411 } // recurse for children
412
413
414 for (const key in node.children) {
415 const child = node.children[key];
416
417 if (scoreMethods.shouldContinue(child, len, term, options.threshold, sValue, lastValue)) {
418 stack.push([child, len + 1, key, sIndex, sValue]);
419 }
420 }
421 }
422} // the core match finder: returns a sorted, filtered list of matches
423// this does not normalize input, requiring users to normalize themselves
424
425
426function searchCore(term, trie, options) {
427 const initMethod = options.useSellers ? initSellersRows : initLevRows;
428 const scoreMethods = {
429 score: options.useDamerau ? damerauLevenshtein : levenshtein,
430 getLength: options.useSellers ? getSellersLength : getLevLength,
431 shouldUpdateScore: options.useSellers ? sellersUpdateScore : levUpdateScore,
432 shouldContinue: options.useSellers ? sellersShouldContinue : levShouldContinue,
433 walkBack: options.useSellers ? walkBack : noopWalkback,
434 compareItems: getCompareFunc(options.sortBy)
435 }; // walk the trie, scoring and storing the candidates
436
437 const resultMap = {};
438 const results = [];
439 const rows = initMethod(term.length + 1, trie.depth + 1);
440
441 if (options.threshold <= 0 || term.length === 0) {
442 for (const candidate of trie.candidates) {
443 addResult(results, resultMap, candidate, 0, {
444 index: 0,
445 length: 0
446 }, term.length, scoreMethods.compareItems);
447 }
448 }
449
450 searchRecurse(trie, term, scoreMethods, rows, results, resultMap, options);
451 const sorted = results.sort(scoreMethods.compareItems);
452
453 if (options.returnMatchData) {
454 const denormalize = options.useSellers ? denormalizeMatchPosition : noop;
455 return sorted.map(candidate => ({
456 item: candidate.item,
457 original: candidate.normalized.original,
458 key: candidate.normalized.normal.join(""),
459 score: candidate.score,
460 match: denormalize(candidate.match, candidate.normalized.map)
461 }));
462 }
463
464 return sorted.map(candidate => candidate.item);
465} // wrapper for exporting sellers while allowing options to be passed in
466
467
468function fuzzy(term, candidate, options) {
469 options = { ...defaultOptions,
470 ...options
471 };
472 const initMethod = options.useSellers ? initSellersRows : initLevRows;
473 const scoreMethod = options.useDamerau ? damerauLevenshtein : levenshtein;
474 const getScore = options.useSellers ? getSellersScore : getLevScore;
475 term = normalize(term, options).normal;
476 const normalized = normalize(candidate, options);
477 const rows = initMethod(term.length + 1, normalized.normal.length + 1);
478
479 for (let j = 0; j < normalized.normal.length; j++) {
480 scoreMethod(term, normalized.normal, rows, j);
481 }
482
483 const scoreResult = getScore(rows, normalized.normal.length + 1);
484 return options.returnMatchData ? {
485 item: candidate,
486 original: normalized.original,
487 key: normalized.normal.join(""),
488 score: scoreResult.score,
489 match: options.useSellers ? denormalizeMatchPosition(walkBack(rows, scoreResult.scoreIndex), normalized.map) : noop()
490 } : scoreResult.score;
491} // simple one-off search. Useful if you don't expect to use the same candidate list again
492
493function search(term, candidates, options) {
494 options = { ...defaultOptions,
495 ...options
496 };
497 const trie = {
498 children: {},
499 candidates: [],
500 depth: 0
501 };
502 createSearchTrie(trie, 0, candidates, options);
503 return searchCore(normalize(term, options).normal, trie, options);
504} // class that improves performance of searching the same set multiple times
505// normalizes the strings and caches the result for future calls
506
507class Searcher {
508 constructor(candidates, options) {
509 this.options = Object.assign({}, defaultOptions, options);
510 this.trie = {
511 children: {},
512 candidates: [],
513 depth: 0
514 };
515 createSearchTrie(this.trie, 0, candidates, this.options);
516 this.count = candidates.length;
517 }
518
519 add(...candidates) {
520 createSearchTrie(this.trie, this.count, candidates, this.options);
521 this.count += candidates.length;
522 }
523
524 search(term, options) {
525 options = Object.assign({}, this.options, options);
526 return searchCore(normalize(term, this.options).normal, this.trie, options);
527 }
528
529}
530
531export { Searcher, fuzzy, search, sortKind };