UNPKG

14.9 kBJavaScriptView Raw
1import split from 'graphemesplit';
2
3function _defineProperty(obj, key, value) {
4 if (key in obj) {
5 Object.defineProperty(obj, key, {
6 value: value,
7 enumerable: true,
8 configurable: true,
9 writable: true
10 });
11 } else {
12 obj[key] = value;
13 }
14
15 return obj;
16}
17
18function ownKeys(object, enumerableOnly) {
19 var keys = Object.keys(object);
20
21 if (Object.getOwnPropertySymbols) {
22 var symbols = Object.getOwnPropertySymbols(object);
23 if (enumerableOnly) symbols = symbols.filter(function (sym) {
24 return Object.getOwnPropertyDescriptor(object, sym).enumerable;
25 });
26 keys.push.apply(keys, symbols);
27 }
28
29 return keys;
30}
31
32function _objectSpread2(target) {
33 for (var i = 1; i < arguments.length; i++) {
34 var source = arguments[i] != null ? arguments[i] : {};
35
36 if (i % 2) {
37 ownKeys(Object(source), true).forEach(function (key) {
38 _defineProperty(target, key, source[key]);
39 });
40 } else if (Object.getOwnPropertyDescriptors) {
41 Object.defineProperties(target, Object.getOwnPropertyDescriptors(source));
42 } else {
43 ownKeys(Object(source)).forEach(function (key) {
44 Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key));
45 });
46 }
47 }
48
49 return target;
50}
51
52const whitespaceRegex = /^\s+$/;
53const nonWordRegex = /^[`~!@#$%^&*()\-=_+{}[\]\|\\;':",./<>?]+$/; // the default options, which will be used for any unset option
54
55const defaultOptions = {
56 keySelector: s => s,
57 threshold: .6,
58 ignoreCase: true,
59 ignoreSymbols: true,
60 normalizeWhitespace: true,
61 returnMatchData: false,
62 useDamerau: true,
63 useSellers: true
64};
65
66const noop = () => {};
67
68const arrayWrap = item => item instanceof Array ? item : [item]; // return normalized string, with map included
69
70
71function normalize(string, options) {
72 const lower = options.ignoreCase ? string.toLocaleLowerCase() : string; // track transformations
73
74 const normal = [];
75 const map = [];
76 let lastWasWhitespace = true;
77 let length = 0;
78
79 for (const grapheme of split(lower)) {
80 whitespaceRegex.lastIndex = 0;
81 nonWordRegex.lastIndex = 0;
82
83 if (options.normalizeWhitespace && whitespaceRegex.test(grapheme)) {
84 if (!lastWasWhitespace) {
85 normal.push(" ");
86 map.push(length);
87 lastWasWhitespace = true;
88 }
89 } else if (!(options.ignoreSymbols && nonWordRegex.test(grapheme))) {
90 normal.push(grapheme.normalize());
91 map.push(length);
92 lastWasWhitespace = false;
93 }
94
95 length += grapheme.length;
96 } // add the end of the string
97
98
99 map.push(string.length);
100
101 while (normal[normal.length - 1] === " ") {
102 normal.pop();
103 map.pop();
104 }
105
106 return {
107 original: string,
108 normal,
109 map
110 };
111} // translates a match to the original string
112
113
114function denormalizeMatchPosition(match, map) {
115 return {
116 index: map[match.start],
117 length: map[match.end + 1] - map[match.start]
118 };
119} // walks back up the matrix to find the match index and length
120
121
122function walkBack(rows, scoreIndex) {
123 if (scoreIndex === 0) {
124 return {
125 index: 0,
126 length: 0
127 };
128 }
129
130 let start = scoreIndex;
131
132 for (let i = rows.length - 2; i > 0 && start > 1; i--) {
133 const row = rows[i];
134 start = row[start] < row[start - 1] ? start : start - 1;
135 }
136
137 return {
138 start: start - 1,
139 end: scoreIndex - 1
140 };
141}
142
143const levUpdateScore = () => true;
144
145const sellersUpdateScore = (cur, min) => cur < min;
146
147function getLevScore(rows, length) {
148 const lastRow = rows[rows.length - 1];
149 const lastCell = lastRow[length - 1];
150 const scoreLength = Math.max(rows.length, length);
151 return {
152 score: 1 - lastCell / (scoreLength - 1),
153 scoreIndex: length - 1
154 };
155}
156
157function getSellersScore(rows, length) {
158 // search term was empty string, return perfect score
159 if (rows.length === 1) {
160 return {
161 score: 1,
162 scoreIndex: 0
163 };
164 }
165
166 const lastRow = rows[rows.length - 1];
167 let minValue = lastRow[0];
168 let minIndex = 0;
169
170 for (let i = 1; i < length; i++) {
171 const val = lastRow[i];
172
173 if (val < minValue) {
174 minValue = val;
175 minIndex = i;
176 }
177 }
178
179 return {
180 score: 1 - minValue / (rows.length - 1),
181 scoreIndex: minIndex
182 };
183}
184
185function initLevRows(rowCount, columnCount) {
186 const rows = new Array(rowCount);
187
188 for (let i = 0; i < rowCount; i++) {
189 rows[i] = new Array(columnCount);
190 rows[i][0] = i;
191 }
192
193 for (let i = 0; i < columnCount; i++) {
194 rows[0][i] = i;
195 }
196
197 return rows;
198}
199
200function initSellersRows(rowCount, columnCount) {
201 const rows = new Array(rowCount);
202 rows[0] = new Array(columnCount).fill(0);
203
204 for (let i = 1; i < rowCount; i++) {
205 rows[i] = new Array(columnCount);
206 rows[i][0] = i;
207 }
208
209 return rows;
210} // the content of the innermost loop of levenshtein
211
212
213function levCore(term, candidate, rows, i, j) {
214 const rowA = rows[i];
215 const rowB = rows[i + 1];
216 const cost = term[i] === candidate[j] ? 0 : 1;
217 let m;
218 let min = rowB[j] + 1; // insertion
219
220 if ((m = rowA[j + 1] + 1) < min) min = m; // deletion
221
222 if ((m = rowA[j] + cost) < min) min = m; // substitution
223
224 rowB[j + 1] = min;
225} // runtime complexity: O(mn) where m and n are the lengths of term and candidate, respectively
226// Note: this method only runs on a single column
227
228
229function levenshtein(term, candidate, rows, j) {
230 for (let i = 0; i < term.length; i++) {
231 levCore(term, candidate, rows, i, j);
232 }
233} // has all the runtime characteristics of the above, but punishes transpositions less,
234// resulting in better tolerance to those types of typos
235// Note: this method only runs on a single column
236
237
238function damerauLevenshtein(term, candidate, rows, j) {
239 // if j === 0, we can't check for transpositions,
240 // so use normal levenshtein instead
241 if (j === 0) {
242 levenshtein(term, candidate, rows, j);
243 return;
244 } // for i === 0, we also can't check for transpositions, so calculate
245 // the first row using normal levenshtein as well
246
247
248 if (term.length > 0) {
249 levCore(term, candidate, rows, 0, j);
250 }
251
252 for (let i = 1; i < term.length; i++) {
253 const rowA = rows[i - 1];
254 const rowB = rows[i];
255 const rowC = rows[i + 1];
256 const cost = term[i] === candidate[j] ? 0 : 1;
257 let m; // insertion
258
259 let min = rowC[j] + 1; // deletion
260
261 if ((m = rowB[j + 1] + 1) < min) min = m; // substitution
262
263 if ((m = rowB[j] + cost) < min) min = m; // transposition
264
265 if (term[i] === candidate[j - 1] && term[i - 1] === candidate[j] && (m = rowA[j - 1] + cost) < min) min = m;
266 rowC[j + 1] = min;
267 }
268} // method for creating a trie from search candidates
269// using a trie can significantly improve search time
270
271
272function trieInsert(trie, string, item) {
273 let walker = trie;
274
275 for (let i = 0; i < string.length; i++) {
276 const char = string[i]; // add child node if not already present
277
278 if (walker.children[char] == null) {
279 walker.children[char] = {
280 children: {},
281 candidates: [],
282 depth: 0
283 };
284 } // log max depth of this subtree
285
286
287 walker.depth = Math.max(walker.depth, string.length - i); // step into child node
288
289 walker = walker.children[char];
290 }
291
292 walker.candidates.push(item);
293} // transforms a list of candidates into objects with normalized search keys,
294// and inserts them into a trie
295// the keySelector is used to pick strings from an object to search by
296
297
298function createSearchTrie(trie, index, items, options) {
299 for (const item of items) {
300 const candidates = arrayWrap(options.keySelector(item)).map((key, keyIndex) => ({
301 index,
302 keyIndex,
303 item,
304 normalized: normalize(key, options)
305 }));
306 index++;
307
308 for (const candidate of candidates) {
309 trieInsert(trie, candidate.normalized.normal, candidate);
310 }
311 }
312} // scored item comparator
313
314
315function compareItems(a, b) {
316 const scoreDiff = b.score - a.score;
317
318 if (scoreDiff !== 0) {
319 return scoreDiff;
320 }
321
322 const keyIndexDiff = a.keyIndex - b.keyIndex;
323
324 if (keyIndexDiff !== 0) {
325 return keyIndexDiff;
326 }
327
328 const lengthDiff = a.lengthDiff - b.lengthDiff;
329
330 if (lengthDiff !== 0) {
331 return lengthDiff;
332 }
333
334 return a.index - b.index;
335} // dedupes and adds results to the results list/map
336
337
338function addResult(results, resultMap, candidate, score, match, lengthDiff) {
339 const scoredItem = {
340 item: candidate.item,
341 normalized: candidate.normalized,
342 score,
343 match,
344 index: candidate.index,
345 keyIndex: candidate.keyIndex,
346 lengthDiff
347 };
348
349 if (resultMap[candidate.index] == null) {
350 resultMap[candidate.index] = results.length;
351 results.push(scoredItem);
352 } else if (compareItems(scoredItem, results[resultMap[candidate.index]]) < 0) {
353 results[resultMap[candidate.index]] = scoredItem;
354 }
355}
356
357const getLevLength = Math.max;
358
359const getSellersLength = termLength => termLength; // skip any subtrees for which it is impossible to score >= threshold
360
361
362function levShouldContinue(node, pos, term, threshold, sValue) {
363 // earliest point (length) at which sValue could return to 0
364 const p1 = pos + sValue; // point (length) at which string lengths would match
365
366 const p2 = Math.min(term.length, pos + node.depth + 1); // the best score possible is the string which minimizes the value
367 // max(sValue, strLenDiff), which is always halfway between p1 and p2
368
369 const length = Math.ceil((p1 + p2) / 2);
370 const bestPossibleValue = length - p2;
371 return 1 - bestPossibleValue / length >= threshold;
372}
373
374function sellersShouldContinue(node, _, term, threshold, sValue, lastValue) {
375 const bestPossibleValue = Math.min(sValue, lastValue - (node.depth + 1));
376 return 1 - bestPossibleValue / term.length >= threshold;
377} // recursively walk the trie
378
379
380function searchRecurse(node, acc, len, term, scoreMethods, rows, results, resultMap, options, sIndex, sValue) {
381 // build rows
382 scoreMethods.score(term, acc, rows, len - 1); // track best score and position
383
384 const lastIndex = len;
385 const lastValue = rows[rows.length - 1][lastIndex];
386
387 if (scoreMethods.shouldUpdateScore(lastValue, sValue)) {
388 sIndex = lastIndex;
389 sValue = lastValue;
390 } // insert results
391
392
393 if (node.candidates.length > 0) {
394 const length = scoreMethods.getLength(term.length, len);
395 const score = 1 - sValue / length;
396
397 if (score >= options.threshold) {
398 const match = scoreMethods.walkBack(rows, sIndex);
399 const lengthDiff = Math.abs(len - term.length);
400
401 for (const candidate of node.candidates) {
402 addResult(results, resultMap, candidate, score, match, lengthDiff);
403 }
404 }
405 } // recurse for children
406
407
408 for (const key in node.children) {
409 const child = node.children[key];
410
411 if (scoreMethods.shouldContinue(child, len, term, options.threshold, sValue, lastValue)) {
412 acc[len] = key;
413 searchRecurse(child, acc, len + 1, term, scoreMethods, rows, results, resultMap, options, sIndex, sValue);
414 }
415 }
416} // the core match finder: returns a sorted, filtered list of matches
417// this does not normalize input, requiring users to normalize themselves
418
419
420function searchCore(term, trie, options) {
421 const initMethod = options.useSellers ? initSellersRows : initLevRows;
422 const scoreMethods = {
423 score: options.useDamerau ? damerauLevenshtein : levenshtein,
424 getLength: options.useSellers ? getSellersLength : getLevLength,
425 shouldUpdateScore: options.useSellers ? sellersUpdateScore : levUpdateScore,
426 shouldContinue: options.useSellers ? sellersShouldContinue : levShouldContinue,
427 walkBack: options.returnMatchData && options.useSellers ? walkBack : noop
428 }; // walk the trie, scoring and storing the candidates
429
430 const resultMap = {};
431 const results = [];
432 const rows = initMethod(term.length + 1, trie.depth + 1);
433
434 if (options.threshold <= 0 || term.length === 0) {
435 for (const candidate of trie.candidates) {
436 addResult(results, resultMap, candidate, 0, {
437 index: 0,
438 length: 0
439 }, term.length);
440 }
441 }
442
443 for (const key in trie.children) {
444 const value = trie.children[key];
445 const acc = new Array(trie.depth);
446 acc[0] = key;
447 searchRecurse(value, acc, 1, term, scoreMethods, rows, results, resultMap, options, 0, term.length);
448 }
449
450 const sorted = results.sort(compareItems);
451
452 if (options.returnMatchData) {
453 const denormalize = options.useSellers ? denormalizeMatchPosition : noop;
454 return sorted.map(candidate => ({
455 item: candidate.item,
456 original: candidate.normalized.original,
457 key: candidate.normalized.normal.join(""),
458 score: candidate.score,
459 match: denormalize(candidate.match, candidate.normalized.map)
460 }));
461 }
462
463 return sorted.map(candidate => candidate.item);
464} // wrapper for exporting sellers while allowing options to be passed in
465
466
467function fuzzy(term, candidate, options) {
468 options = _objectSpread2(_objectSpread2({}, defaultOptions), options);
469 const initMethod = options.useSellers ? initSellersRows : initLevRows;
470 const scoreMethod = options.useDamerau ? damerauLevenshtein : levenshtein;
471 const getScore = options.useSellers ? getSellersScore : getLevScore;
472 term = normalize(term, options).normal;
473 const normalized = normalize(candidate, options);
474 const rows = initMethod(term.length + 1, normalized.normal.length + 1);
475
476 for (let j = 0; j < normalized.normal.length; j++) {
477 scoreMethod(term, normalized.normal, rows, j);
478 }
479
480 const scoreResult = getScore(rows, normalized.normal.length + 1);
481 return options.returnMatchData ? {
482 item: candidate,
483 original: normalized.original,
484 key: normalized.normal.join(""),
485 score: scoreResult.score,
486 match: options.useSellers ? denormalizeMatchPosition(walkBack(rows, scoreResult.scoreIndex), normalized.map) : noop()
487 } : scoreResult.score;
488} // simple one-off search. Useful if you don't expect to use the same candidate list again
489
490function search(term, candidates, options) {
491 options = _objectSpread2(_objectSpread2({}, defaultOptions), options);
492 const trie = {
493 children: {},
494 candidates: [],
495 depth: 0
496 };
497 createSearchTrie(trie, 0, candidates, options);
498 return searchCore(normalize(term, options).normal, trie, options);
499} // class that improves performance of searching the same set multiple times
500// normalizes the strings and caches the result for future calls
501
502class Searcher {
503 constructor(candidates, options) {
504 this.options = Object.assign({}, defaultOptions, options);
505 this.trie = {
506 children: {},
507 candidates: [],
508 depth: 0
509 };
510 createSearchTrie(this.trie, 0, candidates, this.options);
511 this.count = candidates.length;
512 }
513
514 add(...candidates) {
515 createSearchTrie(this.trie, this.count, candidates, this.options);
516 this.count += candidates.length;
517 }
518
519 search(term, options) {
520 options = Object.assign({}, this.options, options);
521 return searchCore(normalize(term, this.options).normal, this.trie, options);
522 }
523
524}
525
526export { Searcher, fuzzy, search };