UNPKG

20.4 kBtext/coffeescriptView Raw
1frequency_lists = require('./frequency_lists')
2adjacency_graphs = require('./adjacency_graphs')
3scoring = require('./scoring')
4
5build_ranked_dict = (ordered_list) ->
6 result = {}
7 i = 1 # rank starts at 1, not 0
8 for word in ordered_list
9 result[word] = i
10 i += 1
11 result
12
13RANKED_DICTIONARIES =
14 passwords: build_ranked_dict frequency_lists.passwords
15 english: build_ranked_dict frequency_lists.english
16 surnames: build_ranked_dict frequency_lists.surnames
17 male_names: build_ranked_dict frequency_lists.male_names
18 female_names: build_ranked_dict frequency_lists.female_names
19
20GRAPHS =
21 qwerty: adjacency_graphs.qwerty
22 dvorak: adjacency_graphs.dvorak
23 keypad: adjacency_graphs.keypad
24 mac_keypad: adjacency_graphs.mac_keypad
25
26SEQUENCES =
27 lower: 'abcdefghijklmnopqrstuvwxyz'
28 upper: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
29 digits: '0123456789'
30
31L33T_TABLE =
32 a: ['4', '@']
33 b: ['8']
34 c: ['(', '{', '[', '<']
35 e: ['3']
36 g: ['6', '9']
37 i: ['1', '!', '|']
38 l: ['1', '|', '7']
39 o: ['0']
40 s: ['$', '5']
41 t: ['+', '7']
42 x: ['%']
43 z: ['2']
44
45REGEXEN =
46 recent_year: /19\d\d|200\d|201\d/g
47
48DATE_MAX_YEAR = 2050
49DATE_MIN_YEAR = 1000
50DATE_SPLITS =
51 4:[ # for length-4 strings, eg 1191 or 9111, two ways to split:
52 [1, 2] # 1 1 91 (2nd split starts at index 1, 3rd at index 2)
53 [2, 3] # 91 1 1
54 ]
55 5:[
56 [1, 3] # 1 11 91
57 [2, 3] # 11 1 91
58 ]
59 6:[
60 [1, 2] # 1 1 1991
61 [2, 4] # 11 11 91
62 [4, 5] # 1991 1 1
63 ]
64 7:[
65 [1, 3] # 1 11 1991
66 [2, 3] # 11 1 1991
67 [4, 5] # 1991 1 11
68 [4, 6] # 1991 11 1
69 ]
70 8:[
71 [2, 4] # 11 11 1991
72 [4, 6] # 1991 11 11
73 ]
74
75matching =
76 empty: (obj) -> (k for k of obj).length == 0
77 extend: (lst, lst2) -> lst.push.apply lst, lst2
78 translate: (string, chr_map) -> (chr_map[chr] or chr for chr in string.split('')).join('')
79 mod: (n, m) -> ((n % m) + m) % m # mod impl that works for negative numbers
80 sorted: (matches) ->
81 # sort on i primary, j secondary
82 matches.sort (m1, m2) ->
83 (m1.i - m2.i) or (m1.j - m2.j)
84
85 # ------------------------------------------------------------------------------
86 # omnimatch -- combine everything ----------------------------------------------
87 # ------------------------------------------------------------------------------
88
89 omnimatch: (password) ->
90 matches = []
91 matchers = [
92 @dictionary_match
93 @reverse_dictionary_match
94 @l33t_match
95 @spatial_match
96 @repeat_match
97 @sequence_match
98 @regex_match
99 @date_match
100 ]
101 for matcher in matchers
102 @extend matches, matcher.call(this, password)
103 @sorted matches
104
105 #-------------------------------------------------------------------------------
106 # dictionary match (common passwords, english, last names, etc) ----------------
107 #-------------------------------------------------------------------------------
108
109 dictionary_match: (password, _ranked_dictionaries = RANKED_DICTIONARIES) ->
110 # _ranked_dictionaries variable is for unit testing purposes
111 matches = []
112 len = password.length
113 password_lower = password.toLowerCase()
114 for dictionary_name, ranked_dict of _ranked_dictionaries
115 for i in [0...len]
116 for j in [i...len]
117 if password_lower[i..j] of ranked_dict
118 word = password_lower[i..j]
119 rank = ranked_dict[word]
120 matches.push
121 pattern: 'dictionary'
122 i: i
123 j: j
124 token: password[i..j]
125 matched_word: word
126 rank: rank
127 dictionary_name: dictionary_name
128 reversed: false
129 l33t: false
130 @sorted matches
131
132 reverse_dictionary_match: (password, _ranked_dictionaries = RANKED_DICTIONARIES) ->
133 reversed_password = password.split('').reverse().join('')
134 matches = @dictionary_match reversed_password, _ranked_dictionaries
135 for match in matches
136 match.token = match.token.split('').reverse().join('') # reverse back
137 match.reversed = true
138 # map coordinates back to original string
139 [match.i, match.j] = [
140 password.length - 1 - match.j
141 password.length - 1 - match.i
142 ]
143 @sorted matches
144
145 set_user_input_dictionary: (ordered_list) ->
146 RANKED_DICTIONARIES['user_inputs'] = build_ranked_dict ordered_list.slice()
147
148 #-------------------------------------------------------------------------------
149 # dictionary match with common l33t substitutions ------------------------------
150 #-------------------------------------------------------------------------------
151
152 # makes a pruned copy of l33t_table that only includes password's possible substitutions
153 relevant_l33t_subtable: (password, table) ->
154 password_chars = {}
155 for chr in password.split('')
156 password_chars[chr] = true
157 subtable = {}
158 for letter, subs of table
159 relevant_subs = (sub for sub in subs when sub of password_chars)
160 if relevant_subs.length > 0
161 subtable[letter] = relevant_subs
162 subtable
163
164 # returns the list of possible 1337 replacement dictionaries for a given password
165 enumerate_l33t_subs: (table) ->
166 keys = (k for k of table)
167 subs = [[]]
168
169 dedup = (subs) ->
170 deduped = []
171 members = {}
172 for sub in subs
173 assoc = ([k,v] for k,v in sub)
174 assoc.sort()
175 label = (k+','+v for k,v in assoc).join('-')
176 unless label of members
177 members[label] = true
178 deduped.push sub
179 deduped
180
181 helper = (keys) ->
182 return if not keys.length
183 first_key = keys[0]
184 rest_keys = keys[1..]
185 next_subs = []
186 for l33t_chr in table[first_key]
187 for sub in subs
188 dup_l33t_index = -1
189 for i in [0...sub.length]
190 if sub[i][0] == l33t_chr
191 dup_l33t_index = i
192 break
193 if dup_l33t_index == -1
194 sub_extension = sub.concat [[l33t_chr, first_key]]
195 next_subs.push sub_extension
196 else
197 sub_alternative = sub.slice(0)
198 sub_alternative.splice(dup_l33t_index, 1)
199 sub_alternative.push [l33t_chr, first_key]
200 next_subs.push sub
201 next_subs.push sub_alternative
202 subs = dedup next_subs
203 helper(rest_keys)
204
205 helper(keys)
206 sub_dicts = [] # convert from assoc lists to dicts
207 for sub in subs
208 sub_dict = {}
209 for [l33t_chr, chr] in sub
210 sub_dict[l33t_chr] = chr
211 sub_dicts.push sub_dict
212 sub_dicts
213
214 l33t_match: (password, _ranked_dictionaries = RANKED_DICTIONARIES, _l33t_table = L33T_TABLE) ->
215 matches = []
216 for sub in @enumerate_l33t_subs @relevant_l33t_subtable(password, _l33t_table)
217 break if @empty sub # corner case: password has no relevant subs.
218 subbed_password = @translate password, sub
219 for match in @dictionary_match(subbed_password, _ranked_dictionaries)
220 token = password[match.i..match.j]
221 if token.toLowerCase() == match.matched_word
222 continue # only return the matches that contain an actual substitution
223 match_sub = {} # subset of mappings in sub that are in use for this match
224 for subbed_chr, chr of sub when token.indexOf(subbed_chr) != -1
225 match_sub[subbed_chr] = chr
226 match.l33t = true
227 match.token = token
228 match.sub = match_sub
229 match.sub_display = ("#{k} -> #{v}" for k,v of match_sub).join(', ')
230 matches.push match
231 @sorted matches.filter (match) ->
232 # filter single-character l33t matches to reduce noise.
233 # otherwise '1' matches 'i', '4' matches 'a', both very common English words
234 # with low dictionary rank.
235 match.token.length > 1
236
237 # ------------------------------------------------------------------------------
238 # spatial match (qwerty/dvorak/keypad) -----------------------------------------
239 # ------------------------------------------------------------------------------
240
241 spatial_match: (password, _graphs = GRAPHS) ->
242 matches = []
243 for graph_name, graph of _graphs
244 @extend matches, @spatial_match_helper(password, graph, graph_name)
245 @sorted matches
246
247 SHIFTED_RX: /[~!@#$%^&*()_+QWERTYUIOP{}|ASDFGHJKL:"ZXCVBNM<>?]/
248 spatial_match_helper: (password, graph, graph_name) ->
249 matches = []
250 i = 0
251 while i < password.length - 1
252 j = i + 1
253 last_direction = null
254 turns = 0
255 if graph_name in ['qwerty', 'dvorak'] and @SHIFTED_RX.exec(password.charAt(i))
256 # initial character is shifted
257 shifted_count = 1
258 else
259 shifted_count = 0
260 loop
261 prev_char = password.charAt(j-1)
262 found = false
263 found_direction = -1
264 cur_direction = -1
265 adjacents = graph[prev_char] or []
266 # consider growing pattern by one character if j hasn't gone over the edge.
267 if j < password.length
268 cur_char = password.charAt(j)
269 for adj in adjacents
270 cur_direction += 1
271 if adj and adj.indexOf(cur_char) != -1
272 found = true
273 found_direction = cur_direction
274 if adj.indexOf(cur_char) == 1
275 # index 1 in the adjacency means the key is shifted,
276 # 0 means unshifted: A vs a, % vs 5, etc.
277 # for example, 'q' is adjacent to the entry '2@'.
278 # @ is shifted w/ index 1, 2 is unshifted.
279 shifted_count += 1
280 if last_direction != found_direction
281 # adding a turn is correct even in the initial case when last_direction is null:
282 # every spatial pattern starts with a turn.
283 turns += 1
284 last_direction = found_direction
285 break
286 # if the current pattern continued, extend j and try to grow again
287 if found
288 j += 1
289 # otherwise push the pattern discovered so far, if any...
290 else
291 if j - i > 2 # don't consider length 1 or 2 chains.
292 matches.push
293 pattern: 'spatial'
294 i: i
295 j: j-1
296 token: password[i...j]
297 graph: graph_name
298 turns: turns
299 shifted_count: shifted_count
300 # ...and then start a new search for the rest of the password.
301 i = j
302 break
303 matches
304
305 #-------------------------------------------------------------------------------
306 # repeats (aaa, abcabcabc) and sequences (abcdef) ------------------------------
307 #-------------------------------------------------------------------------------
308
309 repeat_match: (password) ->
310 matches = []
311 greedy = /(.+)\1+/g
312 lazy = /(.+?)\1+/g
313 lazy_anchored = /^(.+?)\1+$/
314 lastIndex = 0
315 while lastIndex < password.length
316 greedy.lastIndex = lazy.lastIndex = lastIndex
317 greedy_match = greedy.exec password
318 lazy_match = lazy.exec password
319 break unless greedy_match?
320 if greedy_match[0].length > lazy_match[0].length
321 # greedy beats lazy for 'aabaab'
322 # greedy: [aabaab, aab]
323 # lazy: [aa, a]
324 match = greedy_match
325 # greedy's repeated string might itself be repeated, eg.
326 # aabaab in aabaabaabaab.
327 # run an anchored lazy match on greedy's repeated string
328 # to find the shortest repeated string
329 base_token = lazy_anchored.exec(match[0])[1]
330 else
331 # lazy beats greedy for 'aaaaa'
332 # greedy: [aaaa, aa]
333 # lazy: [aaaaa, a]
334 match = lazy_match
335 base_token = match[1]
336 [i, j] = [match.index, match.index + match[0].length - 1]
337 # recursively match and score the base string
338 base_analysis = scoring.most_guessable_match_sequence(
339 base_token
340 @omnimatch base_token
341 )
342 base_matches = base_analysis.match_sequence
343 base_guesses = base_analysis.guesses
344 matches.push
345 pattern: 'repeat'
346 i: i
347 j: j
348 token: match[0]
349 base_token: base_token
350 base_guesses: base_guesses
351 base_matches: base_matches
352 repeat_count: match[0].length / base_token.length
353 lastIndex = j + 1
354 matches
355
356 sequence_match: (password) ->
357 matches = []
358 for sequence_name, sequence of SEQUENCES
359 for direction in [1, -1]
360 i = 0
361 while i < password.length
362 unless password.charAt(i) in sequence
363 i += 1
364 continue
365 j = i + 1
366 sequence_position = sequence.indexOf password.charAt(i)
367 while j < password.length
368 # mod by sequence length to allow sequences to wrap around: xyzabc
369 next_sequence_position = @mod sequence_position + direction, sequence.length
370 unless sequence.indexOf(password.charAt(j)) == next_sequence_position
371 break
372 j += 1
373 sequence_position = next_sequence_position
374 j -= 1
375 if j - i + 1 > 1
376 matches.push
377 pattern: 'sequence'
378 i: i
379 j: j
380 token: password[i..j]
381 sequence_name: sequence_name
382 sequence_space: sequence.length
383 ascending: direction == 1
384 i = j + 1
385 @sorted matches
386
387 #-------------------------------------------------------------------------------
388 # regex matching ---------------------------------------------------------------
389 #-------------------------------------------------------------------------------
390
391 regex_match: (password, _regexen = REGEXEN) ->
392 matches = []
393 for name, regex of _regexen
394 regex.lastIndex = 0 # keeps regex_match stateless
395 while rx_match = regex.exec password
396 token = rx_match[0]
397 matches.push
398 pattern: 'regex'
399 token: token
400 i: rx_match.index
401 j: rx_match.index + rx_match[0].length - 1
402 regex_name: name
403 regex_match: rx_match
404 @sorted matches
405
406 #-------------------------------------------------------------------------------
407 # date matching ----------------------------------------------------------------
408 #-------------------------------------------------------------------------------
409
410 date_match: (password) ->
411 # a "date" is recognized as:
412 # any 3-tuple that starts or ends with a 2- or 4-digit year,
413 # with 2 or 0 separator chars (1.1.91 or 1191),
414 # maybe zero-padded (01-01-91 vs 1-1-91),
415 # a month between 1 and 12,
416 # a day between 1 and 31.
417 #
418 # note: this isn't true date parsing in that "feb 31st" is allowed,
419 # this doesn't check for leap years, etc.
420 #
421 # recipe:
422 # start with regex to find maybe-dates, then attempt to map the integers
423 # onto month-day-year to filter the maybe-dates into dates.
424 # finally, remove matches that are substrings of other matches to reduce noise.
425 #
426 # note: instead of using a lazy or greedy regex to find many dates over the full string,
427 # this uses a ^...$ regex against every substring of the password -- less performant but leads
428 # to every possible date match.
429 matches = []
430 maybe_date_no_separator = /^\d{4,8}$/
431 maybe_date_with_separator = ///
432 ^
433 ( \d{1,4} ) # day, month, year
434 ( [\s/\\_.-] ) # separator
435 ( \d{1,2} ) # day, month
436 \2 # same separator
437 ( \d{1,4} ) # day, month, year
438 $
439 ///
440
441 # dates without separators are between length 4 '1191' and 8 '11111991'
442 for i in [0..password.length - 4]
443 for j in [i + 3..i + 7]
444 break if j >= password.length
445 token = password[i..j]
446 continue unless maybe_date_no_separator.exec token
447 candidates = []
448 for [k,l] in DATE_SPLITS[token.length]
449 dmy = @map_ints_to_dmy [
450 parseInt token[0...k]
451 parseInt token[k...l]
452 parseInt token[l...]
453 ]
454 candidates.push dmy if dmy?
455 continue unless candidates.length > 0
456 # at this point: different possible dmy mappings for the same i,j substring.
457 # match the candidate date that likely takes the fewest guesses: a year closest to 2000.
458 # (scoring.REFERENCE_YEAR).
459 #
460 # ie, considering '111504', prefer 11-15-04 to 1-1-1504
461 # (interpreting '04' as 2004)
462 best_candidate = candidates[0]
463 metric = (candidate) -> Math.abs candidate.year - scoring.REFERENCE_YEAR
464 min_distance = metric candidates[0]
465 for candidate in candidates[1..]
466 distance = metric candidate
467 if distance < min_distance
468 [best_candidate, min_distance] = [candidate, distance]
469 matches.push
470 pattern: 'date'
471 token: token
472 i: i
473 j: j
474 separator: ''
475 year: best_candidate.year
476 month: best_candidate.month
477 day: best_candidate.day
478
479 # dates with separators are between length 6 '1/1/91' and 10 '11/11/1991'
480 for i in [0..password.length - 6]
481 for j in [i + 5..i + 9]
482 break if j >= password.length
483 token = password[i..j]
484 rx_match = maybe_date_with_separator.exec token
485 continue unless rx_match?
486 dmy = @map_ints_to_dmy [
487 parseInt rx_match[1]
488 parseInt rx_match[3]
489 parseInt rx_match[4]
490 ]
491 continue unless dmy?
492 matches.push
493 pattern: 'date'
494 token: token
495 i: i
496 j: j
497 separator: rx_match[2]
498 year: dmy.year
499 month: dmy.month
500 day: dmy.day
501
502 # matches now contains all valid date strings in a way that is tricky to capture
503 # with regexes only. while thorough, it will contain some unintuitive noise:
504 #
505 # '2015_06_04', in addition to matching 2015_06_04, will also contain
506 # 5(!) other date matches: 15_06_04, 5_06_04, ..., even 2015 (matched as 5/1/2020)
507 #
508 # to reduce noise, remove date matches that are strict substrings of others
509 @sorted matches.filter (match) ->
510 is_submatch = false
511 for other_match in matches
512 continue if match is other_match
513 if other_match.i <= match.i and other_match.j >= match.j
514 is_submatch = true
515 break
516 not is_submatch
517
518 map_ints_to_dmy: (ints) ->
519 # given a 3-tuple, discard if:
520 # middle int is over 31 (for all dmy formats, years are never allowed in the middle)
521 # middle int is zero
522 # any int is over the max allowable year
523 # any int is over two digits but under the min allowable year
524 # 2 ints are over 31, the max allowable day
525 # 2 ints are zero
526 # all ints are over 12, the max allowable month
527 return if ints[1] > 31 or ints[1] <= 0
528 over_12 = 0
529 over_31 = 0
530 under_1 = 0
531 for int in ints
532 return if 99 < int < DATE_MIN_YEAR or int > DATE_MAX_YEAR
533 over_31 += 1 if int > 31
534 over_12 += 1 if int > 12
535 under_1 += 1 if int <= 0
536 return if over_31 >= 2 or over_12 == 3 or under_1 >= 2
537
538 # first look for a four digit year: yyyy + daymonth or daymonth + yyyy
539 possible_year_splits = [
540 [ints[2], ints[0..1]] # year last
541 [ints[0], ints[1..2]] # year first
542 ]
543 for [y, rest] in possible_year_splits
544 if DATE_MIN_YEAR <= y <= DATE_MAX_YEAR
545 dm = @map_ints_to_dm rest
546 if dm?
547 return {
548 year: y
549 month: dm.month
550 day: dm.day
551 }
552 else
553 # for a candidate that includes a four-digit year,
554 # when the remaining ints don't match to a day and month,
555 # it is not a date.
556 return
557
558 # given no four-digit year, two digit years are the most flexible int to match, so
559 # try to parse a day-month out of ints[0..1] or ints[1..0]
560 for [y, rest] in possible_year_splits
561 dm = @map_ints_to_dm rest
562 if dm?
563 y = @two_to_four_digit_year y
564 return {
565 year: y
566 month: dm.month
567 day: dm.day
568 }
569
570 map_ints_to_dm: (ints) ->
571 for [d, m] in [ints, ints.slice().reverse()]
572 if 1 <= d <= 31 and 1 <= m <= 12
573 return {
574 day: d
575 month: m
576 }
577
578 two_to_four_digit_year: (year) ->
579 if year > 99
580 year
581 else if year > 50
582 # 87 -> 1987
583 year + scoring.REFERENCE_YEAR - 100
584 else
585 # 15 -> 2015
586 year + scoring.REFERENCE_YEAR
587
588module.exports = matching