| #include <zxcvbn/scoring.hpp> |
| |
| #include <zxcvbn/adjacency_graphs.hpp> |
| #include <zxcvbn/util.hpp> |
| |
| #include <numeric> |
| #include <string> |
| #include <vector> |
| |
| #include <cmath> |
| |
| #include "base/no_destructor.h" |
| |
| namespace std { |
| |
| template<class T, class U> |
| struct hash<std::pair<T, U>> { |
| std::size_t operator()(const std::pair<T, U> & v) const { |
| return std::hash<T>()(v.first) ^ std::hash<U>()(v.second); |
| } |
| }; |
| |
| } |
| |
| namespace zxcvbn { |
| |
| const auto BRUTEFORCE_CARDINALITY = static_cast<guesses_t>(10); |
| const auto MIN_GUESSES_BEFORE_GROWING_SEQUENCE = static_cast<guesses_t>(10000); |
| const auto MIN_SUBMATCH_GUESSES_SINGLE_CHAR = static_cast<guesses_t>(10); |
| const auto MIN_SUBMATCH_GUESSES_MULTI_CHAR = static_cast<guesses_t>(50); |
| |
| const std::regex& START_UPPER() { |
| static base::NoDestructor<std::regex> start_upper(R"(^[A-Z][^A-Z]+$)"); |
| return *start_upper; |
| } |
| |
| const std::regex& END_UPPER() { |
| static base::NoDestructor<std::regex> end_upper(R"(^[^A-Z]+[A-Z]$)"); |
| return *end_upper; |
| } |
| |
| const std::regex& ALL_UPPER() { |
| static base::NoDestructor<std::regex> all_upper(R"(^[^a-z]+$)"); |
| return *all_upper; |
| } |
| |
| const std::regex& ALL_LOWER() { |
| static base::NoDestructor<std::regex> all_lower(R"(^[^A-Z]+$)"); |
| return *all_lower; |
| } |
| |
| template<class Tret, class Tin> |
| Tret factorial(Tin n) { |
| // unoptimized, called only on small n |
| if (n < 2) return 1; |
| Tret f = 1; |
| for (Tin i = 2; i <= n; ++i) { |
| f *= i; |
| } |
| return f; |
| } |
| |
| template<class M, class K, class V> |
| static |
| void insert_or_assign(M & m, const K & k, V && v) { |
| auto p = m.insert(std::make_pair(k, std::forward<V>(v))); |
| if (!p.second) { |
| p.first->second = std::forward<V>(v); |
| } |
| } |
| |
| static |
| std::size_t token_len(const Match & m) __attribute__((pure)); |
| |
| static |
| std::size_t token_len(const Match & m) { |
| std::size_t result = m.j - m.i + 1; |
| // Bruteforce matches might be any substring of the original string, which are |
| // not necessarily aligned to UTF8 code points, and thus m.token might not be |
| // a valid UTF8 string. |
| if (m.get_pattern() != MatchPattern::BRUTEFORCE) |
| assert(result == util::character_len(m.token)); |
| return result; |
| } |
| |
| // ------------------------------------------------------------------------------ |
| // search --- most guessable match sequence ------------------------------------- |
| // ------------------------------------------------------------------------------ |
| // |
| // takes a sequence of overlapping matches, returns the non-overlapping sequence with |
| // minimum guesses. the following is a O(l_max * (n + m)) dynamic programming algorithm |
| // for a length-n password with m candidate matches. l_max is the maximum optimal |
| // sequence length spanning each prefix of the password. In practice it rarely exceeds 5 and the |
| // search terminates rapidly. |
| // |
| // the optimal "minimum guesses" sequence is here defined to be the sequence that |
| // minimizes the following function: |
| // |
| // g = l! * Product(m.guesses for m in sequence) + D^(l - 1) |
| // |
| // where l is the length of the sequence. |
| // |
| // the factorial term is the number of ways to order l patterns. |
| // |
| // the D^(l-1) term is another length penalty, roughly capturing the idea that an |
| // attacker will try lower-length sequences first before trying length-l sequences. |
| // |
| // for example, consider a sequence that is date-repeat-dictionary. |
| // - an attacker would need to try other date-repeat-dictionary combinations, |
| // hence the product term. |
| // - an attacker would need to try repeat-date-dictionary, dictionary-repeat-date, |
| // ..., hence the factorial term. |
| // - an attacker would also likely try length-1 (dictionary) and length-2 (dictionary-date) |
| // sequences before length-3. assuming at minimum D guesses per pattern type, |
| // D^(l-1) approximates Sum(D^i for i in [1..l-1] |
| // |
| // ------------------------------------------------------------------------------ |
| |
| ScoringResult most_guessable_match_sequence(const std::string & password, |
| std::vector<Match> & matches, |
| bool exclude_additive) { |
| auto n = password.length(); |
| |
| // partition matches into sublists according to ending index j |
| std::unordered_map<idx_t, std::vector<std::reference_wrapper<Match>>> matches_by_j; |
| for (auto & m : matches) { |
| matches_by_j[m.j].push_back(m); |
| } |
| // small detail: for deterministic output, sort each sublist by i |
| for (auto & item : matches_by_j) { |
| std::sort(item.second.begin(), item.second.end(), |
| [&] (const std::reference_wrapper<Match> & a, |
| const std::reference_wrapper<Match> & b) { |
| return a.get().i < b.get().i; |
| }); |
| } |
| |
| struct { |
| // optimal.m[k][l] holds final match in the best length-l match sequence covering the |
| // password prefix up to k, inclusive. |
| // if there is no length-l sequence that scores better (fewer guesses) than |
| // a shorter match sequence spanning the same prefix, optimal.m[k][l] is undefined. |
| std::unordered_map<idx_t, std::unordered_map<idx_t, std::reference_wrapper<Match>>> m; |
| |
| // same structure as optimal.m -- holds the product term Prod(m.guesses for m in sequence). |
| // optimal.pi allows for fast (non-looping) updates to the minimization function. |
| std::unordered_map<idx_t, std::unordered_map<idx_t, guesses_t>> pi; |
| |
| // same structure as optimal.m -- holds the overall metric. |
| std::unordered_map<idx_t, std::unordered_map<idx_t, guesses_t>> g; |
| } optimal; |
| |
| // helper: considers whether a length-l sequence ending at match m is better (fewer guesses) |
| // than previously encountered sequences, updating state if so. |
| auto update = [&] (Match & m, idx_t l) { |
| auto k = m.j; |
| auto pi = estimate_guesses(m, password); |
| if (l > 1) { |
| // we're considering a length-l sequence ending with match m: |
| // obtain the product term in the minimization function by multiplying m's guesses |
| // by the product of the length-(l-1) sequence ending just before m, at m.i - 1. |
| pi *= optimal.pi[m.i - 1][l - 1]; |
| } |
| // calculate the minimization func |
| auto g = factorial<guesses_t>(l) * pi; |
| if (!exclude_additive) { |
| g += std::pow(MIN_GUESSES_BEFORE_GROWING_SEQUENCE, l - 1); |
| } |
| // update state if new best. |
| // first see if any competing sequences covering this prefix, with l or fewer matches, |
| // fare better than this sequence. if so, skip it and return. |
| for (const auto & item : optimal.g[k]) { |
| auto & competing_l = item.first; |
| auto & competing_g = item.second; |
| if (competing_l > l) continue; |
| if (competing_g <= g) return; |
| } |
| // this sequence might be part of the final optimal sequence. |
| insert_or_assign(optimal.g[k], l, g); |
| insert_or_assign(optimal.m[k], l, std::ref(m)); |
| insert_or_assign(optimal.pi[k], l, pi); |
| }; |
| |
| // helper: make bruteforce match objects spanning i to j, inclusive. |
| // TODO: we store bruteforce matches in this vector so that we can |
| // store references in optimal.m, this is arguable hacked, so fix this |
| std::unordered_map<std::pair<idx_t, idx_t>, std::unique_ptr<Match>> bruteforces; |
| auto make_bruteforce_match = [&] (idx_t i, idx_t j) -> std::reference_wrapper<Match> { |
| auto p = bruteforces.insert(std::make_pair(std::make_pair(i, j), |
| std::make_unique<Match> |
| (i, j, |
| password.substr(i, j - i + 1), |
| BruteforceMatch{}))); |
| return std::ref(*p.first->second); |
| }; |
| |
| // helper: evaluate bruteforce matches ending at k. |
| auto bruteforce_update = [&] (idx_t k) { |
| // see if a single bruteforce match spanning the k-prefix is optimal. |
| auto m = make_bruteforce_match(0, k); |
| update(m, 1); |
| for (idx_t i = 1; i <= k; ++i) { |
| // generate k bruteforce matches, spanning from (i=1, j=k) up to (i=k, j=k). |
| // see if adding these new matches to any of the sequences in optimal[i-1] |
| // leads to new bests. |
| auto m2 = make_bruteforce_match(i, k); |
| for (const auto & item : optimal.m[i - 1]) { |
| auto & l = item.first; |
| auto & last_m = item.second; |
| // corner: an optimal sequence will never have two adjacent bruteforce matches. |
| // it is strictly better to have a single bruteforce match spanning the same region: |
| // same contribution to the guess product with a lower length. |
| // --> safe to skip those cases. |
| if (last_m.get().get_pattern() == MatchPattern::BRUTEFORCE) continue; |
| // try adding m to this length-l sequence. |
| update(m2, l + 1); |
| } |
| } |
| }; |
| |
| // helper: step backwards through optimal.m starting at the end, |
| // constructing the final optimal match sequence. |
| auto unwind = [&] (idx_t n) { |
| std::vector<std::reference_wrapper<Match>> optimal_match_sequence; |
| if (!n) return optimal_match_sequence; |
| auto k = n - 1; |
| idx_t l = optimal.g[k].begin()->first; |
| guesses_t g = optimal.g[k].begin()->second; |
| for (const auto & item : optimal.g[k]) { |
| auto & candidate_l = item.first; |
| auto & candidate_g = item.second; |
| if (candidate_g < g) { |
| l = candidate_l; |
| g = candidate_g; |
| } |
| } |
| while (true) { |
| auto it = optimal.m[k].find(l); |
| assert(it != optimal.m[k].end()); |
| auto & m = it->second; |
| optimal_match_sequence.push_back(m); |
| if (!m.get().i) break; |
| k = m.get().i - 1; |
| l -= 1; |
| } |
| std::reverse(optimal_match_sequence.begin(), optimal_match_sequence.end()); |
| return optimal_match_sequence; |
| }; |
| |
| for (idx_t k = 0; k < n; ++k) { |
| for (const auto & m : matches_by_j[k]) { |
| if (m.get().i > 0) { |
| for (const auto & item : optimal.m[m.get().i - 1]) { |
| auto & l = item.first; |
| update(m, l + 1); |
| } |
| } |
| else { |
| update(m, 1); |
| } |
| } |
| bruteforce_update(k); |
| } |
| auto optimal_match_sequence = unwind(n); |
| auto optimal_l = optimal_match_sequence.size(); |
| |
| guesses_t guesses; |
| // corner: empty password |
| if (password.length() == 0) { |
| guesses = 1; |
| } |
| else { |
| guesses = optimal.g[n - 1][optimal_l]; |
| } |
| |
| // retrieve referenced bruteforce matches |
| std::vector<std::unique_ptr<Match>> bruteforce_matches; |
| for (const auto & ref : optimal_match_sequence) { |
| auto & m = ref.get(); |
| if (m.get_pattern() != MatchPattern::BRUTEFORCE) continue; |
| auto it = bruteforces.find(std::make_pair(m.i, m.j)); |
| if (it == bruteforces.end()) continue; |
| bruteforce_matches.push_back(std::move(it->second)); |
| } |
| |
| return { |
| password, |
| guesses, |
| static_cast<guesses_log10_t>(std::log10(guesses)), |
| std::move(bruteforce_matches), |
| std::move(optimal_match_sequence), |
| }; |
| } |
| |
| // ------------------------------------------------------------------------------ |
| // guess estimation -- one function per match pattern --------------------------- |
| // ------------------------------------------------------------------------------ |
| |
| guesses_t estimate_guesses(Match & match, const std::string & password) { |
| if (match.guesses) return match.guesses; // a match's guess estimate doesn't change. cache it. |
| guesses_t min_guesses = 1; |
| if (match.token.length() < password.length()) { |
| min_guesses = (token_len(match) == 1) |
| ? MIN_SUBMATCH_GUESSES_SINGLE_CHAR |
| : MIN_SUBMATCH_GUESSES_MULTI_CHAR; |
| } |
| #define MATCH_FN(title, upper, lower) \ |
| : match.get_pattern() == MatchPattern::upper ? lower##_guesses |
| guesses_t (*estimation_function)(const Match &) = |
| (false) ? nullptr MATCH_RUN() : nullptr; |
| #undef MATCH_FN |
| assert(estimation_function != nullptr); |
| auto guesses = estimation_function(match); |
| match.guesses = std::max(guesses, min_guesses); |
| match.guesses_log10 = static_cast<guesses_log10_t>(std::log10(match.guesses)); |
| return match.guesses; |
| } |
| |
| guesses_t unknown_guesses(const Match & match) { |
| assert(match.guesses); |
| return match.guesses; |
| } |
| |
| guesses_t bruteforce_guesses(const Match & match) { |
| auto guesses = std::pow(BRUTEFORCE_CARDINALITY, token_len(match)); |
| // small detail: make bruteforce matches at minimum one guess bigger than smallest allowed |
| // submatch guesses, such that non-bruteforce submatches over the same [i..j] take precedence. |
| auto min_guesses = (token_len(match) == 1) |
| ? MIN_SUBMATCH_GUESSES_SINGLE_CHAR + 1 |
| : MIN_SUBMATCH_GUESSES_MULTI_CHAR + 1; |
| return std::max(guesses, min_guesses); |
| } |
| |
| guesses_t repeat_guesses(const Match & match) { |
| return match.get_repeat().base_guesses * match.get_repeat().repeat_count; |
| } |
| |
| guesses_t sequence_guesses(const Match & match) { |
| auto second_chr_pos = util::utf8_iter(match.token.begin(), match.token.end()); |
| auto first_chr = std::string(match.token.begin(), second_chr_pos); |
| guesses_t base_guesses; |
| // lower guesses for obvious starting points |
| if (first_chr == "a" || first_chr == "A" || first_chr == "z" || |
| first_chr == "Z" || first_chr == "0" || first_chr == "1" || |
| first_chr == "9") { |
| base_guesses = 4; |
| } |
| else { |
| if (std::regex_match(first_chr, std::regex(R"(\d)"))) { |
| base_guesses = 10; // digits |
| } |
| else { |
| // could give a higher base for uppercase, |
| // assigning 26 to both upper and lower sequences is more conservative. |
| base_guesses = 26; |
| } |
| } |
| if (!match.get_sequence().ascending) { |
| // need to try a descending sequence in addition to every ascending sequence -> |
| // 2x guesses |
| base_guesses *= 2; |
| } |
| return base_guesses * token_len(match); |
| } |
| |
| guesses_t regex_guesses(const Match & match) { |
| switch (match.get_regex().regex_tag) { |
| case RegexTag::RECENT_YEAR: |
| { |
| // conservative estimate of year space: num years from REFERENCE_YEAR. |
| // if year is close to REFERENCE_YEAR, estimate a year space of MIN_YEAR_SPACE. |
| auto pre_year_space = std::stoul(match.get_regex().regex_match.matches[0]); |
| if (pre_year_space > REFERENCE_YEAR) { |
| pre_year_space -= REFERENCE_YEAR; |
| } |
| else { |
| pre_year_space = REFERENCE_YEAR - pre_year_space; |
| } |
| guesses_t year_space = pre_year_space; |
| year_space = std::max(year_space, MIN_YEAR_SPACE); |
| return year_space; |
| } |
| case RegexTag::ALPHA_LOWER: case RegexTag::ALPHANUMERIC: { |
| auto base = [&] { |
| switch (match.get_regex().regex_tag) { |
| case RegexTag::ALPHA_LOWER: return 26; |
| case RegexTag::ALPHANUMERIC: return 62; |
| default: assert(false); return 0; |
| } |
| }(); |
| return std::pow(base, token_len(match)); |
| } |
| default: |
| return 0; |
| } |
| } |
| |
| guesses_t date_guesses(const Match & match) { |
| // base guesses: (year distance from REFERENCE_YEAR) * num_days * num_years |
| auto pre_year_space = match.get_date().year; |
| if (pre_year_space > REFERENCE_YEAR) { |
| pre_year_space -= REFERENCE_YEAR; |
| } |
| else { |
| pre_year_space = REFERENCE_YEAR - pre_year_space; |
| } |
| |
| guesses_t year_space = pre_year_space; |
| year_space = std::max(year_space, MIN_YEAR_SPACE); |
| auto guesses = year_space * 365; |
| // double for four-digit years |
| if (match.get_date().has_full_year) guesses *= 2; |
| // add factor of 4 for separator selection (one of ~4 choices) |
| if (match.get_date().separator.length()) guesses *= 4; |
| return guesses; |
| } |
| |
| guesses_t spatial_guesses(const Match & match) { |
| std::size_t s; |
| guesses_t d; |
| if (match.get_spatial().graph == GraphTag::QWERTY || |
| match.get_spatial().graph == GraphTag::DVORAK) { |
| s = KEYBOARD_STARTING_POSITIONS; |
| d = KEYBOARD_AVERAGE_DEGREE; |
| } |
| else { |
| s = KEYPAD_STARTING_POSITIONS; |
| d = KEYPAD_AVERAGE_DEGREE; |
| } |
| guesses_t guesses = 0; |
| auto L = token_len(match); |
| auto t = static_cast<decltype(L)>(match.get_spatial().turns); |
| // estimate the number of possible patterns w/ length L or less with t turns or less. |
| for (decltype(L) i = 2; i <= L; ++i) { |
| auto possible_turns = std::min(t, i - 1); |
| for (decltype(possible_turns) j = 1; j <= possible_turns; ++j) { |
| guesses += nCk(i - 1, j - 1) * s * std::pow(d, j); |
| } |
| } |
| // add extra guesses for shifted keys. (% instead of 5, A instead of a.) |
| // math is similar to extra guesses of l33t substitutions in dictionary matches. |
| if (match.get_spatial().shifted_count) { |
| auto S = match.get_spatial().shifted_count; |
| decltype(S) U = token_len(match) - match.get_spatial().shifted_count; // unshifted count |
| if (S == 0 || U == 0) { |
| guesses *= 2; |
| } |
| else { |
| auto shifted_variations = 0; |
| for (decltype(S) i = 1; i <= std::min(S, U); ++i) { |
| shifted_variations += nCk(S + U, i); |
| } |
| guesses *= shifted_variations; |
| } |
| } |
| |
| return guesses; |
| } |
| |
| guesses_t dictionary_guesses(const Match & match) { |
| auto base_guesses = match.get_dictionary().rank; // keep these as properties for display purposes |
| auto uppercase_variations_ = uppercase_variations(match); |
| auto l33t_variations_ = l33t_variations(match); |
| auto reversed_variations = match.get_dictionary().reversed ? 2 : 1; |
| return (base_guesses * uppercase_variations_ * l33t_variations_ * reversed_variations); |
| } |
| |
| guesses_t uppercase_variations(const Match & match) { |
| auto & word = match.token; |
| if (std::regex_match(word, ALL_LOWER()) || !word.size()) |
| return 1; |
| // a capitalized word is the most common capitalization scheme, |
| // so it only doubles the search space (uncapitalized + capitalized). |
| // allcaps and end-capitalized are common enough too, underestimate as 2x factor to be safe. |
| for (const auto& regex : {START_UPPER(), END_UPPER(), ALL_UPPER()}) { |
| if (std::regex_match(word, regex)) return 2; |
| } |
| // otherwise calculate the number of ways to capitalize U+L uppercase+lowercase letters |
| // with U uppercase letters or less. or, if there's more uppercase than lower (for eg. PASSwORD), |
| // the number of ways to lowercase U+L letters with L lowercase letters or less. |
| auto match_chr = [] (const std::string & str, const std::regex & regex) { |
| decltype(str.length()) toret = 0; |
| for (auto it = str.begin(); it != str.end();) { |
| auto it2 = util::utf8_iter(it, str.end()); |
| auto s = std::string(it, it2); |
| if (std::regex_match(s, regex)) { |
| toret += 1; |
| } |
| it = it2; |
| } |
| return toret; |
| }; |
| auto U = match_chr(word, std::regex(R"([A-Z])")); |
| auto L = match_chr(word, std::regex(R"([a-z])")); |
| guesses_t variations = 0; |
| for (decltype(U) i = 1; i <= std::min(U, L); ++i) { |
| variations += nCk(U + L, i); |
| } |
| return variations; |
| } |
| |
| guesses_t l33t_variations(const Match & match) { |
| auto & dmatch = match.get_dictionary(); |
| if (!dmatch.l33t) return 1; |
| guesses_t variations = 1; |
| for (const auto & item : dmatch.sub) { |
| auto & subbed = item.first; |
| auto & unsubbed = item.second; |
| // lower-case match.token before calculating: capitalization shouldn't affect l33t calc. |
| idx_t S = 0, U = 0; |
| // XXX: using ascii_lower is okay for now since our |
| // sub dictionaries are ascii only |
| auto ltoken = util::ascii_lower(match.token); |
| for (auto it = ltoken.begin(); it != ltoken.end();) { |
| auto it2 = util::utf8_iter(it, ltoken.end()); |
| auto cs = std::string(it, it2); |
| if (cs == subbed) S += 1; |
| if (cs == unsubbed) U += 1; |
| it = it2; |
| } |
| if (!S || !U) { |
| // for this sub, password is either fully subbed (444) or fully unsubbed (aaa) |
| // treat that as doubling the space (attacker needs to try fully subbed chars in addition to |
| // unsubbed.) |
| variations *= 2; |
| } |
| else { |
| // this case is similar to capitalization: |
| // with aa44a, U = 3, S = 2, attacker needs to try unsubbed + one sub + two subs |
| auto p = std::min(U, S); |
| guesses_t possibilities = 0; |
| for (decltype(p) i = 1; i <= p; ++i) { |
| possibilities += nCk(U + S, i); |
| } |
| variations *= possibilities; |
| } |
| } |
| return variations; |
| } |
| |
| } |