blob: e5c120a86a5c4142340a505f70ad9d37abececfc [file] [log] [blame]
#include <zxcvbn/scoring.hpp>
#include <zxcvbn/adjacency_graphs.hpp>
#include <zxcvbn/util.hpp>
#include <numeric>
#include <string>
#include <vector>
#include <cmath>
#include "base/no_destructor.h"
namespace std {
template<class T, class U>
struct hash<std::pair<T, U>> {
std::size_t operator()(const std::pair<T, U> & v) const {
return std::hash<T>()(v.first) ^ std::hash<U>()(v.second);
}
};
}
namespace zxcvbn {
const auto BRUTEFORCE_CARDINALITY = static_cast<guesses_t>(10);
const auto MIN_GUESSES_BEFORE_GROWING_SEQUENCE = static_cast<guesses_t>(10000);
const auto MIN_SUBMATCH_GUESSES_SINGLE_CHAR = static_cast<guesses_t>(10);
const auto MIN_SUBMATCH_GUESSES_MULTI_CHAR = static_cast<guesses_t>(50);
const std::regex& START_UPPER() {
static base::NoDestructor<std::regex> start_upper(R"(^[A-Z][^A-Z]+$)");
return *start_upper;
}
const std::regex& END_UPPER() {
static base::NoDestructor<std::regex> end_upper(R"(^[^A-Z]+[A-Z]$)");
return *end_upper;
}
const std::regex& ALL_UPPER() {
static base::NoDestructor<std::regex> all_upper(R"(^[^a-z]+$)");
return *all_upper;
}
const std::regex& ALL_LOWER() {
static base::NoDestructor<std::regex> all_lower(R"(^[^A-Z]+$)");
return *all_lower;
}
template<class Tret, class Tin>
Tret factorial(Tin n) {
// unoptimized, called only on small n
if (n < 2) return 1;
Tret f = 1;
for (Tin i = 2; i <= n; ++i) {
f *= i;
}
return f;
}
template<class M, class K, class V>
static
void insert_or_assign(M & m, const K & k, V && v) {
auto p = m.insert(std::make_pair(k, std::forward<V>(v)));
if (!p.second) {
p.first->second = std::forward<V>(v);
}
}
static
std::size_t token_len(const Match & m) __attribute__((pure));
static
std::size_t token_len(const Match & m) {
std::size_t result = m.j - m.i + 1;
// Bruteforce matches might be any substring of the original string, which are
// not necessarily aligned to UTF8 code points, and thus m.token might not be
// a valid UTF8 string.
if (m.get_pattern() != MatchPattern::BRUTEFORCE)
assert(result == util::character_len(m.token));
return result;
}
// ------------------------------------------------------------------------------
// search --- most guessable match sequence -------------------------------------
// ------------------------------------------------------------------------------
//
// takes a sequence of overlapping matches, returns the non-overlapping sequence with
// minimum guesses. the following is a O(l_max * (n + m)) dynamic programming algorithm
// for a length-n password with m candidate matches. l_max is the maximum optimal
// sequence length spanning each prefix of the password. In practice it rarely exceeds 5 and the
// search terminates rapidly.
//
// the optimal "minimum guesses" sequence is here defined to be the sequence that
// minimizes the following function:
//
// g = l! * Product(m.guesses for m in sequence) + D^(l - 1)
//
// where l is the length of the sequence.
//
// the factorial term is the number of ways to order l patterns.
//
// the D^(l-1) term is another length penalty, roughly capturing the idea that an
// attacker will try lower-length sequences first before trying length-l sequences.
//
// for example, consider a sequence that is date-repeat-dictionary.
// - an attacker would need to try other date-repeat-dictionary combinations,
// hence the product term.
// - an attacker would need to try repeat-date-dictionary, dictionary-repeat-date,
// ..., hence the factorial term.
// - an attacker would also likely try length-1 (dictionary) and length-2 (dictionary-date)
// sequences before length-3. assuming at minimum D guesses per pattern type,
// D^(l-1) approximates Sum(D^i for i in [1..l-1]
//
// ------------------------------------------------------------------------------
ScoringResult most_guessable_match_sequence(const std::string & password,
std::vector<Match> & matches,
bool exclude_additive) {
auto n = password.length();
// partition matches into sublists according to ending index j
std::unordered_map<idx_t, std::vector<std::reference_wrapper<Match>>> matches_by_j;
for (auto & m : matches) {
matches_by_j[m.j].push_back(m);
}
// small detail: for deterministic output, sort each sublist by i
for (auto & item : matches_by_j) {
std::sort(item.second.begin(), item.second.end(),
[&] (const std::reference_wrapper<Match> & a,
const std::reference_wrapper<Match> & b) {
return a.get().i < b.get().i;
});
}
struct {
// optimal.m[k][l] holds final match in the best length-l match sequence covering the
// password prefix up to k, inclusive.
// if there is no length-l sequence that scores better (fewer guesses) than
// a shorter match sequence spanning the same prefix, optimal.m[k][l] is undefined.
std::unordered_map<idx_t, std::unordered_map<idx_t, std::reference_wrapper<Match>>> m;
// same structure as optimal.m -- holds the product term Prod(m.guesses for m in sequence).
// optimal.pi allows for fast (non-looping) updates to the minimization function.
std::unordered_map<idx_t, std::unordered_map<idx_t, guesses_t>> pi;
// same structure as optimal.m -- holds the overall metric.
std::unordered_map<idx_t, std::unordered_map<idx_t, guesses_t>> g;
} optimal;
// helper: considers whether a length-l sequence ending at match m is better (fewer guesses)
// than previously encountered sequences, updating state if so.
auto update = [&] (Match & m, idx_t l) {
auto k = m.j;
auto pi = estimate_guesses(m, password);
if (l > 1) {
// we're considering a length-l sequence ending with match m:
// obtain the product term in the minimization function by multiplying m's guesses
// by the product of the length-(l-1) sequence ending just before m, at m.i - 1.
pi *= optimal.pi[m.i - 1][l - 1];
}
// calculate the minimization func
auto g = factorial<guesses_t>(l) * pi;
if (!exclude_additive) {
g += std::pow(MIN_GUESSES_BEFORE_GROWING_SEQUENCE, l - 1);
}
// update state if new best.
// first see if any competing sequences covering this prefix, with l or fewer matches,
// fare better than this sequence. if so, skip it and return.
for (const auto & item : optimal.g[k]) {
auto & competing_l = item.first;
auto & competing_g = item.second;
if (competing_l > l) continue;
if (competing_g <= g) return;
}
// this sequence might be part of the final optimal sequence.
insert_or_assign(optimal.g[k], l, g);
insert_or_assign(optimal.m[k], l, std::ref(m));
insert_or_assign(optimal.pi[k], l, pi);
};
// helper: make bruteforce match objects spanning i to j, inclusive.
// TODO: we store bruteforce matches in this vector so that we can
// store references in optimal.m, this is arguable hacked, so fix this
std::unordered_map<std::pair<idx_t, idx_t>, std::unique_ptr<Match>> bruteforces;
auto make_bruteforce_match = [&] (idx_t i, idx_t j) -> std::reference_wrapper<Match> {
auto p = bruteforces.insert(std::make_pair(std::make_pair(i, j),
std::make_unique<Match>
(i, j,
password.substr(i, j - i + 1),
BruteforceMatch{})));
return std::ref(*p.first->second);
};
// helper: evaluate bruteforce matches ending at k.
auto bruteforce_update = [&] (idx_t k) {
// see if a single bruteforce match spanning the k-prefix is optimal.
auto m = make_bruteforce_match(0, k);
update(m, 1);
for (idx_t i = 1; i <= k; ++i) {
// generate k bruteforce matches, spanning from (i=1, j=k) up to (i=k, j=k).
// see if adding these new matches to any of the sequences in optimal[i-1]
// leads to new bests.
auto m2 = make_bruteforce_match(i, k);
for (const auto & item : optimal.m[i - 1]) {
auto & l = item.first;
auto & last_m = item.second;
// corner: an optimal sequence will never have two adjacent bruteforce matches.
// it is strictly better to have a single bruteforce match spanning the same region:
// same contribution to the guess product with a lower length.
// --> safe to skip those cases.
if (last_m.get().get_pattern() == MatchPattern::BRUTEFORCE) continue;
// try adding m to this length-l sequence.
update(m2, l + 1);
}
}
};
// helper: step backwards through optimal.m starting at the end,
// constructing the final optimal match sequence.
auto unwind = [&] (idx_t n) {
std::vector<std::reference_wrapper<Match>> optimal_match_sequence;
if (!n) return optimal_match_sequence;
auto k = n - 1;
idx_t l = optimal.g[k].begin()->first;
guesses_t g = optimal.g[k].begin()->second;
for (const auto & item : optimal.g[k]) {
auto & candidate_l = item.first;
auto & candidate_g = item.second;
if (candidate_g < g) {
l = candidate_l;
g = candidate_g;
}
}
while (true) {
auto it = optimal.m[k].find(l);
assert(it != optimal.m[k].end());
auto & m = it->second;
optimal_match_sequence.push_back(m);
if (!m.get().i) break;
k = m.get().i - 1;
l -= 1;
}
std::reverse(optimal_match_sequence.begin(), optimal_match_sequence.end());
return optimal_match_sequence;
};
for (idx_t k = 0; k < n; ++k) {
for (const auto & m : matches_by_j[k]) {
if (m.get().i > 0) {
for (const auto & item : optimal.m[m.get().i - 1]) {
auto & l = item.first;
update(m, l + 1);
}
}
else {
update(m, 1);
}
}
bruteforce_update(k);
}
auto optimal_match_sequence = unwind(n);
auto optimal_l = optimal_match_sequence.size();
guesses_t guesses;
// corner: empty password
if (password.length() == 0) {
guesses = 1;
}
else {
guesses = optimal.g[n - 1][optimal_l];
}
// retrieve referenced bruteforce matches
std::vector<std::unique_ptr<Match>> bruteforce_matches;
for (const auto & ref : optimal_match_sequence) {
auto & m = ref.get();
if (m.get_pattern() != MatchPattern::BRUTEFORCE) continue;
auto it = bruteforces.find(std::make_pair(m.i, m.j));
if (it == bruteforces.end()) continue;
bruteforce_matches.push_back(std::move(it->second));
}
return {
password,
guesses,
static_cast<guesses_log10_t>(std::log10(guesses)),
std::move(bruteforce_matches),
std::move(optimal_match_sequence),
};
}
// ------------------------------------------------------------------------------
// guess estimation -- one function per match pattern ---------------------------
// ------------------------------------------------------------------------------
guesses_t estimate_guesses(Match & match, const std::string & password) {
if (match.guesses) return match.guesses; // a match's guess estimate doesn't change. cache it.
guesses_t min_guesses = 1;
if (match.token.length() < password.length()) {
min_guesses = (token_len(match) == 1)
? MIN_SUBMATCH_GUESSES_SINGLE_CHAR
: MIN_SUBMATCH_GUESSES_MULTI_CHAR;
}
#define MATCH_FN(title, upper, lower) \
: match.get_pattern() == MatchPattern::upper ? lower##_guesses
guesses_t (*estimation_function)(const Match &) =
(false) ? nullptr MATCH_RUN() : nullptr;
#undef MATCH_FN
assert(estimation_function != nullptr);
auto guesses = estimation_function(match);
match.guesses = std::max(guesses, min_guesses);
match.guesses_log10 = static_cast<guesses_log10_t>(std::log10(match.guesses));
return match.guesses;
}
guesses_t unknown_guesses(const Match & match) {
assert(match.guesses);
return match.guesses;
}
guesses_t bruteforce_guesses(const Match & match) {
auto guesses = std::pow(BRUTEFORCE_CARDINALITY, token_len(match));
// small detail: make bruteforce matches at minimum one guess bigger than smallest allowed
// submatch guesses, such that non-bruteforce submatches over the same [i..j] take precedence.
auto min_guesses = (token_len(match) == 1)
? MIN_SUBMATCH_GUESSES_SINGLE_CHAR + 1
: MIN_SUBMATCH_GUESSES_MULTI_CHAR + 1;
return std::max(guesses, min_guesses);
}
guesses_t repeat_guesses(const Match & match) {
return match.get_repeat().base_guesses * match.get_repeat().repeat_count;
}
guesses_t sequence_guesses(const Match & match) {
auto second_chr_pos = util::utf8_iter(match.token.begin(), match.token.end());
auto first_chr = std::string(match.token.begin(), second_chr_pos);
guesses_t base_guesses;
// lower guesses for obvious starting points
if (first_chr == "a" || first_chr == "A" || first_chr == "z" ||
first_chr == "Z" || first_chr == "0" || first_chr == "1" ||
first_chr == "9") {
base_guesses = 4;
}
else {
if (std::regex_match(first_chr, std::regex(R"(\d)"))) {
base_guesses = 10; // digits
}
else {
// could give a higher base for uppercase,
// assigning 26 to both upper and lower sequences is more conservative.
base_guesses = 26;
}
}
if (!match.get_sequence().ascending) {
// need to try a descending sequence in addition to every ascending sequence ->
// 2x guesses
base_guesses *= 2;
}
return base_guesses * token_len(match);
}
guesses_t regex_guesses(const Match & match) {
switch (match.get_regex().regex_tag) {
case RegexTag::RECENT_YEAR:
{
// conservative estimate of year space: num years from REFERENCE_YEAR.
// if year is close to REFERENCE_YEAR, estimate a year space of MIN_YEAR_SPACE.
auto pre_year_space = std::stoul(match.get_regex().regex_match.matches[0]);
if (pre_year_space > REFERENCE_YEAR) {
pre_year_space -= REFERENCE_YEAR;
}
else {
pre_year_space = REFERENCE_YEAR - pre_year_space;
}
guesses_t year_space = pre_year_space;
year_space = std::max(year_space, MIN_YEAR_SPACE);
return year_space;
}
case RegexTag::ALPHA_LOWER: case RegexTag::ALPHANUMERIC: {
auto base = [&] {
switch (match.get_regex().regex_tag) {
case RegexTag::ALPHA_LOWER: return 26;
case RegexTag::ALPHANUMERIC: return 62;
default: assert(false); return 0;
}
}();
return std::pow(base, token_len(match));
}
default:
return 0;
}
}
guesses_t date_guesses(const Match & match) {
// base guesses: (year distance from REFERENCE_YEAR) * num_days * num_years
auto pre_year_space = match.get_date().year;
if (pre_year_space > REFERENCE_YEAR) {
pre_year_space -= REFERENCE_YEAR;
}
else {
pre_year_space = REFERENCE_YEAR - pre_year_space;
}
guesses_t year_space = pre_year_space;
year_space = std::max(year_space, MIN_YEAR_SPACE);
auto guesses = year_space * 365;
// double for four-digit years
if (match.get_date().has_full_year) guesses *= 2;
// add factor of 4 for separator selection (one of ~4 choices)
if (match.get_date().separator.length()) guesses *= 4;
return guesses;
}
guesses_t spatial_guesses(const Match & match) {
std::size_t s;
guesses_t d;
if (match.get_spatial().graph == GraphTag::QWERTY ||
match.get_spatial().graph == GraphTag::DVORAK) {
s = KEYBOARD_STARTING_POSITIONS;
d = KEYBOARD_AVERAGE_DEGREE;
}
else {
s = KEYPAD_STARTING_POSITIONS;
d = KEYPAD_AVERAGE_DEGREE;
}
guesses_t guesses = 0;
auto L = token_len(match);
auto t = static_cast<decltype(L)>(match.get_spatial().turns);
// estimate the number of possible patterns w/ length L or less with t turns or less.
for (decltype(L) i = 2; i <= L; ++i) {
auto possible_turns = std::min(t, i - 1);
for (decltype(possible_turns) j = 1; j <= possible_turns; ++j) {
guesses += nCk(i - 1, j - 1) * s * std::pow(d, j);
}
}
// add extra guesses for shifted keys. (% instead of 5, A instead of a.)
// math is similar to extra guesses of l33t substitutions in dictionary matches.
if (match.get_spatial().shifted_count) {
auto S = match.get_spatial().shifted_count;
decltype(S) U = token_len(match) - match.get_spatial().shifted_count; // unshifted count
if (S == 0 || U == 0) {
guesses *= 2;
}
else {
auto shifted_variations = 0;
for (decltype(S) i = 1; i <= std::min(S, U); ++i) {
shifted_variations += nCk(S + U, i);
}
guesses *= shifted_variations;
}
}
return guesses;
}
guesses_t dictionary_guesses(const Match & match) {
auto base_guesses = match.get_dictionary().rank; // keep these as properties for display purposes
auto uppercase_variations_ = uppercase_variations(match);
auto l33t_variations_ = l33t_variations(match);
auto reversed_variations = match.get_dictionary().reversed ? 2 : 1;
return (base_guesses * uppercase_variations_ * l33t_variations_ * reversed_variations);
}
guesses_t uppercase_variations(const Match & match) {
auto & word = match.token;
if (std::regex_match(word, ALL_LOWER()) || !word.size())
return 1;
// a capitalized word is the most common capitalization scheme,
// so it only doubles the search space (uncapitalized + capitalized).
// allcaps and end-capitalized are common enough too, underestimate as 2x factor to be safe.
for (const auto& regex : {START_UPPER(), END_UPPER(), ALL_UPPER()}) {
if (std::regex_match(word, regex)) return 2;
}
// otherwise calculate the number of ways to capitalize U+L uppercase+lowercase letters
// with U uppercase letters or less. or, if there's more uppercase than lower (for eg. PASSwORD),
// the number of ways to lowercase U+L letters with L lowercase letters or less.
auto match_chr = [] (const std::string & str, const std::regex & regex) {
decltype(str.length()) toret = 0;
for (auto it = str.begin(); it != str.end();) {
auto it2 = util::utf8_iter(it, str.end());
auto s = std::string(it, it2);
if (std::regex_match(s, regex)) {
toret += 1;
}
it = it2;
}
return toret;
};
auto U = match_chr(word, std::regex(R"([A-Z])"));
auto L = match_chr(word, std::regex(R"([a-z])"));
guesses_t variations = 0;
for (decltype(U) i = 1; i <= std::min(U, L); ++i) {
variations += nCk(U + L, i);
}
return variations;
}
guesses_t l33t_variations(const Match & match) {
auto & dmatch = match.get_dictionary();
if (!dmatch.l33t) return 1;
guesses_t variations = 1;
for (const auto & item : dmatch.sub) {
auto & subbed = item.first;
auto & unsubbed = item.second;
// lower-case match.token before calculating: capitalization shouldn't affect l33t calc.
idx_t S = 0, U = 0;
// XXX: using ascii_lower is okay for now since our
// sub dictionaries are ascii only
auto ltoken = util::ascii_lower(match.token);
for (auto it = ltoken.begin(); it != ltoken.end();) {
auto it2 = util::utf8_iter(it, ltoken.end());
auto cs = std::string(it, it2);
if (cs == subbed) S += 1;
if (cs == unsubbed) U += 1;
it = it2;
}
if (!S || !U) {
// for this sub, password is either fully subbed (444) or fully unsubbed (aaa)
// treat that as doubling the space (attacker needs to try fully subbed chars in addition to
// unsubbed.)
variations *= 2;
}
else {
// this case is similar to capitalization:
// with aa44a, U = 3, S = 2, attacker needs to try unsubbed + one sub + two subs
auto p = std::min(U, S);
guesses_t possibilities = 0;
for (decltype(p) i = 1; i <= p; ++i) {
possibilities += nCk(U + S, i);
}
variations *= possibilities;
}
}
return variations;
}
}