| #include <zxcvbn/matching.hpp> |
| |
| #include <zxcvbn/adjacency_graphs.hpp> |
| #include <zxcvbn/common.hpp> |
| #include <zxcvbn/optional.hpp> |
| #include <zxcvbn/frequency_lists.hpp> |
| #include <zxcvbn/scoring.hpp> |
| #include <zxcvbn/util.hpp> |
| |
| #include <algorithm> |
| #include <array> |
| #include <functional> |
| #include <initializer_list> |
| #include <regex> |
| #include <sstream> |
| #include <string> |
| #include <vector> |
| #include <unordered_map> |
| #include <utility> |
| #include <unordered_set> |
| |
| #include "base/no_destructor.h" |
| #include "base/strings/string_util.h" |
| #include "third_party/icu/source/common/unicode/unistr.h" |
| #include "third_party/icu/source/i18n/unicode/regex.h" |
| |
| namespace zxcvbn { |
| |
| // TODO: make this a constexpr |
| const std::vector<std::pair<std::string, std::vector<std::string>>>& |
| L33T_TABLE() { |
| static base::NoDestructor< |
| std::vector<std::pair<std::string, std::vector<std::string>>>> |
| leet_table({ |
| {"a", {"4", "@"}}, |
| {"b", {"8"}}, |
| {"c", {"(", "{", "[", "<"}}, |
| {"e", {"3"}}, |
| {"g", {"6", "9"}}, |
| {"i", {"1", "!", "|"}}, |
| {"l", {"1", "|", "7"}}, |
| {"o", {"0"}}, |
| {"s", {"$", "5"}}, |
| {"t", {"+", "7"}}, |
| {"x", {"%"}}, |
| {"z", {"2"}}, |
| }); |
| |
| return *leet_table; |
| } |
| |
| // TODO: make this constexpr |
| const std::vector<std::pair<RegexTag, std::regex>>& REGEXEN() { |
| static base::NoDestructor<std::vector<std::pair<RegexTag, std::regex>>> |
| regexen({ |
| {RegexTag::RECENT_YEAR, std::regex(R"(19\d\d|200\d|201\d)")}, |
| }); |
| return *regexen; |
| } |
| |
| const auto DATE_MAX_YEAR = 2050; |
| const auto DATE_MIN_YEAR = 1000; |
| constexpr std::initializer_list<std::pair<int, int>> DATE_SPLITS[] = { |
| { // for length-4 strings, eg 1191 or 9111, two ways to split: |
| {1, 2}, // 1 1 91 (2nd split starts at index 1, 3rd at index 2) |
| {2, 3}, // 91 1 1 |
| }, |
| { |
| {1, 3}, // 1 11 91 |
| {2, 3}, // 11 1 91 |
| }, |
| { |
| {1, 2}, // 1 1 1991 |
| {2, 4}, // 11 11 91 |
| {4, 5}, // 1991 1 1 |
| }, |
| { |
| {1, 3}, // 1 11 1991 |
| {2, 3}, // 11 1 1991 |
| {4, 5}, // 1991 1 11 |
| {4, 6}, // 1991 11 1 |
| }, |
| { |
| {2, 4}, // 11 11 1991 |
| {4, 6}, // 1991 11 11 |
| }, |
| }; |
| |
| static |
| std::string translate(const std::string & string, |
| const std::unordered_map<std::string, std::string> & chr_map) { |
| std::string toret; |
| auto bit = std::back_inserter(toret); |
| toret.reserve(string.size()); |
| for (auto it = string.begin(); it != string.end();) { |
| auto nextit = util::utf8_iter(it, string.end()); |
| auto ch = std::string(it, nextit); |
| auto mit = chr_map.find(ch); |
| if (mit != chr_map.end()) { |
| ch = mit->second; |
| } |
| std::copy(ch.begin(), ch.end(), bit); |
| it = nextit; |
| } |
| return toret; |
| } |
| |
| static |
| std::vector<Match> & sorted(std::vector<Match> & matches) { |
| std::sort(matches.begin(), matches.end(), |
| [&] (const Match & m1, const Match & m2) -> bool { |
| return std::make_pair(m1.i, m1.j) < std::make_pair(m2.i, m2.j); |
| }); |
| return matches; |
| } |
| |
| static |
| std::string dict_normalize(const std::string & str) { |
| // NB: we only have ascii strings in the dictionaries |
| // TODO: when we have more complex strings in the dictionaries, |
| // do a more complex normalization |
| return util::ascii_lower(str); |
| } |
| |
| std::vector<Match> omnimatch(const std::string & password, |
| const std::vector<std::string> & ordered_list) { |
| auto ranked_dictionaries = default_ranked_dicts(); |
| |
| auto ranked_dict = build_ranked_dict(ordered_list); |
| ranked_dictionaries.insert( |
| std::make_pair(DictionaryTag::USER_INPUTS, &ranked_dict)); |
| |
| std::vector<Match> matches; |
| std::function<std::vector<Match>(const std::string&)> matchers[] = { |
| std::bind(dictionary_match, std::placeholders::_1, |
| std::cref(ranked_dictionaries)), |
| std::bind(reverse_dictionary_match, std::placeholders::_1, |
| std::cref(ranked_dictionaries)), |
| std::bind(l33t_match, std::placeholders::_1, |
| std::cref(ranked_dictionaries), std::cref(L33T_TABLE())), |
| std::bind(spatial_match, std::placeholders::_1, std::cref(graphs())), |
| repeat_match, |
| sequence_match, |
| std::bind(regex_match, std::placeholders::_1, std::cref(REGEXEN())), |
| date_match, |
| }; |
| for (const auto & matcher : matchers) { |
| auto ret = matcher(password); |
| std::move(ret.begin(), ret.end(), std::back_inserter(matches)); |
| } |
| return sorted(matches); |
| } |
| |
| //------------------------------------------------------------------------------- |
| // dictionary match (common passwords, english, last names, etc) ---------------- |
| //------------------------------------------------------------------------------- |
| |
| std::vector<Match> dictionary_match(const std::string & password, |
| const RankedDicts & ranked_dictionaries) { |
| std::vector<Match> matches; |
| auto len = password.length(); |
| auto password_lower = dict_normalize(password); |
| for (const auto & item : ranked_dictionaries) { |
| auto dictionary_tag = item.first; |
| auto& ranked_dict = *item.second; |
| for (decltype(len) i = 0, idx = 0; idx < len; util::utf8_decode(password, idx), ++i) { |
| for (decltype(len) j = i, jdx = idx; jdx < len; ++j) { |
| // j is inclusive, but jdx is not so eagerly iterate jdx |
| util::utf8_decode(password, jdx); |
| |
| auto word = password_lower.substr(idx, jdx - idx); |
| auto it = ranked_dict.find(word); |
| if (it != ranked_dict.end()) { |
| auto rank = it->second; |
| matches.push_back(Match(i, j, password.substr(idx, jdx - idx), |
| DictionaryMatch{ |
| dictionary_tag, |
| word, rank, |
| false, |
| false, {}, ""})); |
| matches.back().idx = idx; |
| matches.back().jdx = jdx; |
| } |
| } |
| } |
| } |
| return sorted(matches); |
| } |
| |
| std::vector<Match> reverse_dictionary_match(const std::string & password, |
| const RankedDicts & ranked_dictionaries) { |
| auto clen = util::character_len(password); |
| auto reversed_password = util::reverse_string(password); |
| auto matches = dictionary_match(reversed_password, ranked_dictionaries); |
| for (auto & match : matches) { |
| match.token = util::reverse_string(match.token); // reverse back |
| match.get_dictionary().reversed = true; |
| // map coordinates back to original string |
| std::tie(match.i, match.j) = std::make_tuple( |
| clen - 1 - match.j, |
| clen - 1 - match.i |
| ); |
| std::tie(match.idx, match.jdx) = std::make_tuple( |
| password.length() - match.jdx, |
| password.length() - match.idx |
| ); |
| } |
| return sorted(matches); |
| } |
| |
| //------------------------------------------------------------------------------- |
| // dictionary match with common l33t substitutions ------------------------------ |
| //------------------------------------------------------------------------------- |
| |
| // makes a pruned copy of l33t_table that only includes password's possible substitutions |
| std::unordered_map<std::string, std::vector<std::string>> relevant_l33t_subtable(const std::string & password, const std::vector<std::pair<std::string, std::vector<std::string>>> & table) { |
| std::unordered_map<std::string, std::vector<std::string>> subtable; |
| for (const auto & item : table) { |
| auto & letter = item.first; |
| auto & subs = item.second; |
| std::vector<std::string> relevant_subs; |
| for (const auto & sub : subs) { |
| if (password.find(sub) != password.npos) { |
| relevant_subs.push_back(sub); |
| } |
| } |
| |
| if (relevant_subs.size()) { |
| subtable.insert(std::make_pair(letter, relevant_subs)); |
| } |
| } |
| return subtable; |
| } |
| |
| // returns the list of possible 1337 replacement dictionaries for a given password |
| std::vector<std::unordered_map<std::string, std::string>> enumerate_l33t_subs(const std::unordered_map<std::string, std::vector<std::string>> & table) { |
| using SubsType = std::vector<std::vector<std::pair<std::string, std::string>>>; |
| SubsType subs = {{}}; |
| |
| auto dedup = [] (const SubsType & subs) { |
| SubsType deduped; |
| std::unordered_set<std::string> members; |
| for (const auto & sub : subs) { |
| auto assoc = sub; |
| std::sort(assoc.begin(), assoc.end()); |
| std::ostringstream label_stream; |
| for (const auto & item : assoc) { |
| label_stream << item.first << "," << item.second << "-"; |
| } |
| auto label = label_stream.str(); |
| if (members.find(label) == members.end()) { |
| members.insert(std::move(label)); |
| deduped.push_back(sub); |
| } |
| } |
| return deduped; |
| }; |
| |
| for (const auto & item : table) { |
| auto & first_key = item.first; |
| SubsType next_subs; |
| for (const auto & l33t_chr : item.second) { |
| for (const auto & sub : subs) { |
| auto sub_alternative = sub; |
| auto it = std::find_if( |
| sub_alternative.begin(), sub_alternative.end(), |
| [&] (const std::pair<std::string, std::string> & sub_elt) { |
| return sub_elt.first == l33t_chr; |
| }); |
| if (it == sub_alternative.end()) { |
| sub_alternative.push_back(std::make_pair(l33t_chr, first_key)); |
| next_subs.push_back(std::move(sub_alternative)); |
| } |
| else { |
| sub_alternative.erase(it); |
| sub_alternative.push_back(std::make_pair(l33t_chr, first_key)); |
| next_subs.push_back(sub); |
| next_subs.push_back(std::move(sub_alternative)); |
| } |
| } |
| } |
| subs = dedup(next_subs); |
| } |
| |
| // convert from assoc lists to dicts |
| std::vector<std::unordered_map<std::string, std::string>> sub_dicts; |
| for (const auto & sub : subs) { |
| sub_dicts.push_back(std::unordered_map<std::string, std::string> |
| (sub.begin(), sub.end())); |
| } |
| |
| return sub_dicts; |
| } |
| |
| std::vector<Match> l33t_match(const std::string & password, |
| const RankedDicts & ranked_dictionaries, |
| const std::vector<std::pair<std::string, std::vector<std::string>>> & l33t_table) { |
| std::vector<Match> matches; |
| for (const auto & sub : enumerate_l33t_subs(relevant_l33t_subtable(password, l33t_table))) { |
| if (!sub.size()) break; |
| auto subbed_password = translate(password, sub); |
| for (auto & match : dictionary_match(subbed_password, ranked_dictionaries)) { |
| auto & dmatch = match.get_dictionary(); |
| auto token = password.substr(match.idx, match.jdx - match.idx); |
| if (dict_normalize(token) == dmatch.matched_word) { |
| // only return the matches that contain an actual substitution |
| continue; |
| } |
| // subset of mappings in sub that are in use for this match |
| std::unordered_map<std::string, std::string> match_sub; |
| for (const auto & item : sub) { |
| auto & subbed_chr = item.first; |
| if (token.find(subbed_chr) == token.npos) continue; |
| match_sub.insert(item); |
| } |
| dmatch.l33t = true; |
| match.token = token; |
| dmatch.sub = match_sub; |
| std::ostringstream os; |
| std::string sep = ""; |
| for (const auto & item : match_sub) { |
| os << sep << item.first << " -> " << item.second; |
| if (!sep.size()) { |
| sep = ", "; |
| } |
| } |
| dmatch.sub_display = os.str(); |
| matches.push_back(std::move(match)); |
| } |
| } |
| |
| matches.erase(std::remove_if(matches.begin(), matches.end(), [] (const Match & a) { |
| // filter single-character l33t matches to reduce noise. |
| // otherwise '1' matches 'i', '4' matches 'a', both very common English words |
| // with low dictionary rank. |
| return util::character_len(a.token) <= 1; |
| }), |
| matches.end()); |
| |
| return sorted(matches); |
| } |
| |
| // ------------------------------------------------------------------------------ |
| // spatial match (qwerty/dvorak/keypad) ----------------------------------------- |
| // ------------------------------------------------------------------------------ |
| |
| static |
| std::vector<Match> spatial_match_helper(const std::string & password, |
| const Graph & graph, |
| GraphTag tag); |
| |
| std::vector<Match> spatial_match(const std::string & password, |
| const Graphs & graphs) { |
| std::vector<Match> matches; |
| for (const auto & item : graphs) { |
| auto ret = spatial_match_helper(password, item.second, item.first); |
| std::move(ret.begin(), ret.end(), std::back_inserter(matches)); |
| } |
| return matches; |
| } |
| |
| static |
| std::vector<Match> spatial_match_helper(const std::string & password, |
| const Graph & graph, |
| GraphTag graph_tag) { |
| const auto SHIFTED_RX = |
| std::regex("[~!@#$%^&*()_+QWERTYUIOP{}|ASDFGHJKL:\"ZXCVBNM<>?]"); |
| |
| std::vector<Match> matches; |
| if (!password.length()) return matches; |
| idx_t idx = 0; |
| idx_t i = 0; |
| auto clen = util::character_len(password); |
| while (i < clen - 1) { |
| auto jdx = idx; |
| util::utf8_decode(password, jdx); |
| auto j = i + 1; |
| auto last_direction = -1; |
| unsigned turns = 0; |
| unsigned shifted_count; |
| if ((graph_tag == GraphTag::QWERTY || |
| graph_tag == GraphTag::DVORAK) && |
| std::regex_search(password.substr(idx, jdx - idx), SHIFTED_RX)) { |
| shifted_count = 1; |
| } |
| else { |
| shifted_count = 0; |
| } |
| auto prev_jdx = idx; |
| while (true) { |
| auto prev_char = password.substr(prev_jdx, jdx - prev_jdx); |
| auto found = false; |
| auto found_direction = -1; |
| auto cur_direction = -1; |
| const auto & adjacents = [&] { |
| auto it = graph.find(prev_char); |
| if (it != graph.end()) { |
| return it->second; |
| } |
| return Graph::mapped_type(); |
| }(); |
| // consider growing pattern by one character if j hasn't gone over the edge. |
| if (j < clen) { |
| auto next_jdx = jdx; |
| util::utf8_decode(password, next_jdx); |
| auto cur_char = password.substr(jdx, next_jdx - jdx); |
| for (auto & adj : adjacents) { |
| cur_direction += 1; |
| if (adj.find(cur_char) != adj.npos) { |
| found = true; |
| found_direction = cur_direction; |
| if (adj.find(cur_char) == 1) { |
| // index 1 in the adjacency means the key is shifted, |
| // 0 means unshifted: A vs a, % vs 5, etc. |
| // for example, 'q' is adjacent to the entry '2@'. |
| // @ is shifted w/ index 1, 2 is unshifted. |
| shifted_count += 1; |
| } |
| if (last_direction != found_direction) { |
| // adding a turn is correct even in the initial case when last_direction is null: |
| // every spatial pattern starts with a turn. |
| turns += 1; |
| last_direction = found_direction; |
| } |
| break; |
| } |
| } |
| } |
| // if the current pattern continued, extend j and try to grow again |
| if (found) { |
| j += 1; |
| prev_jdx = jdx; |
| util::utf8_decode(password, jdx); |
| } |
| // otherwise push the pattern discovered so far, if any... |
| else { |
| if (j - i > 2) { // don't consider length 1 or 2 chains. |
| matches.push_back(Match(i, j - 1, password.substr(idx, jdx - idx), |
| SpatialMatch{ |
| graph_tag, turns, shifted_count, |
| })); |
| matches.back().idx = idx; |
| matches.back().jdx = jdx; |
| } |
| // ...and then start a new search for the rest of the password. |
| i = j; |
| idx = jdx; |
| break; |
| } |
| } |
| } |
| return matches; |
| } |
| |
| //------------------------------------------------------------------------------- |
| // repeats (aaa, abcabcabc) and sequences (abcdef) ------------------------------ |
| //------------------------------------------------------------------------------- |
| |
| std::vector<Match> repeat_match(const std::string& password) { |
| std::vector<Match> matches; |
| |
| auto unicode_password = icu::UnicodeString::fromUTF8(password); |
| |
| UErrorCode status = U_ZERO_ERROR; |
| std::unique_ptr<icu::RegexPattern> greedy_pattern(icu::RegexPattern::compile( |
| icu::UnicodeString::fromUTF8(R"((.+)\1+)"), 0, status)); |
| std::unique_ptr<icu::RegexMatcher> greedy_matcher( |
| greedy_pattern->matcher(unicode_password, status)); |
| |
| std::unique_ptr<icu::RegexPattern> lazy_pattern(icu::RegexPattern::compile( |
| icu::UnicodeString::fromUTF8(R"((.+?)\1+)"), 0, status)); |
| std::unique_ptr<icu::RegexMatcher> lazy_matcher( |
| lazy_pattern->matcher(unicode_password, status)); |
| |
| std::unique_ptr<icu::RegexPattern> lazy_anchored_pattern( |
| icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(R"(^(.+?)\1+$)"), |
| 0, status)); |
| |
| int lastUnicodeIndex = 0; |
| size_t lastIndex = 0; |
| while (lastIndex < password.length()) { |
| if (!greedy_matcher->find(lastUnicodeIndex, status) || |
| !lazy_matcher->find(lastUnicodeIndex, status)) { |
| break; |
| } |
| |
| icu::RegexMatcher* matcher = nullptr; |
| icu::UnicodeString base_token; |
| if (greedy_matcher->group(status).length() > |
| lazy_matcher->group(status).length()) { |
| // greedy beats lazy for 'aabaab' |
| // greedy: [aabaab, aab] |
| // lazy: [aa, a] |
| matcher = greedy_matcher.get(); |
| // greedy's repeated string might itself be repeated, eg. |
| // aabaab in aabaabaabaab. |
| // run an anchored lazy match on greedy's repeated string |
| // to find the shortest repeated string |
| auto greedy_found = matcher->group(status); |
| |
| std::unique_ptr<icu::RegexMatcher> lazy_anchored_matcher( |
| lazy_anchored_pattern->matcher(greedy_found, status)); |
| auto ret = lazy_anchored_matcher->find(status); |
| assert(ret); |
| (void) ret; |
| base_token = lazy_anchored_matcher->group(1, status); |
| } else { |
| // lazy beats greedy for 'aaaaa' |
| // greedy: [aaaa, aa] |
| // lazy: [aaaaa, a] |
| matcher = lazy_matcher.get(); |
| base_token = matcher->group(1, status); |
| } |
| |
| std::string matched_string; |
| matcher->group(status).toUTF8String(matched_string); |
| |
| auto idx = password.find(matched_string, lastIndex); |
| auto jdx = idx + matched_string.size(); |
| |
| auto i = util::character_len(password, 0, idx); |
| auto j = i + util::character_len(password, idx, jdx) - 1; |
| // recursively match and score the base string |
| std::string base_string; |
| base_token.toUTF8String(base_string); |
| auto sub_matches = omnimatch(base_string); |
| auto base_analysis = |
| most_guessable_match_sequence(base_string, sub_matches, false); |
| std::vector<Match> base_matches; |
| std::move(base_analysis.sequence.begin(), base_analysis.sequence.end(), |
| std::back_inserter(base_matches)); |
| auto& base_guesses = base_analysis.guesses; |
| matches.push_back(Match(i, j, matched_string, |
| RepeatMatch{ |
| base_string, |
| base_guesses, |
| std::move(base_matches), |
| matched_string.size() / base_string.size(), |
| })); |
| matches.back().idx = idx; |
| matches.back().jdx = jdx; |
| lastUnicodeIndex = matcher->end(status); |
| lastIndex = jdx; |
| } |
| return matches; |
| } |
| |
| const auto MAX_DELTA = 5; |
| std::vector<Match> sequence_match(const std::string & password) { |
| // Identifies sequences by looking for repeated differences in unicode codepoint. |
| // this allows skipping, such as 9753, and also matches some extended unicode sequences |
| // such as Greek and Cyrillic alphabets. |
| // |
| // for example, consider the input 'abcdb975zy' |
| // |
| // password: a b c d b 9 7 5 z y |
| // index: 0 1 2 3 4 5 6 7 8 9 |
| // delta: 1 1 1 -2 -41 -2 -2 69 1 |
| // |
| // expected result: |
| // [(i, j, delta), ...] = [(0, 3, 1), (5, 7, -2), (8, 9, 1)] |
| |
| if (util::character_len(password) == 1) return {}; |
| |
| std::vector<Match> result; |
| |
| using delta_t = std::int32_t; |
| |
| auto update = [&] (idx_t i, idx_t j, idx_t idx, idx_t jdx, delta_t delta) { |
| if (j - i > 1 || std::abs(delta) == 1) { |
| if (0 < std::abs(delta) && std::abs(delta) <= MAX_DELTA) { |
| auto token = password.substr(idx, jdx - idx); |
| SequenceTag sequence_name; |
| unsigned sequence_space; |
| if (std::regex_search(token, std::regex(R"(^[a-z]+$)"))) { |
| sequence_name = SequenceTag::kLower; |
| sequence_space = 26; |
| } |
| else if (std::regex_search(token, std::regex(R"(^[A-Z]+$)"))) { |
| sequence_name = SequenceTag::kUpper; |
| sequence_space = 26; |
| } |
| else if (std::regex_search(token, std::regex(R"(^\d+$)"))) { |
| sequence_name = SequenceTag::kDigits; |
| sequence_space = 10; |
| } |
| else { |
| sequence_name = SequenceTag::kUnicode; |
| sequence_space = 26; |
| } |
| result.push_back(Match(i, j, token, |
| SequenceMatch{sequence_name, sequence_space, |
| delta > 0})); |
| result.back().idx = idx; |
| result.back().jdx = jdx; |
| } |
| } |
| }; |
| |
| if (!password.size()) return result; |
| |
| decltype(password.length()) i = 0; |
| decltype(password.length()) idx = 0; |
| optional::optional<delta_t> maybe_last_delta; |
| decltype(password.length()) kdx = 0; |
| auto last_kdx = kdx; |
| auto last_cp = util::utf8_decode(password, kdx); |
| for (idx_t k = 1; kdx < password.length(); ++k) { |
| auto next_kdx = kdx; |
| auto cp = util::utf8_decode(password, next_kdx); |
| assert(kdx != next_kdx); |
| delta_t delta = cp - last_cp; |
| if (!maybe_last_delta) { |
| maybe_last_delta = delta; |
| } |
| if (delta != *maybe_last_delta) { |
| auto jdx = kdx; |
| auto j = k - 1; |
| update(i, j, idx, jdx, *maybe_last_delta); |
| i = j; |
| idx = last_kdx; |
| maybe_last_delta = delta; |
| } |
| last_kdx = kdx; |
| kdx = next_kdx; |
| last_cp = cp; |
| } |
| if (maybe_last_delta) { |
| update(i, util::character_len(password) - 1, |
| idx, password.size(), *maybe_last_delta); |
| } |
| return result; |
| } |
| |
| |
| //------------------------------------------------------------------------------- |
| // regex matching --------------------------------------------------------------- |
| //------------------------------------------------------------------------------- |
| |
| std::vector<Match> regex_match(const std::string & password, |
| const std::vector<std::pair<RegexTag, std::regex>> & regexen) { |
| std::vector<Match> matches; |
| for (const auto & item : regexen) { |
| auto tag = item.first; |
| auto & regex = item.second; |
| std::smatch rx_match; |
| std::size_t lastIndex = 0; |
| while (std::regex_match(lastIndex + password.begin(), password.end(), |
| rx_match, regex)) { |
| auto token = rx_match.str(0); |
| auto idx = lastIndex + rx_match.position(); |
| auto jdx = lastIndex + rx_match.position() + rx_match[0].length(); |
| assert(token == password.substr(idx, jdx - idx)); |
| auto i = util::character_len(password, 0, idx); |
| auto j = i + util::character_len(password, idx, jdx) - 1; |
| matches.push_back(Match(i, j, |
| std::move(token), |
| RegexMatch{tag, PortableRegexMatch(rx_match)})); |
| matches.back().idx = idx; |
| matches.back().jdx = jdx; |
| lastIndex += rx_match[0].length(); |
| } |
| } |
| return sorted(matches); |
| } |
| |
| //------------------------------------------------------------------------------- |
| // date matching ---------------------------------------------------------------- |
| //------------------------------------------------------------------------------- |
| |
| using date_t = unsigned; |
| |
| struct DMY { |
| date_t year, month, day; |
| }; |
| |
| static |
| optional::optional<DMY> map_ints_to_dmy(const std::array<date_t, 3> & vals); |
| |
| static |
| date_t stou(const std::string & a) { |
| return static_cast<date_t>(std::stoul(a)); |
| } |
| |
| std::vector<Match> date_match(const std::string & password) { |
| // a "date" is recognized as: |
| // any 3-tuple that starts or ends with a 2- or 4-digit year, |
| // with 2 or 0 separator chars (1.1.91 or 1191), |
| // maybe zero-padded (01-01-91 vs 1-1-91), |
| // a month between 1 and 12, |
| // a day between 1 and 31. |
| // |
| // note: this isn't true date parsing in that "feb 31st" is allowed, |
| // this doesn't check for leap years, etc. |
| // |
| // recipe: |
| // start with regex to find maybe-dates, then attempt to map the integers |
| // onto month-day-year to filter the maybe-dates into dates. |
| // finally, remove matches that are substrings of other matches to reduce noise. |
| // |
| // note: instead of using a lazy or greedy regex to find many dates over the full string, |
| // this uses a ^...$ regex against every substring of the password -- less performant but leads |
| // to every possible date match. |
| std::vector<Match> matches; |
| std::regex maybe_date_no_separator(R"(^\d{4,8}$)"); |
| std::regex maybe_date_with_separator(R"(^(\d{1,4})([\s/\\_.-])(\d{1,2})\2(\d{1,4})$)"); |
| |
| // dates without separators are between length 4 '1191' and 8 '11111991' |
| std::vector<std::string::size_type> offsets; |
| offsets.reserve(9); |
| idx_t idx_dot = 0; |
| for (auto i = 0; i < 9; ++i) { |
| offsets.push_back(idx_dot); |
| if (idx_dot >= password.length()) { |
| break; |
| } |
| util::utf8_decode(password, idx_dot); |
| } |
| assert(offsets.size()); |
| for (decltype(password.length()) i = 0; offsets.size() - 1 >= 4; ++i) { |
| auto idx = offsets[0]; |
| for (decltype(i) offset = 3; offset <= 7 && offset < offsets.size() - 1; ++offset) { |
| auto j = i + offset; |
| auto jdx = offsets[offset + 1]; |
| |
| auto token = password.substr(idx, jdx - idx); |
| auto token_chr_len = j - i + 1; |
| assert(util::character_len(token) == token_chr_len); |
| if (!std::regex_search(token, maybe_date_no_separator)) continue; |
| std::vector<DMY> candidates; |
| for (const auto & item : DATE_SPLITS[token_chr_len - 4]) { |
| auto k = item.first; |
| auto l = item.second; |
| auto kdx = offsets[k] - idx; |
| auto ldx = offsets[l] - idx; |
| |
| auto dmy = map_ints_to_dmy(std::array<date_t, 3>{{ |
| stou(token.substr(0, kdx)), |
| stou(token.substr(kdx, ldx - kdx)), |
| stou(token.substr(ldx))}}); |
| if (dmy) candidates.push_back(*dmy); |
| } |
| if (!candidates.size()) continue; |
| // at this point: different possible dmy mappings for the same i,j substring. |
| // match the candidate date that likely takes the fewest guesses: a year closest to 2000. |
| // (scoring.REFERENCE_YEAR). |
| // |
| // ie, considering '111504', prefer 11-15-04 to 1-1-1504 |
| // (interpreting '04' as 2004) |
| auto metric = [] (const DMY & candidate) { |
| if (candidate.year >= REFERENCE_YEAR) { |
| return candidate.year - REFERENCE_YEAR; |
| } |
| else { |
| return REFERENCE_YEAR - candidate.year; |
| } |
| }; |
| auto best_candidate = *std::min_element(candidates.begin(), candidates.end(), |
| [=] (const DMY & a, const DMY & b) { |
| return metric(a) < metric(b); |
| }); |
| matches.push_back(Match(i, j, token, |
| DateMatch{"", |
| best_candidate.year, |
| best_candidate.month, |
| best_candidate.day, |
| false, |
| })); |
| matches.back().idx = idx; |
| matches.back().jdx = jdx; |
| } |
| |
| offsets.erase(offsets.begin()); |
| if (offsets.back() < password.length()) { |
| auto idx2 = offsets.back(); |
| util::utf8_decode(password, idx2); |
| offsets.push_back(idx2); |
| } |
| } |
| |
| // dates with separators are between length 6 '1/1/91' and 10 '11/11/1991' |
| offsets.clear(); |
| offsets.reserve(11); |
| idx_dot = 0; |
| for (auto i = 0; i < 11; ++i) { |
| offsets.push_back(idx_dot); |
| if (idx_dot >= password.length()) { |
| break; |
| } |
| util::utf8_decode(password, idx_dot); |
| } |
| assert(offsets.size()); |
| for (decltype(password.length()) i = 0; offsets.size() - 1 >= 6; ++i) { |
| auto idx = offsets[0]; |
| for (decltype(password.length()) offset = 5; offset <= 9 && offset < offsets.size() - 1; ++offset) { |
| auto j = offset + i; |
| auto jdx = offsets[offset + 1]; |
| auto token = password.substr(idx, jdx - idx); |
| std::smatch rx_match; |
| if (!std::regex_match(token, rx_match, maybe_date_with_separator)) { |
| continue; |
| } |
| auto dmy = map_ints_to_dmy(std::array<date_t, 3>{{ |
| stou(rx_match[1]), |
| stou(rx_match[3]), |
| stou(rx_match[4])}}); |
| if (!dmy) continue; |
| matches.push_back(Match(i, j, token, |
| DateMatch{rx_match[2], |
| dmy->year, |
| dmy->month, |
| dmy->day, |
| false, |
| })); |
| matches.back().idx = idx; |
| matches.back().jdx = jdx; |
| } |
| |
| offsets.erase(offsets.begin()); |
| if (offsets.back() < password.length()) { |
| auto idx2 = offsets.back(); |
| util::utf8_decode(password, idx2); |
| offsets.push_back(idx2); |
| } |
| } |
| |
| // matches now contains all valid date strings in a way that is tricky to capture |
| // with regexes only. while thorough, it will contain some unintuitive noise: |
| // |
| // '2015_06_04', in addition to matching 2015_06_04, will also contain |
| // 5(!) other date matches: 15_06_04, 5_06_04, ..., even 2015 (matched as 5/1/2020) |
| // |
| // to reduce noise, remove date matches that are strict substrings of others |
| matches.erase(std::remove_if(matches.begin(), matches.end(), [&] (const Match & match) { |
| for (auto & other_match : matches) { |
| if (other_match.i == match.i && other_match.j == match.j) continue; |
| if (other_match.i <= match.i && other_match.j >= match.j) { |
| return true; |
| } |
| } |
| return false; |
| }), |
| matches.end()); |
| |
| return sorted(matches); |
| } |
| |
| static |
| optional::optional<DMY> map_ints_to_dm(const std::array<date_t, 2> & vals); |
| |
| static |
| date_t two_to_four_digit_year(date_t val); |
| |
| static |
| optional::optional<DMY> map_ints_to_dmy(const std::array<date_t, 3> & vals) { |
| // given a 3-tuple, discard if: |
| // middle int is over 31 (for all dmy formats, years are never allowed in the middle) |
| // middle int is zero |
| // any int is over the max allowable year |
| // any int is over two digits but under the min allowable year |
| // 2 ints are over 31, the max allowable day |
| // 2 ints are zero |
| // all ints are over 12, the max allowable month |
| if (vals[1] > 31 || vals[1] == 0) return optional::nullopt; |
| auto over_12 = 0; |
| auto over_31 = 0; |
| auto under_1 = 0; |
| for (auto val : vals) { |
| if ((99 < val && val < DATE_MIN_YEAR) || val > DATE_MAX_YEAR) return optional::nullopt; |
| if (val > 31) over_31 += 1; |
| if (val > 12) over_12 += 1; |
| if (val <= 0) under_1 += 1; |
| } |
| if (over_31 >= 2 || over_12 == 3 || under_1 >= 2) return optional::nullopt; |
| |
| // first look for a four digit year: yyyy + daymonth or daymonth + yyyy |
| std::pair<date_t, std::array<date_t, 2>> possible_year_splits[] = { |
| {vals[2], {{vals[0], vals[1]}}}, // year last |
| {vals[0], {{vals[1], vals[2]}}}, // year first |
| }; |
| for (const auto & item : possible_year_splits) { |
| auto & y = item.first; |
| auto & rest = item.second; |
| if (DATE_MIN_YEAR <= y && y <= DATE_MAX_YEAR) { |
| auto dm = map_ints_to_dm(rest); |
| if (dm) { |
| return DMY{y, dm->month, dm->day}; |
| } |
| else { |
| // for a candidate that includes a four-digit year, |
| // when the remaining ints don't match to a day and month, |
| // it is not a date. |
| return optional::nullopt; |
| } |
| } |
| } |
| |
| // given no four-digit year, two digit years are the most flexible int to match, so |
| // try to parse a day-month out of ints[0..1] or ints[1..0] |
| for (const auto & item : possible_year_splits) { |
| auto y = item.first; |
| auto & rest = item.second; |
| auto dm = map_ints_to_dm(rest); |
| if (dm) { |
| y = two_to_four_digit_year(y); |
| return DMY{y, dm->month, dm->day}; |
| } |
| } |
| |
| return optional::nullopt; |
| } |
| |
| static |
| optional::optional<DMY> map_ints_to_dm(const std::array<date_t, 2> & vals) { |
| for (const auto & item : {vals, {{vals[1], vals[0]}}}) { |
| auto d = item[0], m = item[1]; |
| if (1 <= d && d <= 31 && 1 <= m && m <= 12) { |
| return DMY{0, m, d}; |
| } |
| } |
| return optional::nullopt; |
| } |
| |
| static |
| date_t two_to_four_digit_year(date_t year) { |
| if (year > 99) { |
| return year; |
| } |
| else if (year > 50) { |
| // 87 -> 1987 |
| return year + 1900; |
| } |
| else { |
| // 15 -> 2015 |
| return year + 2000; |
| } |
| } |
| |
| } |