blob: 3d09857fa5fabefb61969039042915fcda9428e3 [file] [log] [blame]
#include <zxcvbn/matching.hpp>
#include <zxcvbn/adjacency_graphs.hpp>
#include <zxcvbn/common.hpp>
#include <zxcvbn/optional.hpp>
#include <zxcvbn/frequency_lists.hpp>
#include <zxcvbn/scoring.hpp>
#include <zxcvbn/util.hpp>
#include <algorithm>
#include <array>
#include <functional>
#include <initializer_list>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
#include <unordered_map>
#include <utility>
#include <unordered_set>
#include "base/no_destructor.h"
#include "base/strings/string_util.h"
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/i18n/unicode/regex.h"
namespace zxcvbn {
// TODO: make this a constexpr
const std::vector<std::pair<std::string, std::vector<std::string>>>&
L33T_TABLE() {
static base::NoDestructor<
std::vector<std::pair<std::string, std::vector<std::string>>>>
leet_table({
{"a", {"4", "@"}},
{"b", {"8"}},
{"c", {"(", "{", "[", "<"}},
{"e", {"3"}},
{"g", {"6", "9"}},
{"i", {"1", "!", "|"}},
{"l", {"1", "|", "7"}},
{"o", {"0"}},
{"s", {"$", "5"}},
{"t", {"+", "7"}},
{"x", {"%"}},
{"z", {"2"}},
});
return *leet_table;
}
// TODO: make this constexpr
const std::vector<std::pair<RegexTag, std::regex>>& REGEXEN() {
static base::NoDestructor<std::vector<std::pair<RegexTag, std::regex>>>
regexen({
{RegexTag::RECENT_YEAR, std::regex(R"(19\d\d|200\d|201\d)")},
});
return *regexen;
}
const auto DATE_MAX_YEAR = 2050;
const auto DATE_MIN_YEAR = 1000;
constexpr std::initializer_list<std::pair<int, int>> DATE_SPLITS[] = {
{ // for length-4 strings, eg 1191 or 9111, two ways to split:
{1, 2}, // 1 1 91 (2nd split starts at index 1, 3rd at index 2)
{2, 3}, // 91 1 1
},
{
{1, 3}, // 1 11 91
{2, 3}, // 11 1 91
},
{
{1, 2}, // 1 1 1991
{2, 4}, // 11 11 91
{4, 5}, // 1991 1 1
},
{
{1, 3}, // 1 11 1991
{2, 3}, // 11 1 1991
{4, 5}, // 1991 1 11
{4, 6}, // 1991 11 1
},
{
{2, 4}, // 11 11 1991
{4, 6}, // 1991 11 11
},
};
static
std::string translate(const std::string & string,
const std::unordered_map<std::string, std::string> & chr_map) {
std::string toret;
auto bit = std::back_inserter(toret);
toret.reserve(string.size());
for (auto it = string.begin(); it != string.end();) {
auto nextit = util::utf8_iter(it, string.end());
auto ch = std::string(it, nextit);
auto mit = chr_map.find(ch);
if (mit != chr_map.end()) {
ch = mit->second;
}
std::copy(ch.begin(), ch.end(), bit);
it = nextit;
}
return toret;
}
static
std::vector<Match> & sorted(std::vector<Match> & matches) {
std::sort(matches.begin(), matches.end(),
[&] (const Match & m1, const Match & m2) -> bool {
return std::make_pair(m1.i, m1.j) < std::make_pair(m2.i, m2.j);
});
return matches;
}
static
std::string dict_normalize(const std::string & str) {
// NB: we only have ascii strings in the dictionaries
// TODO: when we have more complex strings in the dictionaries,
// do a more complex normalization
return util::ascii_lower(str);
}
std::vector<Match> omnimatch(const std::string & password,
const std::vector<std::string> & ordered_list) {
auto ranked_dictionaries = default_ranked_dicts();
auto ranked_dict = build_ranked_dict(ordered_list);
ranked_dictionaries.insert(
std::make_pair(DictionaryTag::USER_INPUTS, &ranked_dict));
std::vector<Match> matches;
std::function<std::vector<Match>(const std::string&)> matchers[] = {
std::bind(dictionary_match, std::placeholders::_1,
std::cref(ranked_dictionaries)),
std::bind(reverse_dictionary_match, std::placeholders::_1,
std::cref(ranked_dictionaries)),
std::bind(l33t_match, std::placeholders::_1,
std::cref(ranked_dictionaries), std::cref(L33T_TABLE())),
std::bind(spatial_match, std::placeholders::_1, std::cref(graphs())),
repeat_match,
sequence_match,
std::bind(regex_match, std::placeholders::_1, std::cref(REGEXEN())),
date_match,
};
for (const auto & matcher : matchers) {
auto ret = matcher(password);
std::move(ret.begin(), ret.end(), std::back_inserter(matches));
}
return sorted(matches);
}
//-------------------------------------------------------------------------------
// dictionary match (common passwords, english, last names, etc) ----------------
//-------------------------------------------------------------------------------
std::vector<Match> dictionary_match(const std::string & password,
const RankedDicts & ranked_dictionaries) {
std::vector<Match> matches;
auto len = password.length();
auto password_lower = dict_normalize(password);
for (const auto & item : ranked_dictionaries) {
auto dictionary_tag = item.first;
auto& ranked_dict = *item.second;
for (decltype(len) i = 0, idx = 0; idx < len; util::utf8_decode(password, idx), ++i) {
for (decltype(len) j = i, jdx = idx; jdx < len; ++j) {
// j is inclusive, but jdx is not so eagerly iterate jdx
util::utf8_decode(password, jdx);
auto word = password_lower.substr(idx, jdx - idx);
auto it = ranked_dict.find(word);
if (it != ranked_dict.end()) {
auto rank = it->second;
matches.push_back(Match(i, j, password.substr(idx, jdx - idx),
DictionaryMatch{
dictionary_tag,
word, rank,
false,
false, {}, ""}));
matches.back().idx = idx;
matches.back().jdx = jdx;
}
}
}
}
return sorted(matches);
}
std::vector<Match> reverse_dictionary_match(const std::string & password,
const RankedDicts & ranked_dictionaries) {
auto clen = util::character_len(password);
auto reversed_password = util::reverse_string(password);
auto matches = dictionary_match(reversed_password, ranked_dictionaries);
for (auto & match : matches) {
match.token = util::reverse_string(match.token); // reverse back
match.get_dictionary().reversed = true;
// map coordinates back to original string
std::tie(match.i, match.j) = std::make_tuple(
clen - 1 - match.j,
clen - 1 - match.i
);
std::tie(match.idx, match.jdx) = std::make_tuple(
password.length() - match.jdx,
password.length() - match.idx
);
}
return sorted(matches);
}
//-------------------------------------------------------------------------------
// dictionary match with common l33t substitutions ------------------------------
//-------------------------------------------------------------------------------
// makes a pruned copy of l33t_table that only includes password's possible substitutions
std::unordered_map<std::string, std::vector<std::string>> relevant_l33t_subtable(const std::string & password, const std::vector<std::pair<std::string, std::vector<std::string>>> & table) {
std::unordered_map<std::string, std::vector<std::string>> subtable;
for (const auto & item : table) {
auto & letter = item.first;
auto & subs = item.second;
std::vector<std::string> relevant_subs;
for (const auto & sub : subs) {
if (password.find(sub) != password.npos) {
relevant_subs.push_back(sub);
}
}
if (relevant_subs.size()) {
subtable.insert(std::make_pair(letter, relevant_subs));
}
}
return subtable;
}
// returns the list of possible 1337 replacement dictionaries for a given password
std::vector<std::unordered_map<std::string, std::string>> enumerate_l33t_subs(const std::unordered_map<std::string, std::vector<std::string>> & table) {
using SubsType = std::vector<std::vector<std::pair<std::string, std::string>>>;
SubsType subs = {{}};
auto dedup = [] (const SubsType & subs) {
SubsType deduped;
std::unordered_set<std::string> members;
for (const auto & sub : subs) {
auto assoc = sub;
std::sort(assoc.begin(), assoc.end());
std::ostringstream label_stream;
for (const auto & item : assoc) {
label_stream << item.first << "," << item.second << "-";
}
auto label = label_stream.str();
if (members.find(label) == members.end()) {
members.insert(std::move(label));
deduped.push_back(sub);
}
}
return deduped;
};
for (const auto & item : table) {
auto & first_key = item.first;
SubsType next_subs;
for (const auto & l33t_chr : item.second) {
for (const auto & sub : subs) {
auto sub_alternative = sub;
auto it = std::find_if(
sub_alternative.begin(), sub_alternative.end(),
[&] (const std::pair<std::string, std::string> & sub_elt) {
return sub_elt.first == l33t_chr;
});
if (it == sub_alternative.end()) {
sub_alternative.push_back(std::make_pair(l33t_chr, first_key));
next_subs.push_back(std::move(sub_alternative));
}
else {
sub_alternative.erase(it);
sub_alternative.push_back(std::make_pair(l33t_chr, first_key));
next_subs.push_back(sub);
next_subs.push_back(std::move(sub_alternative));
}
}
}
subs = dedup(next_subs);
}
// convert from assoc lists to dicts
std::vector<std::unordered_map<std::string, std::string>> sub_dicts;
for (const auto & sub : subs) {
sub_dicts.push_back(std::unordered_map<std::string, std::string>
(sub.begin(), sub.end()));
}
return sub_dicts;
}
std::vector<Match> l33t_match(const std::string & password,
const RankedDicts & ranked_dictionaries,
const std::vector<std::pair<std::string, std::vector<std::string>>> & l33t_table) {
std::vector<Match> matches;
for (const auto & sub : enumerate_l33t_subs(relevant_l33t_subtable(password, l33t_table))) {
if (!sub.size()) break;
auto subbed_password = translate(password, sub);
for (auto & match : dictionary_match(subbed_password, ranked_dictionaries)) {
auto & dmatch = match.get_dictionary();
auto token = password.substr(match.idx, match.jdx - match.idx);
if (dict_normalize(token) == dmatch.matched_word) {
// only return the matches that contain an actual substitution
continue;
}
// subset of mappings in sub that are in use for this match
std::unordered_map<std::string, std::string> match_sub;
for (const auto & item : sub) {
auto & subbed_chr = item.first;
if (token.find(subbed_chr) == token.npos) continue;
match_sub.insert(item);
}
dmatch.l33t = true;
match.token = token;
dmatch.sub = match_sub;
std::ostringstream os;
std::string sep = "";
for (const auto & item : match_sub) {
os << sep << item.first << " -> " << item.second;
if (!sep.size()) {
sep = ", ";
}
}
dmatch.sub_display = os.str();
matches.push_back(std::move(match));
}
}
matches.erase(std::remove_if(matches.begin(), matches.end(), [] (const Match & a) {
// filter single-character l33t matches to reduce noise.
// otherwise '1' matches 'i', '4' matches 'a', both very common English words
// with low dictionary rank.
return util::character_len(a.token) <= 1;
}),
matches.end());
return sorted(matches);
}
// ------------------------------------------------------------------------------
// spatial match (qwerty/dvorak/keypad) -----------------------------------------
// ------------------------------------------------------------------------------
static
std::vector<Match> spatial_match_helper(const std::string & password,
const Graph & graph,
GraphTag tag);
std::vector<Match> spatial_match(const std::string & password,
const Graphs & graphs) {
std::vector<Match> matches;
for (const auto & item : graphs) {
auto ret = spatial_match_helper(password, item.second, item.first);
std::move(ret.begin(), ret.end(), std::back_inserter(matches));
}
return matches;
}
static
std::vector<Match> spatial_match_helper(const std::string & password,
const Graph & graph,
GraphTag graph_tag) {
const auto SHIFTED_RX =
std::regex("[~!@#$%^&*()_+QWERTYUIOP{}|ASDFGHJKL:\"ZXCVBNM<>?]");
std::vector<Match> matches;
if (!password.length()) return matches;
idx_t idx = 0;
idx_t i = 0;
auto clen = util::character_len(password);
while (i < clen - 1) {
auto jdx = idx;
util::utf8_decode(password, jdx);
auto j = i + 1;
auto last_direction = -1;
unsigned turns = 0;
unsigned shifted_count;
if ((graph_tag == GraphTag::QWERTY ||
graph_tag == GraphTag::DVORAK) &&
std::regex_search(password.substr(idx, jdx - idx), SHIFTED_RX)) {
shifted_count = 1;
}
else {
shifted_count = 0;
}
auto prev_jdx = idx;
while (true) {
auto prev_char = password.substr(prev_jdx, jdx - prev_jdx);
auto found = false;
auto found_direction = -1;
auto cur_direction = -1;
const auto & adjacents = [&] {
auto it = graph.find(prev_char);
if (it != graph.end()) {
return it->second;
}
return Graph::mapped_type();
}();
// consider growing pattern by one character if j hasn't gone over the edge.
if (j < clen) {
auto next_jdx = jdx;
util::utf8_decode(password, next_jdx);
auto cur_char = password.substr(jdx, next_jdx - jdx);
for (auto & adj : adjacents) {
cur_direction += 1;
if (adj.find(cur_char) != adj.npos) {
found = true;
found_direction = cur_direction;
if (adj.find(cur_char) == 1) {
// index 1 in the adjacency means the key is shifted,
// 0 means unshifted: A vs a, % vs 5, etc.
// for example, 'q' is adjacent to the entry '2@'.
// @ is shifted w/ index 1, 2 is unshifted.
shifted_count += 1;
}
if (last_direction != found_direction) {
// adding a turn is correct even in the initial case when last_direction is null:
// every spatial pattern starts with a turn.
turns += 1;
last_direction = found_direction;
}
break;
}
}
}
// if the current pattern continued, extend j and try to grow again
if (found) {
j += 1;
prev_jdx = jdx;
util::utf8_decode(password, jdx);
}
// otherwise push the pattern discovered so far, if any...
else {
if (j - i > 2) { // don't consider length 1 or 2 chains.
matches.push_back(Match(i, j - 1, password.substr(idx, jdx - idx),
SpatialMatch{
graph_tag, turns, shifted_count,
}));
matches.back().idx = idx;
matches.back().jdx = jdx;
}
// ...and then start a new search for the rest of the password.
i = j;
idx = jdx;
break;
}
}
}
return matches;
}
//-------------------------------------------------------------------------------
// repeats (aaa, abcabcabc) and sequences (abcdef) ------------------------------
//-------------------------------------------------------------------------------
std::vector<Match> repeat_match(const std::string& password) {
std::vector<Match> matches;
auto unicode_password = icu::UnicodeString::fromUTF8(password);
UErrorCode status = U_ZERO_ERROR;
std::unique_ptr<icu::RegexPattern> greedy_pattern(icu::RegexPattern::compile(
icu::UnicodeString::fromUTF8(R"((.+)\1+)"), 0, status));
std::unique_ptr<icu::RegexMatcher> greedy_matcher(
greedy_pattern->matcher(unicode_password, status));
std::unique_ptr<icu::RegexPattern> lazy_pattern(icu::RegexPattern::compile(
icu::UnicodeString::fromUTF8(R"((.+?)\1+)"), 0, status));
std::unique_ptr<icu::RegexMatcher> lazy_matcher(
lazy_pattern->matcher(unicode_password, status));
std::unique_ptr<icu::RegexPattern> lazy_anchored_pattern(
icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(R"(^(.+?)\1+$)"),
0, status));
int lastUnicodeIndex = 0;
size_t lastIndex = 0;
while (lastIndex < password.length()) {
if (!greedy_matcher->find(lastUnicodeIndex, status) ||
!lazy_matcher->find(lastUnicodeIndex, status)) {
break;
}
icu::RegexMatcher* matcher = nullptr;
icu::UnicodeString base_token;
if (greedy_matcher->group(status).length() >
lazy_matcher->group(status).length()) {
// greedy beats lazy for 'aabaab'
// greedy: [aabaab, aab]
// lazy: [aa, a]
matcher = greedy_matcher.get();
// greedy's repeated string might itself be repeated, eg.
// aabaab in aabaabaabaab.
// run an anchored lazy match on greedy's repeated string
// to find the shortest repeated string
auto greedy_found = matcher->group(status);
std::unique_ptr<icu::RegexMatcher> lazy_anchored_matcher(
lazy_anchored_pattern->matcher(greedy_found, status));
auto ret = lazy_anchored_matcher->find(status);
assert(ret);
(void) ret;
base_token = lazy_anchored_matcher->group(1, status);
} else {
// lazy beats greedy for 'aaaaa'
// greedy: [aaaa, aa]
// lazy: [aaaaa, a]
matcher = lazy_matcher.get();
base_token = matcher->group(1, status);
}
std::string matched_string;
matcher->group(status).toUTF8String(matched_string);
auto idx = password.find(matched_string, lastIndex);
auto jdx = idx + matched_string.size();
auto i = util::character_len(password, 0, idx);
auto j = i + util::character_len(password, idx, jdx) - 1;
// recursively match and score the base string
std::string base_string;
base_token.toUTF8String(base_string);
auto sub_matches = omnimatch(base_string);
auto base_analysis =
most_guessable_match_sequence(base_string, sub_matches, false);
std::vector<Match> base_matches;
std::move(base_analysis.sequence.begin(), base_analysis.sequence.end(),
std::back_inserter(base_matches));
auto& base_guesses = base_analysis.guesses;
matches.push_back(Match(i, j, matched_string,
RepeatMatch{
base_string,
base_guesses,
std::move(base_matches),
matched_string.size() / base_string.size(),
}));
matches.back().idx = idx;
matches.back().jdx = jdx;
lastUnicodeIndex = matcher->end(status);
lastIndex = jdx;
}
return matches;
}
const auto MAX_DELTA = 5;
std::vector<Match> sequence_match(const std::string & password) {
// Identifies sequences by looking for repeated differences in unicode codepoint.
// this allows skipping, such as 9753, and also matches some extended unicode sequences
// such as Greek and Cyrillic alphabets.
//
// for example, consider the input 'abcdb975zy'
//
// password: a b c d b 9 7 5 z y
// index: 0 1 2 3 4 5 6 7 8 9
// delta: 1 1 1 -2 -41 -2 -2 69 1
//
// expected result:
// [(i, j, delta), ...] = [(0, 3, 1), (5, 7, -2), (8, 9, 1)]
if (util::character_len(password) == 1) return {};
std::vector<Match> result;
using delta_t = std::int32_t;
auto update = [&] (idx_t i, idx_t j, idx_t idx, idx_t jdx, delta_t delta) {
if (j - i > 1 || std::abs(delta) == 1) {
if (0 < std::abs(delta) && std::abs(delta) <= MAX_DELTA) {
auto token = password.substr(idx, jdx - idx);
SequenceTag sequence_name;
unsigned sequence_space;
if (std::regex_search(token, std::regex(R"(^[a-z]+$)"))) {
sequence_name = SequenceTag::kLower;
sequence_space = 26;
}
else if (std::regex_search(token, std::regex(R"(^[A-Z]+$)"))) {
sequence_name = SequenceTag::kUpper;
sequence_space = 26;
}
else if (std::regex_search(token, std::regex(R"(^\d+$)"))) {
sequence_name = SequenceTag::kDigits;
sequence_space = 10;
}
else {
sequence_name = SequenceTag::kUnicode;
sequence_space = 26;
}
result.push_back(Match(i, j, token,
SequenceMatch{sequence_name, sequence_space,
delta > 0}));
result.back().idx = idx;
result.back().jdx = jdx;
}
}
};
if (!password.size()) return result;
decltype(password.length()) i = 0;
decltype(password.length()) idx = 0;
optional::optional<delta_t> maybe_last_delta;
decltype(password.length()) kdx = 0;
auto last_kdx = kdx;
auto last_cp = util::utf8_decode(password, kdx);
for (idx_t k = 1; kdx < password.length(); ++k) {
auto next_kdx = kdx;
auto cp = util::utf8_decode(password, next_kdx);
assert(kdx != next_kdx);
delta_t delta = cp - last_cp;
if (!maybe_last_delta) {
maybe_last_delta = delta;
}
if (delta != *maybe_last_delta) {
auto jdx = kdx;
auto j = k - 1;
update(i, j, idx, jdx, *maybe_last_delta);
i = j;
idx = last_kdx;
maybe_last_delta = delta;
}
last_kdx = kdx;
kdx = next_kdx;
last_cp = cp;
}
if (maybe_last_delta) {
update(i, util::character_len(password) - 1,
idx, password.size(), *maybe_last_delta);
}
return result;
}
//-------------------------------------------------------------------------------
// regex matching ---------------------------------------------------------------
//-------------------------------------------------------------------------------
std::vector<Match> regex_match(const std::string & password,
const std::vector<std::pair<RegexTag, std::regex>> & regexen) {
std::vector<Match> matches;
for (const auto & item : regexen) {
auto tag = item.first;
auto & regex = item.second;
std::smatch rx_match;
std::size_t lastIndex = 0;
while (std::regex_match(lastIndex + password.begin(), password.end(),
rx_match, regex)) {
auto token = rx_match.str(0);
auto idx = lastIndex + rx_match.position();
auto jdx = lastIndex + rx_match.position() + rx_match[0].length();
assert(token == password.substr(idx, jdx - idx));
auto i = util::character_len(password, 0, idx);
auto j = i + util::character_len(password, idx, jdx) - 1;
matches.push_back(Match(i, j,
std::move(token),
RegexMatch{tag, PortableRegexMatch(rx_match)}));
matches.back().idx = idx;
matches.back().jdx = jdx;
lastIndex += rx_match[0].length();
}
}
return sorted(matches);
}
//-------------------------------------------------------------------------------
// date matching ----------------------------------------------------------------
//-------------------------------------------------------------------------------
using date_t = unsigned;
struct DMY {
date_t year, month, day;
};
static
optional::optional<DMY> map_ints_to_dmy(const std::array<date_t, 3> & vals);
static
date_t stou(const std::string & a) {
return static_cast<date_t>(std::stoul(a));
}
std::vector<Match> date_match(const std::string & password) {
// a "date" is recognized as:
// any 3-tuple that starts or ends with a 2- or 4-digit year,
// with 2 or 0 separator chars (1.1.91 or 1191),
// maybe zero-padded (01-01-91 vs 1-1-91),
// a month between 1 and 12,
// a day between 1 and 31.
//
// note: this isn't true date parsing in that "feb 31st" is allowed,
// this doesn't check for leap years, etc.
//
// recipe:
// start with regex to find maybe-dates, then attempt to map the integers
// onto month-day-year to filter the maybe-dates into dates.
// finally, remove matches that are substrings of other matches to reduce noise.
//
// note: instead of using a lazy or greedy regex to find many dates over the full string,
// this uses a ^...$ regex against every substring of the password -- less performant but leads
// to every possible date match.
std::vector<Match> matches;
std::regex maybe_date_no_separator(R"(^\d{4,8}$)");
std::regex maybe_date_with_separator(R"(^(\d{1,4})([\s/\\_.-])(\d{1,2})\2(\d{1,4})$)");
// dates without separators are between length 4 '1191' and 8 '11111991'
std::vector<std::string::size_type> offsets;
offsets.reserve(9);
idx_t idx_dot = 0;
for (auto i = 0; i < 9; ++i) {
offsets.push_back(idx_dot);
if (idx_dot >= password.length()) {
break;
}
util::utf8_decode(password, idx_dot);
}
assert(offsets.size());
for (decltype(password.length()) i = 0; offsets.size() - 1 >= 4; ++i) {
auto idx = offsets[0];
for (decltype(i) offset = 3; offset <= 7 && offset < offsets.size() - 1; ++offset) {
auto j = i + offset;
auto jdx = offsets[offset + 1];
auto token = password.substr(idx, jdx - idx);
auto token_chr_len = j - i + 1;
assert(util::character_len(token) == token_chr_len);
if (!std::regex_search(token, maybe_date_no_separator)) continue;
std::vector<DMY> candidates;
for (const auto & item : DATE_SPLITS[token_chr_len - 4]) {
auto k = item.first;
auto l = item.second;
auto kdx = offsets[k] - idx;
auto ldx = offsets[l] - idx;
auto dmy = map_ints_to_dmy(std::array<date_t, 3>{{
stou(token.substr(0, kdx)),
stou(token.substr(kdx, ldx - kdx)),
stou(token.substr(ldx))}});
if (dmy) candidates.push_back(*dmy);
}
if (!candidates.size()) continue;
// at this point: different possible dmy mappings for the same i,j substring.
// match the candidate date that likely takes the fewest guesses: a year closest to 2000.
// (scoring.REFERENCE_YEAR).
//
// ie, considering '111504', prefer 11-15-04 to 1-1-1504
// (interpreting '04' as 2004)
auto metric = [] (const DMY & candidate) {
if (candidate.year >= REFERENCE_YEAR) {
return candidate.year - REFERENCE_YEAR;
}
else {
return REFERENCE_YEAR - candidate.year;
}
};
auto best_candidate = *std::min_element(candidates.begin(), candidates.end(),
[=] (const DMY & a, const DMY & b) {
return metric(a) < metric(b);
});
matches.push_back(Match(i, j, token,
DateMatch{"",
best_candidate.year,
best_candidate.month,
best_candidate.day,
false,
}));
matches.back().idx = idx;
matches.back().jdx = jdx;
}
offsets.erase(offsets.begin());
if (offsets.back() < password.length()) {
auto idx2 = offsets.back();
util::utf8_decode(password, idx2);
offsets.push_back(idx2);
}
}
// dates with separators are between length 6 '1/1/91' and 10 '11/11/1991'
offsets.clear();
offsets.reserve(11);
idx_dot = 0;
for (auto i = 0; i < 11; ++i) {
offsets.push_back(idx_dot);
if (idx_dot >= password.length()) {
break;
}
util::utf8_decode(password, idx_dot);
}
assert(offsets.size());
for (decltype(password.length()) i = 0; offsets.size() - 1 >= 6; ++i) {
auto idx = offsets[0];
for (decltype(password.length()) offset = 5; offset <= 9 && offset < offsets.size() - 1; ++offset) {
auto j = offset + i;
auto jdx = offsets[offset + 1];
auto token = password.substr(idx, jdx - idx);
std::smatch rx_match;
if (!std::regex_match(token, rx_match, maybe_date_with_separator)) {
continue;
}
auto dmy = map_ints_to_dmy(std::array<date_t, 3>{{
stou(rx_match[1]),
stou(rx_match[3]),
stou(rx_match[4])}});
if (!dmy) continue;
matches.push_back(Match(i, j, token,
DateMatch{rx_match[2],
dmy->year,
dmy->month,
dmy->day,
false,
}));
matches.back().idx = idx;
matches.back().jdx = jdx;
}
offsets.erase(offsets.begin());
if (offsets.back() < password.length()) {
auto idx2 = offsets.back();
util::utf8_decode(password, idx2);
offsets.push_back(idx2);
}
}
// matches now contains all valid date strings in a way that is tricky to capture
// with regexes only. while thorough, it will contain some unintuitive noise:
//
// '2015_06_04', in addition to matching 2015_06_04, will also contain
// 5(!) other date matches: 15_06_04, 5_06_04, ..., even 2015 (matched as 5/1/2020)
//
// to reduce noise, remove date matches that are strict substrings of others
matches.erase(std::remove_if(matches.begin(), matches.end(), [&] (const Match & match) {
for (auto & other_match : matches) {
if (other_match.i == match.i && other_match.j == match.j) continue;
if (other_match.i <= match.i && other_match.j >= match.j) {
return true;
}
}
return false;
}),
matches.end());
return sorted(matches);
}
static
optional::optional<DMY> map_ints_to_dm(const std::array<date_t, 2> & vals);
static
date_t two_to_four_digit_year(date_t val);
static
optional::optional<DMY> map_ints_to_dmy(const std::array<date_t, 3> & vals) {
// given a 3-tuple, discard if:
// middle int is over 31 (for all dmy formats, years are never allowed in the middle)
// middle int is zero
// any int is over the max allowable year
// any int is over two digits but under the min allowable year
// 2 ints are over 31, the max allowable day
// 2 ints are zero
// all ints are over 12, the max allowable month
if (vals[1] > 31 || vals[1] == 0) return optional::nullopt;
auto over_12 = 0;
auto over_31 = 0;
auto under_1 = 0;
for (auto val : vals) {
if ((99 < val && val < DATE_MIN_YEAR) || val > DATE_MAX_YEAR) return optional::nullopt;
if (val > 31) over_31 += 1;
if (val > 12) over_12 += 1;
if (val <= 0) under_1 += 1;
}
if (over_31 >= 2 || over_12 == 3 || under_1 >= 2) return optional::nullopt;
// first look for a four digit year: yyyy + daymonth or daymonth + yyyy
std::pair<date_t, std::array<date_t, 2>> possible_year_splits[] = {
{vals[2], {{vals[0], vals[1]}}}, // year last
{vals[0], {{vals[1], vals[2]}}}, // year first
};
for (const auto & item : possible_year_splits) {
auto & y = item.first;
auto & rest = item.second;
if (DATE_MIN_YEAR <= y && y <= DATE_MAX_YEAR) {
auto dm = map_ints_to_dm(rest);
if (dm) {
return DMY{y, dm->month, dm->day};
}
else {
// for a candidate that includes a four-digit year,
// when the remaining ints don't match to a day and month,
// it is not a date.
return optional::nullopt;
}
}
}
// given no four-digit year, two digit years are the most flexible int to match, so
// try to parse a day-month out of ints[0..1] or ints[1..0]
for (const auto & item : possible_year_splits) {
auto y = item.first;
auto & rest = item.second;
auto dm = map_ints_to_dm(rest);
if (dm) {
y = two_to_four_digit_year(y);
return DMY{y, dm->month, dm->day};
}
}
return optional::nullopt;
}
static
optional::optional<DMY> map_ints_to_dm(const std::array<date_t, 2> & vals) {
for (const auto & item : {vals, {{vals[1], vals[0]}}}) {
auto d = item[0], m = item[1];
if (1 <= d && d <= 31 && 1 <= m && m <= 12) {
return DMY{0, m, d};
}
}
return optional::nullopt;
}
static
date_t two_to_four_digit_year(date_t year) {
if (year > 99) {
return year;
}
else if (year > 50) {
// 87 -> 1987
return year + 1900;
}
else {
// 15 -> 2015
return year + 2000;
}
}
}