| diff --git a/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp b/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp |
| index 8f4e6d2f0e00..13465dee1cd7 100644 |
| --- a/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp |
| +++ b/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp |
| @@ -20,6 +20,9 @@ |
| #include <unordered_set> |
| |
| #include "base/no_destructor.h" |
| +#include "base/strings/string_util.h" |
| +#include "third_party/icu/source/common/unicode/unistr.h" |
| +#include "third_party/icu/source/i18n/unicode/regex.h" |
| |
| namespace zxcvbn { |
| |
| @@ -453,69 +456,91 @@ std::vector<Match> spatial_match_helper(const std::string & password, |
| // repeats (aaa, abcabcabc) and sequences (abcdef) ------------------------------ |
| //------------------------------------------------------------------------------- |
| |
| -std::vector<Match> repeat_match(const std::string & password) { |
| +std::vector<Match> repeat_match(const std::string& password) { |
| std::vector<Match> matches; |
| - std::regex greedy(R"((.+)\1+)"); |
| - std::regex lazy(R"((.+?)\1+)"); |
| - std::regex lazy_anchored(R"(^(.+?)\1+$)"); |
| - idx_t lastIndex = 0; |
| + |
| + auto unicode_password = icu::UnicodeString::fromUTF8(password); |
| + |
| + UErrorCode status = U_ZERO_ERROR; |
| + std::unique_ptr<icu::RegexPattern> greedy_pattern(icu::RegexPattern::compile( |
| + icu::UnicodeString::fromUTF8(R"((.+)\1+)"), 0, status)); |
| + std::unique_ptr<icu::RegexMatcher> greedy_matcher( |
| + greedy_pattern->matcher(unicode_password, status)); |
| + |
| + std::unique_ptr<icu::RegexPattern> lazy_pattern(icu::RegexPattern::compile( |
| + icu::UnicodeString::fromUTF8(R"((.+?)\1+)"), 0, status)); |
| + std::unique_ptr<icu::RegexMatcher> lazy_matcher( |
| + lazy_pattern->matcher(unicode_password, status)); |
| + |
| + std::unique_ptr<icu::RegexPattern> lazy_anchored_pattern( |
| + icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(R"(^(.+?)\1+$)"), |
| + 0, status)); |
| + |
| + int lastUnicodeIndex = 0; |
| + size_t lastIndex = 0; |
| while (lastIndex < password.length()) { |
| - auto start_iter = lastIndex + password.begin(); |
| - std::smatch greedy_match, lazy_match; |
| - std::regex_search(start_iter, password.end(), |
| - greedy_match, greedy); |
| - std::regex_search(start_iter, password.end(), |
| - lazy_match, lazy); |
| - if (!greedy_match.size()) break; |
| - std::smatch match; |
| - std::string base_token; |
| - if (greedy_match[0].length() > lazy_match[0].length()) { |
| + if (!greedy_matcher->find(lastUnicodeIndex, status) || |
| + !lazy_matcher->find(lastUnicodeIndex, status)) { |
| + break; |
| + } |
| + |
| + icu::RegexMatcher* matcher = nullptr; |
| + icu::UnicodeString base_token; |
| + if (greedy_matcher->group(status).length() > |
| + lazy_matcher->group(status).length()) { |
| // greedy beats lazy for 'aabaab' |
| // greedy: [aabaab, aab] |
| // lazy: [aa, a] |
| - match = greedy_match; |
| + matcher = greedy_matcher.get(); |
| // greedy's repeated string might itself be repeated, eg. |
| // aabaab in aabaabaabaab. |
| // run an anchored lazy match on greedy's repeated string |
| // to find the shortest repeated string |
| - std::smatch lazy_anchored_match; |
| - auto greedy_found = match.str(0); |
| - auto ret = std::regex_search(greedy_found, lazy_anchored_match, lazy_anchored); |
| + auto greedy_found = matcher->group(status); |
| + |
| + std::unique_ptr<icu::RegexMatcher> lazy_anchored_matcher( |
| + lazy_anchored_pattern->matcher(greedy_found, status)); |
| + auto ret = lazy_anchored_matcher->find(status); |
| assert(ret); |
| (void) ret; |
| - base_token = lazy_anchored_match.str(1); |
| + base_token = lazy_anchored_matcher->group(1, status); |
| } |
| else { |
| // lazy beats greedy for 'aaaaa' |
| // greedy: [aaaa, aa] |
| // lazy: [aaaaa, a] |
| - match = std::move(lazy_match); |
| - base_token = match.str(1); |
| + matcher = lazy_matcher.get(); |
| + base_token = matcher->group(1, status); |
| } |
| - auto idx = lastIndex + match.position(); |
| - auto jdx = lastIndex + match.position() + match[0].length(); |
| + |
| + std::string matched_string; |
| + matcher->group(status).toUTF8String(matched_string); |
| + |
| + auto idx = password.find(matched_string, lastIndex); |
| + auto jdx = idx + matched_string.size(); |
| + |
| auto i = util::character_len(password, 0, idx); |
| auto j = i + util::character_len(password, idx, jdx) - 1; |
| // recursively match and score the base string |
| - auto sub_matches = omnimatch(base_token); |
| - auto base_analysis = most_guessable_match_sequence( |
| - base_token, |
| - sub_matches, |
| - false |
| - ); |
| + std::string base_string; |
| + base_token.toUTF8String(base_string); |
| + auto sub_matches = omnimatch(base_string); |
| + auto base_analysis = |
| + most_guessable_match_sequence(base_string, sub_matches, false); |
| std::vector<Match> base_matches; |
| std::move(base_analysis.sequence.begin(), base_analysis.sequence.end(), |
| std::back_inserter(base_matches)); |
| - auto & base_guesses = base_analysis.guesses; |
| - matches.push_back(Match(i, j, match.str(0), |
| + auto& base_guesses = base_analysis.guesses; |
| + matches.push_back(Match(i, j, matched_string, |
| RepeatMatch{ |
| - base_token, |
| + base_string, |
| base_guesses, |
| std::move(base_matches), |
| - match[0].length() / base_token.length(), |
| - })); |
| + matched_string.size() / base_string.size(), |
| + })); |
| matches.back().idx = idx; |
| matches.back().jdx = jdx; |
| + lastUnicodeIndex = matcher->end(status); |
| lastIndex = jdx; |
| } |
| return matches; |
| diff --git a/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp b/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp |
| index a4e341935ffb..e5c120a86a5c 100644 |
| --- a/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp |
| +++ b/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp |
| @@ -75,7 +75,11 @@ std::size_t token_len(const Match & m) __attribute__((pure)); |
| static |
| std::size_t token_len(const Match & m) { |
| std::size_t result = m.j - m.i + 1; |
| - assert(result == util::character_len(m.token)); |
| + // Bruteforce matches might be any substring of the original string, which are |
| + // not necessarily aligned to UTF8 code points, and thus m.token might not be |
| + // a valid UTF8 string. |
| + if (m.get_pattern() != MatchPattern::BRUTEFORCE) |
| + assert(result == util::character_len(m.token)); |
| return result; |
| } |
| |