blob: 24878c8562e0aac4cd2dd42345942e5ae6855e4b [file] [log] [blame]
// Copyright (c) 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chromeos/components/string_matching/sequence_matcher.h"
#include <algorithm>
#include <cmath>
#include <queue>
#include "base/check_op.h"
namespace chromeos {
namespace string_matching {
namespace {
using Match = SequenceMatcher::Match;
using Matches = std::vector<Match>;
bool CompareMatches(const Match& m1, const Match& m2) {
return m1.pos_first_string < m2.pos_first_string;
}
} // namespace
SequenceMatcher::Match::Match() = default;
SequenceMatcher::Match::Match(int pos_first, int pos_second, int len)
: pos_first_string(pos_first), pos_second_string(pos_second), length(len) {
DCHECK_GE(pos_first_string, 0);
DCHECK_GE(pos_second_string, 0);
DCHECK_GE(length, 0);
}
SequenceMatcher::SequenceMatcher(const base::string16& first_string,
const base::string16& second_string,
bool use_edit_distance,
double num_matching_blocks_penalty)
: first_string_(first_string),
second_string_(second_string),
num_matching_blocks_penalty_(num_matching_blocks_penalty),
dp_common_string_(second_string.size() + 1, 0) {
DCHECK(!first_string_.empty() || !second_string_.empty());
for (size_t i = 0; i < second_string_.size(); i++) {
char_to_positions_[second_string_[i]].emplace_back(i);
}
use_edit_distance_ = use_edit_distance;
}
Match SequenceMatcher::FindLongestMatch(int first_start,
int first_end,
int second_start,
int second_end) {
Match match(first_start, second_start, 0);
// These two vectors are used to do "fast update".
// |dp_values_to_erase| contains the values should be erased from
// |dp_common_string_|.
// |dp_values_to_affect| contains the values should be updated from
// |dp_common_string_|.
std::vector<std::pair<int, int>> dp_values_to_erase;
std::vector<std::pair<int, int>> dp_values_to_affect;
for (int i = first_start; i < first_end; i++) {
dp_values_to_affect.clear();
for (auto j : char_to_positions_[first_string_[i]]) {
if (j < second_start) {
continue;
}
if (j >= second_end) {
break;
}
// dp_commong_string_[j + 1] is length of longest common substring
// ends at first_string_[i] and second_string_[j + 1]
const int length = dp_common_string_[j] + 1;
dp_values_to_affect.emplace_back(j + 1, length);
if (length > match.length) {
match.pos_first_string = i - length + 1;
match.pos_second_string = j - length + 1;
match.length = length;
}
}
// Updates dp_common_string_
for (auto const& element : dp_values_to_erase) {
dp_common_string_[element.first] = 0;
}
for (auto const& element : dp_values_to_affect) {
dp_common_string_[element.first] = element.second;
}
std::swap(dp_values_to_erase, dp_values_to_affect);
}
// Erases all updated value for the next call.
std::fill(dp_common_string_.begin(), dp_common_string_.end(), 0);
return match;
}
Matches SequenceMatcher::GetMatchingBlocks() {
if (!matching_blocks_.empty()) {
return matching_blocks_;
}
// This queue contains a tuple of 4 integers that represent 2 substrings to
// find the longest match in the following order: first_start, first_end,
// second_start, second_end.
std::queue<std::tuple<int, int, int, int>> queue_block;
queue_block.emplace(0, first_string_.size(), 0, second_string_.size());
// Find all matching blocks recursively.
while (!queue_block.empty()) {
int first_start, first_end, second_start, second_end;
std::tie(first_start, first_end, second_start, second_end) =
queue_block.front();
queue_block.pop();
const Match match =
FindLongestMatch(first_start, first_end, second_start, second_end);
if (match.length > 0) {
if (first_start < match.pos_first_string &&
second_start < match.pos_second_string) {
queue_block.emplace(first_start, match.pos_first_string, second_start,
match.pos_second_string);
}
if (match.pos_first_string + match.length < first_end &&
match.pos_second_string + match.length < second_end) {
queue_block.emplace(match.pos_first_string + match.length, first_end,
match.pos_second_string + match.length, second_end);
}
matching_blocks_.push_back(match);
}
}
matching_blocks_.push_back(
Match(first_string_.size(), second_string_.size(), 0));
sort(matching_blocks_.begin(), matching_blocks_.end(), CompareMatches);
return matching_blocks_;
}
int SequenceMatcher::EditDistance() {
// If edit distance is already calculated
if (edit_distance_ >= 0) {
return edit_distance_;
}
const int len_first = first_string_.size();
const int len_second = second_string_.size();
if (len_first == 0 || len_second == 0) {
edit_distance_ = std::max(len_first, len_second);
return edit_distance_;
}
// Memory for the dynamic programming:
// dp[i + 1][j + 1] is the edit distane of first |i| characters of
// |first_string_| and first |j| characters of |second_string_|
int dp[len_first + 1][len_second + 1];
// Initialize memory
for (int i = 0; i < len_first + 1; i++) {
dp[i][0] = i;
}
for (int j = 0; j < len_second + 1; j++) {
dp[0][j] = j;
}
// Calculate the edit distance
for (int i = 1; i < len_first + 1; i++) {
for (int j = 1; j < len_second + 1; j++) {
const int cost = first_string_[i - 1] == second_string_[j - 1] ? 0 : 1;
// Insertion and deletion
dp[i][j] = std::min(dp[i - 1][j], dp[i][j - 1]) + 1;
// Substitution
dp[i][j] = std::min(dp[i][j], dp[i - 1][j - 1] + cost);
// Transposition
if (i > 1 && j > 1 && first_string_[i - 2] == second_string_[j - 1] &&
first_string_[i - 1] == second_string_[j - 2]) {
dp[i][j] = std::min(dp[i][j], dp[i - 2][j - 2] + cost);
}
}
}
edit_distance_ = dp[len_first][len_second];
return edit_distance_;
}
double SequenceMatcher::Ratio() {
if (use_edit_distance_) {
if (edit_distance_ratio_ < 0) {
const int edit_distance = EditDistance();
edit_distance_ratio_ = std::max(
0.0, 1.0 - static_cast<double>(edit_distance) * 2 /
(first_string_.size() + second_string_.size()));
}
return edit_distance_ratio_;
}
// Uses block matching to calculate ratio.
if (block_matching_ratio_ < 0) {
int sum_match = 0;
const int sum_length = first_string_.size() + second_string_.size();
DCHECK_NE(sum_length, 0);
const int num_blocks = GetMatchingBlocks().size();
for (const auto& match : GetMatchingBlocks()) {
sum_match += match.length;
}
// Subtract two because the last one is always an "empty block". Hence
// actual number of matching blocks is |num_blocks - 1|.
block_matching_ratio_ =
2.0 * sum_match / sum_length *
exp(-(num_blocks - 2) * num_matching_blocks_penalty_);
}
return block_matching_ratio_;
}
} // namespace string_matching
} // namespace chromeos