chromeos/ash/components/string_matching/sequence_matcher.cc - chromium/src - Git at Google

 // Copyright 2019 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "chromeos/ash/components/string_matching/sequence_matcher.h"

 #include <algorithm>
 #include <cmath>
 #include <queue>

 #include "base/check_op.h"

 namespace ash::string_matching {

 namespace {

 using Match = SequenceMatcher::Match;
 using Matches = std::vector<Match>;

 // The maximum recognized text size if text-length agnosticism is applied.
 constexpr int kTextAgnosticismSize = 9;

 bool CompareMatches(const Match& m1, const Match& m2) {
   return m1.pos_first_string < m2.pos_first_string;
 }

 }  // namespace

 SequenceMatcher::Match::Match() = default;
 SequenceMatcher::Match::Match(int pos_first, int pos_second, int len)
     : pos_first_string(pos_first), pos_second_string(pos_second), length(len) {
   DCHECK_GE(pos_first_string, 0);
   DCHECK_GE(pos_second_string, 0);
   DCHECK_GE(length, 0);
 }

 SequenceMatcher::SequenceMatcher(const std::u16string& first_string,
                                  const std::u16string& second_string,
                                  double num_matching_blocks_penalty)
     : first_string_(first_string),
       second_string_(second_string),
       num_matching_blocks_penalty_(num_matching_blocks_penalty),
       dp_common_string_(second_string.size() + 1, 0) {
   if (first_string_.empty() && second_string_.empty()) {
     block_matching_ratio_ = 0;
   }

   for (size_t i = 0; i < second_string_.size(); i++) {
     char_to_positions_[second_string_[i]].emplace_back(i);
   }
 }

 SequenceMatcher::~SequenceMatcher() = default;

 // Compute the longest common substring, with optimisations for:
 //
 // 1) Time: By pre-computing some letter positions (stored in
 // `char_to_positions_`.
 //
 // 2) Memory: Store only the latest row of the DP table (in
 // `dp_common_string_`).
 //
 // 3) Time: Fast-update `dp_common_string_`.
 Match SequenceMatcher::FindLongestMatch(int first_start,
                                         int first_end,
                                         int second_start,
                                         int second_end) {
   Match match(first_start, second_start, 0);

   // These two vectors are used for fast updating of `dp_common_string_`.
   // Only erase or update values which are known to have been changed.
   //
   //   `dp_values_to_erase` contains the values which should be erased from
   //     `dp_common_string_`.
   //   `dp_values_to_affect` contains the values which should be updated in
   //     `dp_common_string_`.
   std::vector<std::pair<int, int>> dp_values_to_erase;
   std::vector<std::pair<int, int>> dp_values_to_affect;

   // Outer loop: Iterate through the characters of `first_string`.
   // Keep up-to-date `dp_common_string_` (the latest row of the DP table).
   for (int i = first_start; i < first_end; i++) {
     dp_values_to_affect.clear();

     // Inner loop: Iterate through characters of `second_string`, where those
     // characters are equal to first_string_[i], and within range.
     for (auto j : char_to_positions_[first_string_[i]]) {
       if (j < second_start) {
         continue;
       }
       if (j >= second_end) {
         break;
       }
       // dp_common_string_[j + 1] is the length of the longest common substring
       // ending at first_string_[i] and second_string_[j].
       const int length = dp_common_string_[j] + 1;
       dp_values_to_affect.emplace_back(j + 1, length);

       // Store newly-found longer matches.
       if (length > match.length) {
         match.pos_first_string = i - length + 1;
         match.pos_second_string = j - length + 1;
         match.length = length;
       }
     }
     // Update `dp_common_string_`.
     for (auto const& element : dp_values_to_erase) {
       dp_common_string_[element.first] = 0;
     }
     for (auto const& element : dp_values_to_affect) {
       dp_common_string_[element.first] = element.second;
     }
     std::swap(dp_values_to_erase, dp_values_to_affect);
   }
   // Erase temporary values in preparation for future calls.
   std::fill(dp_common_string_.begin(), dp_common_string_.end(), 0);

   return match;
 }

 Matches SequenceMatcher::GetMatchingBlocks() {
   if (!matching_blocks_.empty()) {
     return matching_blocks_;
   }

   // This queue contains a tuple of 4 integers that represent 2 substrings to
   // find the longest match in the following order: first_start, first_end,
   // second_start, second_end.
   std::queue<std::tuple<int, int, int, int>> queue_block;
   queue_block.emplace(0, first_string_.size(), 0, second_string_.size());

   // Find all matching blocks recursively. Prioritize longer blocks: Find the
   // longest matching block first, then recurse to the left and right into the
   // remaining as-yet unmatched sections of the two strings.
   while (!queue_block.empty()) {
     int first_start, first_end, second_start, second_end;
     std::tie(first_start, first_end, second_start, second_end) =
         queue_block.front();
     queue_block.pop();

     const Match match =
         FindLongestMatch(first_start, first_end, second_start, second_end);

     if (match.length > 0) {
       // Exclude matching blocks with whitespace only.
       const bool whitespace_only =
           match.length == 1 && first_string_[match.pos_first_string] == u' ';
       if (!whitespace_only) {
         matching_blocks_.push_back(match);
       }

       // Recurse left.
       if (first_start < match.pos_first_string &&
           second_start < match.pos_second_string) {
         queue_block.emplace(first_start, match.pos_first_string, second_start,
                             match.pos_second_string);
       }
       // Recurse right.
       if (match.pos_first_string + match.length < first_end &&
           match.pos_second_string + match.length < second_end) {
         queue_block.emplace(match.pos_first_string + match.length, first_end,
                             match.pos_second_string + match.length, second_end);
       }
     }
   }

   // Always store a final matching block. In case no matching blocks
   // were discovered above, this final matching block serves
   // the purpose of indicating that block matching has taken place.
   matching_blocks_.push_back(
       Match(first_string_.size(), second_string_.size(), 0));
   sort(matching_blocks_.begin(), matching_blocks_.end(), CompareMatches);
   return matching_blocks_;
 }

 double SequenceMatcher::Ratio(bool text_length_agnostic) {
   // Uses block matching to calculate ratio.
   if (block_matching_ratio_ < 0) {
     int sum_match = 0;
     const int query_size = first_string_.size();
     const int text_size = second_string_.size();

     int sum_length = query_size;
     // Text-length agnosticism is applied for long texts if
     // `text_length_agnostic` is true, but we still keep it not shorter
     // than the query length. Text length agnosticism is ignored if we have a
     // longer query than the text.
     if (text_length_agnostic && query_size < text_size) {
       int max_recognized_text_length =
           std::max(kTextAgnosticismSize, query_size);
       sum_length += std::min(text_size, max_recognized_text_length);
     } else {
       sum_length += text_size;
     }

     DCHECK_NE(sum_length, 0);
     const int num_blocks = GetMatchingBlocks().size();
     for (const auto& match : GetMatchingBlocks()) {
       sum_match += match.length;
     }
     // The last block is always a placeholder "empty" block, so subtract one.
     // And, allow for one "penalty-free" block, so subtract one again. Hence,
     // apply a penalty by using |num_blocks - 2|. Example:
     //
     // If num_blocks = 5, the actual number of matching blocks is 4. This
     // means there are 3 blocks in excess of 1.
     block_matching_ratio_ =
         2.0 * sum_match / sum_length *
         exp(-(num_blocks - 2) * num_matching_blocks_penalty_);
   }
   return block_matching_ratio_;
 }

 }  //  namespace ash::string_matching
	// Copyright 2019 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "chromeos/ash/components/string_matching/sequence_matcher.h"

	#include <algorithm>
	#include <cmath>
	#include <queue>

	#include "base/check_op.h"

	namespace ash::string_matching {

	namespace {

	using Match = SequenceMatcher::Match;
	using Matches = std::vector<Match>;

	// The maximum recognized text size if text-length agnosticism is applied.
	constexpr int kTextAgnosticismSize = 9;

	bool CompareMatches(const Match& m1, const Match& m2) {
	return m1.pos_first_string < m2.pos_first_string;
	}

	} // namespace

	SequenceMatcher::Match::Match() = default;
	SequenceMatcher::Match::Match(int pos_first, int pos_second, int len)
	: pos_first_string(pos_first), pos_second_string(pos_second), length(len) {
	DCHECK_GE(pos_first_string, 0);
	DCHECK_GE(pos_second_string, 0);
	DCHECK_GE(length, 0);
	}

	SequenceMatcher::SequenceMatcher(const std::u16string& first_string,
	const std::u16string& second_string,
	double num_matching_blocks_penalty)
	: first_string_(first_string),
	second_string_(second_string),
	num_matching_blocks_penalty_(num_matching_blocks_penalty),
	dp_common_string_(second_string.size() + 1, 0) {
	if (first_string_.empty() && second_string_.empty()) {
	block_matching_ratio_ = 0;
	}

	for (size_t i = 0; i < second_string_.size(); i++) {
	char_to_positions_[second_string_[i]].emplace_back(i);
	}
	}

	SequenceMatcher::~SequenceMatcher() = default;

	// Compute the longest common substring, with optimisations for:
	//
	// 1) Time: By pre-computing some letter positions (stored in
	// `char_to_positions_`.
	//
	// 2) Memory: Store only the latest row of the DP table (in
	// `dp_common_string_`).
	//
	// 3) Time: Fast-update `dp_common_string_`.
	Match SequenceMatcher::FindLongestMatch(int first_start,
	int first_end,
	int second_start,
	int second_end) {
	Match match(first_start, second_start, 0);

	// These two vectors are used for fast updating of `dp_common_string_`.
	// Only erase or update values which are known to have been changed.
	//
	// `dp_values_to_erase` contains the values which should be erased from
	// `dp_common_string_`.
	// `dp_values_to_affect` contains the values which should be updated in
	// `dp_common_string_`.
	std::vector<std::pair<int, int>> dp_values_to_erase;
	std::vector<std::pair<int, int>> dp_values_to_affect;

	// Outer loop: Iterate through the characters of `first_string`.
	// Keep up-to-date `dp_common_string_` (the latest row of the DP table).
	for (int i = first_start; i < first_end; i++) {
	dp_values_to_affect.clear();

	// Inner loop: Iterate through characters of `second_string`, where those
	// characters are equal to first_string_[i], and within range.
	for (auto j : char_to_positions_[first_string_[i]]) {
	if (j < second_start) {
	continue;
	}
	if (j >= second_end) {
	break;
	}
	// dp_common_string_[j + 1] is the length of the longest common substring
	// ending at first_string_[i] and second_string_[j].
	const int length = dp_common_string_[j] + 1;
	dp_values_to_affect.emplace_back(j + 1, length);

	// Store newly-found longer matches.
	if (length > match.length) {
	match.pos_first_string = i - length + 1;
	match.pos_second_string = j - length + 1;
	match.length = length;
	}
	}
	// Update `dp_common_string_`.
	for (auto const& element : dp_values_to_erase) {
	dp_common_string_[element.first] = 0;
	}
	for (auto const& element : dp_values_to_affect) {
	dp_common_string_[element.first] = element.second;
	}
	std::swap(dp_values_to_erase, dp_values_to_affect);
	}
	// Erase temporary values in preparation for future calls.
	std::fill(dp_common_string_.begin(), dp_common_string_.end(), 0);

	return match;
	}

	Matches SequenceMatcher::GetMatchingBlocks() {
	if (!matching_blocks_.empty()) {
	return matching_blocks_;
	}

	// This queue contains a tuple of 4 integers that represent 2 substrings to
	// find the longest match in the following order: first_start, first_end,
	// second_start, second_end.
	std::queue<std::tuple<int, int, int, int>> queue_block;
	queue_block.emplace(0, first_string_.size(), 0, second_string_.size());

	// Find all matching blocks recursively. Prioritize longer blocks: Find the
	// longest matching block first, then recurse to the left and right into the
	// remaining as-yet unmatched sections of the two strings.
	while (!queue_block.empty()) {
	int first_start, first_end, second_start, second_end;
	std::tie(first_start, first_end, second_start, second_end) =
	queue_block.front();
	queue_block.pop();

	const Match match =
	FindLongestMatch(first_start, first_end, second_start, second_end);

	if (match.length > 0) {
	// Exclude matching blocks with whitespace only.
	const bool whitespace_only =
	match.length == 1 && first_string_[match.pos_first_string] == u' ';
	if (!whitespace_only) {
	matching_blocks_.push_back(match);
	}

	// Recurse left.
	if (first_start < match.pos_first_string &&
	second_start < match.pos_second_string) {
	queue_block.emplace(first_start, match.pos_first_string, second_start,
	match.pos_second_string);
	}
	// Recurse right.
	if (match.pos_first_string + match.length < first_end &&
	match.pos_second_string + match.length < second_end) {
	queue_block.emplace(match.pos_first_string + match.length, first_end,
	match.pos_second_string + match.length, second_end);
	}
	}
	}

	// Always store a final matching block. In case no matching blocks
	// were discovered above, this final matching block serves
	// the purpose of indicating that block matching has taken place.
	matching_blocks_.push_back(
	Match(first_string_.size(), second_string_.size(), 0));
	sort(matching_blocks_.begin(), matching_blocks_.end(), CompareMatches);
	return matching_blocks_;
	}

	double SequenceMatcher::Ratio(bool text_length_agnostic) {
	// Uses block matching to calculate ratio.
	if (block_matching_ratio_ < 0) {
	int sum_match = 0;
	const int query_size = first_string_.size();
	const int text_size = second_string_.size();

	int sum_length = query_size;
	// Text-length agnosticism is applied for long texts if
	// `text_length_agnostic` is true, but we still keep it not shorter
	// than the query length. Text length agnosticism is ignored if we have a
	// longer query than the text.
	if (text_length_agnostic && query_size < text_size) {
	int max_recognized_text_length =
	std::max(kTextAgnosticismSize, query_size);
	sum_length += std::min(text_size, max_recognized_text_length);
	} else {
	sum_length += text_size;
	}

	DCHECK_NE(sum_length, 0);
	const int num_blocks = GetMatchingBlocks().size();
	for (const auto& match : GetMatchingBlocks()) {
	sum_match += match.length;
	}
	// The last block is always a placeholder "empty" block, so subtract one.
	// And, allow for one "penalty-free" block, so subtract one again. Hence,
	// apply a penalty by using \|num_blocks - 2\|. Example:
	//
	// If num_blocks = 5, the actual number of matching blocks is 4. This
	// means there are 3 blocks in excess of 1.
	block_matching_ratio_ =
	2.0 * sum_match / sum_length *
	exp(-(num_blocks - 2) * num_matching_blocks_penalty_);
	}
	return block_matching_ratio_;
	}

	} // namespace ash::string_matching