chromeos/ash/components/string_matching/sequence_matcher.h - chromium/src - Git at Google

 // Copyright 2019 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_
 #define CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_

 #include <string>
 #include <unordered_map>
 #include <vector>

 namespace ash::string_matching {

 namespace {

 constexpr double kNumMatchingBlocksPenalty = 0.1;
 constexpr bool kUseTextLengthAgnosticism = false;

 }  // namespace

 // Performs the calculation of similarity level between 2 strings. This class's
 // functionality is inspired by python's difflib.SequenceMatcher library.
 // (https://docs.python.org/2/library/difflib.html#difflib.SequenceMatcher)
 //
 // TODO(crbug.com/1336160): The unused edit distance pathway has been removed.
 // Consider coming up with a new edit-distance method having the awareness of
 // string structures. (e.g., awareness of block matching and token break
 // locations).
 class SequenceMatcher {
  public:
   // Representing a common substring between `first_string_` and
   // `second_string_`.
   struct Match {
     Match();
     Match(int pos_first, int pos_second, int len);
     // Starting position of the common substring in `first_string_`.
     int pos_first_string;
     // Starting position of the common substring in `second_string_`.
     int pos_second_string;
     // Length of the common substring.
     int length;
   };

   // `num_matching_blocks_penalty` is used to penalize too many small matching
   // blocks. For the same number of matching characters, we prefer fewer
   // matching blocks. Value equal to 0 means no penalty. Values greater than 0
   // means heavier penalty will be applied to larger number of blocks.
   SequenceMatcher(
       const std::u16string& first_string,
       const std::u16string& second_string,
       double num_matching_blocks_penalty = kNumMatchingBlocksPenalty);

   SequenceMatcher(const SequenceMatcher&) = delete;
   SequenceMatcher& operator=(const SequenceMatcher&) = delete;

   ~SequenceMatcher();

   // Calculates similarity ratio of `first_string_` and `second_string_`.
   //
   // In the actual string matching in launcher searches, we will input with the
   // query as `first_string_` and the text as `second_string_`. As the query is
   // likely to be shorter than the text in most cases, we would like to
   // ignore/lower the influence of the amounts of any remaining unmatched
   // portions of the `second_string_` onto the ratio (i.e., "text-length
   // agnosticism").
   //
   // Thus, We will trim the text length if it is too long and
   // `text_length_agnostic` is true, and it only works for the block
   // matching algorithm.
   double Ratio(bool text_length_agnostic = kUseTextLengthAgnosticism);
   // Finds the longest common substring between
   // `first_string_[first_start:first_end]` and
   // `second_string_[second_start:second_end]`. Used by
   // GetMatchingBlocks().
   Match FindLongestMatch(int first_start,
                          int first_end,
                          int second_start,
                          int second_end);
   // Get all matching blocks of `first_string_` and `second_string_`.
   // All blocks will be sorted by their starting position within
   // `first_string_`.
   //
   // The last matching block will always be:
   //
   //   Match(first_string_.size(), second_string_.size(), 0).
   //
   // This is to cover the case where two strings have no matching blocks,
   // so that we have something to store in `matching_blocks_` to indicate
   // that matching has taken place.
   std::vector<Match> GetMatchingBlocks();

  private:
   std::u16string first_string_;
   std::u16string second_string_;
   double num_matching_blocks_penalty_;
   double block_matching_ratio_ = -1.0;
   std::vector<Match> matching_blocks_;

   // For each character `c` in `second_string_`, this variable
   // `char_to_positions_` stores all positions where `c` occurs in
   // `second_string_`.
   std::unordered_map<char, std::vector<int>> char_to_positions_;
   // Memory for dynamic programming algorithm used in FindLongestMatch().
   std::vector<int> dp_common_string_;
 };

 }  // namespace ash::string_matching

 #endif  // CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_
	// Copyright 2019 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#ifndef CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_
	#define CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_

	#include <string>
	#include <unordered_map>
	#include <vector>

	namespace ash::string_matching {

	namespace {

	constexpr double kNumMatchingBlocksPenalty = 0.1;
	constexpr bool kUseTextLengthAgnosticism = false;

	} // namespace

	// Performs the calculation of similarity level between 2 strings. This class's
	// functionality is inspired by python's difflib.SequenceMatcher library.
	// (https://docs.python.org/2/library/difflib.html#difflib.SequenceMatcher)
	//
	// TODO(crbug.com/1336160): The unused edit distance pathway has been removed.
	// Consider coming up with a new edit-distance method having the awareness of
	// string structures. (e.g., awareness of block matching and token break
	// locations).
	class SequenceMatcher {
	public:
	// Representing a common substring between `first_string_` and
	// `second_string_`.
	struct Match {
	Match();
	Match(int pos_first, int pos_second, int len);
	// Starting position of the common substring in `first_string_`.
	int pos_first_string;
	// Starting position of the common substring in `second_string_`.
	int pos_second_string;
	// Length of the common substring.
	int length;
	};

	// `num_matching_blocks_penalty` is used to penalize too many small matching
	// blocks. For the same number of matching characters, we prefer fewer
	// matching blocks. Value equal to 0 means no penalty. Values greater than 0
	// means heavier penalty will be applied to larger number of blocks.
	SequenceMatcher(
	const std::u16string& first_string,
	const std::u16string& second_string,
	double num_matching_blocks_penalty = kNumMatchingBlocksPenalty);

	SequenceMatcher(const SequenceMatcher&) = delete;
	SequenceMatcher& operator=(const SequenceMatcher&) = delete;

	~SequenceMatcher();

	// Calculates similarity ratio of `first_string_` and `second_string_`.
	//
	// In the actual string matching in launcher searches, we will input with the
	// query as `first_string_` and the text as `second_string_`. As the query is
	// likely to be shorter than the text in most cases, we would like to
	// ignore/lower the influence of the amounts of any remaining unmatched
	// portions of the `second_string_` onto the ratio (i.e., "text-length
	// agnosticism").
	//
	// Thus, We will trim the text length if it is too long and
	// `text_length_agnostic` is true, and it only works for the block
	// matching algorithm.
	double Ratio(bool text_length_agnostic = kUseTextLengthAgnosticism);
	// Finds the longest common substring between
	// `first_string_[first_start:first_end]` and
	// `second_string_[second_start:second_end]`. Used by
	// GetMatchingBlocks().
	Match FindLongestMatch(int first_start,
	int first_end,
	int second_start,
	int second_end);
	// Get all matching blocks of `first_string_` and `second_string_`.
	// All blocks will be sorted by their starting position within
	// `first_string_`.
	//
	// The last matching block will always be:
	//
	// Match(first_string_.size(), second_string_.size(), 0).
	//
	// This is to cover the case where two strings have no matching blocks,
	// so that we have something to store in `matching_blocks_` to indicate
	// that matching has taken place.
	std::vector<Match> GetMatchingBlocks();

	private:
	std::u16string first_string_;
	std::u16string second_string_;
	double num_matching_blocks_penalty_;
	double block_matching_ratio_ = -1.0;
	std::vector<Match> matching_blocks_;

	// For each character `c` in `second_string_`, this variable
	// `char_to_positions_` stores all positions where `c` occurs in
	// `second_string_`.
	std::unordered_map<char, std::vector<int>> char_to_positions_;
	// Memory for dynamic programming algorithm used in FindLongestMatch().
	std::vector<int> dp_common_string_;
	};

	} // namespace ash::string_matching

	#endif // CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_