blob: 140e42feeaabc65f39f1b40e71996aa3da797aa1 [file] [log] [blame]
// Copyright (c) 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROMEOS_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_
#define CHROMEOS_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_
#include <string>
#include <unordered_map>
#include <vector>
#include "base/macros.h"
#include "base/strings/string16.h"
namespace chromeos {
namespace string_matching {
// Performs the calculation of similarity level between 2 strings. This class's
// functionality is inspired by python's difflib.SequenceMatcher library.
// (https://docs.python.org/2/library/difflib.html#difflib.SequenceMatcher)
class SequenceMatcher {
public:
// Representing a common substring between |first_string_| and
// |second_string_|.
struct Match {
Match();
Match(int pos_first, int pos_second, int len);
// Starting position of the common substring in |first_string_|.
int pos_first_string;
// Starting position of the common substring in |second_string_|.
int pos_second_string;
// Length of the common substring.
int length;
};
// |num_matching_blocks_penalty| is used to penalize too many small matching
// blocks. For the same number of matching characters, we prefer fewer
// matching blocks. Value equal to 0 means no penalty. Values greater than 0
// means heavier penalty will be applied to larger number of blocks. This is
// only appled if |use_edit_distance| is false.
SequenceMatcher(const base::string16& first_string,
const base::string16& second_string,
bool use_edit_distance,
double num_matching_blocks_penalty);
~SequenceMatcher() = default;
// Calculates similarity ratio of |first_string_| and |second_string_|.
double Ratio();
// Calculates the Damerau–Levenshtein distance between |first_string_| and
// |second_string_|.
// See https://en.wikipedia.org/wiki/Damerau–Levenshtein_distance for more
// details.
int EditDistance();
// Finds the longest common substring between
// |first_string_[first_start:first_end]| and
// |second_string_[second_start:second_end]|.
Match FindLongestMatch(int first_start,
int first_end,
int second_start,
int second_end);
// Get all matching blocks of |first_string_| and |second_string_|.
// All blocks will be sorted by starting position of them in the
// |first_string_|. The last matching block will always be
// Match(first_string_.size(), second_string_.size(), 0).
std::vector<Match> GetMatchingBlocks();
private:
base::string16 first_string_;
base::string16 second_string_;
double num_matching_blocks_penalty_ = 0.0;
double edit_distance_ratio_ = -1.0;
double block_matching_ratio_ = -1.0;
std::vector<Match> matching_blocks_;
// Controls whether to use edit distance to calculate ratio.
bool use_edit_distance_;
int edit_distance_ = -1;
// For each character |c| in |second_string_|, this variable
// |char_to_positions_| stores all positions where |c| occurs in
// |second_string_|.
std::unordered_map<char, std::vector<int>> char_to_positions_;
// Memory for dynamic programming algorithm used in FindLongestMatch().
std::vector<int> dp_common_string_;
DISALLOW_COPY_AND_ASSIGN(SequenceMatcher);
};
} // namespace string_matching
} // namespace chromeos
#endif // CHROMEOS_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_