| // Copyright (c) 2019 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef CHROMEOS_COMPONENTS_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_ |
| #define CHROMEOS_COMPONENTS_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_ |
| |
| #include "base/gtest_prod_util.h" |
| #include "chromeos/components/string_matching/tokenized_string.h" |
| #include "ui/gfx/range/range.h" |
| |
| namespace chromeos { |
| namespace string_matching { |
| |
| // FuzzyTokenizedStringMatch takes two tokenized strings: one as the text and |
| // the other one as the query. It matches the query against the text, |
| // calculates a relevance score between [0, 1] and marks the matched portions |
| // of text ("hits"). |
| // |
| // A relevance of zero means the two strings are completely different to each |
| // other. The higher the relevance score, the better the two strings are |
| // matched. Matched portions of text are stored as index ranges. |
| // |
| // TODO(crbug.com/1018613): each of these functions have too many input params, |
| // we should revise the structure and remove unnecessary ones. |
| // |
| // TODO(crbug.com/1336160): Terminology (for example: relevance vs. ratio) is |
| // confusing and could be clarified. |
| class FuzzyTokenizedStringMatch { |
| public: |
| typedef std::vector<gfx::Range> Hits; |
| |
| FuzzyTokenizedStringMatch(); |
| |
| FuzzyTokenizedStringMatch(const FuzzyTokenizedStringMatch&) = delete; |
| FuzzyTokenizedStringMatch& operator=(const FuzzyTokenizedStringMatch&) = |
| delete; |
| |
| ~FuzzyTokenizedStringMatch(); |
| |
| // TODO(crbug.com/1336160): The Ratio() methods are called in sequence under |
| // certain conditions, and trigger much computation. These could potentially |
| // be streamlined or compressed. |
| |
| // TokenSetRatio takes two sets of tokens, finds their intersection and |
| // differences. From the intersection and differences, it rewrites the |query| |
| // and |text| and find the similarity ratio between them. This function |
| // assumes that TokenizedString is already normalized (converted to lower |
| // case). Duplicate tokens will be removed for ratio computation. The return |
| // score is in range [0, 1]. |
| static double TokenSetRatio(const TokenizedString& query, |
| const TokenizedString& text, |
| bool partial, |
| double partial_match_penalty_rate, |
| bool use_edit_distance, |
| double num_matching_blocks_penalty); |
| |
| // TokenSortRatio takes two set of tokens, sorts them and find the similarity |
| // between two sorted strings. This function assumes that TokenizedString is |
| // already normalized (converted to lower case). The return score is in range |
| // [0, 1]. |
| static double TokenSortRatio(const TokenizedString& query, |
| const TokenizedString& text, |
| bool partial, |
| double partial_match_penalty_rate, |
| bool use_edit_distance, |
| double num_matching_blocks_penalty); |
| |
| // Finds the best ratio of shorter text with a part of longer text. |
| // This function assumes that TokenizedString is already normalized (converted |
| // to lower case). The return score is in range of [0, 1]. |
| static double PartialRatio(const std::u16string& query, |
| const std::u16string& text, |
| double partial_match_penalty_rate, |
| bool use_edit_distance, |
| double num_matching_blocks_penalty); |
| |
| // Combines scores from different ratio functions. This function assumes that |
| // TokenizedString is already normalized (converted to lower cases). |
| // The return score is in range of [0, 1]. |
| static double WeightedRatio(const TokenizedString& query, |
| const TokenizedString& text, |
| double partial_match_penalty_rate, |
| bool use_edit_distance, |
| double num_matching_blocks_penalty); |
| // TODO(crbug.com/1336160): Should prefix match always be favored over other |
| // matches? Reconsider this principle. |
| // |
| // Since prefix match should always be favored over other matches, this |
| // function is dedicated to calculate a prefix match score in range of [0, 1] |
| // using PrefixMatcher class. |
| // This score has two components: first character match and whole prefix |
| // match. |
| static double PrefixMatcher(const TokenizedString& query, |
| const TokenizedString& text); |
| |
| // TODO(crbug.com/1336160): Consider refactoring this method to directly |
| // return the relevance score, instead of a bool. This would remove the need |
| // to subsequently call relevance(). |
| // |
| // Calculates the relevance score of |query| relative to |text|. |
| // |
| // Returns true if two strings are somewhat matched, i.e. relevance score is |
| // greater than |relevance_threshold|. The actual relevance score may |
| // subsequently be retrieved by calling relevance(). |
| bool IsRelevant(const TokenizedString& query, |
| const TokenizedString& text, |
| double relevance_threshold, |
| bool use_weighted_ratio, |
| bool use_edit_distance, |
| double partial_match_penalty_rate, |
| double num_matching_blocks_penalty = 0.0); |
| double relevance() const { return relevance_; } |
| const Hits& hits() const { return hits_; } |
| |
| private: |
| // Score in range of [0,1] representing how well the query matches the text. |
| double relevance_ = 0; |
| Hits hits_; |
| }; |
| |
| } // namespace string_matching |
| } // namespace chromeos |
| |
| #endif // CHROMEOS_COMPONENTS_STRING_MATCHING_FUZZY_TOKENIZED_STRING_MATCH_H_ |