blob: 41fbddb427bdadb46dd045cf25830ab2889b9e70 [file] [log] [blame]
// Copyright 2019 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "autocomplete_match_classification.h"
#include <string>
#include <string_view>
#include "base/i18n/case_conversion.h"
#include "base/strings/string_util.h"
#include "components/omnibox/browser/autocomplete_match.h"
#include "components/omnibox/browser/in_memory_url_index_types.h"
#include "components/omnibox/browser/scored_history_match.h"
#include "in_memory_url_index_types.h"
namespace {
std::u16string clean(std::u16string_view text) {
const size_t kMaxTextLength = 2000;
return base::i18n::ToLower(text.substr(0, kMaxTextLength));
}
} // namespace
ACMatchClassifications ClassifyAllMatchesInString(
const std::u16string& find_text,
const std::u16string& text,
const bool text_is_search_query,
const ACMatchClassifications& original_class) {
DCHECK(!find_text.empty());
if (text.empty()) {
return original_class;
}
TermMatches term_matches = FindTermMatches(find_text, text);
ACMatchClassifications classifications;
if (text_is_search_query) {
classifications = ClassifyTermMatches(term_matches, text.size(),
ACMatchClassification::NONE,
ACMatchClassification::MATCH);
} else {
classifications = ClassifyTermMatches(term_matches, text.size(),
ACMatchClassification::MATCH,
ACMatchClassification::NONE);
}
return AutocompleteMatch::MergeClassifications(original_class,
classifications);
}
TermMatches FindTermMatches(std::u16string_view find_text,
std::u16string_view text,
bool allow_prefix_matching,
bool allow_mid_word_matching) {
std::u16string find_text_str = clean(find_text);
std::u16string text_str = clean(text);
// Some international characters become multiple characters when converting
// case. E.g. Armenian և is 1 character lowercase, but 2 characters Եվ
// uppercased. Turkish has examples of the opposite, where the lowercasing can
// increase length. If the string length changes when lowercased, term match
// indexes will be off when being used to style the original-cased `text`.
// This will cause either `DCHECK` crashes if the incorrect index is out of
// bounds; or incorrect styling if the incorrect index remains in bounds or
// `DCHECK`s are disabled. E.g. input 'ou' would bold 'Yo[uT]ube' if lower
// case 'Y' is 2 characters.
if (find_text_str.size() != find_text.size() ||
text_str.size() != text.size()) {
return {};
}
if (find_text_str.empty()) {
return {};
}
if (allow_prefix_matching &&
base::StartsWith(text_str, find_text_str, base::CompareCase::SENSITIVE)) {
return {{0, 0, find_text_str.length()}};
}
String16Vector find_terms =
String16VectorFromString16(find_text_str, nullptr);
WordStarts word_starts;
// `word_starts` is unused if `allow_mid_word_matching` is true.
if (!allow_mid_word_matching) {
String16VectorFromString16(text_str, &word_starts);
}
return FindTermMatchesForTerms(find_terms, WordStarts(find_terms.size(), 0),
text_str, word_starts,
allow_mid_word_matching);
}
TermMatches FindTermMatchesForTerms(const String16Vector& find_terms,
const WordStarts& find_terms_word_starts,
const std::u16string& cleaned_text,
const WordStarts& text_word_starts,
bool allow_mid_word_matching) {
TermMatches matches = MatchTermsInString(find_terms, cleaned_text);
matches = SortMatches(matches);
matches = DeoverlapMatches(matches);
if (allow_mid_word_matching)
return matches;
return ScoredHistoryMatch::FilterTermMatchesByWordStarts(
matches, find_terms_word_starts, text_word_starts, 0, std::string::npos);
}
ACMatchClassifications ClassifyTermMatches(const TermMatches& matches,
size_t text_length,
int match_style,
int non_match_style) {
ACMatchClassifications classes;
if (matches.empty()) {
if (text_length)
classes.push_back(ACMatchClassification(0, non_match_style));
return classes;
}
if (matches[0].offset)
classes.push_back(ACMatchClassification(0, non_match_style));
size_t match_count = matches.size();
for (size_t i = 0; i < match_count;) {
size_t offset = matches[i].offset;
classes.push_back(ACMatchClassification(offset, match_style));
// Skip all adjacent matches.
do {
offset += matches[i].length;
++i;
} while ((i < match_count) && (offset == matches[i].offset));
if (offset < text_length)
classes.push_back(ACMatchClassification(offset, non_match_style));
}
return classes;
}