blob: d4cf7a624ce9f8fbe7b8fae461e713bcb19c1bf4 [file] [log] [blame]
// Copyright 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chromeos/components/string_matching/tokenized_string.h"
#include <stddef.h>
#include "base/i18n/break_iterator.h"
#include "base/i18n/case_conversion.h"
#include "base/logging.h"
#include "base/notreached.h"
#include "base/strings/strcat.h"
#include "base/strings/string_util.h"
#include "chromeos/components/string_matching/term_break_iterator.h"
namespace chromeos {
namespace string_matching {
using base::i18n::BreakIterator;
// Stores a copy of |text| and tokenizes it right away. |mode| picks the
// tokenizer: Mode::kCamelCase runs Tokenize(), Mode::kWords runs
// TokenizeWords(). Any other mode value performs no tokenization.
TokenizedString::TokenizedString(const base::string16& text, Mode mode)
    : text_(text) {
  if (mode == Mode::kCamelCase) {
    Tokenize();
  } else if (mode == Mode::kWords) {
    TokenizeWords();
  }
}
// Defaulted destructor: no custom teardown is performed here.
TokenizedString::~TokenizedString() = default;
void TokenizedString::Tokenize() {
BreakIterator break_iter(text_, BreakIterator::BREAK_WORD);
if (!break_iter.Init()) {
NOTREACHED() << "BreakIterator init failed"
<< ", text=\"" << text_ << "\"";
return;
}
while (break_iter.Advance()) {
if (!break_iter.IsWord())
continue;
const base::string16 word(break_iter.GetString());
const size_t word_start = break_iter.prev();
TermBreakIterator term_iter(word);
while (term_iter.Advance()) {
tokens_.emplace_back(base::i18n::ToLower(term_iter.GetCurrentTerm()));
mappings_.emplace_back(word_start + term_iter.prev(),
word_start + term_iter.pos());
}
}
}
void TokenizedString::TokenizeWords() {
BreakIterator break_iter(text_, BreakIterator::BREAK_WORD);
if (!break_iter.Init()) {
NOTREACHED() << "BreakIterator init failed"
<< ", text=\"" << text_ << "\"";
return;
}
// The token to be generated will be in [start, end) of |text_|.
size_t start = 0;
size_t end = 0;
while (break_iter.Advance()) {
if (break_iter.IsWord()) {
// Update |end| but do not generate a token yet because the next segment
// after Advance may be a non-whitespace char. We may include the next
// char in the token.
end = break_iter.pos();
continue;
}
// If this is not a word, it may be a sequence of whitespace chars or
// another punctuation.
// 1. Whitespace chars only: generate a token from |text_| in the range of
// [start, end). Also reset |start| and |end| for next token.
// 2. A punctuation: do nothing and Advance.
const base::string16 word(break_iter.GetString());
const bool only_whitechars =
base::ContainsOnlyChars(word, base::kWhitespaceUTF16);
if (only_whitechars) {
if (end - start > 1) {
tokens_.emplace_back(
base::i18n::ToLower(text_.substr(start, end - start)));
mappings_.emplace_back(start, end);
}
start = break_iter.pos();
end = start;
}
}
// Generate the last token.
if (end - start > 1) {
tokens_.emplace_back(base::i18n::ToLower(text_.substr(start, end - start)));
mappings_.emplace_back(start, end);
}
}
} // namespace string_matching
} // namespace chromeos