// blob: 9183f9d5141ab48a3609b73452eb14ca7e5a996b [file] [log] [blame]
// Copyright 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chromeos/components/string_matching/term_break_iterator.h"
#include <ostream>
#include "base/check.h"
#include "base/i18n/char_iterator.h"
#include "base/notreached.h"
#include "base/strings/string_util.h"
#include "third_party/icu/source/common/unicode/uchar.h"
namespace chromeos {
namespace string_matching {
// Constructs an iterator over |word|, positioned at offset 0 in STATE_START.
// |prev_| starts out as the npos sentinel, which GetCurrentTerm() DCHECKs
// against.
// NOTE(review): |iter_| is handed the address of the caller's |word|, so
// |word| presumably has to outlive this object -- confirm against the header.
TermBreakIterator::TermBreakIterator(const base::string16& word)
: word_(word),
prev_(npos),
pos_(0),
iter_(new base::i18n::UTF16CharIterator(&word)),
state_(STATE_START) {}
// Defaulted out of line. NOTE(review): presumably so the header can
// forward-declare base::i18n::UTF16CharIterator instead of including its
// definition -- confirm against the header.
TermBreakIterator::~TermBreakIterator() = default;
bool TermBreakIterator::Advance() {
// 2D matrix that defines term boundaries. Each row represents current state.
// Each col represents new state from input char. Cells with true value
// represents a term boundary.
const bool kBoundary[][STATE_LAST] = {
// START NUMBER UPPER LOWER CHAR
{false, false, false, false, false}, // START
{false, false, true, true, true}, // NUMBER
{false, true, false, false, true}, // UPPER
{false, true, true, false, true}, // LOWER
{false, true, true, true, false}, // CHAR
};
while (iter_->Advance()) {
const State new_state = GetNewState(word_[iter_->array_pos()]);
const bool is_boundary = kBoundary[state_][new_state];
state_ = new_state;
if (is_boundary)
break;
}
prev_ = pos_;
pos_ = iter_->array_pos();
return prev_ != pos_ || !iter_->end();
}
// Returns a copy of the characters of |word_| in [|prev_|, |pos_|), i.e. the
// range recorded by the most recent Advance(). Neither offset may still be
// the npos sentinel; this is enforced with a DCHECK.
const base::string16 TermBreakIterator::GetCurrentTerm() const {
  DCHECK(prev_ != npos && pos_ != npos);
  const auto term_length = pos_ - prev_;
  return word_.substr(prev_, term_length);
}
// Classifies the UTF-16 code unit |ch| into the state used by the boundary
// table in Advance():
//   - ASCII digits, '.' and ','            -> STATE_NUMBER
//   - only u_isUUppercase() reports true   -> STATE_UPPER
//   - only u_isULowercase() reports true   -> STATE_LOWER
//   - everything else                      -> STATE_CHAR
TermBreakIterator::State TermBreakIterator::GetNewState(base::char16 ch) {
  const bool looks_numeric = base::IsAsciiDigit(ch) || ch == '.' || ch == ',';
  if (looks_numeric)
    return STATE_NUMBER;

  const bool has_upper = u_isUUppercase(ch) != 0;
  const bool has_lower = u_isULowercase(ch) != 0;

  // Exactly one case property set: that property decides the state.
  if (has_upper != has_lower)
    return has_upper ? STATE_UPPER : STATE_LOWER;

  // Both properties set at once is not expected; flag it and fall through to
  // the caseless result, which is also the result when neither is set.
  if (has_upper) {
    NOTREACHED() << "Invalid state for ch=" << ch;
  }
  return STATE_CHAR;
}
} // namespace string_matching
} // namespace chromeos