blob: d9e123b0dcf07c51d1204d6abd07dd1f3e353a19 [file] [log] [blame]
// Copyright 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROMEOS_COMPONENTS_STRING_MATCHING_TERM_BREAK_ITERATOR_H_
#define CHROMEOS_COMPONENTS_STRING_MATCHING_TERM_BREAK_ITERATOR_H_
#include <stddef.h>
#include <memory>
#include <string>
namespace base {
namespace i18n {
class UTF16CharIterator;
}
} // namespace base
namespace chromeos {
namespace string_matching {
// TermBreakIterator breaks terms out of a word. Terms are broken on
// camel case boundaries and alpha/number boundaries. Numbers are defined
// as [0-9\.,]+.
// e.g.
// CamelCase -> Camel, Case
// Python2.7 -> Python, 2.7
//
// The iterator holds a reference to the word it walks (it does not copy it)
// and is neither copy-constructible nor copy-assignable.
// NOTE(review): presumably Advance() has to be called once before the first
// term is available -- confirm against the implementation.
class TermBreakIterator {
public:
// Note that |word| must outlive this iterator: it is stored by reference in
// |word_|, not copied.
explicit TermBreakIterator(const std::u16string& word);
TermBreakIterator(const TermBreakIterator&) = delete;
TermBreakIterator& operator=(const TermBreakIterator&) = delete;
// Only declared here. base::i18n::UTF16CharIterator is merely
// forward-declared in this header, so the destructor of the std::unique_ptr
// member |iter_| presumably has to be instantiated in the source file where
// that type is complete.
~TermBreakIterator();
// Advance to the next term. Returns false if at the end of the word.
bool Advance();
// Returns the current term, which is the substr of |word_| in range
// [prev_, pos_). The term is returned by value, i.e. as a new string rather
// than a view into |word_|.
const std::u16string GetCurrentTerm() const;
// Start offset (inclusive) of the current term within |word_|.
size_t prev() const { return prev_; }
// End offset (exclusive) of the current term within |word_|.
size_t pos() const { return pos_; }
// The largest value representable by size_t.
// NOTE(review): presumably the "no position" sentinel for prev()/pos() --
// confirm against the implementation.
static const size_t npos = static_cast<size_t>(-1);
private:
// Classification of the character the iterator is currently looking at.
enum State {
STATE_START, // Initial state
STATE_NUMBER, // Current char is a number [0-9\.,].
STATE_UPPER, // Current char is upper case.
STATE_LOWER, // Current char is lower case.
STATE_CHAR, // Current char has no case, e.g. a cjk char.
STATE_LAST, // NOTE(review): presumably a count/end marker -- confirm.
};
// Returns new state for given |ch|.
State GetNewState(char16_t ch);
// The word being broken into terms. Held by reference; not owned.
const std::u16string& word_;
// Start offset (inclusive) of the current term in |word_|.
size_t prev_;
// End offset (exclusive) of the current term in |word_|.
size_t pos_;
// Owned character iterator.
// NOTE(review): presumably iterates over |word_| -- confirm in the
// constructor.
std::unique_ptr<base::i18n::UTF16CharIterator> iter_;
// Current State value tracked by the iterator.
State state_;
};
} // namespace string_matching
} // namespace chromeos
#endif // CHROMEOS_COMPONENTS_STRING_MATCHING_TERM_BREAK_ITERATOR_H_