// blob: ba7f0e1b814936ef5a50dceefc74d4ea8c09a68c [file] [log] [blame]
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "utils/utf8/unilib-icu.h"
#include <functional>
#include <utility>
#include "utils/base/logging.h"
#include "utils/base/statusor.h"
#include "utils/utf8/unilib-common.h"
#include "unicode/unistr.h"
#include "unicode/utext.h"
namespace libtextclassifier3 {
// Parses `text` as a 32-bit integer by forwarding to ParseInt. The parsed
// value is written through `result`; the return value is ParseInt's outcome.
bool UniLibBase::ParseInt32(const UnicodeText& text, int32* result) const {
  const bool parsed = ParseInt(text, result);
  return parsed;
}
// Parses `text` as a 64-bit integer by forwarding to ParseInt. The parsed
// value is written through `result`; the return value is ParseInt's outcome.
bool UniLibBase::ParseInt64(const UnicodeText& text, int64* result) const {
  const bool parsed = ParseInt(text, result);
  return parsed;
}
bool UniLibBase::ParseDouble(const UnicodeText& text, double* result) const {
UErrorCode status = U_ZERO_ERROR;
std::unique_ptr<UNumberFormat, std::function<void(UNumberFormat*)>>
format_alias(
unum_open(/*style=*/UNUM_DECIMAL, /*pattern=*/nullptr,
/*patternLength=*/0, /*locale=*/"en_US_POSIX",
/*parseErr=*/nullptr, &status),
[](UNumberFormat* format_alias) { unum_close(format_alias); });
if (U_FAILURE(status)) {
return false;
}
auto it_dot = text.begin();
for (; it_dot != text.end() && !IsDot(*it_dot); it_dot++) {
}
int64 integer_part;
if (!ParseInt(UnicodeText::Substring(text.begin(), it_dot, /*do_copy=*/false),
&integer_part)) {
return false;
}
int64 fractional_part = 0;
if (it_dot != text.end()) {
if (!ParseInt(
UnicodeText::Substring(++it_dot, text.end(), /*do_copy=*/false),
&fractional_part)) {
return false;
}
}
double factional_part_double = fractional_part;
while (factional_part_double >= 1) {
factional_part_double /= 10;
}
*result = integer_part + factional_part_double;
return true;
}
// Returns true when ICU reports the Bidi_Paired_Bracket_Type of `codepoint`
// as "open".
bool UniLibBase::IsOpeningBracket(char32 codepoint) const {
  const auto bracket_type =
      u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE);
  return bracket_type == U_BPT_OPEN;
}
// Returns true when ICU reports the Bidi_Paired_Bracket_Type of `codepoint`
// as "close".
bool UniLibBase::IsClosingBracket(char32 codepoint) const {
  const auto bracket_type =
      u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE);
  return bracket_type == U_BPT_CLOSE;
}
// Returns the result of ICU's u_isWhitespace() for `codepoint`.
bool UniLibBase::IsWhitespace(char32 codepoint) const {
  const bool is_whitespace = u_isWhitespace(codepoint);
  return is_whitespace;
}
// Returns the result of ICU's u_isdigit() for `codepoint`.
bool UniLibBase::IsDigit(char32 codepoint) const {
  const bool is_digit = u_isdigit(codepoint);
  return is_digit;
}
// Returns the result of ICU's u_islower() for `codepoint`.
bool UniLibBase::IsLower(char32 codepoint) const {
  const bool is_lower = u_islower(codepoint);
  return is_lower;
}
// Returns the result of ICU's u_isupper() for `codepoint`.
bool UniLibBase::IsUpper(char32 codepoint) const {
  const bool is_upper = u_isupper(codepoint);
  return is_upper;
}
// Returns the result of ICU's u_ispunct() for `codepoint`.
bool UniLibBase::IsPunctuation(char32 codepoint) const {
  const bool is_punctuation = u_ispunct(codepoint);
  return is_punctuation;
}
// Maps `codepoint` through ICU's u_tolower() and returns the result.
char32 UniLibBase::ToLower(char32 codepoint) const {
  const char32 lowered = u_tolower(codepoint);
  return lowered;
}
// Maps `codepoint` through ICU's u_toupper() and returns the result.
char32 UniLibBase::ToUpper(char32 codepoint) const {
  const char32 uppered = u_toupper(codepoint);
  return uppered;
}
// Returns whatever ICU's u_getBidiPairedBracket() yields for `codepoint`.
char32 UniLibBase::GetPairedBracket(char32 codepoint) const {
  const char32 paired = u_getBidiPairedBracket(codepoint);
  return paired;
}
// Converts the UTF-8 bytes of `text` into an ICU string and returns the value
// of countChar32() on it.
StatusOr<int32> UniLibBase::Length(const UnicodeText& text) const {
  const icu::UnicodeString converted =
      icu::UnicodeString::fromUTF8({text.data(), text.size_bytes()});
  return converted.countChar32();
}
// Takes ownership of `text` and asks `pattern` for an ICU matcher over it.
// If ICU reports a failure, the matcher is dropped so that `matcher_` is null.
// The cached find position starts at zero and is marked dirty.
UniLibBase::RegexMatcher::RegexMatcher(icu::RegexPattern* pattern,
                                       icu::UnicodeString text)
    : text_(std::move(text)),
      last_find_offset_(0),
      last_find_offset_codepoints_(0),
      last_find_offset_dirty_(true) {
  UErrorCode icu_status = U_ZERO_ERROR;
  matcher_.reset(pattern->matcher(text_, icu_status));
  if (U_SUCCESS(icu_status)) {
    return;
  }
  // Creation failed: do not keep whatever ICU handed back.
  matcher_.reset(nullptr);
}
// Stores a copy of the pattern text. Unless `lazy` is set, the pattern is
// compiled right away; otherwise compilation is left to a later call of
// LockedInitializeIfNotAlready().
UniLibBase::RegexPattern::RegexPattern(const UnicodeText& pattern, bool lazy)
    : initialized_(false),
      initialization_failure_(false),
      pattern_text_(pattern) {
  if (lazy) {
    return;
  }
  LockedInitializeIfNotAlready();
}
void UniLibBase::RegexPattern::LockedInitializeIfNotAlready() const {
std::lock_guard<std::mutex> guard(mutex_);
if (initialized_ || initialization_failure_) {
return;
}
UErrorCode status = U_ZERO_ERROR;
pattern_.reset(icu::RegexPattern::compile(
icu::UnicodeString::fromUTF8(
icu::StringPiece(pattern_text_.data(), pattern_text_.size_bytes())),
/*flags=*/UREGEX_MULTILINE, status));
if (U_FAILURE(status) || pattern_ == nullptr) {
initialization_failure_ = true;
pattern_.reset();
return;
}
initialized_ = true;
pattern_text_.clear(); // We don't need this anymore.
}
std::unique_ptr<UniLibBase::RegexMatcher> UniLibBase::RegexPattern::Matcher(
const UnicodeText& input) const {
LockedInitializeIfNotAlready(); // Possibly lazy initialization.
if (initialization_failure_) {
return nullptr;
}
return std::unique_ptr<UniLibBase::RegexMatcher>(new UniLibBase::RegexMatcher(
pattern_.get(), icu::UnicodeString::fromUTF8(
icu::StringPiece(input.data(), input.size_bytes()))));
}
// Namespace-scope definitions of the RegexMatcher status constants; their
// values come from the in-class declarations (not shown in this file). Before
// C++17 such a definition is needed whenever a static constexpr member is
// odr-used.
constexpr int UniLibBase::RegexMatcher::kError;
constexpr int UniLibBase::RegexMatcher::kNoError;
bool UniLibBase::RegexMatcher::Matches(int* status) const {
if (!matcher_) {
*status = kError;
return false;
}
UErrorCode icu_status = U_ZERO_ERROR;
const bool result = matcher_->matches(/*startIndex=*/0, icu_status);
if (U_FAILURE(icu_status)) {
*status = kError;
return false;
}
*status = kNoError;
return result;
}
bool UniLibBase::RegexMatcher::ApproximatelyMatches(int* status) {
if (!matcher_) {
*status = kError;
return false;
}
matcher_->reset();
*status = kNoError;
if (!Find(status) || *status != kNoError) {
return false;
}
const int found_start = Start(status);
if (*status != kNoError) {
return false;
}
const int found_end = End(status);
if (*status != kNoError) {
return false;
}
if (found_start != 0 || found_end != text_.countChar32()) {
return false;
}
return true;
}
bool UniLibBase::RegexMatcher::UpdateLastFindOffset() const {
if (!last_find_offset_dirty_) {
return true;
}
// Update the position of the match.
UErrorCode icu_status = U_ZERO_ERROR;
const int find_offset = matcher_->start(0, icu_status);
if (U_FAILURE(icu_status)) {
return false;
}
last_find_offset_codepoints_ +=
text_.countChar32(last_find_offset_, find_offset - last_find_offset_);
last_find_offset_ = find_offset;
last_find_offset_dirty_ = false;
return true;
}
bool UniLibBase::RegexMatcher::Find(int* status) {
if (!matcher_) {
*status = kError;
return false;
}
UErrorCode icu_status = U_ZERO_ERROR;
const bool result = matcher_->find(icu_status);
if (U_FAILURE(icu_status)) {
*status = kError;
return false;
}
last_find_offset_dirty_ = true;
*status = kNoError;
return result;
}
// Start offset of the whole match (group 0); see the two-argument overload.
int UniLibBase::RegexMatcher::Start(int* status) const {
  const int whole_match_start = Start(/*group_idx=*/0, status);
  return whole_match_start;
}
// Returns the start of capture group `group_idx` in the current match,
// expressed as a codepoint offset into the text.
//
// On failure (no matcher, stale cache that cannot be refreshed, or an ICU
// error) sets `*status` to kError and returns kError. Otherwise sets `*status`
// to kNoError and returns the offset, or -1 if ICU reports -1 for the group.
int UniLibBase::RegexMatcher::Start(int group_idx, int* status) const {
  const bool ready = matcher_ && UpdateLastFindOffset();
  if (!ready) {
    *status = kError;
    return kError;
  }

  UErrorCode icu_status = U_ZERO_ERROR;
  const int group_start = matcher_->start(group_idx, icu_status);
  if (U_FAILURE(icu_status)) {
    *status = kError;
    return kError;
  }
  *status = kNoError;

  // A group that did not participate in the match is reported as -1, which
  // cannot be fed into the cached-offset arithmetic below.
  if (group_start == -1) {
    return -1;
  }

  // Count only the codepoints between the cached match start and the group.
  const int codepoints_past_match_start =
      text_.countChar32(/*start=*/last_find_offset_,
                        /*length=*/group_start - last_find_offset_);
  return last_find_offset_codepoints_ + codepoints_past_match_start;
}
// End offset of the whole match (group 0); see the two-argument overload.
int UniLibBase::RegexMatcher::End(int* status) const {
  const int whole_match_end = End(/*group_idx=*/0, status);
  return whole_match_end;
}
// Returns the end of capture group `group_idx` in the current match,
// expressed as a codepoint offset into the text.
//
// On failure (no matcher, stale cache that cannot be refreshed, or an ICU
// error) sets `*status` to kError and returns kError. Otherwise sets `*status`
// to kNoError and returns the offset, or -1 if ICU reports -1 for the group.
int UniLibBase::RegexMatcher::End(int group_idx, int* status) const {
  const bool ready = matcher_ && UpdateLastFindOffset();
  if (!ready) {
    *status = kError;
    return kError;
  }

  UErrorCode icu_status = U_ZERO_ERROR;
  const int group_end = matcher_->end(group_idx, icu_status);
  if (U_FAILURE(icu_status)) {
    *status = kError;
    return kError;
  }
  *status = kNoError;

  // A group that did not participate in the match is reported as -1, which
  // cannot be fed into the cached-offset arithmetic below.
  if (group_end == -1) {
    return -1;
  }

  // Count only the codepoints between the cached match start and the group end.
  const int codepoints_past_match_start =
      text_.countChar32(/*start=*/last_find_offset_,
                        /*length=*/group_end - last_find_offset_);
  return last_find_offset_codepoints_ + codepoints_past_match_start;
}
// Text of the whole match (group 0); see the two-argument overload.
UnicodeText UniLibBase::RegexMatcher::Group(int* status) const {
  UnicodeText whole_match = Group(/*group_idx=*/0, status);
  return whole_match;
}
// Returns the text captured by group `group_idx` in the current match as a
// UnicodeText that owns a copy of its UTF-8 bytes.
//
// If there is no matcher or ICU reports a failure, sets `*status` to kError
// and returns an empty text; otherwise sets `*status` to kNoError.
UnicodeText UniLibBase::RegexMatcher::Group(int group_idx, int* status) const {
  if (matcher_ == nullptr) {
    *status = kError;
    return UTF8ToUnicodeText("", /*do_copy=*/false);
  }

  UErrorCode group_status = U_ZERO_ERROR;
  const icu::UnicodeString captured = matcher_->group(group_idx, group_status);
  if (!U_SUCCESS(group_status)) {
    *status = kError;
    return UTF8ToUnicodeText("", /*do_copy=*/false);
  }

  // Re-encode ICU's UTF-16 string as UTF-8; the local buffer is temporary, so
  // the returned text has to copy it.
  std::string captured_utf8 = "";
  captured.toUTF8String(captured_utf8);
  *status = kNoError;
  return UTF8ToUnicodeText(captured_utf8, /*do_copy=*/true);
}
// Namespace-scope definition of the constant Next() returns when iteration is
// finished; its value comes from the in-class declaration (not shown in this
// file).
constexpr int UniLibBase::BreakIterator::kDone;
// Converts `text` to an ICU string and sets up an ICU word break iterator for
// the "en" locale over it. If ICU cannot create the iterator,
// `break_iterator_` is left null.
UniLibBase::BreakIterator::BreakIterator(const UnicodeText& text)
    : text_(icu::UnicodeString::fromUTF8(
          icu::StringPiece(text.data(), text.size_bytes()))),
      last_break_index_(0),
      last_unicode_index_(0) {
  icu::ErrorCode creation_status;
  break_iterator_.reset(icu::BreakIterator::createWordInstance(
      icu::Locale("en"), creation_status));
  if (creation_status.isSuccess()) {
    break_iterator_->setText(text_);
  } else {
    // Creation failed: do not keep whatever ICU handed back.
    break_iterator_.reset();
  }
}
// Advances to the next word boundary and returns its position as a codepoint
// offset into the text. Returns kDone when there are no more boundaries, or
// when the underlying ICU iterator could not be created.
int UniLibBase::BreakIterator::Next() {
  // The constructor leaves `break_iterator_` null if ICU failed to create the
  // iterator; treat that as an exhausted iteration instead of dereferencing it.
  if (!break_iterator_) {
    return BreakIterator::kDone;
  }

  const int break_index = break_iterator_->next();
  if (break_index == icu::BreakIterator::DONE) {
    return BreakIterator::kDone;
  }

  // Only the stretch since the previous boundary is counted, and the running
  // codepoint total is carried forward.
  last_unicode_index_ +=
      text_.countChar32(last_break_index_, break_index - last_break_index_);
  last_break_index_ = break_index;
  return last_unicode_index_;
}
// Creates a RegexPattern for `regex` using the constructor's default `lazy`
// argument.
std::unique_ptr<UniLibBase::RegexPattern> UniLibBase::CreateRegexPattern(
    const UnicodeText& regex) const {
  std::unique_ptr<UniLibBase::RegexPattern> pattern(
      new UniLibBase::RegexPattern(regex));
  return pattern;
}
// Creates a RegexPattern for `regex` with `lazy` set, so the pattern is not
// compiled inside the constructor.
std::unique_ptr<UniLibBase::RegexPattern> UniLibBase::CreateLazyRegexPattern(
    const UnicodeText& regex) const {
  std::unique_ptr<UniLibBase::RegexPattern> pattern(
      new UniLibBase::RegexPattern(regex, /*lazy=*/true));
  return pattern;
}
// Creates a BreakIterator over `text`.
std::unique_ptr<UniLibBase::BreakIterator> UniLibBase::CreateBreakIterator(
    const UnicodeText& text) const {
  std::unique_ptr<UniLibBase::BreakIterator> iterator(
      new UniLibBase::BreakIterator(text));
  return iterator;
}
} // namespace libtextclassifier3