// blob: e33f3bee3609ce8fc35247b7b5e47b0e80031fe1 [file] [log] [blame]
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_
#define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_
#include "utils/base/integral_types.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib-common.h"
#include "utils/utf8/unilib-icu.h"
// Expands to `VAR()`, i.e. initializes VAR with an empty argument list.
// NOTE(review): presumably used by tests to construct a UniLib instance named
// VAR without extra constructor arguments -- confirm against the callers and
// against the other unilib backends, which may define this macro differently.
#define INIT_UNILIB_FOR_TESTING(VAR) VAR()
namespace libtextclassifier3 {
// Convenience layer on top of UniLibBase. It adds whole-text helpers (case
// conversion, case/digit checks, UTF8 validation) and exposes the
// namespace-level character-class predicates as member functions.
// NOTE(review): UniLibBase is expected to be provided by the included
// "utils/utf8/unilib-icu.h" -- confirm.
class UniLib : public UniLibBase {
 public:
  // Reuse the constructors of UniLibBase unchanged.
  using UniLibBase::UniLibBase;

  // Returns a copy of `text` in which every codepoint has been passed through
  // ToLower(), preserving the codepoint order.
  UnicodeText ToLowerText(const UnicodeText& text) const {
    UnicodeText lowered;
    for (const char32 cp : text) {
      lowered.push_back(ToLower(cp));
    }
    return lowered;
  }

  // Returns a copy of `text` in which every codepoint has been passed through
  // UniLibBase::ToUpper(), preserving the codepoint order.
  UnicodeText ToUpperText(const UnicodeText& text) const {
    UnicodeText uppered;
    for (const char32 cp : text) {
      uppered.push_back(UniLibBase::ToUpper(cp));
    }
    return uppered;
  }

  // Returns true iff IsLower() holds for every codepoint of `text`. A text
  // that yields no codepoints is reported as true.
  bool IsLowerText(const UnicodeText& text) const {
    for (const char32 cp : text) {
      if (!IsLower(cp)) {
        return false;
      }
    }
    return true;
  }

  // Returns true iff IsUpper() holds for every codepoint of `text`. A text
  // that yields no codepoints is reported as true.
  bool IsUpperText(const UnicodeText& text) const {
    for (const char32 cp : text) {
      if (!IsUpper(cp)) {
        return false;
      }
    }
    return true;
  }

  // Returns true iff IsDigit() holds for every codepoint of `text`. A text
  // that yields no codepoints is reported as true.
  bool IsDigits(const UnicodeText& text) const {
    for (const char32 cp : text) {
      if (!IsDigit(cp)) {
        return false;
      }
    }
    return true;
  }

  // The predicates below forward to the namespace-level functions of the same
  // name. The explicit libtextclassifier3:: qualification is required because
  // inside the class the member name hides the free function.

  // Forwards to libtextclassifier3::IsPercentage.
  bool IsPercentage(char32 codepoint) const {
    return libtextclassifier3::IsPercentage(codepoint);
  }

  // Forwards to libtextclassifier3::IsSlash.
  bool IsSlash(char32 codepoint) const {
    return libtextclassifier3::IsSlash(codepoint);
  }

  // Forwards to libtextclassifier3::IsMinus.
  bool IsMinus(char32 codepoint) const {
    return libtextclassifier3::IsMinus(codepoint);
  }

  // Forwards to libtextclassifier3::IsNumberSign.
  bool IsNumberSign(char32 codepoint) const {
    return libtextclassifier3::IsNumberSign(codepoint);
  }

  // Forwards to libtextclassifier3::IsDot.
  bool IsDot(char32 codepoint) const {
    return libtextclassifier3::IsDot(codepoint);
  }

  // Forwards to libtextclassifier3::IsApostrophe.
  bool IsApostrophe(char32 codepoint) const {
    return libtextclassifier3::IsApostrophe(codepoint);
  }

  // Forwards to libtextclassifier3::IsQuotation.
  bool IsQuotation(char32 codepoint) const {
    return libtextclassifier3::IsQuotation(codepoint);
  }

  // Forwards to libtextclassifier3::IsAmpersand.
  bool IsAmpersand(char32 codepoint) const {
    return libtextclassifier3::IsAmpersand(codepoint);
  }

  // Forwards to libtextclassifier3::IsLatinLetter.
  bool IsLatinLetter(char32 codepoint) const {
    return libtextclassifier3::IsLatinLetter(codepoint);
  }

  // Forwards to libtextclassifier3::IsArabicLetter.
  bool IsArabicLetter(char32 codepoint) const {
    return libtextclassifier3::IsArabicLetter(codepoint);
  }

  // Forwards to libtextclassifier3::IsCyrillicLetter.
  bool IsCyrillicLetter(char32 codepoint) const {
    return libtextclassifier3::IsCyrillicLetter(codepoint);
  }

  // Forwards to libtextclassifier3::IsChineseLetter.
  bool IsChineseLetter(char32 codepoint) const {
    return libtextclassifier3::IsChineseLetter(codepoint);
  }

  // Forwards to libtextclassifier3::IsJapaneseLetter.
  bool IsJapaneseLetter(char32 codepoint) const {
    return libtextclassifier3::IsJapaneseLetter(codepoint);
  }

  // Forwards to libtextclassifier3::IsKoreanLetter.
  bool IsKoreanLetter(char32 codepoint) const {
    return libtextclassifier3::IsKoreanLetter(codepoint);
  }

  // Forwards to libtextclassifier3::IsThaiLetter.
  bool IsThaiLetter(char32 codepoint) const {
    return libtextclassifier3::IsThaiLetter(codepoint);
  }

  // Forwards to libtextclassifier3::IsCJTletter.
  bool IsCJTletter(char32 codepoint) const {
    return libtextclassifier3::IsCJTletter(codepoint);
  }

  // Forwards to libtextclassifier3::IsLetter.
  bool IsLetter(char32 codepoint) const {
    return libtextclassifier3::IsLetter(codepoint);
  }

  // Returns true iff `text` passes UnicodeText's own structural check and the
  // codepoint count reported by Length() matches text.size_codepoints().
  // Returns false if Length() reports an error.
  bool IsValidUtf8(const UnicodeText& text) const {
    // Structural validity of the UTF8 bytes is checked first; nothing else is
    // evaluated when it fails.
    if (!text.is_valid()) {
      return false;
    }

    // Beyond structural validity, the text only counts as valid when ICU and
    // UnicodeText agree on the number of codepoints. The rest of the code
    // assumes that ICU indices and UnicodeText indices are interchangeable, so
    // a mismatch here would cause trouble later on.
    // NOTE: This is an approximation: equal counts do not prove that the
    // codepoints are aligned, but it should be enough for practical purposes.
    const StatusOr<int32> length_from_icu = Length(text);
    return length_from_icu.ok() &&
           length_from_icu.ValueOrDie() == text.size_codepoints();
  }
};
} // namespace libtextclassifier3
#endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_