| // Copyright 2020 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_ |
| #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_ |
| |
| #include "utils/base/integral_types.h" |
| #include "utils/utf8/unicodetext.h" |
| #include "utils/utf8/unilib-common.h" |
| |
| #include "utils/utf8/unilib-icu.h" |
| #define INIT_UNILIB_FOR_TESTING(VAR) VAR() |
| |
| namespace libtextclassifier3 { |
| |
| class UniLib : public UniLibBase { |
| public: |
| using UniLibBase::UniLibBase; |
| |
| // Lowercase a unicode string. |
| UnicodeText ToLowerText(const UnicodeText& text) const { |
| UnicodeText result; |
| for (const char32 codepoint : text) { |
| result.push_back(ToLower(codepoint)); |
| } |
| return result; |
| } |
| |
| // Uppercase a unicode string. |
| UnicodeText ToUpperText(const UnicodeText& text) const { |
| UnicodeText result; |
| for (const char32 codepoint : text) { |
| result.push_back(UniLibBase::ToUpper(codepoint)); |
| } |
| return result; |
| } |
| |
| bool IsLowerText(const UnicodeText& text) const { |
| for (const char32 codepoint : text) { |
| if (!IsLower(codepoint)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| bool IsUpperText(const UnicodeText& text) const { |
| for (const char32 codepoint : text) { |
| if (!IsUpper(codepoint)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| bool IsDigits(const UnicodeText& text) const { |
| for (const char32 codepoint : text) { |
| if (!IsDigit(codepoint)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| bool IsPercentage(char32 codepoint) const { |
| return libtextclassifier3::IsPercentage(codepoint); |
| } |
| |
| bool IsSlash(char32 codepoint) const { |
| return libtextclassifier3::IsSlash(codepoint); |
| } |
| |
| bool IsMinus(char32 codepoint) const { |
| return libtextclassifier3::IsMinus(codepoint); |
| } |
| |
| bool IsNumberSign(char32 codepoint) const { |
| return libtextclassifier3::IsNumberSign(codepoint); |
| } |
| |
| bool IsDot(char32 codepoint) const { |
| return libtextclassifier3::IsDot(codepoint); |
| } |
| |
| bool IsApostrophe(char32 codepoint) const { |
| return libtextclassifier3::IsApostrophe(codepoint); |
| } |
| |
| bool IsQuotation(char32 codepoint) const { |
| return libtextclassifier3::IsQuotation(codepoint); |
| } |
| |
| bool IsAmpersand(char32 codepoint) const { |
| return libtextclassifier3::IsAmpersand(codepoint); |
| } |
| |
| bool IsLatinLetter(char32 codepoint) const { |
| return libtextclassifier3::IsLatinLetter(codepoint); |
| } |
| |
| bool IsArabicLetter(char32 codepoint) const { |
| return libtextclassifier3::IsArabicLetter(codepoint); |
| } |
| |
| bool IsCyrillicLetter(char32 codepoint) const { |
| return libtextclassifier3::IsCyrillicLetter(codepoint); |
| } |
| |
| bool IsChineseLetter(char32 codepoint) const { |
| return libtextclassifier3::IsChineseLetter(codepoint); |
| } |
| |
| bool IsJapaneseLetter(char32 codepoint) const { |
| return libtextclassifier3::IsJapaneseLetter(codepoint); |
| } |
| |
| bool IsKoreanLetter(char32 codepoint) const { |
| return libtextclassifier3::IsKoreanLetter(codepoint); |
| } |
| |
| bool IsThaiLetter(char32 codepoint) const { |
| return libtextclassifier3::IsThaiLetter(codepoint); |
| } |
| |
| bool IsCJTletter(char32 codepoint) const { |
| return libtextclassifier3::IsCJTletter(codepoint); |
| } |
| |
| bool IsLetter(char32 codepoint) const { |
| return libtextclassifier3::IsLetter(codepoint); |
| } |
| |
| bool IsValidUtf8(const UnicodeText& text) const { |
| // Basic check of structural validity of UTF8. |
| if (!text.is_valid()) { |
| return false; |
| } |
| // In addition to that, we declare that a valid UTF8 is when the number of |
| // codepoints in the string as measured by ICU is the same as the number of |
| // codepoints as measured by UnicodeText. Because if we don't do this check, |
| // the indices might differ, and cause trouble, because the assumption |
| // throughout the code is that ICU indices and UnicodeText indices are the |
| // same. |
| // NOTE: This is not perfect, as this doesn't check the alignment of the |
| // codepoints, but for the practical purposes should be enough. |
| const StatusOr<int32> icu_length = Length(text); |
| if (!icu_length.ok()) { |
| return false; |
| } |
| |
| if (icu_length.ValueOrDie() != text.size_codepoints()) { |
| return false; |
| } |
| |
| return true; |
| } |
| }; |
| |
| } // namespace libtextclassifier3 |
| #endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_ |