// blob: 893a911d20c19608f202e3176c32758e5aa75157 [file] [log] [blame]
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "annotator/translate/translate.h"

#include <algorithm>
#include <iterator>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "annotator/collections.h"
#include "annotator/entity-data_generated.h"
#include "annotator/types.h"
#include "lang_id/lang-id-wrapper.h"
#include "lang_id/lang-id.h"
#include "utils/base/logging.h"
#include "utils/i18n/locale.h"
#include "utils/utf8/unicodetext.h"
namespace libtextclassifier3 {
bool TranslateAnnotator::ClassifyText(
const UnicodeText& context, CodepointSpan selection_indices,
const std::string& user_familiar_language_tags,
ClassificationResult* classification_result) const {
std::vector<TranslateAnnotator::LanguageConfidence> confidences;
if (options_->algorithm() ==
TranslateAnnotatorOptions_::Algorithm::Algorithm_BACKOFF) {
if (options_->backoff_options() == nullptr) {
TC3_LOG(WARNING) << "No backoff options specified. Returning.";
return false;
}
confidences = BackoffDetectLanguages(context, selection_indices);
}
if (confidences.empty()) {
return false;
}
std::vector<Locale> user_familiar_languages;
if (!ParseLocales(user_familiar_language_tags, &user_familiar_languages)) {
TC3_LOG(WARNING) << "Couldn't parse the user-understood languages.";
return false;
}
if (user_familiar_languages.empty()) {
TC3_VLOG(INFO) << "user_familiar_languages is not set, not suggesting "
"translate action.";
return false;
}
bool user_can_understand_language_of_text = false;
for (const Locale& locale : user_familiar_languages) {
if (locale.Language() == confidences[0].language) {
user_can_understand_language_of_text = true;
break;
}
}
if (!user_can_understand_language_of_text) {
classification_result->collection = Collections::Translate();
classification_result->score = options_->score();
classification_result->priority_score = options_->priority_score();
classification_result->serialized_entity_data =
CreateSerializedEntityData(confidences);
return true;
}
return false;
}
// Serializes the language predictions into an EntityData flatbuffer.
//
// Each entry of `confidences` becomes one LanguagePredictionResult (language
// tag + confidence score), in the same order as the input. Returns the
// finished buffer's bytes as a std::string.
std::string TranslateAnnotator::CreateSerializedEntityData(
    const std::vector<TranslateAnnotator::LanguageConfidence>& confidences)
    const {
  EntityDataT entity_data;
  entity_data.translate = std::make_unique<EntityData_::TranslateT>();

  auto& prediction_results = entity_data.translate->language_prediction_results;
  prediction_results.reserve(confidences.size());
  for (const LanguageConfidence& confidence : confidences) {
    // Own the element through a smart pointer from the moment it is created,
    // so it cannot leak if growing the vector fails.
    auto prediction =
        std::make_unique<EntityData_::Translate_::LanguagePredictionResultT>();
    prediction->language_tag = confidence.language;
    prediction->confidence_score = confidence.confidence;
    prediction_results.emplace_back(std::move(prediction));
  }

  flatbuffers::FlatBufferBuilder builder;
  FinishEntityDataBuffer(builder, EntityData::Pack(builder, &entity_data));
  return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),
                     builder.GetSize());
}
std::vector<TranslateAnnotator::LanguageConfidence>
TranslateAnnotator::BackoffDetectLanguages(
const UnicodeText& context, CodepointSpan selection_indices) const {
const float penalize_ratio = options_->backoff_options()->penalize_ratio();
const int min_text_size = options_->backoff_options()->min_text_size();
if (selection_indices.second - selection_indices.first < min_text_size &&
penalize_ratio <= 0) {
return {};
}
const UnicodeText entity =
UnicodeText::Substring(context, selection_indices.first,
selection_indices.second, /*do_copy=*/false);
const std::vector<std::pair<std::string, float>> lang_id_result =
langid::GetPredictions(langid_model_, entity.data(), entity.size_bytes());
const float more_text_score_ratio =
1.0f - options_->backoff_options()->subject_text_score_ratio();
std::vector<std::pair<std::string, float>> more_lang_id_results;
if (more_text_score_ratio >= 0) {
const UnicodeText entity_with_context = TokenAlignedSubstringAroundSpan(
context, selection_indices, min_text_size);
more_lang_id_results =
langid::GetPredictions(langid_model_, entity_with_context.data(),
entity_with_context.size_bytes());
}
const float subject_text_score_ratio =
options_->backoff_options()->subject_text_score_ratio();
std::map<std::string, float> result_map;
for (const auto& [language, score] : lang_id_result) {
result_map[language] = subject_text_score_ratio * score;
}
for (const auto& [language, score] : more_lang_id_results) {
result_map[language] += more_text_score_ratio * score * penalize_ratio;
}
std::vector<TranslateAnnotator::LanguageConfidence> result;
result.reserve(result_map.size());
for (const auto& [key, value] : result_map) {
result.push_back({key, value});
}
std::sort(result.begin(), result.end(),
[](TranslateAnnotator::LanguageConfidence& a,
TranslateAnnotator::LanguageConfidence& b) {
return a.confidence > b.confidence;
});
return result;
}
// Starting at codepoint `start_index` of `text`, walks in `direction`
// (+1 forward, -1 backward) and returns an iterator to the first codepoint
// that is whitespace or punctuation, including the starting one.
//
// If no such codepoint is found, a forward search returns text.end() and a
// backward search returns text.begin() (the codepoint at begin() itself is
// not inspected in that case). A search that starts at text.end() returns
// text.end() in either direction.
//
// `start_index` must lie within [0, number of codepoints in `text`].
UnicodeText::const_iterator
TranslateAnnotator::FindIndexOfNextWhitespaceOrPunctuation(
    const UnicodeText& text, int start_index, int direction) const {
  TC3_CHECK(direction == 1 || direction == -1);
  auto it = text.begin();
  std::advance(it, start_index);

  const auto is_boundary = [this](char32 codepoint) {
    return unilib_->IsWhitespace(codepoint) ||
           unilib_->IsPunctuation(codepoint);
  };

  if (direction == 1) {
    // Only text.end() bounds a forward scan. A begin() guard here would make
    // a search starting at index 0 return immediately without scanning.
    while (it != text.end() && !is_boundary(*it)) {
      ++it;
    }
  } else {
    // Backward scan: stop at begin() without inspecting it, and never
    // dereference end().
    while (it != text.begin() && it != text.end() && !is_boundary(*it)) {
      --it;
    }
  }
  return it;
}
// Returns a non-copying view of `text` that covers roughly `minimum_length`
// codepoints around the span `indices`, widened outward to the nearest
// whitespace/punctuation found by FindIndexOfNextWhitespaceOrPunctuation.
//
// - If `text` has fewer than `minimum_length` codepoints, the whole text is
//   returned.
// - If the span is already at least `minimum_length` long, exactly the span is
//   returned.
UnicodeText TranslateAnnotator::TokenAlignedSubstringAroundSpan(
    const UnicodeText& text, CodepointSpan indices, int minimum_length) const {
  const int total_codepoints = text.size_codepoints();
  if (minimum_length > total_codepoints) {
    return UnicodeText(text, /*do_copy=*/false);
  }

  const int span_begin = indices.first;
  const int span_end = indices.second;
  const int span_length = span_end - span_begin;
  if (span_length >= minimum_length) {
    return UnicodeText::Substring(text, span_begin, span_end,
                                  /*do_copy=*/false);
  }

  // Place a window of `minimum_length` codepoints around the span, splitting
  // the extra room evenly, then clamp the window into the text.
  const int padding = (minimum_length - span_length) / 2;
  int window_begin = span_begin - padding;
  if (total_codepoints - minimum_length < window_begin) {
    window_begin = total_codepoints - minimum_length;
  }
  if (window_begin < 0) {
    window_begin = 0;
  }
  int window_end = window_begin + minimum_length;
  if (total_codepoints < window_end) {
    window_end = total_codepoints;
  }

  auto substring_begin =
      FindIndexOfNextWhitespaceOrPunctuation(text, window_begin, -1);
  const auto substring_end =
      FindIndexOfNextWhitespaceOrPunctuation(text, window_end, 1);

  // The backward search stops on a whitespace/punctuation codepoint unless it
  // reached the start of the text. When it stopped on whitespace, step past it
  // so the result begins on actual text; punctuation is left in place.
  const bool starts_on_whitespace = substring_begin != substring_end &&
                                    unilib_->IsWhitespace(*substring_begin);
  if (starts_on_whitespace) {
    ++substring_begin;
  }
  return UnicodeText::Substring(substring_begin, substring_end,
                                /*do_copy=*/false);
}
} // namespace libtextclassifier3