annotator/translate/translate.cc - chromiumos/third_party/libtextclassifier - Git at Google

 // Copyright 2020 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //

 #include "annotator/translate/translate.h"

 #include <memory>

 #include "annotator/collections.h"
 #include "annotator/entity-data_generated.h"
 #include "annotator/types.h"
 #include "lang_id/lang-id-wrapper.h"
 #include "utils/base/logging.h"
 #include "utils/i18n/locale.h"
 #include "utils/utf8/unicodetext.h"
 #include "lang_id/lang-id.h"

 namespace libtextclassifier3 {

 bool TranslateAnnotator::ClassifyText(
     const UnicodeText& context, CodepointSpan selection_indices,
     const std::string& user_familiar_language_tags,
     ClassificationResult* classification_result) const {
   std::vector<TranslateAnnotator::LanguageConfidence> confidences;
   if (options_->algorithm() ==
       TranslateAnnotatorOptions_::Algorithm::Algorithm_BACKOFF) {
     if (options_->backoff_options() == nullptr) {
       TC3_LOG(WARNING) << "No backoff options specified. Returning.";
       return false;
     }
     confidences = BackoffDetectLanguages(context, selection_indices);
   }

   if (confidences.empty()) {
     return false;
   }

   std::vector<Locale> user_familiar_languages;
   if (!ParseLocales(user_familiar_language_tags, &user_familiar_languages)) {
     TC3_LOG(WARNING) << "Couldn't parse the user-understood languages.";
     return false;
   }
   if (user_familiar_languages.empty()) {
     TC3_VLOG(INFO) << "user_familiar_languages is not set, not suggesting "
                       "translate action.";
     return false;
   }
   bool user_can_understand_language_of_text = false;
   for (const Locale& locale : user_familiar_languages) {
     if (locale.Language() == confidences[0].language) {
       user_can_understand_language_of_text = true;
       break;
     }
   }

   if (!user_can_understand_language_of_text) {
     classification_result->collection = Collections::Translate();
     classification_result->score = options_->score();
     classification_result->priority_score = options_->priority_score();
     classification_result->serialized_entity_data =
         CreateSerializedEntityData(confidences);
     return true;
   }

   return false;
 }

 std::string TranslateAnnotator::CreateSerializedEntityData(
     const std::vector<TranslateAnnotator::LanguageConfidence>& confidences)
     const {
   EntityDataT entity_data;
   entity_data.translate.reset(new EntityData_::TranslateT());

   for (const LanguageConfidence& confidence : confidences) {
     EntityData_::Translate_::LanguagePredictionResultT*
         language_prediction_result =
             new EntityData_::Translate_::LanguagePredictionResultT();
     language_prediction_result->language_tag = confidence.language;
     language_prediction_result->confidence_score = confidence.confidence;
     entity_data.translate->language_prediction_results.emplace_back(
         language_prediction_result);
   }
   flatbuffers::FlatBufferBuilder builder;
   FinishEntityDataBuffer(builder, EntityData::Pack(builder, &entity_data));
   return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),
                      builder.GetSize());
 }

 std::vector<TranslateAnnotator::LanguageConfidence>
 TranslateAnnotator::BackoffDetectLanguages(
     const UnicodeText& context, CodepointSpan selection_indices) const {
   const float penalize_ratio = options_->backoff_options()->penalize_ratio();
   const int min_text_size = options_->backoff_options()->min_text_size();
   if (selection_indices.second - selection_indices.first < min_text_size &&
       penalize_ratio <= 0) {
     return {};
   }

   const UnicodeText entity =
       UnicodeText::Substring(context, selection_indices.first,
                              selection_indices.second, /*do_copy=*/false);
   const std::vector<std::pair<std::string, float>> lang_id_result =
       langid::GetPredictions(langid_model_, entity.data(), entity.size_bytes());

   const float more_text_score_ratio =
       1.0f - options_->backoff_options()->subject_text_score_ratio();
   std::vector<std::pair<std::string, float>> more_lang_id_results;
   if (more_text_score_ratio >= 0) {
     const UnicodeText entity_with_context = TokenAlignedSubstringAroundSpan(
         context, selection_indices, min_text_size);
     more_lang_id_results =
         langid::GetPredictions(langid_model_, entity_with_context.data(),
                                entity_with_context.size_bytes());
   }

   const float subject_text_score_ratio =
       options_->backoff_options()->subject_text_score_ratio();

   std::map<std::string, float> result_map;
   for (const auto& [language, score] : lang_id_result) {
     result_map[language] = subject_text_score_ratio * score;
   }
   for (const auto& [language, score] : more_lang_id_results) {
     result_map[language] += more_text_score_ratio * score * penalize_ratio;
   }

   std::vector<TranslateAnnotator::LanguageConfidence> result;
   result.reserve(result_map.size());
   for (const auto& [key, value] : result_map) {
     result.push_back({key, value});
   }

   std::sort(result.begin(), result.end(),
             [](TranslateAnnotator::LanguageConfidence& a,
                TranslateAnnotator::LanguageConfidence& b) {
               return a.confidence > b.confidence;
             });
   return result;
 }

 UnicodeText::const_iterator
 TranslateAnnotator::FindIndexOfNextWhitespaceOrPunctuation(
     const UnicodeText& text, int start_index, int direction) const {
   TC3_CHECK(direction == 1 || direction == -1);
   auto it = text.begin();
   std::advance(it, start_index);
   while (it > text.begin() && it < text.end()) {
     if (unilib_->IsWhitespace(*it) || unilib_->IsPunctuation(*it)) {
       break;
     }
     std::advance(it, direction);
   }
   return it;
 }

 UnicodeText TranslateAnnotator::TokenAlignedSubstringAroundSpan(
     const UnicodeText& text, CodepointSpan indices, int minimum_length) const {
   const int text_size_codepoints = text.size_codepoints();
   if (text_size_codepoints < minimum_length) {
     return UnicodeText(text, /*do_copy=*/false);
   }

   const int start = indices.first;
   const int end = indices.second;
   const int length = end - start;
   if (length >= minimum_length) {
     return UnicodeText::Substring(text, start, end, /*do_copy=*/false);
   }

   const int offset = (minimum_length - length) / 2;
   const int iter_start = std::max(
       0, std::min(start - offset, text_size_codepoints - minimum_length));
   const int iter_end =
       std::min(text_size_codepoints, iter_start + minimum_length);

   auto it_start = FindIndexOfNextWhitespaceOrPunctuation(text, iter_start, -1);
   const auto it_end = FindIndexOfNextWhitespaceOrPunctuation(text, iter_end, 1);

   // The it_start now points to whitespace/punctuation (unless it reached the
   // beginning of the string). So we'll move it one position forward to point to
   // the actual text.
   if (it_start != it_end && unilib_->IsWhitespace(*it_start)) {
     std::advance(it_start, 1);
   }

   return UnicodeText::Substring(it_start, it_end, /*do_copy=*/false);
 }

 }  // namespace libtextclassifier3
	// Copyright 2020 Google LLC
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//

	#include "annotator/translate/translate.h"

	#include <memory>

	#include "annotator/collections.h"
	#include "annotator/entity-data_generated.h"
	#include "annotator/types.h"
	#include "lang_id/lang-id-wrapper.h"
	#include "utils/base/logging.h"
	#include "utils/i18n/locale.h"
	#include "utils/utf8/unicodetext.h"
	#include "lang_id/lang-id.h"

	namespace libtextclassifier3 {

	bool TranslateAnnotator::ClassifyText(
	const UnicodeText& context, CodepointSpan selection_indices,
	const std::string& user_familiar_language_tags,
	ClassificationResult* classification_result) const {
	std::vector<TranslateAnnotator::LanguageConfidence> confidences;
	if (options_->algorithm() ==
	TranslateAnnotatorOptions_::Algorithm::Algorithm_BACKOFF) {
	if (options_->backoff_options() == nullptr) {
	TC3_LOG(WARNING) << "No backoff options specified. Returning.";
	return false;
	}
	confidences = BackoffDetectLanguages(context, selection_indices);
	}

	if (confidences.empty()) {
	return false;
	}

	std::vector<Locale> user_familiar_languages;
	if (!ParseLocales(user_familiar_language_tags, &user_familiar_languages)) {
	TC3_LOG(WARNING) << "Couldn't parse the user-understood languages.";
	return false;
	}
	if (user_familiar_languages.empty()) {
	TC3_VLOG(INFO) << "user_familiar_languages is not set, not suggesting "
	"translate action.";
	return false;
	}
	bool user_can_understand_language_of_text = false;
	for (const Locale& locale : user_familiar_languages) {
	if (locale.Language() == confidences[0].language) {
	user_can_understand_language_of_text = true;
	break;
	}
	}

	if (!user_can_understand_language_of_text) {
	classification_result->collection = Collections::Translate();
	classification_result->score = options_->score();
	classification_result->priority_score = options_->priority_score();
	classification_result->serialized_entity_data =
	CreateSerializedEntityData(confidences);
	return true;
	}

	return false;
	}

	std::string TranslateAnnotator::CreateSerializedEntityData(
	const std::vector<TranslateAnnotator::LanguageConfidence>& confidences)
	const {
	EntityDataT entity_data;
	entity_data.translate.reset(new EntityData_::TranslateT());

	for (const LanguageConfidence& confidence : confidences) {
	EntityData_::Translate_::LanguagePredictionResultT*
	language_prediction_result =
	new EntityData_::Translate_::LanguagePredictionResultT();
	language_prediction_result->language_tag = confidence.language;
	language_prediction_result->confidence_score = confidence.confidence;
	entity_data.translate->language_prediction_results.emplace_back(
	language_prediction_result);
	}
	flatbuffers::FlatBufferBuilder builder;
	FinishEntityDataBuffer(builder, EntityData::Pack(builder, &entity_data));
	return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),
	builder.GetSize());
	}

	std::vector<TranslateAnnotator::LanguageConfidence>
	TranslateAnnotator::BackoffDetectLanguages(
	const UnicodeText& context, CodepointSpan selection_indices) const {
	const float penalize_ratio = options_->backoff_options()->penalize_ratio();
	const int min_text_size = options_->backoff_options()->min_text_size();
	if (selection_indices.second - selection_indices.first < min_text_size &&
	penalize_ratio <= 0) {
	return {};
	}

	const UnicodeText entity =
	UnicodeText::Substring(context, selection_indices.first,
	selection_indices.second, /do_copy=/false);
	const std::vector<std::pair<std::string, float>> lang_id_result =
	langid::GetPredictions(langid_model_, entity.data(), entity.size_bytes());

	const float more_text_score_ratio =
	1.0f - options_->backoff_options()->subject_text_score_ratio();
	std::vector<std::pair<std::string, float>> more_lang_id_results;
	if (more_text_score_ratio >= 0) {
	const UnicodeText entity_with_context = TokenAlignedSubstringAroundSpan(
	context, selection_indices, min_text_size);
	more_lang_id_results =
	langid::GetPredictions(langid_model_, entity_with_context.data(),
	entity_with_context.size_bytes());
	}

	const float subject_text_score_ratio =
	options_->backoff_options()->subject_text_score_ratio();

	std::map<std::string, float> result_map;
	for (const auto& [language, score] : lang_id_result) {
	result_map[language] = subject_text_score_ratio * score;
	}
	for (const auto& [language, score] : more_lang_id_results) {
	result_map[language] += more_text_score_ratio * score * penalize_ratio;
	}

	std::vector<TranslateAnnotator::LanguageConfidence> result;
	result.reserve(result_map.size());
	for (const auto& [key, value] : result_map) {
	result.push_back({key, value});
	}

	std::sort(result.begin(), result.end(),
	[](TranslateAnnotator::LanguageConfidence& a,
	TranslateAnnotator::LanguageConfidence& b) {
	return a.confidence > b.confidence;
	});
	return result;
	}

	UnicodeText::const_iterator
	TranslateAnnotator::FindIndexOfNextWhitespaceOrPunctuation(
	const UnicodeText& text, int start_index, int direction) const {
	TC3_CHECK(direction == 1 \|\| direction == -1);
	auto it = text.begin();
	std::advance(it, start_index);
	while (it > text.begin() && it < text.end()) {
	if (unilib_->IsWhitespace(it) \|\| unilib_->IsPunctuation(it)) {
	break;
	}
	std::advance(it, direction);
	}
	return it;
	}

	UnicodeText TranslateAnnotator::TokenAlignedSubstringAroundSpan(
	const UnicodeText& text, CodepointSpan indices, int minimum_length) const {
	const int text_size_codepoints = text.size_codepoints();
	if (text_size_codepoints < minimum_length) {
	return UnicodeText(text, /do_copy=/false);
	}

	const int start = indices.first;
	const int end = indices.second;
	const int length = end - start;
	if (length >= minimum_length) {
	return UnicodeText::Substring(text, start, end, /do_copy=/false);
	}

	const int offset = (minimum_length - length) / 2;
	const int iter_start = std::max(
	0, std::min(start - offset, text_size_codepoints - minimum_length));
	const int iter_end =
	std::min(text_size_codepoints, iter_start + minimum_length);

	auto it_start = FindIndexOfNextWhitespaceOrPunctuation(text, iter_start, -1);
	const auto it_end = FindIndexOfNextWhitespaceOrPunctuation(text, iter_end, 1);

	// The it_start now points to whitespace/punctuation (unless it reached the
	// beginning of the string). So we'll move it one position forward to point to
	// the actual text.
	if (it_start != it_end && unilib_->IsWhitespace(*it_start)) {
	std::advance(it_start, 1);
	}

	return UnicodeText::Substring(it_start, it_end, /do_copy=/false);
	}

	} // namespace libtextclassifier3