blob: defbcbe77459c77e578f2bd531171cbe2a6c7e99 [file] [log] [blame]
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/live_caption/translation_util.h"
#include "base/strings/strcat.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "third_party/icu/source/common/unicode/brkiter.h"
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/common/unicode/uscript.h"
#include "third_party/re2/src/re2/re2.h"
#include "ui/base/l10n/l10n_util.h"
namespace captions {
std::vector<std::string> SplitSentences(const std::string& text,
const std::string& locale) {
std::vector<std::string> sentences;
UErrorCode status = U_ZERO_ERROR;
// Use icu::BreakIterator instead of base::i18n::BreakIterator to avoid flakey
// mid-string sentence breaks.
icu::BreakIterator* iter =
icu::BreakIterator::createSentenceInstance(locale.c_str(), status);
DCHECK(U_SUCCESS(status))
<< "ICU could not open a break iterator: " << u_errorName(status) << " ("
<< status << ")";
// Set the text to be analyzed.
icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(text);
iter->setText(unicode_text);
// Iterate over the sentences.
int32_t start = iter->first();
int32_t end = iter->next();
while (end != icu::BreakIterator::DONE) {
icu::UnicodeString sentence;
unicode_text.extractBetween(start, end, sentence);
std::string sentence_string;
sentence.toUTF8String(sentence_string);
sentences.emplace_back(sentence_string);
start = end;
end = iter->next();
}
delete iter;
return sentences;
}
bool ContainsTrailingSpace(const std::string& str) {
return !str.empty() && base::IsAsciiWhitespace(str.back());
}
std::string RemoveTrailingSpace(const std::string& str) {
if (ContainsTrailingSpace(str)) {
return str.substr(0, str.length() - 1);
}
return str;
}
std::string RemovePunctuationToLower(std::string str) {
re2::RE2::GlobalReplace(&str, "[[:punct:]]", "");
return base::ToLowerASCII(str);
}
std::string GetTranslationCacheKey(const std::string& source_language,
const std::string& target_language,
const std::string& transcription) {
return base::StrCat({source_language, target_language, "|",
RemovePunctuationToLower(transcription)});
}
bool IsIdeographicLocale(const std::string& locale) {
// Retrieve the script codes used by the given language from ICU. When the
// given language consists of two or more scripts, we just use the first
// script. The size of returned script codes is always < 8. Therefore, we use
// an array of size 8 so we can include all script codes without insufficient
// buffer errors.
UErrorCode error = U_ZERO_ERROR;
UScriptCode script_code[8];
int scripts = uscript_getCode(locale.c_str(), script_code,
std::size(script_code), &error);
return U_SUCCESS(error) && scripts >= 1 &&
(script_code[0] == USCRIPT_HAN || script_code[0] == USCRIPT_HIRAGANA ||
script_code[0] == USCRIPT_YI || script_code[0] == USCRIPT_KATAKANA);
}
TranslationCache::TranslationCache() = default;
TranslationCache::~TranslationCache() = default;
std::pair<std::string, std::string>
TranslationCache::FindCachedTranslationOrRemaining(
const std::string& transcript,
const std::string& source_language,
const std::string& target_language) const {
std::vector<std::string> sentences =
SplitSentences(transcript, source_language);
std::string cached_translation;
std::string string_to_translate;
bool cached_translation_found = true;
for (const std::string& sentence : sentences) {
if (cached_translation_found) {
std::string trailing_space =
ContainsTrailingSpace(sentence)
? sentence.substr(sentence.length() - 1, sentence.length())
: std::string();
auto translation_cache_key = GetTranslationCacheKey(
source_language, target_language,
trailing_space.empty() ? sentence : RemoveTrailingSpace(sentence));
auto iter = translation_cache_.find(translation_cache_key);
if (iter != translation_cache_.end()) {
cached_translation += iter->second;
if (!trailing_space.empty()) {
cached_translation += trailing_space;
}
continue;
}
cached_translation_found = false;
}
string_to_translate = base::StrCat({string_to_translate, sentence});
}
if (cached_translation_found) {
return std::make_pair("", cached_translation);
} else {
return std::make_pair(string_to_translate, cached_translation);
}
}
void TranslationCache::InsertIntoCache(
const std::string& original_transcription,
const std::string& result,
const std::string& source_language,
const std::string& target_language) {
auto original_sentences =
SplitSentences(original_transcription, source_language);
auto translated_sentences = SplitSentences(result, target_language);
if (original_sentences.size() > 1 &&
original_sentences.size() == translated_sentences.size()) {
for (size_t i = 0; i < original_sentences.size() - 1; i++) {
// Sentences are always cached without the trailing space.
std::string sentence = RemoveTrailingSpace(original_sentences[i]);
translation_cache_.insert(
{GetTranslationCacheKey(source_language, target_language, sentence),
RemoveTrailingSpace(translated_sentences[i])});
}
}
}
void TranslationCache::Clear() {
translation_cache_.clear();
}
} // namespace captions