// blob: a93155675a721a852ea9027bd7b91cdc1684d20e [file] [log] [blame]
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/translate/core/language_detection/language_detection_util.h"
#include "base/strings/string16.h"
#include "base/strings/utf_string_conversions.h"
#include "components/translate/core/common/translate_constants.h"
#include "testing/gtest/include/gtest/gtest.h"
// Fixture name shared by every test in this file; it adds nothing on top of
// the plain gtest fixture.
using LanguageDetectionUtilTest = testing::Test;
// Tests that well-known language code typos are fixed.
TEST_F(LanguageDetectionUtilTest, LanguageCodeTypoCorrection) {
  // Runs the corrector on a private copy of the given code and hands back the
  // result, so that each expectation below reads as "input -> output".
  const auto corrected = [](std::string code) {
    translate::CorrectLanguageCodeTypo(&code);
    return code;
  };

  // The second and later codes of a comma-separated list are stripped.
  EXPECT_EQ("ja", corrected("ja,en"));

  // An underscore separator is replaced with a hyphen.
  EXPECT_EQ("ja-JP", corrected("ja_JP"));

  // Wrong letter cases are corrected.
  EXPECT_EQ("ja-JP", corrected("JA-jp"));
}
// Tests that well-formed language codes are accepted and malformed ones are
// rejected.
TEST_F(LanguageDetectionUtilTest, IsValidLanguageCode) {
  // Passes the validator a local std::string built from the literal, the same
  // kind of argument a caller holding a std::string variable would pass.
  const auto is_valid = [](std::string code) {
    return translate::IsValidLanguageCode(code);
  };

  // Accepted: two- or three-letter main code, optionally followed by a
  // two-letter sub code.
  EXPECT_TRUE(is_valid("ja"));
  EXPECT_TRUE(is_valid("ja-JP"));
  EXPECT_TRUE(is_valid("ceb"));
  EXPECT_TRUE(is_valid("ceb-XX"));

  // Rejected: the sub code consists of a number.
  EXPECT_FALSE(is_valid("utf-8"));

  // Rejected: six characters follow the hyphen.
  EXPECT_FALSE(is_valid("ja-YUKARI"));

  // Rejected: the main code has four characters.
  EXPECT_FALSE(is_valid("DHMO"));
}
// Tests that the similar-language table works.
TEST_F(LanguageDetectionUtilTest, SimilarLanguageCode) {
  using translate::IsSameOrSimilarLanguages;

  // Identical codes match; unrelated codes do not.
  EXPECT_TRUE(IsSameOrSimilarLanguages("en", "en"));
  EXPECT_FALSE(IsSameOrSimilarLanguages("en", "ja"));

  // Language codes are the same if their main parts are the same. Synonyms
  // should be taken into account (ex: 'iw' and 'he').
  EXPECT_TRUE(IsSameOrSimilarLanguages("sr-ME", "sr"));
  EXPECT_TRUE(IsSameOrSimilarLanguages("sr", "sr-ME"));
  EXPECT_TRUE(IsSameOrSimilarLanguages("he", "he-IL"));
  EXPECT_TRUE(IsSameOrSimilarLanguages("eng", "eng-US"));
  EXPECT_TRUE(IsSameOrSimilarLanguages("eng-US", "eng"));
  EXPECT_FALSE(IsSameOrSimilarLanguages("eng", "enm"));

  // Even though the main parts differ, some special language pairs are
  // recognized as the same language.
  EXPECT_TRUE(IsSameOrSimilarLanguages("bs", "hr"));
  EXPECT_TRUE(IsSameOrSimilarLanguages("ne", "hi"));
  EXPECT_FALSE(IsSameOrSimilarLanguages("bs", "hi"));
}
// Tests that well-known languages which often have a wrong server
// configuration are handled.
TEST_F(LanguageDetectionUtilTest, WellKnownWrongConfiguration) {
  using translate::MaybeServerWrongConfiguration;

  // Pairs expected to be flagged as a possible server misconfiguration.
  EXPECT_TRUE(MaybeServerWrongConfiguration("en", "ja"));
  EXPECT_TRUE(MaybeServerWrongConfiguration("en-US", "ja"));
  EXPECT_TRUE(MaybeServerWrongConfiguration("en", "zh-CN"));

  // Pairs expected not to be flagged, including the first flagged pair with
  // its arguments swapped.
  EXPECT_FALSE(MaybeServerWrongConfiguration("ja", "en"));
  EXPECT_FALSE(MaybeServerWrongConfiguration("en", "he"));
}
// Tests that the language meta tag providing wrong information is ignored by
// LanguageDetectionUtil due to disagreement between meta tag and CLD.
TEST_F(LanguageDetectionUtilTest, CLDDisagreeWithWrongLanguageCode) {
  // The meta tag claims 'ja' while the body text is written in English.
  base::string16 contents = base::ASCIIToUTF16(
      "<html><head><meta http-equiv='Content-Language' content='ja'></head>"
      "<body>This is a page apparently written in English. Even though "
      "content-language is provided, the value will be ignored if the value "
      "is suspicious.</body></html>");
  std::string cld_language;
  // Start from false so that the EXPECT_TRUE below fails deterministically,
  // instead of reading an indeterminate value, should the out-parameter ever
  // be left unwritten.
  bool is_cld_reliable = false;
  std::string language = translate::DeterminePageLanguage(std::string("ja"),
                                                          std::string(),
                                                          contents,
                                                          &cld_language,
                                                          &is_cld_reliable);
  // The page language is reported as unknown, while CLD's own result is still
  // returned through the out-parameters.
  EXPECT_EQ(translate::kUnknownLanguageCode, language);
  EXPECT_EQ("en", cld_language);
  EXPECT_TRUE(is_cld_reliable);
}
// Tests that the language meta tag providing "en-US" style information is
// agreed by CLD.
TEST_F(LanguageDetectionUtilTest, CLDAgreeWithLanguageCodeHavingCountryCode) {
  // The meta tag carries a country code ('en-US') and the body text is
  // written in English.
  base::string16 contents = base::ASCIIToUTF16(
      "<html><head><meta http-equiv='Content-Language' content='en-US'></head>"
      "<body>This is a page apparently written in English. Even though "
      "content-language is provided, the value will be ignored if the value "
      "is suspicious.</body></html>");
  std::string cld_language;
  // Start from false so that the EXPECT_TRUE below fails deterministically,
  // instead of reading an indeterminate value, should the out-parameter ever
  // be left unwritten.
  bool is_cld_reliable = false;
  std::string language = translate::DeterminePageLanguage(std::string("en-US"),
                                                          std::string(),
                                                          contents,
                                                          &cld_language,
                                                          &is_cld_reliable);
  // The determined page language is expected to be the bare "en".
  EXPECT_EQ("en", language);
  EXPECT_EQ("en", cld_language);
  EXPECT_TRUE(is_cld_reliable);
}
// Tests that the language meta tag providing wrong information is ignored and
// CLD's language will be adopted by LanguageDetectionUtil due to an invalid
// meta tag.
TEST_F(LanguageDetectionUtilTest, InvalidLanguageMetaTagProviding) {
  // The meta tag value 'utf-8' is not a language code at all.
  base::string16 contents = base::ASCIIToUTF16(
      "<html><head><meta http-equiv='Content-Language' content='utf-8'></head>"
      "<body>This is a page apparently written in English. Even though "
      "content-language is provided, the value will be ignored and CLD's"
      " language will be adopted if the value is invalid.</body></html>");
  std::string cld_language;
  // Start from false so that the EXPECT_TRUE below fails deterministically,
  // instead of reading an indeterminate value, should the out-parameter ever
  // be left unwritten.
  bool is_cld_reliable = false;
  std::string language = translate::DeterminePageLanguage(std::string("utf-8"),
                                                          std::string(),
                                                          contents,
                                                          &cld_language,
                                                          &is_cld_reliable);
  // The determined page language is expected to match CLD's result.
  EXPECT_EQ("en", language);
  EXPECT_EQ("en", cld_language);
  EXPECT_TRUE(is_cld_reliable);
}
// Tests that the language meta tag providing wrong information is ignored
// because of valid html lang attribute.
TEST_F(LanguageDetectionUtilTest, AdoptHtmlLang) {
  // The html lang attribute says 'en', the meta tag says 'ja', and the body
  // text is written in English.
  base::string16 contents = base::ASCIIToUTF16(
      "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>"
      "</head><body>This is a page apparently written in English. Even though "
      "content-language is provided, the value will be ignored if the value "
      "is suspicious.</body></html>");
  std::string cld_language;
  // Start from false so that the EXPECT_TRUE below fails deterministically,
  // instead of reading an indeterminate value, should the out-parameter ever
  // be left unwritten.
  bool is_cld_reliable = false;
  std::string language = translate::DeterminePageLanguage(std::string("ja"),
                                                          std::string("en"),
                                                          contents,
                                                          &cld_language,
                                                          &is_cld_reliable);
  // The determined page language is expected to be "en", not the meta tag's
  // "ja".
  EXPECT_EQ("en", language);
  EXPECT_EQ("en", cld_language);
  EXPECT_TRUE(is_cld_reliable);
}
// Tests that languages that often have the wrong server configuration are
// correctly identified. All incorrect language codes should be checked to
// make sure the binary_search is correct.
TEST_F(LanguageDetectionUtilTest, IsServerWrongConfigurationLanguage) {
  // These languages should all be identified as having the wrong server
  // configuration.
  const char* const wrong_languages[] = {
      "ar", "da", "de", "el", "es", "fa", "fr", "hi",
      "hu", "id", "it", "ja", "ms", "nl", "pl", "pt",
      "ro", "ru", "sv", "th", "tr", "vi", "zh-CN", "zh-TW"};
  for (const char* const language : wrong_languages) {
    // The code under test is streamed into the assertion so that a failure
    // inside the loop says which entry was rejected.
    EXPECT_TRUE(translate::IsServerWrongConfigurationLanguage(language))
        << "language: " << language;
  }

  // These languages should all be identified as having the right server
  // configuration.
  const char* const right_languages[] = {"en", "en-AU", "en-US",
                                         "xx", "gg", "rr"};
  for (const char* const language : right_languages) {
    EXPECT_FALSE(translate::IsServerWrongConfigurationLanguage(language))
        << "language: " << language;
  }
}