// blob: 9da8c0351f129a05b90b474da404d7972d4795eb
// Copyright 2004-2009 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ========================================================================
//
// This file is for i18n. It contains two enums, namely Language and
// Encoding, where Language is the linguistic convention, and Encoding
// contains information on both language encoding and character set.
//
// The language and encoding are both based on Teragram's conventions,
// except for some common ISO-8859 encodings that are not detected by
// Teragram but might be in the future.
//
// This file also includes functions that do mappings among
// Language/Encoding enums, language/encoding string names (typically
// the output from Language Encoding identifier), and language codes
// (iso 639), and two-letter country codes (iso 3166)
//
// NOTE: Both Language and Encoding enums should always start from
// zero value. This assumption has been made and used.
#ifndef OMAHA_BASE_LANG_ENC_H_
#define OMAHA_BASE_LANG_ENC_H_
#include <windows.h>
// Short aliases for some commonly used encodings. Every alias expands to
// the name of an enumerator of the Encoding enum declared in this file.
#define LATIN1           ISO_8859_1
#define LATIN2           ISO_8859_2
#define LATIN3           ISO_8859_3
#define LATIN4           ISO_8859_4
#define CYRILLIC         ISO_8859_5
// The next three aliases carry an _ENCODING suffix so that they do not
// collide with the ARABIC, GREEK and HEBREW enumerators of Language.
#define ARABIC_ENCODING  ISO_8859_6
#define GREEK_ENCODING   ISO_8859_7
#define HEBREW_ENCODING  ISO_8859_8
#define LATIN5           ISO_8859_9
#define LATIN6           ISO_8859_10
#define KOREAN_HANGUL    KOREAN_EUC_KR
// Linguistic convention of a piece of text.
//
// NOTE: Only add new languages to the end of this list (but before
// NUM_LANGUAGES).
//
// Enumerators take consecutive implicit values starting at ENGLISH = 0; the
// trailing /* n */ comment on each line records the value the compiler
// assigns to that enumerator.
enum Language {
  ENGLISH = 0,  /* 0 */
  DANISH,  /* 1 */
  DUTCH,  /* 2 */
  FINNISH,  /* 3 */
  FRENCH,  /* 4 */
  GERMAN,  /* 5 */
  HEBREW,  /* 6 */
  ITALIAN,  /* 7 */
  JAPANESE,  /* 8 */
  KOREAN,  /* 9 */
  NORWEGIAN,  /* 10 */
  POLISH,  /* 11 */
  PORTUGUESE,  /* 12 */
  RUSSIAN,  /* 13 */
  SPANISH,  /* 14 */
  SWEDISH,  /* 15 */
  CHINESE,  /* 16 */
  CZECH,  /* 17 */
  GREEK,  /* 18 */
  ICELANDIC,  /* 19 */
  LATVIAN,  /* 20 */
  LITHUANIAN,  /* 21 */
  ROMANIAN,  /* 22 */
  HUNGARIAN,  /* 23 */
  ESTONIAN,  /* 24 */
  TG_UNKNOWN_LANGUAGE,  /* 25 */
  UNKNOWN_LANGUAGE,  /* 26 */
  BULGARIAN,  /* 27 */
  CROATIAN,  /* 28 */
  SERBIAN,  /* 29 */
  IRISH,  /* 30 */
  GALICIAN,  /* 31 */
  TAGALOG,  /* 32 */
  TURKISH,  /* 33 */
  UKRAINIAN,  /* 34 */
  HINDI,  /* 35 */
  MACEDONIAN,  /* 36 */
  BENGALI,  /* 37 */
  INDONESIAN,  /* 38 */
  LATIN,  /* 39 */
  MALAY,  /* 40 */
  MALAYALAM,  /* 41 */
  WELSH,  /* 42 */
  NEPALI,  /* 43 */
  TELUGU,  /* 44 */
  ALBANIAN,  /* 45 */
  TAMIL,  /* 46 */
  BELARUSIAN,  /* 47 */
  JAVANESE,  /* 48 */
  OCCITAN,  /* 49 */
  URDU,  /* 50 */
  BIHARI,  /* 51 */
  GUJARATI,  /* 52 */
  THAI,  /* 53 */
  ARABIC,  /* 54 */
  CATALAN,  /* 55 */
  ESPERANTO,  /* 56 */
  BASQUE,  /* 57 */
  INTERLINGUA,  /* 58 */
  KANNADA,  /* 59 */
  PUNJABI,  /* 60 */
  SCOTS_GAELIC,  /* 61 */
  SWAHILI,  /* 62 */
  SLOVENIAN,  /* 63 */
  MARATHI,  /* 64 */
  MALTESE,  /* 65 */
  VIETNAMESE,  /* 66 */
  FRISIAN,  /* 67 */
  SLOVAK,  /* 68 */
  CHINESE_T,  /* 69 */  // This is added to solve the problem of
                        // distinguishing Traditional and Simplified
                        // Chinese when the encoding is UTF8.
  FAROESE,  /* 70 */
  SUNDANESE,  /* 71 */
  UZBEK,  /* 72 */
  AMHARIC,  /* 73 */
  AZERBAIJANI,  /* 74 */
  GEORGIAN,  /* 75 */
  TIGRINYA,  /* 76 */
  PERSIAN,  /* 77 */
  BOSNIAN,  /* 78 */
  SINHALESE,  /* 79 */
  NORWEGIAN_N,  /* 80 */
  PORTUGUESE_P,  /* 81 */
  PORTUGUESE_B,  /* 82 */
  XHOSA,  /* 83 */
  ZULU,  /* 84 */
  GUARANI,  /* 85 */
  SESOTHO,  /* 86 */
  TURKMEN,  /* 87 */
  KYRGYZ,  /* 88 */
  BRETON,  /* 89 */
  TWI,  /* 90 */
  YIDDISH,  /* 91 */
  ORIYA,  /* 92 */
  SERBO_CROATIAN,  /* 93 */
  SOMALI,  /* 94 */
  UIGHUR,  /* 95 */
  KURDISH,  /* 96 */
  MONGOLIAN,  /* 97 */
  ARMENIAN,  /* 98 */
  LAOTHIAN,  /* 99 */
  SINDHI,  /* 100 */
  RHAETO_ROMANCE,  /* 101 */
  // The two entries below directly follow RHAETO_ROMANCE, so their actual
  // values are 102 and 103 (earlier comments here said 103 and 104).
  CHINESE_JAPANESE_KOREAN,  /* 102 */  // Not really a language
  PSEUDOTRANSLATION,  /* 103 */  // Not really a language
  NUM_LANGUAGES,  /* 104 */  // Always keep this at the end. It is not a
                             // valid Language enum, it is only used to
                             // indicate the total number of Languages.
};
// Language-code strings for the languages we support. They are the keys
// that get mapped to IDs from the Language enumeration.
//
// Rfc1766ToLcid from the Win32 system's mlang.dll could have mapped these to
// LCIDs instead, but (a) we don't want to have to load mlang.dll and (b) we
// are using our own language IDs rather than LCIDs.
//
// NOTE(review): _T() is conventionally supplied by <tchar.h>, which this
// header does not include directly - confirm an earlier include provides it.
const TCHAR* const kLangCodeChinesePrc    = _T("zh_cn");
const TCHAR* const kLangCodeChineseTaiwan = _T("zh_tw");
const TCHAR* const kLangCodeCjk           = _T("cjk");
const TCHAR* const kLangCodeDutch         = _T("nl");
const TCHAR* const kLangCodeEnglish       = _T("en");
const TCHAR* const kLangCodeFrench        = _T("fr");
const TCHAR* const kLangCodeGerman        = _T("de");
const TCHAR* const kLangCodeItalian       = _T("it");
const TCHAR* const kLangCodeJapanese      = _T("ja");
const TCHAR* const kLangCodeKorean        = _T("ko");
const TCHAR* const kLangCodePseudo        = _T("x");
const TCHAR* const kLangCodeSpanish       = _T("es");
// One entry of a table that maps language codes to languages. A table of
// these entries is terminated by a { NULL, UNKNOWN_LANGUAGE } item.
//
// The member order (code first, then language) must not change: tables of
// this type are filled in with positional aggregate initializers.
struct CodeToLanguage {
// Language-code string; NULL only in the terminating entry.
const TCHAR* code;
// Language enum value associated with |code|.
Language language;
};
// Table pairing every supported language code with its Language value.
// The final { NULL, UNKNOWN_LANGUAGE } row is a sentinel marking the end of
// the table; it must stay last.
// NOTE(review): SELECTANY is defined outside this header - presumably it
// lets this definition appear in several translation units; confirm.
SELECTANY CodeToLanguage codes_to_languages[] = {
  { kLangCodeChinesePrc,    CHINESE                 },
  { kLangCodeChineseTaiwan, CHINESE_T               },
  { kLangCodeCjk,           CHINESE_JAPANESE_KOREAN },
  { kLangCodeDutch,         DUTCH                   },
  { kLangCodeEnglish,       ENGLISH                 },
  { kLangCodeFrench,        FRENCH                  },
  { kLangCodeGerman,        GERMAN                  },
  { kLangCodeItalian,       ITALIAN                 },
  { kLangCodeJapanese,      JAPANESE                },
  { kLangCodeKorean,        KOREAN                  },
  { kLangCodePseudo,        PSEUDOTRANSLATION       },
  { kLangCodeSpanish,       SPANISH                 },
  { NULL,                   UNKNOWN_LANGUAGE        }  // End-of-table sentinel.
};
// Macro to wrap the notion of "unknown language": true when |l| is either
// TG_UNKNOWN_LANGUAGE or UNKNOWN_LANGUAGE.
// WARNING: |l| appears twice in the expansion, so it is evaluated a second
// time whenever the first comparison is false. Do not pass an expression
// with side effects.
#define IS_LANGUAGE_UNKNOWN(l) \
((l) == TG_UNKNOWN_LANGUAGE || (l) == UNKNOWN_LANGUAGE)
// Language encoding / character set of a piece of text.
//
// NOTE: Only add new encodings to the end of this list (but before
// NUM_ENCODINGS).
// NOTE: If you add an encoding here, you must also modify basistech_encoding()
// and google2/com/google/i18n/Encoding.java
//
// Enumerators take consecutive implicit values starting at ISO_8859_1 = 0;
// the number leading each trailing comment is that enumerator's value.
enum Encoding {
  ISO_8859_1 = 0,      // 0: Teragram ASCII
  ISO_8859_2,          // 1: Teragram Latin2
  ISO_8859_3,          // 2: in BasisTech but not in Teragram
  ISO_8859_4,          // 3: Teragram Latin4
  ISO_8859_5,          // 4: Teragram ISO-8859-5
  ISO_8859_6,          // 5: Teragram Arabic
  ISO_8859_7,          // 6: Teragram Greek
  ISO_8859_8,          // 7: Teragram Hebrew
  ISO_8859_9,          // 8: in BasisTech but not in Teragram
  ISO_8859_10,         // 9: in BasisTech but not in Teragram
  JAPANESE_EUC_JP,     // 10: Teragram EUC_JP
  JAPANESE_SHIFT_JIS,  // 11: Teragram SJS
  JAPANESE_JIS,        // 12: Teragram JIS
  CHINESE_BIG5,        // 13: Teragram BIG5
  CHINESE_GB,          // 14: Teragram GB
  CHINESE_EUC_CN,      // 15: Teragram EUC-CN
  KOREAN_EUC_KR,       // 16: Teragram KSC
  UNICODE_ENCODING,    // 17: Teragram Unicode, changed to UNICODE_ENCODING
                       //     from UNICODE, which is predefined by Windows
  CHINESE_EUC_DEC,     // 18: Teragram EUC
  CHINESE_CNS,         // 19: Teragram CNS
  CHINESE_BIG5_CP950,  // 20: Teragram BIG5_CP950
  JAPANESE_CP932,      // 21: Teragram CP932
  UTF8,                // 22
  UNKNOWN_ENCODING,    // 23
  ASCII_7BIT,          // 24: ISO_8859_1 with all characters <= 127.
                       //     Should be present only in the crawler
                       //     and in the repository,
                       //     *never* as a result of Document::encoding().
  RUSSIAN_KOI8_R,      // 25: Teragram KOI8R
  RUSSIAN_CP1251,      // 26: Teragram CP1251

  //----------------------------------------------------------
  // These are _not_ output from teragram. Instead, they are as
  // detected in the headers of usenet articles.
  MSFT_CP1252,         // 27: CP1252 aka MSFT euro ascii
  RUSSIAN_KOI8_RU,     // 28: CP21866 aka KOI8_RU, used for Ukrainian
  MSFT_CP1250,         // 29: CP1250 aka MSFT eastern european
  ISO_8859_15,         // 30: aka ISO_8859_0 aka ISO_8859_1 euroized

  //----------------------------------------------------------
  // These are in BasisTech but not in Teragram. They are
  // needed for new interface languages. Now detected by
  // research langid
  MSFT_CP1254,         // 31: used for Turkish
  MSFT_CP1257,         // 32: used in Baltic countries

  //----------------------------------------------------------
  // New encodings detected by Teragram
  ISO_8859_11,         // 33: aka TIS-620, used for Thai
  MSFT_CP874,          // 34: used for Thai
  MSFT_CP1256,         // 35: used for Arabic

  //----------------------------------------------------------
  // Detected as ISO_8859_8 by Teragram, but can be found in META tags
  MSFT_CP1255,         // 36: Logical Hebrew Microsoft
  ISO_8859_8_I,        // 37: Iso Hebrew Logical
  HEBREW_VISUAL,       // 38: Iso Hebrew Visual

  //----------------------------------------------------------
  // Detected by research langid
  CZECH_CP852,         // 39
  CZECH_CSN_369103,    // 40: aka ISO_IR_139 aka KOI8_CS
  MSFT_CP1253,         // 41: used for Greek
  RUSSIAN_CP866,       // 42

  //----------------------------------------------------------
  HZ_ENCODING,         // 43
  ISO2022_CN,          // 44
  ISO2022_KR,          // 45
  NUM_ENCODINGS        // 46: Always keep this at the end. It is not a
                       //     valid Encoding enum, it is only used to
                       //     indicate the total number of Encodings.
};
// Integer copies of the enum terminators: the number of enumerators that
// precede NUM_LANGUAGES in Language and NUM_ENCODINGS in Encoding.
const int kNumLanguages = NUM_LANGUAGES;
const int kNumEncodings = NUM_ENCODINGS;
#endif // OMAHA_BASE_LANG_ENC_H_