// base/lang_enc.h - external/omaha - Git at Google

 // Copyright 2004-2009 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // ========================================================================
 //
 // This file is for i18n. It contains two enums, namely Language and
 // Encoding, where Language is the linguistic convention, and Encoding
 // contains information on both language encoding and character set.
 //
 // The language and encoding are both based on Teragram's conventions,
 // except for some common ISO-8859 encodings that are not detected by
 // Teragram but might be in the future.
 //
 // This file also includes functions that do mappings among
 // Language/Encoding enums, language/encoding string names (typically
 // the output from Language Encoding identifier), and language codes
 // (iso 639), and two-letter country codes (iso 3166)
 //
 // NOTE: Both Language and Encoding enums should always start from
 // zero value. This assumption has been made and used.

 #ifndef  OMAHA_BASE_LANG_ENC_H_
 #define  OMAHA_BASE_LANG_ENC_H_

 #include <windows.h>

 // Popular aliases for members of the Encoding enum defined later in this
 // header.  These are object-like macros, so every alias is replaced textually
 // by the preprocessor wherever the alias name appears after this point.
 #define LATIN1     ISO_8859_1
 #define LATIN2     ISO_8859_2
 #define LATIN3     ISO_8859_3
 #define LATIN4     ISO_8859_4
 #define CYRILLIC   ISO_8859_5
 #define ARABIC_ENCODING  ISO_8859_6     // _ENCODING suffix: ARABIC is a Language
 #define GREEK_ENCODING   ISO_8859_7     // _ENCODING suffix: GREEK is a Language
 #define HEBREW_ENCODING  ISO_8859_8     // _ENCODING suffix: HEBREW is a Language
 #define LATIN5     ISO_8859_9
 #define LATIN6     ISO_8859_10
 #define KOREAN_HANGUL  KOREAN_EUC_KR

 // Identifiers for the languages known to this code.
 //
 // NOTE: Only add new languages to the end of this list (but before
 // NUM_LANGUAGES).  Each value is spelled out so that it is visible at a
 // glance and cannot shift if an entry is inserted or removed by mistake.
 //
 // NOTE(review): the inline comments this list used to carry labelled the
 // last two entries 103 and 104, one higher than the values the compiler
 // assigns by position (102 and 103).  The values below are the ones actually
 // in effect; confirm nothing external expects 103/104.
 enum Language {
   ENGLISH                 = 0,
   DANISH                  = 1,
   DUTCH                   = 2,
   FINNISH                 = 3,
   FRENCH                  = 4,
   GERMAN                  = 5,
   HEBREW                  = 6,
   ITALIAN                 = 7,
   JAPANESE                = 8,
   KOREAN                  = 9,
   NORWEGIAN               = 10,
   POLISH                  = 11,
   PORTUGUESE              = 12,
   RUSSIAN                 = 13,
   SPANISH                 = 14,
   SWEDISH                 = 15,
   CHINESE                 = 16,
   CZECH                   = 17,
   GREEK                   = 18,
   ICELANDIC               = 19,
   LATVIAN                 = 20,
   LITHUANIAN              = 21,
   ROMANIAN                = 22,
   HUNGARIAN               = 23,
   ESTONIAN                = 24,
   TG_UNKNOWN_LANGUAGE     = 25,
   UNKNOWN_LANGUAGE        = 26,
   BULGARIAN               = 27,
   CROATIAN                = 28,
   SERBIAN                 = 29,
   IRISH                   = 30,
   GALICIAN                = 31,
   TAGALOG                 = 32,
   TURKISH                 = 33,
   UKRAINIAN               = 34,
   HINDI                   = 35,
   MACEDONIAN              = 36,
   BENGALI                 = 37,
   INDONESIAN              = 38,
   LATIN                   = 39,
   MALAY                   = 40,
   MALAYALAM               = 41,
   WELSH                   = 42,
   NEPALI                  = 43,
   TELUGU                  = 44,
   ALBANIAN                = 45,
   TAMIL                   = 46,
   BELARUSIAN              = 47,
   JAVANESE                = 48,
   OCCITAN                 = 49,
   URDU                    = 50,
   BIHARI                  = 51,
   GUJARATI                = 52,
   THAI                    = 53,
   ARABIC                  = 54,
   CATALAN                 = 55,
   ESPERANTO               = 56,
   BASQUE                  = 57,
   INTERLINGUA             = 58,
   KANNADA                 = 59,
   PUNJABI                 = 60,
   SCOTS_GAELIC            = 61,
   SWAHILI                 = 62,
   SLOVENIAN               = 63,
   MARATHI                 = 64,
   MALTESE                 = 65,
   VIETNAMESE              = 66,
   FRISIAN                 = 67,
   SLOVAK                  = 68,
   // CHINESE_T was added to solve the problem of distinguishing Traditional
   // and Simplified Chinese when the encoding is UTF8.
   CHINESE_T               = 69,
   FAROESE                 = 70,
   SUNDANESE               = 71,
   UZBEK                   = 72,
   AMHARIC                 = 73,
   AZERBAIJANI             = 74,
   GEORGIAN                = 75,
   TIGRINYA                = 76,
   PERSIAN                 = 77,
   BOSNIAN                 = 78,
   SINHALESE               = 79,
   NORWEGIAN_N             = 80,
   PORTUGUESE_P            = 81,
   PORTUGUESE_B            = 82,
   XHOSA                   = 83,
   ZULU                    = 84,
   GUARANI                 = 85,
   SESOTHO                 = 86,
   TURKMEN                 = 87,
   KYRGYZ                  = 88,
   BRETON                  = 89,
   TWI                     = 90,
   YIDDISH                 = 91,
   ORIYA                   = 92,
   SERBO_CROATIAN          = 93,
   SOMALI                  = 94,
   UIGHUR                  = 95,
   KURDISH                 = 96,
   MONGOLIAN               = 97,
   ARMENIAN                = 98,
   LAOTHIAN                = 99,
   SINDHI                  = 100,
   RHAETO_ROMANCE          = 101,
   CHINESE_JAPANESE_KOREAN = 102,  // Not really a language
   PSEUDOTRANSLATION       = 103,  // Not really a language
   // Always keep NUM_LANGUAGES at the end.  It is not a valid Language enum,
   // it is only used to indicate the total number of Languages, so it is
   // deliberately left without an explicit value.
   NUM_LANGUAGES
 };


 // Language code strings for the languages we support, used to map to IDs
 // from the Language enumeration (see codes_to_languages).  We could have used
 // Rfc1766ToLcid from the Win32 system's mlang.dll to map these to LCIDs, but
 // a) we don't want to have to load mlang.dll and b) we are using our own
 // language IDs.
 //
 // The two Chinese codes use an underscore separator ("zh_cn", "zh_tw").
 //
 // NOTE(review): _T() is normally supplied by <tchar.h>, which this header
 // does not include itself -- presumably the including file provides it;
 // confirm.
 const TCHAR* const kLangCodeChinesePrc = _T("zh_cn");
 const TCHAR* const kLangCodeChineseTaiwan = _T("zh_tw");
 const TCHAR* const kLangCodeCjk = _T("cjk");
 const TCHAR* const kLangCodeDutch = _T("nl");
 const TCHAR* const kLangCodeEnglish = _T("en");
 const TCHAR* const kLangCodeFrench = _T("fr");
 const TCHAR* const kLangCodeGerman = _T("de");
 const TCHAR* const kLangCodeItalian = _T("it");
 const TCHAR* const kLangCodeJapanese = _T("ja");
 const TCHAR* const kLangCodeKorean = _T("ko");
 const TCHAR* const kLangCodePseudo = _T("x");
 const TCHAR* const kLangCodeSpanish = _T("es");


 // One entry of a table that maps language codes to languages.  Such a table
 // is terminated by a { NULL, UNKNOWN_LANGUAGE } item.
 struct CodeToLanguage {
   // Language code string (e.g. kLangCodeEnglish); NULL in the terminator.
   const TCHAR* code;
   // Language that |code| maps to; UNKNOWN_LANGUAGE in the terminator.
   Language language;
 };

 // Table pairing each supported language code string with its Language value.
 // Walk it until an entry whose code is NULL; that terminator carries
 // UNKNOWN_LANGUAGE.  Note that "zh_cn" maps to CHINESE and "zh_tw" maps to
 // CHINESE_T.  The array is not declared const.
 //
 // NOTE(review): SELECTANY is not defined in this header; presumably it is a
 // __declspec(selectany) wrapper from a project header that lets this
 // definition live in a header included by several translation units --
 // confirm.
 SELECTANY CodeToLanguage codes_to_languages[] = {
   { kLangCodeChinesePrc, CHINESE },
   { kLangCodeChineseTaiwan, CHINESE_T },
   { kLangCodeCjk, CHINESE_JAPANESE_KOREAN },
   { kLangCodeDutch, DUTCH },
   { kLangCodeEnglish, ENGLISH },
   { kLangCodeFrench, FRENCH },
   { kLangCodeGerman, GERMAN },
   { kLangCodeItalian, ITALIAN },
   { kLangCodeJapanese, JAPANESE },
   { kLangCodeKorean, KOREAN },
   { kLangCodePseudo, PSEUDOTRANSLATION },
   { kLangCodeSpanish, SPANISH },
   { NULL, UNKNOWN_LANGUAGE }  // terminator
 };


 // Wraps the notion of "unknown language": the expression is non-zero when
 // |l| equals either TG_UNKNOWN_LANGUAGE or UNKNOWN_LANGUAGE.
 // Being a macro, it may evaluate |l| twice, so do not pass an argument that
 // has side effects.
 #define IS_LANGUAGE_UNKNOWN(l)        \
   ((l) == TG_UNKNOWN_LANGUAGE ||      \
    (l) == UNKNOWN_LANGUAGE)

 // Identifiers for character encodings / character sets.
 //
 // NOTE: Only add new encodings to the end of this list (but before
 // NUM_ENCODINGS).  Each value is spelled out so that it is visible at a
 // glance and cannot shift if an entry is inserted or removed by mistake.
 // NOTE: If you add an encoding here, you must also modify basistech_encoding()
 // and google2/com/google/i18n/Encoding.java
 enum Encoding {
   ISO_8859_1         = 0,   // Teragram ASCII
   ISO_8859_2         = 1,   // Teragram Latin2
   ISO_8859_3         = 2,   // in BasisTech but not in Teragram
   ISO_8859_4         = 3,   // Teragram Latin4
   ISO_8859_5         = 4,   // Teragram ISO-8859-5
   ISO_8859_6         = 5,   // Teragram Arabic
   ISO_8859_7         = 6,   // Teragram Greek
   ISO_8859_8         = 7,   // Teragram Hebrew
   ISO_8859_9         = 8,   // in BasisTech but not in Teragram
   ISO_8859_10        = 9,   // in BasisTech but not in Teragram
   JAPANESE_EUC_JP    = 10,  // Teragram EUC_JP
   JAPANESE_SHIFT_JIS = 11,  // Teragram SJS
   JAPANESE_JIS       = 12,  // Teragram JIS
   CHINESE_BIG5       = 13,  // Teragram BIG5
   CHINESE_GB         = 14,  // Teragram GB
   CHINESE_EUC_CN     = 15,  // Teragram EUC-CN
   KOREAN_EUC_KR      = 16,  // Teragram KSC
   // Teragram Unicode.  Named UNICODE_ENCODING rather than UNICODE because
   // UNICODE is predefined by Windows.
   UNICODE_ENCODING   = 17,
   CHINESE_EUC_DEC    = 18,  // Teragram EUC
   CHINESE_CNS        = 19,  // Teragram CNS
   CHINESE_BIG5_CP950 = 20,  // Teragram BIG5_CP950
   JAPANESE_CP932     = 21,  // Teragram CP932
   UTF8               = 22,
   UNKNOWN_ENCODING   = 23,
   // ISO_8859_1 with all characters <= 127.  Should be present only in the
   // crawler and in the repository, *never* as a result of
   // Document::encoding().
   ASCII_7BIT         = 24,
   RUSSIAN_KOI8_R     = 25,  // Teragram KOI8R
   RUSSIAN_CP1251     = 26,  // Teragram CP1251

   //----------------------------------------------------------
   // These are _not_ output from teragram. Instead, they are as
   // detected in the headers of usenet articles.
   MSFT_CP1252        = 27,  // CP1252 aka MSFT euro ascii
   RUSSIAN_KOI8_RU    = 28,  // CP21866 aka KOI8_RU, used for Ukrainian
   MSFT_CP1250        = 29,  // CP1250 aka MSFT eastern european
   ISO_8859_15        = 30,  // aka ISO_8859_0 aka ISO_8859_1 euroized

   //----------------------------------------------------------
   // These are in BasisTech but not in Teragram. They are
   // needed for new interface languages. Now detected by
   // research langid
   MSFT_CP1254        = 31,  // used for Turkish
   MSFT_CP1257        = 32,  // used in Baltic countries

   //----------------------------------------------------------
   // New encodings detected by Teragram
   ISO_8859_11        = 33,  // aka TIS-620, used for Thai
   MSFT_CP874         = 34,  // used for Thai
   MSFT_CP1256        = 35,  // used for Arabic

   //----------------------------------------------------------
   // Detected as ISO_8859_8 by Teragram, but can be found in META tags
   MSFT_CP1255        = 36,  // Logical Hebrew Microsoft
   ISO_8859_8_I       = 37,  // Iso Hebrew Logical
   HEBREW_VISUAL      = 38,  // Iso Hebrew Visual

   //----------------------------------------------------------
   // Detected by research langid
   CZECH_CP852        = 39,
   CZECH_CSN_369103   = 40,  // aka ISO_IR_139 aka KOI8_CS
   MSFT_CP1253        = 41,  // used for Greek
   RUSSIAN_CP866      = 42,

   //----------------------------------------------------------
   HZ_ENCODING        = 43,
   ISO2022_CN         = 44,
   ISO2022_KR         = 45,

   // Always keep NUM_ENCODINGS at the end.  It is not a valid Encoding enum,
   // it is only used to indicate the total number of Encodings, so it is
   // deliberately left without an explicit value.
   NUM_ENCODINGS
 };

 // Number of enumerators in Language and in Encoding, not counting the NUM_*
 // sentinels themselves.  Taken from those sentinels, which works because both
 // enums start at zero and keep the sentinel last.
 const int kNumLanguages = NUM_LANGUAGES;
 const int kNumEncodings = NUM_ENCODINGS;

 #endif  // OMAHA_BASE_LANG_ENC_H_
	// Copyright 2004-2009 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	// ========================================================================
	//
	// This file is for i18n. It contains two enums, namely Language and
	// Encoding, where Language is the linguistic convention, and Encoding
	// contains information on both language encoding and character set.
	//
	// The language and encoding are both based on Teragram's conventions,
	// except for some common ISO-8859 encodings that are not detected by
	// Teragram but might be in the future.
	//
	// This file also includes functions that do mappings among
	// Language/Encoding enums, language/encoding string names (typically
	// the output from Language Encoding identifier), and language codes
	// (iso 639), and two-letter country codes (iso 3166)
	//
	// NOTE: Both Language and Encoding enums should always start from
	// zero value. This assumption has been made and used.

	#ifndef OMAHA_BASE_LANG_ENC_H_
	#define OMAHA_BASE_LANG_ENC_H_

	#include <windows.h>

	// Popular aliases for members of the Encoding enum defined later in this
	// header.  These are object-like macros, so every alias is replaced textually
	// by the preprocessor wherever the alias name appears after this point.
	#define LATIN1 ISO_8859_1
	#define LATIN2 ISO_8859_2
	#define LATIN3 ISO_8859_3
	#define LATIN4 ISO_8859_4
	#define CYRILLIC ISO_8859_5
	#define ARABIC_ENCODING ISO_8859_6 // _ENCODING suffix: ARABIC is a Language
	#define GREEK_ENCODING ISO_8859_7 // _ENCODING suffix: GREEK is a Language
	#define HEBREW_ENCODING ISO_8859_8 // _ENCODING suffix: HEBREW is a Language
	#define LATIN5 ISO_8859_9
	#define LATIN6 ISO_8859_10
	#define KOREAN_HANGUL KOREAN_EUC_KR

	// Identifiers for the languages known to this code.
	//
	// NOTE: Only add new languages to the end of this list (but before
	// NUM_LANGUAGES).  Each value is spelled out so that it is visible at a
	// glance and cannot shift if an entry is inserted or removed by mistake.
	//
	// NOTE(review): the inline comments this list used to carry labelled the
	// last two entries 103 and 104, one higher than the values the compiler
	// assigns by position (102 and 103).  The values below are the ones actually
	// in effect; confirm nothing external expects 103/104.
	enum Language {
	  ENGLISH                 = 0,
	  DANISH                  = 1,
	  DUTCH                   = 2,
	  FINNISH                 = 3,
	  FRENCH                  = 4,
	  GERMAN                  = 5,
	  HEBREW                  = 6,
	  ITALIAN                 = 7,
	  JAPANESE                = 8,
	  KOREAN                  = 9,
	  NORWEGIAN               = 10,
	  POLISH                  = 11,
	  PORTUGUESE              = 12,
	  RUSSIAN                 = 13,
	  SPANISH                 = 14,
	  SWEDISH                 = 15,
	  CHINESE                 = 16,
	  CZECH                   = 17,
	  GREEK                   = 18,
	  ICELANDIC               = 19,
	  LATVIAN                 = 20,
	  LITHUANIAN              = 21,
	  ROMANIAN                = 22,
	  HUNGARIAN               = 23,
	  ESTONIAN                = 24,
	  TG_UNKNOWN_LANGUAGE     = 25,
	  UNKNOWN_LANGUAGE        = 26,
	  BULGARIAN               = 27,
	  CROATIAN                = 28,
	  SERBIAN                 = 29,
	  IRISH                   = 30,
	  GALICIAN                = 31,
	  TAGALOG                 = 32,
	  TURKISH                 = 33,
	  UKRAINIAN               = 34,
	  HINDI                   = 35,
	  MACEDONIAN              = 36,
	  BENGALI                 = 37,
	  INDONESIAN              = 38,
	  LATIN                   = 39,
	  MALAY                   = 40,
	  MALAYALAM               = 41,
	  WELSH                   = 42,
	  NEPALI                  = 43,
	  TELUGU                  = 44,
	  ALBANIAN                = 45,
	  TAMIL                   = 46,
	  BELARUSIAN              = 47,
	  JAVANESE                = 48,
	  OCCITAN                 = 49,
	  URDU                    = 50,
	  BIHARI                  = 51,
	  GUJARATI                = 52,
	  THAI                    = 53,
	  ARABIC                  = 54,
	  CATALAN                 = 55,
	  ESPERANTO               = 56,
	  BASQUE                  = 57,
	  INTERLINGUA             = 58,
	  KANNADA                 = 59,
	  PUNJABI                 = 60,
	  SCOTS_GAELIC            = 61,
	  SWAHILI                 = 62,
	  SLOVENIAN               = 63,
	  MARATHI                 = 64,
	  MALTESE                 = 65,
	  VIETNAMESE              = 66,
	  FRISIAN                 = 67,
	  SLOVAK                  = 68,
	  // CHINESE_T was added to solve the problem of distinguishing Traditional
	  // and Simplified Chinese when the encoding is UTF8.
	  CHINESE_T               = 69,
	  FAROESE                 = 70,
	  SUNDANESE               = 71,
	  UZBEK                   = 72,
	  AMHARIC                 = 73,
	  AZERBAIJANI             = 74,
	  GEORGIAN                = 75,
	  TIGRINYA                = 76,
	  PERSIAN                 = 77,
	  BOSNIAN                 = 78,
	  SINHALESE               = 79,
	  NORWEGIAN_N             = 80,
	  PORTUGUESE_P            = 81,
	  PORTUGUESE_B            = 82,
	  XHOSA                   = 83,
	  ZULU                    = 84,
	  GUARANI                 = 85,
	  SESOTHO                 = 86,
	  TURKMEN                 = 87,
	  KYRGYZ                  = 88,
	  BRETON                  = 89,
	  TWI                     = 90,
	  YIDDISH                 = 91,
	  ORIYA                   = 92,
	  SERBO_CROATIAN          = 93,
	  SOMALI                  = 94,
	  UIGHUR                  = 95,
	  KURDISH                 = 96,
	  MONGOLIAN               = 97,
	  ARMENIAN                = 98,
	  LAOTHIAN                = 99,
	  SINDHI                  = 100,
	  RHAETO_ROMANCE          = 101,
	  CHINESE_JAPANESE_KOREAN = 102,  // Not really a language
	  PSEUDOTRANSLATION       = 103,  // Not really a language
	  // Always keep NUM_LANGUAGES at the end.  It is not a valid Language enum,
	  // it is only used to indicate the total number of Languages, so it is
	  // deliberately left without an explicit value.
	  NUM_LANGUAGES
	};


	// Language code strings for the languages we support, used to map to IDs
	// from the Language enumeration (see codes_to_languages).  We could have used
	// Rfc1766ToLcid from the Win32 system's mlang.dll to map these to LCIDs, but
	// a) we don't want to have to load mlang.dll and b) we are using our own
	// language IDs.
	//
	// The two Chinese codes use an underscore separator ("zh_cn", "zh_tw").
	//
	// NOTE(review): _T() is normally supplied by <tchar.h>, which this header
	// does not include itself -- presumably the including file provides it;
	// confirm.
	const TCHAR* const kLangCodeChinesePrc = _T("zh_cn");
	const TCHAR* const kLangCodeChineseTaiwan = _T("zh_tw");
	const TCHAR* const kLangCodeCjk = _T("cjk");
	const TCHAR* const kLangCodeDutch = _T("nl");
	const TCHAR* const kLangCodeEnglish = _T("en");
	const TCHAR* const kLangCodeFrench = _T("fr");
	const TCHAR* const kLangCodeGerman = _T("de");
	const TCHAR* const kLangCodeItalian = _T("it");
	const TCHAR* const kLangCodeJapanese = _T("ja");
	const TCHAR* const kLangCodeKorean = _T("ko");
	const TCHAR* const kLangCodePseudo = _T("x");
	const TCHAR* const kLangCodeSpanish = _T("es");


	// One entry of a table that maps language codes to languages.  Such a table
	// is terminated by a { NULL, UNKNOWN_LANGUAGE } item.
	struct CodeToLanguage {
	// Language code string (e.g. kLangCodeEnglish); NULL in the terminator.
	const TCHAR* code;
	// Language that |code| maps to; UNKNOWN_LANGUAGE in the terminator.
	Language language;
	};

	// Table pairing each supported language code string with its Language value.
	// Walk it until an entry whose code is NULL; that terminator carries
	// UNKNOWN_LANGUAGE.  Note that "zh_cn" maps to CHINESE and "zh_tw" maps to
	// CHINESE_T.  The array is not declared const.
	//
	// NOTE(review): SELECTANY is not defined in this header; presumably it is a
	// __declspec(selectany) wrapper from a project header that lets this
	// definition live in a header included by several translation units --
	// confirm.
	SELECTANY CodeToLanguage codes_to_languages[] = {
	{ kLangCodeChinesePrc, CHINESE },
	{ kLangCodeChineseTaiwan, CHINESE_T },
	{ kLangCodeCjk, CHINESE_JAPANESE_KOREAN },
	{ kLangCodeDutch, DUTCH },
	{ kLangCodeEnglish, ENGLISH },
	{ kLangCodeFrench, FRENCH },
	{ kLangCodeGerman, GERMAN },
	{ kLangCodeItalian, ITALIAN },
	{ kLangCodeJapanese, JAPANESE },
	{ kLangCodeKorean, KOREAN },
	{ kLangCodePseudo, PSEUDOTRANSLATION },
	{ kLangCodeSpanish, SPANISH },
	{ NULL, UNKNOWN_LANGUAGE }  // terminator
	};



	// Wraps the notion of "unknown language": the expression is non-zero when
	// |l| equals either TG_UNKNOWN_LANGUAGE or UNKNOWN_LANGUAGE.
	// Being a macro, it may evaluate |l| twice, so do not pass an argument that
	// has side effects.
	#define IS_LANGUAGE_UNKNOWN(l) \
	  ((l) == TG_UNKNOWN_LANGUAGE || (l) == UNKNOWN_LANGUAGE)

	// Identifiers for character encodings / character sets.
	//
	// NOTE: Only add new encodings to the end of this list (but before
	// NUM_ENCODINGS).  Each value is spelled out so that it is visible at a
	// glance and cannot shift if an entry is inserted or removed by mistake.
	// NOTE: If you add an encoding here, you must also modify basistech_encoding()
	// and google2/com/google/i18n/Encoding.java
	enum Encoding {
	  ISO_8859_1         = 0,   // Teragram ASCII
	  ISO_8859_2         = 1,   // Teragram Latin2
	  ISO_8859_3         = 2,   // in BasisTech but not in Teragram
	  ISO_8859_4         = 3,   // Teragram Latin4
	  ISO_8859_5         = 4,   // Teragram ISO-8859-5
	  ISO_8859_6         = 5,   // Teragram Arabic
	  ISO_8859_7         = 6,   // Teragram Greek
	  ISO_8859_8         = 7,   // Teragram Hebrew
	  ISO_8859_9         = 8,   // in BasisTech but not in Teragram
	  ISO_8859_10        = 9,   // in BasisTech but not in Teragram
	  JAPANESE_EUC_JP    = 10,  // Teragram EUC_JP
	  JAPANESE_SHIFT_JIS = 11,  // Teragram SJS
	  JAPANESE_JIS       = 12,  // Teragram JIS
	  CHINESE_BIG5       = 13,  // Teragram BIG5
	  CHINESE_GB         = 14,  // Teragram GB
	  CHINESE_EUC_CN     = 15,  // Teragram EUC-CN
	  KOREAN_EUC_KR      = 16,  // Teragram KSC
	  // Teragram Unicode.  Named UNICODE_ENCODING rather than UNICODE because
	  // UNICODE is predefined by Windows.
	  UNICODE_ENCODING   = 17,
	  CHINESE_EUC_DEC    = 18,  // Teragram EUC
	  CHINESE_CNS        = 19,  // Teragram CNS
	  CHINESE_BIG5_CP950 = 20,  // Teragram BIG5_CP950
	  JAPANESE_CP932     = 21,  // Teragram CP932
	  UTF8               = 22,
	  UNKNOWN_ENCODING   = 23,
	  // ISO_8859_1 with all characters <= 127.  Should be present only in the
	  // crawler and in the repository, *never* as a result of
	  // Document::encoding().
	  ASCII_7BIT         = 24,
	  RUSSIAN_KOI8_R     = 25,  // Teragram KOI8R
	  RUSSIAN_CP1251     = 26,  // Teragram CP1251

	  //----------------------------------------------------------
	  // These are _not_ output from teragram. Instead, they are as
	  // detected in the headers of usenet articles.
	  MSFT_CP1252        = 27,  // CP1252 aka MSFT euro ascii
	  RUSSIAN_KOI8_RU    = 28,  // CP21866 aka KOI8_RU, used for Ukrainian
	  MSFT_CP1250        = 29,  // CP1250 aka MSFT eastern european
	  ISO_8859_15        = 30,  // aka ISO_8859_0 aka ISO_8859_1 euroized

	  //----------------------------------------------------------
	  // These are in BasisTech but not in Teragram. They are
	  // needed for new interface languages. Now detected by
	  // research langid
	  MSFT_CP1254        = 31,  // used for Turkish
	  MSFT_CP1257        = 32,  // used in Baltic countries

	  //----------------------------------------------------------
	  // New encodings detected by Teragram
	  ISO_8859_11        = 33,  // aka TIS-620, used for Thai
	  MSFT_CP874         = 34,  // used for Thai
	  MSFT_CP1256        = 35,  // used for Arabic

	  //----------------------------------------------------------
	  // Detected as ISO_8859_8 by Teragram, but can be found in META tags
	  MSFT_CP1255        = 36,  // Logical Hebrew Microsoft
	  ISO_8859_8_I       = 37,  // Iso Hebrew Logical
	  HEBREW_VISUAL      = 38,  // Iso Hebrew Visual

	  //----------------------------------------------------------
	  // Detected by research langid
	  CZECH_CP852        = 39,
	  CZECH_CSN_369103   = 40,  // aka ISO_IR_139 aka KOI8_CS
	  MSFT_CP1253        = 41,  // used for Greek
	  RUSSIAN_CP866      = 42,

	  //----------------------------------------------------------
	  HZ_ENCODING        = 43,
	  ISO2022_CN         = 44,
	  ISO2022_KR         = 45,

	  // Always keep NUM_ENCODINGS at the end.  It is not a valid Encoding enum,
	  // it is only used to indicate the total number of Encodings, so it is
	  // deliberately left without an explicit value.
	  NUM_ENCODINGS
	};

	// Number of enumerators in Language and in Encoding, not counting the NUM_*
	// sentinels themselves.  Taken from those sentinels, which works because both
	// enums start at zero and keep the sentinel last.
	const int kNumLanguages = NUM_LANGUAGES;
	const int kNumEncodings = NUM_ENCODINGS;

	#endif // OMAHA_BASE_LANG_ENC_H_