// third_party/cld/encodings/lang_enc.h - chromium/src - Git at Google

 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 // This file is for i18n. It contains two enums, namely Language and
 // Encoding, where Language is the linguistic convention, and Encoding
 // contains information on both language encoding and character set.
 //
 // The language and encoding are both based on Teragram's conventions,
 // except for some common ISO-8859 encodings that are not detected by
 // Teragram but might be in the future.
 //
 // This file also includes functions that do mappings among
 // Language/Encoding enums, language/encoding string names (typically
 // the output from Language Encoding identifier), and language codes
 // (iso 639), and two-letter country codes (iso 3166)
 //
 // NOTE: Both Language and Encoding enums should always start from
 // zero value. This assumption has been made and used.
 //

 #ifndef ENCODINGS_LANG_ENC_H__
 #define ENCODINGS_LANG_ENC_H__

 #include "languages/public/languages.h"
 #include "encodings/public/encodings.h"


 // EncodingsForLanguage
 // --------------------
 //
 // Given the language, returns a pointer to an array of the encodings
 // this language supports. Typically, the array has at least one
 // element: UNKNOWN_ENCODING, which is always the last element of the
 // array. The first encoding is the default encoding of the language.
 // Returns NULL if the input is invalid.
 //
 // Note: The returned array does not include ASCII_7BIT, UTF8 or
 // UNICODE, which are good for all languages.
 // TODO: Find out whether it is better to include ASCII_7BIT, UTF8 and
 // UNICODE or to leave them as special cases.
 //
 const Encoding* EncodingsForLanguage(Language lang);


 // DefaultEncodingForLanguage
 // --------------------------
 //
 // Given the language, returns the default encoding for that language
 // via the output argument `encoding`.
 //
 // Returns true if the input lang is valid. Otherwise, returns false
 // and sets *encoding to UNKNOWN_ENCODING.
 //
 bool DefaultEncodingForLanguage(Language lang,
                                 Encoding *encoding);

 // LanguagesForEncoding
 // --------------------
 //
 // Given the encoding, returns a pointer to an array of the languages
 // this encoding supports. Typically, the array has at least one
 // element: UNKNOWN_LANGUAGE, which is always the last element of the
 // array. The first language in the array is the most popular
 // language for that encoding. Returns NULL if the input is invalid.
 //
 // Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
 // UNKNOWN_LANGUAGE are returned.
 // TODO: Find out whether to return all the languages or to treat
 // these three encodings as special cases.
 //
 // For other known encodings, ENGLISH is always included. This is
 // because English (Latin) characters are included in each encoding.
 //
 const Language* LanguagesForEncoding(Encoding enc);

 // DefaultLanguageForEncoding
 // --------------------------
 //
 // Given the encoding, returns the default language for that encoding
 // via the output argument `language`.
 //
 // Returns true if the input enc is valid. Otherwise, returns false
 // and sets *language to UNKNOWN_LANGUAGE.
 //
 // Note: this function is most useful for encodings that have only
 // one corresponding language, e.g. shift_jis => Japanese. There are
 // cases where multiple languages share the same encoding; for those
 // the default language is an arbitrary choice among them.
 //
 bool DefaultLanguageForEncoding(Encoding enc, Language* language);

 //
 // IsLangEncCompatible
 // -------------------
 //
 // Determines whether the input language and encoding are compatible.
 // For example, FRENCH and LATIN1 are compatible, but FRENCH and GB
 // are not.
 //
 // If either lang or enc is invalid, returns false.
 // If lang is unknown, returns true.
 //    (e.g. we can detect a page's encoding as Latin1 from metatag info,
 //     but cannot derive its language, since more than one language is
 //     encoded in Latin1)
 // If lang is known but enc is unknown, returns false.
 //    (returning true would do us no good, since we cannot convert to
 //     UTF8 anyway)
 // If enc is unicode or utf8, returns true.
 // Otherwise, checks whether lang is supported by enc and enc is
 // supported by lang.
 //
 bool IsLangEncCompatible(Language lang, Encoding enc);

 //
 // DominantLanguageFromEncoding
 // ----------------------------
 //
 // Determines whether there exists a dominant language for the input
 // encoding, and returns it. For example, the encoding GB has a
 // dominant language (Chinese), but Latin1 does not.
 //
 // The word "dominant" is used here because English characters are
 // included in each encoding.
 //
 // If there is no dominant language for the encoding, such as Latin1,
 // UNKNOWN_LANGUAGE is returned.
 //
 Language DominantLanguageFromEncoding(Encoding enc);

 // LanguageCode
 // ------------
 //
 // Given the Language and Encoding, returns the language code,
 // including dialect where applicable (>= 2 letters). The encoding is
 // needed to disambiguate between Simplified and Traditional Chinese.
 //
 // See the note on Chinese Language Codes in
 // i18n/languages/public/languages.h
 // for the details.

 const char* LanguageCode(Language lang, Encoding enc);

 //
 // IsEncodingWithSupportedLanguage()
 // ---------------------------------
 //
 // There are some encodings listed here just because they are commonly
 // used.  There is no interface language for them yet. They are not
 // detected by Teragram, but can be detected from the meta info of the
 // HTML page.
 //
 // For example, we list ARABIC_ENCODING but there is no Arabic in
 // the Language enum. If the user inputs an Arabic query from the
 // Google main page, Netscape will just send the raw bytes to GWS, and
 // GWS will treat them as Latin1.  Therefore, there is no use in
 // detecting ARABIC_ENCODING for indexing, since such pages will never
 // match the queries, which are treated as Latin1 by GWS. On the
 // contrary, if we treat a page with ARABIC_ENCODING as
 // UNKNOWN_ENCODING, Google will let it fall through as Latin1 at
 // indexing time, and there might be a match for some Arabic queries,
 // which are also treated as Latin1 by GWS. In fact, some people are
 // relying on this feature to do Arabic searches.
 //
 // Thus, for these types of encodings, until we have UI support for
 // their languages and reasonably comprehensive language/encoding
 // identification quality, it is better to revert them to
 // UNKNOWN_ENCODING.
 //
 // This function checks whether the input encoding is one with
 // an interface language.
 bool IsEncodingWithSupportedLanguage(Encoding enc);


 //
 // LangsFromCountryCode and EncFromCountryCode
 // -------------------------------------------
 //
 // These two functions return the possible languages and encodings,
 // respectively, according to the input country code, which is a
 // 2-letter string. The country code is usually specified in the url
 // of a document.
 //
 //

 // LangsFromCountryCode
 // --------------------
 //
 // This function takes a string of arbitrary length. It treats the
 // first 2 bytes of the string as the country code, as defined in ISO
 // 3166-1993 (E).  It returns, via the output arguments lang_arry and
 // num_langs, an array of the languages that are popular in that
 // country, roughly in order of popularity, together with the size of
 // the array.
 //
 // Returns true if we have language information for country_code.
 // Otherwise, returns false.
 //
 bool LangsFromCountryCode(const char* country_code,
                           const Language** lang_arry,
                           int* num_langs);


 //
 // EncFromCountryCode
 // ------------------
 //
 // This function takes a string of arbitrary length. It treats the
 // first 2 bytes of that string as the country code, as defined in ISO
 // 3166-1993 (E). It sets *enc to the encoding that is most often used
 // for the languages spoken in that country.
 //
 // Returns true if we have encoding information for country_code.
 // Otherwise, returns false and sets *enc to UNKNOWN_ENCODING.
 //
 bool EncFromCountryCode(const char* country_code, Encoding* enc);


 // VisualType
 // ----------
 //
 // Text in right-to-left scripts can be stored in either logical or
 // visual order. Visual-order documents are converted to logical order
 // before processing, and this enum names the kinds of visual document
 // we can encounter. Only some Hebrew/Arabic/Persian etc. documents are
 // visual; the rest of them, along with every document in a non-RTL
 // language, are NOT_VISUAL_DOCUMENT.
 enum VisualType {
   NOT_VISUAL_DOCUMENT = 0,
   // HTML documents in the legacy visual order.
   VISUAL_HEBREW_HTML = 1,
   // Converted RTL PDFs, which are always visual.
   CONVERTED_RTL_PDF = 2
 };

 // default_visualtype
 // ------------------
 //
 // Returns the default VisualType. The actual value is chosen by the
 // implementation; presumably NOT_VISUAL_DOCUMENT -- TODO confirm
 // against lang_enc.cc.
 VisualType default_visualtype();

 // VisualTypeName
 // --------------
 //
 // Given the visual type, returns a string name for it, useful for
 // debug output.
 const char* VisualTypeName(VisualType visualtype);


 // InitLangEnc
 // -----------
 //
 // Ensures the LangEnc module has been initialized.  Normally this
 // happens during InitGoogle, but this function gives access to
 // scripts that don't support InitGoogle. InitLangEnc calls
 // InitEncodings (see i18n/encodings/public/encodings.h) and also
 // initializes the data structures used in lang_enc.cc.
 //
 void InitLangEnc();

 #endif  // ENCODINGS_LANG_ENC_H__
	// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	// This file is for i18n. It contains two enums, namely Language and
	// Encoding, where Language is the linguistic convention, and Encoding
	// contains information on both language encoding and character set.
	//
	// The language and encoding are both based on Teragram's conventions,
	// except for some common ISO-8859 encodings that are not detected by
	// Teragram but might be in the future.
	//
	// This file also includes functions that do mappings among
	// Language/Encoding enums, language/encoding string names (typically
	// the output from Language Encoding identifier), and language codes
	// (iso 639), and two-letter country codes (iso 3166)
	//
	// NOTE: Both Language and Encoding enums should always start from
	// zero value. This assumption has been made and used.
	//

	#ifndef ENCODINGS_LANG_ENC_H__
	#define ENCODINGS_LANG_ENC_H__

	#include "languages/public/languages.h"
	#include "encodings/public/encodings.h"


	// EncodingsForLanguage
	// --------------------
	//
	// Given the language, returns a pointer to an array of the encodings
	// this language supports. Typically, the array has at least one
	// element: UNKNOWN_ENCODING, which is always the last element of the
	// array. The first encoding is the default encoding of the language.
	// Returns NULL if the input is invalid.
	//
	// Note: The returned array does not include ASCII_7BIT, UTF8 or
	// UNICODE, which are good for all languages.
	// TODO: Find out whether it is better to include ASCII_7BIT, UTF8 and
	// UNICODE or to leave them as special cases.
	//
	const Encoding* EncodingsForLanguage(Language lang);


	// DefaultEncodingForLanguage
	// --------------------------
	//
	// Given the language, returns the default encoding for that language
	// via the output argument `encoding`.
	//
	// Returns true if the input lang is valid. Otherwise, returns false
	// and sets *encoding to UNKNOWN_ENCODING.
	//
	bool DefaultEncodingForLanguage(Language lang,
	Encoding *encoding);

	// LanguagesForEncoding
	// --------------------
	//
	// Given the encoding, returns a pointer to an array of the languages
	// this encoding supports. Typically, the array has at least one
	// element: UNKNOWN_LANGUAGE, which is always the last element of the
	// array. The first language in the array is the most popular
	// language for that encoding. Returns NULL if the input is invalid.
	//
	// Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
	// UNKNOWN_LANGUAGE are returned.
	// TODO: Find out whether to return all the languages or to treat
	// these three encodings as special cases.
	//
	// For other known encodings, ENGLISH is always included. This is
	// because English (Latin) characters are included in each encoding.
	//
	const Language* LanguagesForEncoding(Encoding enc);

	// DefaultLanguageForEncoding
	// --------------------------
	//
	// Given the encoding, returns the default language for that encoding
	// via the output argument `language`.
	//
	// Returns true if the input enc is valid. Otherwise, returns false
	// and sets *language to UNKNOWN_LANGUAGE.
	//
	// Note: this function is most useful for encodings that have only
	// one corresponding language, e.g. shift_jis => Japanese. There are
	// cases where multiple languages share the same encoding; for those
	// the default language is an arbitrary choice among them.
	//
	bool DefaultLanguageForEncoding(Encoding enc, Language* language);

	//
	// IsLangEncCompatible
	// -------------------
	//
	// Determines whether the input language and encoding are compatible.
	// For example, FRENCH and LATIN1 are compatible, but FRENCH and GB
	// are not.
	//
	// If either lang or enc is invalid, returns false.
	// If lang is unknown, returns true.
	// (e.g. we can detect a page's encoding as Latin1 from metatag info,
	// but cannot derive its language, since more than one language is
	// encoded in Latin1)
	// If lang is known but enc is unknown, returns false.
	// (returning true would do us no good, since we cannot convert to
	// UTF8 anyway)
	// If enc is unicode or utf8, returns true.
	// Otherwise, checks whether lang is supported by enc and enc is
	// supported by lang.
	//
	bool IsLangEncCompatible(Language lang, Encoding enc);

	//
	// DominantLanguageFromEncoding
	// ----------------------------
	//
	// Determines whether there exists a dominant language for the input
	// encoding, and returns it. For example, the encoding GB has a
	// dominant language (Chinese), but Latin1 does not.
	//
	// The word "dominant" is used here because English characters are
	// included in each encoding.
	//
	// If there is no dominant language for the encoding, such as Latin1,
	// UNKNOWN_LANGUAGE is returned.
	//
	Language DominantLanguageFromEncoding(Encoding enc);

	// LanguageCode
	// ------------
	//
	// Given the Language and Encoding, returns the language code,
	// including dialect where applicable (>= 2 letters). The encoding is
	// needed to disambiguate between Simplified and Traditional Chinese.
	//
	// See the note on Chinese Language Codes in
	// i18n/languages/public/languages.h
	// for the details.

	const char* LanguageCode(Language lang, Encoding enc);

	//
	// IsEncodingWithSupportedLanguage()
	// ---------------------------------
	//
	// There are some encodings listed here just because they are commonly
	// used. There is no interface language for them yet. They are not
	// detected by Teragram, but can be detected from the meta info of the
	// HTML page.
	//
	// For example, we list ARABIC_ENCODING but there is no Arabic in
	// the Language enum. If the user inputs an Arabic query from the
	// Google main page, Netscape will just send the raw bytes to GWS, and
	// GWS will treat them as Latin1. Therefore, there is no use in
	// detecting ARABIC_ENCODING for indexing, since such pages will never
	// match the queries, which are treated as Latin1 by GWS. On the
	// contrary, if we treat a page with ARABIC_ENCODING as
	// UNKNOWN_ENCODING, Google will let it fall through as Latin1 at
	// indexing time, and there might be a match for some Arabic queries,
	// which are also treated as Latin1 by GWS. In fact, some people are
	// relying on this feature to do Arabic searches.
	//
	// Thus, for these types of encodings, until we have UI support for
	// their languages and reasonably comprehensive language/encoding
	// identification quality, it is better to revert them to
	// UNKNOWN_ENCODING.
	//
	// This function checks whether the input encoding is one with
	// an interface language.
	bool IsEncodingWithSupportedLanguage(Encoding enc);


	//
	// LangsFromCountryCode and EncFromCountryCode
	// -------------------------------------------
	//
	// These two functions return the possible languages and encodings,
	// respectively, according to the input country code, which is a
	// 2-letter string. The country code is usually specified in the url
	// of a document.
	//
	//

	// LangsFromCountryCode
	// --------------------
	//
	// This function takes a string of arbitrary length. It treats the
	// first 2 bytes of the string as the country code, as defined in ISO
	// 3166-1993 (E). It returns, via the output arguments lang_arry and
	// num_langs, an array of the languages that are popular in that
	// country, roughly in order of popularity, together with the size of
	// the array.
	//
	// Returns true if we have language information for country_code.
	// Otherwise, returns false.
	//
	bool LangsFromCountryCode(const char* country_code,
	const Language** lang_arry,
	int* num_langs);


	//
	// EncFromCountryCode
	// ------------------
	//
	// This function takes a string of arbitrary length. It treats the
	// first 2 bytes of that string as the country code, as defined in ISO
	// 3166-1993 (E). It sets *enc to the encoding that is most often used
	// for the languages spoken in that country.
	//
	// Returns true if we have encoding information for country_code.
	// Otherwise, returns false and sets *enc to UNKNOWN_ENCODING.
	//
	bool EncFromCountryCode(const char* country_code, Encoding* enc);



	// VisualType
	// ----------
	//
	// Text in right-to-left scripts can be stored in either logical or
	// visual order. Visual-order documents are converted to logical order
	// before processing, and this enum names the kinds of visual document
	// we can encounter. Only some Hebrew/Arabic/Persian etc. documents are
	// visual; the rest of them, along with every document in a non-RTL
	// language, are NOT_VISUAL_DOCUMENT.
	enum VisualType {
		NOT_VISUAL_DOCUMENT = 0,
		// HTML documents in the legacy visual order.
		VISUAL_HEBREW_HTML = 1,
		// Converted RTL PDFs, which are always visual.
		CONVERTED_RTL_PDF = 2
	};

	// default_visualtype
	// ------------------
	//
	// Returns the default VisualType. The actual value is chosen by the
	// implementation; presumably NOT_VISUAL_DOCUMENT -- TODO confirm
	// against lang_enc.cc.
	VisualType default_visualtype();

	// VisualTypeName
	// --------------
	//
	// Given the visual type, returns a string name for it, useful for
	// debug output.
	const char* VisualTypeName(VisualType visualtype);



	// InitLangEnc
	// -----------
	//
	// Ensures the LangEnc module has been initialized. Normally this
	// happens during InitGoogle, but this function gives access to
	// scripts that don't support InitGoogle. InitLangEnc calls
	// InitEncodings (see i18n/encodings/public/encodings.h) and also
	// initializes the data structures used in lang_enc.cc.
	//
	void InitLangEnc();

	#endif // ENCODINGS_LANG_ENC_H__