| // Copyright (c) 2009 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "encodings/compact_lang_det/compact_lang_det.h" |
| #include "encodings/compact_lang_det/compact_lang_det_impl.h" |
| #include "encodings/compact_lang_det/win/cld_basictypes.h" |
| |
// Version text reported by CompactLangDet::DetectLanguageVersion().
// String is "code_version - data_scrape_date".
// Declared as a const array rather than a pointer so that the file-local
// symbol itself cannot be reseated to point at different text.
static const char kDetectLanguageVersion[] = "V1.6 - 20081121";
| |
| // Large-table version for all ~160 languages (all Tiers) |
| |
| // Scan interchange-valid UTF-8 bytes and detect most likely language |
| Language CompactLangDet::DetectLanguage( |
| const DetectionTables* tables, |
| const char* buffer, |
| int buffer_length, |
| bool is_plain_text, |
| bool* is_reliable) { |
| bool allow_extended_lang = false; |
| Language language3[3]; |
| int percent3[3]; |
| double normalized_score3[3]; |
| int text_bytes; |
| int flags = 0; |
| Language plus_one = UNKNOWN_LANGUAGE; |
| const char* tld_hint = ""; |
| int encoding_hint = UNKNOWN_ENCODING; |
| Language language_hint = UNKNOWN_LANGUAGE; |
| |
| Language lang = CompactLangDetImpl::DetectLanguageSummaryV25( |
| tables, |
| buffer, |
| buffer_length, |
| is_plain_text, |
| tld_hint, // "id" boosts Indonesian |
| encoding_hint, // SJS boosts Japanese |
| language_hint, // ITALIAN boosts it |
| allow_extended_lang, |
| flags, |
| plus_one, |
| language3, |
| percent3, |
| normalized_score3, |
| &text_bytes, |
| is_reliable); |
| // Default to English. |
| if (lang == UNKNOWN_LANGUAGE) { |
| lang = ENGLISH; |
| } |
| return lang; |
| } |
| |
| // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. |
| Language CompactLangDet::DetectLanguageSummary( |
| const DetectionTables* tables, |
| const char* buffer, |
| int buffer_length, |
| bool is_plain_text, |
| Language* language3, |
| int* percent3, |
| int* text_bytes, |
| bool* is_reliable) { |
| double normalized_score3[3]; |
| bool allow_extended_lang = false; |
| int flags = 0; |
| Language plus_one = UNKNOWN_LANGUAGE; |
| const char* tld_hint = ""; |
| int encoding_hint = UNKNOWN_ENCODING; |
| Language language_hint = UNKNOWN_LANGUAGE; |
| |
| Language lang = CompactLangDetImpl::DetectLanguageSummaryV25( |
| tables, |
| buffer, |
| buffer_length, |
| is_plain_text, |
| tld_hint, // "id" boosts Indonesian |
| encoding_hint, // SJS boosts Japanese |
| language_hint, // ITALIAN boosts it |
| allow_extended_lang, |
| flags, |
| plus_one, |
| language3, |
| percent3, |
| normalized_score3, |
| text_bytes, |
| is_reliable); |
| // Default to English |
| if (lang == UNKNOWN_LANGUAGE) { |
| lang = ENGLISH; |
| } |
| return lang; |
| } |
| |
| // Same as above, with hints supplied |
| // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. |
| Language CompactLangDet::DetectLanguageSummary( |
| const DetectionTables* tables, |
| const char* buffer, |
| int buffer_length, |
| bool is_plain_text, |
| const char* tld_hint, // "id" boosts Indonesian |
| int encoding_hint, // SJS boosts Japanese |
| Language language_hint, // ITALIAN boosts it |
| Language* language3, |
| int* percent3, |
| int* text_bytes, |
| bool* is_reliable) { |
| double normalized_score3[3]; |
| bool allow_extended_lang = false; |
| int flags = 0; |
| Language plus_one = UNKNOWN_LANGUAGE; |
| |
| Language lang = CompactLangDetImpl::DetectLanguageSummaryV25( |
| tables, |
| buffer, |
| buffer_length, |
| is_plain_text, |
| tld_hint, // "id" boosts Indonesian |
| encoding_hint, // SJS boosts Japanese |
| language_hint, // ITALIAN boosts it |
| allow_extended_lang, |
| flags, |
| plus_one, |
| language3, |
| percent3, |
| normalized_score3, |
| text_bytes, |
| is_reliable); |
| // Default to English |
| if (lang == UNKNOWN_LANGUAGE) { |
| lang = ENGLISH; |
| } |
| return lang; |
| } |
| |
| |
| // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended |
| // languages. |
| // Extended languages are additional Google interface languages and Unicode |
| // single-language scripts, from ext_lang_enc.h |
| Language CompactLangDet::ExtDetectLanguageSummary( |
| const DetectionTables* tables, |
| const char* buffer, |
| int buffer_length, |
| bool is_plain_text, |
| Language* language3, |
| int* percent3, |
| int* text_bytes, |
| bool* is_reliable) { |
| double normalized_score3[3]; |
| bool allow_extended_lang = true; |
| int flags = 0; |
| Language plus_one = UNKNOWN_LANGUAGE; |
| const char* tld_hint = ""; |
| int encoding_hint = UNKNOWN_ENCODING; |
| Language language_hint = UNKNOWN_LANGUAGE; |
| |
| Language lang = CompactLangDetImpl::DetectLanguageSummaryV25( |
| tables, |
| buffer, |
| buffer_length, |
| is_plain_text, |
| tld_hint, // "id" boosts Indonesian |
| encoding_hint, // SJS boosts Japanese |
| language_hint, // ITALIAN boosts it |
| allow_extended_lang, |
| flags, |
| plus_one, |
| language3, |
| percent3, |
| normalized_score3, |
| text_bytes, |
| is_reliable); |
| // Do not default to English |
| return lang; |
| } |
| |
| // Same as above, with hints supplied |
| // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended |
| // languages. |
| // Extended languages are additional Google interface languages and Unicode |
| // single-language scripts, from ext_lang_enc.h |
| Language CompactLangDet::ExtDetectLanguageSummary( |
| const DetectionTables* tables, |
| const char* buffer, |
| int buffer_length, |
| bool is_plain_text, |
| const char* tld_hint, // "id" boosts Indonesian |
| int encoding_hint, // SJS boosts Japanese |
| Language language_hint, // ITALIAN boosts it |
| Language* language3, |
| int* percent3, |
| int* text_bytes, |
| bool* is_reliable) { |
| double normalized_score3[3]; |
| bool allow_extended_lang = true; |
| int flags = 0; |
| Language plus_one = UNKNOWN_LANGUAGE; |
| |
| Language lang = CompactLangDetImpl::DetectLanguageSummaryV25( |
| tables, |
| buffer, |
| buffer_length, |
| is_plain_text, |
| tld_hint, // "id" boosts Indonesian |
| encoding_hint, // SJS boosts Japanese |
| language_hint, // ITALIAN boosts it |
| allow_extended_lang, |
| flags, |
| plus_one, |
| language3, |
| percent3, |
| normalized_score3, |
| text_bytes, |
| is_reliable); |
| // Do not default to English |
| return lang; |
| } |
| |
| // Same as above, and also returns internal language scores as a ratio to |
| // normal score for real text in that language. Scores close to 1.0 indicate |
| // normal text, while scores far away from 1.0 indicate badly-skewed text or |
| // gibberish |
| // |
| Language CompactLangDet::ExtDetectLanguageSummary( |
| const DetectionTables* tables, |
| const char* buffer, |
| int buffer_length, |
| bool is_plain_text, |
| const char* tld_hint, // "id" boosts Indonesian |
| int encoding_hint, // SJS boosts Japanese |
| Language language_hint, // ITALIAN boosts it |
| Language* language3, |
| int* percent3, |
| double* normalized_score3, |
| int* text_bytes, |
| bool* is_reliable) { |
| bool allow_extended_lang = true; |
| int flags = 0; |
| Language plus_one = UNKNOWN_LANGUAGE; |
| |
| Language lang = CompactLangDetImpl::DetectLanguageSummaryV25( |
| tables, |
| buffer, |
| buffer_length, |
| is_plain_text, |
| tld_hint, // "id" boosts Indonesian |
| encoding_hint, // SJS boosts Japanese |
| language_hint, // ITALIAN boosts it |
| allow_extended_lang, |
| flags, |
| plus_one, |
| language3, |
| percent3, |
| normalized_score3, |
| text_bytes, |
| is_reliable); |
| // Do not default to English |
| return lang; |
| } |
| |
| |
| |
| // Return version text string |
| // String is "code_version - data_scrape_date" |
| const char* CompactLangDet::DetectLanguageVersion() { |
| return kDetectLanguageVersion; |
| } |
| |