| // Copyright 2013 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // |
| // Author: dsites@google.com (Dick Sites) |
| // Updated 2014.01 for dual table lookup |
| // |
| |
| #include <stdio.h> |
| #include <string.h> |
| #include <string> |
| #include <vector> |
| |
| #include "cldutil.h" |
| #include "debug.h" |
| #include "integral_types.h" |
| #include "lang_script.h" |
| #include "utf8statetable.h" |
| |
| #ifdef CLD2_DYNAMIC_MODE |
| #include "cld2_dynamic_data.h" |
| #include "cld2_dynamic_data_loader.h" |
| #endif |
| #include "cld2tablesummary.h" |
| #include "compact_lang_det_impl.h" |
| #include "compact_lang_det_hint_code.h" |
| #include "getonescriptspan.h" |
| #include "tote.h" |
| |
| |
| namespace CLD2 { |
| |
| using namespace std; |
| |
| // Linker supplies the right tables, from files |
| // cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc |
| // cld_generated_cjk_delta_bi_32.cc generated_distinct_bi_0.cc |
| // cld2_generated_quad*.cc cld2_generated_deltaocta*.cc |
| // cld2_generated_distinctocta*.cc |
| // cld_generated_score_quad_octa_1024_256.cc |
| |
| // 2014.01 Now implementing quadgram dual lookup tables, to allow main table |
| // sizes that are 1/3/5 times a power of two, instead of just powers of two. |
| // Gives more flexibility of total footprint for CLD2. |
| |
// Table sizes defined in another translation unit (presumably element counts,
// going by the names -- TODO confirm against the defining file).
| extern const int kLanguageToPLangSize; |
| extern const int kCloseSetSize; |
| |
// Generated scoring tables, defined in separately compiled data files and
// resolved at link time. In the non-dynamic build their addresses are stored
// in kScoringtables; in CLD2_DYNAMIC_MODE kScoringtables is filled at runtime
// by the load functions instead.
| extern const UTF8PropObj cld_generated_CjkUni_obj; |
| extern const CLD2TableSummary kCjkCompat_obj; |
| extern const CLD2TableSummary kCjkDeltaBi_obj; |
| extern const CLD2TableSummary kDistinctBiTable_obj; |
| extern const CLD2TableSummary kQuad_obj; |
| extern const CLD2TableSummary kQuad_obj2; // Dual lookup tables |
| extern const CLD2TableSummary kDeltaOcta_obj; |
| extern const CLD2TableSummary kDistinctOcta_obj; |
| extern const short kAvgDeltaOctaScore[]; |
| |
| #ifdef CLD2_DYNAMIC_MODE |
| // CLD2_DYNAMIC_MODE is defined: |
| // Data will be read from an mmap opened at runtime. |
| |
| // Convenience for nulling things out completely at any point. |
// All-NULL ScoringTables value. Each initializer's trailing comment names the
// table that the statically linked build would place in that position.
| static ScoringTables NULL_TABLES = { |
| NULL, //&cld_generated_CjkUni_obj, |
| NULL, //&kCjkCompat_obj, |
| NULL, //&kCjkDeltaBi_obj, |
| NULL, //&kDistinctBiTable_obj, |
| NULL, //&kQuad_obj, |
| NULL, //&kQuad_obj2, |
| NULL, //&kDeltaOcta_obj, |
| NULL, //&kDistinctOcta_obj, |
| NULL, //kAvgDeltaOctaScore, |
| }; |
// Active tables. Starts out all-NULL; the load functions copy the loaded
// tables into it and unloadData() resets it to NULL_TABLES.
| static ScoringTables kScoringtables = NULL_TABLES; // copy constructed |
// True between a load call and the next unloadData().
| static bool dynamicDataLoaded = false; |
// True when the current data came from loadDataFromFile(); unloadData() uses
// it to pick the matching unload routine.
| static bool dataSourceIsFile = false; |
// Pointer returned by the CLD2DynamicDataLoader load call.
| static ScoringTables* dynamicTables = NULL; |
// Out-parameters of CLD2DynamicDataLoader::loadDataFile(), handed back to
// unloadDataFile() on unload.
| static void* mmapAddress = NULL; |
| static int mmapLength = 0; |
| |
| bool isDataLoaded() { return dynamicDataLoaded; } |
| |
| void loadDataFromFile(const char* fileName) { |
| if (isDataLoaded()) { |
| unloadData(); |
| } |
| dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength); |
| kScoringtables = *dynamicTables; |
| dataSourceIsFile = true; |
| dynamicDataLoaded = true; |
| }; |
| |
| void loadDataFromRawAddress(const void* rawAddress, const int length) { |
| if (isDataLoaded()) { |
| unloadData(); |
| } |
| dynamicTables = CLD2DynamicDataLoader::loadDataRaw(rawAddress, length); |
| kScoringtables = *dynamicTables; |
| dataSourceIsFile = false; |
| dynamicDataLoaded = true; |
| } |
| |
| void unloadData() { |
| if (!dynamicDataLoaded) return; |
| if (dataSourceIsFile) { |
| CLD2DynamicDataLoader::unloadDataFile(&dynamicTables, &mmapAddress, &mmapLength); |
| } else { |
| CLD2DynamicDataLoader::unloadDataRaw(&dynamicTables); |
| } |
| dynamicDataLoaded = false; |
| dataSourceIsFile = false; // vacuous |
| kScoringtables = NULL_TABLES; // Housekeeping: null all pointers |
| } |
| #else |
| // This initializes kScoringtables.quadgram_obj etc. |
// Statically linked build: the active tables are fixed at compile time and
// point at the generated table objects declared extern earlier in this file.
// Initializers are positional, so their order must match the member order of
// ScoringTables.
| static const ScoringTables kScoringtables = { |
| &cld_generated_CjkUni_obj, |
| &kCjkCompat_obj, |
| &kCjkDeltaBi_obj, |
| &kDistinctBiTable_obj, |
| |
| &kQuad_obj, |
| &kQuad_obj2, // Dual lookup tables |
| &kDeltaOcta_obj, |
| &kDistinctOcta_obj, |
| |
| kAvgDeltaOctaScore, |
| }; |
| #endif // #ifdef CLD2_DYNAMIC_MODE |
| |
| |
// Fixed compile-time settings. NOTE(review): the FLAGS_ prefix suggests these
// were once runtime command-line flags -- confirm before relying on that.
| static const bool FLAGS_cld_no_minimum_bytes = false; |
| static const bool FLAGS_cld_forcewords = true; |
// When true, CheapRepWordsInplace and CheapSqueezeInplace write visible
// markers into the text at the points where they delete input.
| static const bool FLAGS_cld_showme = false; |
| static const bool FLAGS_cld_echotext = true; |
| static const int32 FLAGS_cld_textlimit = 160; |
| static const int32 FLAGS_cld_smoothwidth = 20; |
| static const bool FLAGS_cld_2011_hints = true; |
| static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8; |
| |
| static const bool FLAGS_dbgscore = false; |
| |
| |
| static const int kLangHintInitial = 12; // Boost language by N initially |
| static const int kLangHintBoost = 12; // Boost language by N/16 per quadgram |
| |
| static const int kShortSpanThresh = 32; // Bytes |
| static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans |
| |
| static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing |
| // after this many text bytes |
| static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger sqz |
| static const int kSpacesTriggerPercent = 25; // Trigger sqz if >=25% spaces |
| static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted |
| |
// Per-chunk settings used by CheapSqueezeInplace: kChunksizeDefault is the
// chunk length used when the caller passes 0, and each percentage below is
// scaled by the chunk length to get a byte-count threshold.
| static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks |
| static const int kSpacesThreshPercent = 25; // Squeeze if >=25% spaces |
| static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted |
| |
// Upper bound on how many bytes BackscanToSpace / ForwardscanToSpace examine.
| static const int kMaxSpaceScan = 32; // Bytes |
| |
| static const int kGoodLang1Percent = 70; |
| static const int kGoodLang1and2Percent = 93; |
| static const int kShortTextThresh = 256; // Bytes |
| |
| static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads |
| static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads |
| |
| static const int kDefaultWordSpan = 256; // Scan at least this many initial |
| // bytes with word scoring |
| static const int kReallyBigWordSpan = 9999999; // Forces word scoring all text |
| |
| static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable |
| |
// Number of int entries in the cheap predictor's table. The predictor masks
// its hash with 0xfff, so the table needs exactly 4096 slots.
| static const int kPredictionTableSize = 4096; // Must be exactly 4096 for |
| // cheap compressor |
| |
| static const int kNonEnBoilerplateMinPercent = 17; // <this => no second |
| static const int kNonFIGSBoilerplateMinPercent = 20; // <this => no second |
| static const int kGoodFirstMinPercent = 26; // <this => UNK |
| static const int kGoodFirstReliableMinPercent = 51; // <this => unreli |
| static const int kIgnoreMaxPercent = 20; // >this => unreli |
| static const int kKeepMinPercent = 2; // <this => unreli |
| |
| |
| |
| // Statistically closest language, based on quadgram table |
| // Those that are far from other languages map to UNKNOWN_LANGUAGE |
| // Subscripted by Language |
| // |
| // From lang_correlation.txt and hand-edits |
| // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/ |
| // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE, |
| // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt |
| // |
// Correlation cut-off, in percent, applied to every row of
// kClosestAltLanguage: a row names its alternate language only when its
// coefficient is at least this value.
| static const int kMinCorrPercent = 24; // Pick off how close you want |
| // 24 catches PERSIAN <== ARABIC |
| // but not SPANISH <== PORTUGUESE |
// Stand-in used by kClosestAltLanguage rows that have no candidate language
// (those rows carry a coefficient of 0).
| static Language Unknown = UNKNOWN_LANGUAGE; |
| |
| // Suspect idea |
| // Subscripted by Language |
// Closest alternate language for each Language. Every row has the form
//   (coef >= kMinCorrPercent) ? closest : UNKNOWN_LANGUAGE,  // row language
// so a row yields its alternate only when its correlation percentage reaches
// kMinCorrPercent; all other rows evaluate to UNKNOWN_LANGUAGE.
// The trailing comment on each row names the Language that row belongs to.
// Rows are positional: adding, removing or reordering one shifts every later
// entry. NOTE(review): the size check against NUM_LANGUAGES that follows this
// table is commented out, so the row count is not verified at compile time.
| static const Language kClosestAltLanguage[] = { |
| (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH |
| (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH |
| (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH |
| (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH |
| (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH |
| (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN |
| (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW |
| (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean |
| (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN |
| ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH |
| (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE |
| (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN |
| (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE, // SPANISH |
| (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese |
| (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK |
| (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC |
| ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN |
| ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN |
| ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN |
| ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN |
| (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown |
| (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN |
| (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH |
| (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN |
| ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG |
| (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH |
| (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN |
| (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI |
| (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN |
| (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI |
| (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // INDONESIAN |
| ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN |
| (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM |
| ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH |
| ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TELUGU |
| ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL |
| (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN |
| (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE |
| (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN |
| (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU |
| (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI |
| (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC |
| (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN |
| ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO |
| ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE |
| ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI |
| (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC |
| ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI |
| (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE, // SLOVENIAN |
| (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI |
| ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE |
| ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE |
| (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN |
| (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK |
| // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT |
| (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT |
| (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE |
| (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE |
| (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK |
| ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC |
| (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN |
| ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA |
| (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE |
| (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B |
| (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA |
| (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU |
| ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI |
| (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO |
| ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN |
| ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ |
| ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON |
| ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI |
| (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH |
| (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN |
| (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // SOMALI |
| ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR |
| (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH |
| ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN |
| ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI |
| (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE |
| (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS |
| (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH |
| ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // BURMESE |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER |
| (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC |
| ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA |
| (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE |
| (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN |
| ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE |
| ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH |
| ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA |
| (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN |
| (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO |
| ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA |
| ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA |
| (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK |
| (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE, // TATAR |
| (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA |
| ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER |
| ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI |
| ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF |
| ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN |
| ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR |
| ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE, // AYMARA |
| (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR |
| ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA |
| (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA |
| ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN |
| ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC |
| ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA |
| ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE |
| ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT |
| ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI |
| (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA |
| ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY |
| (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU |
| (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO |
| (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI |
| (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // SAMOAN |
| ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO |
| (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT |
| (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT |
| ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA |
| (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA |
| ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG |
| ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI |
| (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS |
| (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA |
| ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN |
| |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // AKAN |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // IGBO |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MAURITIAN_CREOLE |
| ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // HAWAIIAN |
| }; |
| |
| // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES, |
| // kClosestAltLanguage_has_incorrect_size); |
| |
| |
| inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;} |
| inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;} |
| inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;} |
| inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;} |
| inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;} |
| inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;} |
| inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;} |
| |
| |
| // Defines Top40 packed languages |
| |
| // Google top 40 languages |
| // |
| // Tier 0/1 Language enum list (16) |
| // ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS |
| // DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN, |
| // PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI, |
| // ARABIC, |
| // |
| // Tier 2 Language enum list (22) |
| // SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN, |
| // HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN, |
| // VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK, |
| // TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN, |
| // UKRAINIAN, HINDI, |
| // |
| // use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21) |
| // |
| // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40 |
| |
| |
| void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) { |
| // REVISIT |
| } |
| |
| void PrintText(FILE* f, Language cur_lang, const string& temp) { |
| if (temp.size() == 0) {return;} |
| fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str()); |
| } |
| |
| |
| //------------------------------------------------------------------------------ |
| // For --cld_html debugging output. Not thread safe |
| //------------------------------------------------------------------------------ |
// File-level mutable state for the debugging output; shared by all callers,
// hence not thread safe.
// NOTE(review): neither variable is referenced in the part of the file
// reviewed here -- presumably they remember the language and reliability of
// the previously printed span; confirm against their users.
| static Language prior_lang = UNKNOWN_LANGUAGE; |
| static bool prior_unreliable = false; |
| |
| //------------------------------------------------------------------------------ |
| // End For --cld_html debugging output |
| //------------------------------------------------------------------------------ |
| |
| |
| // Backscan to word boundary, returning how many bytes n to go back |
| // so that src - n is non-space ans src - n - 1 is space. |
| // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary |
| int BackscanToSpace(const char* src, int limit) { |
| int n = 0; |
| limit = minint(limit, kMaxSpaceScan); |
| while (n < limit) { |
| if (src[-n - 1] == ' ') {return n;} // We are at _X |
| ++n; |
| } |
| n = 0; |
| while (n < limit) { |
| if ((src[-n] & 0xc0) != 0x80) {return n;} // We are at char begin |
| ++n; |
| } |
| return 0; |
| } |
| |
| // Forwardscan to word boundary, returning how many bytes n to go forward |
| // so that src + n is non-space ans src + n - 1 is space. |
| // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary |
| int ForwardscanToSpace(const char* src, int limit) { |
| int n = 0; |
| limit = minint(limit, kMaxSpaceScan); |
| while (n < limit) { |
| if (src[n] == ' ') {return n + 1;} // We are at _X |
| ++n; |
| } |
| n = 0; |
| while (n < limit) { |
| if ((src[n] & 0xc0) != 0x80) {return n;} // We are at char begin |
| ++n; |
| } |
| return 0; |
| } |
| |
| |
// This uses a cheap predictor to get a measure of compression, and
// hence a measure of repetitiveness. It works on complete UTF-8 characters
// instead of bytes, because three-byte UTF-8 Indic, etc. text compresses
// highly all the time when done with a byte-based count.
//
// To allow running prediction across multiple chunks, caller passes in the
// current 12-bit hash value (*hash) and an int[4096] prediction table (tbl).
// Caller inits these to 0. Both are updated: tbl gets the latest character
// seen for each hash, and *hash is left ready for the next chunk.
//
// Returns the number of *bytes* correctly predicted; the count goes up by
// 1..4 for each correctly-predicted character.
//
// NOTE: Overruns by up to three bytes when the last lead byte promises more
// bytes than src_len provides. Not a problem with valid UTF-8 text.
//
// Each character's bytes are packed big-endian into one value. The packing is
// done in unsigned arithmetic because a four-byte lead (>= 0xf0) shifted left
// by 24 does not fit in a signed int.
//
// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* const srclimit = src + src_len;
  int local_hash = *hash;
  int p_count = 0;

  while (src < srclimit) {
    const unsigned int lead = src[0];
    unsigned int packed = lead;
    int incr = 1;

    // Pick up one char and length
    if (lead < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      // Do nothing more
    } else if ((lead & 0xe0) == 0xc0) {
      // Two-byte
      packed = (lead << 8) | src[1];
      incr = 2;
    } else if ((lead & 0xf0) == 0xe0) {
      // Three-byte
      packed = (lead << 16) | (static_cast<unsigned int>(src[1]) << 8) | src[2];
      incr = 3;
    } else {
      // Four-byte
      packed = (lead << 24) | (static_cast<unsigned int>(src[1]) << 16) |
               (static_cast<unsigned int>(src[2]) << 8) | src[3];
      incr = 4;
    }
    src += incr;

    // The table stores ints, so keep the same bit pattern as an int.
    const int c = static_cast<int>(packed);

    const int predicted = tbl[local_hash];  // Prediction
    tbl[local_hash] = c;                    // Update prediction
    if (c == predicted) {
      p_count += incr;                      // Count bytes of good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }
  *hash = local_hash;
  return p_count;
}
| |
| |
| |
// Counts the space characters in the first (src_len rounded down to a
// multiple of four) bytes of src. The zero to three odd bytes at the end are
// not examined. Returns 0 when src_len is less than four.
int CountSpaces4(const char* src, int src_len) {
  const int scan_len = src_len & ~3;   // Drop the odd bytes at the end
  int spaces = 0;
  for (int k = 0; k < scan_len; ++k) {
    if (src[k] == ' ') {
      ++spaces;
    }
  }
  return spaces;
}
| |
| |
| // Remove words of text that have more than half their letters predicted |
| // correctly by our cheap predictor, moving the remaining words in-place |
| // to the front of the input buffer. |
| // |
| // To allow running prediction across multiple chunks, caller passes in current |
| // 12-bit hash value and int[4096] prediction table. Caller inits these to 0. |
| // |
| // Return the new, possibly-shorter length |
| // |
| // Result Buffer ALWAYS has leading space and trailing space space space NUL, |
| // if input does |
| // |
| int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) { |
| const uint8* src = reinterpret_cast<const uint8*>(isrc); |
| const uint8* srclimit = src + src_len; |
| char* dst = isrc; |
| int local_hash = *hash; |
| char* word_dst = dst; // Start of next word |
| int good_predict_bytes = 0; |
| int word_length_bytes = 0; |
| |
| while (src < srclimit) { |
| int c = src[0]; |
| int incr = 1; |
| *dst++ = c; |
| |
| if (c == ' ') { |
| if ((good_predict_bytes * 2) > word_length_bytes) { |
| // Word is well-predicted: backup to start of this word |
| dst = word_dst; |
| if (FLAGS_cld_showme) { |
| // Mark the deletion point with period |
| // Don't repeat multiple periods |
| // Cannot mark with more bytes or may overwrite unseen input |
| if ((isrc < (dst - 2)) && (dst[-2] != '.')) { |
| *dst++ = '.'; |
| *dst++ = ' '; |
| } |
| } |
| } |
| word_dst = dst; // Start of next word |
| good_predict_bytes = 0; |
| word_length_bytes = 0; |
| } |
| |
| // Pick up one char and length |
| if (c < 0xc0) { |
| // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx |
| // Do nothing more |
| } else if ((c & 0xe0) == 0xc0) { |
| // Two-byte |
| *dst++ = src[1]; |
| c = (c << 8) | src[1]; |
| incr = 2; |
| } else if ((c & 0xf0) == 0xe0) { |
| // Three-byte |
| *dst++ = src[1]; |
| *dst++ = src[2]; |
| c = (c << 16) | (src[1] << 8) | src[2]; |
| incr = 3; |
| } else { |
| // Four-byte |
| *dst++ = src[1]; |
| *dst++ = src[2]; |
| *dst++ = src[3]; |
| c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3]; |
| incr = 4; |
| } |
| src += incr; |
| word_length_bytes += incr; |
| |
| int p = tbl[local_hash]; // Prediction |
| tbl[local_hash] = c; // Update prediction |
| if (c == p) { |
| good_predict_bytes += incr; // Count good predictions |
| } |
| |
| local_hash = ((local_hash << 4) ^ c) & 0xfff; |
| } |
| |
| *hash = local_hash; |
| |
| if ((dst - isrc) < (src_len - 3)) { |
| // Pad and make last char clean UTF-8 by putting following spaces |
| dst[0] = ' '; |
| dst[1] = ' '; |
| dst[2] = ' '; |
| dst[3] = '\0'; |
| } else if ((dst - isrc) < src_len) { |
| // Make last char clean UTF-8 by putting following space off the end |
| dst[0] = ' '; |
| } |
| |
| return static_cast<int>(dst - isrc); |
| } |
| |
| |
// This alternate form overwrites redundant words instead of removing them,
// thus avoiding corrupting the backmap used to generate a vector of
// original-text ranges.
//
// A word whose bytes are more than half predicted by the cheap predictor has
// every byte before its ending space replaced with '.'; nothing is moved, so
// the text keeps its length and byte offsets.
//
// To allow running prediction across multiple chunks, caller passes in the
// current 12-bit hash value (*hash) and an int[4096] prediction table (tbl).
// Caller inits these to 0. Both are updated.
//
// Returns the number of bytes written, measured from isrc.
//
// Each character's bytes are packed big-endian into one value. The packing is
// done in unsigned arithmetic because a four-byte lead (>= 0xf0) shifted left
// by 24 does not fit in a signed int.
//
int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* const srclimit = src + src_len;
  char* dst = isrc;
  int local_hash = *hash;
  char* word_dst = dst;           // Start of next word
  int good_predict_bytes = 0;
  int word_length_bytes = 0;

  while (src < srclimit) {
    const unsigned int lead = src[0];
    unsigned int packed = lead;
    int incr = 1;
    *dst++ = static_cast<char>(lead);

    if (lead == ' ') {
      if ((good_predict_bytes * 2) > word_length_bytes) {
        // Word [word_dst..dst-1) is well-predicted: overwrite
        for (char* fill = word_dst; fill < dst - 1; ++fill) {*fill = '.';}
      }
      word_dst = dst;              // Start of next word
      good_predict_bytes = 0;
      word_length_bytes = 0;
    }

    // Pick up one char and length, copying any trailing bytes to the output
    if (lead < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      // Do nothing more
    } else if ((lead & 0xe0) == 0xc0) {
      // Two-byte
      *dst++ = src[1];
      packed = (lead << 8) | src[1];
      incr = 2;
    } else if ((lead & 0xf0) == 0xe0) {
      // Three-byte
      *dst++ = src[1];
      *dst++ = src[2];
      packed = (lead << 16) | (static_cast<unsigned int>(src[1]) << 8) | src[2];
      incr = 3;
    } else {
      // Four-byte
      *dst++ = src[1];
      *dst++ = src[2];
      *dst++ = src[3];
      packed = (lead << 24) | (static_cast<unsigned int>(src[1]) << 16) |
               (static_cast<unsigned int>(src[2]) << 8) | src[3];
      incr = 4;
    }
    src += incr;
    word_length_bytes += incr;

    // The table stores ints, so keep the same bit pattern as an int.
    const int c = static_cast<int>(packed);

    const int predicted = tbl[local_hash];  // Prediction
    tbl[local_hash] = c;                    // Update prediction
    if (c == predicted) {
      good_predict_bytes += incr;           // Count good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }

  *hash = local_hash;

  if ((dst - isrc) < (src_len - 3)) {
    // Pad and make last char clean UTF-8 by putting following spaces
    dst[0] = ' ';
    dst[1] = ' ';
    dst[2] = ' ';
    dst[3] = '\0';
  } else if ((dst - isrc) < src_len) {
    // Make last char clean UTF-8 by putting following space off the end
    dst[0] = ' ';
  }

  return static_cast<int>(dst - isrc);
}
| |
| |
| // Remove portions of text that have a high density of spaces, or that are |
| // overly repetitive, squeezing the remaining text in-place to the front of the |
| // input buffer. |
| // |
| // Squeezing looks at density of space/prediced chars in fixed-size chunks, |
| // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes. |
| // |
| // Return the new, possibly-shorter length |
| // |
| // Result Buffer ALWAYS has leading space and trailing space space space NUL, |
| // if input does |
| // |
| int CheapSqueezeInplace(char* isrc, |
| int src_len, |
| int ichunksize) { |
| char* src = isrc; |
| char* dst = src; |
| char* srclimit = src + src_len; |
| bool skipping = false; |
| |
| int hash = 0; |
| // Allocate local prediction table. |
| int* predict_tbl = new int[kPredictionTableSize]; |
| memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); |
| |
| int chunksize = ichunksize; |
| if (chunksize == 0) {chunksize = kChunksizeDefault;} |
| int space_thresh = (chunksize * kSpacesThreshPercent) / 100; |
| int predict_thresh = (chunksize * kPredictThreshPercent) / 100; |
| |
| while (src < srclimit) { |
| int remaining_bytes = srclimit - src; |
| int len = minint(chunksize, remaining_bytes); |
| // Make len land us on a UTF-8 character boundary. |
| // Ah. Also fixes mispredict because we could get out of phase |
| // Loop always terminates at trailing space in buffer |
| while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes |
| |
| int space_n = CountSpaces4(src, len); |
| int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); |
| if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { |
| // Skip the text |
| if (!skipping) { |
| // Keeping-to-skipping transition; do it at a space |
| int n = BackscanToSpace(dst, static_cast<int>(dst - isrc)); |
| dst -= n; |
| if (dst == isrc) { |
| // Force a leading space if the first chunk is deleted |
| *dst++ = ' '; |
| } |
| if (FLAGS_cld_showme) { |
| // Mark the deletion point with black square U+25A0 |
| *dst++ = static_cast<unsigned char>(0xe2); |
| *dst++ = static_cast<unsigned char>(0x96); |
| *dst++ = static_cast<unsigned char>(0xa0); |
| *dst++ = ' '; |
| } |
| skipping = true; |
| } |
| } else { |
| // Keep the text |
| if (skipping) { |
| // Skipping-to-keeping transition; do it at a space |
| int n = ForwardscanToSpace(src, len); |
| src += n; |
| remaining_bytes -= n; // Shrink remaining length |
| len -= n; |
| skipping = false; |
| } |
| // "len" can be negative in some cases |
| if (len > 0) { |
| memmove(dst, src, len); |
| dst += len; |
| } |
| } |
| src += len; |
| } |
| |
| if ((dst - isrc) < (src_len - 3)) { |
| // Pad and make last char clean UTF-8 by putting following spaces |
| dst[0] = ' '; |
| dst[1] = ' '; |
| dst[2] = ' '; |
| dst[3] = '\0'; |
| } else if ((dst - isrc) < src_len) { |
| // Make last char clean UTF-8 by putting following space off the end |
| dst[0] = ' '; |
| } |
| |
| // Deallocate local prediction table |
| delete[] predict_tbl; |
| return static_cast<int>(dst - isrc); |
| } |
| |
| // This alternate form overwrites redundant words, thus avoiding corrupting the |
| // backmap for generate a vector of original-text ranges. |
| int CheapSqueezeInplaceOverwrite(char* isrc, |
| int src_len, |
| int ichunksize) { |
| char* src = isrc; |
| char* dst = src; |
| char* srclimit = src + src_len; |
| bool skipping = false; |
| |
| int hash = 0; |
| // Allocate local prediction table. |
| int* predict_tbl = new int[kPredictionTableSize]; |
| memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); |
| |
| int chunksize = ichunksize; |
| if (chunksize == 0) {chunksize = kChunksizeDefault;} |
| int space_thresh = (chunksize * kSpacesThreshPercent) / 100; |
| int predict_thresh = (chunksize * kPredictThreshPercent) / 100; |
| |
| // Always keep first byte (space) |
| ++src; |
| ++dst; |
| while (src < srclimit) { |
| int remaining_bytes = srclimit - src; |
| int len = minint(chunksize, remaining_bytes); |
| // Make len land us on a UTF-8 character boundary. |
| // Ah. Also fixes mispredict because we could get out of phase |
| // Loop always terminates at trailing space in buffer |
| while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes |
| |
| int space_n = CountSpaces4(src, len); |
| int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); |
| if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { |
| // Overwrite the text [dst-n..dst) |
| if (!skipping) { |
| // Keeping-to-skipping transition; do it at a space |
| int n = BackscanToSpace(dst, static_cast<int>(dst - isrc)); |
| // Text [word_dst..dst) is well-predicted: overwrite |
| for (char* p = dst - n; p < dst; ++p) {*p = '.';} |
| skipping = true; |
| } |
| // Overwrite the text [dst..dst+len) |
| for (char* p = dst; p < dst + len; ++p) {*p = '.';} |
| dst[len - 1] = ' '; // Space at end so we can see what is happening |
| } else { |
| // Keep the text |
| if (skipping) { |
| // Skipping-to-keeping transition; do it at a space |
| int n = ForwardscanToSpace(src, len); |
| // Text [dst..dst+n) is well-predicted: overwrite |
| for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';} |
| skipping = false; |
| } |
| } |
| dst += len; |
| src += len; |
| } |
| |
| if ((dst - isrc) < (src_len - 3)) { |
| // Pad and make last char clean UTF-8 by putting following spaces |
| dst[0] = ' '; |
| dst[1] = ' '; |
| dst[2] = ' '; |
| dst[3] = '\0'; |
| } else if ((dst - isrc) < src_len) { |
| // Make last char clean UTF-8 by putting following space off the end |
| dst[0] = ' '; |
| } |
| |
| // Deallocate local prediction table |
| delete[] predict_tbl; |
| return static_cast<int>(dst - isrc); |
| } |
| |
| // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input |
| // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096 |
| // Just CountSpaces is about 340 MB/sec |
| // Byte-only CountPredictedBytes is about 150 MB/sec |
| // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec |
| // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c |
| // Unjammed byte-only both = 170 MB/sec |
| // Jammed byte-only both = 120 MB/sec |
| // Back to original w/slight updates, 110 MB/sec |
| // |
| bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) { |
| // Don't trigger at all on short text |
| if (src_len < testsize) {return false;} |
| int space_thresh = (testsize * kSpacesTriggerPercent) / 100; |
| int predict_thresh = (testsize * kPredictTriggerPercent) / 100; |
| int hash = 0; |
| // Allocate local prediction table. |
| int* predict_tbl = new int[kPredictionTableSize]; |
| memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); |
| |
| bool retval = false; |
| if ((CountSpaces4(src, testsize) >= space_thresh) || |
| (CountPredictedBytes(src, testsize, &hash, predict_tbl) >= |
| predict_thresh)) { |
| retval = true; |
| } |
| // Deallocate local prediction table |
| delete[] predict_tbl; |
| return retval; |
| } |
| |
| |
| |
| |
| // Delete any extended languages from doc_tote |
| void RemoveExtendedLanguages(DocTote* doc_tote) { |
| // Now a nop |
| } |
| |
| static const int kMinReliableKeepPercent = 41; // Remove lang if reli < this |
| |
| // For Tier3 languages, require a minimum number of bytes to be first-place lang |
| static const int kGoodFirstT3MinBytes = 24; // <this => no first |
| |
| // Move bytes for unreliable langs to another lang or UNKNOWN |
| // doc_tote is sorted, so cannot Add |
| // |
| // If both CHINESE and CHINESET are present and unreliable, do not delete both; |
| // merge both into CHINESE. |
| // |
| //dsites 2009.03.19 |
| // we also want to remove Tier3 languages as the first lang if there is very |
| // little text like ej1 ej2 ej3 ej4 |
| // maybe fold this back in earlier |
| // |
| void RemoveUnreliableLanguages(DocTote* doc_tote, |
| bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { |
| // Prepass to merge some low-reliablility languages |
| // TODO: this shouldn't really reach in to the internal structure of doc_tote |
| int total_bytes = 0; |
| for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { |
| int plang = doc_tote->Key(sub); |
| if (plang == DocTote::kUnusedKey) {continue;} // Empty slot |
| |
| Language lang = static_cast<Language>(plang); |
| int bytes = doc_tote->Value(sub); |
| int reli = doc_tote->Reliability(sub); |
| if (bytes == 0) {continue;} // Zero bytes |
| total_bytes += bytes; |
| |
| // Reliable percent = stored reliable score over stored bytecount |
| int reliable_percent = reli / bytes; |
| if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper |
| |
| // This language is too unreliable to keep, but we might merge it. |
| Language altlang = UNKNOWN_LANGUAGE; |
| if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];} |
| if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative |
| |
| // Look for alternative in doc_tote |
| int altsub = doc_tote->Find(altlang); |
| if (altsub < 0) {continue;} // No alternative text |
| |
| int bytes2 = doc_tote->Value(altsub); |
| int reli2 = doc_tote->Reliability(altsub); |
| if (bytes2 == 0) {continue;} // Zero bytes |
| |
| // Reliable percent is stored reliable score over stored bytecount |
| int reliable_percent2 = reli2 / bytes2; |
| |
| // Merge one language into the other. Break ties toward lower lang # |
| int tosub = altsub; |
| int fromsub = sub; |
| bool into_lang = false; |
| if ((reliable_percent2 < reliable_percent) || |
| ((reliable_percent2 == reliable_percent) && (lang < altlang))) { |
| tosub = sub; |
| fromsub = altsub; |
| into_lang = true; |
| } |
| |
| // Make sure merged reliability doesn't drop and is enough to avoid delete |
| int newpercent = maxint(reliable_percent, reliable_percent2); |
| newpercent = maxint(newpercent, kMinReliableKeepPercent); |
| int newbytes = bytes + bytes2; |
| int newreli = newpercent * newbytes; |
| |
| doc_tote->SetKey(fromsub, DocTote::kUnusedKey); |
| doc_tote->SetScore(fromsub, 0); |
| doc_tote->SetReliability(fromsub, 0); |
| doc_tote->SetScore(tosub, newbytes); |
| doc_tote->SetReliability(tosub, newreli); |
| |
| // Show fate of unreliable languages if at least 10 bytes |
| if (FLAGS_cld2_html && (newbytes >= 10) && |
| !FLAGS_cld2_quiet) { |
| if (into_lang) { |
| fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ", |
| LanguageCode(altlang), reliable_percent2, bytes2, |
| LanguageCode(lang)); |
| } else { |
| fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ", |
| LanguageCode(lang), reliable_percent, bytes, |
| LanguageCode(altlang)); |
| } |
| } |
| } |
| |
| |
| // Pass to delete any remaining unreliable languages |
| for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { |
| int plang = doc_tote->Key(sub); |
| if (plang == DocTote::kUnusedKey) {continue;} // Empty slot |
| |
| Language lang = static_cast<Language>(plang); |
| int bytes = doc_tote->Value(sub); |
| int reli = doc_tote->Reliability(sub); |
| if (bytes == 0) {continue;} // Zero bytes |
| |
| // Reliable percent is stored as reliable score over stored bytecount |
| int reliable_percent = reli / bytes; |
| if (reliable_percent >= kMinReliableKeepPercent) { // Keeper? |
| continue; // yes |
| } |
| |
| // Delete unreliable entry |
| doc_tote->SetKey(sub, DocTote::kUnusedKey); |
| doc_tote->SetScore(sub, 0); |
| doc_tote->SetReliability(sub, 0); |
| |
| // Show fate of unreliable languages if at least 10 bytes |
| if (FLAGS_cld2_html && (bytes >= 10) && |
| !FLAGS_cld2_quiet) { |
| fprintf(stderr, "{Unreli %s.%dR,%dB} ", |
| LanguageCode(lang), reliable_percent, bytes); |
| } |
| } |
| |
| ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");} |
| } |
| |
| |
| // Move all the text bytes from lower byte-count to higher one |
| void MoveLang1ToLang2(Language lang1, Language lang2, |
| int lang1_sub, int lang2_sub, |
| DocTote* doc_tote, |
| ResultChunkVector* resultchunkvector) { |
| // In doc_tote, move all the bytes lang1 => lang2 |
| int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub); |
| doc_tote->SetValue(lang2_sub, sum); |
| sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub); |
| doc_tote->SetScore(lang2_sub, sum); |
| sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub); |
| doc_tote->SetReliability(lang2_sub, sum); |
| |
| // Delete old entry |
| doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey); |
| doc_tote->SetScore(lang1_sub, 0); |
| doc_tote->SetReliability(lang1_sub, 0); |
| |
| // In resultchunkvector, move all the bytes lang1 => lang2 |
| if (resultchunkvector == NULL) {return;} |
| |
| int k = 0; |
| uint16 prior_lang = UNKNOWN_LANGUAGE; |
| for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) { |
| ResultChunk* rc = &(*resultchunkvector)[i]; |
| if (rc->lang1 == lang1) { |
| // Update entry[i] lang1 => lang2 |
| rc->lang1 = lang2; |
| } |
| // One change may produce two merges -- entry before and entry after |
| if ((rc->lang1 == prior_lang) && (k > 0)) { |
| // Merge with previous, deleting entry[i] |
| ResultChunk* prior_rc = &(*resultchunkvector)[k - 1]; |
| prior_rc->bytes += rc->bytes; |
| // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1); |
| } else { |
| // Keep entry[i] |
| (*resultchunkvector)[k] = (*resultchunkvector)[i]; |
| // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k); |
| ++k; |
| } |
| prior_lang = rc->lang1; |
| } |
| resultchunkvector->resize(k); |
| } |
| |
| |
| |
| // Move less likely byte count to more likely for close pairs of languages |
| // If given, also update resultchunkvector |
| void RefineScoredClosePairs(DocTote* doc_tote, |
| ResultChunkVector* resultchunkvector, |
| bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { |
| for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { |
| int close_packedlang = doc_tote->Key(sub); |
| int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang)); |
| if (subscr == 0) {continue;} |
| |
| // We have a close pair language -- if the other one is also scored and the |
| // longword score differs enough, put all our eggs into one basket |
| |
| // Nonzero longword score: Go look for the other of this pair |
| for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) { |
| if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) { |
| // We have a matching pair |
| int close_packedlang2 = doc_tote->Key(sub2); |
| |
| // Move all the text bytes from lower byte-count to higher one |
| int from_sub, to_sub; |
| Language from_lang, to_lang; |
| if (doc_tote->Value(sub) < doc_tote->Value(sub2)) { |
| from_sub = sub; |
| to_sub = sub2; |
| from_lang = static_cast<Language>(close_packedlang); |
| to_lang = static_cast<Language>(close_packedlang2); |
| } else { |
| from_sub = sub2; |
| to_sub = sub; |
| from_lang = static_cast<Language>(close_packedlang2); |
| to_lang = static_cast<Language>(close_packedlang); |
| } |
| |
| if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) { |
| // Show fate of closepair language |
| int val = doc_tote->Value(from_sub); // byte count |
| int reli = doc_tote->Reliability(from_sub); |
| int reliable_percent = reli / (val ? val : 1); // avoid zdiv |
| fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n", |
| LanguageCode(from_lang), |
| reliable_percent, |
| doc_tote->Value(from_sub), |
| LanguageCode(to_lang)); |
| } |
| MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub, |
| doc_tote, resultchunkvector); |
| break; // Exit inner for sub2 loop |
| } |
| } // End for sub2 |
| } // End for sub |
| } |
| |
| |
| void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams, |
| uint8* lang_hint_boost) { |
| } |
| |
| |
| void PrintHtmlEscapedText(FILE* f, const char* txt, int len) { |
| string temp(txt, len); |
| fprintf(f, "%s", GetHtmlEscapedText(temp).c_str()); |
| } |
| |
| void PrintLang(FILE* f, Tote* chunk_tote, |
| Language cur_lang, bool cur_unreliable, |
| Language prior_lang, bool prior_unreliable) { |
| if (cur_lang == prior_lang) { |
| fprintf(f, "[]"); |
| } else { |
| fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : ""); |
| } |
| } |
| |
| |
| void PrintTopLang(Language top_lang) { |
| if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { |
| fprintf(stderr, "[] "); |
| } else { |
| fprintf(stderr, "[%s] ", LanguageName(top_lang)); |
| prior_lang = top_lang; |
| } |
| } |
| |
| void PrintTopLangSpeculative(Language top_lang) { |
| fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0); |
| if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { |
| fprintf(stderr, "[] "); |
| } else { |
| fprintf(stderr, "[%s] ", LanguageName(top_lang)); |
| prior_lang = top_lang; |
| } |
| fprintf(stderr, "</span>\n"); |
| } |
| |
| void PrintLangs(FILE* f, const Language* language3, const int* percent3, |
| const int* text_bytes, const bool* is_reliable) { |
| fprintf(f, "<br> Initial_Languages "); |
| if (language3[0] != UNKNOWN_LANGUAGE) { |
| fprintf(f, "%s%s(%d%%) ", |
| LanguageName(language3[0]), |
| *is_reliable ? "" : "*", |
| percent3[0]); |
| } |
| if (language3[1] != UNKNOWN_LANGUAGE) { |
| fprintf(f, "%s(%d%%) ", LanguageName(language3[1]), percent3[1]); |
| } |
| if (language3[2] != UNKNOWN_LANGUAGE) { |
| fprintf(f, "%s(%d%%) ", LanguageName(language3[2]), percent3[2]); |
| } |
| fprintf(f, "%d bytes \n", *text_bytes); |
| |
| fprintf(f, "<br>\n"); |
| } |
| |
| |
| // Return internal probability score (sum) per 1024 bytes |
| double GetNormalizedScore(Language lang, ULScript ulscript, |
| int bytecount, int score) { |
| if (bytecount <= 0) {return 0.0;} |
| return (score << 10) / bytecount; |
| } |
| |
| // Extract return values before fixups |
| void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes, |
| int* reliable_percent3, Language* language3, int* percent3, |
| double* normalized_score3, |
| int* text_bytes, bool* is_reliable) { |
| reliable_percent3[0] = 0; |
| reliable_percent3[1] = 0; |
| reliable_percent3[2] = 0; |
| language3[0] = UNKNOWN_LANGUAGE; |
| language3[1] = UNKNOWN_LANGUAGE; |
| language3[2] = UNKNOWN_LANGUAGE; |
| percent3[0] = 0; |
| percent3[1] = 0; |
| percent3[2] = 0; |
| normalized_score3[0] = 0.0; |
| normalized_score3[1] = 0.0; |
| normalized_score3[2] = 0.0; |
| |
| *text_bytes = total_text_bytes; |
| *is_reliable = false; |
| |
| int bytecount1 = 0; |
| int bytecount2 = 0; |
| int bytecount3 = 0; |
| |
| int lang1 = doc_tote->Key(0); |
| if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) { |
| // We have a top language |
| language3[0] = static_cast<Language>(lang1); |
| bytecount1 = doc_tote->Value(0); |
| int reli1 = doc_tote->Reliability(0); |
| reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv |
| normalized_score3[0] = GetNormalizedScore(language3[0], |
| ULScript_Common, |
| bytecount1, |
| doc_tote->Score(0)); |
| } |
| |
| int lang2 = doc_tote->Key(1); |
| if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) { |
| language3[1] = static_cast<Language>(lang2); |
| bytecount2 = doc_tote->Value(1); |
| int reli2 = doc_tote->Reliability(1); |
| reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv |
| normalized_score3[1] = GetNormalizedScore(language3[1], |
| ULScript_Common, |
| bytecount2, |
| doc_tote->Score(1)); |
| } |
| |
| int lang3 = doc_tote->Key(2); |
| if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) { |
| language3[2] = static_cast<Language>(lang3); |
| bytecount3 = doc_tote->Value(2); |
| int reli3 = doc_tote->Reliability(2); |
| reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv |
| normalized_score3[2] = GetNormalizedScore(language3[2], |
| ULScript_Common, |
| bytecount3, |
| doc_tote->Score(2)); |
| } |
| |
| // Increase total bytes to sum (top 3) if low for some reason |
| int total_bytecount12 = bytecount1 + bytecount2; |
| int total_bytecount123 = total_bytecount12 + bytecount3; |
| if (total_text_bytes < total_bytecount123) { |
| total_text_bytes = total_bytecount123; |
| *text_bytes = total_text_bytes; |
| } |
| |
| // Sum minus previous % gives better roundoff behavior than bytecount/total |
| int total_text_bytes_div = maxint(1, total_text_bytes); // Avoid zdiv |
| percent3[0] = (bytecount1 * 100) / total_text_bytes_div; |
| percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div; |
| percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div; |
| percent3[2] -= percent3[1]; |
| percent3[1] -= percent3[0]; |
| |
| // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2% |
| // Fix this explicitly |
| if (percent3[1] < percent3[2]) { |
| ++percent3[1]; |
| --percent3[2]; |
| } |
| if (percent3[0] < percent3[1]) { |
| ++percent3[0]; |
| --percent3[1]; |
| } |
| |
| *text_bytes = total_text_bytes; |
| |
| if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) { |
| // We have a top language |
| // Its reliability is overall result reliability |
| int bytecount = doc_tote->Value(0); |
| int reli = doc_tote->Reliability(0); |
| int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv |
| *is_reliable = (reliable_percent >= kMinReliableKeepPercent); |
| } else { |
| // No top language at all. This can happen with zero text or 100% Klingon |
| // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable. |
| *is_reliable = false; |
| } |
| |
| // If ignore percent is too large, set unreliable. |
| int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]); |
| if ((ignore_percent > kIgnoreMaxPercent)) { |
| *is_reliable = false; |
| } |
| } |
| |
| bool IsFIGS(Language lang) { |
| if (lang == FRENCH) {return true;} |
| if (lang == ITALIAN) {return true;} |
| if (lang == GERMAN) {return true;} |
| if (lang == SPANISH) {return true;} |
| return false; |
| } |
| |
| bool IsEFIGS(Language lang) { |
| if (lang == ENGLISH) {return true;} |
| if (lang == FRENCH) {return true;} |
| if (lang == ITALIAN) {return true;} |
| if (lang == GERMAN) {return true;} |
| if (lang == SPANISH) {return true;} |
| return false; |
| } |
| |
| // For Tier3 languages, require more bytes of text to override |
| // the first-place language |
| static const int kGoodSecondT1T2MinBytes = 15; // <this => no second |
| static const int kGoodSecondT3MinBytes = 128; // <this => no second |
| |
| // Calculate a single summary language for the document, and its reliability. |
| // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE |
| // This is the heart of matching human-rater perception. |
| // reliable_percent3[] is currently unused |
| // |
| // Do not return Tier3 second language unless there are at least 128 bytes |
| void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes, |
| const int* reliable_percent3, |
| const Language* language3, |
| const int* percent3, |
| Language* summary_lang, bool* is_reliable, |
| bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { |
| // Vector of active languages; changes if we delete some |
| int slot_count = 3; |
| int active_slot[3] = {0, 1, 2}; |
| |
| int ignore_percent = 0; |
| int return_percent = percent3[0]; // Default to top lang |
| *summary_lang = language3[0]; |
| *is_reliable = true; |
| if (percent3[0] < kKeepMinPercent) {*is_reliable = false;} |
| |
| // If any of top 3 is IGNORE, remove it and increment ignore_percent |
| for (int i = 0; i < 3; ++i) { |
| if (language3[i] == TG_UNKNOWN_LANGUAGE) { |
| ignore_percent += percent3[i]; |
| // Move the rest up, levaing input vectors unchanged |
| for (int j=i+1; j < 3; ++j) { |
| active_slot[j - 1] = active_slot[j]; |
| } |
| -- slot_count; |
| // Logically remove Ignore from percentage-text calculation |
| // (extra 1 in 101 avoids zdiv, biases slightly small) |
| return_percent = (percent3[0] * 100) / (101 - ignore_percent); |
| *summary_lang = language3[active_slot[0]]; |
| if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;} |
| } |
| } |
| |
| |
| // If English and X, where X (not UNK) is big enough, |
| // assume the English is boilerplate and return X. |
| // Logically remove English from percentage-text calculation |
| int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100; |
| // Require more bytes of text for Tier3 languages |
| int minbytesneeded = kGoodSecondT1T2MinBytes; |
| int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]); |
| |
| if ((language3[active_slot[0]] == ENGLISH) && |
| (language3[active_slot[1]] != ENGLISH) && |
| (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && |
| (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) && |
| (second_bytes >= minbytesneeded)) { |
| ignore_percent += percent3[active_slot[0]]; |
| return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); |
| *summary_lang = language3[active_slot[1]]; |
| if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} |
| |
| // Else If FIGS and X, where X (not UNK, EFIGS) is big enough, |
| // assume the FIGS is boilerplate and return X. |
| // Logically remove FIGS from percentage-text calculation |
| } else if (IsFIGS(language3[active_slot[0]]) && |
| !IsEFIGS(language3[active_slot[1]]) && |
| (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && |
| (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) && |
| (second_bytes >= minbytesneeded)) { |
| ignore_percent += percent3[active_slot[0]]; |
| return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); |
| *summary_lang = language3[active_slot[1]]; |
| if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} |
| |
| // Else we are returning the first language, but want to improve its |
| // return_percent if the second language should be ignored |
| } else if ((language3[active_slot[1]] == ENGLISH) && |
| (language3[active_slot[0]] != ENGLISH)) { |
| ignore_percent += percent3[active_slot[1]]; |
| return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); |
| } else if (IsFIGS(language3[active_slot[1]]) && |
| !IsEFIGS(language3[active_slot[0]])) { |
| ignore_percent += percent3[active_slot[1]]; |
| return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); |
| } |
| |
| // If return percent is too small (too many languages), return UNKNOWN |
| if ((return_percent < kGoodFirstMinPercent)) { |
| if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { |
| fprintf(stderr, "{Unreli %s %d%% percent too small} ", |
| LanguageCode(*summary_lang), return_percent); |
| } |
| *summary_lang = UNKNOWN_LANGUAGE; |
| *is_reliable = false; |
| } |
| |
| // If return percent is small, return language but set unreliable. |
| if ((return_percent < kGoodFirstReliableMinPercent)) { |
| *is_reliable = false; |
| } |
| |
| // If ignore percent is too large, set unreliable. |
| ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]); |
| if ((ignore_percent > kIgnoreMaxPercent)) { |
| *is_reliable = false; |
| } |
| |
| // If we removed all the active languages, return UNKNOWN |
| if (slot_count == 0) { |
| if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { |
| fprintf(stderr, "{Unreli %s no languages left} ", |
| LanguageCode(*summary_lang)); |
| } |
| *summary_lang = UNKNOWN_LANGUAGE; |
| *is_reliable = false; |
| } |
| } |
| |
| void AddLangPriorBoost(Language lang, uint32 langprob, |
| ScoringContext* scoringcontext) { |
| // This is called 0..n times with language hints |
| // but we don't know the script -- so boost either or both Latn, Othr. |
| |
| if (IsLatnLanguage(lang)) { |
| LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn; |
| int n = langprior_boost->n; |
| langprior_boost->langprob[n] = langprob; |
| langprior_boost->n = langprior_boost->wrap(n + 1); |
| } |
| |
| if (IsOthrLanguage(lang)) { |
| LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr; |
| int n = langprior_boost->n; |
| langprior_boost->langprob[n] = langprob; |
| langprior_boost->n = langprior_boost->wrap(n + 1); |
| } |
| |
| } |
| |
| void AddOneWhack(Language whacker_lang, Language whackee_lang, |
| ScoringContext* scoringcontext) { |
| uint32 langprob = MakeLangProb(whackee_lang, 1); |
| // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn |
| if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) { |
| LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn; |
| int n = langprior_whack->n; |
| langprior_whack->langprob[n] = langprob; |
| langprior_whack->n = langprior_whack->wrap(n + 1); |
| } |
| if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) { |
| LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr; |
| int n = langprior_whack->n; |
| langprior_whack->langprob[n] = langprob; |
| langprior_whack->n = langprior_whack->wrap(n + 1); |
| } |
| } |
| |
| void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) { |
| // We do not in general want zh-Hans and zh-Hant to be close pairs, |
| // but we do here. |
| if (lang == CLD2::CHINESE) { |
| AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext); |
| return; |
| } |
| if (lang == CLD2::CHINESE_T) { |
| AddOneWhack(lang, CLD2::CHINESE, scoringcontext); |
| return; |
| } |
| |
| int base_lang_set = LanguageCloseSet(lang); |
| if (base_lang_set == 0) {return;} |
| // TODO: add an explicit list of each set to avoid this 512-times loop |
| for (int i = 0; i < kLanguageToPLangSize; ++i) { |
| Language lang2 = static_cast<Language>(i); |
| if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) { |
| AddOneWhack(lang, lang2, scoringcontext); |
| } |
| } |
| } |
| |
| |
| void ApplyHints(const char* buffer, |
| int buffer_length, |
| bool is_plain_text, |
| const CLDHints* cld_hints, |
| ScoringContext* scoringcontext) { |
| CLDLangPriors lang_priors; |
| InitCLDLangPriors(&lang_priors); |
| |
| // We now use lang= tags. |
| // Last look, circa 2008 found only 15% of web pages with lang= tags and |
| // many of those were wrong. Now (July 2011), we find 44% of web pages have |
| // lang= tags, and most of them are correct. So we now give them substantial |
| // weight in each chunk scored. |
| if (!is_plain_text) { |
| // Get any contained language tags in first n KB |
| int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10; |
| string lang_tags = GetLangTagsFromHtml(buffer, buffer_length, |
| max_scan_bytes); |
| SetCLDLangTagsHint(lang_tags, &lang_priors); |
| if (scoringcontext->flags_cld2_html) { |
| if (!lang_tags.empty()) { |
| fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n", |
| lang_tags.c_str()); |
| } |
| } |
| } |
| |
| if (cld_hints != NULL) { |
| if ((cld_hints->content_language_hint != NULL) && |
| (cld_hints->content_language_hint[0] != '\0')) { |
| SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors); |
| } |
| |
| // Input is from GetTLD(), already lowercased |
| if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) { |
| SetCLDTLDHint(cld_hints->tld_hint, &lang_priors); |
| } |
| |
| if (cld_hints->encoding_hint != UNKNOWN_ENCODING) { |
| Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint); |
| SetCLDEncodingHint(enc, &lang_priors); |
| } |
| |
| if (cld_hints->language_hint != UNKNOWN_LANGUAGE) { |
| SetCLDLanguageHint(cld_hints->language_hint, &lang_priors); |
| } |
| } |
| |
| // Keep no more than four different languages with hints |
| TrimCLDLangPriors(4, &lang_priors); |
| |
| if (scoringcontext->flags_cld2_html) { |
| string print_temp = DumpCLDLangPriors(&lang_priors); |
| if (!print_temp.empty()) { |
| fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n", |
| print_temp.c_str()); |
| } |
| } |
| |
| // Put boosts into ScoringContext |
| for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { |
| Language lang = GetCLDPriorLang(lang_priors.prior[i]); |
| int qprob = GetCLDPriorWeight(lang_priors.prior[i]); |
| if (qprob > 0) { |
| uint32 langprob = MakeLangProb(lang, qprob); |
| AddLangPriorBoost(lang, langprob, scoringcontext); |
| } |
| } |
| |
| // Put whacks into scoring context |
| // We do not in general want zh-Hans and zh-Hant to be close pairs, |
| // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant |
| std::vector<int> close_set_count(kCloseSetSize + 1, 0); |
| |
| for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { |
| Language lang = GetCLDPriorLang(lang_priors.prior[i]); |
| ++close_set_count[LanguageCloseSet(lang)]; |
| if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];} |
| if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];} |
| } |
| |
| // If a boost language is in a close set, force suppressing the others in |
| // that set, if exactly one of the set is present |
| for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { |
| Language lang = GetCLDPriorLang(lang_priors.prior[i]); |
| int qprob = GetCLDPriorWeight(lang_priors.prior[i]); |
| if (qprob > 0) { |
| int close_set = LanguageCloseSet(lang); |
| if ((close_set > 0) && (close_set_count[close_set] == 1)) { |
| AddCloseLangWhack(lang, scoringcontext); |
| } |
| if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) && |
| (close_set_count[kCloseSetSize] == 1)) { |
| AddCloseLangWhack(lang, scoringcontext); |
| } |
| } |
| } |
| |
| |
| |
| |
| |
| |
| } |
| |
| |
| |
| // Results language3/percent3/text_bytes must be exactly three items |
| Language DetectLanguageSummaryV2( |
| const char* buffer, |
| int buffer_length, |
| bool is_plain_text, |
| const CLDHints* cld_hints, |
| bool allow_extended_lang, |
| int flags, |
| Language plus_one, |
| Language* language3, |
| int* percent3, |
| double* normalized_score3, |
| ResultChunkVector* resultchunkvector, |
| int* text_bytes, |
| bool* is_reliable) { |
| language3[0] = UNKNOWN_LANGUAGE; |
| language3[1] = UNKNOWN_LANGUAGE; |
| language3[2] = UNKNOWN_LANGUAGE; |
| percent3[0] = 0; |
| percent3[1] = 0; |
| percent3[2] = 0; |
| normalized_score3[0] = 0.0; |
| normalized_score3[1] = 0.0; |
| normalized_score3[2] = 0.0; |
| if (resultchunkvector != NULL) { |
| resultchunkvector->clear(); |
| } |
| *text_bytes = 0; |
| *is_reliable = false; |
| |
| if ((flags & kCLDFlagEcho) != 0) { |
| string temp(buffer, buffer_length); |
| if ((flags & kCLDFlagHtml) != 0) { |
| fprintf(stderr, "CLD2[%d] '%s'<br>\n", |
| buffer_length, GetHtmlEscapedText(temp).c_str()); |
| } else { |
| fprintf(stderr, "CLD2[%d] '%s'\n", |
| buffer_length, GetPlainEscapedText(temp).c_str()); |
| } |
| } |
| |
| #ifdef CLD2_DYNAMIC_MODE |
| // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file |
| // hasn't been loaded yet. This is the only sane thing we can do, as there |
| // are no scoring tables to consult. |
| bool dataLoaded = isDataLoaded(); |
| if ((flags & kCLDFlagVerbose) != 0) { |
| fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false")); |
| } |
| if (!dataLoaded) { |
| return UNKNOWN_LANGUAGE; |
| } |
| #endif |
| |
| // Exit now if no text |
| if (buffer_length == 0) {return UNKNOWN_LANGUAGE;} |
| if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;} |
| |
| // Document totals |
| DocTote doc_tote; // Reliability = 0..100 |
| |
| // ScoringContext carries state across scriptspans |
| ScoringContext scoringcontext; |
| scoringcontext.debug_file = stderr; |
| scoringcontext.flags_cld2_score_as_quads = |
| ((flags & kCLDFlagScoreAsQuads) != 0); |
| scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0); |
| scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0); |
| scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0); |
| scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE; |
| scoringcontext.ulscript = ULScript_Common; |
| scoringcontext.scoringtables = &kScoringtables; |
| scoringcontext.scanner = NULL; |
| scoringcontext.init(); // Clear the internal memory arrays |
| |
| // Now thread safe. |
| bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0); |
| bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0); |
| |
| ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext); |
| |
| // Four individual script totals, Latin, Han, other2, other3 |
| int next_other_tote = 2; |
| int tote_num = 0; |
| |
| // Four totes for up to four different scripts pending at once |
| Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other |
| bool tote_seen[4] = {false, false, false, false}; |
| int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk |
| ULScript tote_script[4] = |
| {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common}; |
| |
| // Loop through text spans in a single script |
| ScriptScanner ss(buffer, buffer_length, is_plain_text); |
| LangSpan scriptspan; |
| |
| scoringcontext.scanner = &ss; |
| |
| scriptspan.text = NULL; |
| scriptspan.text_bytes = 0; |
| scriptspan.offset = 0; |
| scriptspan.ulscript = ULScript_Common; |
| scriptspan.lang = UNKNOWN_LANGUAGE; |
| |
| int total_text_bytes = 0; |
| int textlimit = FLAGS_cld_textlimit << 10; // in KB |
| if (textlimit == 0) {textlimit = 0x7fffffff;} |
| |
| int advance_by = 2; // Advance 2 bytes |
| int advance_limit = textlimit >> 3; // For first 1/8 of max document |
| |
| int initial_word_span = kDefaultWordSpan; |
| if (FLAGS_cld_forcewords) { |
| initial_word_span = kReallyBigWordSpan; |
| } |
| |
| // Pick up chunk sizes |
| // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each |
| // Sanity check -- force into a reasonable range |
| int chunksizequads = FLAGS_cld_smoothwidth; |
| chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads), |
| kMaxChunkSizeQuads); |
| int chunksizeunis = (chunksizequads * 5) >> 1; |
| |
| // Varying short-span limit doesn't work well -- skips too much beyond 20KB |
| // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth; |
| int spantooshortlimit = kShortSpanThresh; |
| |
| // For debugging only. Not thread-safe |
| prior_lang = UNKNOWN_LANGUAGE; |
| prior_unreliable = false; |
| |
| // Allocate full-document prediction table for finding repeating words |
| int hash = 0; |
| int* predict_tbl = new int[kPredictionTableSize]; |
| if (FlagRepeats(flags)) { |
| memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); |
| } |
| |
| |
| |
| // Loop through scriptspans accumulating number of text bytes in each language |
| while (ss.GetOneScriptSpanLower(&scriptspan)) { |
| ULScript ulscript = scriptspan.ulscript; |
| |
| // Squeeze out big chunks of text span if asked to |
| if (FlagSqueeze(flags)) { |
| // Remove repetitive or mostly-spaces chunks |
| int newlen; |
| int chunksize = 0; // Use the default |
| if (resultchunkvector != NULL) { |
| newlen = CheapSqueezeInplaceOverwrite(scriptspan.text, |
| scriptspan.text_bytes, |
| chunksize); |
| } else { |
| newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes, |
| chunksize); |
| } |
| scriptspan.text_bytes = newlen; |
| } else { |
| // Check now and then to see if we should be squeezing |
| if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) && |
| !FlagFinish(flags)) { |
| // fprintf(stderr, "CheapSqueezeTriggerTest, " |
| // "first %d bytes of %d (>%d/2)<br>\n", |
| // kCheapSqueezeTestLen, |
| // scriptspan.text_bytes, |
| // kCheapSqueezeTestThresh); |
| |
| if (CheapSqueezeTriggerTest(scriptspan.text, |
| scriptspan.text_bytes, |
| kCheapSqueezeTestLen)) { |
| // Recursive call with big-chunk squeezing set |
| if (FLAGS_cld2_html || FLAGS_dbgscore) { |
| fprintf(stderr, |
| "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n", |
| total_text_bytes); |
| } |
| // Deallocate full-document prediction table |
| delete[] predict_tbl; |
| |
| return DetectLanguageSummaryV2( |
| buffer, |
| buffer_length, |
| is_plain_text, |
| cld_hints, |
| allow_extended_lang, |
| flags | kCLDFlagSqueeze, |
| plus_one, |
| language3, |
| percent3, |
| normalized_score3, |
| resultchunkvector, |
| text_bytes, |
| is_reliable); |
| } |
| } |
| } |
| |
| // Remove repetitive words if asked to |
| if (FlagRepeats(flags)) { |
| // Remove repetitive words |
| int newlen; |
| if (resultchunkvector != NULL) { |
| newlen = CheapRepWordsInplaceOverwrite(scriptspan.text, |
| scriptspan.text_bytes, |
| &hash, predict_tbl); |
| } else { |
| newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes, |
| &hash, predict_tbl); |
| } |
| scriptspan.text_bytes = newlen; |
| } |
| |
| // Scoring depends on scriptspan buffer ALWAYS having |
| // leading space and off-the-end space space space NUL, |
| // DCHECK(scriptspan.text[0] == ' '); |
| // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' '); |
| // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' '); |
| // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' '); |
| // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0'); |
| |
| // The real scoring |
| // Accumulate directly into the document total, or accmulate in one of four |
| // chunk totals. The purpose of the multiple chunk totals is to piece |
| // together short choppy pieces of text in alternating scripts. One total is |
| // dedicated to Latin text, one to Han text, and the other two are dynamicly |
| // assigned. |
| |
| scoringcontext.ulscript = scriptspan.ulscript; |
| // FLAGS_cld2_html = scoringcontext.flags_cld2_html; |
| |
| ScoreOneScriptSpan(scriptspan, |
| &scoringcontext, |
| &doc_tote, |
| resultchunkvector); |
| |
| total_text_bytes += scriptspan.text_bytes; |
| } // End while (ss.GetOneScriptSpanLower()) |
| |
| // Deallocate full-document prediction table |
| delete[] predict_tbl; |
| |
| if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { |
| // If no forced <cr>, put one in front of dump |
| if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");} |
| doc_tote.Dump(stderr); |
| } |
| |
| |
| // If extended langauges are disallowed, remove them here |
| if (!allow_extended_lang) { |
| RemoveExtendedLanguages(&doc_tote); |
| } |
| |
| // Force close pairs to one or the other |
| // If given, also update resultchunkvector |
| RefineScoredClosePairs(&doc_tote, resultchunkvector, |
| FLAGS_cld2_html, FLAGS_cld2_quiet); |
| |
| |
| // Calculate return results |
| // Find top three byte counts in tote heap |
| int reliable_percent3[3]; |
| |
| // Cannot use Add, etc. after sorting |
| doc_tote.Sort(3); |
| |
| ExtractLangEtc(&doc_tote, total_text_bytes, |
| reliable_percent3, language3, percent3, normalized_score3, |
| text_bytes, is_reliable); |
| |
| bool have_good_answer = false; |
| if (FlagFinish(flags)) { |
| // Force a result |
| have_good_answer = true; |
| } else if (total_text_bytes <= kShortTextThresh) { |
| // Don't recurse on short text -- we already did word scores |
| have_good_answer = true; |
| } else if (*is_reliable && |
| (percent3[0] >= kGoodLang1Percent)) { |
| have_good_answer = true; |
| } else if (*is_reliable && |
| ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) { |
| have_good_answer = true; |
| } |
| |
| |
| if (have_good_answer) { |
| // This is the real, non-recursive return |
| |
| // Move bytes for unreliable langs to another lang or UNKNOWN |
| RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet); |
| |
| // Redo the result extraction after the removal above |
| doc_tote.Sort(3); |
| ExtractLangEtc(&doc_tote, total_text_bytes, |
| reliable_percent3, language3, percent3, normalized_score3, |
| text_bytes, is_reliable); |
| |
| |
| |
| Language summary_lang; |
| CalcSummaryLang(&doc_tote, total_text_bytes, |
| reliable_percent3, language3, percent3, |
| &summary_lang, is_reliable, |
| FLAGS_cld2_html, FLAGS_cld2_quiet); |
| |
| if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { |
| for (int i = 0; i < 3; ++i) { |
| if (language3[i] != UNKNOWN_LANGUAGE) { |
| fprintf(stderr, "%s.%dR(%d%%) ", |
| LanguageCode(language3[i]), |
| reliable_percent3[i], |
| percent3[i]); |
| } |
| } |
| |
| fprintf(stderr, "%d bytes ", total_text_bytes); |
| fprintf(stderr, "= %s%c ", |
| LanguageName(summary_lang), *is_reliable ? ' ' : '*'); |
| fprintf(stderr, "<br><br>\n"); |
| } |
| |
| // Slightly condensed if quiet |
| if (FLAGS_cld2_html && FLAGS_cld2_quiet) { |
| fprintf(stderr, " "); |
| for (int i = 0; i < 3; ++i) { |
| if (language3[i] != UNKNOWN_LANGUAGE) { |
| fprintf(stderr, " %s %d%% ", |
| LanguageCode(language3[i]), |
| percent3[i]); |
| } |
| } |
| fprintf(stderr, "= %s%c ", |
| LanguageName(summary_lang), *is_reliable ? ' ' : '*'); |
| fprintf(stderr, "<br>\n"); |
| } |
| |
| return summary_lang; |
| } |
| |
| // Not a good answer -- do recursive call to refine |
| if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) { |
| // This is what we hope to improve on in the recursive call, if any |
| PrintLangs(stderr, language3, percent3, text_bytes, is_reliable); |
| } |
| |
| // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40 |
| // For this purpose, we treate "Ignore" as top40 |
| Language new_plus_one = UNKNOWN_LANGUAGE; |
| |
| if (total_text_bytes < kShortTextThresh) { |
| // Short text: Recursive call with top40 and short set |
| if (FLAGS_cld2_html || FLAGS_dbgscore) { |
| fprintf(stderr, " ---text_bytes[%d] " |
| "Recursive(Top40/Rep/Short/Words)---<br><br>\n", |
| total_text_bytes); |
| } |
| return DetectLanguageSummaryV2( |
| buffer, |
| buffer_length, |
| is_plain_text, |
| cld_hints, |
| allow_extended_lang, |
| flags | kCLDFlagTop40 | kCLDFlagRepeats | |
| kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish, |
| new_plus_one, |
| language3, |
| percent3, |
| normalized_score3, |
| resultchunkvector, |
| text_bytes, |
| is_reliable); |
| } |
| |
| // Longer text: Recursive call with top40 set |
| if (FLAGS_cld2_html || FLAGS_dbgscore) { |
| fprintf(stderr, |
| " ---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n", |
| total_text_bytes); |
| } |
| return DetectLanguageSummaryV2( |
| buffer, |
| buffer_length, |
| is_plain_text, |
| cld_hints, |
| allow_extended_lang, |
| flags | kCLDFlagTop40 | kCLDFlagRepeats | |
| kCLDFlagFinish, |
| new_plus_one, |
| language3, |
| percent3, |
| normalized_score3, |
| resultchunkvector, |
| text_bytes, |
| is_reliable); |
| } |
| |
| |
| // For debugging and wrappers. Not thread safe. |
| static char temp_detectlanguageversion[32]; |
| |
| // Return version text string |
| // String is "code_version - data_build_date" |
| const char* DetectLanguageVersion() { |
| if (kScoringtables.quadgram_obj == NULL) {return "";} |
| sprintf(temp_detectlanguageversion, |
| "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate); |
| return temp_detectlanguageversion; |
| } |
| |
| |
| } // End namespace CLD2 |