| // Copyright 2013 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // |
| // Little program to read sample UTF-8 text and score it |
| // Giving precision, recall, F, and matrix |
| // |
| // Author: dsites@google.com (Dick Sites) |
| // |
| |
| #include <math.h> // for sqrt |
| #include <stdio.h> |
| #include <string.h> |
| #include <string> |
| |
| #include "debug.h" // for uint8 etc |
| #include "integral_types.h" // for uint8 etc |
| #include "compact_lang_det_impl.h" |
| #include "lang_script.h" |
| |
| using namespace std; |
| |
| namespace CLD2 { |
| |
| |
| // Scaffolding |
| typedef int32 Encoding; |
| static const Encoding UNKNOWN_ENCODING = 0; |
| |
| static const bool FLAGS_cld2_html = true; |
| static const bool FLAGS_noext = false; |
| static const bool FLAGS_echo_mismatch = true; |
| static const int32 FLAGS_minsize = 0; |
| |
| |
| /*** |
| accepts one or more input files |
| loop: |
| reads source line |
| does cld on each source line |
| records source lang-script, CLD lang-script, count+=1 |
| |
| at end, print row headers = CLD lang, script, per_M |
| print column headers = in lang, script, per_M |
| print matrix, recall, precision, F |
| and overall RMS F. |
| |
| sort by script, and within script, by per_M and near diagonal |
| ***/ |
| |
| |
| #define LF 0x0a |
| #define CR 0x0d |
| const int kMaxBuffer = 5 * 1024; |
| |
| bool ReadLine(FILE* infile, char* buffer, size_t maxlen) { |
| char* p = fgets(buffer, maxlen, infile); |
| if (p == NULL) { |
| return false; |
| } |
| int len = strlen(buffer); |
| |
| // trim CR LF |
| if (buffer[len-1] == LF) {buffer[--len] = '\0';} |
| if (buffer[len-1] == CR) {buffer[--len] = '\0';} |
| return true; |
| } |
| |
| bool IsComment(char* buffer) { |
| int len = strlen(buffer); |
| if (len == 0) {return true;} |
| if (buffer[0] == '#') {return true;} |
| if (buffer[0] == ' ') {return true;} // Any leading space is comment |
| if ((len >= 5) && (memcmp(buffer, "BOGUS", 5) == 0)) {return true;} |
| return false; |
| } |
| |
| |
| // Skips over xxxxx_ where _ is one or more spaces/tabs |
| // Returns string::npos if no more fields |
| int SkipOneField(const string& src, int pos) { |
| if (pos == string::npos) {return pos;} |
| |
| int lpos = pos; |
| lpos = src.find_first_of(" \t", lpos); |
| if (lpos == string::npos) {return lpos;} |
| lpos = src.find_first_not_of(" \t", lpos); |
| if (lpos == string::npos) {return lpos;} |
| return lpos; |
| } |
| |
| // Return language and script from parsed line |
| void GetStatedLangScript(const string& src, string* lang_script, string* tld) { |
| *lang_script = ""; |
| *tld = ""; |
| int pos = 0; |
| int pos2 = 0; |
| if (src.substr(0,7) == "SAMPLE ") { |
| // SAMPLE ll-Ssss |
| pos = SkipOneField(src, pos); |
| } else if (src.substr(0,5) == "SAMP ") { |
| // SAMP ll-Ssss /tld2.tld/ |
| pos = SkipOneField(src, pos); |
| pos2 = SkipOneField(src, pos); |
| } else if (src.substr(0,5) == "Samp ") { |
| // Samp ll-Ssss /tld2.tld/ |
| pos = SkipOneField(src, pos); |
| pos2 = SkipOneField(src, pos); |
| } |
| if (pos == 0) {return;} |
| if (pos == string::npos) {return;} |
| |
| // Pos is at the first letter of language-script combination |
| int end = src.find_first_of(" \t", pos); // find end of lang-script |
| if (end == string::npos) {return;} |
| *lang_script = src.substr(pos, end - pos); |
| |
| // Pos2 is 0 or at the first letter of the tld string |
| if (pos2 == 0) {return;} |
| if (pos2 == string::npos) {return;} |
| end = src.find_first_of(" \t", pos2); |
| if (end == string::npos) {return;} |
| *tld = src.substr(pos2, end - pos2); |
| } |
| |
| // Return position of start of text |
| int GetTextBeginPos(const string& src) { |
| int pos = 0; |
| if (src.size() < 8) {return pos;} |
| |
| if (src.substr(0,7) == "SAMPLE ") { |
| // Skip SAMPLE ll-Ssss |
| pos = SkipOneField(src, pos); |
| pos = SkipOneField(src, pos); |
| } else if (src.substr(0,5) == "SAMP ") { |
| // Skip SAMP ll-Ssss /tld2.tld/ |
| pos = SkipOneField(src, pos); |
| pos = SkipOneField(src, pos); |
| pos = SkipOneField(src, pos); |
| } else if (src.substr(0,5) == "Samp ") { |
| // Skip Samp ll-Ssss /tld2.tld/ |
| pos = SkipOneField(src, pos); |
| pos = SkipOneField(src, pos); |
| pos = SkipOneField(src, pos); |
| } |
| return pos; |
| } |
| |
| |
| bool CarefulMatch(const char* in_langscript, |
| Language in_lang, ULScript in_lscript, |
| Language cld_lang, ULScript cld_lscript) { |
| bool easy_match = ((in_lang == cld_lang) & (in_lscript == cld_lscript)); |
| if (easy_match) {return true;} |
| |
| // Unrecognized list, matching un-Xxxx |
| if ((cld_lang == UNKNOWN_LANGUAGE) && (in_lscript == cld_lscript)) { |
| if (strcmp(in_langscript, "az-Arab") == 0) {return true;} |
| if (strcmp(in_langscript, "az-Cyrl") == 0) {return true;} |
| if (strcmp(in_langscript, "kk-Latn") == 0) {return true;} |
| if (strcmp(in_langscript, "ku-Latn") == 0) {return true;} |
| if (strcmp(in_langscript, "my-Latn") == 0) {return true;} |
| if (strcmp(in_langscript, "ru-Latn") == 0) {return true;} |
| if (strcmp(in_langscript, "tg-Arab") == 0) {return true;} |
| if (strcmp(in_langscript, "ug-Latn") == 0) {return true;} |
| if (strcmp(in_langscript, "za-Hani") == 0) {return true;} |
| } |
| |
| // bs/me => sr/hr |
| if ((cld_lang == CROATIAN) && (cld_lscript == ULScript_Latin)) { |
| if (strcmp(in_langscript, "bs-Latn") == 0) {return true;} |
| if (strcmp(in_langscript, "sr-ME-Latn") == 0) {return true;} |
| } |
| if ((cld_lang == SERBIAN) && (cld_lscript == ULScript_Cyrillic)) { |
| if (strcmp(in_langscript, "bs-Cyrl") == 0) {return true;} |
| if (strcmp(in_langscript, "sr-ME-Cyrl") == 0) {return true;} |
| } |
| |
| // Twi => Akan |
| if ((cld_lang == AKAN) && (cld_lscript == ULScript_Latin)) { |
| if (strcmp(in_langscript, "tw-Latn") == 0) {return true;} |
| } |
| |
| // za-Hani |
| if ((cld_lang == CHINESE) && (cld_lscript == ULScript_Hani)) { |
| if (strcmp(in_langscript, "za-Hani") == 0) {return true;} |
| } |
| |
| // zzb, zze, zzh fake languages |
| if (strcmp(in_langscript, "zzb-Latn") == 0) {return true;} |
| if (strcmp(in_langscript, "zze-Latn") == 0) {return true;} |
| if (strcmp(in_langscript, "zzh-Latn") == 0) {return true;} |
| |
| |
| return false; |
| } |
| |
| |
| #if 0 |
| typedef hash_map<string, int32> StringIntMap; |
| |
| static int32 next_in_map; |
| static int32 next_cld_map; |
| static int32 next_in_cld_map; |
| static StringIntMap in_map; // xx-Fooo to small int |
| static StringIntMap cld_map; // xx-Fooo to small int |
| static StringIntMap in_cld_map; // xx-Fooo_xx-Barr to small int |
| |
| static vector<int32> in_count; // counts by in_map subscript |
| static vector<int32> cld_count; // counts by cld_map subscript |
| static vector<int32> in_cld_count; // counts by in_cld_map subscript |
| |
| int32 MapToSmallInt(const string& s, StringIntMap* smap, int* next_smap) { |
| StringIntMap::iterator it = smap->find(s); |
| if (it == smap->end()) { |
| // New |
| (*smap)[s] = *next_smap; |
| *next_smap += 1; |
| } |
| return (*smap)[s]; |
| } |
| #endif |
| |
| |
| void InitResult() { |
| #if 0 |
| in_map.clear(); |
| cld_map.clear(); |
| in_cld_map.clear(); |
| next_in_map = 0; |
| next_cld_map = 0; |
| next_in_cld_map = 0; |
| in_count.clear(); |
| cld_count.clear(); |
| in_cld_count.clear(); |
| #endif |
| } |
| |
| void RecordCLDResult(const char* buffer, const char* in_langscript, |
| Language in_lang, ULScript in_lscript, |
| Language cld_lang, ULScript cld_lscript) { |
| |
| bool match = CarefulMatch(in_langscript, |
| in_lang, in_lscript, cld_lang, cld_lscript); |
| if (FLAGS_echo_mismatch && !match) { |
| fprintf(stderr, |
| " =Mismatch: " |
| "expected %s, actual %s<br>\n", |
| LanguageCode(in_lang), LanguageCode(cld_lang)); |
| } |
| #if 0 |
| printf("%s %s-%s %s\n", in_langscript, |
| ExtLanguageCode(cld_lang), UnicodeLScriptCode(cld_lscript), |
| match ? "" : "!="); |
| |
| string cld_langscript = ExtLanguageCode(cld_lang); |
| cld_langscript.append("-"); |
| cld_langscript.append( UnicodeLScriptCode(cld_lscript)); |
| |
| string in_cld_langscript = in_langscript; |
| in_cld_langscript.append("_"); |
| in_cld_langscript.append(cld_langscript); |
| |
| // Extend vectors if needed |
| int32 in_int = MapToSmallInt(in_langscript, &in_map, &next_in_map); |
| while (in_count.size() <= in_int) {in_count.push_back(0);} |
| in_count[in_int] += 1; |
| |
| int32 cld_int = MapToSmallInt(cld_langscript, &cld_map, &next_cld_map); |
| while (cld_count.size() <= cld_int) {cld_count.push_back(0);} |
| cld_count[cld_int] += 1; |
| |
| int32 in_cld_int = MapToSmallInt(in_cld_langscript, |
| &in_cld_map, &next_in_cld_map); |
| while (in_cld_count.size() <= in_cld_int) {in_cld_count.push_back(0);} |
| in_cld_count[in_cld_int] += 1; |
| #endif |
| } |
| |
| void FinishResult() { |
| #if 0 |
| int32 in_n = in_map.size(); |
| int32 cld_n = cld_map.size(); |
| |
| int32* in_total = new int32[in_n]; |
| memset(in_total, 0, in_n * sizeof(int32)); |
| int32* in_matches = new int32[in_n]; |
| memset(in_matches, 0, in_n * sizeof(int32)); |
| string* in_str = new string[in_n]; |
| for (StringIntMap::iterator it = in_map.begin(); it != in_map.end(); ++it) { |
| in_str[it->second] = it->first; |
| } |
| |
| int32* cld_total = new int32[cld_n]; |
| memset(cld_total, 0, cld_n * sizeof(int32)); |
| int32* cld_matches = new int32[cld_n]; |
| memset(cld_matches, 0, cld_n * sizeof(int32)); |
| string* cld_str = new string[cld_n]; |
| for (StringIntMap::iterator it = cld_map.begin(); it != cld_map.end(); ++it) { |
| cld_str[it->second] = it->first; |
| } |
| |
| for (StringIntMap::iterator it = in_cld_map.begin(); |
| it != in_cld_map.end(); ++it) { |
| string in_cld = it->first; |
| int32 in_cld_int = it->second; |
| //VERYTEMP |
| //printf("%s[%d] = %d\n", in_cld.c_str(), in_cld_int, in_cld_count[in_cld_int]); |
| |
| // Decompose it all |
| int under_pos = in_cld.find("_"); |
| string in_langscript = in_cld.substr(0, under_pos); |
| string cld_langscript = in_cld.substr(under_pos + 1); |
| int32 in_int = MapToSmallInt(in_langscript, &in_map, &next_in_map); |
| int32 cld_int = MapToSmallInt(cld_langscript, &cld_map, &next_cld_map); |
| |
| Language in_lang = GetLanguageFromNumberOrName(in_langscript.c_str()); |
| ULScript in_lscript = GetLScriptFromNumberOrName(in_langscript.c_str()); |
| Language cld_lang = GetLanguageFromNumberOrName(cld_langscript.c_str()); |
| ULScript cld_lscript = GetLScriptFromNumberOrName(cld_langscript.c_str()); |
| |
| bool match = CarefulMatch(in_langscript.c_str(), |
| in_lang, in_lscript, cld_lang, cld_lscript); |
| |
| //VERYTEMP |
| //printf("%s-%s %s-%s #=%d %d %d %s\n", |
| // ExtLanguageCode(in_lang), UnicodeLScriptCode(in_lscript), |
| // ExtLanguageCode(cld_lang), UnicodeLScriptCode(cld_lscript), |
| // in_cld_count[in_cld_int], in_int, cld_int, match ? "match" : "!="); |
| |
| in_total[in_int] += in_cld_count[in_cld_int]; |
| cld_total[cld_int] += in_cld_count[in_cld_int]; |
| if (match) { |
| in_matches[in_int] += in_cld_count[in_cld_int]; |
| cld_matches[cld_int] += in_cld_count[in_cld_int]; |
| } |
| } |
| |
| int32 total = 0; |
| int32 match_total = 0; |
| for (int i = 0; i < cld_n; ++i) { |
| printf("Precision: %s %d/%d = %6.4f\n", |
| cld_str[i].c_str(), cld_matches[i], cld_total[i], |
| cld_total[i] == 0 ? 0.0 : (cld_matches[i] * 1.0) / cld_total[i]); |
| total += cld_total[i]; |
| match_total += cld_matches[i]; |
| } |
| printf("Precision: %s %d/%d = %6.4f\n", |
| "TOTAL", match_total, total, |
| total == 0 ? 0.0 : (match_total * 1.0) / total); |
| |
| total = 0; |
| match_total = 0; |
| for (int i = 0; i < in_n; ++i) { |
| printf("Recall: %s %d/%d = %6.4f\n", |
| in_str[i].c_str(), in_matches[i], in_total[i], |
| in_total[i] == 0 ? 0.0 : (in_matches[i] * 1.0) / in_total[i]); |
| total += in_total[i]; |
| match_total += in_matches[i]; |
| } |
| printf("Recall: %s %d/%d = %6.4f\n", |
| "TOTAL", match_total, total, |
| total == 0 ? 0.0 : (match_total * 1.0) / total); |
| |
| #endif |
| } |
| |
| bool SkipMe(char c) { |
| if (static_cast<uint8>(c) <= '9') {return true;} |
| return false; |
| } |
| |
| // Remove any trailing digits/spaces (possible mapreduce counts) |
| // Return length |
| int Trim(char* buffer) { |
| int buffer_len = strlen(buffer); |
| while (SkipMe(buffer[buffer_len - 1])) {--buffer_len;} |
| buffer[buffer_len] = '\0'; |
| return buffer_len; |
| } |
| |
| void LangDetLinesOfFile(int flags, bool get_vector, const char* fname) { |
| FILE* fin = fopen(fname, "rb"); |
| if (fin == NULL) { |
| fprintf(stderr, "Did not open %s\n", fname); |
| return; |
| } |
| |
| // Expecting |
| // Samp af-Latn /afr/ word tot skuldig bevind volgens die wet, in... |
| char buffer[kMaxBuffer]; |
| while (ReadLine(fin, buffer, kMaxBuffer)) { |
| if (IsComment(buffer)) {continue;} |
| |
| int buffer_len = Trim(buffer); |
| |
| string buffer_str(buffer, buffer_len); |
| string lang_script; |
| string tld; |
| |
| // Get lang-script |
| GetStatedLangScript(buffer_str, &lang_script, &tld); |
| Language in_lang = GetLanguageFromName(lang_script.c_str()); |
| ULScript in_lscript = GetULScriptFromName(lang_script.c_str()); |
| |
| // Get Text; skip over any prefix fields |
| int pos = GetTextBeginPos(buffer_str); |
| if (pos == string::npos) {continue;} |
| |
| const char* src = buffer_str.data() + pos; |
| int src_len = buffer_str.size() - pos; |
| |
| if (src_len < FLAGS_minsize) {continue;} // Skip if too short |
| |
| // Detect language in one line of UTF-8 |
| bool is_plain_text = false; |
| const char* tldhint = ""; |
| Encoding enchint = UNKNOWN_ENCODING; |
| Language langhint = UNKNOWN_LANGUAGE; |
| // Full-blown flag-bit and hints interface |
| bool allow_extended_lang = true; |
| // Caller initializes flags |
| Language plus_one = UNKNOWN_LANGUAGE; |
| |
| Language language3[3]; |
| int percent3[3]; |
| double normalized_score3[3]; |
| ResultChunkVector resultchunkvector; |
| int text_bytes; |
| bool is_reliable; |
| |
| // Detected language biased summary (biased against English) |
| Language summary_lang = UNKNOWN_LANGUAGE; |
| |
| // Identify the expected value |
| fprintf(stderr, "Samp %s ", lang_script.c_str()); |
| flags |= kCLDFlagQuiet; |
| |
| CLDHints cldhints = {NULL, tldhint, enchint, langhint}; |
| |
| summary_lang = DetectLanguageSummaryV2( |
| src, src_len, |
| is_plain_text, |
| &cldhints, |
| allow_extended_lang, |
| flags, |
| plus_one, |
| language3, |
| percent3, |
| normalized_score3, |
| get_vector ? &resultchunkvector : NULL, |
| &text_bytes, |
| &is_reliable); |
| |
| #if 0 |
| if (FLAGS_noext) { |
| summary_lang = DetectLanguageSummary( |
| src, src_len, |
| is_plain_text, |
| language3, |
| percent3, |
| &text_bytes, |
| &is_reliable); |
| } else { |
| summary_lang = ExtDetectLanguageSummary( |
| src, src_len, |
| is_plain_text, |
| language3, |
| percent3, |
| &text_bytes, |
| &is_reliable); |
| } |
| #endif |
| if (get_vector) { |
| DumpResultChunkVector(stderr, src, &resultchunkvector); |
| } |
| |
| if (!is_reliable) {summary_lang = UNKNOWN_LANGUAGE;} |
| |
| RecordCLDResult(buffer, lang_script.c_str(), |
| in_lang, in_lscript, |
| summary_lang, in_lscript); |
| } |
| |
| fclose(fin); |
| } |
| |
| |
| |
| |
| int main (int argc, char *argv[]) |
| { |
| int flags = 0; |
| bool get_vector = false; |
| |
| for (int i = 1; i < argc; ++i) { |
| if (strcmp(argv[i], "--scoreasquads") == 0) {flags |= kCLDFlagScoreAsQuads;} |
| if (strcmp(argv[i], "--html") == 0) {flags |= kCLDFlagHtml;} |
| if (strcmp(argv[i], "--cr") == 0) {flags |= kCLDFlagCr;} |
| if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;} |
| if (strcmp(argv[i], "--vector") == 0) {get_vector = true;} |
| } |
| |
| if (FLAGS_cld2_html) { |
| // Begin HTML file |
| fprintf(stderr, "<html><meta charset=\"UTF-8\"><body>\n"); |
| fprintf(stderr, "<style media=\"print\" type=\"text/css\"> " |
| ":root { -webkit-print-color-adjust: exact; } </style>\n"); |
| fprintf(stderr, "<span style=\"font-size: 7pt\">\n"); |
| } |
| |
| |
| InitResult(); |
| for (int i = 1; i < argc; ++i) { |
| if (argv[i][0] != '-') { |
| const char* fname = argv[i]; |
| fprintf(stderr, "file = %s<br><br>\n", fname ? fname : "stdin"); |
| LangDetLinesOfFile(flags, get_vector, fname); |
| } |
| } |
| FinishResult(); |
| |
| if (FLAGS_cld2_html) { |
| fprintf(stderr, "\n</span></body></html><br>"); |
| } |
| |
| return 0; |
| } |
| |
| } // End namespace CLD2 |
| |
| |
| int main(int argc, char *argv[]) { |
| return CLD2::main(argc, argv); |
| } |
| |