blob: 16fe6d8ad53999671df620970e577e6d071d0998 [file] [log] [blame]
// Copyright 2013 Google Inc. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
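// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.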
// limitations under the License.
// Little program to read sample UTF-8 text and score it
// Giving precision, recall, F, and matrix
// Author: (Dick Sites)
#include <math.h> // for sqrt
#include <stdio.h>
#include <string.h>
#include <string>
#include "debug.h" // for uint8 etc
#include "integral_types.h" // for uint8 etc
#include "compact_lang_det_impl.h"
#include "lang_script.h"
using namespace std;
namespace CLD2 {
// Scaffolding: fixed settings used by the code in this file.
// Type of the encoding hint handed to the detector.
typedef int32 Encoding;
// Encoding hint value used for every line in LangDetLinesOfFile.
static const Encoding UNKNOWN_ENCODING = 0;
// When true, main() wraps the stderr output in an HTML header and footer.
static const bool FLAGS_cld2_html = true;
// Only referenced inside an #if 0 section of LangDetLinesOfFile.
static const bool FLAGS_noext = false;
// When true, RecordCLDResult reports samples whose detection did not match.
static const bool FLAGS_echo_mismatch = true;
// Lines whose text is shorter than this many bytes are skipped.
static const int32 FLAGS_minsize = 0;
// Plan for this program:
//   accepts one or more input files
//   reads each source line
//   does CLD on each source line
//   records source lang-script, CLD lang-script, count += 1
//   at end, prints row headers = CLD lang, script, per_M
//   prints column headers = in lang, script, per_M
//   prints matrix, recall, precision, F
//   and overall RMS F.
//   sorts by script, and within script, by per_M and near diagonal
// Byte values of the line-feed and carriage-return characters.
#define LF 0x0a
#define CR 0x0d
// Size in bytes of the line buffer used when reading input files.
const int kMaxBuffer = 5 * 1024;
// Reads the next line of infile into buffer and strips one trailing LF and
// then one trailing CR, so both "\n" and "\r\n" line endings are removed.
//   infile  open input stream
//   buffer  destination; receives a NUL-terminated string
//   maxlen  capacity of buffer in bytes, including the terminating NUL
// Returns false when fgets reports end of file or an error, true otherwise.
// A line longer than maxlen - 1 bytes is delivered in pieces by successive
// calls, as fgets does.
bool ReadLine(FILE* infile, char* buffer, size_t maxlen) {
  if (fgets(buffer, static_cast<int>(maxlen), infile) == NULL) {
    return false;
  }
  size_t len = strlen(buffer);
  // Trim CR LF. Each test is guarded by len > 0: a line holding only "\n"
  // (or one that starts with a NUL byte) leaves len at 0, and buffer[len - 1]
  // would then read before the start of the buffer.
  if ((len > 0) && (buffer[len - 1] == '\n')) {buffer[--len] = '\0';}
  if ((len > 0) && (buffer[len - 1] == '\r')) {buffer[--len] = '\0';}
  return true;
}
// Returns true if the line in buffer should be ignored: it is empty, starts
// with '#', starts with a space (any leading space marks a comment), or
// starts with the five characters "BOGUS".
//   buffer  NUL-terminated line; only read, never modified
bool IsComment(const char* buffer) {
  const char first = buffer[0];
  if (first == '\0') {return true;}
  if ((first == '#') || (first == ' ')) {return true;}
  // strncmp stops at the terminating NUL, so lines shorter than five
  // characters simply fail to match without a separate length check.
  return strncmp(buffer, "BOGUS", 5) == 0;
}
// Skips over xxxxx_ where _ is one or more spaces/tabs.
//   src  the line being parsed
//   pos  offset inside the current field; a negative value means "no field"
// Returns the offset of the first character of the next field, or -1 if
// there are no more fields. -1 compares equal to string::npos after the
// usual integer conversion, so callers testing `== string::npos` still work.
int SkipOneField(const std::string& src, int pos) {
  const int kNoMoreFields = -1;
  if (pos < 0) {return kNoMoreFields;}

  // Work in the string's own size_type so npos is compared without narrowing.
  std::string::size_type cursor =
      src.find_first_of(" \t", static_cast<std::string::size_type>(pos));
  if (cursor == std::string::npos) {return kNoMoreFields;}   // No separator

  cursor = src.find_first_not_of(" \t", cursor);
  if (cursor == std::string::npos) {return kNoMoreFields;}   // Only trailing blanks

  return static_cast<int>(cursor);
}
// Return language and script from parsed line
void GetStatedLangScript(const string& src, string* lang_script, string* tld) {
*lang_script = "";
*tld = "";
int pos = 0;
int pos2 = 0;
if (src.substr(0,7) == "SAMPLE ") {
// SAMPLE ll-Ssss
pos = SkipOneField(src, pos);
} else if (src.substr(0,5) == "SAMP ") {
// SAMP ll-Ssss /tld2.tld/
pos = SkipOneField(src, pos);
pos2 = SkipOneField(src, pos);
} else if (src.substr(0,5) == "Samp ") {
// Samp ll-Ssss /tld2.tld/
pos = SkipOneField(src, pos);
pos2 = SkipOneField(src, pos);
if (pos == 0) {return;}
if (pos == string::npos) {return;}
// Pos is at the first letter of language-script combination
int end = src.find_first_of(" \t", pos); // find end of lang-script
if (end == string::npos) {return;}
*lang_script = src.substr(pos, end - pos);
// Pos2 is 0 or at the first letter of the tld string
if (pos2 == 0) {return;}
if (pos2 == string::npos) {return;}
end = src.find_first_of(" \t", pos2);
if (end == string::npos) {return;}
*tld = src.substr(pos2, end - pos2);
// Return position of start of text
int GetTextBeginPos(const string& src) {
int pos = 0;
if (src.size() < 8) {return pos;}
if (src.substr(0,7) == "SAMPLE ") {
// Skip SAMPLE ll-Ssss
pos = SkipOneField(src, pos);
pos = SkipOneField(src, pos);
} else if (src.substr(0,5) == "SAMP ") {
// Skip SAMP ll-Ssss /tld2.tld/
pos = SkipOneField(src, pos);
pos = SkipOneField(src, pos);
pos = SkipOneField(src, pos);
} else if (src.substr(0,5) == "Samp ") {
// Skip Samp ll-Ssss /tld2.tld/
pos = SkipOneField(src, pos);
pos = SkipOneField(src, pos);
pos = SkipOneField(src, pos);
return pos;
// Returns true if name is exactly one of the first count strings in list.
static bool IsOneOf(const char* name, const char* const* list, int count) {
  for (int i = 0; i < count; ++i) {
    if (strcmp(name, list[i]) == 0) {return true;}
  }
  return false;
}

// Decides whether CLD's answer counts as correct for a sample.
//   in_langscript          stated lang-script string of the sample
//   in_lang, in_lscript    stated language and script
//   cld_lang, cld_lscript  language and script reported by CLD
// Returns true on an exact language+script match, or when the stated
// lang-script is on one of the lists of accepted substitutions below.
bool CarefulMatch(const char* in_langscript,
                  Language in_lang, ULScript in_lscript,
                  Language cld_lang, ULScript cld_lscript) {
  // Stated lang-scripts for which "unknown language, same script" is accepted
  static const char* const kUnrecognized[] = {
    "az-Arab", "az-Cyrl", "kk-Latn", "ku-Latn", "my-Latn",
    "ru-Latn", "tg-Arab", "ug-Latn", "za-Hani",
  };
  // bs/me => sr/hr
  static const char* const kAsCroatianLatin[] = {"bs-Latn", "sr-ME-Latn"};
  static const char* const kAsSerbianCyrillic[] = {"bs-Cyrl", "sr-ME-Cyrl"};
  // Twi => Akan
  static const char* const kAsAkanLatin[] = {"tw-Latn"};
  // za-Hani
  static const char* const kAsChineseHani[] = {"za-Hani"};
  // zzb, zze, zzh fake languages: accepted whatever CLD reports
  static const char* const kFakeLanguages[] = {
    "zzb-Latn", "zze-Latn", "zzh-Latn",
  };

  if ((in_lang == cld_lang) && (in_lscript == cld_lscript)) {return true;}

  if ((cld_lang == UNKNOWN_LANGUAGE) && (in_lscript == cld_lscript) &&
      IsOneOf(in_langscript, kUnrecognized,
              static_cast<int>(sizeof(kUnrecognized) /
                               sizeof(kUnrecognized[0])))) {
    return true;
  }

  if ((cld_lang == CROATIAN) && (cld_lscript == ULScript_Latin) &&
      IsOneOf(in_langscript, kAsCroatianLatin,
              static_cast<int>(sizeof(kAsCroatianLatin) /
                               sizeof(kAsCroatianLatin[0])))) {
    return true;
  }

  if ((cld_lang == SERBIAN) && (cld_lscript == ULScript_Cyrillic) &&
      IsOneOf(in_langscript, kAsSerbianCyrillic,
              static_cast<int>(sizeof(kAsSerbianCyrillic) /
                               sizeof(kAsSerbianCyrillic[0])))) {
    return true;
  }

  if ((cld_lang == AKAN) && (cld_lscript == ULScript_Latin) &&
      IsOneOf(in_langscript, kAsAkanLatin,
              static_cast<int>(sizeof(kAsAkanLatin) /
                               sizeof(kAsAkanLatin[0])))) {
    return true;
  }

  if ((cld_lang == CHINESE) && (cld_lscript == ULScript_Hani) &&
      IsOneOf(in_langscript, kAsChineseHani,
              static_cast<int>(sizeof(kAsChineseHani) /
                               sizeof(kAsChineseHani[0])))) {
    return true;
  }

  return IsOneOf(in_langscript, kFakeLanguages,
                 static_cast<int>(sizeof(kFakeLanguages) /
                                  sizeof(kFakeLanguages[0])));
}
// Disabled (#if 0) bookkeeping used by the tallying code in RecordCLDResult
// and FinishResult.
#if 0
// String key to small integer id.
typedef hash_map<string, int32> StringIntMap;
// Next id to hand out for each of the three maps below.
static int32 next_in_map;
static int32 next_cld_map;
static int32 next_in_cld_map;
static StringIntMap in_map; // xx-Fooo to small int
static StringIntMap cld_map; // xx-Fooo to small int
static StringIntMap in_cld_map; // xx-Fooo_xx-Barr to small int
static vector<int32> in_count; // counts by in_map subscript
static vector<int32> cld_count; // counts by cld_map subscript
static vector<int32> in_cld_count; // counts by in_cld_map subscript
// Returns the small integer id associated with s in *smap. A string seen for
// the first time is given the id *next_smap, which is then advanced by one.
int32 MapToSmallInt(const string& s, StringIntMap* smap, int* next_smap) {
  StringIntMap::iterator found = smap->find(s);
  if (found != smap->end()) {
    return found->second;   // Already known
  }
  // New
  const int32 fresh_id = *next_smap;
  (*smap)[s] = fresh_id;
  *next_smap = fresh_id + 1;
  return fresh_id;
}
// Resets the three id counters used by the tallying code. The whole body is
// disabled with #if 0, so as written this function does nothing.
void InitResult() {
#if 0
next_in_map = 0;
next_cld_map = 0;
next_in_cld_map = 0;
// Compares the stated language/script of one sample with CLD's result (via
// CarefulMatch) and, when FLAGS_echo_mismatch is set and they do not match,
// reports the expected and actual language codes. The tallying code after
// that is disabled with #if 0.
//   buffer                 the sample line (not referenced in the visible code)
//   in_langscript          stated lang-script string of the sample
//   in_lang, in_lscript    stated language and script
//   cld_lang, cld_lscript  language and script reported by CLD
void RecordCLDResult(const char* buffer, const char* in_langscript,
Language in_lang, ULScript in_lscript,
Language cld_lang, ULScript cld_lscript) {
bool match = CarefulMatch(in_langscript,
in_lang, in_lscript, cld_lang, cld_lscript);
if (FLAGS_echo_mismatch && !match) {
// NOTE(review): the call that consumes this format string and its two
// arguments is not visible here; presumably a printf/fprintf -- confirm.
"&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;=Mismatch: "
"expected %s, actual %s<br>\n",
LanguageCode(in_lang), LanguageCode(cld_lang));
// Disabled: print the pair, then count it per stated lang-script, per
// detected lang-script, and per (stated, detected) combination.
#if 0
printf("%s %s-%s %s\n", in_langscript,
ExtLanguageCode(cld_lang), UnicodeLScriptCode(cld_lscript),
match ? "" : "!=");
// Build the string keys for the detected result and for the combination.
string cld_langscript = ExtLanguageCode(cld_lang);
cld_langscript.append( UnicodeLScriptCode(cld_lscript));
string in_cld_langscript = in_langscript;
// Extend vectors if needed
int32 in_int = MapToSmallInt(in_langscript, &in_map, &next_in_map);
while (in_count.size() <= in_int) {in_count.push_back(0);}
in_count[in_int] += 1;
int32 cld_int = MapToSmallInt(cld_langscript, &cld_map, &next_cld_map);
while (cld_count.size() <= cld_int) {cld_count.push_back(0);}
cld_count[cld_int] += 1;
int32 in_cld_int = MapToSmallInt(in_cld_langscript,
&in_cld_map, &next_in_cld_map);
while (in_cld_count.size() <= in_cld_int) {in_cld_count.push_back(0);}
in_cld_count[in_cld_int] += 1;
// Prints per-lang-script precision and recall, each followed by a TOTAL
// line, from the counts kept in in_cld_map / in_cld_count. The whole body is
// disabled with #if 0, so as written this function does nothing.
void FinishResult() {
#if 0
int32 in_n = in_map.size();
int32 cld_n = cld_map.size();
// Per stated lang-script: sample total, matched total, and name by id.
int32* in_total = new int32[in_n];
memset(in_total, 0, in_n * sizeof(int32));
int32* in_matches = new int32[in_n];
memset(in_matches, 0, in_n * sizeof(int32));
string* in_str = new string[in_n];
// Invert in_map: id -> string
for (StringIntMap::iterator it = in_map.begin(); it != in_map.end(); ++it) {
in_str[it->second] = it->first;
// Per detected lang-script: sample total, matched total, and name by id.
int32* cld_total = new int32[cld_n];
memset(cld_total, 0, cld_n * sizeof(int32));
int32* cld_matches = new int32[cld_n];
memset(cld_matches, 0, cld_n * sizeof(int32));
string* cld_str = new string[cld_n];
// Invert cld_map: id -> string
for (StringIntMap::iterator it = cld_map.begin(); it != cld_map.end(); ++it) {
cld_str[it->second] = it->first;
// Walk every (stated, detected) combination and add its count to the totals.
for (StringIntMap::iterator it = in_cld_map.begin();
it != in_cld_map.end(); ++it) {
string in_cld = it->first;
int32 in_cld_int = it->second;
//printf("%s[%d] = %d\n", in_cld.c_str(), in_cld_int, in_cld_count[in_cld_int]);
// Decompose it all
// The key is split at the first '_' into stated and detected parts.
int under_pos = in_cld.find("_");
string in_langscript = in_cld.substr(0, under_pos);
string cld_langscript = in_cld.substr(under_pos + 1);
int32 in_int = MapToSmallInt(in_langscript, &in_map, &next_in_map);
int32 cld_int = MapToSmallInt(cld_langscript, &cld_map, &next_cld_map);
Language in_lang = GetLanguageFromNumberOrName(in_langscript.c_str());
ULScript in_lscript = GetLScriptFromNumberOrName(in_langscript.c_str());
Language cld_lang = GetLanguageFromNumberOrName(cld_langscript.c_str());
ULScript cld_lscript = GetLScriptFromNumberOrName(cld_langscript.c_str());
bool match = CarefulMatch(in_langscript.c_str(),
in_lang, in_lscript, cld_lang, cld_lscript);
//printf("%s-%s %s-%s #=%d %d %d %s\n",
// ExtLanguageCode(in_lang), UnicodeLScriptCode(in_lscript),
// ExtLanguageCode(cld_lang), UnicodeLScriptCode(cld_lscript),
// in_cld_count[in_cld_int], in_int, cld_int, match ? "match" : "!=");
in_total[in_int] += in_cld_count[in_cld_int];
cld_total[cld_int] += in_cld_count[in_cld_int];
if (match) {
in_matches[in_int] += in_cld_count[in_cld_int];
cld_matches[cld_int] += in_cld_count[in_cld_int];
// Precision: matches / total, grouped by detected lang-script.
int32 total = 0;
int32 match_total = 0;
for (int i = 0; i < cld_n; ++i) {
printf("Precision: %s %d/%d = %6.4f\n",
cld_str[i].c_str(), cld_matches[i], cld_total[i],
cld_total[i] == 0 ? 0.0 : (cld_matches[i] * 1.0) / cld_total[i]);
total += cld_total[i];
match_total += cld_matches[i];
printf("Precision: %s %d/%d = %6.4f\n",
"TOTAL", match_total, total,
total == 0 ? 0.0 : (match_total * 1.0) / total);
// Recall: matches / total, grouped by stated lang-script.
total = 0;
match_total = 0;
for (int i = 0; i < in_n; ++i) {
printf("Recall: %s %d/%d = %6.4f\n",
in_str[i].c_str(), in_matches[i], in_total[i],
in_total[i] == 0 ? 0.0 : (in_matches[i] * 1.0) / in_total[i]);
total += in_total[i];
match_total += in_matches[i];
printf("Recall: %s %d/%d = %6.4f\n",
"TOTAL", match_total, total,
total == 0 ? 0.0 : (match_total * 1.0) / total);
// NOTE(review): no delete[] for the six arrays allocated above is visible
// here -- confirm whether they are released before re-enabling this code.
// Returns true for any byte at or below '9' (0x39). That covers the digits,
// space and tab, but also control characters and ASCII punctuation such as
// '!', ',', '-', '.' and '/'.
bool SkipMe(char c) {
  return static_cast<unsigned char>(c) <= '9';
}

// Remove any trailing digits/spaces (possible mapreduce counts), together
// with any other trailing bytes that SkipMe accepts.
//   buffer  NUL-terminated line, shortened in place
// Return length of the trimmed string; 0 if nothing is left.
int Trim(char* buffer) {
  int buffer_len = static_cast<int>(strlen(buffer));
  // Stop at the start of the buffer: an empty line, or one made up entirely
  // of skippable bytes, would otherwise read buffer[-1] and then write the
  // terminator at a negative index.
  while ((buffer_len > 0) && SkipMe(buffer[buffer_len - 1])) {--buffer_len;}
  buffer[buffer_len] = '\0';
  return buffer_len;
}
// Reads fname line by line. For each line that is not a comment it extracts
// the stated lang-script, runs language detection on the text that follows
// the prefix fields, and hands the outcome to RecordCLDResult.
//   flags       CLD flag bits; kCLDFlagQuiet is OR-ed in for every line
//   get_vector  when true, a ResultChunkVector is passed to the detector
//   fname       path of the input file, opened in binary mode
void LangDetLinesOfFile(int flags, bool get_vector, const char* fname) {
FILE* fin = fopen(fname, "rb");
if (fin == NULL) {
fprintf(stderr, "Did not open %s\n", fname);
// NOTE(review): no early return is visible after this message -- confirm
// that a NULL fin is never passed on to ReadLine below.
// Expecting
// Samp af-Latn /afr/ word tot skuldig bevind volgens die wet, in...
char buffer[kMaxBuffer];
while (ReadLine(fin, buffer, kMaxBuffer)) {
if (IsComment(buffer)) {continue;}
// Drop trailing counts/spaces before parsing the fields.
int buffer_len = Trim(buffer);
string buffer_str(buffer, buffer_len);
string lang_script;
string tld;
// Get lang-script
GetStatedLangScript(buffer_str, &lang_script, &tld);
Language in_lang = GetLanguageFromName(lang_script.c_str());
ULScript in_lscript = GetULScriptFromName(lang_script.c_str());
// Get Text; skip over any prefix fields
int pos = GetTextBeginPos(buffer_str);
if (pos == string::npos) {continue;}
// NOTE(review): this initializer looks truncated (as written it is just
// "+ pos"); presumably the start of the line's text plus pos -- confirm.
const char* src = + pos;
int src_len = buffer_str.size() - pos;
if (src_len < FLAGS_minsize) {continue;} // Skip if too short
// Detect language in one line of UTF-8
bool is_plain_text = false;
const char* tldhint = "";
Encoding enchint = UNKNOWN_ENCODING;
Language langhint = UNKNOWN_LANGUAGE;
// Full-blown flag-bit and hints interface
bool allow_extended_lang = true;
// Caller initializes flags
Language plus_one = UNKNOWN_LANGUAGE;
Language language3[3];
int percent3[3];
double normalized_score3[3];
ResultChunkVector resultchunkvector;
int text_bytes;
bool is_reliable;
// Detected language biased summary (biased against English)
Language summary_lang = UNKNOWN_LANGUAGE;
// Identify the expected value
fprintf(stderr, "Samp %s ", lang_script.c_str());
flags |= kCLDFlagQuiet;
// The tld parsed from the line is not used as a hint: tldhint is "".
CLDHints cldhints = {NULL, tldhint, enchint, langhint};
// NOTE(review): the argument list of this call appears incomplete in this
// view; only src, src_len and the chunk-vector argument are visible.
summary_lang = DetectLanguageSummaryV2(
src, src_len,
get_vector ? &resultchunkvector : NULL,
// Disabled: calls through the older summary interfaces.
#if 0
if (FLAGS_noext) {
summary_lang = DetectLanguageSummary(
src, src_len,
} else {
summary_lang = ExtDetectLanguageSummary(
src, src_len,
if (get_vector) {
DumpResultChunkVector(stderr, src, &resultchunkvector);
// An unreliable detection is scored as UNKNOWN_LANGUAGE.
if (!is_reliable) {summary_lang = UNKNOWN_LANGUAGE;}
// The stated script in_lscript is passed as the detected script too, so
// only the language can differ in the comparison.
RecordCLDResult(buffer, lang_script.c_str(),
in_lang, in_lscript,
summary_lang, in_lscript);
int main (int argc, char *argv[])
int flags = 0;
bool get_vector = false;
for (int i = 1; i < argc; ++i) {
if (strcmp(argv[i], "--scoreasquads") == 0) {flags |= kCLDFlagScoreAsQuads;}
if (strcmp(argv[i], "--html") == 0) {flags |= kCLDFlagHtml;}
if (strcmp(argv[i], "--cr") == 0) {flags |= kCLDFlagCr;}
if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;}
if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
if (FLAGS_cld2_html) {
// Begin HTML file
fprintf(stderr, "<html><meta charset=\"UTF-8\"><body>\n");
fprintf(stderr, "<style media=\"print\" type=\"text/css\"> "
":root { -webkit-print-color-adjust: exact; } </style>\n");
fprintf(stderr, "<span style=\"font-size: 7pt\">\n");
for (int i = 1; i < argc; ++i) {
if (argv[i][0] != '-') {
const char* fname = argv[i];
fprintf(stderr, "file = %s<br><br>\n", fname ? fname : "stdin");
LangDetLinesOfFile(flags, get_vector, fname);
if (FLAGS_cld2_html) {
fprintf(stderr, "\n</span></body></html><br>");
return 0;
} // End namespace CLD2
// Process entry point: forwards the command line to CLD2::main and returns
// its result as the exit status.
int main(int argc, char *argv[]) {
return CLD2::main(argc, argv);