// blob: 440cf992b79479b85625852962a7e3b2eee2db6e [file] [log] [blame]
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Little program to read lines of sample text, calculate score per 1024 bytes
// per language-script4 combination
// Possible input file /export/hda3/cld/pre2010/b0_samp_prune_20100722.utf8
#include <stdio.h>
#include <string.h>
#include <string>
#include "compact_lang_det_impl.h"
#include "lang_script.h"
using namespace std;
using namespace CLD2;
// Running totals, indexed by language (first index) and by the value that
// LScript4() returns for the script (second index). Both tables are zeroed in
// main() and added to by Flush().
// bytes:  byte counts that Flush() receives as total_bytes_cur_lang.
// scores: score totals that Flush() receives as total_score_cur_lang.
double bytes[NUM_LANGUAGES][4];
double scores[NUM_LANGUAGES][4];
// Runs DetectLanguageSummaryV2() over one buffer of text.
//   buffer, buffer_length: the text to score.
//   bytes:          receives the text byte count reported by the detector.
//   score_per_1kb:  receives the normalized score of the first-ranked language.
// Returns the first-ranked language.
Language ScoreOneLine(const char* buffer, int buffer_length,
                      int* bytes, double* score_per_1kb) {
  // Fixed detector options: plain text, no hints, extended languages allowed,
  // no flags, no "plus one" language, no per-chunk results requested.
  const bool plain_text = true;
  const CLDHints* no_hints = NULL;
  const bool extended_langs = true;
  const int no_flags = 0;
  const Language no_plus_one = UNKNOWN_LANGUAGE;
  ResultChunkVector* no_chunks = NULL;

  // Outputs filled in by the detector.
  Language top_langs[3];
  int top_percents[3];
  double top_scores[3];
  int detected_bytes;
  bool reliable;

  // The summary language returned by the call is not used here; only the
  // first entry of the ranked results is reported.
  DetectLanguageSummaryV2(buffer, buffer_length,
                          plain_text, no_hints, extended_langs,
                          no_flags, no_plus_one,
                          top_langs, top_percents, top_scores,
                          no_chunks, &detected_bytes, &reliable);

  *bytes = detected_bytes;
  *score_per_1kb = top_scores[0];
  return top_langs[0];
}
// Line terminator bytes stripped by ReadLine().
#define LF 0x0a
#define CR 0x0d
// Size in bytes of the line buffer that main() passes to ReadLine().
const int kMaxBuffer = 5 * 1024;

// Reads the next line of infile into buffer (at most maxlen - 1 bytes plus the
// terminating NUL) and strips one trailing LF followed by one trailing CR.
// Returns false at end of file or on a read error, true otherwise.
// A line longer than the buffer is returned in pieces by successive calls.
bool ReadLine(FILE* infile, char* buffer, size_t maxlen) {
  if (maxlen == 0) {return false;}
  if (fgets(buffer, static_cast<int>(maxlen), infile) == NULL) {
    return false;
  }
  size_t len = strlen(buffer);
  // Check len before each look at buffer[len - 1]: an empty read or a line
  // holding only "\n" must never touch the byte in front of the buffer.
  if ((len > 0) && (buffer[len - 1] == LF)) {buffer[--len] = '\0';}
  if ((len > 0) && (buffer[len - 1] == CR)) {buffer[--len] = '\0';}
  return true;
}
// Tells whether a line is to be skipped as a comment: empty lines, lines that
// begin with '#', and lines that begin with a space all count as comments.
bool IsComment(const char* buffer) {
  switch (buffer[0]) {
    case '\0':  // empty line
    case '#':
    case ' ':   // any leading space marks a comment
      return true;
    default:
      return false;
  }
}
// Skips over one field "xxxxx_" starting at pos, where _ is one or more
// spaces/tabs.
// Returns the index of the first character after that whitespace run.
// Returns -1 (which compares equal to string::npos once converted to the
// unsigned size type) when pos is already "no field" or when src has no
// further field after pos.
int SkipOneField(const std::string& src, int pos) {
  const int kNoField = -1;
  // Any negative pos, including npos narrowed to int, means "no more fields".
  if (pos < 0) {return kNoField;}

  // Keep the search results in the string's own unsigned size type so that
  // the npos tests do not depend on narrowing or mixed-sign comparisons.
  const std::string::size_type field_end =
      src.find_first_of(" \t", static_cast<std::string::size_type>(pos));
  if (field_end == std::string::npos) {return kNoField;}

  const std::string::size_type next_field =
      src.find_first_not_of(" \t", field_end);
  if (next_field == std::string::npos) {return kNoField;}

  return static_cast<int>(next_field);
}
// Return language and script from parsed line or defaults
void GetLangScript(const string& src,
Language default_lang, ULScript default_lscript,
Language* target_lang, ULScript* target_lscript,
string* tld) {
*target_lang = default_lang;
*target_lscript = default_lscript;
*tld = "";
int pos = 0;
int pos2 = 0;
if (src.substr(0,7) == "SAMPLE ") {
// SAMPLE ll-Ssss
pos = SkipOneField(src, pos);
} else if (src.substr(0,5) == "SAMP ") {
// SAMP ll-Ssss /tld2.tld/
pos = SkipOneField(src, pos);
pos2 = SkipOneField(src, pos);
} else if (src.substr(0,5) == "Samp ") {
// Samp ll-Ssss /tld2.tld/
pos = SkipOneField(src, pos);
pos2 = SkipOneField(src, pos);
}
if (pos == 0) {return;}
if (pos == string::npos) {return;}
// Pos is at the first letter of language-script combination
int end = src.find_first_of(" \t", pos); // find end of lang-script
if (end == string::npos) {return;}
*target_lang = GetLanguageFromName(src.substr(pos, end - pos).c_str());
*target_lscript = GetULScriptFromName(src.substr(pos, end - pos).c_str());
// Pos2 is 0 or at the first letter of the tld string
if (pos2 == 0) {return;}
if (pos2 == string::npos) {return;}
end = src.find_first_of(" \t", pos2);
if (end == string::npos) {return;}
*tld = src.substr(pos2, end - pos2);
}
// Return position of start of text
int GetTextBeginPos(const string& src) {
int pos = 0;
if (src.size() < 8) {return pos;}
if (src.substr(0,7) == "SAMPLE ") {
// Skip SAMPLE ll-Ssss
pos = SkipOneField(src, pos);
pos = SkipOneField(src, pos);
} else if (src.substr(0,5) == "SAMP ") {
// Skip SAMP ll-Ssss /tld2.tld/
pos = SkipOneField(src, pos);
pos = SkipOneField(src, pos);
pos = SkipOneField(src, pos);
} else if (src.substr(0,5) == "Samp ") {
// Skip Samp ll-Ssss /tld2.tld/
pos = SkipOneField(src, pos);
pos = SkipOneField(src, pos);
pos = SkipOneField(src, pos);
}
return pos;
}
// Makes a value safe to divide by: a positive x is passed through unchanged,
// anything else is replaced by 1.0.
inline double Divisor(double x) {
  if (x > 0.0) {
    return x;
  }
  return 1.0;
}
// Closes out the totals gathered for one language-script pair.
// Adds total_bytes_cur_lang and total_score_cur_lang to the global bytes and
// scores tables, then prints a SUMMARY line with the byte total, the score per
// 1024 bytes and the percentage of bad bytes.
// Does nothing when cur_lang is UNKNOWN_LANGUAGE.
void Flush(Language cur_lang, ULScript ulscript,
           double total_score_cur_lang,
           double total_bytes_cur_lang, double total_bad_bytes_cur_lang) {
  if (cur_lang == UNKNOWN_LANGUAGE) {
    return;
  }

  bytes[cur_lang][LScript4(ulscript)] += total_bytes_cur_lang;
  scores[cur_lang][LScript4(ulscript)] += total_score_cur_lang;

  // Divisor() keeps both ratios finite when no bytes were counted.
  const double all_bytes = total_bytes_cur_lang + total_bad_bytes_cur_lang;
  const double score_per_1kb =
      total_score_cur_lang * 1024.0 / Divisor(total_bytes_cur_lang);
  const double bad_percent =
      100.0 * total_bad_bytes_cur_lang / Divisor(all_bytes);

  fprintf(stdout, "%s-%s %7.0f %6.1f, %2.0f%% bad SUMMARY\n\n",
          LanguageCode(cur_lang), ULScriptCode(ulscript),
          total_bytes_cur_lang, score_per_1kb, bad_percent);
}
// Returns scores[i][j] scaled to 1024 bytes of bytes[i][j], rounded by adding
// 0.5 and truncating to int. Divisor() covers table entries with no bytes.
int BytesPer1KB(int i, int j) {
  const double per_1kb = (scores[i][j] * 1024.0) / Divisor(bytes[i][j]);
  return static_cast<int>(per_1kb + 0.5);
}
int main(int argc, char *argv[]) {
Language cur_lang = UNKNOWN_LANGUAGE;
ULScript cur_ulscript = ULScript_Common;
double total_score_cur_lang = 0.0;
double total_bytes_cur_lang = 0.0;
double total_bad_bytes_cur_lang = 0.0;
memset(bytes, 0, sizeof(bytes));
memset(scores, 0, sizeof(bytes));
char buffer[kMaxBuffer];
int buffer_length;
const char* filename = NULL;
FILE* infile = stdin;
for (int i = 1; i < argc; ++i) {
if (argv[i][0] != '-') {
filename = argv[i];
}
}
if (filename != NULL) {
infile = fopen(filename, "r");
if (infile == NULL) {
fprintf(stderr, "%s did not open\n", filename);
return 0;
}
}
while (ReadLine(infile, buffer, kMaxBuffer)) {
if (IsComment(buffer)) {continue;}
buffer_length = strlen(buffer);
int bytes;
double score_per_1kb;
Language toplang;
Language target_lang;
ULScript target_ulscript;
string src(buffer, buffer_length);
string tld("");
int pos = GetTextBeginPos(src);
GetLangScript(src, UNKNOWN_LANGUAGE, ULScript_Common,
&target_lang, &target_ulscript, &tld);
if ((cur_lang != target_lang) || (cur_ulscript != target_ulscript)) {
Flush(cur_lang, cur_ulscript, total_score_cur_lang,
total_bytes_cur_lang, total_bad_bytes_cur_lang);
cur_lang = target_lang;
cur_ulscript = target_ulscript;
total_score_cur_lang = 0.0;
total_bytes_cur_lang = 0.0;
total_bad_bytes_cur_lang = 0.0;
}
toplang = ScoreOneLine(&src[pos], src.size() - pos, &bytes, &score_per_1kb);
fprintf(stdout, "%s%c %d %4.1f %s\n",
LanguageCode(toplang),
(toplang == target_lang) ? ' ' : '*',
bytes, score_per_1kb, buffer);
// Only count when detected lang matches the claimed target lang
if (toplang == target_lang) {
total_bytes_cur_lang += bytes;
total_score_cur_lang += (score_per_1kb * bytes) / 1024.0;
} else {
total_bad_bytes_cur_lang += bytes;
}
}
Flush(cur_lang, cur_ulscript, total_score_cur_lang,
total_bytes_cur_lang, total_bad_bytes_cur_lang);
for (int i = 0; i < NUM_LANGUAGES; ++i) {
Language ilang = static_cast<Language>(i);
fprintf(stdout, " {%4d, %4d, %4d, %4d}, // %d %s %s\n",
BytesPer1KB(i, 0), BytesPer1KB(i, 1),
BytesPer1KB(i, 2), BytesPer1KB(i, 3),
i, LanguageName(ilang), LanguageCode(ilang));
}
if (infile != stdin) {
fclose(infile);
}
}