// blob: 4007ea6b2d1b5d240a6e59ed32baaedf5c9237ef [file] [log] [blame]
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#include "cldutil_offline.h"
#include "tote.h"
#include <string>
// File-local constant; by its name, a minimum UTF-8 byte length for CJK
// characters. It is not referenced in the visible part of this file.
// NOTE(review): confirm whether anything still uses it.
static const int kMinCJKUTF8CharBytes = 3;
//------------------------------------------------------------------------------
// Offline: used by mapreduce or table construction
//------------------------------------------------------------------------------
namespace CLD2 {
// Score one BIGRAM, QUADGRAM, or OCTAGRAM langprob into a tote.
// Input:  probs -- packed 4-byte entry: byte 0 is a probability-table
//                  subscript, bytes 1..3 are up to three language numbers
//                  (a language number of 0 marks an unused slot).
//         tote  -- accumulator receiving the per-language scores.
// Output: tote's running sums are updated, one Add() per non-zero language.
void ProcessProbV2Tote(uint32 probs, Tote* tote) {
  const uint8 table_subscr = probs & 0xff;
  const uint8* lgprob_row = LgProb2TblEntry(table_subscr);

  // Slot k's language sits in byte k+1; its probability is column k of the row.
  for (int slot = 0; slot < 3; ++slot) {
    const uint8 slot_lang = (probs >> (8 * (slot + 1))) & 0xff;
    if (slot_lang == 0) {continue;}       // unused slot contributes nothing
    tote->Add(slot_lang, LgProb3(lgprob_row, slot));
  }
}
// Looks up the langprob for the quadgram at the front of the text and steps
// the caller's cursor forward.
//
// Input:  isrc/isrclen -- in/out cursor into the text and bytes remaining.
//         wrt_quadgram_obj -- table used for the hash lookup and indirection.
//         rtype, wrt_unigram_obj -- accepted but not used by this body.
// Output: returns the packed langprob found for the quadgram, or 0 when
//         *isrclen <= 0 on entry. Advances *isrc and decrements *isrclen by the
//         same number of bytes.
uint32 GetNextLangprob(ULScriptRType rtype,
const CLD2TableSummary* wrt_unigram_obj,
const CLD2TableSummary* wrt_quadgram_obj,
const char** isrc, int* isrclen) {
// fprintf(stderr, "GetNextLangprob '%s' %d<br>\n", *isrc, *isrclen);
// Nothing left to scan (also catches a negative length left by a prior call)
if (*isrclen <= 0) {return 0;}
// Find one quadgram
const char* src = *isrc;
const char* srclimit = src + *isrclen;
// Skip a single leading space; the skipped byte is still counted in the
// final advance because that is measured from *isrc.
if (*src == ' ') {++src;}
// Take four table-driven steps from src, remembering the position after two.
// NOTE(review): these steps are not checked against srclimit; presumably the
// caller's buffer is padded so the reads stay in bounds -- confirm.
const char* src_end = src;
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
const char* src_mid = src_end;
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
int len = src_end - src;
// Hash the quadgram
uint32 quadhash = QuadHashV2(src, len);
uint32 probs = QuadHashV3Lookup4(wrt_quadgram_obj, quadhash);
// Clear the key-mask bits; what remains is the subscript into kCLDTableInd
int indirect_subscr = probs & ~wrt_quadgram_obj->kCLDTableKeyMask;
uint32 langprob;
if (indirect_subscr < static_cast<int>(wrt_quadgram_obj->kCLDTableSizeOne)) {
// Up to three languages at indirect
langprob = wrt_quadgram_obj->kCLDTableInd[indirect_subscr];
} else {
// Up to six languages at start + 2 * (indirect - start)
// Only the entry at that doubled position is read and returned here.
indirect_subscr += (indirect_subscr - wrt_quadgram_obj->kCLDTableSizeOne);
langprob = wrt_quadgram_obj->kCLDTableInd[indirect_subscr];
}
// Advance: all the way past word if at end-of-word, else 2 chars
if (src_end[0] == ' ') {
src = src_end;
} else {
src = src_mid;
}
// Take one more table-driven step if still inside the text, else clamp.
// That step is not itself clamped, so src may end up beyond srclimit and
// *isrclen may go negative; the entry guard above handles that next call.
if (src < srclimit) {
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
} else {
// Advancing by 4/8/16 can overshoot, but we are about to exit anyway
src = srclimit;
}
// Report total movement (including any skipped leading space) to the caller
int quadadvance = src - *isrc;
*isrc = src;
*isrclen -= quadadvance;
return langprob;
}
// Find top two langs and scores for one word; underpins delta tables
void DoWordScore(const char* isrc, int srclen, ULScript ulscript,
const CLD2TableSummary* wrt_unigram_obj,
const CLD2TableSummary* wrt_quadgram_obj,
Language* lang1, int* score1,
Language* lang2, int* score2) {
ULScriptRType rtype = ULScriptRecognitionType(ulscript);
Tote word_tote;
const char* src = isrc;
int len = srclen;
uint32 langprob;
// Advances src, decrements len
langprob = GetNextLangprob(rtype, wrt_unigram_obj, wrt_quadgram_obj,
&src, &len);
ProcessProbV2Tote(langprob, &word_tote);
// Advances src, decrements len
langprob = GetNextLangprob(rtype, wrt_unigram_obj, wrt_quadgram_obj,
&src, &len);
ProcessProbV2Tote(langprob, &word_tote);
int key3[3];
word_tote.CurrentTopThreeKeys(key3);
*lang1 = FromPerScriptNumber(ulscript, key3[0]);
*lang2 = FromPerScriptNumber(ulscript, key3[1]);
*score1 = word_tote.GetScore(key3[0]);
*score2 = word_tote.GetScore(key3[1]);
}
// Routines to store 3 or 5 log probabilities in a single byte.
// Resolution/range = 2**1 to 2**12
//------------------------------------------------------------------------------
// For constructing tables
// Given a vector of 3 probabilities 1..12, return the subscript of the
// probability-table entry that matches it best.
// "Best" is the smallest sum of squared differences over the three slots
// (equivalently, the smallest RMS error). A target of zero marks an unused
// slot and adds no error. Brute force: every table entry is examined, and the
// earliest entry wins ties.
uint8 FindBestProb3Match(const uint8* prob3) {
  int best_subscr = 0;
  int best_sqerr = 9999;          // starting value; any smaller error replaces it

  for (int subscr = 0; subscr < kLgProbV2TblSize; ++subscr) {
    const uint8* candidate = LgProb2TblEntry(subscr);
    int sqerr = 0;
    for (int slot = 0; slot < 3; ++slot) {
      if (prob3[slot] == 0) {continue;}   // unused slot: no error term
      const int diff = prob3[slot] - LgProb3(candidate, slot);
      sqerr += diff * diff;
    }
    if (sqerr < best_sqerr) {
      best_sqerr = sqerr;
      best_subscr = subscr;
    }
  }
  return static_cast<uint8>(best_subscr);
}
// Not sure who calls this...
// Return the probability recorded for the given language in a packed
// langprob, or 0 if the language is not one of its three slots.
// Input:  lang  -- language to look for; converted to a per-script number
//                  using ULScript_Latin (hard-coded here).
//         probs -- packed entry: byte 0 is the probability-table subscript,
//                  bytes 1, 2, 3 hold the first, second, third language.
// NOTE(review): a slot value of 0 means "unused" elsewhere in this file; if
// PerScriptNumber can return 0 for lang, an unused slot would match. Confirm.
int GetProb(Language lang, uint32 probs) {
  const int prob123 = (probs >> 0) & 0xff;
  const uint8* prob123_entry = LgProb2TblEntry(prob123);
  const int ilang = PerScriptNumber(ULScript_Latin, lang);

  // Slot k lives in byte k+1, i.e. shifts of 8, 16, and 24. The third slot
  // was previously read with a shift of 16, duplicating the second slot, so
  // the third language could never be found.
  for (int slot = 0; slot < 3; ++slot) {
    const int slot_lang = (probs >> (8 * (slot + 1))) & 0xff;
    if (ilang == slot_lang) {return LgProb3(prob123_entry, slot);}
  }
  return 0;
}
// Historically described as converting a unigram prob/lang byte into an
// approximate prob/lang triple, keeping just the largest value.
// Now unused: the body is a stub that ignores its argument and returns 0.
uint32 ApproxProb3(int propval) {
  static_cast<void>(propval);     // deliberately ignored
  const uint32 kEmptyLangprob = 0;
  return kEmptyLangprob;
}
// Take three packed languages and three probabilities 1..12 and put into uint32
// For offline construction of tables
// Input:  plang3 -- three per-script language numbers, 0 = unused.
//         prob3  -- the three matching probabilities.
//         Both arrays may be reordered in place (see below).
// Output: packed value with plang3[0], [1], [2] in bytes 1, 2, 3 and the
//         best-matching probability-table subscript in byte 0.
uint32 ProbPackV2(uint8* plang3, uint8* prob3) {
  // If < 3 entries, pack as top, 0, second, else pack as top, second, third
  // This allows FindBestProb3Match to always find a perfect match for < 3
  if (plang3[2] == 0) {
    // Swap [1] and [2]
    uint8 temp = plang3[2]; plang3[2] = plang3[1]; plang3[1] = temp;
    temp = prob3[2]; prob3[2] = prob3[1]; prob3[1] = temp;
  }

  // Widen to uint32 before shifting: a uint8 promotes to signed int, and
  // shifting a value >= 128 left by 24 would overflow that int.
  const uint32 lang_bits = (static_cast<uint32>(plang3[2]) << 24) |
                           (static_cast<uint32>(plang3[1]) << 16) |
                           (static_cast<uint32>(plang3[0]) << 8);
  const uint32 prob_subscr = FindBestProb3Match(prob3);
  return lang_bits | prob_subscr;
}
// Take uint32 and unpack into three packed languages and three probabilities
// For runtime use of tables
// Input:  prob -- packed entry: byte 0 is the probability-table subscript,
//                 bytes 1..3 are the three language numbers.
// Output: plang3[0..2] receive the language numbers; prob3[0..2] receive the
//         three probabilities read from the selected table entry.
void ProbUnpackV2(uint32 prob, uint8* plang3, uint8* prob3) {
  // Languages first: slot k is byte k+1 of the packed value
  for (int slot = 0; slot < 3; ++slot) {
    plang3[slot] = (prob >> (8 * (slot + 1))) & 0xff;
  }

  // Then expand the single-byte subscript into its three probabilities
  const int table_subscr = prob & 0xff;
  const uint8* lgprob_row = LgProb2TblEntry(table_subscr);
  for (int slot = 0; slot < 3; ++slot) {
    prob3[slot] = LgProb3(lgprob_row, slot);
  }
}
} // End namespace CLD2