// blob: 8fe717b8fcf544b20d3c1483bac6594584c3cacb [file] [log] [blame]
// Copyright 2013 Google Inc. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
// Author: (Dick Sites)
// Terminology:
// Incoming original text has HTML tags and entities removed, all but letters
// removed, and letters lowercased. Strings of non-letters are mapped to a
// single ASCII space.
// One scriptspan has a run of letters/spaces in a single script. This is the
// fundamental text unit that is scored. There is an optional backmap from
// scriptspan text to the original document text, so that the language ranges
// reported in ResultChunkVector refer to byte ranges in the original text.
// Scripts come in two forms, the full Unicode scripts described by the
// Unicode Character Database (Scripts.txt), and a modified list used
// exclusively in CLD2. The modified form maps all
// the CJK scripts to one, Hani. The current version description is in
// i18n/encodings/cld2/builddata/script_summary.txt
// In addition, all non-letters are mapped to the Common script.
// ULScript describes this Unicode Letter script.
// Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams.
// Nilgrams (no text lookup at all) are for script-based pseudo-languages and
// for languages that are 1:1 with a given script. Unigrams and bigrams are
// used to score the CJK languages, all in the Hani script. Quadgrams and
// octagrams are used to score all other languages.
// RType is the Recognition Type per ulscript.
// The scoring tables map various grams to language-probability scores.
// A given gram that hits in scoring table maps to an indirect subscript into
// a list of packed languages and log probabilities.
// Languages are stored in two forms: 10-bit values in the Language enum, and
// shorter 8-bit per-ulscript values in the scoring tables.
// Language refers to the full 10-bit range.
// pslang refers to the per-ulscript shorter values.
// Log probabilities also come in two forms. The full range uses values 0..255
// to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about
// TODO BOGUS description, 24 vs 12
// 1/47.5M. The second form quantizes these into multiples of 8 that can be
// added together to represent probability products. The quantized form uses
// values 24..0 with 0 now least likely instead of most likely, thus making
// larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28)
// and 0 maps to original 1/2**24.0 (~1/16M).
// qprob refers to quantized log probabilities.
// langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to
// a list of three qprobs. It always needs a companion ulscript.
// A scriptspan is scored via one or more hitbuffers
#include <stdio.h>
#include <string.h> // for memset
#include "integral_types.h" // for uint8 etc.
#include "cld2tablesummary.h"
#include "compact_lang_det_impl.h" // for ResultChunkVector
#include "getonescriptspan.h"
#include "langspan.h"
#include "tote.h"
#include "utf8statetable.h"
namespace CLD2 {
// Number of langprob slots in each LangBoosts (one LangBoosts per member of
// PerScriptLangBoosts). Must remain a power of two, because
// LangBoosts::wrap() masks subscripts with (kMaxBoosts - 1).
static const int kMaxBoosts = 4;

// Chunk sizes, in scored grams.
static const int kChunksizeQuads = 20;    // For non-CJK
static const int kChunksizeUnis = 50;     // For CJK

// Capacity of the per-kind hit arrays in ScoringHitBuffer.
static const int kMaxScoringHits = 1000;

// Number of quad-sized chunks that fit in one full hit buffer.
static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads;
// Set of lookup tables used while scoring.
// The first four tables are for CJK languages,
// the next four (primary and secondary quadgram, delta-octa, distinct-octa)
// are for quadgram languages, and
// the last is for expected scores.
typedef struct ScoringTables {
  // --- CJK tables ---
  const UTF8PropObj* unigram_obj;               // 80K CJK characters
  const CLD2TableSummary* unigram_compat_obj;   // 256 CJK lookup probabilities
  const CLD2TableSummary* deltabi_obj;
  const CLD2TableSummary* distinctbi_obj;

  // --- Quadgram-language tables ---
  const CLD2TableSummary* quadgram_obj;         // Primary quadgram lookup table
  const CLD2TableSummary* quadgram_obj2;        // Secondary quadgram lookup table
  const CLD2TableSummary* deltaocta_obj;
  const CLD2TableSummary* distinctocta_obj;

  // --- Expected scores ---
  // Expected base + delta + distinct score per 1KB input.
  // Subscripted by language and script4.
  const short* kExpectedScore;
} ScoringTables;
// Context for boosting several languages.
// Holds kMaxBoosts packed langprob entries; wrap() masks a subscript into
// the range 0..kMaxBoosts-1.
//
// The struct carries a tag name because it declares a member function: an
// unnamed class that gets its name only from a typedef may not have member
// functions as of C++20, and compilers already warn about it.
typedef struct LangBoosts {
  int32 n;                        // presumably the running subscript into
                                  // langprob[] -- TODO confirm against users
  uint32 langprob[kMaxBoosts];    // Packed language/probability entries

  // Returns n reduced to a valid langprob[] subscript. Relies on kMaxBoosts
  // being a power of two. Note that the parameter n hides the member n.
  int wrap(int32 n) const {return n & (kMaxBoosts - 1);}
} LangBoosts;
// A pair of LangBoosts selected by script.
// NOTE(review): presumably latn is used for Latin-script text and othr for
// every other script -- confirm against the code that picks between them.
typedef struct PerScriptLangBoosts {
  LangBoosts latn;
  LangBoosts othr;
} PerScriptLangBoosts;
// ScoringContext carries state across scriptspans
// ScoringContext also has read-only scoring tables mapping grams to qprobs
//
// The struct carries a tag name because it declares a member function: an
// unnamed class that gets its name only from a typedef may not have member
// functions as of C++20, and compilers already warn about it.
typedef struct ScoringContext {
  FILE* debug_file;                     // Non-NULL if debug output wanted
  bool flags_cld2_score_as_quads;
  bool flags_cld2_html;
  bool flags_cld2_cr;
  bool flags_cld2_verbose;
  ULScript ulscript;          // langprobs below are with respect to this script
  Language prior_chunk_lang;  // Mostly for debug output

  // boost has a packed set of per-script langs and probabilities
  // whack has a per-script lang to be suppressed from ever scoring (zeroed)
  // When a language in a close set is given as an explicit hint, others in
  // that set will be whacked.
  PerScriptLangBoosts langprior_boost;  // From http content-lang or meta lang=
  PerScriptLangBoosts langprior_whack;  // From http content-lang or meta lang=
  PerScriptLangBoosts distinct_boost;   // From distinctive letter groups
  int oldest_distinct_boost;            // Subscript in hitbuffer of oldest
                                        // distinct score to use
  const ScoringTables* scoringtables;   // Probability lookup tables
  ScriptScanner* scanner;               // For ResultChunkVector backmap

  // Inits boosts: zero-fills langprior_boost, langprior_whack and
  // distinct_boost. No other member is written here.
  void init() {
    memset(&langprior_boost, 0, sizeof(langprior_boost));
    memset(&langprior_whack, 0, sizeof(langprior_whack));
    memset(&distinct_boost, 0, sizeof(distinct_boost));
  }
} ScoringContext;
// Begin private
// One hit from a scoring-table lookup.
// The hit stores the indirect subscript rather than a langprob, so that a
// single hit can stand for a variable number of langprobs.
typedef struct ScoringHit {
  int offset;       // First byte of quad/octa etc. in scriptspan
  int indirect;     // Subscript of langprobs in scoring table
} ScoringHit;
// Kind of hit stored in LangprobHit.type.
// NOTE(review): no enumerators are visible in this definition. The list may
// have been lost from this copy of the file -- confirm against the upstream
// header before relying on any LinearHitType value.
typedef enum {
} LinearHitType;
// One scoring-table lookup hit after it has been resolved into a langprob.
typedef struct LangprobHit {
  uint16 offset;      // First byte of quad/octa etc. in scriptspan
  uint16 type;        // LinearHitType
  uint32 langprob;    // langprob from scoring table
} LangprobHit;
// Holds arrays of scoring-table lookup hits for (part of) a scriptspan
typedef struct {
ULScript ulscript; // langprobs below are with respect to this script
int maxscoringhits; // determines size of arrays below
int next_base; // First unused entry in each array
int next_delta; // "
int next_distinct; // "
int next_linear; // "
int next_chunk_start; // First unused chunk_start entry
int lowest_offset; // First byte of text span used to fill hitbuffer
// Dummy entry at the end of each giving offset of first unused text byte
ScoringHit base[kMaxScoringHits + 1]; // Uni/quad hits
ScoringHit delta[kMaxScoringHits + 1]; // delta-bi/delta-octa hits
ScoringHit distinct[kMaxScoringHits + 1]; // distinct-word hits
LangprobHit linear[4 * kMaxScoringHits + 1]; // Above three merge-sorted
// (4: some bases => 2 linear)
int chunk_start[kMaxSummaries + 1]; // First linear[] subscr of
// each scored chunk
int chunk_offset[kMaxSummaries + 1]; // First text subscr of
// each scored chunk
void init() {
ulscript = ULScript_Common;
maxscoringhits = kMaxScoringHits;
next_base = 0;
next_delta = 0;
next_distinct = 0;
next_linear = 0;
next_chunk_start = 0;
lowest_offset = 0;
base[0].offset = 0;
base[0].indirect = 0;
delta[0].offset = 0;
delta[0].indirect = 0;
distinct[0].offset = 0;
distinct[0].indirect = 0;
linear[0].offset = 0;
linear[0].langprob = 0;
chunk_start[0] = 0;
chunk_offset[0] = 0;
} ScoringHitBuffer;
// TODO: Explain here why we need both ChunkSpan and ChunkSummary
// Start subscripts and lengths of one chunk within the three hit arrays of a
// hitbuffer.
typedef struct ChunkSpan {
  // Where the chunk starts in each array
  int chunk_base;         // Subscript of first hitbuffer.base[] in chunk
  int chunk_delta;        // Subscript of first hitbuffer.delta[] in chunk
  int chunk_distinct;     // Subscript of first hitbuffer.distinct[] in chunk
  // How many entries of each array belong to the chunk
  int base_len;           // Number of hitbuffer.base[] in chunk
  int delta_len;          // Number of hitbuffer.delta[] in chunk
  int distinct_len;       // Number of hitbuffer.distinct[] in chunk
} ChunkSpan;
// Result of scoring one chunk.
// Packed into 20 bytes for space (nine uint16 fields plus two uint8 fields).
typedef struct ChunkSummary {
  // Position of the chunk
  uint16 offset;              // Text offset within current scriptspan.text
  uint16 chunk_start;         // Scoring subscr within hitbuffer->linear[]
  // Two best languages and their scores
  uint16 lang1;               // Top lang, mapped to full Language
  uint16 lang2;               // Second lang, mapped to full Language
  uint16 score1;              // Top lang raw score
  uint16 score2;              // Second lang raw score
  // Size of the chunk
  uint16 bytes;               // Number of lower letters bytes in chunk
  uint16 grams;               // Number of scored base quad- uni-grams in chunk
  uint16 ulscript;            // ULScript of chunk
  // Reliability estimates
  uint8 reliability_delta;    // Reliability 0..100, delta top:second scores
  uint8 reliability_score;    // Reliability 0..100, top:expected score
} ChunkSummary;
// Buffer of chunk summaries for one hit buffer.
// We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a
// 1000-quad hit buffer, so we can do boundary adjustment on them
// when adjacent entries are different languages. After that, we add them
// all into the document score
// About 50 * 20 = 1000 bytes. OK for stack alloc
typedef struct SummaryBuffer {
  int n;      // presumably the number of chunksummary[] entries in use --
              // TODO confirm against the code that fills the buffer
  ChunkSummary chunksummary[kMaxSummaries + 1];
} SummaryBuffer;
// End private
// Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
// scoringcontext
//   scriptspan      text span to score; taken by const reference
//   scoringcontext  scoring state carried across scriptspans; updated
//   doc_tote        receives the score for the document
//   vec             receives the result chunks
void ScoreEntireScriptSpan(const LangSpan& scriptspan,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec);
// Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
//   scriptspan      text span to score; taken by const reference
//   scoringcontext  scoring state carried across scriptspans; updated
//   doc_tote        receives the score for the document
//   vec             receives the result chunks
void ScoreCJKScriptSpan(const LangSpan& scriptspan,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec);
// Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
//   scriptspan      text span to score; taken by const reference
//   scoringcontext  scoring state carried across scriptspans; updated
//   doc_tote        receives the score for the document
//   vec             receives the result chunks
void ScoreQuadScriptSpan(const LangSpan& scriptspan,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec);
// Score one scriptspan into doc_tote and vec, updating scoringcontext
// NOTE(review): presumably the general entry point that selects among the
// three more specific scoring functions declared in this header -- confirm
// against the definition.
//   scriptspan      text span to score; taken by const reference
//   scoringcontext  scoring state carried across scriptspans; updated
//   doc_tote        receives the score for the document
//   vec             receives the result chunks
void ScoreOneScriptSpan(const LangSpan& scriptspan,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec);
} // End namespace CLD2