| // Copyright 2013 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // |
| // Author: dsites@google.com (Dick Sites) |
| // |
| // |
| // Terminology: |
| // Incoming original text has HTML tags and entities removed, all but letters |
| // removed, and letters lowercased. Strings of non-letters are mapped to a |
| // single ASCII space. |
| // |
| // One scriptspan has a run of letters/spaces in a single script. This is the |
| // fundamental text unit that is scored. There is an optional backmap from |
| // scriptspan text to the original document text, so that the language ranges |
| // reported in ResultChunkVector refer to byte ranges inthe original text. |
| // |
| // Scripts come in two forms, the full Unicode scripts described by |
| // http://www.unicode.org/Public/UNIDATA/Scripts.txt |
| // and a modified list used exclusively in CLD2. The modified form maps all |
| // the CJK scripts to one, Hani. The current version description is in |
| // i18n/encodings/cld2/builddata/script_summary.txt |
| // In addition, all non-letters are mapped to the Common script. |
| // |
| // ULScript describes this Unicode Letter script. |
| // |
| // Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams. |
| // Nilgrams (no text lookup at all) are for script-based pseudo-languages and |
| // for languages that are 1:1 with a given script. Unigrams and bigrams are |
| // used to score the CJK languages, all in the Hani script. Quadgrams and |
| // octagrams are used to score all other languages. |
| // |
| // RType is the Recognition Type per ulscript. |
| // |
| // The scoring tables map various grams to language-probability scores. |
| // A given gram that hits in scoring table maps to an indirect subscript into |
| // a list of packed languages and log probabilities. |
| // |
| // Languages are stored in two forms: 10-bit values in the Languge enum, and |
| // shorter 8-bit per-ulscript values in the scoring tables. |
| // |
| // Language refers to the full 10-bit range. |
| // pslang refers to the per-ulscript shorter values. |
| // |
| // Log probabilities also come in two forms. The full range uses values 0..255 |
| // to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about |
| // TODO BOGUS description, 24 vs 12 |
| // 1/47.5M. The second form quantizes these into multiples of 8 that can be |
| // added together to represent probability products. The quantized form uses |
| // values 24..0 with 0 now least likely instead of most likely, thus making |
| // larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28) |
| // and 0 maps to original 1/2**24.0 (~1/16M). |
| // |
| // qprob refers to quantized log probabilities. |
| // |
| // langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to |
| // a list of three qprobs. It always nees a companion ulscript |
| // |
| // A scriptspan is scored via one or more hitbuffers |
| |
| |
| #ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ |
| #define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ |
| |
| #include <stdio.h> |
| |
| #include "integral_types.h" // for uint8 etc. |
| |
| #include "cld2tablesummary.h" |
| #include "compact_lang_det_impl.h" // for ResultChunkVector |
| #include "getonescriptspan.h" |
| #include "langspan.h" |
| #include "tote.h" |
| #include "utf8statetable.h" |
| |
| namespace CLD2 { |
| |
| static const int kMaxBoosts = 4; // For each of PerScriptLangBoosts |
| // must be power of two for wrap() |
| static const int kChunksizeQuads = 20; // For non-CJK |
| static const int kChunksizeUnis = 50; // For CJK |
| static const int kMaxScoringHits = 1000; |
| static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads; |
| |
| |
| // The first four tables are for CJK languages, |
| // the next three for quadgram languages, and |
| // the last for expected scores. |
| typedef struct { |
| const UTF8PropObj* unigram_obj; // 80K CJK characters |
| const CLD2TableSummary* unigram_compat_obj; // 256 CJK lookup probabilities |
| const CLD2TableSummary* deltabi_obj; |
| const CLD2TableSummary* distinctbi_obj; |
| |
| const CLD2TableSummary* quadgram_obj; // Primary quadgram lookup table |
| const CLD2TableSummary* quadgram_obj2; // Secondary " |
| const CLD2TableSummary* deltaocta_obj; |
| const CLD2TableSummary* distinctocta_obj; |
| |
| const short* kExpectedScore; // Expected base + delta + distinct score |
| // per 1KB input |
| // Subscripted by language and script4 |
| } ScoringTables; |
| |
| // Context for boosting several languages |
| typedef struct { |
| int32 n; |
| uint32 langprob[kMaxBoosts]; |
| int wrap(int32 n) {return n & (kMaxBoosts - 1);} |
| } LangBoosts; |
| |
| typedef struct { |
| LangBoosts latn; |
| LangBoosts othr; |
| } PerScriptLangBoosts; |
| |
| |
| |
| // ScoringContext carries state across scriptspans |
| // ScoringContext also has read-only scoring tables mapping grams to qprobs |
| typedef struct { |
| FILE* debug_file; // Non-NULL if debug output wanted |
| bool flags_cld2_score_as_quads; |
| bool flags_cld2_html; |
| bool flags_cld2_cr; |
| bool flags_cld2_verbose; |
| ULScript ulscript; // langprobs below are with respect to this script |
| Language prior_chunk_lang; // Mostly for debug output |
| // boost has a packed set of per-script langs and probabilites |
| // whack has a per-script lang to be suppressed from ever scoring (zeroed) |
| // When a language in a close set is given as an explicit hint, others in |
| // that set will be whacked. |
| PerScriptLangBoosts langprior_boost; // From http content-lang or meta lang= |
| PerScriptLangBoosts langprior_whack; // From http content-lang or meta lang= |
| PerScriptLangBoosts distinct_boost; // From distinctive letter groups |
| int oldest_distinct_boost; // Subscript in hitbuffer of oldest |
| // distinct score to use |
| const ScoringTables* scoringtables; // Probability lookup tables |
| ScriptScanner* scanner; // For ResultChunkVector backmap |
| |
| // Inits boosts |
| void init() { |
| memset(&langprior_boost, 0, sizeof(langprior_boost)); |
| memset(&langprior_whack, 0, sizeof(langprior_whack)); |
| memset(&distinct_boost, 0, sizeof(distinct_boost)); |
| }; |
| } ScoringContext; |
| |
| |
| |
| // Begin private |
| |
| // Holds one scoring-table lookup hit. We hold indirect subscript instead of |
| // langprob to allow a single hit to use a variable number of langprobs. |
| typedef struct { |
| int offset; // First byte of quad/octa etc. in scriptspan |
| int indirect; // subscript of langprobs in scoring table |
| } ScoringHit; |
| |
| typedef enum { |
| UNIHIT = 0, |
| QUADHIT = 1, |
| DELTAHIT = 2, |
| DISTINCTHIT = 3 |
| } LinearHitType; |
| |
| // Holds one scoring-table lookup hit resolved into a langprob. |
| typedef struct { |
| uint16 offset; // First byte of quad/octa etc. in scriptspan |
| uint16 type; // LinearHitType |
| uint32 langprob; // langprob from scoring table |
| } LangprobHit; |
| |
| // Holds arrays of scoring-table lookup hits for (part of) a scriptspan |
| typedef struct { |
| ULScript ulscript; // langprobs below are with respect to this script |
| int maxscoringhits; // determines size of arrays below |
| int next_base; // First unused entry in each array |
| int next_delta; // " |
| int next_distinct; // " |
| int next_linear; // " |
| int next_chunk_start; // First unused chunk_start entry |
| int lowest_offset; // First byte of text span used to fill hitbuffer |
| // Dummy entry at the end of each giving offset of first unused text byte |
| ScoringHit base[kMaxScoringHits + 1]; // Uni/quad hits |
| ScoringHit delta[kMaxScoringHits + 1]; // delta-bi/delta-octa hits |
| ScoringHit distinct[kMaxScoringHits + 1]; // distinct-word hits |
| LangprobHit linear[4 * kMaxScoringHits + 1]; // Above three merge-sorted |
| // (4: some bases => 2 linear) |
| int chunk_start[kMaxSummaries + 1]; // First linear[] subscr of |
| // each scored chunk |
| int chunk_offset[kMaxSummaries + 1]; // First text subscr of |
| // each scored chunk |
| |
| void init() { |
| ulscript = ULScript_Common; |
| maxscoringhits = kMaxScoringHits; |
| next_base = 0; |
| next_delta = 0; |
| next_distinct = 0; |
| next_linear = 0; |
| next_chunk_start = 0; |
| lowest_offset = 0; |
| base[0].offset = 0; |
| base[0].indirect = 0; |
| delta[0].offset = 0; |
| delta[0].indirect = 0; |
| distinct[0].offset = 0; |
| distinct[0].indirect = 0; |
| linear[0].offset = 0; |
| linear[0].langprob = 0; |
| chunk_start[0] = 0; |
| chunk_offset[0] = 0; |
| }; |
| } ScoringHitBuffer; |
| |
| // TODO: Explain here why we need both ChunkSpan and ChunkSummary |
| typedef struct { |
| int chunk_base; // Subscript of first hitbuffer.base[] in chunk |
| int chunk_delta; // Subscript of first hitbuffer.delta[] |
| int chunk_distinct; // Subscript of first hitbuffer.distinct[] |
| int base_len; // Number of hitbuffer.base[] in chunk |
| int delta_len; // Number of hitbuffer.delta[] in chunk |
| int distinct_len; // Number of hitbuffer.distinct[] in chunk |
| } ChunkSpan; |
| |
| |
| // Packed into 20 bytes for space |
| typedef struct { |
| uint16 offset; // Text offset within current scriptspan.text |
| uint16 chunk_start; // Scoring subscr within hitbuffer->linear[] |
| uint16 lang1; // Top lang, mapped to full Language |
| uint16 lang2; // Second lang, mapped to full Language |
| uint16 score1; // Top lang raw score |
| uint16 score2; // Second lang raw score |
| uint16 bytes; // Number of lower letters bytes in chunk |
| uint16 grams; // Number of scored base quad- uni-grams in chunk |
| uint16 ulscript; // ULScript of chunk |
| uint8 reliability_delta; // Reliability 0..100, delta top:second scores |
| uint8 reliability_score; // Reliability 0..100, top:expected score |
| } ChunkSummary; |
| |
| |
| // We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a |
| // 1000-quad hit buffer, so we can do boundary adjustment on them |
| // when adjacent entries are different languages. After that, we add them |
| // all into the document score |
| // |
| // About 50 * 20 = 1000 bytes. OK for stack alloc |
| typedef struct { |
| int n; |
| ChunkSummary chunksummary[kMaxSummaries + 1]; |
| } SummaryBuffer; |
| |
| // End private |
| |
| |
| // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating |
| // scoringcontext |
| void ScoreEntireScriptSpan(const LangSpan& scriptspan, |
| ScoringContext* scoringcontext, |
| DocTote* doc_tote, |
| ResultChunkVector* vec); |
| |
| // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext |
| void ScoreCJKScriptSpan(const LangSpan& scriptspan, |
| ScoringContext* scoringcontext, |
| DocTote* doc_tote, |
| ResultChunkVector* vec); |
| |
| // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext |
| void ScoreQuadScriptSpan(const LangSpan& scriptspan, |
| ScoringContext* scoringcontext, |
| DocTote* doc_tote, |
| ResultChunkVector* vec); |
| |
| // Score one scriptspan into doc_tote and vec, updating scoringcontext |
| void ScoreOneScriptSpan(const LangSpan& scriptspan, |
| ScoringContext* scoringcontext, |
| DocTote* doc_tote, |
| ResultChunkVector* vec); |
| |
| } // End namespace CLD2 |
| |
| #endif // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ |
| |