// internal/scoreonescriptspan.h - external/github.com/CLD2Owners/cld2 - Git at Google

 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 //
 // Author: dsites@google.com (Dick Sites)
 //
 //
 // Terminology:
 // Incoming original text has HTML tags and entities removed, all but letters
 // removed, and letters lowercased. Strings of non-letters are mapped to a
 // single ASCII space.
 //
 // One scriptspan has a run of letters/spaces  in a single script. This is the
 // fundamental text unit that is scored. There is an optional backmap from
 // scriptspan text to the original document text, so that the language ranges
 // reported in ResultChunkVector refer to byte ranges in the original text.
 //
 // Scripts come in two forms, the full Unicode scripts described by
 //   http://www.unicode.org/Public/UNIDATA/Scripts.txt
 // and a modified list used exclusively in CLD2. The modified form maps all
 // the CJK scripts to one, Hani. The current version description is in
 //  i18n/encodings/cld2/builddata/script_summary.txt
 // In addition, all non-letters are mapped to the Common script.
 //
 // ULScript describes this Unicode Letter script.
 //
 // Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams.
 // Nilgrams (no text lookup at all) are for script-based pseudo-languages and
 // for languages that are 1:1 with a given script. Unigrams and bigrams are
 // used to score the CJK languages, all in the Hani script. Quadgrams and
 // octagrams are used to score all other languages.
 //
 // RType is the Recognition Type per ulscript.
 //
 // The scoring tables map various grams to language-probability scores.
 // A given gram that hits in scoring table maps to an indirect subscript into
 // a list of packed languages and log probabilities.
 //
 // Languages are stored in two forms: 10-bit values in the Language enum, and
 // shorter 8-bit per-ulscript values in the scoring tables.
 //
 // Language refers to the full 10-bit range.
 // pslang refers to the per-ulscript shorter values.
 //
 // Log probabilities also come in two forms. The full range uses values 0..255
 // to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about
 // 1/47.5M. The second form quantizes these into multiples of 8 that can be
 // added together to represent probability products. The quantized form uses
 // values 24..0 with 0 now least likely instead of most likely, thus making
 // larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28)
 // and 0 maps to original 1/2**24.0 (~1/16M).
 // TODO BOGUS description, 24 vs 12
 //
 // qprob refers to quantized log probabilities.
 //
 // langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to
 // a list of three qprobs. It always needs a companion ulscript.
 //
 // A scriptspan is scored via one or more hitbuffers


 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
 #define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__

 #include <stdio.h>
 #include <string.h>                   // for memset

 #include "integral_types.h"           // for uint8 etc.

 #include "cld2tablesummary.h"
 #include "compact_lang_det_impl.h"    // for ResultChunkVector
 #include "getonescriptspan.h"
 #include "langspan.h"
 #include "tote.h"
 #include "utf8statetable.h"

 namespace CLD2 {

 // Sizing constants for the boost, hit-buffer and summary structures declared
 // in this header.

 // Number of langprob entries in each LangBoosts; LangBoosts::wrap() masks an
 // index with (kMaxBoosts - 1).
 static const int kMaxBoosts = 4;              // For each of PerScriptLangBoosts
                                               // must be power of two for wrap()
 // Chunk sizes, in hits per scored chunk.
 static const int kChunksizeQuads = 20;        // For non-CJK
 static const int kChunksizeUnis = 50;         // For CJK
 // Capacity of the base[], delta[] and distinct[] arrays in ScoringHitBuffer,
 // not counting their one extra trailing dummy entry.
 static const int kMaxScoringHits = 1000;
 // Number of quad-sized chunks in a full hit buffer: 1000 / 20 = 50.
 static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads;


 // Set of lookup tables referenced (read-only) from ScoringContext.
 // The first four tables are for CJK languages,
 // the next four for quadgram languages, and
 // the last for expected scores.
 typedef struct {
   const UTF8PropObj* unigram_obj;               // 80K CJK characters
   const CLD2TableSummary* unigram_compat_obj;   // 256 CJK lookup probabilities
   const CLD2TableSummary* deltabi_obj;
   const CLD2TableSummary* distinctbi_obj;

   const CLD2TableSummary* quadgram_obj;         // Primary quadgram lookup table
   const CLD2TableSummary* quadgram_obj2;        // Secondary  "
   const CLD2TableSummary* deltaocta_obj;
   const CLD2TableSummary* distinctocta_obj;

   const short* kExpectedScore;      // Expected base + delta + distinct score
                                     // per 1KB input
                                     // Subscripted by language and script4
 } ScoringTables;

 // Context for boosting several languages.
 //
 // Holds up to kMaxBoosts packed langprob values plus a counter/index n.
 // NOTE(review): n presumably tracks the next langprob[] slot to use, with
 // wrap() keeping it in range -- confirm against the users in the .cc file.
 //
 // The struct carries a tag name (not just a typedef name) because it has a
 // member function: C++20 forbids unnamed classes with member functions from
 // being named for linkage purposes via typedef, and clang diagnoses it with
 // -Wnon-c-typedef-for-linkage.
 typedef struct LangBoosts {
    int32 n;
    uint32 langprob[kMaxBoosts];

    // Reduces n modulo kMaxBoosts by masking; this is only correct while
    // kMaxBoosts is a power of two. The parameter shadows the member n; only
    // the argument is used.
    int wrap(int32 n) {return n & (kMaxBoosts - 1);}
 } LangBoosts;

 // A pair of LangBoosts.
 // NOTE(review): going by the member names, latn is for the Latin script and
 // othr for every other script -- confirm against the code that selects one.
 typedef struct {
    LangBoosts latn;
    LangBoosts othr;
 } PerScriptLangBoosts;


 // ScoringContext carries state across scriptspans.
 // ScoringContext also has read-only scoring tables mapping grams to qprobs.
 //
 // The struct carries a tag name (not just a typedef name) because it has a
 // member function: C++20 forbids unnamed classes with member functions from
 // being named for linkage purposes via typedef, and clang diagnoses it with
 // -Wnon-c-typedef-for-linkage.
 typedef struct ScoringContext {
   FILE* debug_file;                   // Non-NULL if debug output wanted
   // Option flags; their meaning is defined by the code that sets and reads
   // them, not by this header.
   bool flags_cld2_score_as_quads;
   bool flags_cld2_html;
   bool flags_cld2_cr;
   bool flags_cld2_verbose;
   ULScript ulscript;        // langprobs below are with respect to this script
   Language prior_chunk_lang;          // Mostly for debug output
   // boost has a packed set of per-script langs and probabilities
   // whack has a per-script lang to be suppressed from ever scoring (zeroed)
   // When a language in a close set is given as an explicit hint, others in
   //  that set will be whacked.
   PerScriptLangBoosts langprior_boost;  // From http content-lang or meta lang=
   PerScriptLangBoosts langprior_whack;  // From http content-lang or meta lang=
   PerScriptLangBoosts distinct_boost;   // From distinctive letter groups
   int oldest_distinct_boost;          // Subscript in hitbuffer of oldest
                                       // distinct score to use
   const ScoringTables* scoringtables; // Probability lookup tables
   ScriptScanner* scanner;             // For ResultChunkVector backmap

   // Inits boosts: zero-fills the three PerScriptLangBoosts members.
   // Every other field (debug_file, flags, ulscript, prior_chunk_lang,
   // oldest_distinct_boost, scoringtables, scanner) is left untouched and
   // must be set by the caller.
   void init() {
     memset(&langprior_boost, 0, sizeof(langprior_boost));
     memset(&langprior_whack, 0, sizeof(langprior_whack));
     memset(&distinct_boost, 0, sizeof(distinct_boost));
   }
 } ScoringContext;


 // Begin private

 // Holds one scoring-table lookup hit. We hold indirect subscript instead of
 // langprob to allow a single hit to use a variable number of langprobs.
 // This is the element type of the base[], delta[] and distinct[] arrays in
 // ScoringHitBuffer.
 typedef struct {
   int offset;         // First byte of quad/octa etc. in scriptspan
   int indirect;       // subscript of langprobs in scoring table
 } ScoringHit;

 // Kind of hit recorded in LangprobHit::type. The names match the hit
 // categories noted on the ScoringHitBuffer arrays (uni/quad base hits,
 // delta hits, distinct hits).
 typedef enum {
   UNIHIT                       = 0,
   QUADHIT                      = 1,
   DELTAHIT                     = 2,
   DISTINCTHIT                  = 3
 } LinearHitType;

 // Holds one scoring-table lookup hit resolved into a langprob.
 // This is the element type of ScoringHitBuffer::linear[].
 // Note that offset is only 16 bits wide here, whereas ScoringHit::offset is
 // an int; offsets above 65535 cannot be represented in this form.
 typedef struct {
   uint16 offset;      // First byte of quad/octa etc. in scriptspan
   uint16 type;        // LinearHitType
   uint32 langprob;    // langprob from scoring table
 } LangprobHit;

 // Holds arrays of scoring-table lookup hits for (part of) a scriptspan
 typedef struct {
   ULScript ulscript;        // langprobs below are with respect to this script
   int maxscoringhits;       // determines size of arrays below
   int next_base;            // First unused entry in each array
   int next_delta;           //   "
   int next_distinct;        //   "
   int next_linear;          //   "
   int next_chunk_start;     // First unused chunk_start entry
   int lowest_offset;        // First byte of text span used to fill hitbuffer
   // Dummy entry at the end of each giving offset of first unused text byte
   ScoringHit base[kMaxScoringHits + 1];         // Uni/quad hits
   ScoringHit delta[kMaxScoringHits + 1];        // delta-bi/delta-octa hits
   ScoringHit distinct[kMaxScoringHits + 1];     // distinct-word hits
   LangprobHit linear[4 * kMaxScoringHits + 1];  // Above three merge-sorted
                                                 // (4: some bases => 2 linear)
   int chunk_start[kMaxSummaries + 1];           // First linear[] subscr of
                                                 //  each scored chunk
   int chunk_offset[kMaxSummaries + 1];          // First text subscr of
                                                 //  each scored chunk

   void init() {
     ulscript = ULScript_Common;
     maxscoringhits = kMaxScoringHits;
     next_base = 0;
     next_delta = 0;
     next_distinct = 0;
     next_linear = 0;
     next_chunk_start = 0;
     lowest_offset = 0;
     base[0].offset = 0;
     base[0].indirect = 0;
     delta[0].offset = 0;
     delta[0].indirect = 0;
     distinct[0].offset = 0;
     distinct[0].indirect = 0;
     linear[0].offset = 0;
     linear[0].langprob = 0;
     chunk_start[0] = 0;
     chunk_offset[0] = 0;
   };
 } ScoringHitBuffer;

 // Describes one chunk as a start subscript and a length within each of the
 // three hit arrays (base, delta, distinct) of a ScoringHitBuffer.
 // TODO: Explain here why we need both ChunkSpan and ChunkSummary
 typedef struct {
   int chunk_base;       // Subscript of first hitbuffer.base[] in chunk
   int chunk_delta;      // Subscript of first hitbuffer.delta[]
   int chunk_distinct;   // Subscript of first hitbuffer.distinct[]
   int base_len;         // Number of hitbuffer.base[] in chunk
   int delta_len;        // Number of hitbuffer.delta[] in chunk
   int distinct_len;     // Number of hitbuffer.distinct[] in chunk
 } ChunkSpan;


 // Scoring result for one chunk: its position, top two languages and scores,
 // size, script and two reliability figures.
 // Packed into 20 bytes for space (nine uint16 fields plus two uint8 fields);
 // do not reorder or widen members without revisiting that size.
 typedef struct {
   uint16 offset;              // Text offset within current scriptspan.text
   uint16 chunk_start;         // Scoring subscr within hitbuffer->linear[]
   uint16 lang1;               // Top lang, mapped to full Language
   uint16 lang2;               // Second lang, mapped to full Language
   uint16 score1;              // Top lang raw score
   uint16 score2;              // Second lang raw score
   uint16 bytes;               // Number of lowercased letter bytes in chunk
   uint16 grams;               // Number of scored base quad- uni-grams in chunk
   uint16 ulscript;            // ULScript of chunk
   uint8 reliability_delta;    // Reliability 0..100, delta top:second scores
   uint8 reliability_score;    // Reliability 0..100, top:expected score
 } ChunkSummary;


 // We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a
 // 1000-quad hit buffer, so we can do boundary adjustment on them
 // when adjacent entries are different languages. After that, we add them
 // all into the document score
 //
 // About 50 * 20 = 1000 bytes. OK for stack alloc
 // (exactly: kMaxSummaries + 1 = 51 entries of 20-byte ChunkSummary, plus n).
 typedef struct {
   int n;                // NOTE(review): presumably the number of
                         // chunksummary[] entries in use -- confirm in the .cc
   ChunkSummary chunksummary[kMaxSummaries + 1];
 } SummaryBuffer;

 // End private


 // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
 // scoringcontext
 //   scriptspan:     text span to score (not modified)
 //   scoringcontext: state carried across scriptspans; updated by the call
 //   doc_tote:       document totals that the scores are added into
 //   vec:            result chunks that the scored ranges are added into
 void ScoreEntireScriptSpan(const LangSpan& scriptspan,
                            ScoringContext* scoringcontext,
                            DocTote* doc_tote,
                            ResultChunkVector* vec);

 // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
 //   scriptspan:     text span to score (not modified)
 //   scoringcontext: state carried across scriptspans; updated by the call
 //   doc_tote:       document totals that the scores are added into
 //   vec:            result chunks that the scored ranges are added into
 void ScoreCJKScriptSpan(const LangSpan& scriptspan,
                         ScoringContext* scoringcontext,
                         DocTote* doc_tote,
                         ResultChunkVector* vec);

 // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
 //   scriptspan:     text span to score (not modified)
 //   scoringcontext: state carried across scriptspans; updated by the call
 //   doc_tote:       document totals that the scores are added into
 //   vec:            result chunks that the scored ranges are added into
 void ScoreQuadScriptSpan(const LangSpan& scriptspan,
                          ScoringContext* scoringcontext,
                          DocTote* doc_tote,
                          ResultChunkVector* vec);

 // Score one scriptspan into doc_tote and vec, updating scoringcontext
 // NOTE(review): presumably the general entry point that picks among the
 // RType-specific scorers declared alongside it -- confirm in the .cc file.
 //   scriptspan:     text span to score (not modified)
 //   scoringcontext: state carried across scriptspans; updated by the call
 //   doc_tote:       document totals that the scores are added into
 //   vec:            result chunks that the scored ranges are added into
 void ScoreOneScriptSpan(const LangSpan& scriptspan,
                         ScoringContext* scoringcontext,
                         DocTote* doc_tote,
                         ResultChunkVector* vec);

 }       // End namespace CLD2

 #endif  // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
	// Copyright 2013 Google Inc. All Rights Reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	//
	// Author: dsites@google.com (Dick Sites)
	//
	//
	// Terminology:
	// Incoming original text has HTML tags and entities removed, all but letters
	// removed, and letters lowercased. Strings of non-letters are mapped to a
	// single ASCII space.
	//
	// One scriptspan has a run of letters/spaces in a single script. This is the
	// fundamental text unit that is scored. There is an optional backmap from
	// scriptspan text to the original document text, so that the language ranges
	// reported in ResultChunkVector refer to byte ranges in the original text.
	//
	// Scripts come in two forms, the full Unicode scripts described by
	// http://www.unicode.org/Public/UNIDATA/Scripts.txt
	// and a modified list used exclusively in CLD2. The modified form maps all
	// the CJK scripts to one, Hani. The current version description is in
	// i18n/encodings/cld2/builddata/script_summary.txt
	// In addition, all non-letters are mapped to the Common script.
	//
	// ULScript describes this Unicode Letter script.
	//
	// Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams.
	// Nilgrams (no text lookup at all) are for script-based pseudo-languages and
	// for languages that are 1:1 with a given script. Unigrams and bigrams are
	// used to score the CJK languages, all in the Hani script. Quadgrams and
	// octagrams are used to score all other languages.
	//
	// RType is the Recognition Type per ulscript.
	//
	// The scoring tables map various grams to language-probability scores.
	// A given gram that hits in scoring table maps to an indirect subscript into
	// a list of packed languages and log probabilities.
	//
	// Languages are stored in two forms: 10-bit values in the Language enum, and
	// shorter 8-bit per-ulscript values in the scoring tables.
	//
	// Language refers to the full 10-bit range.
	// pslang refers to the per-ulscript shorter values.
	//
	// Log probabilities also come in two forms. The full range uses values 0..255
	// to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about
	// 1/47.5M. The second form quantizes these into multiples of 8 that can be
	// added together to represent probability products. The quantized form uses
	// values 24..0 with 0 now least likely instead of most likely, thus making
	// larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28)
	// and 0 maps to original 1/2**24.0 (~1/16M).
	// TODO BOGUS description, 24 vs 12
	//
	// qprob refers to quantized log probabilities.
	//
	// langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to
	// a list of three qprobs. It always needs a companion ulscript.
	//
	// A scriptspan is scored via one or more hitbuffers


	#ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
	#define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__

	#include <stdio.h>

	#include "integral_types.h" // for uint8 etc.

	#include "cld2tablesummary.h"
	#include "compact_lang_det_impl.h" // for ResultChunkVector
	#include "getonescriptspan.h"
	#include "langspan.h"
	#include "tote.h"
	#include "utf8statetable.h"

	namespace CLD2 {

	// Sizing constants for the boost, hit-buffer and summary structures declared
	// in this header.

	// Number of langprob entries in each LangBoosts; LangBoosts::wrap() masks an
	// index with (kMaxBoosts - 1).
	static const int kMaxBoosts = 4; // For each of PerScriptLangBoosts
	// must be power of two for wrap()
	// Chunk sizes, in hits per scored chunk.
	static const int kChunksizeQuads = 20; // For non-CJK
	static const int kChunksizeUnis = 50; // For CJK
	// Capacity of the base[], delta[] and distinct[] arrays in ScoringHitBuffer,
	// not counting their one extra trailing dummy entry.
	static const int kMaxScoringHits = 1000;
	// Number of quad-sized chunks in a full hit buffer: 1000 / 20 = 50.
	static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads;


	// Set of lookup tables referenced (read-only) from ScoringContext.
	// The first four tables are for CJK languages,
	// the next four for quadgram languages, and
	// the last for expected scores.
	typedef struct {
	const UTF8PropObj* unigram_obj; // 80K CJK characters
	const CLD2TableSummary* unigram_compat_obj; // 256 CJK lookup probabilities
	const CLD2TableSummary* deltabi_obj;
	const CLD2TableSummary* distinctbi_obj;

	const CLD2TableSummary* quadgram_obj; // Primary quadgram lookup table
	const CLD2TableSummary* quadgram_obj2; // Secondary "
	const CLD2TableSummary* deltaocta_obj;
	const CLD2TableSummary* distinctocta_obj;

	const short* kExpectedScore; // Expected base + delta + distinct score
	// per 1KB input
	// Subscripted by language and script4
	} ScoringTables;

	// Context for boosting several languages.
	//
	// Holds up to kMaxBoosts packed langprob values plus a counter/index n.
	// NOTE(review): n presumably tracks the next langprob[] slot to use, with
	// wrap() keeping it in range -- confirm against the users in the .cc file.
	//
	// The struct carries a tag name (not just a typedef name) because it has a
	// member function: C++20 forbids unnamed classes with member functions from
	// being named for linkage purposes via typedef, and clang diagnoses it with
	// -Wnon-c-typedef-for-linkage.
	typedef struct LangBoosts {
	  int32 n;
	  uint32 langprob[kMaxBoosts];

	  // Reduces n modulo kMaxBoosts by masking; this is only correct while
	  // kMaxBoosts is a power of two. The parameter shadows the member n; only
	  // the argument is used.
	  int wrap(int32 n) {return n & (kMaxBoosts - 1);}
	} LangBoosts;

	// A pair of LangBoosts.
	// NOTE(review): going by the member names, latn is for the Latin script and
	// othr for every other script -- confirm against the code that selects one.
	typedef struct {
	LangBoosts latn;
	LangBoosts othr;
	} PerScriptLangBoosts;



	// ScoringContext carries state across scriptspans.
	// ScoringContext also has read-only scoring tables mapping grams to qprobs.
	//
	// The struct carries a tag name (not just a typedef name) because it has a
	// member function: C++20 forbids unnamed classes with member functions from
	// being named for linkage purposes via typedef, and clang diagnoses it with
	// -Wnon-c-typedef-for-linkage.
	typedef struct ScoringContext {
	  FILE* debug_file; // Non-NULL if debug output wanted
	  // Option flags; their meaning is defined by the code that sets and reads
	  // them, not by this header.
	  bool flags_cld2_score_as_quads;
	  bool flags_cld2_html;
	  bool flags_cld2_cr;
	  bool flags_cld2_verbose;
	  ULScript ulscript; // langprobs below are with respect to this script
	  Language prior_chunk_lang; // Mostly for debug output
	  // boost has a packed set of per-script langs and probabilities
	  // whack has a per-script lang to be suppressed from ever scoring (zeroed)
	  // When a language in a close set is given as an explicit hint, others in
	  // that set will be whacked.
	  PerScriptLangBoosts langprior_boost; // From http content-lang or meta lang=
	  PerScriptLangBoosts langprior_whack; // From http content-lang or meta lang=
	  PerScriptLangBoosts distinct_boost; // From distinctive letter groups
	  int oldest_distinct_boost; // Subscript in hitbuffer of oldest
	  // distinct score to use
	  const ScoringTables* scoringtables; // Probability lookup tables
	  ScriptScanner* scanner; // For ResultChunkVector backmap

	  // Inits boosts: zero-fills the three PerScriptLangBoosts members.
	  // Every other field (debug_file, flags, ulscript, prior_chunk_lang,
	  // oldest_distinct_boost, scoringtables, scanner) is left untouched and
	  // must be set by the caller.
	  void init() {
	    memset(&langprior_boost, 0, sizeof(langprior_boost));
	    memset(&langprior_whack, 0, sizeof(langprior_whack));
	    memset(&distinct_boost, 0, sizeof(distinct_boost));
	  }
	} ScoringContext;



	// Begin private

	// Holds one scoring-table lookup hit. We hold indirect subscript instead of
	// langprob to allow a single hit to use a variable number of langprobs.
	// This is the element type of the base[], delta[] and distinct[] arrays in
	// ScoringHitBuffer.
	typedef struct {
	int offset; // First byte of quad/octa etc. in scriptspan
	int indirect; // subscript of langprobs in scoring table
	} ScoringHit;

	// Kind of hit recorded in LangprobHit::type. The names match the hit
	// categories noted on the ScoringHitBuffer arrays (uni/quad base hits,
	// delta hits, distinct hits).
	typedef enum {
	UNIHIT = 0,
	QUADHIT = 1,
	DELTAHIT = 2,
	DISTINCTHIT = 3
	} LinearHitType;

	// Holds one scoring-table lookup hit resolved into a langprob.
	// This is the element type of ScoringHitBuffer::linear[].
	// Note that offset is only 16 bits wide here, whereas ScoringHit::offset is
	// an int; offsets above 65535 cannot be represented in this form.
	typedef struct {
	uint16 offset; // First byte of quad/octa etc. in scriptspan
	uint16 type; // LinearHitType
	uint32 langprob; // langprob from scoring table
	} LangprobHit;

	// Holds arrays of scoring-table lookup hits for (part of) a scriptspan
	typedef struct {
	ULScript ulscript; // langprobs below are with respect to this script
	int maxscoringhits; // determines size of arrays below
	int next_base; // First unused entry in each array
	int next_delta; // "
	int next_distinct; // "
	int next_linear; // "
	int next_chunk_start; // First unused chunk_start entry
	int lowest_offset; // First byte of text span used to fill hitbuffer
	// Dummy entry at the end of each giving offset of first unused text byte
	ScoringHit base[kMaxScoringHits + 1]; // Uni/quad hits
	ScoringHit delta[kMaxScoringHits + 1]; // delta-bi/delta-octa hits
	ScoringHit distinct[kMaxScoringHits + 1]; // distinct-word hits
	LangprobHit linear[4 * kMaxScoringHits + 1]; // Above three merge-sorted
	// (4: some bases => 2 linear)
	int chunk_start[kMaxSummaries + 1]; // First linear[] subscr of
	// each scored chunk
	int chunk_offset[kMaxSummaries + 1]; // First text subscr of
	// each scored chunk

	void init() {
	ulscript = ULScript_Common;
	maxscoringhits = kMaxScoringHits;
	next_base = 0;
	next_delta = 0;
	next_distinct = 0;
	next_linear = 0;
	next_chunk_start = 0;
	lowest_offset = 0;
	base[0].offset = 0;
	base[0].indirect = 0;
	delta[0].offset = 0;
	delta[0].indirect = 0;
	distinct[0].offset = 0;
	distinct[0].indirect = 0;
	linear[0].offset = 0;
	linear[0].langprob = 0;
	chunk_start[0] = 0;
	chunk_offset[0] = 0;
	};
	} ScoringHitBuffer;

	// Describes one chunk as a start subscript and a length within each of the
	// three hit arrays (base, delta, distinct) of a ScoringHitBuffer.
	// TODO: Explain here why we need both ChunkSpan and ChunkSummary
	typedef struct {
	int chunk_base; // Subscript of first hitbuffer.base[] in chunk
	int chunk_delta; // Subscript of first hitbuffer.delta[]
	int chunk_distinct; // Subscript of first hitbuffer.distinct[]
	int base_len; // Number of hitbuffer.base[] in chunk
	int delta_len; // Number of hitbuffer.delta[] in chunk
	int distinct_len; // Number of hitbuffer.distinct[] in chunk
	} ChunkSpan;


	// Scoring result for one chunk: its position, top two languages and scores,
	// size, script and two reliability figures.
	// Packed into 20 bytes for space (nine uint16 fields plus two uint8 fields);
	// do not reorder or widen members without revisiting that size.
	typedef struct {
	uint16 offset; // Text offset within current scriptspan.text
	uint16 chunk_start; // Scoring subscr within hitbuffer->linear[]
	uint16 lang1; // Top lang, mapped to full Language
	uint16 lang2; // Second lang, mapped to full Language
	uint16 score1; // Top lang raw score
	uint16 score2; // Second lang raw score
	uint16 bytes; // Number of lowercased letter bytes in chunk
	uint16 grams; // Number of scored base quad- uni-grams in chunk
	uint16 ulscript; // ULScript of chunk
	uint8 reliability_delta; // Reliability 0..100, delta top:second scores
	uint8 reliability_score; // Reliability 0..100, top:expected score
	} ChunkSummary;


	// We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a
	// 1000-quad hit buffer, so we can do boundary adjustment on them
	// when adjacent entries are different languages. After that, we add them
	// all into the document score
	//
	// About 50 * 20 = 1000 bytes. OK for stack alloc
	// (exactly: kMaxSummaries + 1 = 51 entries of 20-byte ChunkSummary, plus n).
	typedef struct {
	// NOTE(review): n is presumably the number of chunksummary[] entries in
	// use -- confirm in the .cc file.
	int n;
	ChunkSummary chunksummary[kMaxSummaries + 1];
	} SummaryBuffer;

	// End private


	// Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
	// scoringcontext
	//   scriptspan:     text span to score (not modified)
	//   scoringcontext: state carried across scriptspans; updated by the call
	//   doc_tote:       document totals that the scores are added into
	//   vec:            result chunks that the scored ranges are added into
	void ScoreEntireScriptSpan(const LangSpan& scriptspan,
	ScoringContext* scoringcontext,
	DocTote* doc_tote,
	ResultChunkVector* vec);

	// Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
	//   scriptspan:     text span to score (not modified)
	//   scoringcontext: state carried across scriptspans; updated by the call
	//   doc_tote:       document totals that the scores are added into
	//   vec:            result chunks that the scored ranges are added into
	void ScoreCJKScriptSpan(const LangSpan& scriptspan,
	ScoringContext* scoringcontext,
	DocTote* doc_tote,
	ResultChunkVector* vec);

	// Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
	//   scriptspan:     text span to score (not modified)
	//   scoringcontext: state carried across scriptspans; updated by the call
	//   doc_tote:       document totals that the scores are added into
	//   vec:            result chunks that the scored ranges are added into
	void ScoreQuadScriptSpan(const LangSpan& scriptspan,
	ScoringContext* scoringcontext,
	DocTote* doc_tote,
	ResultChunkVector* vec);

	// Score one scriptspan into doc_tote and vec, updating scoringcontext
	// NOTE(review): presumably the general entry point that picks among the
	// RType-specific scorers declared alongside it -- confirm in the .cc file.
	//   scriptspan:     text span to score (not modified)
	//   scoringcontext: state carried across scriptspans; updated by the call
	//   doc_tote:       document totals that the scores are added into
	//   vec:            result chunks that the scored ranges are added into
	void ScoreOneScriptSpan(const LangSpan& scriptspan,
	ScoringContext* scoringcontext,
	DocTote* doc_tote,
	ResultChunkVector* vec);

	} // End namespace CLD2

	#endif // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__