// internal/cld2_dynamic_data.h - external/github.com/CLD2Owners/cld2 - Git at Google

 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
 #define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_

 #include "integral_types.h"
 #include "cld2tablesummary.h"
 #include "utf8statetable.h"
 #include "scoreonescriptspan.h"

 /*
   There are two primary parts to a CLD2 dynamic data file:
     1. A header, wherein trivial data, block lengths and block offsets are kept
     2. A data block, wherein the large binary blocks are kept

   By reading the header, an application can determine the offsets and lengths of
   all the data blocks for all tables. Offsets in the header are expressed
   relative to the first byte of the file, inclusive of the header itself; thus,
   any nonzero offset whose value is less than the length of the header is
   invalid (zero is reserved to mean "null", as described below).

   Any offset whose value is zero indicates a field that is null in the
   underlying CLD2 data; a real example of this is the fast_state field of the
   UTF8PropObj, which may be null.

   The size of the header can be precalculated by calling calculateHeaderSize(),
   which will indicate the exact size of the header for a data file that contains
   a given number of CLD2TableSummary objects.

   Notes on endianness:
   The data format is only suitable for little-endian machines. For big-endian
   systems, a tedious transformation would need to be made first to reverse the
   byte order of significant portions of the binary - not just the lengths, but
   also some of the underlying table data.

   Note on 32/64 bit:
   The data format is agnostic to 32/64 bit pointers. All the offsets within the
   data blob itself are 32-bit values relative to the start of the file, and the
   file should certainly never be gigabytes in size!
   When the file is ultimately read by the loading code and mmap()'d, new
   pointers are generated at whatever size the system uses, initialized to the
   start of the mmap, and incremented by the 32-bit offset. This should be safe
   regardless of 32- or 64-bit architectures.

   --------------------------------------------------------------------
   FIELD
   --------------------------------------------------------------------
   DATA_FILE_MARKER (no null terminator)
   total file size (sanity check, uint32)
   --------------------------------------------------------------------
   UTF8PropObj: const uint32 state0
   UTF8PropObj: const uint32 state0_size
   UTF8PropObj: const uint32 total_size
   UTF8PropObj: const int max_expand (coerced to 32 bits)
   UTF8PropObj: const int entry_shift (coerced to 32 bits)
   UTF8PropObj: const int bytes_per_entry (coerced to 32 bits)
   UTF8PropObj: const uint32 losub
   UTF8PropObj: const uint32 hiadd
   offset of UTF8PropObj: const uint8* state_table
   length of UTF8PropObj: const uint8* state_table
   offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
   length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
   offset of UTF8PropObj: const uint8* remap_string
   length of UTF8PropObj: const uint8* remap_string
   offset of UTF8PropObj: const uint8* fast_state
   length of UTF8PropObj: const uint8* fast_state
   --------------------------------------------------------------------
   start of const short kAvgDeltaOctaScore[]
   length of const short kAvgDeltaOctaScore[]
   --------------------------------------------------------------------
   number of CLD2TableSummary objects encoded (n)
   [Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne
   [Table 1]: CLD2TableSummary: uint32 kCLDTableSize
   [Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask
   [Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate
   [Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   [Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   [Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd
   [Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd
   [Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
   [Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 (for the null terminator)
   .
   .
   .
   [Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne
   [Table n]: CLD2TableSummary: uint32 kCLDTableSize
   [Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask
   [Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate
   [Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   [Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   [Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd
   [Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd
   [Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
   [Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 (for the null terminator)
   --------------------------------------------------------------------


   Immediately after the header fields comes the data block. The data block has
   the following content, in this order (note that padding is applied in order to
   keep lookups word-aligned):

   UTF8PropObj: const uint8* state_table
   UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
   UTF8PropObj: const uint8* remap_string
   UTF8PropObj: const uint8* fast_state
   const short kAvgDeltaOctaScore[]
   [Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   [Table 1]: CLD2TableSummary: const uint32* kCLDTableInd
   [Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
   .
   .
   .
   [Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
   [Table n]: CLD2TableSummary: const uint32* kCLDTableInd
   [Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)


   It is STRONGLY recommended that the chunks within the data block be kept
   128-bit aligned for efficiency reasons, although the code will work without
   such alignment: the main lookup tables have randomly-accessed groups of four
   4-byte entries, and these must be 16-byte aligned to avoid the performance
   cost of multiple cache misses per group.
 */
 namespace CLD2DynamicData {

 // Magic string that begins every CLD2 dynamic data file; it is stored in the
 // file without a null terminator. Both the characters and the pointer itself
 // are const so that no translation unit including this header can
 // accidentally repoint the marker.
 static const char* const DATA_FILE_MARKER = "cld2_data_file00";
 // Number of marker bytes; must equal the length of DATA_FILE_MARKER.
 static const int DATA_FILE_MARKER_LENGTH = 16; // Keep aligned to 128 bits

 // Nicer version of memcmp that shows the offset at which bytes differ.
 //   data1, data2: the two buffers to compare.
 //   length:       number of bytes to compare -- presumably both buffers must
 //                 be at least this long; confirm against the implementation.
 // NOTE(review): whether the bool result means "equal" or "different" is not
 // visible from this declaration; check the definition before relying on it.
 bool mem_compare(const void* data1, const void* data2, const int length);

 // Enable or disable debugging; 0 to disable, 1 to enable.
 // NOTE(review): the effect of values other than 0 and 1, and exactly which
 // functions are affected by the setting, are not visible from this
 // declaration -- confirm in the implementation.
 void setDebug(int debug);

 // Header entry describing a single table. A file header carries n of these,
 // one per encoded table, with fields in the same order as the "[Table k]"
 // rows of the header layout documented at the top of this file.
 struct TableHeader {
   // Scalar table properties, stored directly in the header.
   CLD2::uint32 kCLDTableSizeOne;
   CLD2::uint32 kCLDTableSize;
   CLD2::uint32 kCLDTableKeyMask;
   CLD2::uint32 kCLDTableBuildDate;
   // Offset (from the first byte of the file) and length of each data block.
   CLD2::uint32 startOf_kCLDTable;
   CLD2::uint32 lengthOf_kCLDTable;
   CLD2::uint32 startOf_kCLDTableInd;
   CLD2::uint32 lengthOf_kCLDTableInd;
   CLD2::uint32 startOf_kRecognizedLangScripts;
   CLD2::uint32 lengthOf_kRecognizedLangScripts;
 };


 // Top-level structure for a CLD2 Data File Header.
 // Contains all the primitive fields for the header as well as an array of
 // headers for the individual tables.
 typedef struct {
   // Marker fields help recognize and verify the data file
   char sanityString[DATA_FILE_MARKER_LENGTH];
   CLD2::uint32 totalFileSizeBytes;

   // UTF8 primitives
   CLD2::uint32 utf8PropObj_state0;
   CLD2::uint32 utf8PropObj_state0_size;
   CLD2::uint32 utf8PropObj_total_size;
   CLD2::uint32 utf8PropObj_max_expand;
   CLD2::uint32 utf8PropObj_entry_shift;
   CLD2::uint32 utf8PropObj_bytes_per_entry;
   CLD2::uint32 utf8PropObj_losub;
   CLD2::uint32 utf8PropObj_hiadd;
   CLD2::uint32 startOf_utf8PropObj_state_table;
   CLD2::uint32 lengthOf_utf8PropObj_state_table;
   CLD2::uint32 startOf_utf8PropObj_remap_base;
   CLD2::uint32 lengthOf_utf8PropObj_remap_base;
   CLD2::uint32 startOf_utf8PropObj_remap_string;
   CLD2::uint32 lengthOf_utf8PropObj_remap_string;
   CLD2::uint32 startOf_utf8PropObj_fast_state;
   CLD2::uint32 lengthOf_utf8PropObj_fast_state;

   // Average delta-octa-score bits
   CLD2::uint32 startOf_kAvgDeltaOctaScore;
   CLD2::uint32 lengthOf_kAvgDeltaOctaScore;

   // Table bits
   CLD2::uint32 numTablesEncoded;
   TableHeader* tableHeaders;
 } FileHeader;

 // Extra sizing information that accompanies a set of scoring tables. Some
 // sizes needed to dump table data -- specifically the size of each indirect
 // table -- are not part of the CLD2::ScoringTables structure, so they are
 // carried here instead.
 struct Supplement {
   const CLD2::uint32 lengthOf_kAvgDeltaOctaScore;
   // Indirect table sizes as an array of 32-bit unsigned integers. The array is
   // assumed to hold exactly one entry per table header in the tableHeaders
   // field of the FileHeader, in the same order as those table headers.
   const CLD2::uint32* indirectTableSizes;
 };

 // Calculate the exact size of a header that encodes the specified number of
 // tables. This can be used to reserve space within the data file,
 // calculate offsets, and so on.
 //   numTables: how many table headers the file header will contain.
 //   Returns the header size -- presumably in bytes, matching the byte
 //   offsets used by the file format; TODO confirm in the implementation.
 CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables);

 // Dump a given header to stdout as a human-readable string.
 //   header: the header to print.
 // NOTE(review): the pointer is non-const although the description suggests
 // read-only use, and null handling is not visible from this declaration --
 // confirm in the implementation before passing a null pointer.
 void dumpHeader(FileHeader* header);

 // Verify that a given pair of scoring tables match precisely.
 // Uses the provided supplement to verify information that cannot be otherwise
 // checked from the CLD2::ScoringTables structure.
 //   realData:       the reference tables.
 //   realSupplement: the Supplement that accompanies realData.
 //   loadedData:     the tables compared against realData -- presumably ones
 //                   loaded from a dynamic data file; confirm with callers.
 // If there is a problem, returns false.
 // NOTE(review): presumably returns true when everything matches -- confirm.
 bool verify(const CLD2::ScoringTables* realData,
             const Supplement* realSupplement,
             const CLD2::ScoringTables* loadedData);

 // Return true iff the program is running in little-endian mode.
 // The data file format documented at the top of this header is only suitable
 // for little-endian machines, which is presumably why this check is exposed.
 bool isLittleEndian();

 // Return true iff the core size assumptions are ok on this platform.
 // NOTE(review): which sizes are checked is not visible from this declaration;
 // presumably the widths of the types the file format relies on -- confirm in
 // the implementation.
 bool coreAssumptionsOk();

 } // End namespace CLD2DynamicData
 #endif  // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
	// Copyright 2014 Google Inc. All Rights Reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
	#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_

	#include "integral_types.h"
	#include "cld2tablesummary.h"
	#include "utf8statetable.h"
	#include "scoreonescriptspan.h"

	/*
	There are two primary parts to a CLD2 dynamic data file:
	1. A header, wherein trivial data, block lengths and block offsets are kept
	2. A data block, wherein the large binary blocks are kept

	By reading the header, an application can determine the offsets and lengths of
	all the data blocks for all tables. Offsets in the header are expressed
	relative to the first byte of the file, inclusive of the header itself; thus,
	any nonzero offset whose value is less than the length of the header is
	invalid (zero is reserved to mean "null", as described below).

	Any offset whose value is zero indicates a field that is null in the
	underlying CLD2 data; a real example of this is the fast_state field of the
	UTF8PropObj, which may be null.

	The size of the header can be precalculated by calling calculateHeaderSize(),
	which will indicate the exact size of the header for a data file that contains
	a given number of CLD2TableSummary objects.

	Notes on endianness:
	The data format is only suitable for little-endian machines. For big-endian
	systems, a tedious transformation would need to be made first to reverse the
	byte order of significant portions of the binary - not just the lengths, but
	also some of the underlying table data.

	Note on 32/64 bit:
	The data format is agnostic to 32/64 bit pointers. All the offsets within the
	data blob itself are 32-bit values relative to the start of the file, and the
	file should certainly never be gigabytes in size!
	When the file is ultimately read by the loading code and mmap()'d, new
	pointers are generated at whatever size the system uses, initialized to the
	start of the mmap, and incremented by the 32-bit offset. This should be safe
	regardless of 32- or 64-bit architectures.

	--------------------------------------------------------------------
	FIELD
	--------------------------------------------------------------------
	DATA_FILE_MARKER (no null terminator)
	total file size (sanity check, uint32)
	--------------------------------------------------------------------
	UTF8PropObj: const uint32 state0
	UTF8PropObj: const uint32 state0_size
	UTF8PropObj: const uint32 total_size
	UTF8PropObj: const int max_expand (coerced to 32 bits)
	UTF8PropObj: const int entry_shift (coerced to 32 bits)
	UTF8PropObj: const int bytes_per_entry (coerced to 32 bits)
	UTF8PropObj: const uint32 losub
	UTF8PropObj: const uint32 hiadd
	offset of UTF8PropObj: const uint8* state_table
	length of UTF8PropObj: const uint8* state_table
	offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
	length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
	offset of UTF8PropObj: const uint8* remap_string
	length of UTF8PropObj: const uint8* remap_string
	offset of UTF8PropObj: const uint8* fast_state
	length of UTF8PropObj: const uint8* fast_state
	--------------------------------------------------------------------
	start of const short kAvgDeltaOctaScore[]
	length of const short kAvgDeltaOctaScore[]
	--------------------------------------------------------------------
	number of CLD2TableSummary objects encoded (n)
	[Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne
	[Table 1]: CLD2TableSummary: uint32 kCLDTableSize
	[Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask
	[Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate
	[Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
	[Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
	[Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd
	[Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd
	[Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
	[Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 (for the null terminator)
	.
	.
	.
	[Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne
	[Table n]: CLD2TableSummary: uint32 kCLDTableSize
	[Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask
	[Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate
	[Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
	[Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
	[Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd
	[Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd
	[Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
	[Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 (for the null terminator)
	--------------------------------------------------------------------


	Immediately after the header fields comes the data block. The data block has
	the following content, in this order (note that padding is applied in order to
	keep lookups word-aligned):

	UTF8PropObj: const uint8* state_table
	UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
	UTF8PropObj: const uint8* remap_string
	UTF8PropObj: const uint8* fast_state
	const short kAvgDeltaOctaScore[]
	[Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
	[Table 1]: CLD2TableSummary: const uint32* kCLDTableInd
	[Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
	.
	.
	.
	[Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
	[Table n]: CLD2TableSummary: const uint32* kCLDTableInd
	[Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)


	It is STRONGLY recommended that the chunks within the data block be kept
	128-bit aligned for efficiency reasons, although the code will work without
	such alignment: the main lookup tables have randomly-accessed groups of four
	4-byte entries, and these must be 16-byte aligned to avoid the performance
	cost of multiple cache misses per group.
	*/
	namespace CLD2DynamicData {

	// Magic string that begins every CLD2 dynamic data file; it is stored in the
	// file without a null terminator. Both the characters and the pointer itself
	// are const so that no translation unit including this header can
	// accidentally repoint the marker.
	static const char* const DATA_FILE_MARKER = "cld2_data_file00";
	// Number of marker bytes; must equal the length of DATA_FILE_MARKER.
	static const int DATA_FILE_MARKER_LENGTH = 16; // Keep aligned to 128 bits

	// Nicer version of memcmp that shows the offset at which bytes differ.
	//   data1, data2: the two buffers to compare.
	//   length:       number of bytes to compare -- presumably both buffers must
	//                 be at least this long; confirm against the implementation.
	// NOTE(review): whether the bool result means "equal" or "different" is not
	// visible from this declaration; check the definition before relying on it.
	bool mem_compare(const void* data1, const void* data2, const int length);

	// Enable or disable debugging; 0 to disable, 1 to enable.
	// NOTE(review): the effect of values other than 0 and 1, and exactly which
	// functions are affected by the setting, are not visible from this
	// declaration -- confirm in the implementation.
	void setDebug(int debug);

	// Header entry describing a single table. A file header carries n of these,
	// one per encoded table, with fields in the same order as the "[Table k]"
	// rows of the header layout documented in the comment preceding this
	// namespace.
	struct TableHeader {
	  // Scalar table properties, stored directly in the header.
	  CLD2::uint32 kCLDTableSizeOne;
	  CLD2::uint32 kCLDTableSize;
	  CLD2::uint32 kCLDTableKeyMask;
	  CLD2::uint32 kCLDTableBuildDate;
	  // Offset (from the first byte of the file) and length of each data block.
	  CLD2::uint32 startOf_kCLDTable;
	  CLD2::uint32 lengthOf_kCLDTable;
	  CLD2::uint32 startOf_kCLDTableInd;
	  CLD2::uint32 lengthOf_kCLDTableInd;
	  CLD2::uint32 startOf_kRecognizedLangScripts;
	  CLD2::uint32 lengthOf_kRecognizedLangScripts;
	};


	// Top-level structure for a CLD2 Data File Header.
	// Contains all the primitive fields for the header as well as an array of
	// headers for the individual tables.
	typedef struct {
	// Marker fields help recognize and verify the data file
	char sanityString[DATA_FILE_MARKER_LENGTH];
	CLD2::uint32 totalFileSizeBytes;

	// UTF8 primitives
	CLD2::uint32 utf8PropObj_state0;
	CLD2::uint32 utf8PropObj_state0_size;
	CLD2::uint32 utf8PropObj_total_size;
	CLD2::uint32 utf8PropObj_max_expand;
	CLD2::uint32 utf8PropObj_entry_shift;
	CLD2::uint32 utf8PropObj_bytes_per_entry;
	CLD2::uint32 utf8PropObj_losub;
	CLD2::uint32 utf8PropObj_hiadd;
	CLD2::uint32 startOf_utf8PropObj_state_table;
	CLD2::uint32 lengthOf_utf8PropObj_state_table;
	CLD2::uint32 startOf_utf8PropObj_remap_base;
	CLD2::uint32 lengthOf_utf8PropObj_remap_base;
	CLD2::uint32 startOf_utf8PropObj_remap_string;
	CLD2::uint32 lengthOf_utf8PropObj_remap_string;
	CLD2::uint32 startOf_utf8PropObj_fast_state;
	CLD2::uint32 lengthOf_utf8PropObj_fast_state;

	// Average delta-octa-score bits
	CLD2::uint32 startOf_kAvgDeltaOctaScore;
	CLD2::uint32 lengthOf_kAvgDeltaOctaScore;

	// Table bits
	CLD2::uint32 numTablesEncoded;
	TableHeader* tableHeaders;
	} FileHeader;

	// Extra sizing information that accompanies a set of scoring tables. Some
	// sizes needed to dump table data -- specifically the size of each indirect
	// table -- are not part of the CLD2::ScoringTables structure, so they are
	// carried here instead.
	struct Supplement {
	  const CLD2::uint32 lengthOf_kAvgDeltaOctaScore;
	  // Indirect table sizes as an array of 32-bit unsigned integers. The array is
	  // assumed to hold exactly one entry per table header in the tableHeaders
	  // field of the FileHeader, in the same order as those table headers.
	  const CLD2::uint32* indirectTableSizes;
	};

	// Calculate the exact size of a header that encodes the specified number of
	// tables. This can be used to reserve space within the data file,
	// calculate offsets, and so on.
	//   numTables: how many table headers the file header will contain.
	//   Returns the header size -- presumably in bytes, matching the byte
	//   offsets used by the file format; TODO confirm in the implementation.
	CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables);

	// Dump a given header to stdout as a human-readable string.
	//   header: the header to print.
	// NOTE(review): the pointer is non-const although the description suggests
	// read-only use, and null handling is not visible from this declaration --
	// confirm in the implementation before passing a null pointer.
	void dumpHeader(FileHeader* header);

	// Verify that a given pair of scoring tables match precisely.
	// Uses the provided supplement to verify information that cannot be otherwise
	// checked from the CLD2::ScoringTables structure.
	//   realData:       the reference tables.
	//   realSupplement: the Supplement that accompanies realData.
	//   loadedData:     the tables compared against realData -- presumably ones
	//                   loaded from a dynamic data file; confirm with callers.
	// If there is a problem, returns false.
	// NOTE(review): presumably returns true when everything matches -- confirm.
	bool verify(const CLD2::ScoringTables* realData,
	const Supplement* realSupplement,
	const CLD2::ScoringTables* loadedData);

	// Return true iff the program is running in little-endian mode.
	// The data file format documented in the comment preceding this namespace is
	// only suitable for little-endian machines, which is presumably why this
	// check is exposed.
	bool isLittleEndian();

	// Return true iff the core size assumptions are ok on this platform.
	// NOTE(review): which sizes are checked is not visible from this declaration;
	// presumably the widths of the types the file format relies on -- confirm in
	// the implementation.
	bool coreAssumptionsOk();

	} // End namespace CLD2DynamicData
	#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_