| // Copyright 2014 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "cld2_dynamic_data.h" |
| #include "integral_types.h" |
| #include <assert.h> |
| #include <stdint.h> |
| |
| namespace CLD2DynamicData { |
// File-local verbosity flag. Any non-zero value turns on the progress
// messages that verify() writes to stdout.
static int DEBUG = 0;

// Replaces the current debug level with the caller-supplied value.
void setDebug(int debug) {
  DEBUG = debug;
}
| |
// Byte-wise comparison of two buffers over the first `length` bytes.
//
// Returns true when every byte matches (including when `length` is zero or
// negative, in which case nothing is inspected). On the first mismatch a
// short report is written to stderr - the offending index plus up to five
// bytes of context on either side - and false is returned.
bool mem_compare(const void* data1, const void* data2, const int length) {
  const unsigned char* lhs = static_cast<const unsigned char*>(data1);
  const unsigned char* rhs = static_cast<const unsigned char*>(data2);

  // Advance to the first index at which the buffers disagree.
  int firstDiff = 0;
  while (firstDiff < length && lhs[firstDiff] == rhs[firstDiff]) {
    ++firstDiff;
  }
  if (firstDiff >= length) {
    return true;  // Ran off the end without finding a difference.
  }

  fprintf(stderr, "mem difference at data[%d]: decimal %d != decimal %d\n",
          firstDiff,
          static_cast<unsigned int>(lhs[firstDiff]),
          static_cast<unsigned int>(rhs[firstDiff]));

  // Clamp the start of the context window at zero so we never index before
  // the buffers (https://code.google.com/p/cld2/issues/detail?id=24).
  int pos = firstDiff - 5;
  if (pos < 0) {
    pos = 0;
  }
  for (; pos < length && pos <= firstDiff + 5; ++pos) {
    const char* marker =
        (pos == firstDiff) ? " [FIRST ERROR DETECTED HERE] " : "";
    fprintf(stderr, "[%d]: %d <-> %d%s\n",
            pos,
            static_cast<unsigned int>(lhs[pos]),
            static_cast<unsigned int>(rhs[pos]),
            marker);
  }
  return false;
}
| |
| CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables) { |
| return DATA_FILE_MARKER_LENGTH // NB: no null terminator |
| + (20 * sizeof(CLD2::uint32)) // 20 uint32 fields in the struct |
| + (numTables * (10 * sizeof(CLD2::uint32))); // 10 uint32 per table |
| } |
| |
| void dumpHeader(FileHeader* header) { |
| char safeString[DATA_FILE_MARKER_LENGTH + 1]; |
| memcpy(safeString, header->sanityString, DATA_FILE_MARKER_LENGTH); |
| safeString[DATA_FILE_MARKER_LENGTH] = 0; |
| fprintf(stdout, "sanityString: %s\n", safeString); |
| fprintf(stdout, "totalFileSizeBytes: %d\n", header->totalFileSizeBytes); |
| fprintf(stdout, "utf8PropObj_state0: %d\n", header->utf8PropObj_state0); |
| fprintf(stdout, "utf8PropObj_state0_size: %d\n", header->utf8PropObj_state0_size); |
| fprintf(stdout, "utf8PropObj_total_size: %d\n", header->utf8PropObj_total_size); |
| fprintf(stdout, "utf8PropObj_max_expand: %d\n", header->utf8PropObj_max_expand); |
| fprintf(stdout, "utf8PropObj_entry_shift: %d\n", header->utf8PropObj_entry_shift); |
| fprintf(stdout, "utf8PropObj_bytes_per_entry: %d\n", header->utf8PropObj_bytes_per_entry); |
| fprintf(stdout, "utf8PropObj_losub: %d\n", header->utf8PropObj_losub); |
| fprintf(stdout, "utf8PropObj_hiadd: %d\n", header->utf8PropObj_hiadd); |
| fprintf(stdout, "startOf_utf8PropObj_state_table: %d\n", header->startOf_utf8PropObj_state_table); |
| fprintf(stdout, "lengthOf_utf8PropObj_state_table: %d\n", header->lengthOf_utf8PropObj_state_table); |
| fprintf(stdout, "startOf_utf8PropObj_remap_base: %d\n", header->startOf_utf8PropObj_remap_base); |
| fprintf(stdout, "lengthOf_utf8PropObj_remap_base: %d\n", header->lengthOf_utf8PropObj_remap_base); |
| fprintf(stdout, "startOf_utf8PropObj_remap_string: %d\n", header->startOf_utf8PropObj_remap_string); |
| fprintf(stdout, "lengthOf_utf8PropObj_remap_string: %d\n", header->lengthOf_utf8PropObj_remap_string); |
| fprintf(stdout, "startOf_utf8PropObj_fast_state: %d\n", header->startOf_utf8PropObj_fast_state); |
| fprintf(stdout, "lengthOf_utf8PropObj_fast_state: %d\n", header->lengthOf_utf8PropObj_fast_state); |
| fprintf(stdout, "startOf_kAvgDeltaOctaScore: %d\n", header->startOf_kAvgDeltaOctaScore); |
| fprintf(stdout, "lengthOf_kAvgDeltaOctaScore: %d\n", header->lengthOf_kAvgDeltaOctaScore); |
| fprintf(stdout, "numTablesEncoded: %d\n", header->numTablesEncoded); |
| |
| const char* tableNames[7]; |
| tableNames[0]="unigram_compat_obj"; |
| tableNames[1]="deltabi_obj"; |
| tableNames[2]="distinctbi_obj"; |
| tableNames[3]="quadgram_obj"; |
| tableNames[4]="quadgram_obj2"; |
| tableNames[5]="deltaocta_obj"; |
| tableNames[6]="distinctocta_obj"; |
| |
| for (int x=0; x < (int) header->numTablesEncoded; x++) { |
| TableHeader& tHeader = header->tableHeaders[x]; |
| |
| fprintf(stdout, "Table %d: (%s)\n", (x+1), tableNames[x]);; |
| fprintf(stdout, " kCLDTableSizeOne: %d\n", tHeader.kCLDTableSizeOne); |
| fprintf(stdout, " kCLDTableSize: %d\n", tHeader.kCLDTableSize); |
| fprintf(stdout, " kCLDTableKeyMask: %d\n", tHeader.kCLDTableKeyMask); |
| fprintf(stdout, " kCLDTableBuildDate: %d\n", tHeader.kCLDTableBuildDate); |
| fprintf(stdout, " startOf_kCLDTable: %d\n", tHeader.startOf_kCLDTable); |
| fprintf(stdout, " lengthOf_kCLDTable: %d\n", tHeader.lengthOf_kCLDTable); |
| fprintf(stdout, " startOf_kCLDTableInd: %d\n", tHeader.startOf_kCLDTableInd); |
| fprintf(stdout, " lengthOf_kCLDTableInd: %d\n", tHeader.lengthOf_kCLDTableInd); |
| fprintf(stdout, " startOf_kRecognizedLangScripts: %d\n", tHeader.startOf_kRecognizedLangScripts); |
| fprintf(stdout, " lengthOf_kRecognizedLangScripts: %d\n", tHeader.lengthOf_kRecognizedLangScripts); |
| } |
| } |
| |
// Compares loadedData->name against realData->name. On a mismatch, prints
// both values (formatted with %d) to stderr and makes the enclosing function
// return false.
//
// Requires pointers named `loadedData` and `realData` to be in scope, and must
// be used inside a function returning bool. The do { ... } while (0) wrapper
// makes the expansion a single statement, so `CHECK_EQUALS(x);` is safe as the
// body of an unbraced if/else.
#define CHECK_EQUALS(name) do {\
  if (loadedData->name != realData->name) {\
    fprintf(stderr, "%s: %d != %d\n", #name, loadedData->name, realData->name);\
    return false;\
  }\
} while (0)
| |
// Compares the first `size` bytes at loadedData->name and realData->name with
// mem_compare(). On a mismatch, prints the field name to stderr and makes the
// enclosing function return false.
//
// Requires pointers named `loadedData` and `realData` to be in scope, and must
// be used inside a function returning bool. The do { ... } while (0) wrapper
// makes the expansion a single statement, so `CHECK_MEM_EQUALS(x, n);` is safe
// as the body of an unbraced if/else.
#define CHECK_MEM_EQUALS(name,size) do {\
  if (!mem_compare(loadedData->name, realData->name, (size))) {\
    fprintf(stderr, "%s: data mismatch.\n", #name);\
    return false;\
  }\
} while (0)
| |
| bool verify(const CLD2::ScoringTables* realData, |
| const Supplement* realSupplement, |
| const CLD2::ScoringTables* loadedData) { |
| const int NUM_TABLES = 7; |
| const CLD2::CLD2TableSummary* realTableSummaries[NUM_TABLES]; |
| realTableSummaries[0] = realData->unigram_compat_obj; |
| realTableSummaries[1] = realData->deltabi_obj; |
| realTableSummaries[2] = realData->distinctbi_obj; |
| realTableSummaries[3] = realData->quadgram_obj; |
| realTableSummaries[4] = realData->quadgram_obj2; |
| realTableSummaries[5] = realData->deltaocta_obj; |
| realTableSummaries[6] = realData->distinctocta_obj; |
| |
| const CLD2::CLD2TableSummary* loadedTableSummaries[NUM_TABLES]; |
| loadedTableSummaries[0] = loadedData->unigram_compat_obj; |
| loadedTableSummaries[1] = loadedData->deltabi_obj; |
| loadedTableSummaries[2] = loadedData->distinctbi_obj; |
| loadedTableSummaries[3] = loadedData->quadgram_obj; |
| loadedTableSummaries[4] = loadedData->quadgram_obj2; |
| loadedTableSummaries[5] = loadedData->deltaocta_obj; |
| loadedTableSummaries[6] = loadedData->distinctocta_obj; |
| |
| CHECK_EQUALS(unigram_obj->state0); |
| CHECK_EQUALS(unigram_obj->state0_size); |
| CHECK_EQUALS(unigram_obj->total_size); |
| CHECK_EQUALS(unigram_obj->max_expand); |
| CHECK_EQUALS(unigram_obj->entry_shift); |
| CHECK_EQUALS(unigram_obj->bytes_per_entry); |
| CHECK_EQUALS(unigram_obj->losub); |
| CHECK_EQUALS(unigram_obj->hiadd); |
| CHECK_MEM_EQUALS(unigram_obj->state_table, realData->unigram_obj->total_size); |
| CHECK_MEM_EQUALS(unigram_obj->remap_base, sizeof(CLD2::RemapEntry)); // TODO: can this have more than one entry? |
| CHECK_MEM_EQUALS(unigram_obj->remap_string, strlen( |
| reinterpret_cast<const char*>(realData->unigram_obj->remap_string)) + 1); // null terminator included |
| |
| if (loadedData->unigram_obj->fast_state == NULL) { |
| if (realData->unigram_obj->fast_state != NULL) { |
| fprintf(stderr, "unigram_obj->fast_state is missing.\n"); |
| return false; |
| } |
| } else { |
| if (realData->unigram_obj->fast_state == NULL) { |
| fprintf(stderr, "unigram_obj->fast_state shouldn't be present.\n"); |
| return false; |
| } |
| CHECK_MEM_EQUALS(unigram_obj->fast_state, strlen( |
| reinterpret_cast<const char*>(realData->unigram_obj->fast_state)) + 1); // null terminator included |
| } |
| if (DEBUG) fprintf(stdout, "verified.\n"); |
| |
| if (DEBUG) fprintf(stdout, "Verifying kExpectedScore... "); |
| CHECK_MEM_EQUALS(kExpectedScore, realSupplement->lengthOf_kAvgDeltaOctaScore); |
| if (DEBUG) fprintf(stdout, "verified.\n"); |
| |
| // 3. Each table |
| for (int x=0; x<NUM_TABLES; x++) { |
| if (DEBUG) fprintf(stdout, "Verifying table %d... ", x+1); |
| const CLD2::CLD2TableSummary* realData = realTableSummaries[x]; |
| const CLD2::CLD2TableSummary* loadedData = loadedTableSummaries[x]; |
| // We need to calculate the table lengths to do the memcmp |
| CLD2::uint32 bytesPerBucket = sizeof(CLD2::IndirectProbBucket4); |
| CLD2::uint32 numBuckets = realData->kCLDTableSize; |
| CLD2::uint32 tableSizeBytes = bytesPerBucket * numBuckets; |
| CLD2::uint32 indirectTableSizeBytes = realSupplement->indirectTableSizes[x]; |
| CLD2::uint32 recognizedScriptsSizeBytes = |
| strlen(realData->kRecognizedLangScripts) + 1; // null terminator included |
| |
| // Verify the table data |
| CHECK_EQUALS(kCLDTableSizeOne); |
| CHECK_EQUALS(kCLDTableSize); |
| CHECK_EQUALS(kCLDTableKeyMask); |
| CHECK_EQUALS(kCLDTableBuildDate); |
| CHECK_MEM_EQUALS(kCLDTable, tableSizeBytes); |
| CHECK_MEM_EQUALS(kCLDTableInd, indirectTableSizeBytes); |
| CHECK_MEM_EQUALS(kRecognizedLangScripts, recognizedScriptsSizeBytes); |
| if (DEBUG) fprintf(stdout, "verified.\n"); |
| } |
| if (DEBUG) fprintf(stdout, "All data verified successfully.\n"); |
| return true; |
| } |
| |
// Returns true when the host stores the least-significant byte of a 32-bit
// integer at the lowest address (little-endian), false otherwise.
//
// The check inspects the first byte of a known 32-bit pattern through an
// unsigned char pointer. Examining an object's bytes that way is well-defined
// in C++, whereas reading a union member other than the one last written is
// undefined behavior.
//
// As noted on http://stackoverflow.com/questions/1001307, gcc is highly likely
// to convert this function's return into a constant - meaning that any
// if-branches based upon it will be eliminated at compile time, allowing
// "free" detection throughout any dependent code.
bool isLittleEndian() {
  const uint32_t probe = 0x01020304;
  const unsigned char* firstByte =
      reinterpret_cast<const unsigned char*>(&probe);
  // On a little-endian host the low-order byte (0x04) comes first in memory.
  return *firstByte == 4;
}
| |
| bool coreAssumptionsOk() { |
| if (sizeof(CLD2::uint8) != 1) { |
| fprintf(stderr, "uint8 is %d bits instead of 8!\n", (int) (sizeof(CLD2::uint8) * 8)); |
| return false; |
| } |
| if (sizeof(CLD2::uint16) != 2) { |
| fprintf(stderr, "uint16 is %d bits instead of 16!\n", (int) (sizeof(CLD2::uint16) * 8)); |
| return false; |
| } |
| if (sizeof(CLD2::uint32) != 4) { |
| fprintf(stderr, "uint32 is %d bits instead of 32!\n", (int) (sizeof(CLD2::uint32) * 8)); |
| return false; |
| } |
| return true; |
| } |
| |
| } // End namespace CLD2DynamicData |