src/aosp/suggest/core/dictionary/bigram_dictionary.cc - chromiumos/platform/suggest - Git at Google

 /*
  * Copyright (C) 2010, The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include <cstring>

 #define LOG_TAG "LatinIME: bigram_dictionary.cpp"

 #include "bigram_dictionary.h"

 #include "defines.h"
 #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
 #include "suggest/core/dictionary/dictionary.h"
 #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
 #include "utils/char_utils.h"

 namespace latinime {

 /*
  * Builds a bigram dictionary on top of the given structure policy.
  * dictionaryStructurePolicy: stored in mDictionaryStructurePolicy; every lookup in this class
  * goes through it.
  */
 BigramDictionary::BigramDictionary(
         const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy)
         : mDictionaryStructurePolicy(dictionaryStructurePolicy) {
     // Nothing else to do unless dictionary debugging is switched on.
     if (!DEBUG_DICT) {
         return;
     }
     AKLOGI("BigramDictionary - constructor");
 }

 /* Destructor. The body is intentionally empty. */
 BigramDictionary::~BigramDictionary() {}

 /*
  * Inserts one predicted word into the output arrays, keeping the entries ordered by descending
  * probability. Among entries of equal probability the new word goes in front of the first one
  * that has more code points. A word that does not rank within the first MAX_RESULTS entries is
  * dropped and the outputs are left untouched.
  *
  * The three outputs are handled as MAX_RESULTS parallel slots: bigramProbability[i] and
  * outputTypes[i] describe the word stored at bigramCodePoints + i * MAX_WORD_LENGTH.
  *
  * Parameters :
  * word: code points of the candidate. word[length] is overwritten with a 0 terminator.
  * length: number of code points in word. Clamped to [0, MAX_WORD_LENGTH - 1] so that the
  *         terminator always fits, both in word and in the destination slot.
  * probability: score used for ordering.
  * bigramProbability: probabilities of the stored entries.
  * bigramCodePoints: code points of the stored entries, MAX_WORD_LENGTH ints per entry.
  * outputTypes: kinds of the stored entries.
  *
  * NOTE(review): slots that have not been filled yet are compared as-is, so the caller
  * presumably pre-initializes the output arrays - confirm.
  */
 void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability,
         int *bigramCodePoints, int *outputTypes) const {
     // Leave room for the terminating 0. Without the clamp, a word of MAX_WORD_LENGTH code points
     // would write one int past the caller's buffer and past the destination slot, and a
     // negative length would make the copy loop below run away.
     if (length >= MAX_WORD_LENGTH) {
         length = MAX_WORD_LENGTH - 1;
     } else if (length < 0) {
         length = 0;
     }
     word[length] = 0;
     if (DEBUG_DICT_FULL) {
 #ifdef FLAG_DBG
         // length + 1 <= MAX_WORD_LENGTH after the clamp above, so a fixed-size buffer suffices.
         char s[MAX_WORD_LENGTH];
         for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]);
         AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability);
 #endif
     }

     // Find the right insertion point
     int insertAt = 0;
     while (insertAt < MAX_RESULTS) {
         const int existingProbability = bigramProbability[insertAt];
         if (probability > existingProbability) {
             break;
         }
         // Tie on probability: the shorter word wins.
         if (probability == existingProbability && length < CharUtils::getCodePointCount(
                 MAX_WORD_LENGTH, bigramCodePoints + insertAt * MAX_WORD_LENGTH)) {
             break;
         }
         insertAt++;
     }
     if (DEBUG_DICT_FULL) {
         AKLOGI("Bigram: InsertAt -> %d MAX_RESULTS: %d", insertAt, MAX_RESULTS);
     }
     if (insertAt >= MAX_RESULTS) {
         return;
     }

     // Open a gap at insertAt in all three parallel arrays; the last entry falls off the end.
     const int entriesToShift = MAX_RESULTS - insertAt - 1;
     memmove(bigramProbability + (insertAt + 1),
             bigramProbability + insertAt,
             entriesToShift * sizeof(bigramProbability[0]));
     bigramProbability[insertAt] = probability;
     // The types have to move together with the entries they describe, otherwise an entry that
     // gets pushed down ends up paired with whatever type value was already in its new slot.
     memmove(outputTypes + (insertAt + 1),
             outputTypes + insertAt,
             entriesToShift * sizeof(outputTypes[0]));
     outputTypes[insertAt] = Dictionary::KIND_PREDICTION;
     memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH,
             bigramCodePoints + insertAt * MAX_WORD_LENGTH,
             entriesToShift * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH);

     int *const dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH;
     for (int i = 0; i < length; ++i) {
         dest[i] = word[i];
     }
     dest[length] = 0; // 0-terminate the stored word
     if (DEBUG_DICT_FULL) {
         AKLOGI("Bigram: Added word at %d", insertAt);
     }
 }

 /* Parameters :
  * prevWord: the word before, the one for which we need to look up bigrams.
  * prevWordLength: its length.
  * outBigramCodePoints: an array for output, at the same format as outwords for getSuggestions.
  * outBigramProbability: an array to output frequencies.
  * outputTypes: an array to output types.
  * This method returns the number of bigrams this word has, for backward compatibility.
  */
 int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLength,
         int *const outBigramCodePoints, int *const outBigramProbability,
         int *const outputTypes) const {
     // TODO: remove unused arguments, and refrain from storing stuff in members of this class
     // TODO: have "in" arguments before "out" ones, and make out args explicit in the name

     int pos = getBigramListPositionForWord(prevWord, prevWordLength,
             false /* forceLowerCaseSearch */);
     // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
     if (NOT_A_DICT_POS == pos) {
         // If no bigrams for this exact word, search again in lower case.
         pos = getBigramListPositionForWord(prevWord, prevWordLength,
                 true /* forceLowerCaseSearch */);
     }
     // If still no bigrams, we really don't have them!
     if (NOT_A_DICT_POS == pos) return 0;

     int bigramCount = 0;
     int unigramProbability = 0;
     int bigramBuffer[MAX_WORD_LENGTH];
     BinaryDictionaryBigramsIterator bigramsIt(
             mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos);
     while (bigramsIt.hasNext()) {
         bigramsIt.next();
         if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) {
             continue;
         }
         const int codePointCount = mDictionaryStructurePolicy->
                 getCodePointsAndProbabilityAndReturnCodePointCount(bigramsIt.getBigramPos(),
                         MAX_WORD_LENGTH, bigramBuffer, &unigramProbability);
         if (codePointCount <= 0) {
             continue;
         }
         // Due to space constraints, the probability for bigrams is approximate - the lower the
         // unigram probability, the worse the precision. The theoritical maximum error in
         // resulting probability is 8 - although in the practice it's never bigger than 3 or 4
         // in very bad cases. This means that sometimes, we'll see some bigrams interverted
         // here, but it can't get too bad.
         const int probability = mDictionaryStructurePolicy->getProbability(
                 unigramProbability, bigramsIt.getProbability());
         addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability,
                 outBigramCodePoints, outputTypes);
         ++bigramCount;
     }
     return min(bigramCount, MAX_RESULTS);
 }

 // Returns the position of the start of the bigram list attached to prevWord.
 // NOT_A_DICT_POS is returned when prevWordLength is not positive or when no terminal node is
 // found for the word; otherwise the result is whatever getBigramsPositionOfPtNode() reports for
 // that terminal node.
 int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
         const bool forceLowerCaseSearch) const {
     if (prevWordLength <= 0) {
         return NOT_A_DICT_POS;
     }
     const int terminalNodePos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(
             prevWord, prevWordLength, forceLowerCaseSearch);
     if (terminalNodePos != NOT_A_DICT_POS) {
         return mDictionaryStructurePolicy->getBigramsPositionOfPtNode(terminalNodePos);
     }
     return NOT_A_DICT_POS;
 }

 int BigramDictionary::getBigramProbability(const int *word0, int length0, const int *word1,
         int length1) const {
     int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */);
     // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
     if (NOT_A_DICT_POS == pos) return NOT_A_PROBABILITY;
     int nextWordPos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(word1, length1,
             false /* forceLowerCaseSearch */);
     if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY;

     BinaryDictionaryBigramsIterator bigramsIt(
             mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos);
     while (bigramsIt.hasNext()) {
         bigramsIt.next();
         if (bigramsIt.getBigramPos() == nextWordPos) {
             return mDictionaryStructurePolicy->getProbability(
                     mDictionaryStructurePolicy->getUnigramProbabilityOfPtNode(nextWordPos),
                     bigramsIt.getProbability());
         }
     }
     return NOT_A_PROBABILITY;
 }

 // TODO: Move the bigram-related functions here
 } // namespace latinime
	/*
	* Copyright (C) 2010, The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include <cstring>

	#define LOG_TAG "LatinIME: bigram_dictionary.cpp"

	#include "bigram_dictionary.h"

	#include "defines.h"
	#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
	#include "suggest/core/dictionary/dictionary.h"
	#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
	#include "utils/char_utils.h"

	namespace latinime {

	/*
	 * Builds a bigram dictionary on top of the given structure policy.
	 * dictionaryStructurePolicy: stored in mDictionaryStructurePolicy; every lookup in this class
	 * goes through it.
	 */
	BigramDictionary::BigramDictionary(
	        const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy)
	        : mDictionaryStructurePolicy(dictionaryStructurePolicy) {
	    // Nothing else to do unless dictionary debugging is switched on.
	    if (!DEBUG_DICT) {
	        return;
	    }
	    AKLOGI("BigramDictionary - constructor");
	}

	/* Destructor. The body is intentionally empty. */
	BigramDictionary::~BigramDictionary() {}

	void BigramDictionary::addWordBigram(int word, int length, int probability, int bigramProbability,
	int bigramCodePoints, int outputTypes) const {
	word[length] = 0;
	if (DEBUG_DICT_FULL) {
	#ifdef FLAG_DBG
	char s[length + 1];
	for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]);
	AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability);
	#endif
	}

	// Find the right insertion point
	int insertAt = 0;
	while (insertAt < MAX_RESULTS) {
	if (probability > bigramProbability[insertAt] \|\| (bigramProbability[insertAt] == probability
	&& length < CharUtils::getCodePointCount(MAX_WORD_LENGTH,
	bigramCodePoints + insertAt * MAX_WORD_LENGTH))) {
	break;
	}
	insertAt++;
	}
	if (DEBUG_DICT_FULL) {
	AKLOGI("Bigram: InsertAt -> %d MAX_RESULTS: %d", insertAt, MAX_RESULTS);
	}
	if (insertAt >= MAX_RESULTS) {
	return;
	}
	memmove(bigramProbability + (insertAt + 1),
	bigramProbability + insertAt,
	(MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0]));
	bigramProbability[insertAt] = probability;
	outputTypes[insertAt] = Dictionary::KIND_PREDICTION;
	memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH,
	bigramCodePoints + insertAt * MAX_WORD_LENGTH,
	(MAX_RESULTS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH);
	int dest = bigramCodePoints + insertAt MAX_WORD_LENGTH;
	while (length--) {
	dest++ = word++;
	}
	*dest = 0; // NULL terminate
	if (DEBUG_DICT_FULL) {
	AKLOGI("Bigram: Added word at %d", insertAt);
	}
	}

	/* Parameters :
	* prevWord: the word before, the one for which we need to look up bigrams.
	* prevWordLength: its length.
	* outBigramCodePoints: an array for output, at the same format as outwords for getSuggestions.
	* outBigramProbability: an array to output frequencies.
	* outputTypes: an array to output types.
	* This method returns the number of bigrams this word has, for backward compatibility.
	*/
	int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLength,
	int const outBigramCodePoints, int const outBigramProbability,
	int *const outputTypes) const {
	// TODO: remove unused arguments, and refrain from storing stuff in members of this class
	// TODO: have "in" arguments before "out" ones, and make out args explicit in the name

	int pos = getBigramListPositionForWord(prevWord, prevWordLength,
	false /* forceLowerCaseSearch */);
	// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
	if (NOT_A_DICT_POS == pos) {
	// If no bigrams for this exact word, search again in lower case.
	pos = getBigramListPositionForWord(prevWord, prevWordLength,
	true /* forceLowerCaseSearch */);
	}
	// If still no bigrams, we really don't have them!
	if (NOT_A_DICT_POS == pos) return 0;

	int bigramCount = 0;
	int unigramProbability = 0;
	int bigramBuffer[MAX_WORD_LENGTH];
	BinaryDictionaryBigramsIterator bigramsIt(
	mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos);
	while (bigramsIt.hasNext()) {
	bigramsIt.next();
	if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) {
	continue;
	}
	const int codePointCount = mDictionaryStructurePolicy->
	getCodePointsAndProbabilityAndReturnCodePointCount(bigramsIt.getBigramPos(),
	MAX_WORD_LENGTH, bigramBuffer, &unigramProbability);
	if (codePointCount <= 0) {
	continue;
	}
	// Due to space constraints, the probability for bigrams is approximate - the lower the
	// unigram probability, the worse the precision. The theoritical maximum error in
	// resulting probability is 8 - although in the practice it's never bigger than 3 or 4
	// in very bad cases. This means that sometimes, we'll see some bigrams interverted
	// here, but it can't get too bad.
	const int probability = mDictionaryStructurePolicy->getProbability(
	unigramProbability, bigramsIt.getProbability());
	addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability,
	outBigramCodePoints, outputTypes);
	++bigramCount;
	}
	return min(bigramCount, MAX_RESULTS);
	}

	// Returns the position of the start of the bigram list attached to prevWord.
	// NOT_A_DICT_POS is returned when prevWordLength is not positive or when no terminal node is
	// found for the word; otherwise the result is whatever getBigramsPositionOfPtNode() reports for
	// that terminal node.
	int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
	        const bool forceLowerCaseSearch) const {
	    if (prevWordLength <= 0) {
	        return NOT_A_DICT_POS;
	    }
	    const int terminalNodePos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(
	            prevWord, prevWordLength, forceLowerCaseSearch);
	    if (terminalNodePos != NOT_A_DICT_POS) {
	        return mDictionaryStructurePolicy->getBigramsPositionOfPtNode(terminalNodePos);
	    }
	    return NOT_A_DICT_POS;
	}

	int BigramDictionary::getBigramProbability(const int word0, int length0, const int word1,
	int length1) const {
	int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */);
	// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
	if (NOT_A_DICT_POS == pos) return NOT_A_PROBABILITY;
	int nextWordPos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(word1, length1,
	false /* forceLowerCaseSearch */);
	if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY;

	BinaryDictionaryBigramsIterator bigramsIt(
	mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos);
	while (bigramsIt.hasNext()) {
	bigramsIt.next();
	if (bigramsIt.getBigramPos() == nextWordPos) {
	return mDictionaryStructurePolicy->getProbability(
	mDictionaryStructurePolicy->getUnigramProbabilityOfPtNode(nextWordPos),
	bigramsIt.getProbability());
	}
	}
	return NOT_A_PROBABILITY;
	}

	// TODO: Move the bigram-related functions here
	} // namespace latinime