Initial commit: contains a copy of the native AOSP suggestion code, a compatibility wrapper to make it compile on Chrome OS, a wrapper for use as a native C++ library, and a demo program. BUG=chromium:329015 TEST=manual testing with the demo program Change-Id: Iece71fbe862b7296dbd4f60a30964e8986935a72
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1a13fe5 --- /dev/null +++ b/.gitignore
@@ -0,0 +1,4 @@ +.AppleDouble +.DS_Store +*.d +*.o
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..28fc25f --- /dev/null +++ b/Makefile
# Build rules for libsuggest.so (shared library) and suggest_demo (ncurses
# demo program).
#
# Note: the first target in this file is libsuggest.so, so a plain `make`
# builds only the library. Use `make all` to build the demo as well.

DESTDIR = /usr
LIBDIR = $(DESTDIR)/lib
SHAREDIR = $(DESTDIR)/share/libsuggest

AOSP_SOURCE_FILES=src/aosp/compability/log_utils_mock.cpp\
    src/aosp/suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp\
    src/aosp/suggest/policyimpl/typing/scoring_params.cpp\
    src/aosp/suggest/policyimpl/typing/typing_weighting.cpp\
    src/aosp/suggest/policyimpl/typing/typing_suggest_policy.cpp\
    src/aosp/suggest/policyimpl/typing/typing_traversal.cpp\
    src/aosp/suggest/policyimpl/typing/typing_scoring.cpp\
    src/aosp/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp\
    src/aosp/suggest/policyimpl/dictionary/header/header_policy.cpp\
    src/aosp/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp\
    src/aosp/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp\
    src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp\
    src/aosp/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp\
    src/aosp/suggest/policyimpl/dictionary/patricia_trie_policy.cpp\
    src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp\
    src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp\
    src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp\
    src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp\
    src/aosp/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp\
    src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp\
    src/aosp/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp\
    src/aosp/suggest/policyimpl/dictionary/utils/format_utils.cpp\
    src/aosp/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp\
    src/aosp/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp\
    src/aosp/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp\
    src/aosp/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp\
    src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp\
    src/aosp/suggest/core/session/dic_traverse_session.cpp\
    src/aosp/suggest/core/dicnode/dic_node_utils.cpp\
    src/aosp/suggest/core/dicnode/dic_nodes_cache.cpp\
    src/aosp/suggest/core/dicnode/dic_node.cpp\
    src/aosp/suggest/core/suggest.cpp\
    src/aosp/suggest/core/policy/weighting.cpp\
    src/aosp/suggest/core/layout/proximity_info_state.cpp\
    src/aosp/suggest/core/layout/proximity_info.cpp\
    src/aosp/suggest/core/layout/proximity_info_state_utils.cpp\
    src/aosp/suggest/core/layout/additional_proximity_chars.cpp\
    src/aosp/suggest/core/layout/proximity_info_params.cpp\
    src/aosp/suggest/core/dictionary/bigram_dictionary.cpp\
    src/aosp/suggest/core/dictionary/multi_bigram_map.cpp\
    src/aosp/suggest/core/dictionary/digraph_utils.cpp\
    src/aosp/suggest/core/dictionary/bloom_filter.cpp\
    src/aosp/suggest/core/dictionary/dictionary.cpp\
    src/aosp/utils/char_utils.cpp\
    src/aosp/utils/autocorrection_threshold_utils.cpp

SUGGEST_SOURCE_FILES=src/suggest.cpp

SUGGEST_DEMO_FILES=src/demo.cpp

CXXFLAGS=-Iinclude -Isrc/aosp -Isrc/aosp/compability -Isrc -std=c++11

# None of these targets name a file that the rule creates.
.PHONY: all clean install


# libsuggest.so library

LIB_SOURCES=$(SUGGEST_SOURCE_FILES) $(AOSP_SOURCE_FILES)
LIB_OBJECTS=$(LIB_SOURCES:%.cpp=%.pic.o)

# $(CXX) defaults to g++ in GNU make but can be overridden for cross builds.
libsuggest.so: $(LIB_OBJECTS)
	$(CXX) -shared -o libsuggest.so $(CXXFLAGS) $(LIB_OBJECTS)

# -MD emits a .d dependency file next to each object (included at the bottom).
%.pic.o: %.cpp
	$(CXX) $(CXXFLAGS) -fPIC -MD -g -c -o $@ $<


# suggest_demo executable

DEMO_SOURCES=$(SUGGEST_DEMO_FILES) $(LIB_SOURCES)
DEMO_OBJECTS=$(DEMO_SOURCES:%.cpp=%.pie.o)

# Libraries are listed after the objects that reference them; linkers that
# resolve symbols left-to-right (or use --as-needed) would otherwise drop them.
suggest_demo: libsuggest.so $(DEMO_OBJECTS)
	$(CXX) -o suggest_demo $(CXXFLAGS) $(DEMO_OBJECTS) -L. -lsuggest -lncurses

%.pie.o: %.cpp
	$(CXX) $(CXXFLAGS) -MD -g -c -o $@ $<

# common rules

ALL_OBJECTS=$(DEMO_OBJECTS) $(LIB_OBJECTS)

# rm -f succeeds silently when a file is already absent.
clean:
	rm -f libsuggest.so
	rm -f suggest_demo
	rm -f $(ALL_OBJECTS)

# The dictionary is a data file, so it is installed without the execute bit.
install: libsuggest.so
	install -D -m 0755 libsuggest.so $(LIBDIR)/libsuggest.so
	install -D -m 0644 dicts/en_US.dict $(SHAREDIR)/en_US.dict

all: libsuggest.so suggest_demo

-include $(ALL_OBJECTS:%.o=%.d)
diff --git a/dicts/en_US.dict b/dicts/en_US.dict new file mode 100644 index 0000000..09b6992 --- /dev/null +++ b/dicts/en_US.dict Binary files differ
diff --git a/include/suggest/suggest.h b/include/suggest/suggest.h new file mode 100644 index 0000000..500bb3d --- /dev/null +++ b/include/suggest/suggest.h
@@ -0,0 +1,104 @@ +// Copyright (c) 2013 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +#ifndef SUGGEST_SUGGEST_H_ +#define SUGGEST_SUGGEST_H_ + +#include <string> +#include <list> +#include <vector> + +#include "types.h" + +namespace suggest { + +class SuggestEngine; +class SuggestEngineSession; + +// Description of a key. +// The coordinate system can be chosen by the user of this library, +// but keep in mind that all calculations are made in integer arithmetics. +// the rectangle is defined as location of the top left corner and size. +struct Key { + Key() {} + Key(vec2f location, vec2f size, charcode _code) : + rect(location, size), code(_code) { + } + + rect2f rect; + charcode code; + + static Key InvalidKey; +}; + +// A single word suggestion including confidences +struct Suggestion { + std::string word; + int frequency; + int commit_first_word_confidence; +}; + +// Next to the coordinates of a touch, suggest needs to know which +// character your keyboard recognized and displays to the users. +struct Touch { + Touch() {} + Touch(vec2f pos, charcode code); + + // create touch based on location only. Looks up which key is + // at this location. + Touch(vec2f pos, const SuggestEngine& engine); + + // create a touch based on char code, assumes the key is hit right at + // the center. + Touch(charcode code, const SuggestEngine& engine); + + vec2f pos; + charcode code; +}; + +// Parameters to tweak the suggestion process. +// The list of parameters might change during iterations of the +// library, but the constructor will always choose sane default values. 
+// The provided locale name will be used to pick the right dictionary +// from /usr/share/libsuggest/ +struct SuggestParameters { + SuggestParameters(std::string locale); + + vec2f grid_cells; + float search_box_size_factor; + std::string locale; +}; + +// I am not 100% sure what the session concept in AOSP is used for, +// so far I have been using a single session for all suggestions. +// The session allows you to receive suggestions based on a list +// of touch coordinates and (optional) the previously typed word. +class SuggestSession { + public: + virtual const std::list<Suggestion>& GetSuggestions( + const std::vector<Touch> &touches, + std::string previous_word="") = 0; +}; + +// Main class of the suggestion process, which builds the keyboard definition +// and allows suggest sessions to be created with this keyboard. +class SuggestEngine { + public: + virtual Key GetKeyAt(vec2f pos) const = 0; + virtual Key GetKey(charcode code) const = 0; + + // start a new session + virtual bool LoadDictionary(std::string locale) = 0; + virtual SuggestSession* NewSession() = 0; +}; + +// To hide the implementation details of the SuggestEngine class +// (especially for hiding the AOSP API) +SuggestEngine* NewSuggestEngine(vec2f keyboard_size, vec2f common_key_size, + const std::vector<Key> &keylist, + const SuggestParameters ¶meters); + + +} // namespace suggest + +#endif // SUGGEST_SUGGEST_H_ \ No newline at end of file
diff --git a/include/suggest/types.h b/include/suggest/types.h new file mode 100644 index 0000000..6c7c77e --- /dev/null +++ b/include/suggest/types.h
// Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SUGGEST_TYPES_H_
#define SUGGEST_TYPES_H_

#include <cmath>

namespace suggest {

// Simple 2D math vector for locations and sizes.
template<typename T> struct vec2 {
  T x;
  T y;

  // Value-initializes both components (zero for arithmetic types).
  vec2() : x(), y() {}
  vec2(T _x, T _y) : x(_x), y(_y) {}

  // Squared Euclidean length.
  T abs_sq() const {
    return x*x + y*y;
  }
  // Euclidean length. The square root is taken in double precision and
  // converted back to T (truncating for integer T).
  T abs() const {
    return static_cast<T>(std::sqrt(static_cast<double>(abs_sq())));
  }
};

// Component-wise sum.
template<typename T> vec2<T> add(vec2<T> a, vec2<T> b) {
  vec2<T> res;
  res.x = a.x + b.x;
  res.y = a.y + b.y;
  return res;
}

// Component-wise difference (a - b).
template<typename T> vec2<T> sub(vec2<T> a, vec2<T> b) {
  vec2<T> res;
  res.x = a.x - b.x;
  res.y = a.y - b.y;
  return res;
}

// Component-wise product.
template<typename T> vec2<T> mul_elem(vec2<T> a, vec2<T> b) {
  vec2<T> res;
  res.x = a.x * b.x;
  res.y = a.y * b.y;
  return res;
}

// Component-wise quotient (a / b).
template<typename T> vec2<T> div_elem(vec2<T> a, vec2<T> b) {
  vec2<T> res;
  res.x = a.x / b.x;
  res.y = a.y / b.y;
  return res;
}

// Scales both components by the scalar b.
template<typename T> vec2<T> mul(vec2<T> a, T b) {
  vec2<T> res;
  res.x = a.x * b;
  res.y = a.y * b;
  return res;
}

// Divides both components by the scalar b.
template<typename T> vec2<T> div(vec2<T> a, T b) {
  vec2<T> res;
  res.x = a.x / b;
  res.y = a.y / b;
  return res;
}

typedef vec2<int> vec2i;
typedef vec2<float> vec2f;

// Axis-aligned rectangle described by its top left corner and its size.
template<typename T> struct rect2 {
  vec2<T> pos;
  vec2<T> size;

  rect2() {}
  rect2(vec2<T> pos, vec2<T> size) : pos(pos), size(size) {}

  // Center point of the rectangle. Halving is done by dividing by T(2)
  // so that the expression also compiles for integer T (a float literal
  // would make template argument deduction of mul() ambiguous there).
  vec2<T> center() const {
    return add(pos, div(size, static_cast<T>(2)));
  }

  // Rectangle of size new_size sharing this rectangle's center.
  rect2<T> resized(vec2<T> new_size) const {
    return rect2<T>(sub(center(), div(new_size, static_cast<T>(2))), new_size);
  }

  // True when the interiors of the two rectangles overlap (touching edges
  // do not count).
  bool intersects(rect2 other) const {
    vec2<T> delta = sub(other.pos, pos);
    return delta.x < size.x && delta.x > -other.size.x &&
           delta.y < size.y && delta.y > -other.size.y;
  }

  // True when point lies strictly inside the rectangle. Each coordinate
  // is compared against the bounds of its own axis.
  bool contains(vec2<T> point) const {
    return point.x > pos.x && point.x < pos.x + size.x &&
           point.y > pos.y && point.y < pos.y + size.y;
  }
};

typedef rect2<int> rect2i;
typedef rect2<float> rect2f;

typedef int charcode;

}  // namespace suggest

#endif  // SUGGEST_TYPES_H_
diff --git a/keylist.txt b/keylist.txt new file mode 100644 index 0000000..43bf9c1 --- /dev/null +++ b/keylist.txt
@@ -0,0 +1,31 @@ +400 700 +100 100 +28 +a 0 0 100 100 +b 100 0 100 100 +c 200 0 100 100 +d 300 0 100 100 +e 0 100 100 100 +f 100 100 100 100 +g 200 100 100 100 +h 300 100 100 100 +i 0 200 100 100 +j 100 200 100 100 +k 200 200 100 100 +l 300 200 100 100 +m 0 300 100 100 +n 100 300 100 100 +o 200 300 100 100 +p 300 300 100 100 +q 0 400 100 100 +r 100 400 100 100 +s 200 400 100 100 +t 300 400 100 100 +u 0 500 100 100 +v 100 500 100 100 +w 200 500 100 100 +x 300 500 100 100 +y 0 600 100 100 +z 100 600 100 100 +. 200 600 100 100 +SPC 300 600 100 100
diff --git a/src/aosp/compability/android/log.h b/src/aosp/compability/android/log.h new file mode 100644 index 0000000..04fabc6 --- /dev/null +++ b/src/aosp/compability/android/log.h
// Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef AOSP_COMPABILITY_ANDROID_LOG_H_
#define AOSP_COMPABILITY_ANDROID_LOG_H_

#include <cstdarg>
#include <cstdio>

// Only a pointer to JNIEnv is needed here, so a forward declaration suffices.
class JNIEnv;

namespace latinime {
  // Stand-in logger that prints to stdout.
  class LogUtils {
   public:
    // Formats the printf-style arguments and writes them to stdout.
    // env is accepted for signature compatibility and is not used.
    void logToJava(JNIEnv *env, char const *fmt, ...) {
      (void)env;
      va_list args;
      va_start(args, fmt);
      vprintf(fmt, args);
      va_end(args);
    }
  };
}  // namespace latinime

#endif
diff --git a/src/aosp/compability/jni.h b/src/aosp/compability/jni.h new file mode 100644 index 0000000..3be988b --- /dev/null +++ b/src/aosp/compability/jni.h
// Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef AOSP_COMPABILITY_JNI_H_
#define AOSP_COMPABILITY_JNI_H_

#include <stddef.h>

#include <algorithm>
#include <string>
#include <vector>

// Stand-ins for the JNI types, backed by standard containers.
typedef std::string jstring;
typedef std::vector<int> *jintArray;
typedef std::vector<float> *jfloatArray;
typedef int jint;
typedef long int jlong;
typedef float jfloat;
typedef size_t jsize;

// Minimal replacement for the JNI environment: the array/string accessors
// operate directly on the containers behind the typedefs above.
class JNIEnv {
 public:
  // Copies len elements starting at offset into buffer. Does nothing when
  // the requested range does not fit inside the array.
  void GetIntArrayRegion(jintArray array, jsize offset, jsize len, jint* buffer) {
    CopyRegion(array->begin(), array->size(), offset, len, buffer);
  }
  // Float counterpart of GetIntArrayRegion.
  void GetFloatArrayRegion(jfloatArray array, jsize offset, jsize len, jfloat* buffer) {
    CopyRegion(array->begin(), array->size(), offset, len, buffer);
  }
  jsize GetArrayLength(jintArray array) {
    return array->size();
  }
  jsize GetArrayLength(jfloatArray array) {
    return array->size();
  }
  // Copies len bytes of str starting at offset into buffer; no terminator
  // is appended. Does nothing when the range does not fit inside str.
  void GetStringUTFRegion(const jstring& str, jsize offset, jsize len, char* buffer) {
    CopyRegion(str.begin(), str.size(), offset, len, buffer);
  }
  // Both length queries return the byte length of the std::string.
  jsize GetStringUTFLength(const jstring& string) {
    return string.size();
  }
  jsize GetStringLength(const jstring& string) {
    return string.size();
  }

 private:
  // Shared range-checked copy. The check is written as two comparisons
  // because jsize is unsigned: `offset + len` could wrap around and pass
  // a naive `offset + len > size` test.
  template<typename Iterator, typename Out>
  static void CopyRegion(Iterator begin, jsize size, jsize offset, jsize len, Out* buffer) {
    if (offset > size || len > size - offset)
      return;
    std::copy(begin + offset, begin + offset + len, buffer);
  }
};

#endif
diff --git a/src/aosp/compability/log_utils_mock.cpp b/src/aosp/compability/log_utils_mock.cpp new file mode 100644 index 0000000..1496cc8 --- /dev/null +++ b/src/aosp/compability/log_utils_mock.cpp
@@ -0,0 +1,19 @@ +// Copyright (c) 2013 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <utils/log_utils.h> + +#include <cstdio> +#include <stdarg.h> + +#include "defines.h" + +namespace latinime { + /* static */ void LogUtils::logToJava(JNIEnv *const env, const char *const format, ...) { + va_list args; + va_start(args, format); + vprintf(format, args); + va_end(args); + } +}
diff --git a/src/aosp/defines.h b/src/aosp/defines.h new file mode 100644 index 0000000..742e388 --- /dev/null +++ b/src/aosp/defines.h
/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LATINIME_DEFINES_H
#define LATINIME_DEFINES_H

#ifdef __GNUC__
#define AK_FORCE_INLINE __attribute__((always_inline)) __inline__
#else // __GNUC__
#define AK_FORCE_INLINE inline
#endif // __GNUC__

#if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)
#undef AK_FORCE_INLINE
#define AK_FORCE_INLINE inline
#endif // defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)

// Must be equal to Constants.Dictionary.MAX_WORD_LENGTH in Java
#define MAX_WORD_LENGTH 48
// Must be equal to BinaryDictionary.MAX_RESULTS in Java
#define MAX_RESULTS 18
// Must be equal to ProximityInfo.MAX_PROXIMITY_CHARS_SIZE in Java
#define MAX_PROXIMITY_CHARS_SIZE 16
#define ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE 2
#define NELEMS(x) (sizeof(x) / sizeof((x)[0]))

// Encodes the code points in source (at most sourceSize of them, stopping
// early at a 0 code point) as UTF-8 into dest, which holds destSize bytes.
// The output is always 0-terminated when destSize > 0; a multi-byte sequence
// that does not fit completely is not written. Negative values are not code
// points and are skipped. Returns the number of bytes written, not counting
// the terminator.
AK_FORCE_INLINE static int intArrayToCharArray(const int *const source, const int sourceSize,
        char *dest, const int destSize) {
    // Without room for the terminator nothing can be written safely.
    if (destSize <= 0) {
        return 0;
    }
    // We want to always terminate with a 0 char, so stop one short of the length to make
    // sure there is room.
    const int destLimit = destSize - 1;
    int si = 0;
    int di = 0;
    while (si < sourceSize && di < destLimit && 0 != source[si]) {
        const int codePoint = source[si++];
        if (codePoint < 0) {
            // Not a code point... skip.
            continue;
        }
        // The upper bounds below are inclusive: 0x7F is the largest value with a
        // one byte encoding, 0x7FF with two bytes, 0xFFFF with three bytes.
        if (codePoint <= 0x7F) { // One byte
            dest[di++] = static_cast<char>(codePoint);
        } else if (codePoint <= 0x7FF) { // Two bytes
            if (di + 1 >= destLimit) break;
            dest[di++] = static_cast<char>(0xC0 + (codePoint >> 6));
            dest[di++] = static_cast<char>(0x80 + (codePoint & 0x3F));
        } else if (codePoint <= 0xFFFF) { // Three bytes
            if (di + 2 >= destLimit) break;
            dest[di++] = static_cast<char>(0xE0 + (codePoint >> 12));
            dest[di++] = static_cast<char>(0x80 + ((codePoint >> 6) & 0x3F));
            dest[di++] = static_cast<char>(0x80 + (codePoint & 0x3F));
        } else if (codePoint <= 0x1FFFFF) { // Four bytes
            if (di + 3 >= destLimit) break;
            dest[di++] = static_cast<char>(0xF0 + (codePoint >> 18));
            dest[di++] = static_cast<char>(0x80 + ((codePoint >> 12) & 0x3F));
            dest[di++] = static_cast<char>(0x80 + ((codePoint >> 6) & 0x3F));
            dest[di++] = static_cast<char>(0x80 + (codePoint & 0x3F));
        } else if (codePoint <= 0x3FFFFFF) { // Five bytes
            if (di + 4 >= destLimit) break;
            dest[di++] = static_cast<char>(0xF8 + (codePoint >> 24));
            dest[di++] = static_cast<char>(0x80 + ((codePoint >> 18) & 0x3F));
            dest[di++] = static_cast<char>(0x80 + ((codePoint >> 12) & 0x3F));
            dest[di++] = static_cast<char>(0x80 + ((codePoint >> 6) & 0x3F));
            // Every trailing byte carries the 0x80 continuation marker.
            dest[di++] = static_cast<char>(0x80 + (codePoint & 0x3F));
        } else { // Six bytes (everything up to 0x7FFFFFFF)
            if (di + 5 >= destLimit) break;
            dest[di++] = static_cast<char>(0xFC + (codePoint >> 30));
            dest[di++] = static_cast<char>(0x80 + ((codePoint >> 24) & 0x3F));
            dest[di++] = static_cast<char>(0x80 + ((codePoint >> 18) & 0x3F));
            dest[di++] = static_cast<char>(0x80 + ((codePoint >> 12) & 0x3F));
            dest[di++] = static_cast<char>(0x80 + ((codePoint >> 6) & 0x3F));
            dest[di++] = static_cast<char>(0x80 + (codePoint & 0x3F));
        }
    }
    dest[di] = 0;
    return di;
}

#if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)
#include <android/log.h>
#ifndef LOG_TAG
#define LOG_TAG "LatinIME: "
#endif // LOG_TAG
#define AKLOGE(fmt, ...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, fmt, ##__VA_ARGS__)
#define AKLOGI(fmt, ...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, fmt, ##__VA_ARGS__)

#define DUMP_RESULT(words, frequencies) do { dumpResult(words, frequencies); } while (0)
#define DUMP_WORD(word, length) do { dumpWord(word, length); } while (0)
#define INTS_TO_CHARS(input, length, output, outlength) do { \
        intArrayToCharArray(input, length, output, outlength); } while (0)

// Logs one word with its rank and probability; words that encode to at most
// one byte are not logged.
static inline void dumpWordInfo(const int *word, const int length, const int rank,
        const int probability) {
    static char charBuf[50];
    const int N = intArrayToCharArray(word, length, charBuf, NELEMS(charBuf));
    if (N > 1) {
        AKLOGI("%2d [ %s ] (%d)", rank, charBuf, probability);
    }
}

// Logs MAX_RESULTS words, read from outWords at a stride of MAX_WORD_LENGTH.
static inline void dumpResult(const int *outWords, const int *frequencies) {
    AKLOGI("--- DUMP RESULT ---------");
    for (int i = 0; i < MAX_RESULTS; ++i) {
        dumpWordInfo(&outWords[i * MAX_WORD_LENGTH], MAX_WORD_LENGTH, i, frequencies[i]);
    }
    AKLOGI("-------------------------");
}

// Logs a single word; words that encode to at most one byte are not logged.
static AK_FORCE_INLINE void dumpWord(const int *word, const int length) {
    static char charBuf[50];
    const int N = intArrayToCharArray(word, length, charBuf, NELEMS(charBuf));
    if (N > 1) {
        AKLOGI("[ %s ]", charBuf);
    }
}

#ifndef __ANDROID__
#include <cassert>
#include <execinfo.h>
#include <stdlib.h>

#define DO_ASSERT_TEST
#define ASSERT(success) do { if (!(success)) { showStackTrace(); assert(success);} } while (0)
#define SHOW_STACK_TRACE do { showStackTrace(); } while (0)

// Logs the symbolized call stack; frame 0 is replaced by a header line.
static inline void showStackTrace() {
    void *callstack[128];
    int i, frames = backtrace(callstack, 128);
    char **strs = backtrace_symbols(callstack, frames);
    for (i = 0; i < frames; ++i) {
        if (i == 0) {
            AKLOGI("=== Trace ===");
            continue;
        }
        AKLOGI("%s", strs[i]);
    }
    free(strs);
}
#else // __ANDROID__
#include <cassert>
#define DO_ASSERT_TEST
#define ASSERT(success) assert(success)
#define SHOW_STACK_TRACE
#endif // __ANDROID__

#else // defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)
#define AKLOGE(fmt, ...)
#define AKLOGI(fmt, ...)
#define DUMP_RESULT(words, frequencies)
#define DUMP_WORD(word, length)
#undef DO_ASSERT_TEST
#define ASSERT(success)
#define SHOW_STACK_TRACE
// Takes the same four arguments as the debug definition above.
#define INTS_TO_CHARS(input, length, output, outlength)
#endif // defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)

#ifdef FLAG_DO_PROFILE
// Profiler
#include <time.h>

#define PROF_BUF_SIZE 100
static float profile_buf[PROF_BUF_SIZE];
static float profile_old[PROF_BUF_SIZE];
static unsigned int profile_counter[PROF_BUF_SIZE];

#define PROF_RESET               prof_reset()
#define PROF_COUNT(prof_buf_id)  ++profile_counter[prof_buf_id]
#define PROF_OPEN                do { PROF_RESET; PROF_START(PROF_BUF_SIZE - 1); } while (0)
#define PROF_START(prof_buf_id)  do { \
        PROF_COUNT(prof_buf_id); profile_old[prof_buf_id] = (clock()); } while (0)
#define PROF_CLOSE               do { PROF_END(PROF_BUF_SIZE - 1); PROF_OUTALL; } while (0)
#define PROF_END(prof_buf_id)    profile_buf[prof_buf_id] += ((clock()) - profile_old[prof_buf_id])
#define PROF_CLOCKOUT(prof_buf_id) \
        AKLOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id]))
#define PROF_OUTALL              do { AKLOGI("--- %s ---", __FUNCTION__); prof_out(); } while (0)

// Clears all accumulated times, start stamps and call counters.
static inline void prof_reset(void) {
    for (int i = 0; i < PROF_BUF_SIZE; ++i) {
        profile_buf[i] = 0;
        profile_old[i] = 0;
        profile_counter[i] = 0;
    }
}

// Logs the total time (slot PROF_BUF_SIZE - 1) and, for every other slot with
// recorded time, its share, duration and call count.
static inline void prof_out(void) {
    if (profile_counter[PROF_BUF_SIZE - 1] != 1) {
        AKLOGI("Error: You must call PROF_OPEN before PROF_CLOSE.");
    }
    AKLOGI("Total time is %6.3f ms.",
            profile_buf[PROF_BUF_SIZE - 1] * 1000.0f / static_cast<float>(CLOCKS_PER_SEC));
    float all = 0.0f;
    for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) {
        all += profile_buf[i];
    }
    // Avoid dividing by (nearly) zero below.
    if (all < 1.0f) all = 1.0f;
    for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) {
        if (profile_buf[i] > 0.0f) {
            // The call counter is unsigned, hence %u.
            AKLOGI("(%d): Used %4.2f%%, %8.4f ms. Called %u times.",
                    i, (profile_buf[i] * 100.0f / all),
                    profile_buf[i] * 1000.0f / static_cast<float>(CLOCKS_PER_SEC),
                    profile_counter[i]);
        }
    }
}

#else // FLAG_DO_PROFILE
#define PROF_BUF_SIZE 0
#define PROF_RESET
#define PROF_COUNT(prof_buf_id)
#define PROF_OPEN
#define PROF_START(prof_buf_id)
#define PROF_CLOSE
#define PROF_END(prof_buf_id)
#define PROF_CLOCK_OUT(prof_buf_id)
#define PROF_CLOCKOUT(prof_buf_id)
#define PROF_OUTALL

#endif // FLAG_DO_PROFILE

#ifdef FLAG_DBG
#define DEBUG_DICT true
#define DEBUG_DICT_FULL false
#define DEBUG_EDIT_DISTANCE false
#define DEBUG_NODE DEBUG_DICT_FULL
#define DEBUG_TRACE DEBUG_DICT_FULL
#define DEBUG_PROXIMITY_INFO false
#define DEBUG_PROXIMITY_CHARS false
#define DEBUG_CORRECTION false
#define DEBUG_CORRECTION_FREQ false
#define DEBUG_SAMPLING_POINTS false
#define DEBUG_POINTS_PROBABILITY false
#define DEBUG_DOUBLE_LETTER false
#define DEBUG_CACHE false
#define DEBUG_DUMP_ERROR false
#define DEBUG_EVALUATE_MOST_PROBABLE_STRING false

#ifdef FLAG_FULL_DBG
#define DEBUG_GEO_FULL true
#else
#define DEBUG_GEO_FULL false
#endif

#else // FLAG_DBG

#define DEBUG_DICT false
#define DEBUG_DICT_FULL false
#define DEBUG_EDIT_DISTANCE false
#define DEBUG_NODE false
#define DEBUG_TRACE false
#define DEBUG_PROXIMITY_INFO false
#define DEBUG_PROXIMITY_CHARS false
#define DEBUG_CORRECTION false
#define DEBUG_CORRECTION_FREQ false
#define DEBUG_SAMPLING_POINTS false
#define DEBUG_POINTS_PROBABILITY false
#define DEBUG_DOUBLE_LETTER false
#define DEBUG_CACHE false
#define DEBUG_DUMP_ERROR false
#define DEBUG_EVALUATE_MOST_PROBABLE_STRING false

#define DEBUG_GEO_FULL false

#endif // FLAG_DBG

#ifndef S_INT_MAX
#define S_INT_MAX 2147483647 // ((1 << 31) - 1)
#endif
#ifndef S_INT_MIN
// The literal constant -2147483648 does not work in C prior C90, because
// the compiler tries to fit the positive number into an int and then negate it.
// GCC warns about this.
#define S_INT_MIN (-2147483647 - 1) // -(1 << 31)
#endif

#define M_PI_F 3.14159265f
#define MAX_PERCENTILE 100

// Number of base-10 digits in the largest integer + 1 to leave room for a zero terminator.
// As such, this is the maximum number of characters will be needed to represent an int as a
// string, including the terminator; this is used as the size of a string buffer large enough to
// hold any value that is intended to fit in an integer, e.g. in the code that reads the header
// of the binary dictionary where a {key,value} string pair scheme is used.
#define LARGEST_INT_DIGIT_COUNT 11

#define NOT_A_CODE_POINT (-1)
#define NOT_A_DISTANCE (-1)
#define NOT_A_COORDINATE (-1)
#define NOT_AN_INDEX (-1)
#define NOT_A_PROBABILITY (-1)
#define NOT_A_DICT_POS (S_INT_MIN)

// A special value to mean the first word confidence makes no sense in this case,
// e.g. this is not a multi-word suggestion.
#define NOT_A_FIRST_WORD_CONFIDENCE (S_INT_MAX)
// How high the confidence needs to be for us to auto-commit. Arbitrary.
// This needs to be the same as CONFIDENCE_FOR_AUTO_COMMIT in BinaryDictionary.java
#define CONFIDENCE_FOR_AUTO_COMMIT (1000000)
// 80% of the full confidence
#define DISTANCE_WEIGHT_FOR_AUTO_COMMIT (80 * CONFIDENCE_FOR_AUTO_COMMIT / 100)
// 100% of the full confidence
#define LENGTH_WEIGHT_FOR_AUTO_COMMIT (CONFIDENCE_FOR_AUTO_COMMIT)
// 80% of the full confidence
#define SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT (80 * CONFIDENCE_FOR_AUTO_COMMIT / 100)

#define KEYCODE_SPACE ' '
#define KEYCODE_SINGLE_QUOTE '\''
#define KEYCODE_HYPHEN_MINUS '-'

#define SUGGEST_INTERFACE_OUTPUT_SCALE 1000000.0f
#define MAX_PROBABILITY 255
#define MAX_BIGRAM_ENCODED_PROBABILITY 15

// Assuming locale strings such as en_US, sr-Latn etc.
#define MAX_LOCALE_STRING_LENGTH 10

// Max value for length, distance and probability which are used in weighting
// TODO: Remove
#define MAX_VALUE_FOR_WEIGHTING 10000000

// The max number of the keys in one keyboard layout
#define MAX_KEY_COUNT_IN_A_KEYBOARD 64

// TODO: Remove
#define MAX_POINTER_COUNT 1
#define MAX_POINTER_COUNT_G 2

template<typename T> AK_FORCE_INLINE const T &min(const T &a, const T &b) { return a < b ? a : b; }
template<typename T> AK_FORCE_INLINE const T &max(const T &a, const T &b) { return a > b ? a : b; }

// DEBUG
#define INPUTLENGTH_FOR_DEBUG (-1)
#define MIN_OUTPUT_INDEX_FOR_DEBUG (-1)

#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName&);               \
  void operator=(const TypeName&)

#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
  TypeName();                                    \
  DISALLOW_COPY_AND_ASSIGN(TypeName)

// Used as a return value for character comparison
typedef enum {
    // Same char, possibly with different case or accent
    MATCH_CHAR,
    // It is a char located nearby on the keyboard
    PROXIMITY_CHAR,
    // Additional proximity char which can differ by language.
    ADDITIONAL_PROXIMITY_CHAR,
    // It is a substitution char
    SUBSTITUTION_CHAR,
    // It is an unrelated char
    UNRELATED_CHAR,
} ProximityType;

typedef enum {
    NOT_A_DOUBLE_LETTER,
    A_DOUBLE_LETTER,
    A_STRONG_DOUBLE_LETTER
} DoubleLetterLevel;

typedef enum {
    // Correction for MATCH_CHAR
    CT_MATCH,
    // Correction for PROXIMITY_CHAR
    CT_PROXIMITY,
    // Correction for ADDITIONAL_PROXIMITY_CHAR
    CT_ADDITIONAL_PROXIMITY,
    // Correction for SUBSTITUTION_CHAR
    CT_SUBSTITUTION,
    // Skip one omitted letter
    CT_OMISSION,
    // Delete an unnecessarily inserted letter
    CT_INSERTION,
    // Swap the order of next two touch points
    CT_TRANSPOSITION,
    CT_COMPLETION,
    CT_TERMINAL,
    CT_TERMINAL_INSERTION,
    // Create new word with space omission
    CT_NEW_WORD_SPACE_OMISSION,
    // Create new word with space substitution
    CT_NEW_WORD_SPACE_SUBSTITUTION,
} CorrectionType;

// ErrorType is mainly decided by CorrectionType but it is also depending on if
// the correction has really been performed or not.
typedef enum {
    // Substitution, omission and transposition
    ET_EDIT_CORRECTION,
    // Proximity error
    ET_PROXIMITY_CORRECTION,
    // Completion
    ET_COMPLETION,
    // New word
    // TODO: Remove.
    // A new word error should be an edit correction error or a proximity correction error.
    ET_NEW_WORD,
    // Treat error as an intentional omission when the CorrectionType is omission and the node can
    // be intentional omission.
    ET_INTENTIONAL_OMISSION,
    // Not treated as an error. Tracked for checking exact match
    ET_NOT_AN_ERROR
} ErrorType;
#endif // LATINIME_DEFINES_H
diff --git a/src/aosp/suggest/core/dicnode/dic_node.cpp b/src/aosp/suggest/core/dicnode/dic_node.cpp new file mode 100644 index 0000000..de088c7 --- /dev/null +++ b/src/aosp/suggest/core/dicnode/dic_node.cpp
@@ -0,0 +1,44 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dicnode/dic_node.h" + +namespace latinime { + +DicNode::DicNode(const DicNode &dicNode) + : +#if DEBUG_DICT + mProfiler(dicNode.mProfiler), +#endif + mDicNodeProperties(dicNode.mDicNodeProperties), mDicNodeState(dicNode.mDicNodeState), + mIsCachedForNextSuggestion(dicNode.mIsCachedForNextSuggestion), mIsUsed(dicNode.mIsUsed), + mReleaseListener(0) { + /* empty */ +} + +DicNode &DicNode::operator=(const DicNode &dicNode) { +#if DEBUG_DICT + mProfiler = dicNode.mProfiler; +#endif + mDicNodeProperties = dicNode.mDicNodeProperties; + mDicNodeState = dicNode.mDicNodeState; + mIsCachedForNextSuggestion = dicNode.mIsCachedForNextSuggestion; + mIsUsed = dicNode.mIsUsed; + mReleaseListener = dicNode.mReleaseListener; + return *this; +} + +} // namespace latinime
diff --git a/src/aosp/suggest/core/dicnode/dic_node.h b/src/aosp/suggest/core/dicnode/dic_node.h new file mode 100644 index 0000000..49cfdec --- /dev/null +++ b/src/aosp/suggest/core/dicnode/dic_node.h
@@ -0,0 +1,648 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DIC_NODE_H
+#define LATINIME_DIC_NODE_H
+
+#include "defines.h"
+#include "suggest/core/dicnode/dic_node_profiler.h"
+#include "suggest/core/dicnode/dic_node_release_listener.h"
+#include "suggest/core/dicnode/internal/dic_node_state.h"
+#include "suggest/core/dicnode/internal/dic_node_properties.h"
+#include "suggest/core/dictionary/digraph_utils.h"
+#include "utils/char_utils.h"
+
+// Debug-only logging helpers. Both macros expand to nothing unless DEBUG_DICT is set. They are
+// meant to be expanded inside DicNode member functions: they call DicNode getters directly, and
+// LOGI_SHOW_ADD_COST_PROP additionally expects a local named inputSize to be in scope.
+#if DEBUG_DICT
+#define LOGI_SHOW_ADD_COST_PROP \
+        do { char charBuf[50]; \
+        INTS_TO_CHARS(getOutputWordBuf(), getNodeCodePointCount(), charBuf, NELEMS(charBuf)); \
+        AKLOGI("%20s, \"%c\", size = %03d, total = %03d, index(0) = %02d, dist = %.4f, %s,,", \
+                __FUNCTION__, getNodeCodePoint(), inputSize, getTotalInputIndex(), \
+                getInputIndex(0), getNormalizedCompoundDistance(), charBuf); } while (0)
+#define DUMP_WORD_AND_SCORE(header) \
+        do { char charBuf[50]; char prevWordCharBuf[50]; \
+        INTS_TO_CHARS(getOutputWordBuf(), getNodeCodePointCount(), charBuf, NELEMS(charBuf)); \
+        INTS_TO_CHARS(mDicNodeState.mDicNodeStatePrevWord.mPrevWord, \
+                mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(), prevWordCharBuf, \
+                NELEMS(prevWordCharBuf)); \
+        AKLOGI("#%8s, %5f, %5f, %5f, %5f, %s, %s, %d, %5f,", header, \
+                getSpatialDistanceForScoring(), getLanguageDistanceForScoring(), \
+                getNormalizedCompoundDistance(), getRawLength(), prevWordCharBuf, charBuf, \
+                getInputIndex(0), getNormalizedCompoundDistanceAfterFirstWord()); \
+        } while (0)
+#else
+#define LOGI_SHOW_ADD_COST_PROP
+#define DUMP_WORD_AND_SCORE(header)
+#endif
+
+namespace latinime {
+
+// This struct is purely a bucket to return values. No instances of this struct should be kept.
+struct DicNode_InputStateG {
+    DicNode_InputStateG()
+            : mNeedsToUpdateInputStateG(false), mPointerId(0), mInputIndex(0),
+              mPrevCodePoint(0), mTerminalDiffCost(0.0f), mRawLength(0.0f),
+              mDoubleLetterLevel(NOT_A_DOUBLE_LETTER) {}
+
+    bool mNeedsToUpdateInputStateG;
+    int mPointerId;
+    int16_t mInputIndex;
+    int mPrevCodePoint;
+    float mTerminalDiffCost;
+    float mRawLength;
+    DoubleLetterLevel mDoubleLetterLevel;
+};
+
+// A dictionary node together with the suggestion state accumulated on the way to it. It bundles
+// the node's properties (mDicNodeProperties), the state attached to it (mDicNodeState: input,
+// output, scoring and previous-word sub-states) and the bookkeeping used when the node is pooled
+// (mIsUsed, mReleaseListener).
+class DicNode {
+    // Caveat: We define Weighting as a friend class of DicNode to let Weighting change
+    // the distance of DicNode.
+    // Caution!!! In general, we avoid using the "friend" access modifier.
+    // This is an exception to explicitly hide DicNode::addCost() from all classes but Weighting.
+    friend class Weighting;
+
+ public:
+#if DEBUG_DICT
+    DicNodeProfiler mProfiler;
+#endif
+    //////////////////
+    // Memory utils //
+    //////////////////
+    // Releases |node|: marks it unused and notifies its release listener, if any (see remove()).
+    AK_FORCE_INLINE static void managedDelete(DicNode *node) {
+        node->remove();
+    }
+    // end
+    /////////////////
+
+    AK_FORCE_INLINE DicNode()
+            :
+#if DEBUG_DICT
+              mProfiler(),
+#endif
+              mDicNodeProperties(), mDicNodeState(), mIsCachedForNextSuggestion(false),
+              mIsUsed(false), mReleaseListener(0) {}
+
+    DicNode(const DicNode &dicNode);
+    DicNode &operator=(const DicNode &dicNode);
+    virtual ~DicNode() {}
+
+    // Init for copy
+    void initByCopy(const DicNode *dicNode) {
+        mIsUsed = true;
+        mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion;
+        mDicNodeProperties.init(&dicNode->mDicNodeProperties);
+        mDicNodeState.init(&dicNode->mDicNodeState);
+        PROF_NODE_COPY(&dicNode->mProfiler, mProfiler);
+    }
+
+    // Init for root with prevWordNodePos which is used for bigram
+    void initAsRoot(const int rootGroupPos, const int prevWordNodePos) {
+        mIsUsed = true;
+        mIsCachedForNextSuggestion = false;
+        mDicNodeProperties.init(
+                NOT_A_DICT_POS /* pos */, rootGroupPos, NOT_A_CODE_POINT /* nodeCodePoint */,
+                NOT_A_PROBABILITY /* probability */, false /* isTerminal */,
+                true /* hasChildren */, false /* isBlacklistedOrNotAWord */, 0 /* depth */,
+                0 /* terminalDepth */);
+        mDicNodeState.init(prevWordNodePos);
+        PROF_NODE_RESET(mProfiler);
+    }
+
+    // Init for root with previous word
+    void initAsRootWithPreviousWord(DicNode *dicNode, const int rootGroupPos) {
+        mIsUsed = true;
+        mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion;
+        mDicNodeProperties.init(
+                NOT_A_DICT_POS /* pos */, rootGroupPos, NOT_A_CODE_POINT /* nodeCodePoint */,
+                NOT_A_PROBABILITY /* probability */, false /* isTerminal */,
+                true /* hasChildren */, false /* isBlacklistedOrNotAWord */, 0 /* depth */,
+                0 /* terminalDepth */);
+        // TODO: Move to dicNodeState?
+        mDicNodeState.mDicNodeStateOutput.init(); // reset for next word
+        mDicNodeState.mDicNodeStateInput.init(
+                &dicNode->mDicNodeState.mDicNodeStateInput, true /* resetTerminalDiffCost */);
+        mDicNodeState.mDicNodeStateScoring.init(
+                &dicNode->mDicNodeState.mDicNodeStateScoring);
+        mDicNodeState.mDicNodeStatePrevWord.init(
+                dicNode->mDicNodeState.mDicNodeStatePrevWord.getPrevWordCount() + 1,
+                dicNode->mDicNodeProperties.getProbability(),
+                dicNode->mDicNodeProperties.getPos(),
+                dicNode->mDicNodeState.mDicNodeStatePrevWord.mPrevWord,
+                dicNode->mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(),
+                dicNode->getOutputWordBuf(),
+                dicNode->mDicNodeProperties.getDepth(),
+                dicNode->mDicNodeState.mDicNodeStatePrevWord.getSecondWordFirstInputIndex(),
+                mDicNodeState.mDicNodeStateInput.getInputIndex(0) /* lastInputIndex */);
+        PROF_NODE_COPY(&dicNode->mProfiler, mProfiler);
+    }
+
+    // Init as a copy of |parentNode|, passing the code point found in the parent's output buffer
+    // at the parent's current depth (getNodeTypedCodePoint()) to the properties.
+    void initAsPassingChild(DicNode *parentNode) {
+        mIsUsed = true;
+        mIsCachedForNextSuggestion = parentNode->mIsCachedForNextSuggestion;
+        const int c = parentNode->getNodeTypedCodePoint();
+        mDicNodeProperties.init(&parentNode->mDicNodeProperties, c);
+        mDicNodeState.init(&parentNode->mDicNodeState);
+        PROF_NODE_COPY(&parentNode->mProfiler, mProfiler);
+    }
+
+    // Init as a child of |dicNode|. The new depth is the parent's depth + 1 and the new leaving
+    // depth is the parent's leaving depth + mergedNodeCodePointCount. mergedNodeCodePoints[0] is
+    // read unconditionally, so the array must contain at least one code point.
+    void initAsChild(const DicNode *const dicNode, const int pos, const int childrenPos,
+            const int probability, const bool isTerminal, const bool hasChildren,
+            const bool isBlacklistedOrNotAWord, const uint16_t mergedNodeCodePointCount,
+            const int *const mergedNodeCodePoints) {
+        mIsUsed = true;
+        uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1);
+        mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion;
+        const uint16_t newLeavingDepth = static_cast<uint16_t>(
+                dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount);
+        mDicNodeProperties.init(pos, childrenPos, mergedNodeCodePoints[0], probability,
+                isTerminal, hasChildren, isBlacklistedOrNotAWord, newDepth, newLeavingDepth);
+        mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount,
+                mergedNodeCodePoints);
+        PROF_NODE_COPY(&dicNode->mProfiler, mProfiler);
+    }
+
+    // Marks this node as unused and reports the release to the listener when one is attached.
+    AK_FORCE_INLINE void remove() {
+        mIsUsed = false;
+        if (mReleaseListener) {
+            mReleaseListener->onReleased(this);
+        }
+    }
+
+    bool isUsed() const {
+        return mIsUsed;
+    }
+
+    bool isRoot() const {
+        return getNodeCodePointCount() == 0;
+    }
+
+    bool hasChildren() const {
+        return mDicNodeProperties.hasChildren();
+    }
+
+    bool isLeavingNode() const {
+        ASSERT(getNodeCodePointCount() <= mDicNodeProperties.getLeavingDepth());
+        return getNodeCodePointCount() == mDicNodeProperties.getLeavingDepth();
+    }
+
+    AK_FORCE_INLINE bool isFirstLetter() const {
+        return getNodeCodePointCount() == 1;
+    }
+
+    bool isCached() const {
+        return mIsCachedForNextSuggestion;
+    }
+
+    void setCached() {
+        mIsCachedForNextSuggestion = true;
+    }
+
+    // Used to expand the node in DicNodeUtils
+    int getNodeTypedCodePoint() const {
+        return mDicNodeState.mDicNodeStateOutput.getCodePointAt(getNodeCodePointCount());
+    }
+
+    // Check if the current word and the previous word can be considered as a valid multiple word
+    // suggestion.
+    bool isValidMultipleWordSuggestion() const {
+        if (isBlacklistedOrNotAWord()) {
+            return false;
+        }
+        // Treat suggestion as invalid if the current and the previous word are single character
+        // words.
+        const int prevWordLen = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength()
+                - mDicNodeState.mDicNodeStatePrevWord.getPrevWordStart() - 1;
+        const int currentWordLen = getNodeCodePointCount();
+        return (prevWordLen != 1 || currentWordLen != 1);
+    }
+
+    bool isFirstCharUppercase() const {
+        const int c = getOutputWordBuf()[0];
+        return CharUtils::isAsciiUpper(c);
+    }
+
+    bool isFirstWord() const {
+        return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos() == NOT_A_DICT_POS;
+    }
+
+    bool isCompletion(const int inputSize) const {
+        return mDicNodeState.mDicNodeStateInput.getInputIndex(0) >= inputSize;
+    }
+
+    bool canDoLookAheadCorrection(const int inputSize) const {
+        return mDicNodeState.mDicNodeStateInput.getInputIndex(0) < inputSize - 1;
+    }
+
+    // Used to get bigram probability in DicNodeUtils
+    int getPos() const {
+        return mDicNodeProperties.getPos();
+    }
+
+    // Used to get bigram probability in DicNodeUtils
+    int getPrevWordPos() const {
+        return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos();
+    }
+
+    // Used in DicNodeUtils
+    int getChildrenPos() const {
+        return mDicNodeProperties.getChildrenPos();
+    }
+
+    int getProbability() const {
+        return mDicNodeProperties.getProbability();
+    }
+
+    // True when the node is terminal, is not the root (depth > 0) and sits at its leaving depth.
+    AK_FORCE_INLINE bool isTerminalWordNode() const {
+        const bool isTerminalNodes = mDicNodeProperties.isTerminal();
+        const int currentNodeDepth = getNodeCodePointCount();
+        const int terminalNodeDepth = mDicNodeProperties.getLeavingDepth();
+        return isTerminalNodes && currentNodeDepth > 0 && currentNodeDepth == terminalNodeDepth;
+    }
+
+    // True when the node is at depth 0, or when both the current word (depth 1) and the previous
+    // word (length 1 as computed below) are single characters.
+    bool shouldBeFilteredBySafetyNetForBigram() const {
+        const uint16_t currentDepth = getNodeCodePointCount();
+        const int prevWordLen = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength()
+                - mDicNodeState.mDicNodeStatePrevWord.getPrevWordStart() - 1;
+        return !(currentDepth > 0 && (currentDepth != 1 || prevWordLen != 1));
+    }
+
+    bool isTotalInputSizeExceedingLimit() const {
+        const int prevWordsLen = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength();
+        const int currentWordDepth = getNodeCodePointCount();
+        // TODO: 3 can be 2? Needs to be investigated.
+        // TODO: Have a const variable for 3 (or 2)
+        return prevWordsLen + currentWordDepth > MAX_WORD_LENGTH - 3;
+    }
+
+    // TODO: This may be defective. Needs to be revised.
+    // Looks for the index in the previous-word buffer at which inputCommitPoint non-separator
+    // code points (anything but space and single quote) have been seen. Returns false without
+    // modifying the node when startsWith() reports a mismatch against topNode's previous word;
+    // otherwise truncates the input state and the previous-word state and returns true.
+    bool truncateNode(const DicNode *const topNode, const int inputCommitPoint) {
+        const int prevWordLenOfTop = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength();
+        int newPrevWordStartIndex = inputCommitPoint;
+        int charCount = 0;
+        // Find new word start index
+        for (int i = 0; i < prevWordLenOfTop; ++i) {
+            const int c = mDicNodeState.mDicNodeStatePrevWord.getPrevWordCodePointAt(i);
+            // TODO: Check other separators.
+            if (c != KEYCODE_SPACE && c != KEYCODE_SINGLE_QUOTE) {
+                if (charCount == inputCommitPoint) {
+                    newPrevWordStartIndex = i;
+                    break;
+                }
+                ++charCount;
+            }
+        }
+        if (!mDicNodeState.mDicNodeStatePrevWord.startsWith(
+                &topNode->mDicNodeState.mDicNodeStatePrevWord, newPrevWordStartIndex - 1)) {
+            // Node mismatch.
+            return false;
+        }
+        mDicNodeState.mDicNodeStateInput.truncate(inputCommitPoint);
+        mDicNodeState.mDicNodeStatePrevWord.truncate(newPrevWordStartIndex);
+        return true;
+    }
+
+    void outputResult(int *dest) const {
+        const uint16_t prevWordLength = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength();
+        const uint16_t currentDepth = getNodeCodePointCount();
+        DicNodeUtils::appendTwoWords(mDicNodeState.mDicNodeStatePrevWord.mPrevWord,
+                prevWordLength, getOutputWordBuf(), currentDepth, dest);
+        DUMP_WORD_AND_SCORE("OUTPUT");
+    }
+
+    // "Total" in this context (and other methods in this class) means the whole suggestion. When
+    // this represents a multi-word suggestion, the referenced PtNode (in mDicNodeState) is only
+    // the one that corresponds to the last word of the suggestion, and all the previous words
+    // are concatenated together in mPrevWord - which contains a space at the end.
+    int getTotalNodeSpaceCount() const {
+        if (isFirstWord()) return 0;
+        return CharUtils::getSpaceCount(mDicNodeState.mDicNodeStatePrevWord.mPrevWord,
+                mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength());
+    }
+
+    int getSecondWordFirstInputIndex(const ProximityInfoState *const pInfoState) const {
+        const int inputIndex = mDicNodeState.mDicNodeStatePrevWord.getSecondWordFirstInputIndex();
+        if (inputIndex == NOT_AN_INDEX) {
+            return NOT_AN_INDEX;
+        } else {
+            return pInfoState->getInputIndexOfSampledPoint(inputIndex);
+        }
+    }
+
+    bool hasMultipleWords() const {
+        return mDicNodeState.mDicNodeStatePrevWord.getPrevWordCount() > 0;
+    }
+
+    int getProximityCorrectionCount() const {
+        return mDicNodeState.mDicNodeStateScoring.getProximityCorrectionCount();
+    }
+
+    int getEditCorrectionCount() const {
+        return mDicNodeState.mDicNodeStateScoring.getEditCorrectionCount();
+    }
+
+    // Used to prune nodes
+    float getNormalizedCompoundDistance() const {
+        return mDicNodeState.mDicNodeStateScoring.getNormalizedCompoundDistance();
+    }
+
+    // Used to prune nodes
+    float getNormalizedSpatialDistance() const {
+        return mDicNodeState.mDicNodeStateScoring.getSpatialDistance()
+                / static_cast<float>(getInputIndex(0) + 1);
+    }
+
+    // Used to prune nodes
+    float getCompoundDistance() const {
+        return mDicNodeState.mDicNodeStateScoring.getCompoundDistance();
+    }
+
+    // Used to prune nodes
+    float getCompoundDistance(const float languageWeight) const {
+        return mDicNodeState.mDicNodeStateScoring.getCompoundDistance(languageWeight);
+    }
+
+    // Used to commit input partially
+    int getPrevWordNodePos() const {
+        return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos();
+    }
+
+    AK_FORCE_INLINE const int *getOutputWordBuf() const {
+        return mDicNodeState.mDicNodeStateOutput.mCodePointsBuf;
+    }
+
+    int getPrevCodePointG(int pointerId) const {
+        return mDicNodeState.mDicNodeStateInput.getPrevCodePoint(pointerId);
+    }
+
+    // Whether the current codepoint can be an intentional omission, in which case the traversal
+    // algorithm will always check for a possible omission here.
+    bool canBeIntentionalOmission() const {
+        return CharUtils::isIntentionalOmissionCodePoint(getNodeCodePoint());
+    }
+
+    // Whether the omission is so frequent that it should incur zero cost.
+    bool isZeroCostOmission() const {
+        // TODO: do not hardcode and read from header
+        return (getNodeCodePoint() == KEYCODE_SINGLE_QUOTE);
+    }
+
+    // TODO: remove
+    float getTerminalDiffCostG(int path) const {
+        return mDicNodeState.mDicNodeStateInput.getTerminalDiffCost(path);
+    }
+
+    //////////////////////
+    // Temporary getter //
+    // TODO: Remove     //
+    //////////////////////
+    // TODO: Remove once touch path is merged into ProximityInfoState
+    // Note: Returned codepoint may be a digraph codepoint if the node is in a composite glyph.
+    int getNodeCodePoint() const {
+        const int codePoint = mDicNodeProperties.getNodeCodePoint();
+        const DigraphUtils::DigraphCodePointIndex digraphIndex =
+                mDicNodeState.mDicNodeStateScoring.getDigraphIndex();
+        if (digraphIndex == DigraphUtils::NOT_A_DIGRAPH_INDEX) {
+            return codePoint;
+        }
+        return DigraphUtils::getDigraphCodePointForIndex(codePoint, digraphIndex);
+    }
+
+    ////////////////////////////////
+    // Utils for cost calculation //
+    ////////////////////////////////
+    AK_FORCE_INLINE bool isSameNodeCodePoint(const DicNode *const dicNode) const {
+        return mDicNodeProperties.getNodeCodePoint()
+                == dicNode->mDicNodeProperties.getNodeCodePoint();
+    }
+
+    // TODO: remove
+    // TODO: rename getNextInputIndex
+    int16_t getInputIndex(int pointerId) const {
+        return mDicNodeState.mDicNodeStateInput.getInputIndex(pointerId);
+    }
+
+    ////////////////////////////////////
+    // Getter of features for scoring //
+    ////////////////////////////////////
+    float getSpatialDistanceForScoring() const {
+        return mDicNodeState.mDicNodeStateScoring.getSpatialDistance();
+    }
+
+    float getLanguageDistanceForScoring() const {
+        return mDicNodeState.mDicNodeStateScoring.getLanguageDistance();
+    }
+
+    // For space-aware gestures, we store the normalized distance at the char index
+    // that ends the first word of the suggestion. We call this the distance after
+    // first word.
+    float getNormalizedCompoundDistanceAfterFirstWord() const {
+        return mDicNodeState.mDicNodeStateScoring.getNormalizedCompoundDistanceAfterFirstWord();
+    }
+
+    float getLanguageDistanceRatePerWordForScoring() const {
+        const float langDist = getLanguageDistanceForScoring();
+        const float totalWordCount =
+                static_cast<float>(mDicNodeState.mDicNodeStatePrevWord.getPrevWordCount() + 1);
+        return langDist / totalWordCount;
+    }
+
+    float getRawLength() const {
+        return mDicNodeState.mDicNodeStateScoring.getRawLength();
+    }
+
+    bool isLessThanOneErrorForScoring() const {
+        return mDicNodeState.mDicNodeStateScoring.getEditCorrectionCount()
+                + mDicNodeState.mDicNodeStateScoring.getProximityCorrectionCount() <= 1;
+    }
+
+    DoubleLetterLevel getDoubleLetterLevel() const {
+        return mDicNodeState.mDicNodeStateScoring.getDoubleLetterLevel();
+    }
+
+    void setDoubleLetterLevel(DoubleLetterLevel doubleLetterLevel) {
+        mDicNodeState.mDicNodeStateScoring.setDoubleLetterLevel(doubleLetterLevel);
+    }
+
+    bool isInDigraph() const {
+        return mDicNodeState.mDicNodeStateScoring.getDigraphIndex()
+                != DigraphUtils::NOT_A_DIGRAPH_INDEX;
+    }
+
+    void advanceDigraphIndex() {
+        mDicNodeState.mDicNodeStateScoring.advanceDigraphIndex();
+    }
+
+    bool isExactMatch() const {
+        return mDicNodeState.mDicNodeStateScoring.isExactMatch();
+    }
+
+    bool isBlacklistedOrNotAWord() const {
+        return mDicNodeProperties.isBlacklistedOrNotAWord();
+    }
+
+    inline uint16_t getNodeCodePointCount() const {
+        return mDicNodeProperties.getDepth();
+    }
+
+    // Returns code point count including spaces
+    inline uint16_t getTotalNodeCodePointCount() const {
+        return getNodeCodePointCount() + mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength();
+    }
+
+    AK_FORCE_INLINE void dump(const char *tag) const {
+#if DEBUG_DICT
+        DUMP_WORD_AND_SCORE(tag);
+#if DEBUG_DUMP_ERROR
+        mProfiler.dump();
+#endif
+#endif
+    }
+
+    void setReleaseListener(DicNodeReleaseListener *releaseListener) {
+        mReleaseListener = releaseListener;
+    }
+
+    // Returns true when this node ranks ahead of |right|. Criteria, in order: unused nodes
+    // first, exact matches, smaller normalized compound distance (differences within MIN_DIFF
+    // count as ties), smaller depth, smaller code point at the first differing position, and
+    // finally the higher address.
+    AK_FORCE_INLINE bool compare(const DicNode *right) {
+        if (!isUsed() && !right->isUsed()) {
+            // Compare pointer values here for stable comparison
+            return this > right;
+        }
+        if (!isUsed()) {
+            return true;
+        }
+        if (!right->isUsed()) {
+            return false;
+        }
+        // Promote exact matches to prevent them from being pruned.
+        const bool leftExactMatch = isExactMatch();
+        const bool rightExactMatch = right->isExactMatch();
+        if (leftExactMatch != rightExactMatch) {
+            return leftExactMatch;
+        }
+        const float diff =
+                right->getNormalizedCompoundDistance() - getNormalizedCompoundDistance();
+        static const float MIN_DIFF = 0.000001f;
+        if (diff > MIN_DIFF) {
+            return true;
+        } else if (diff < -MIN_DIFF) {
+            return false;
+        }
+        const int depth = getNodeCodePointCount();
+        const int depthDiff = right->getNodeCodePointCount() - depth;
+        if (depthDiff != 0) {
+            return depthDiff > 0;
+        }
+        for (int i = 0; i < depth; ++i) {
+            const int codePoint = mDicNodeState.mDicNodeStateOutput.getCodePointAt(i);
+            const int rightCodePoint = right->mDicNodeState.mDicNodeStateOutput.getCodePointAt(i);
+            if (codePoint != rightCodePoint) {
+                return rightCodePoint > codePoint;
+            }
+        }
+        // Compare pointer values here for stable comparison
+        return this > right;
+    }
+
+ private:
+    DicNodeProperties mDicNodeProperties;
+    DicNodeState mDicNodeState;
+    // TODO: Remove
+    bool mIsCachedForNextSuggestion;
+    bool mIsUsed;
+    DicNodeReleaseListener *mReleaseListener;
+
+    // Sum of the input indices of all MAX_POINTER_COUNT_G pointers.
+    AK_FORCE_INLINE int getTotalInputIndex() const {
+        int index = 0;
+        for (int i = 0; i < MAX_POINTER_COUNT_G; i++) {
+            index += mDicNodeState.mDicNodeStateInput.getInputIndex(i);
+        }
+        return index;
+    }
+
+    // Caveat: Must not be called outside Weighting
+    // This restriction is guaranteed by "friend"
+    AK_FORCE_INLINE void addCost(const float spatialCost, const float languageCost,
+            const bool doNormalization, const int inputSize, const ErrorType errorType) {
+        if (DEBUG_GEO_FULL) {
+            LOGI_SHOW_ADD_COST_PROP;
+        }
+        mDicNodeState.mDicNodeStateScoring.addCost(spatialCost, languageCost, doNormalization,
+                inputSize, getTotalInputIndex(), errorType);
+    }
+
+    // Saves the current normalized compound distance for space-aware gestures.
+    // See getNormalizedCompoundDistanceAfterFirstWord for details.
+    AK_FORCE_INLINE void saveNormalizedCompoundDistanceAfterFirstWordIfNoneYet() {
+        mDicNodeState.mDicNodeStateScoring.saveNormalizedCompoundDistanceAfterFirstWordIfNoneYet();
+    }
+
+    // Caveat: Must not be called outside Weighting
+    // This restriction is guaranteed by "friend"
+    AK_FORCE_INLINE void forwardInputIndex(const int pointerId, const int count,
+            const bool overwritesPrevCodePointByNodeCodePoint) {
+        if (count == 0) {
+            return;
+        }
+        mDicNodeState.mDicNodeStateInput.forwardInputIndex(pointerId, count);
+        if (overwritesPrevCodePointByNodeCodePoint) {
+            // NOTE(review): the previous code point is always stored for pointer 0 rather than
+            // for |pointerId| -- confirm this is intended.
+            mDicNodeState.mDicNodeStateInput.setPrevCodePoint(0, getNodeCodePoint());
+        }
+    }
+
+    // Applies the values carried by |inputStateG| to the input and scoring states. When exactly
+    // one previous word exists and this node is the first letter of the current word, the input
+    // index is also recorded as the first input index of the second word.
+    AK_FORCE_INLINE void updateInputIndexG(const DicNode_InputStateG *const inputStateG) {
+        if (mDicNodeState.mDicNodeStatePrevWord.getPrevWordCount() == 1 && isFirstLetter()) {
+            mDicNodeState.mDicNodeStatePrevWord.setSecondWordFirstInputIndex(
+                    inputStateG->mInputIndex);
+        }
+        mDicNodeState.mDicNodeStateInput.updateInputIndexG(inputStateG->mPointerId,
+                inputStateG->mInputIndex, inputStateG->mPrevCodePoint,
+                inputStateG->mTerminalDiffCost, inputStateG->mRawLength);
+        mDicNodeState.mDicNodeStateScoring.addRawLength(inputStateG->mRawLength);
+        mDicNodeState.mDicNodeStateScoring.setDoubleLetterLevel(inputStateG->mDoubleLetterLevel);
+    }
+};
+} // namespace latinime
+#endif // LATINIME_DIC_NODE_H
diff --git a/src/aosp/suggest/core/dicnode/dic_node_priority_queue.h b/src/aosp/suggest/core/dicnode/dic_node_priority_queue.h new file mode 100644 index 0000000..7461f0c --- /dev/null +++ b/src/aosp/suggest/core/dicnode/dic_node_priority_queue.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DIC_NODE_PRIORITY_QUEUE_H
+#define LATINIME_DIC_NODE_PRIORITY_QUEUE_H
+
+#include <queue>
+#include <vector>
+
+#include "defines.h"
+#include "suggest/core/dicnode/dic_node.h"
+#include "suggest/core/dicnode/dic_node_release_listener.h"
+
+namespace latinime {
+
+// Bounded priority queue of DicNodes backed by a fixed pool of mCapacity + 1 nodes.
+//
+// mDicNodesQueue only stores pointers into the pool (mDicNodesBuf). Its top() is the node that
+// DicNode::compare() ranks last, i.e. the first one to be evicted when the queue is full. Unused
+// pool slots form a singly linked free list: mNextUnusedNodeId is the head and
+// mUnusedNodeIndices[i] is the slot following i (NOT_A_NODE_ID for the last free slot and for
+// slots that are in use). The extra pool slot lets copyPush() take a slot for the incoming node
+// before the worst node is evicted.
+class DicNodePriorityQueue : public DicNodeReleaseListener {
+ public:
+    // Creates a queue able to hold up to |capacity| nodes; the maximum size starts at capacity.
+    AK_FORCE_INLINE explicit DicNodePriorityQueue(const int capacity)
+            : mCapacity(capacity), mMaxSize(capacity), mDicNodesBuf(),
+              mUnusedNodeIndices(), mNextUnusedNodeId(0), mDicNodesQueue() {
+        mDicNodesBuf.resize(mCapacity + 1);
+        mUnusedNodeIndices.resize(mCapacity + 1);
+        clearAndResizeToCapacity();
+    }
+
+    // Non virtual inline destructor -- never inherit this class
+    AK_FORCE_INLINE ~DicNodePriorityQueue() {}
+
+    int getSize() const {
+        return static_cast<int>(mDicNodesQueue.size());
+    }
+
+    int getMaxSize() const {
+        return mMaxSize;
+    }
+
+    // Sets the number of nodes the queue may hold, clamped to the pool capacity.
+    AK_FORCE_INLINE void setMaxSize(const int maxSize) {
+        ASSERT(maxSize <= mCapacity);
+        mMaxSize = min(maxSize, mCapacity);
+    }
+
+    AK_FORCE_INLINE void clearAndResizeToCapacity() {
+        clearAndResize(mCapacity);
+    }
+
+    AK_FORCE_INLINE void clear() {
+        clearAndResize(mMaxSize);
+    }
+
+    // Empties the queue, releases every pool node and rebuilds the free list 0 -> 1 -> ... ->
+    // mCapacity.
+    AK_FORCE_INLINE void clearAndResize(const int maxSize) {
+        ASSERT(maxSize <= mCapacity);
+        while (!mDicNodesQueue.empty()) {
+            mDicNodesQueue.pop();
+        }
+        setMaxSize(maxSize);
+        for (int i = 0; i < mCapacity + 1; ++i) {
+            mDicNodesBuf[i].remove();
+            mDicNodesBuf[i].setReleaseListener(this);
+            mUnusedNodeIndices[i] = (i == mCapacity) ? NOT_A_NODE_ID : i + 1;
+        }
+        mNextUnusedNodeId = 0;
+    }
+
+    // Copy
+    AK_FORCE_INLINE DicNode *copyPush(DicNode *dicNode) {
+        return copyPush(dicNode, mMaxSize);
+    }
+
+    // Pops the top node. When |dest| is non-null the node is copied into it first.
+    AK_FORCE_INLINE void copyPop(DicNode *dest) {
+        if (mDicNodesQueue.empty()) {
+            ASSERT(false);
+            return;
+        }
+        DicNode *node = mDicNodesQueue.top();
+        if (dest) {
+            DicNodeUtils::initByCopy(node, dest);
+        }
+        node->remove();
+        mDicNodesQueue.pop();
+    }
+
+    // DicNodeReleaseListener callback: puts the slot of |dicNode| back on the free list.
+    void onReleased(DicNode *dicNode) {
+        const int index = static_cast<int>(dicNode - &mDicNodesBuf[0]);
+        // Validate before touching mUnusedNodeIndices. DicNode::operator= copies the release
+        // listener, so a node living outside of this pool can end up reporting to it.
+        if (!isValidNodeIndex(index)) {
+            ASSERT(false);
+            return;
+        }
+        if (mUnusedNodeIndices[index] != NOT_A_NODE_ID) {
+            // it's already released
+            return;
+        }
+        mUnusedNodeIndices[index] = mNextUnusedNodeId;
+        mNextUnusedNodeId = index;
+    }
+
+    AK_FORCE_INLINE void dump() const {
+        AKLOGI("\n\n\n\n\n===========================");
+        for (int i = 0; i < mCapacity + 1; ++i) {
+            if (mDicNodesBuf[i].isUsed()) {
+                mDicNodesBuf[i].dump("QUEUE: ");
+            }
+        }
+        AKLOGI("===========================\n\n\n\n\n");
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DicNodePriorityQueue);
+    static const int NOT_A_NODE_ID = -1;
+
+    AK_FORCE_INLINE static bool compareDicNode(DicNode *left, DicNode *right) {
+        return left->compare(right);
+    }
+
+    struct DicNodeComparator {
+        bool operator ()(DicNode *left, DicNode *right) const {
+            return compareDicNode(left, right);
+        }
+    };
+
+    typedef std::priority_queue<DicNode *, std::vector<DicNode *>, DicNodeComparator> DicNodesQueue;
+    const int mCapacity;
+    int mMaxSize;
+    std::vector<DicNode> mDicNodesBuf; // node pool, mCapacity + 1 elements
+    // Free-list link of each element of mDicNodesBuf respectively
+    std::vector<int> mUnusedNodeIndices;
+    int mNextUnusedNodeId;
+    DicNodesQueue mDicNodesQueue;
+
+    inline bool isFull(const int maxSize) const {
+        return getSize() >= maxSize;
+    }
+
+    // Whether |index| designates an element of mDicNodesBuf.
+    inline bool isValidNodeIndex(const int index) const {
+        return index >= 0 && index < (mCapacity + 1);
+    }
+
+    AK_FORCE_INLINE void pop() {
+        copyPop(0);
+    }
+
+    // Whether |dicNode| ranks ahead of the current worst node. An empty queue has no worst
+    // node, so anything is better; top() must not be called in that case.
+    AK_FORCE_INLINE bool betterThanWorstDicNode(DicNode *dicNode) const {
+        if (mDicNodesQueue.empty()) {
+            return true;
+        }
+        DicNode *worstNode = mDicNodesQueue.top();
+        if (!worstNode) {
+            return true;
+        }
+        return compareDicNode(dicNode, worstNode);
+    }
+
+    // Takes the head of the free list, or returns 0 when the pool is empty or exhausted.
+    AK_FORCE_INLINE DicNode *searchEmptyDicNode() {
+        if (mCapacity == 0) {
+            return 0;
+        }
+        if (mNextUnusedNodeId == NOT_A_NODE_ID) {
+            AKLOGI("No unused node found.");
+            for (int i = 0; i < mCapacity + 1; ++i) {
+                AKLOGI("Dump node availability, %d, %d, %d",
+                        i, mDicNodesBuf[i].isUsed(), mUnusedNodeIndices[i]);
+            }
+            ASSERT(false);
+            return 0;
+        }
+        DicNode *dicNode = &mDicNodesBuf[mNextUnusedNodeId];
+        markNodeAsUsed(dicNode);
+        return dicNode;
+    }
+
+    // Unlinks |dicNode|, which must be the head of the free list, from that list.
+    AK_FORCE_INLINE void markNodeAsUsed(DicNode *dicNode) {
+        const int index = static_cast<int>(dicNode - &mDicNodesBuf[0]);
+        ASSERT(isValidNodeIndex(index));
+        mNextUnusedNodeId = mUnusedNodeIndices[index];
+        mUnusedNodeIndices[index] = NOT_A_NODE_ID;
+    }
+
+    // Pushes a pool node. Returns the node when it was queued, or 0 when it was rejected (in
+    // which case it has been released back to the pool).
+    AK_FORCE_INLINE DicNode *pushPoolNodeWithMaxSize(DicNode *dicNode, const int maxSize) {
+        if (!dicNode) {
+            return 0;
+        }
+        if (!isFull(maxSize)) {
+            mDicNodesQueue.push(dicNode);
+            return dicNode;
+        }
+        // A queue that is both full and empty (maxSize <= 0) cannot hold anything: there is no
+        // worst node to evict, so the incoming node is rejected.
+        if (!mDicNodesQueue.empty() && betterThanWorstDicNode(dicNode)) {
+            pop();
+            mDicNodesQueue.push(dicNode);
+            return dicNode;
+        }
+        dicNode->remove();
+        return 0;
+    }
+
+    // Copy
+    AK_FORCE_INLINE DicNode *copyPush(DicNode *dicNode, const int maxSize) {
+        return pushPoolNodeWithMaxSize(newDicNode(dicNode), maxSize);
+    }
+
+    AK_FORCE_INLINE DicNode *newDicNode(DicNode *dicNode) {
+        DicNode *newNode = searchEmptyDicNode();
+        if (newNode) {
+            DicNodeUtils::initByCopy(dicNode, newNode);
+        }
+        return newNode;
+    }
+
+};
+} // namespace latinime
+#endif // LATINIME_DIC_NODE_PRIORITY_QUEUE_H
diff --git a/src/aosp/suggest/core/dicnode/dic_node_profiler.h b/src/aosp/suggest/core/dicnode/dic_node_profiler.h new file mode 100644 index 0000000..1f4d257 --- /dev/null +++ b/src/aosp/suggest/core/dicnode/dic_node_profiler.h
@@ -0,0 +1,188 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_PROFILER_H +#define LATINIME_DIC_NODE_PROFILER_H + +#include "defines.h" + +#if DEBUG_DICT +#define PROF_SPACE_SUBSTITUTION(profiler) profiler.profSpaceSubstitution() +#define PROF_SPACE_OMISSION(profiler) profiler.profSpaceOmission() +#define PROF_ADDITIONAL_PROXIMITY(profiler) profiler.profAdditionalProximity() +#define PROF_SUBSTITUTION(profiler) profiler.profSubstitution() +#define PROF_OMISSION(profiler) profiler.profOmission() +#define PROF_INSERTION(profiler) profiler.profInsertion() +#define PROF_MATCH(profiler) profiler.profMatch() +#define PROF_COMPLETION(profiler) profiler.profCompletion() +#define PROF_TRANSPOSITION(profiler) profiler.profTransposition() +#define PROF_NEARESTKEY(profiler) profiler.profNearestKey() +#define PROF_TERMINAL(profiler) profiler.profTerminal() +#define PROF_TERMINAL_INSERTION(profiler) profiler.profTerminalInsertion() +#define PROF_NEW_WORD(profiler) profiler.profNewWord() +#define PROF_NEW_WORD_BIGRAM(profiler) profiler.profNewWordBigram() +#define PROF_NODE_RESET(profiler) profiler.reset() +#define PROF_NODE_COPY(src, dest) dest.copy(src) +#else +#define PROF_SPACE_SUBSTITUTION(profiler) +#define PROF_SPACE_OMISSION(profiler) +#define PROF_ADDITONAL_PROXIMITY(profiler) +#define PROF_SUBSTITUTION(profiler) +#define PROF_OMISSION(profiler) +#define 
PROF_INSERTION(profiler) +#define PROF_MATCH(profiler) +#define PROF_COMPLETION(profiler) +#define PROF_TRANSPOSITION(profiler) +#define PROF_NEARESTKEY(profiler) +#define PROF_TERMINAL(profiler) +#define PROF_TERMINAL_INSERTION(profiler) +#define PROF_NEW_WORD(profiler) +#define PROF_NEW_WORD_BIGRAM(profiler) +#define PROF_NODE_RESET(profiler) +#define PROF_NODE_COPY(src, dest) +#endif + +namespace latinime { + +class DicNodeProfiler { + public: +#if DEBUG_DICT + AK_FORCE_INLINE DicNodeProfiler() + : mProfOmission(0), mProfInsertion(0), mProfTransposition(0), + mProfAdditionalProximity(0), mProfSubstitution(0), + mProfSpaceSubstitution(0), mProfSpaceOmission(0), + mProfMatch(0), mProfCompletion(0), mProfTerminal(0), mProfTerminalInsertion(0), + mProfNearestKey(0), mProfNewWord(0), mProfNewWordBigram(0) {} + + int mProfOmission; + int mProfInsertion; + int mProfTransposition; + int mProfAdditionalProximity; + int mProfSubstitution; + int mProfSpaceSubstitution; + int mProfSpaceOmission; + int mProfMatch; + int mProfCompletion; + int mProfTerminal; + int mProfTerminalInsertion; + int mProfNearestKey; + int mProfNewWord; + int mProfNewWordBigram; + + void profSpaceSubstitution() { + ++mProfSpaceSubstitution; + } + + void profSpaceOmission() { + ++mProfSpaceOmission; + } + + void profAdditionalProximity() { + ++mProfAdditionalProximity; + } + + void profSubstitution() { + ++mProfSubstitution; + } + + void profOmission() { + ++mProfOmission; + } + + void profInsertion() { + ++mProfInsertion; + } + + void profMatch() { + ++mProfMatch; + } + + void profCompletion() { + ++mProfCompletion; + } + + void profTransposition() { + ++mProfTransposition; + } + + void profNearestKey() { + ++mProfNearestKey; + } + + void profTerminal() { + ++mProfTerminal; + } + + void profTerminalInsertion() { + ++mProfTerminalInsertion; + } + + void profNewWord() { + ++mProfNewWord; + } + + void profNewWordBigram() { + ++mProfNewWordBigram; + } + + void reset() { + mProfSpaceSubstitution = 0; + 
mProfSpaceOmission = 0; + mProfAdditionalProximity = 0; + mProfSubstitution = 0; + mProfOmission = 0; + mProfInsertion = 0; + mProfMatch = 0; + mProfCompletion = 0; + mProfTransposition = 0; + mProfNearestKey = 0; + mProfTerminal = 0; + mProfNewWord = 0; + mProfNewWordBigram = 0; + } + + void copy(const DicNodeProfiler *const profiler) { + mProfSpaceSubstitution = profiler->mProfSpaceSubstitution; + mProfSpaceOmission = profiler->mProfSpaceOmission; + mProfAdditionalProximity = profiler->mProfAdditionalProximity; + mProfSubstitution = profiler->mProfSubstitution; + mProfOmission = profiler->mProfOmission; + mProfInsertion = profiler->mProfInsertion; + mProfMatch = profiler->mProfMatch; + mProfCompletion = profiler->mProfCompletion; + mProfTransposition = profiler->mProfTransposition; + mProfNearestKey = profiler->mProfNearestKey; + mProfTerminal = profiler->mProfTerminal; + mProfNewWord = profiler->mProfNewWord; + mProfNewWordBigram = profiler->mProfNewWordBigram; + } + + void dump() const { + AKLOGI("O %d, I %d, T %d, AP %d, S %d, SS %d, SO %d, M %d, C %d, TE %d, NW = %d, NWB = %d", + mProfOmission, mProfInsertion, mProfTransposition, mProfAdditionalProximity, + mProfSubstitution, mProfSpaceSubstitution, mProfSpaceOmission, mProfMatch, + mProfCompletion, mProfTerminal, mProfNewWord, mProfNewWordBigram); + } +#else + DicNodeProfiler() {} +#endif + private: + // Caution!!! + // Use a default copy constructor and an assign operator because shallow copies are ok + // for this class +}; +} +#endif // LATINIME_DIC_NODE_PROFILER_H
diff --git a/src/aosp/suggest/core/dicnode/dic_node_release_listener.h b/src/aosp/suggest/core/dicnode/dic_node_release_listener.h new file mode 100644 index 0000000..2ca4f21 --- /dev/null +++ b/src/aosp/suggest/core/dicnode/dic_node_release_listener.h
@@ -0,0 +1,35 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_RELEASE_LISTENER_H +#define LATINIME_DIC_NODE_RELEASE_LISTENER_H + +#include "defines.h" + +namespace latinime { + +class DicNode; + +class DicNodeReleaseListener { + public: + DicNodeReleaseListener() {} + virtual ~DicNodeReleaseListener() {} + virtual void onReleased(DicNode *dicNode) = 0; + private: + DISALLOW_COPY_AND_ASSIGN(DicNodeReleaseListener); +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_RELEASE_LISTENER_H
diff --git a/src/aosp/suggest/core/dicnode/dic_node_utils.cpp b/src/aosp/suggest/core/dicnode/dic_node_utils.cpp new file mode 100644 index 0000000..ec65114 --- /dev/null +++ b/src/aosp/suggest/core/dicnode/dic_node_utils.cpp
@@ -0,0 +1,136 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dicnode/dic_node_utils.h" + +#include <cstring> + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/dictionary/multi_bigram_map.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "utils/char_utils.h" + +namespace latinime { + +/////////////////////////////// +// Node initialization utils // +/////////////////////////////// + +/* static */ void DicNodeUtils::initAsRoot( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const int prevWordNodePos, DicNode *const newRootNode) { + newRootNode->initAsRoot(dictionaryStructurePolicy->getRootPosition(), prevWordNodePos); +} + +/*static */ void DicNodeUtils::initAsRootWithPreviousWord( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + DicNode *const prevWordLastNode, DicNode *const newRootNode) { + newRootNode->initAsRootWithPreviousWord( + prevWordLastNode, dictionaryStructurePolicy->getRootPosition()); +} + +/* static */ void DicNodeUtils::initByCopy(DicNode *srcNode, DicNode *destNode) { + destNode->initByCopy(srcNode); +} + +/////////////////////////////////// +// Traverse node expansion utils // +/////////////////////////////////// +/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode 
*dicNode, + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + DicNodeVector *childDicNodes) { + if (dicNode->isTotalInputSizeExceedingLimit()) { + return; + } + if (!dicNode->isLeavingNode()) { + childDicNodes->pushPassingChild(dicNode); + } else { + dictionaryStructurePolicy->createAndGetAllChildNodes(dicNode, childDicNodes); + } +} + +/////////////////// +// Scoring utils // +/////////////////// +/** + * Computes the combined bigram / unigram cost for the given dicNode. + */ +/* static */ float DicNodeUtils::getBigramNodeImprobability( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const DicNode *const node, MultiBigramMap *multiBigramMap) { + if (node->hasMultipleWords() && !node->isValidMultipleWordSuggestion()) { + return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); + } + const int probability = getBigramNodeProbability(dictionaryStructurePolicy, node, + multiBigramMap); + // TODO: This equation to calculate the improbability looks unreasonable. Investigate this. + const float cost = static_cast<float>(MAX_PROBABILITY - probability) + / static_cast<float>(MAX_PROBABILITY); + return cost; +} + +/* static */ int DicNodeUtils::getBigramNodeProbability( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const DicNode *const node, MultiBigramMap *multiBigramMap) { + const int unigramProbability = node->getProbability(); + const int wordPos = node->getPos(); + const int prevWordPos = node->getPrevWordPos(); + if (NOT_A_DICT_POS == wordPos || NOT_A_DICT_POS == prevWordPos) { + // Note: Normally wordPos comes from the dictionary and should never equal + // NOT_A_VALID_WORD_POS. 
+ return dictionaryStructurePolicy->getProbability(unigramProbability, + NOT_A_PROBABILITY); + } + if (multiBigramMap) { + return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, prevWordPos, + wordPos, unigramProbability); + } + return dictionaryStructurePolicy->getProbability(unigramProbability, + NOT_A_PROBABILITY); +} + +//////////////// +// Char utils // +//////////////// + +// TODO: Move to char_utils? +/* static */ int DicNodeUtils::appendTwoWords(const int *const src0, const int16_t length0, + const int *const src1, const int16_t length1, int *dest) { + int actualLength0 = 0; + for (int i = 0; i < length0; ++i) { + if (src0[i] == 0) { + break; + } + actualLength0 = i + 1; + } + actualLength0 = min(actualLength0, MAX_WORD_LENGTH); + memcpy(dest, src0, actualLength0 * sizeof(dest[0])); + if (!src1 || length1 == 0) { + return actualLength0; + } + int actualLength1 = 0; + for (int i = 0; i < length1; ++i) { + if (src1[i] == 0) { + break; + } + actualLength1 = i + 1; + } + actualLength1 = min(actualLength1, MAX_WORD_LENGTH - actualLength0); + memcpy(&dest[actualLength0], src1, actualLength1 * sizeof(dest[0])); + return actualLength0 + actualLength1; +} +} // namespace latinime
diff --git a/src/aosp/suggest/core/dicnode/dic_node_utils.h b/src/aosp/suggest/core/dicnode/dic_node_utils.h new file mode 100644 index 0000000..3fb351a --- /dev/null +++ b/src/aosp/suggest/core/dicnode/dic_node_utils.h
@@ -0,0 +1,59 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_UTILS_H +#define LATINIME_DIC_NODE_UTILS_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class DicNode; +class DicNodeVector; +class DictionaryStructureWithBufferPolicy; +class MultiBigramMap; + +class DicNodeUtils { + public: + static int appendTwoWords(const int *src0, const int16_t length0, const int *src1, + const int16_t length1, int *dest); + static void initAsRoot( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const int prevWordNodePos, DicNode *newRootNode); + static void initAsRootWithPreviousWord( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + DicNode *prevWordLastNode, DicNode *newRootNode); + static void initByCopy(DicNode *srcNode, DicNode *destNode); + static void getAllChildDicNodes(DicNode *dicNode, + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + DicNodeVector *childDicNodes); + static float getBigramNodeImprobability( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const DicNode *const node, MultiBigramMap *const multiBigramMap); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DicNodeUtils); + // Max number of bigrams to look up + static const int MAX_BIGRAMS_CONSIDERED_PER_CONTEXT = 500; + + static int getBigramNodeProbability( + 
const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const DicNode *const node, MultiBigramMap *multiBigramMap); +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_UTILS_H
diff --git a/src/aosp/suggest/core/dicnode/dic_node_vector.h b/src/aosp/suggest/core/dicnode/dic_node_vector.h new file mode 100644 index 0000000..42addae --- /dev/null +++ b/src/aosp/suggest/core/dicnode/dic_node_vector.h
@@ -0,0 +1,93 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_VECTOR_H +#define LATINIME_DIC_NODE_VECTOR_H + +#include <vector> + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" + +namespace latinime { + +class DicNodeVector { + public: +#ifdef FLAG_DBG + // 0 will introduce resizing the vector. + static const int DEFAULT_NODES_SIZE_FOR_OPTIMIZATION = 0; +#else + static const int DEFAULT_NODES_SIZE_FOR_OPTIMIZATION = 60; +#endif + AK_FORCE_INLINE DicNodeVector() : mDicNodes(0), mLock(false), mEmptyNode() {} + + // Specify the capacity of the vector + AK_FORCE_INLINE DicNodeVector(const int size) : mDicNodes(0), mLock(false), mEmptyNode() { + mDicNodes.reserve(size); + } + + // Non virtual inline destructor -- never inherit this class + AK_FORCE_INLINE ~DicNodeVector() {} + + AK_FORCE_INLINE void clear() { + mDicNodes.clear(); + mLock = false; + } + + int getSizeAndLock() { + mLock = true; + return static_cast<int>(mDicNodes.size()); + } + + bool exceeds(const size_t limit) const { + return mDicNodes.size() >= limit; + } + + void pushPassingChild(DicNode *dicNode) { + ASSERT(!mLock); + mDicNodes.push_back(mEmptyNode); + mDicNodes.back().initAsPassingChild(dicNode); + } + + void pushLeavingChild(const DicNode *const dicNode, const int pos, const int childrenPos, + const int probability, const bool isTerminal, const bool hasChildren, + const bool 
isBlacklistedOrNotAWord, const uint16_t mergedNodeCodePointCount, + const int *const mergedNodeCodePoints) { + ASSERT(!mLock); + mDicNodes.push_back(mEmptyNode); + mDicNodes.back().initAsChild(dicNode, pos, childrenPos, probability, isTerminal, + hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount, + mergedNodeCodePoints); + } + + DicNode *operator[](const int id) { + ASSERT(id < static_cast<int>(mDicNodes.size())); + return &mDicNodes[id]; + } + + DicNode *front() { + ASSERT(1 <= static_cast<int>(mDicNodes.size())); + return &mDicNodes[0]; + } + + private: + DISALLOW_COPY_AND_ASSIGN(DicNodeVector); + std::vector<DicNode> mDicNodes; + bool mLock; + DicNode mEmptyNode; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_VECTOR_H
diff --git a/src/aosp/suggest/core/dicnode/dic_nodes_cache.cpp b/src/aosp/suggest/core/dicnode/dic_nodes_cache.cpp new file mode 100644 index 0000000..b6be47e --- /dev/null +++ b/src/aosp/suggest/core/dicnode/dic_nodes_cache.cpp
@@ -0,0 +1,64 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <list> + +#include "defines.h" +#include "suggest/core/dicnode/dic_node_priority_queue.h" +#include "suggest/core/dicnode/dic_node_utils.h" +#include "suggest/core/dicnode/dic_nodes_cache.h" + +namespace latinime { + +// The biggest value among MAX_CACHE_DIC_NODE_SIZE, MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT, ... +const int DicNodesCache::LARGE_PRIORITY_QUEUE_CAPACITY = 310; +// Capacity for reducing memory footprint. +const int DicNodesCache::SMALL_PRIORITY_QUEUE_CAPACITY = 100; + +/** + * Truncates all of the dicNodes so that they start at the given commit point. + * Only called for multi-word typing input. + */ +DicNode *DicNodesCache::setCommitPoint(int commitPoint) { + std::list<DicNode> dicNodesList; + while (mCachedDicNodesForContinuousSuggestion->getSize() > 0) { + DicNode dicNode; + mCachedDicNodesForContinuousSuggestion->copyPop(&dicNode); + dicNodesList.push_front(dicNode); + } + + // Get the starting words of the top scoring dicNode (last dicNode popped from priority queue) + // up to the commit point. These words have already been committed to the text view. + DicNode *topDicNode = &dicNodesList.front(); + DicNode topDicNodeCopy; + DicNodeUtils::initByCopy(topDicNode, &topDicNodeCopy); + + // Keep only those dicNodes that match the same starting words. 
+ std::list<DicNode>::iterator iter; + for (iter = dicNodesList.begin(); iter != dicNodesList.end(); iter++) { + DicNode *dicNode = &*iter; + if (dicNode->truncateNode(&topDicNodeCopy, commitPoint)) { + mCachedDicNodesForContinuousSuggestion->copyPush(dicNode); + } else { + // Top dicNode should be reprocessed. + ASSERT(dicNode != topDicNode); + DicNode::managedDelete(dicNode); + } + } + mInputIndex -= commitPoint; + return topDicNode; +} +} // namespace latinime
diff --git a/src/aosp/suggest/core/dicnode/dic_nodes_cache.h b/src/aosp/suggest/core/dicnode/dic_nodes_cache.h new file mode 100644 index 0000000..8493b6a --- /dev/null +++ b/src/aosp/suggest/core/dicnode/dic_nodes_cache.h
@@ -0,0 +1,200 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODES_CACHE_H +#define LATINIME_DIC_NODES_CACHE_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/dicnode/dic_node_priority_queue.h" + +namespace latinime { + +class DicNode; + +/** + * Class for controlling dicNode search priority queue and lexicon trie traversal. + */ +class DicNodesCache { + public: + AK_FORCE_INLINE explicit DicNodesCache(const bool usesLargeCapacityCache) + : mUsesLargeCapacityCache(usesLargeCapacityCache), + mDicNodePriorityQueue0(getCacheCapacity()), + mDicNodePriorityQueue1(getCacheCapacity()), + mDicNodePriorityQueue2(getCacheCapacity()), + mDicNodePriorityQueueForTerminal(MAX_RESULTS), + mActiveDicNodes(&mDicNodePriorityQueue0), + mNextActiveDicNodes(&mDicNodePriorityQueue1), + mCachedDicNodesForContinuousSuggestion(&mDicNodePriorityQueue2), + mTerminalDicNodes(&mDicNodePriorityQueueForTerminal), + mInputIndex(0), mLastCachedInputIndex(0) {} + + AK_FORCE_INLINE virtual ~DicNodesCache() {} + + AK_FORCE_INLINE void reset(const int nextActiveSize, const int terminalSize) { + mInputIndex = 0; + mLastCachedInputIndex = 0; + // We want to use the max capacity for the current active dic node queue. + mActiveDicNodes->clearAndResizeToCapacity(); + // nextActiveSize is used to limit the next iteration's active dic node size. 
+ const int nextActiveSizeFittingToTheCapacity = min(nextActiveSize, getCacheCapacity()); + mNextActiveDicNodes->clearAndResize(nextActiveSizeFittingToTheCapacity); + mTerminalDicNodes->clearAndResize(terminalSize); + // We want to use the max capacity for the cached dic nodes that will be used for the + // continuous suggestion. + mCachedDicNodesForContinuousSuggestion->clearAndResizeToCapacity(); + } + + AK_FORCE_INLINE void continueSearch() { + resetTemporaryCaches(); + restoreActiveDicNodesFromCache(); + } + + AK_FORCE_INLINE void advanceActiveDicNodes() { + if (DEBUG_DICT) { + AKLOGI("Advance active %d nodes.", mNextActiveDicNodes->getSize()); + } + if (DEBUG_DICT_FULL) { + mNextActiveDicNodes->dump(); + } + mNextActiveDicNodes = + moveNodesAndReturnReusableEmptyQueue(mNextActiveDicNodes, &mActiveDicNodes); + } + + DicNode *setCommitPoint(int commitPoint); + + int activeSize() const { return mActiveDicNodes->getSize(); } + int terminalSize() const { return mTerminalDicNodes->getSize(); } + bool isLookAheadCorrectionInputIndex(const int inputIndex) const { + return inputIndex == mInputIndex - 1; + } + void advanceInputIndex(const int inputSize) { + if (mInputIndex < inputSize) { + mInputIndex++; + } + } + + AK_FORCE_INLINE void copyPushTerminal(DicNode *dicNode) { + mTerminalDicNodes->copyPush(dicNode); + } + + AK_FORCE_INLINE void copyPushActive(DicNode *dicNode) { + mActiveDicNodes->copyPush(dicNode); + } + + AK_FORCE_INLINE bool copyPushContinue(DicNode *dicNode) { + return mCachedDicNodesForContinuousSuggestion->copyPush(dicNode); + } + + AK_FORCE_INLINE void copyPushNextActive(DicNode *dicNode) { + DicNode *pushedDicNode = mNextActiveDicNodes->copyPush(dicNode); + if (!pushedDicNode) { + if (dicNode->isCached()) { + dicNode->remove(); + } + // We simply drop any dic node that was not cached, ignoring the slim chance + // that one of its children represents what the user really wanted. 
+ } + } + + void popTerminal(DicNode *dest) { + mTerminalDicNodes->copyPop(dest); + } + + void popActive(DicNode *dest) { + mActiveDicNodes->copyPop(dest); + } + + bool hasCachedDicNodesForContinuousSuggestion() const { + return mCachedDicNodesForContinuousSuggestion + && mCachedDicNodesForContinuousSuggestion->getSize() > 0; + } + + AK_FORCE_INLINE bool isCacheBorderForTyping(const int inputSize) const { + // TODO: Move this variable to header + static const int CACHE_BACK_LENGTH = 3; + const int cacheInputIndex = inputSize - CACHE_BACK_LENGTH; + const bool shouldCache = (cacheInputIndex == mInputIndex) + && (cacheInputIndex != mLastCachedInputIndex); + return shouldCache; + } + + AK_FORCE_INLINE void updateLastCachedInputIndex() { + mLastCachedInputIndex = mInputIndex; + } + + private: + DISALLOW_COPY_AND_ASSIGN(DicNodesCache); + + AK_FORCE_INLINE void restoreActiveDicNodesFromCache() { + if (DEBUG_DICT) { + AKLOGI("Restore %d nodes. inputIndex = %d.", + mCachedDicNodesForContinuousSuggestion->getSize(), mLastCachedInputIndex); + } + if (DEBUG_DICT_FULL || DEBUG_CACHE) { + mCachedDicNodesForContinuousSuggestion->dump(); + } + mInputIndex = mLastCachedInputIndex; + mCachedDicNodesForContinuousSuggestion = moveNodesAndReturnReusableEmptyQueue( + mCachedDicNodesForContinuousSuggestion, &mActiveDicNodes); + } + + AK_FORCE_INLINE static DicNodePriorityQueue *moveNodesAndReturnReusableEmptyQueue( + DicNodePriorityQueue *src, DicNodePriorityQueue **dest) { + const int srcMaxSize = src->getMaxSize(); + const int destMaxSize = (*dest)->getMaxSize(); + DicNodePriorityQueue *tmp = *dest; + *dest = src; + (*dest)->setMaxSize(destMaxSize); + tmp->clearAndResize(srcMaxSize); + return tmp; + } + + AK_FORCE_INLINE int getCacheCapacity() const { + return mUsesLargeCapacityCache ? 
+ LARGE_PRIORITY_QUEUE_CAPACITY : SMALL_PRIORITY_QUEUE_CAPACITY; + } + + AK_FORCE_INLINE void resetTemporaryCaches() { + mActiveDicNodes->clear(); + mNextActiveDicNodes->clear(); + mTerminalDicNodes->clear(); + } + + static const int LARGE_PRIORITY_QUEUE_CAPACITY; + static const int SMALL_PRIORITY_QUEUE_CAPACITY; + + const bool mUsesLargeCapacityCache; + // Instances + DicNodePriorityQueue mDicNodePriorityQueue0; + DicNodePriorityQueue mDicNodePriorityQueue1; + DicNodePriorityQueue mDicNodePriorityQueue2; + DicNodePriorityQueue mDicNodePriorityQueueForTerminal; + + // Active dicNodes currently being expanded. + DicNodePriorityQueue *mActiveDicNodes; + // Next dicNodes to be expanded. + DicNodePriorityQueue *mNextActiveDicNodes; + // Cached dicNodes used for continuous suggestion. + DicNodePriorityQueue *mCachedDicNodesForContinuousSuggestion; + // Current top terminal dicNodes. + DicNodePriorityQueue *mTerminalDicNodes; + int mInputIndex; + int mLastCachedInputIndex; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODES_CACHE_H
diff --git a/src/aosp/suggest/core/dicnode/internal/dic_node_properties.h b/src/aosp/suggest/core/dicnode/internal/dic_node_properties.h new file mode 100644 index 0000000..9e0f62c --- /dev/null +++ b/src/aosp/suggest/core/dicnode/internal/dic_node_properties.h
@@ -0,0 +1,132 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_PROPERTIES_H +#define LATINIME_DIC_NODE_PROPERTIES_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +/** + * Node for traversing the lexicon trie. + */ +// TODO: Introduce a dictionary node class which has attribute members required to understand the +// dictionary structure. +class DicNodeProperties { + public: + AK_FORCE_INLINE DicNodeProperties() + : mPos(0), mChildrenPos(0), mProbability(0), mNodeCodePoint(0), mIsTerminal(false), + mHasChildren(false), mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0) {} + + virtual ~DicNodeProperties() {} + + // Should be called only once per DicNode is initialized. 
+ void init(const int pos, const int childrenPos, const int nodeCodePoint, const int probability, + const bool isTerminal, const bool hasChildren, const bool isBlacklistedOrNotAWord, + const uint16_t depth, const uint16_t leavingDepth) { + mPos = pos; + mChildrenPos = childrenPos; + mNodeCodePoint = nodeCodePoint; + mProbability = probability; + mIsTerminal = isTerminal; + mHasChildren = hasChildren; + mIsBlacklistedOrNotAWord = isBlacklistedOrNotAWord; + mDepth = depth; + mLeavingDepth = leavingDepth; + } + + // Init for copy + void init(const DicNodeProperties *const nodeProp) { + mPos = nodeProp->mPos; + mChildrenPos = nodeProp->mChildrenPos; + mNodeCodePoint = nodeProp->mNodeCodePoint; + mProbability = nodeProp->mProbability; + mIsTerminal = nodeProp->mIsTerminal; + mHasChildren = nodeProp->mHasChildren; + mIsBlacklistedOrNotAWord = nodeProp->mIsBlacklistedOrNotAWord; + mDepth = nodeProp->mDepth; + mLeavingDepth = nodeProp->mLeavingDepth; + } + + // Init as passing child + void init(const DicNodeProperties *const nodeProp, const int codePoint) { + mPos = nodeProp->mPos; + mChildrenPos = nodeProp->mChildrenPos; + mNodeCodePoint = codePoint; // Overwrite the node char of a passing child + mProbability = nodeProp->mProbability; + mIsTerminal = nodeProp->mIsTerminal; + mHasChildren = nodeProp->mHasChildren; + mIsBlacklistedOrNotAWord = nodeProp->mIsBlacklistedOrNotAWord; + mDepth = nodeProp->mDepth + 1; // Increment the depth of a passing child + mLeavingDepth = nodeProp->mLeavingDepth; + } + + int getPos() const { + return mPos; + } + + int getChildrenPos() const { + return mChildrenPos; + } + + int getProbability() const { + return mProbability; + } + + int getNodeCodePoint() const { + return mNodeCodePoint; + } + + uint16_t getDepth() const { + return mDepth; + } + + // TODO: Move to output? 
+ uint16_t getLeavingDepth() const { + return mLeavingDepth; + } + + bool isTerminal() const { + return mIsTerminal; + } + + bool hasChildren() const { + return mHasChildren || mDepth != mLeavingDepth; + } + + bool isBlacklistedOrNotAWord() const { + return mIsBlacklistedOrNotAWord; + } + + private: + // Caution!!! + // Use a default copy constructor and an assign operator because shallow copies are ok + // for this class + int mPos; + int mChildrenPos; + int mProbability; + int mNodeCodePoint; + bool mIsTerminal; + bool mHasChildren; + bool mIsBlacklistedOrNotAWord; + uint16_t mDepth; + uint16_t mLeavingDepth; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_PROPERTIES_H
diff --git a/src/aosp/suggest/core/dicnode/internal/dic_node_state.h b/src/aosp/suggest/core/dicnode/internal/dic_node_state.h new file mode 100644 index 0000000..b0fddb7 --- /dev/null +++ b/src/aosp/suggest/core/dicnode/internal/dic_node_state.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DIC_NODE_STATE_H
+#define LATINIME_DIC_NODE_STATE_H
+
+#include "defines.h"
+#include "suggest/core/dicnode/internal/dic_node_state_input.h"
+#include "suggest/core/dicnode/internal/dic_node_state_output.h"
+#include "suggest/core/dicnode/internal/dic_node_state_prevword.h"
+#include "suggest/core/dicnode/internal/dic_node_state_scoring.h"
+
+namespace latinime {
+
+// Bundles the four per-node sub-states (input, output, previous word, scoring).
+class DicNodeState {
+ public:
+    DicNodeStateInput mDicNodeStateInput;
+    DicNodeStateOutput mDicNodeStateOutput;
+    DicNodeStatePrevWord mDicNodeStatePrevWord;
+    DicNodeStateScoring mDicNodeStateScoring;
+
+    AK_FORCE_INLINE DicNodeState()
+            : mDicNodeStateInput(),
+              mDicNodeStateOutput(),
+              mDicNodeStatePrevWord(),
+              mDicNodeStateScoring() {}
+
+    virtual ~DicNodeState() {}
+
+    // Resets all sub-states; prevWordPos is forwarded to the previous-word state only.
+    void init(const int prevWordPos) {
+        mDicNodeStateScoring.init();
+        mDicNodeStatePrevWord.init(prevWordPos);
+        mDicNodeStateOutput.init();
+        mDicNodeStateInput.init();
+    }
+
+    // Copies every sub-state from src.
+    AK_FORCE_INLINE void init(const DicNodeState *const src) {
+        const DicNodeState &other = *src;
+        mDicNodeStateInput.init(&other.mDicNodeStateInput);
+        mDicNodeStateOutput.init(&other.mDicNodeStateOutput);
+        mDicNodeStatePrevWord.init(&other.mDicNodeStatePrevWord);
+        mDicNodeStateScoring.init(&other.mDicNodeStateScoring);
+    }
+
+    // Copies every sub-state from src, then appends the merged node code points to the
+    // output state.
+    void init(const DicNodeState *const src, const uint16_t mergedNodeCodePointCount,
+            const int *const mergedNodeCodePoints) {
+        init(src);
+        mDicNodeStateOutput.addMergedNodeCodePoints(
+                mergedNodeCodePointCount, mergedNodeCodePoints);
+    }
+
+ private:
+    // Caution!!!
+    // Use a default copy constructor and an assign operator because shallow copies are ok
+    // for this class
+};
+} // namespace latinime
+#endif // LATINIME_DIC_NODE_STATE_H
diff --git a/src/aosp/suggest/core/dicnode/internal/dic_node_state_input.h b/src/aosp/suggest/core/dicnode/internal/dic_node_state_input.h new file mode 100644 index 0000000..bbd9435 --- /dev/null +++ b/src/aosp/suggest/core/dicnode/internal/dic_node_state_input.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DIC_NODE_STATE_INPUT_H
+#define LATINIME_DIC_NODE_STATE_INPUT_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// TODO: Have a .cpp for this class
+// Per-pointer input state of a DicNode: input index, previous code point, terminal diff cost.
+class DicNodeStateInput {
+ public:
+    DicNodeStateInput() {}
+    virtual ~DicNodeStateInput() {}
+
+    // Moves the input index of pointer 0 back by commitPoint.
+    // TODO: Merge into DicNodeStatePrevWord::truncate
+    void truncate(const int commitPoint) {
+        mInputIndex[0] = mInputIndex[0] - commitPoint;
+    }
+
+    // Resets every pointer slot to its initial values.
+    void init() {
+        const float initialTerminalDiffCost = static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+        for (int pointerId = 0; pointerId < MAX_POINTER_COUNT_G; ++pointerId) {
+            // TODO: The initial value for mInputIndex should be -1?
+            // mInputIndex[pointerId] = pointerId == 0 ? 0 : -1;
+            mInputIndex[pointerId] = 0;
+            mPrevCodePoint[pointerId] = NOT_A_CODE_POINT;
+            mTerminalDiffCost[pointerId] = initialTerminalDiffCost;
+        }
+    }
+
+    // Copies every pointer slot from src. When resetTerminalDiffCost is true the terminal
+    // diff cost is set to MAX_VALUE_FOR_WEIGHTING instead of being copied.
+    void init(const DicNodeStateInput *const src, const bool resetTerminalDiffCost) {
+        for (int pointerId = 0; pointerId < MAX_POINTER_COUNT_G; ++pointerId) {
+            mInputIndex[pointerId] = src->mInputIndex[pointerId];
+            mPrevCodePoint[pointerId] = src->mPrevCodePoint[pointerId];
+            if (resetTerminalDiffCost) {
+                mTerminalDiffCost[pointerId] = static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+            } else {
+                mTerminalDiffCost[pointerId] = src->mTerminalDiffCost[pointerId];
+            }
+        }
+    }
+
+    // Copies every pointer slot from src, terminal diff cost included.
+    void init(const DicNodeStateInput *const src) {
+        init(src, false /* resetTerminalDiffCost */);
+    }
+
+    // Overwrites the slot of pointerId. rawLength is accepted but not used here.
+    void updateInputIndexG(const int pointerId, const int inputIndex,
+            const int prevCodePoint, const float terminalDiffCost, const float rawLength) {
+        mTerminalDiffCost[pointerId] = terminalDiffCost;
+        mPrevCodePoint[pointerId] = prevCodePoint;
+        mInputIndex[pointerId] = inputIndex;
+    }
+
+    // For transposition
+    void setPrevCodePoint(const int pointerId, const int c) {
+        mPrevCodePoint[pointerId] = c;
+    }
+
+    // Adds val to the input index of pointerId; a negative index is replaced by val instead.
+    void forwardInputIndex(const int pointerId, const int val) {
+        const int current = mInputIndex[pointerId];
+        mInputIndex[pointerId] = (current < 0) ? val : current + val;
+    }
+
+    int getInputIndex(const int pointerId) const {
+        // when "inputIndex" exceeds "inputSize", auto-completion needs to be done
+        return mInputIndex[pointerId];
+    }
+
+    int getPrevCodePoint(const int pointerId) const {
+        return mPrevCodePoint[pointerId];
+    }
+
+    float getTerminalDiffCost(const int pointerId) const {
+        return mTerminalDiffCost[pointerId];
+    }
+
+ private:
+    // Caution!!!
+    // Use a default copy constructor and an assign operator because shallow copies are ok
+    // for this class
+    int mInputIndex[MAX_POINTER_COUNT_G];
+    int mPrevCodePoint[MAX_POINTER_COUNT_G];
+    float mTerminalDiffCost[MAX_POINTER_COUNT_G];
+};
+} // namespace latinime
+#endif // LATINIME_DIC_NODE_STATE_INPUT_H
diff --git a/src/aosp/suggest/core/dicnode/internal/dic_node_state_output.h b/src/aosp/suggest/core/dicnode/internal/dic_node_state_output.h new file mode 100644 index 0000000..74eb5df --- /dev/null +++ b/src/aosp/suggest/core/dicnode/internal/dic_node_state_output.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DIC_NODE_STATE_OUTPUT_H
+#define LATINIME_DIC_NODE_STATE_OUTPUT_H
+
+#include <cstring> // for memcpy()
+#include <stdint.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+// Code points output so far for a DicNode, kept in a fixed buffer of MAX_WORD_LENGTH
+// entries. The buffer is zero-terminated whenever there is room for the terminator.
+class DicNodeStateOutput {
+ public:
+    DicNodeStateOutput() : mOutputtedCodePointCount(0) {
+        init();
+    }
+
+    virtual ~DicNodeStateOutput() {}
+
+    // Resets to an empty, zero-terminated buffer.
+    void init() {
+        mOutputtedCodePointCount = 0;
+        mCodePointsBuf[0] = 0;
+    }
+
+    // Copies the outputted code points of stateOutput and re-terminates the buffer when
+    // there is room for the terminator.
+    void init(const DicNodeStateOutput *const stateOutput) {
+        memcpy(mCodePointsBuf, stateOutput->mCodePointsBuf,
+                stateOutput->mOutputtedCodePointCount * sizeof(mCodePointsBuf[0]));
+        mOutputtedCodePointCount = stateOutput->mOutputtedCodePointCount;
+        if (mOutputtedCodePointCount < MAX_WORD_LENGTH) {
+            mCodePointsBuf[mOutputtedCodePointCount] = 0;
+        }
+    }
+
+    // Appends up to mergedNodeCodePointCount code points, truncated to the space left in
+    // the buffer. A null mergedNodeCodePoints leaves the state untouched.
+    void addMergedNodeCodePoints(const uint16_t mergedNodeCodePointCount,
+            const int *const mergedNodeCodePoints) {
+        if (!mergedNodeCodePoints) {
+            return;
+        }
+        const int additionalCodePointCount = min(static_cast<int>(mergedNodeCodePointCount),
+                MAX_WORD_LENGTH - mOutputtedCodePointCount);
+        if (additionalCodePointCount > 0) {
+            memcpy(&mCodePointsBuf[mOutputtedCodePointCount], mergedNodeCodePoints,
+                    additionalCodePointCount * sizeof(mCodePointsBuf[0]));
+            // Advance by the number of code points actually copied, not by the requested
+            // count, so that the stored count can never exceed MAX_WORD_LENGTH.
+            mOutputtedCodePointCount = static_cast<uint16_t>(
+                    mOutputtedCodePointCount + additionalCodePointCount);
+        }
+        if (mOutputtedCodePointCount < MAX_WORD_LENGTH) {
+            mCodePointsBuf[mOutputtedCodePointCount] = 0;
+        }
+    }
+
+    // TODO: Remove
+    int getCodePointAt(const int index) const {
+        return mCodePointsBuf[index];
+    }
+
+    // TODO: Move to private
+    int mCodePointsBuf[MAX_WORD_LENGTH];
+
+ private:
+    // Caution!!!
+    // Use a default copy constructor and an assign operator because shallow copies are ok
+    // for this class
+    uint16_t mOutputtedCodePointCount;
+};
+} // namespace latinime
+#endif // LATINIME_DIC_NODE_STATE_OUTPUT_H
diff --git a/src/aosp/suggest/core/dicnode/internal/dic_node_state_prevword.h b/src/aosp/suggest/core/dicnode/internal/dic_node_state_prevword.h new file mode 100644 index 0000000..b898620 --- /dev/null +++ b/src/aosp/suggest/core/dicnode/internal/dic_node_state_prevword.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DIC_NODE_STATE_PREVWORD_H
+#define LATINIME_DIC_NODE_STATE_PREVWORD_H
+
+#include <cstring> // for memset()
+#include <stdint.h>
+
+#include "defines.h"
+#include "suggest/core/dicnode/dic_node_utils.h"
+#include "suggest/core/layout/proximity_info_state.h"
+
+namespace latinime {
+
+// Previous-word part of a DicNode's state: the code points of the preceding word(s)
+// plus the bookkeeping values that accompany them.
+class DicNodeStatePrevWord {
+ public:
+    AK_FORCE_INLINE DicNodeStatePrevWord()
+            : mPrevWordCount(0), mPrevWordLength(0), mPrevWordStart(0), mPrevWordProbability(0),
+              mPrevWordNodePos(NOT_A_DICT_POS), mSecondWordFirstInputIndex(NOT_AN_INDEX) {
+        memset(mPrevWord, 0, sizeof(mPrevWord));
+    }
+
+    virtual ~DicNodeStatePrevWord() {}
+
+    // Resets the bookkeeping values with no previous-word node position.
+    void init() {
+        init(NOT_A_DICT_POS);
+    }
+
+    // Resets the bookkeeping values and records prevWordNodePos. mPrevWord is left as is.
+    void init(const int prevWordNodePos) {
+        mPrevWordNodePos = prevWordNodePos;
+        mPrevWordCount = 0;
+        mPrevWordLength = 0;
+        mPrevWordStart = 0;
+        mPrevWordProbability = -1;
+        mSecondWordFirstInputIndex = NOT_AN_INDEX;
+    }
+
+    // Init by copy: only the first mPrevWordLength code points of prevWord are copied.
+    AK_FORCE_INLINE void init(const DicNodeStatePrevWord *const prevWord) {
+        const int16_t copiedLength = prevWord->mPrevWordLength;
+        memcpy(mPrevWord, prevWord->mPrevWord, copiedLength * sizeof(mPrevWord[0]));
+        mPrevWordLength = copiedLength;
+        mPrevWordCount = prevWord->mPrevWordCount;
+        mPrevWordStart = prevWord->mPrevWordStart;
+        mPrevWordProbability = prevWord->mPrevWordProbability;
+        mPrevWordNodePos = prevWord->mPrevWordNodePos;
+        mSecondWordFirstInputIndex = prevWord->mSecondWordFirstInputIndex;
+    }
+
+    // Builds the state from two words: src0 and src1 are joined into mPrevWord by
+    // DicNodeUtils::appendTwoWords and a KEYCODE_SPACE is stored after them.
+    // lastInputIndex is accepted but not used here.
+    void init(const int16_t prevWordCount, const int16_t prevWordProbability,
+            const int prevWordNodePos, const int *const src0, const int16_t length0,
+            const int *const src1, const int16_t length1,
+            const int prevWordSecondWordFirstInputIndex, const int lastInputIndex) {
+        mPrevWordCount = min(prevWordCount, static_cast<int16_t>(MAX_RESULTS));
+        mPrevWordProbability = prevWordProbability;
+        mPrevWordNodePos = prevWordNodePos;
+        const int joinedLength =
+                DicNodeUtils::appendTwoWords(src0, length0, src1, length1, mPrevWord);
+        // Keep the trailing space inside the buffer.
+        const int spaceIndex =
+                (joinedLength >= MAX_WORD_LENGTH) ? MAX_WORD_LENGTH - 1 : joinedLength;
+        mPrevWord[spaceIndex] = KEYCODE_SPACE;
+        mPrevWordStart = length0;
+        mPrevWordLength = static_cast<int16_t>(spaceIndex + 1);
+        mSecondWordFirstInputIndex = prevWordSecondWordFirstInputIndex;
+    }
+
+    // Drops the first offset code points. If offset exceeds the stored length, the
+    // whole word is cleared instead.
+    void truncate(const int offset) {
+        if (offset > mPrevWordLength) {
+            memset(mPrevWord, 0, sizeof(mPrevWord));
+            mPrevWordLength = 0;
+            return;
+        }
+        const int remainingLength = mPrevWordLength - offset;
+        memmove(mPrevWord, mPrevWord + offset, remainingLength * sizeof(mPrevWord[0]));
+        mPrevWordLength = static_cast<int16_t>(remainingLength);
+    }
+
+    void setSecondWordFirstInputIndex(const int inputIndex) {
+        mSecondWordFirstInputIndex = inputIndex;
+    }
+
+    int getSecondWordFirstInputIndex() const {
+        return mSecondWordFirstInputIndex;
+    }
+
+    // TODO: remove
+    int16_t getPrevWordLength() const {
+        return mPrevWordLength;
+    }
+
+    int16_t getPrevWordCount() const {
+        return mPrevWordCount;
+    }
+
+    int16_t getPrevWordStart() const {
+        return mPrevWordStart;
+    }
+
+    int getPrevWordNodePos() const {
+        return mPrevWordNodePos;
+    }
+
+    int getPrevWordCodePointAt(const int id) const {
+        return mPrevWord[id];
+    }
+
+    // True when this word has at least prefixLen code points and the first prefixLen match prefix.
+    bool startsWith(const DicNodeStatePrevWord *const prefix, const int prefixLen) const {
+        if (mPrevWordLength < prefixLen) {
+            return false;
+        }
+        int matched = 0;
+        while (matched < prefixLen && mPrevWord[matched] == prefix->mPrevWord[matched]) {
+            ++matched;
+        }
+        return matched >= prefixLen;
+    }
+
+    // TODO: Move to private
+    int mPrevWord[MAX_WORD_LENGTH];
+
+ private:
+    // Caution!!!
+    // Use a default copy constructor and an assign operator because shallow copies are ok
+    // for this class
+    int16_t mPrevWordCount;
+    int16_t mPrevWordLength;
+    int16_t mPrevWordStart;
+    int16_t mPrevWordProbability;
+    int mPrevWordNodePos;
+    int mSecondWordFirstInputIndex;
+};
+} // namespace latinime
+#endif // LATINIME_DIC_NODE_STATE_PREVWORD_H
diff --git a/src/aosp/suggest/core/dicnode/internal/dic_node_state_scoring.h b/src/aosp/suggest/core/dicnode/internal/dic_node_state_scoring.h new file mode 100644 index 0000000..3c85d0e --- /dev/null +++ b/src/aosp/suggest/core/dicnode/internal/dic_node_state_scoring.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DIC_NODE_STATE_SCORING_H
+#define LATINIME_DIC_NODE_STATE_SCORING_H
+
+#include <stdint.h>
+
+#include "defines.h"
+#include "suggest/core/dictionary/digraph_utils.h"
+
+namespace latinime {
+
+// Scoring part of a DicNode's state: accumulated distances, correction counters and
+// the double-letter / digraph progress.
+class DicNodeStateScoring {
+ public:
+    AK_FORCE_INLINE DicNodeStateScoring()
+            : mDoubleLetterLevel(NOT_A_DOUBLE_LETTER),
+              mDigraphIndex(DigraphUtils::NOT_A_DIGRAPH_INDEX),
+              mEditCorrectionCount(0), mProximityCorrectionCount(0),
+              mNormalizedCompoundDistance(0.0f), mSpatialDistance(0.0f), mLanguageDistance(0.0f),
+              mRawLength(0.0f), mExactMatch(true),
+              mNormalizedCompoundDistanceAfterFirstWord(MAX_VALUE_FOR_WEIGHTING) {
+    }
+
+    virtual ~DicNodeStateScoring() {}
+
+    // Resets every field to the same values the constructor uses.
+    void init() {
+        mDoubleLetterLevel = NOT_A_DOUBLE_LETTER;
+        mDigraphIndex = DigraphUtils::NOT_A_DIGRAPH_INDEX;
+        mEditCorrectionCount = 0;
+        mProximityCorrectionCount = 0;
+        mNormalizedCompoundDistance = 0.0f;
+        mSpatialDistance = 0.0f;
+        mLanguageDistance = 0.0f;
+        mRawLength = 0.0f;
+        mExactMatch = true;
+        mNormalizedCompoundDistanceAfterFirstWord = MAX_VALUE_FOR_WEIGHTING;
+    }
+
+    // Copies every field from scoring.
+    AK_FORCE_INLINE void init(const DicNodeStateScoring *const scoring) {
+        const DicNodeStateScoring &other = *scoring;
+        mDoubleLetterLevel = other.mDoubleLetterLevel;
+        mDigraphIndex = other.mDigraphIndex;
+        mEditCorrectionCount = other.mEditCorrectionCount;
+        mProximityCorrectionCount = other.mProximityCorrectionCount;
+        mNormalizedCompoundDistance = other.mNormalizedCompoundDistance;
+        mSpatialDistance = other.mSpatialDistance;
+        mLanguageDistance = other.mLanguageDistance;
+        mRawLength = other.mRawLength;
+        mExactMatch = other.mExactMatch;
+        mNormalizedCompoundDistanceAfterFirstWord =
+                other.mNormalizedCompoundDistanceAfterFirstWord;
+    }
+
+    // Adds the costs to the accumulated distances, then updates the correction counters
+    // and the exact-match flag according to errorType.
+    void addCost(const float spatialCost, const float languageCost, const bool doNormalization,
+            const int inputSize, const int totalInputIndex, const ErrorType errorType) {
+        addDistance(spatialCost, languageCost, doNormalization, inputSize, totalInputIndex);
+        switch (errorType) {
+            case ET_EDIT_CORRECTION:
+                ++mEditCorrectionCount;
+                mExactMatch = false;
+                break;
+            case ET_PROXIMITY_CORRECTION:
+                ++mProximityCorrectionCount;
+                mExactMatch = false;
+                break;
+            case ET_COMPLETION:
+            case ET_NEW_WORD:
+            case ET_INTENTIONAL_OMISSION:
+                // Not counted, but the node is no longer an exact match.
+                mExactMatch = false;
+                break;
+            case ET_NOT_AN_ERROR:
+                break;
+        }
+    }
+
+    // Saves the current normalized distance for space-aware gestures.
+    // See getNormalizedCompoundDistanceAfterFirstWord for details.
+    void saveNormalizedCompoundDistanceAfterFirstWordIfNoneYet() {
+        // Called after each word, but only the distance after the first word is kept:
+        // once a distance has been stored, later calls leave it untouched.
+        if (mNormalizedCompoundDistanceAfterFirstWord >= MAX_VALUE_FOR_WEIGHTING) {
+            mNormalizedCompoundDistanceAfterFirstWord = mNormalizedCompoundDistance;
+        }
+    }
+
+    void addRawLength(const float rawLength) {
+        mRawLength = mRawLength + rawLength;
+    }
+
+    float getCompoundDistance() const {
+        return getCompoundDistance(1.0f);
+    }
+
+    float getCompoundDistance(const float languageWeight) const {
+        return mSpatialDistance + mLanguageDistance * languageWeight;
+    }
+
+    float getNormalizedCompoundDistance() const {
+        return mNormalizedCompoundDistance;
+    }
+
+    // For space-aware gestures, we store the normalized distance at the char index
+    // that ends the first word of the suggestion. We call this the distance after
+    // first word.
+    float getNormalizedCompoundDistanceAfterFirstWord() const {
+        return mNormalizedCompoundDistanceAfterFirstWord;
+    }
+
+    float getSpatialDistance() const {
+        return mSpatialDistance;
+    }
+
+    float getLanguageDistance() const {
+        return mLanguageDistance;
+    }
+
+    int16_t getEditCorrectionCount() const {
+        return mEditCorrectionCount;
+    }
+
+    int16_t getProximityCorrectionCount() const {
+        return mProximityCorrectionCount;
+    }
+
+    float getRawLength() const {
+        return mRawLength;
+    }
+
+    DoubleLetterLevel getDoubleLetterLevel() const {
+        return mDoubleLetterLevel;
+    }
+
+    // Stores A_STRONG_DOUBLE_LETTER always, A_DOUBLE_LETTER unless already strong, else nothing.
+    void setDoubleLetterLevel(DoubleLetterLevel doubleLetterLevel) {
+        const bool isStrong = (doubleLetterLevel == A_STRONG_DOUBLE_LETTER);
+        const bool isPlainUpgrade = (doubleLetterLevel == A_DOUBLE_LETTER)
+                && (mDoubleLetterLevel != A_STRONG_DOUBLE_LETTER);
+        if (isStrong || isPlainUpgrade) {
+            mDoubleLetterLevel = doubleLetterLevel;
+        }
+    }
+
+    DigraphUtils::DigraphCodePointIndex getDigraphIndex() const {
+        return mDigraphIndex;
+    }
+
+    // Steps NOT_A_DIGRAPH_INDEX -> FIRST_DIGRAPH_CODEPOINT -> SECOND_DIGRAPH_CODEPOINT -> back.
+    void advanceDigraphIndex() {
+        if (mDigraphIndex == DigraphUtils::NOT_A_DIGRAPH_INDEX) {
+            mDigraphIndex = DigraphUtils::FIRST_DIGRAPH_CODEPOINT;
+        } else if (mDigraphIndex == DigraphUtils::FIRST_DIGRAPH_CODEPOINT) {
+            mDigraphIndex = DigraphUtils::SECOND_DIGRAPH_CODEPOINT;
+        } else if (mDigraphIndex == DigraphUtils::SECOND_DIGRAPH_CODEPOINT) {
+            mDigraphIndex = DigraphUtils::NOT_A_DIGRAPH_INDEX;
+        }
+    }
+
+    bool isExactMatch() const {
+        return mExactMatch;
+    }
+
+ private:
+    // Caution!!!
+    // Use a default copy constructor and an assign operator because shallow copies are ok
+    // for this class
+    DoubleLetterLevel mDoubleLetterLevel;
+    DigraphUtils::DigraphCodePointIndex mDigraphIndex;
+
+    int16_t mEditCorrectionCount;
+    int16_t mProximityCorrectionCount;
+
+    float mNormalizedCompoundDistance;
+    float mSpatialDistance;
+    float mLanguageDistance;
+    float mRawLength;
+    bool mExactMatch;
+    float mNormalizedCompoundDistanceAfterFirstWord;
+
+    // Accumulates both distances and refreshes the normalized sum. inputSize is not used.
+    AK_FORCE_INLINE void addDistance(float spatialDistance, float languageDistance,
+            bool doNormalization, int inputSize, int totalInputIndex) {
+        mSpatialDistance += spatialDistance;
+        mLanguageDistance += languageDistance;
+        const float compoundDistance = mSpatialDistance + mLanguageDistance;
+        mNormalizedCompoundDistance = doNormalization
+                ? compoundDistance / static_cast<float>(max(1, totalInputIndex))
+                : compoundDistance;
+    }
+};
+} // namespace latinime
+#endif // LATINIME_DIC_NODE_STATE_SCORING_H
diff --git a/src/aosp/suggest/core/dictionary/bigram_dictionary.cpp b/src/aosp/suggest/core/dictionary/bigram_dictionary.cpp new file mode 100644 index 0000000..71f4ef6 --- /dev/null +++ b/src/aosp/suggest/core/dictionary/bigram_dictionary.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2010, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstring>
+
+#define LOG_TAG "LatinIME: bigram_dictionary.cpp"
+
+#include "bigram_dictionary.h"
+
+#include "defines.h"
+#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
+#include "suggest/core/dictionary/dictionary.h"
+#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+BigramDictionary::BigramDictionary(
+        const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy)
+        : mDictionaryStructurePolicy(dictionaryStructurePolicy) {
+    if (DEBUG_DICT) {
+        AKLOGI("BigramDictionary - constructor");
+    }
+}
+
+BigramDictionary::~BigramDictionary() {
+}
+
+// Inserts a word into the output arrays, which hold at most MAX_RESULTS entries ordered by
+// descending probability (on equal probability the new word goes first if it is shorter).
+// Entries at and after the insertion point shift down by one and the last one is dropped.
+//  word: code points to insert; only the first `length` entries are read.
+//  length: number of code points in word, clamped to MAX_WORD_LENGTH; negative is ignored.
+//  bigramProbability, bigramCodePoints, outputTypes: output arrays; each word occupies a
+//      slot of MAX_WORD_LENGTH code points in bigramCodePoints.
+void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability,
+        int *bigramCodePoints, int *outputTypes) const {
+    if (length < 0) {
+        return;
+    }
+    if (length > MAX_WORD_LENGTH) {
+        // A slot cannot hold more than MAX_WORD_LENGTH code points.
+        length = MAX_WORD_LENGTH;
+    }
+    if (DEBUG_DICT_FULL) {
+#ifdef FLAG_DBG
+        // Build a terminated copy for logging instead of writing word[length], which is past
+        // the end of the caller's buffer when the word is MAX_WORD_LENGTH code points long.
+        char s[MAX_WORD_LENGTH + 1];
+        for (int i = 0; i < length; i++) s[i] = static_cast<char>(word[i]);
+        s[length] = 0;
+        AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability);
+#endif
+    }
+
+    // Find the right insertion point
+    int insertAt = 0;
+    while (insertAt < MAX_RESULTS) {
+        if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability
+                && length < CharUtils::getCodePointCount(MAX_WORD_LENGTH,
+                        bigramCodePoints + insertAt * MAX_WORD_LENGTH))) {
+            break;
+        }
+        insertAt++;
+    }
+    if (DEBUG_DICT_FULL) {
+        AKLOGI("Bigram: InsertAt -> %d MAX_RESULTS: %d", insertAt, MAX_RESULTS);
+    }
+    if (insertAt >= MAX_RESULTS) {
+        return;
+    }
+    memmove(bigramProbability + (insertAt + 1),
+            bigramProbability + insertAt,
+            (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0]));
+    bigramProbability[insertAt] = probability;
+    outputTypes[insertAt] = Dictionary::KIND_PREDICTION;
+    memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH,
+            bigramCodePoints + insertAt * MAX_WORD_LENGTH,
+            (MAX_RESULTS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH);
+    int *const dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH;
+    memcpy(dest, word, length * sizeof(dest[0]));
+    if (length < MAX_WORD_LENGTH) {
+        // Terminate only when the word leaves room in its slot; writing dest[MAX_WORD_LENGTH]
+        // would overwrite the first code point of the next slot or run past the array.
+        dest[length] = 0;
+    }
+    if (DEBUG_DICT_FULL) {
+        AKLOGI("Bigram: Added word at %d", insertAt);
+    }
+}
+
+/* Parameters :
+ * prevWord: the word before, the one for which we need to look up bigrams.
+ * prevWordLength: its length.
+ * outBigramCodePoints: an array for output, at the same format as outwords for getSuggestions.
+ * outBigramProbability: an array to output frequencies.
+ * outputTypes: an array to output types.
+ * This method returns the number of bigrams found for this word, capped at MAX_RESULTS.
+ */
+int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLength,
+        int *const outBigramCodePoints, int *const outBigramProbability,
+        int *const outputTypes) const {
+    // TODO: remove unused arguments, and refrain from storing stuff in members of this class
+    // TODO: have "in" arguments before "out" ones, and make out args explicit in the name
+
+    int pos = getBigramListPositionForWord(prevWord, prevWordLength,
+            false /* forceLowerCaseSearch */);
+    // getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the dictionary
+    // or has no bigrams
+    if (NOT_A_DICT_POS == pos) {
+        // If no bigrams for this exact word, search again in lower case.
+        pos = getBigramListPositionForWord(prevWord, prevWordLength,
+                true /* forceLowerCaseSearch */);
+    }
+    // If still no bigrams, we really don't have them!
+    if (NOT_A_DICT_POS == pos) return 0;
+
+    int bigramCount = 0;
+    int unigramProbability = 0;
+    int bigramBuffer[MAX_WORD_LENGTH];
+    BinaryDictionaryBigramsIterator bigramsIt(
+            mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos);
+    while (bigramsIt.hasNext()) {
+        bigramsIt.next();
+        if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) {
+            continue;
+        }
+        const int codePointCount = mDictionaryStructurePolicy->
+                getCodePointsAndProbabilityAndReturnCodePointCount(bigramsIt.getBigramPos(),
+                        MAX_WORD_LENGTH, bigramBuffer, &unigramProbability);
+        if (codePointCount <= 0) {
+            continue;
+        }
+        // Due to space constraints, the probability for bigrams is approximate - the lower the
+        // unigram probability, the worse the precision. The theoretical maximum error in
+        // resulting probability is 8 - although in practice it's never bigger than 3 or 4
+        // in very bad cases. This means that sometimes, we'll see some bigrams inverted
+        // here, but it can't get too bad.
+        const int probability = mDictionaryStructurePolicy->getProbability(
+                unigramProbability, bigramsIt.getProbability());
+        addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability,
+                outBigramCodePoints, outputTypes);
+        ++bigramCount;
+    }
+    return min(bigramCount, MAX_RESULTS);
+}
+
+// Returns the position of the start of the bigram list.
+// If the word is not found or has no bigrams, this function returns NOT_A_DICT_POS.
+int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
+        const bool forceLowerCaseSearch) const {
+    if (0 >= prevWordLength) return NOT_A_DICT_POS;
+    int pos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(prevWord, prevWordLength,
+            forceLowerCaseSearch);
+    if (NOT_A_DICT_POS == pos) return NOT_A_DICT_POS;
+    return mDictionaryStructurePolicy->getBigramsPositionOfPtNode(pos);
+}
+
+// Returns the probability of word1 following word0, or NOT_A_PROBABILITY when word0 has no
+// bigram list, word1 is not in the dictionary, or the list has no entry for word1.
+int BigramDictionary::getBigramProbability(const int *word0, int length0, const int *word1,
+        int length1) const {
+    int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */);
+    // getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the dictionary
+    // or has no bigrams
+    if (NOT_A_DICT_POS == pos) return NOT_A_PROBABILITY;
+    int nextWordPos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(word1, length1,
+            false /* forceLowerCaseSearch */);
+    if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY;
+
+    BinaryDictionaryBigramsIterator bigramsIt(
+            mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos);
+    while (bigramsIt.hasNext()) {
+        bigramsIt.next();
+        if (bigramsIt.getBigramPos() == nextWordPos) {
+            return mDictionaryStructurePolicy->getProbability(
+                    mDictionaryStructurePolicy->getUnigramProbabilityOfPtNode(nextWordPos),
+                    bigramsIt.getProbability());
+        }
+    }
+    return NOT_A_PROBABILITY;
+}
+
+// TODO: Move functions related to bigram to here
+} // namespace latinime
diff --git a/src/aosp/suggest/core/dictionary/bigram_dictionary.h b/src/aosp/suggest/core/dictionary/bigram_dictionary.h new file mode 100644 index 0000000..8af7ee7 --- /dev/null +++ b/src/aosp/suggest/core/dictionary/bigram_dictionary.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BIGRAM_DICTIONARY_H
+#define LATINIME_BIGRAM_DICTIONARY_H
+
+#include "defines.h"
+
+namespace latinime {
+
+class DictionaryStructureWithBufferPolicy;
+
+// Bigram lookups (predictions and bigram probabilities) backed by a dictionary structure
+// policy. The policy pointer is stored as given.
+class BigramDictionary {
+ public:
+    BigramDictionary(const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy);
+
+    // Looks up the bigrams of prevWord and writes them to the out arrays.
+    // Returns the number of bigrams found, capped at MAX_RESULTS.
+    int getPredictions(const int *prevWord, int prevWordLength, int *outBigramCodePoints,
+            int *outBigramProbability, int *outputTypes) const;
+    // Returns the probability of word1 following word0, or NOT_A_PROBABILITY if there is none.
+    int getBigramProbability(const int *word0, int length0, const int *word1, int length1) const;
+    ~BigramDictionary();
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary);
+
+    void addWordBigram(int *word, int length, int probability, int *bigramProbability,
+            int *bigramCodePoints, int *outputTypes) const;
+    int getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
+            const bool forceLowerCaseSearch) const;
+
+    const DictionaryStructureWithBufferPolicy *const mDictionaryStructurePolicy;
+};
+} // namespace latinime
+#endif // LATINIME_BIGRAM_DICTIONARY_H
diff --git a/src/aosp/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h b/src/aosp/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h new file mode 100644 index 0000000..d16ac47 --- /dev/null +++ b/src/aosp/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
+#define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
+
+#include "defines.h"
+#include "suggest/core/policy/dictionary_bigrams_structure_policy.h"
+
+namespace latinime {
+
+// Forward iterator over a bigram list; reading each entry is delegated to a
+// DictionaryBigramsStructurePolicy.
+class BinaryDictionaryBigramsIterator {
+ public:
+    // pos is the initial read position. There is nothing to iterate when it is
+    // NOT_A_DICT_POS.
+    BinaryDictionaryBigramsIterator(
+            const DictionaryBigramsStructurePolicy *const bigramsStructurePolicy, const int pos)
+            : mBigramsStructurePolicy(bigramsStructurePolicy),
+              mPos(pos),
+              mBigramPos(NOT_A_DICT_POS),
+              mProbability(NOT_A_PROBABILITY),
+              mHasNext(NOT_A_DICT_POS != pos) {}
+
+    AK_FORCE_INLINE bool hasNext() const {
+        return mHasNext;
+    }
+
+    // Calls the policy's getNextBigram, which receives pointers to the bigram position,
+    // the probability, the has-next flag and the read position.
+    AK_FORCE_INLINE void next() {
+        mBigramsStructurePolicy->getNextBigram(&mBigramPos, &mProbability, &mHasNext, &mPos);
+    }
+
+    // Probability as of the most recent next(); NOT_A_PROBABILITY before the first call.
+    AK_FORCE_INLINE int getProbability() const {
+        return mProbability;
+    }
+
+    // Bigram position as of the most recent next(); NOT_A_DICT_POS before the first call.
+    AK_FORCE_INLINE int getBigramPos() const {
+        return mBigramPos;
+    }
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryBigramsIterator);
+
+    const DictionaryBigramsStructurePolicy *const mBigramsStructurePolicy;
+    int mPos;
+    int mBigramPos;
+    int mProbability;
+    bool mHasNext;
+};
+} // namespace latinime
+#endif // LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
diff --git a/src/aosp/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h b/src/aosp/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h new file mode 100644 index 0000000..558e0a5 --- /dev/null +++ b/src/aosp/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H
+#define LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H
+
+#include "defines.h"
+#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
+
+namespace latinime {
+
+// Forward-only cursor over the shortcut targets of a shortcut list. Reading is delegated to
+// the DictionaryShortcutsStructurePolicy supplied at construction.
+class BinaryDictionaryShortcutIterator {
+ public:
+    // The cursor starts at the position the policy reports via getStartPos(shortcutPos). When
+    // shortcutPos is NOT_A_DICT_POS the iterator starts out with no target to read.
+    BinaryDictionaryShortcutIterator(
+            const DictionaryShortcutsStructurePolicy *const shortcutStructurePolicy,
+            const int shortcutPos)
+            : mShortcutStructurePolicy(shortcutStructurePolicy),
+              mPos(shortcutStructurePolicy->getStartPos(shortcutPos)),
+              mHasNextShortcutTarget(NOT_A_DICT_POS != shortcutPos) {}
+
+    // Returns the has-next flag: set by the constructor, then maintained by the policy.
+    AK_FORCE_INLINE bool hasNextShortcutTarget() const { return mHasNextShortcutTarget; }
+
+    // Reads the next shortcut target through the policy: the target is written to outTarget
+    // as an int string, its length to outTargetLength and its whitelist status to
+    // outIsWhitelist. maxDepth is forwarded to the policy unchanged. The has-next flag and
+    // the cursor position are updated by the policy as part of the same call.
+    AK_FORCE_INLINE void nextShortcutTarget(
+            const int maxDepth, int *const outTarget, int *const outTargetLength,
+            bool *const outIsWhitelist) {
+        mShortcutStructurePolicy->getNextShortcut(maxDepth, outTarget, outTargetLength,
+                outIsWhitelist, &mHasNextShortcutTarget, &mPos);
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryShortcutIterator);
+
+    const DictionaryShortcutsStructurePolicy *const mShortcutStructurePolicy;
+    int mPos;
+    bool mHasNextShortcutTarget;
+};
+} // namespace latinime
+#endif // LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H
diff --git a/src/aosp/suggest/core/dictionary/bloom_filter.cpp b/src/aosp/suggest/core/dictionary/bloom_filter.cpp new file mode 100644 index 0000000..4ae474e --- /dev/null +++ b/src/aosp/suggest/core/dictionary/bloom_filter.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/core/dictionary/bloom_filter.h"
+
+namespace latinime {
+
+// Number of buckets: positions are reduced modulo this value in bloom_filter.h. It must not
+// exceed BIGRAM_FILTER_BYTE_SIZE * 8 (the BloomFilter constructor asserts this) and is
+// preferably prime. 1021 is the largest prime under 128 * 8.
+const int BloomFilter::BIGRAM_FILTER_MODULO = 1021;
+
+} // namespace latinime
diff --git a/src/aosp/suggest/core/dictionary/bloom_filter.h b/src/aosp/suggest/core/dictionary/bloom_filter.h new file mode 100644 index 0000000..5205456 --- /dev/null +++ b/src/aosp/suggest/core/dictionary/bloom_filter.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BLOOM_FILTER_H
+#define LATINIME_BLOOM_FILTER_H
+
+#include <stdint.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+// This bloom filter is used for optimizing bigram retrieval.
+// Execution times with previous word "this" are as follows:
+//  without bloom filter (use only hash_map):
+//   Total 147792.34 (sum of others 147771.57)
+//  with bloom filter:
+//   Total 145900.64 (sum of others 145874.30)
+//  always read binary dictionary:
+//   Total 148603.14 (sum of others 148579.90)
+class BloomFilter {
+ public:
+    // Starts with every bucket cleared. mFilter is value-initialized explicitly because the
+    // user-provided constructor would otherwise leave the array with indeterminate contents,
+    // making isInFilter() report arbitrary hits.
+    BloomFilter() : mFilter() {
+        ASSERT(BIGRAM_FILTER_BYTE_SIZE * 8 >= BIGRAM_FILTER_MODULO);
+    }
+
+    // Marks the bucket of position as occupied.
+    // TODO: uint32_t position
+    AK_FORCE_INLINE void setInFilter(const int32_t position) {
+        const uint32_t bucket = getBucket(position);
+        mFilter[bucket >> 3] |= static_cast<uint8_t>(1 << (bucket & 0x7));
+    }
+
+    // Returns false when position was definitely never passed to setInFilter(); true means
+    // "possibly set" (different positions can share a bucket).
+    // TODO: uint32_t position
+    AK_FORCE_INLINE bool isInFilter(const int32_t position) const {
+        const uint32_t bucket = getBucket(position);
+        return (mFilter[bucket >> 3] & static_cast<uint8_t>(1 << (bucket & 0x7))) != 0;
+    }
+
+ private:
+    // Maps position to a bucket index in [0, BIGRAM_FILTER_MODULO). The modulo is taken on the
+    // unsigned value: a signed % with a negative position gives a negative remainder, which
+    // used to turn into a huge index and an out-of-bounds access to mFilter. Non-negative
+    // positions map to the same bucket as before.
+    static AK_FORCE_INLINE uint32_t getBucket(const int32_t position) {
+        return static_cast<uint32_t>(position) % static_cast<uint32_t>(BIGRAM_FILTER_MODULO);
+    }
+
+    // Size, in bytes, of the bloom filter index for bigrams
+    // 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k,
+    // where k is the number of hash functions, n the number of bigrams, and m the number of
+    // bits we can test.
+    // At the moment 100 is the maximum number of bigrams for a word with the current
+    // dictionaries, so n = 100. 1024 buckets give us m = 1024.
+    // With 1 hash function, our false positive rate is about 9.3%, which should be enough for
+    // our uses since we are only using this to increase average performance. For the record,
+    // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
+    // and m = 4096 gives 2.4%.
+    // This is assigned here because it is used for array size.
+    static const int BIGRAM_FILTER_BYTE_SIZE = 128;
+    static const int BIGRAM_FILTER_MODULO;
+
+    uint8_t mFilter[BIGRAM_FILTER_BYTE_SIZE];
+};
+} // namespace latinime
+#endif // LATINIME_BLOOM_FILTER_H
diff --git a/src/aosp/suggest/core/dictionary/dictionary.cpp b/src/aosp/suggest/core/dictionary/dictionary.cpp new file mode 100644 index 0000000..6bc4b30 --- /dev/null +++ b/src/aosp/suggest/core/dictionary/dictionary.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (C) 2009, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "LatinIME: dictionary.cpp"
+
+#include "suggest/core/dictionary/dictionary.h"
+
+#include <stdint.h>
+
+#include "defines.h"
+#include "suggest/core/dictionary/bigram_dictionary.h"
+#include "suggest/core/policy/dictionary_header_structure_policy.h"
+#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
+#include "suggest/core/session/dic_traverse_session.h"
+#include "suggest/core/suggest.h"
+#include "suggest/core/suggest_options.h"
+#include "suggest/policyimpl/gesture/gesture_suggest_policy_factory.h"
+#include "suggest/policyimpl/typing/typing_suggest_policy_factory.h"
+#include "utils/log_utils.h"
+
+namespace latinime {
+
+const int Dictionary::HEADER_ATTRIBUTE_BUFFER_SIZE = 32;
+
+// Takes ownership of dictionaryStructureWithBufferPolicy (it is deleted in the destructor),
+// creates the bigram dictionary and both suggestion engines, then logs the header info.
+Dictionary::Dictionary(JNIEnv *env,
+        DictionaryStructureWithBufferPolicy *const dictionaryStructureWithBufferPolicy)
+        : mDictionaryStructureWithBufferPolicy(dictionaryStructureWithBufferPolicy),
+          mBigramDictionary(new BigramDictionary(mDictionaryStructureWithBufferPolicy)),
+          mGestureSuggest(new Suggest(GestureSuggestPolicyFactory::getGestureSuggestPolicy())),
+          mTypingSuggest(new Suggest(TypingSuggestPolicyFactory::getTypingSuggestPolicy())) {
+    logDictionaryInfo(env);
+}
+
+Dictionary::~Dictionary() {
+    delete mBigramDictionary;
+    delete mGestureSuggest;
+    delete mTypingSuggest;
+    delete mDictionaryStructureWithBufferPolicy;
+}
+
+// Computes suggestions for the given input with the gesture engine when
+// suggestOptions->isGesture() is true and with the typing engine otherwise, and returns the
+// value reported by that engine. The traverse session is (re)initialized for this dictionary
+// and previous word before the engine runs.
+int Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession,
+        int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints,
+        int inputSize, int *prevWordCodePoints, int prevWordLength, int commitPoint,
+        const SuggestOptions *const suggestOptions, int *outWords, int *frequencies,
+        int *spaceIndices, int *outputTypes, int *outputAutoCommitFirstWordConfidence) const {
+    // Both input modes share the same call sequence and only differ in the engine queried.
+    const SuggestInterface *const suggest =
+            suggestOptions->isGesture() ? mGestureSuggest : mTypingSuggest;
+    DicTraverseSession::initSessionInstance(
+            traverseSession, this, prevWordCodePoints, prevWordLength, suggestOptions);
+    const int result = suggest->getSuggestions(proximityInfo, traverseSession, xcoordinates,
+            ycoordinates, times, pointerIds, inputCodePoints, inputSize, commitPoint, outWords,
+            frequencies, spaceIndices, outputTypes, outputAutoCommitFirstWordConfidence);
+    if (DEBUG_DICT) {
+        DUMP_RESULT(outWords, frequencies);
+    }
+    return result;
+}
+
+// Returns 0 for an empty word; otherwise forwards to BigramDictionary::getPredictions().
+int Dictionary::getBigrams(const int *word, int length, int *outWords, int *frequencies,
+        int *outputTypes) const {
+    if (length <= 0) return 0;
+    return mBigramDictionary->getPredictions(word, length, outWords, frequencies, outputTypes);
+}
+
+// Returns the unigram probability of word, or NOT_A_PROBABILITY when the word has no terminal
+// node in the dictionary. The lookup is case sensitive (forceLowerCaseSearch is false).
+int Dictionary::getProbability(const int *word, int length) const {
+    int pos = getDictionaryStructurePolicy()->getTerminalNodePositionOfWord(word, length,
+            false /* forceLowerCaseSearch */);
+    if (NOT_A_DICT_POS == pos) {
+        return NOT_A_PROBABILITY;
+    }
+    return getDictionaryStructurePolicy()->getUnigramProbabilityOfPtNode(pos);
+}
+
+int Dictionary::getBigramProbability(const int *word0, int length0, const int *word1,
+        int length1) const {
+    return mBigramDictionary->getBigramProbability(word0, length0, word1, length1);
+}
+
+// The mutators and maintenance calls below forward unchanged to the structure policy.
+void Dictionary::addUnigramWord(const int *const word, const int length, const int probability) {
+    mDictionaryStructureWithBufferPolicy->addUnigramWord(word, length, probability);
+}
+
+void Dictionary::addBigramWords(const int *const word0, const int length0, const int *const word1,
+        const int length1, const int probability) {
+    mDictionaryStructureWithBufferPolicy->addBigramWords(word0, length0, word1, length1,
+            probability);
+}
+
+void Dictionary::removeBigramWords(const int *const word0, const int length0,
+        const int *const word1, const int length1) {
+    mDictionaryStructureWithBufferPolicy->removeBigramWords(word0, length0, word1, length1);
+}
+
+void Dictionary::flush(const char *const filePath) {
+    mDictionaryStructureWithBufferPolicy->flush(filePath);
+}
+
+void Dictionary::flushWithGC(const char *const filePath) {
+    mDictionaryStructureWithBufferPolicy->flushWithGC(filePath);
+}
+
+bool Dictionary::needsToRunGC(const bool mindsBlockByGC) {
+    return mDictionaryStructureWithBufferPolicy->needsToRunGC(mindsBlockByGC);
+}
+
+void Dictionary::getProperty(const char *const query, char *const outResult,
+        const int maxResultLength) {
+    return mDictionaryStructureWithBufferPolicy->getProperty(query, outResult, maxResultLength);
+}
+
+// Reads the "dictionary", "version" and "date" header attributes and logs them in one line.
+void Dictionary::logDictionaryInfo(JNIEnv *const env) const {
+    int dictionaryIdCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+    int versionStringCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+    int dateStringCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+    const DictionaryHeaderStructurePolicy *const headerPolicy =
+            getDictionaryStructurePolicy()->getHeaderStructurePolicy();
+    headerPolicy->readHeaderValueOrQuestionMark("dictionary", dictionaryIdCodePointBuffer,
+            HEADER_ATTRIBUTE_BUFFER_SIZE);
+    headerPolicy->readHeaderValueOrQuestionMark("version", versionStringCodePointBuffer,
+            HEADER_ATTRIBUTE_BUFFER_SIZE);
+    headerPolicy->readHeaderValueOrQuestionMark("date", dateStringCodePointBuffer,
+            HEADER_ATTRIBUTE_BUFFER_SIZE);
+
+    // The log format below uses %s, so the code point arrays are converted to char arrays.
+    char dictionaryIdCharBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+    char versionStringCharBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+    char dateStringCharBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+    intArrayToCharArray(dictionaryIdCodePointBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE,
+            dictionaryIdCharBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE);
+    intArrayToCharArray(versionStringCodePointBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE,
+            versionStringCharBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE);
+    intArrayToCharArray(dateStringCodePointBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE,
+            dateStringCharBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE);
+
+    LogUtils::logToJava(env,
+            "Dictionary info: dictionary = %s ; version = %s ; date = %s",
+            dictionaryIdCharBuffer, versionStringCharBuffer, dateStringCharBuffer);
+}
+
+} // namespace latinime
diff --git a/src/aosp/suggest/core/dictionary/dictionary.h b/src/aosp/suggest/core/dictionary/dictionary.h new file mode 100644 index 0000000..0195d5b --- /dev/null +++ b/src/aosp/suggest/core/dictionary/dictionary.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICTIONARY_H
+#define LATINIME_DICTIONARY_H
+
+#include <stdint.h>
+
+#include "defines.h"
+#include "jni.h"
+
+namespace latinime {
+
+class BigramDictionary;
+class DictionaryStructureWithBufferPolicy;
+class DicTraverseSession;
+class ProximityInfo;
+class SuggestInterface;
+class SuggestOptions;
+
+// Facade over one dictionary. Owns the structure policy, the bigram dictionary and the gesture
+// and typing suggestion engines, and forwards lookups and updates to them.
+class Dictionary {
+ public:
+    // Taken from SuggestedWords.java
+    static const int KIND_MASK_KIND = 0xFF; // Mask to get only the kind
+    static const int KIND_TYPED = 0; // What user typed
+    static const int KIND_CORRECTION = 1; // Simple correction/suggestion
+    static const int KIND_COMPLETION = 2; // Completion (suggestion with appended chars)
+    static const int KIND_WHITELIST = 3; // Whitelisted word
+    static const int KIND_BLACKLIST = 4; // Blacklisted word
+    static const int KIND_HARDCODED = 5; // Hardcoded suggestion, e.g. punctuation
+    static const int KIND_APP_DEFINED = 6; // Suggested by the application
+    static const int KIND_SHORTCUT = 7; // A shortcut
+    static const int KIND_PREDICTION = 8; // A prediction (== a suggestion with no input)
+    // KIND_RESUMED: A resumed suggestion (comes from a span, currently this type is used only
+    // in java for re-correction)
+    static const int KIND_RESUMED = 9;
+    static const int KIND_OOV_CORRECTION = 10; // Most probable string correction
+
+    static const int KIND_MASK_FLAGS = 0xFFFFFF00; // Mask to get the flags
+    static const int KIND_FLAG_POSSIBLY_OFFENSIVE = 0x80000000;
+    static const int KIND_FLAG_EXACT_MATCH = 0x40000000;
+
+    // Takes ownership of dictionaryStructureWithBufferPolicy; it is deleted by the destructor.
+    Dictionary(JNIEnv *env,
+            DictionaryStructureWithBufferPolicy *const dictionaryStructureWithBufferPolicy);
+
+    // Computes suggestions with the gesture or the typing engine, as selected by
+    // suggestOptions->isGesture(), and returns the value reported by that engine.
+    int getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession,
+            int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints,
+            int inputSize, int *prevWordCodePoints, int prevWordLength, int commitPoint,
+            const SuggestOptions *const suggestOptions, int *outWords, int *frequencies,
+            int *spaceIndices, int *outputTypes, int *outputAutoCommitFirstWordConfidence) const;
+
+    // Returns 0 when length <= 0, otherwise the result of the bigram dictionary's predictions.
+    int getBigrams(const int *word, int length, int *outWords, int *frequencies,
+            int *outputTypes) const;
+
+    // Returns NOT_A_PROBABILITY when word is not found in the dictionary.
+    int getProbability(const int *word, int length) const;
+
+    int getBigramProbability(const int *word0, int length0, const int *word1, int length1) const;
+
+    // The following calls are forwarded to the structure policy.
+    void addUnigramWord(const int *const word, const int length, const int probability);
+
+    void addBigramWords(const int *const word0, const int length0, const int *const word1,
+            const int length1, const int probability);
+
+    void removeBigramWords(const int *const word0, const int length0, const int *const word1,
+            const int length1);
+
+    void flush(const char *const filePath);
+
+    void flushWithGC(const char *const filePath);
+
+    bool needsToRunGC(const bool mindsBlockByGC);
+
+    void getProperty(const char *const query, char *const outResult,
+            const int maxResultLength);
+
+    const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const {
+        return mDictionaryStructureWithBufferPolicy;
+    }
+
+    virtual ~Dictionary();
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Dictionary);
+
+    static const int HEADER_ATTRIBUTE_BUFFER_SIZE;
+
+    DictionaryStructureWithBufferPolicy *const mDictionaryStructureWithBufferPolicy;
+    const BigramDictionary *const mBigramDictionary;
+    const SuggestInterface *const mGestureSuggest;
+    const SuggestInterface *const mTypingSuggest;
+
+    void logDictionaryInfo(JNIEnv *const env) const;
+};
+} // namespace latinime
+#endif // LATINIME_DICTIONARY_H
diff --git a/src/aosp/suggest/core/dictionary/digraph_utils.cpp b/src/aosp/suggest/core/dictionary/digraph_utils.cpp new file mode 100644 index 0000000..3271c1b --- /dev/null +++ b/src/aosp/suggest/core/dictionary/digraph_utils.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/core/dictionary/digraph_utils.h"
+
+#include <cstdlib>
+
+#include "defines.h"
+#include "suggest/core/policy/dictionary_header_structure_policy.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+// Digraph tables; each entry is { first code point, second code point, composite glyph }.
+const DigraphUtils::digraph_t DigraphUtils::GERMAN_UMLAUT_DIGRAPHS[] = {
+    { 'a', 'e', 0x00E4 }, // U+00E4 : LATIN SMALL LETTER A WITH DIAERESIS
+    { 'o', 'e', 0x00F6 }, // U+00F6 : LATIN SMALL LETTER O WITH DIAERESIS
+    { 'u', 'e', 0x00FC }  // U+00FC : LATIN SMALL LETTER U WITH DIAERESIS
+};
+const DigraphUtils::digraph_t DigraphUtils::FRENCH_LIGATURES_DIGRAPHS[] = {
+    { 'a', 'e', 0x00E6 }, // U+00E6 : LATIN SMALL LETTER AE
+    { 'o', 'e', 0x0153 }  // U+0153 : LATIN SMALL LIGATURE OE
+};
+// Digraph types that getDigraphForCodePoint() searches, in this order.
+const DigraphUtils::DigraphType DigraphUtils::USED_DIGRAPH_TYPES[] = {
+    DIGRAPH_TYPE_GERMAN_UMLAUT,
+    DIGRAPH_TYPE_FRENCH_LIGATURES
+};
+
+// Tells whether the digraph type selected by the dictionary header has a digraph for the
+// given composite glyph code point.
+/* static */ bool DigraphUtils::hasDigraphForCodePoint(
+        const DictionaryHeaderStructurePolicy *const headerPolicy,
+        const int compositeGlyphCodePoint) {
+    const DigraphUtils::digraph_t *const match = getDigraphForDigraphTypeAndCodePoint(
+            getDigraphTypeForDictionary(headerPolicy), compositeGlyphCodePoint);
+    return match != 0;
+}
+
+// Returns the digraph type associated with the given dictionary. German umlaut processing
+// takes precedence over French ligature processing when the header requests both.
+/* static */ DigraphUtils::DigraphType DigraphUtils::getDigraphTypeForDictionary(
+        const DictionaryHeaderStructurePolicy *const headerPolicy) {
+    if (headerPolicy->requiresGermanUmlautProcessing()) {
+        return DIGRAPH_TYPE_GERMAN_UMLAUT;
+    }
+    return headerPolicy->requiresFrenchLigatureProcessing()
+            ? DIGRAPH_TYPE_FRENCH_LIGATURES : DIGRAPH_TYPE_NONE;
+}
+
+// Returns the first or the second code point (as selected by digraphCodePointIndex) of the
+// digraph that corresponds to compositeGlyphCodePoint, or NOT_A_CODE_POINT when the index is
+// NOT_A_DIGRAPH_INDEX or no digraph exists for the code point.
+/* static */ int DigraphUtils::getDigraphCodePointForIndex(const int compositeGlyphCodePoint,
+        const DigraphCodePointIndex digraphCodePointIndex) {
+    if (NOT_A_DIGRAPH_INDEX == digraphCodePointIndex) {
+        return NOT_A_CODE_POINT;
+    }
+    const DigraphUtils::digraph_t *const found =
+            DigraphUtils::getDigraphForCodePoint(compositeGlyphCodePoint);
+    if (0 == found) {
+        return NOT_A_CODE_POINT;
+    }
+    switch (digraphCodePointIndex) {
+    case FIRST_DIGRAPH_CODEPOINT:
+        return found->first;
+    case SECOND_DIGRAPH_CODEPOINT:
+        return found->second;
+    default:
+        // Not reachable with the values DigraphCodePointIndex currently defines.
+        ASSERT(false);
+        return NOT_A_CODE_POINT;
+    }
+}
+
+// Retrieves the set of all digraphs associated with the given digraph type.
+// Returns the size of the digraph array, or 0 if none exist (*digraphs is then not written).
+/* static */ int DigraphUtils::getAllDigraphsForDigraphTypeAndReturnSize(
+        const DigraphUtils::DigraphType digraphType,
+        const DigraphUtils::digraph_t **const digraphs) {
+    switch (digraphType) {
+    case DIGRAPH_TYPE_GERMAN_UMLAUT:
+        *digraphs = GERMAN_UMLAUT_DIGRAPHS;
+        return NELEMS(GERMAN_UMLAUT_DIGRAPHS);
+    case DIGRAPH_TYPE_FRENCH_LIGATURES:
+        *digraphs = FRENCH_LIGATURES_DIGRAPHS;
+        return NELEMS(FRENCH_LIGATURES_DIGRAPHS);
+    default:
+        return 0;
+    }
+}
+
+/**
+ * Returns the digraph for the input composite glyph codepoint, or 0 if none exists. Every type
+ * listed in USED_DIGRAPH_TYPES is searched in order and the first match wins.
+ * compositeGlyphCodePoint: the method returns the digraph corresponding to this codepoint.
+ */
+/* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForCodePoint(
+        const int compositeGlyphCodePoint) {
+    const size_t typeCount = NELEMS(USED_DIGRAPH_TYPES);
+    for (size_t typeIndex = 0; typeIndex < typeCount; ++typeIndex) {
+        const DigraphUtils::digraph_t *const candidate = getDigraphForDigraphTypeAndCodePoint(
+                USED_DIGRAPH_TYPES[typeIndex], compositeGlyphCodePoint);
+        if (candidate) {
+            return candidate;
+        }
+    }
+    return 0;
+}
+
+/**
+ * Returns the digraph for the input composite glyph codepoint, or 0 if none exists.
+ * digraphType: the type of digraphs supported.
+ * compositeGlyphCodePoint: the method returns the digraph corresponding to this codepoint.
+ *     It is lower-cased before being compared with the table entries.
+ */
+/* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForDigraphTypeAndCodePoint(
+        const DigraphUtils::DigraphType digraphType, const int compositeGlyphCodePoint) {
+    const DigraphUtils::digraph_t *table = 0;
+    const int lowerCodePoint = CharUtils::toLowerCase(compositeGlyphCodePoint);
+    const int tableSize =
+            DigraphUtils::getAllDigraphsForDigraphTypeAndReturnSize(digraphType, &table);
+    for (int entryIndex = 0; entryIndex < tableSize; ++entryIndex) {
+        if (table[entryIndex].compositeGlyph == lowerCodePoint) {
+            return &table[entryIndex];
+        }
+    }
+    return 0;
+}
+
+} // namespace latinime
diff --git a/src/aosp/suggest/core/dictionary/digraph_utils.h b/src/aosp/suggest/core/dictionary/digraph_utils.h new file mode 100644 index 0000000..6ae16e3 --- /dev/null +++ b/src/aosp/suggest/core/dictionary/digraph_utils.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DIGRAPH_UTILS_H
+#define DIGRAPH_UTILS_H
+
+#include "defines.h"
+
+namespace latinime {
+
+class DictionaryHeaderStructurePolicy;
+
+// Static helpers that map a composite glyph code point to the two code points of its digraph.
+class DigraphUtils {
+ public:
+    // Selects which code point of a digraph is requested.
+    enum DigraphCodePointIndex {
+        NOT_A_DIGRAPH_INDEX,
+        FIRST_DIGRAPH_CODEPOINT,
+        SECOND_DIGRAPH_CODEPOINT
+    };
+
+    // Families of digraphs known to this class.
+    enum DigraphType {
+        DIGRAPH_TYPE_NONE,
+        DIGRAPH_TYPE_GERMAN_UMLAUT,
+        DIGRAPH_TYPE_FRENCH_LIGATURES
+    };
+
+    // One digraph: the two code points and the composite glyph they stand for.
+    struct digraph_t {
+        int first;
+        int second;
+        int compositeGlyph;
+    };
+
+    static bool hasDigraphForCodePoint(const DictionaryHeaderStructurePolicy *const headerPolicy,
+            const int compositeGlyphCodePoint);
+    static int getDigraphCodePointForIndex(const int compositeGlyphCodePoint,
+            const DigraphCodePointIndex digraphCodePointIndex);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DigraphUtils);
+    static DigraphType getDigraphTypeForDictionary(
+            const DictionaryHeaderStructurePolicy *const headerPolicy);
+    static int getAllDigraphsForDigraphTypeAndReturnSize(
+            const DigraphType digraphType, const digraph_t **const digraphs);
+    static const digraph_t *getDigraphForCodePoint(const int compositeGlyphCodePoint);
+    static const digraph_t *getDigraphForDigraphTypeAndCodePoint(
+            const DigraphType digraphType, const int compositeGlyphCodePoint);
+
+    static const digraph_t GERMAN_UMLAUT_DIGRAPHS[];
+    static const digraph_t FRENCH_LIGATURES_DIGRAPHS[];
+    static const DigraphType USED_DIGRAPH_TYPES[];
+};
+} // namespace latinime
+#endif // DIGRAPH_UTILS_H
diff --git a/src/aosp/suggest/core/dictionary/multi_bigram_map.cpp b/src/aosp/suggest/core/dictionary/multi_bigram_map.cpp new file mode 100644 index 0000000..b1d2f4b --- /dev/null +++ b/src/aosp/suggest/core/dictionary/multi_bigram_map.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/core/dictionary/multi_bigram_map.h"
+
+#include <cstddef>
+
+namespace latinime {
+
+// Max number of bigram maps (previous word contexts) to be cached. Increasing this number
+// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory
+// usage. Also, there are diminishing returns since the most frequently used bigrams are
+// typically near the beginning of the input and are thus the first ones to be cached. Note
+// that these bigrams are reset for each new composing word.
+// Once the cache holds this many maps, further contexts are read from the binary dictionary
+// directly instead of being cached (see MultiBigramMap::getBigramProbability()).
+const size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25;
+
+// Most common previous word contexts currently have 100 bigrams.
+// Passed to the hash map constructor of every BigramMap as its initial size argument.
+const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100;
+
+} // namespace latinime
diff --git a/src/aosp/suggest/core/dictionary/multi_bigram_map.h b/src/aosp/suggest/core/dictionary/multi_bigram_map.h new file mode 100644 index 0000000..4633c07 --- /dev/null +++ b/src/aosp/suggest/core/dictionary/multi_bigram_map.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_MULTI_BIGRAM_MAP_H
+#define LATINIME_MULTI_BIGRAM_MAP_H
+
+#include <cstddef>
+
+#include "defines.h"
+#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
+#include "suggest/core/dictionary/bloom_filter.h"
+#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
+#include "utils/hash_map_compat.h"
+
+namespace latinime {
+
+// Class for caching bigram maps for multiple previous word contexts. This is useful since the
+// algorithm needs to look up the set of bigrams for every word pair that occurs in every
+// multi-word suggestion.
+class MultiBigramMap {
+ public:
+    MultiBigramMap() : mBigramMaps() {}
+    ~MultiBigramMap() {}
+
+    // Returns the probability of the word at nextWordPosition following the word at
+    // wordPosition, as combined with unigramProbability by the structure policy. A cached map
+    // is used when there is one; otherwise the bigrams of wordPosition are cached first if the
+    // cache still has room, and read straight from the binary dictionary if it does not.
+    int getBigramProbability(const DictionaryStructureWithBufferPolicy *const structurePolicy,
+            const int wordPosition, const int nextWordPosition, const int unigramProbability) {
+        const hash_map_compat<int, BigramMap>::const_iterator cachedMapIt =
+                mBigramMaps.find(wordPosition);
+        if (cachedMapIt != mBigramMaps.end()) {
+            return cachedMapIt->second.getBigramProbability(structurePolicy, nextWordPosition,
+                    unigramProbability);
+        }
+        if (mBigramMaps.size() >= MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
+            // The cache is full: answer from the binary dictionary without caching.
+            return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition,
+                    nextWordPosition, unigramProbability);
+        }
+        addBigramsForWordPosition(structurePolicy, wordPosition);
+        return mBigramMaps[wordPosition].getBigramProbability(structurePolicy,
+                nextWordPosition, unigramProbability);
+    }
+
+    // Drops every cached bigram map.
+    void clear() {
+        mBigramMaps.clear();
+    }
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(MultiBigramMap);
+
+    // Bigrams of a single previous word: a hash map from bigram position to probability plus
+    // a bloom filter over the same positions that is consulted before the map.
+    class BigramMap {
+     public:
+        BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP), mBloomFilter() {}
+        ~BigramMap() {}
+
+        // Loads every bigram of the PtNode at nodePos. Entries whose bigram position is
+        // NOT_A_DICT_POS are skipped.
+        void init(const DictionaryStructureWithBufferPolicy *const structurePolicy,
+                const int nodePos) {
+            const int listPos = structurePolicy->getBigramsPositionOfPtNode(nodePos);
+            BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
+                    listPos);
+            while (bigramsIt.hasNext()) {
+                bigramsIt.next();
+                const int bigramPos = bigramsIt.getBigramPos();
+                if (NOT_A_DICT_POS != bigramPos) {
+                    mBigramMap[bigramPos] = bigramsIt.getProbability();
+                    mBloomFilter.setInFilter(bigramPos);
+                }
+            }
+        }
+
+        // Looks up the bigram probability stored for nextWordPosition (NOT_A_PROBABILITY when
+        // there is none) and returns it combined with unigramProbability by the policy.
+        AK_FORCE_INLINE int getBigramProbability(
+                const DictionaryStructureWithBufferPolicy *const structurePolicy,
+                const int nextWordPosition, const int unigramProbability) const {
+            if (!mBloomFilter.isInFilter(nextWordPosition)) {
+                // The filter reports a miss, so the hash map lookup is skipped.
+                return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);
+            }
+            const hash_map_compat<int, int>::const_iterator entryIt =
+                    mBigramMap.find(nextWordPosition);
+            const int bigramProbability =
+                    (entryIt != mBigramMap.end()) ? entryIt->second : NOT_A_PROBABILITY;
+            return structurePolicy->getProbability(unigramProbability, bigramProbability);
+        }
+
+     private:
+        // NOTE: The BigramMap class doesn't use DISALLOW_COPY_AND_ASSIGN() because its default
+        // copy constructor is needed for use in hash_map.
+        static const int DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP;
+        hash_map_compat<int, int> mBigramMap;
+        BloomFilter mBloomFilter;
+    };
+
+    // Builds and caches the BigramMap for position (operator[] creates the entry).
+    AK_FORCE_INLINE void addBigramsForWordPosition(
+            const DictionaryStructureWithBufferPolicy *const structurePolicy, const int position) {
+        mBigramMaps[position].init(structurePolicy, position);
+    }
+
+    // Uncached lookup: walks the bigram list of the PtNode at nodePos until an entry for
+    // nextWordPosition is found, then combines its probability with unigramProbability.
+    AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary(
+            const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos,
+            const int nextWordPosition, const int unigramProbability) {
+        const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos);
+        BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
+                bigramsListPos);
+        while (bigramsIt.hasNext()) {
+            bigramsIt.next();
+            if (bigramsIt.getBigramPos() == nextWordPosition) {
+                return structurePolicy->getProbability(unigramProbability,
+                        bigramsIt.getProbability());
+            }
+        }
+        return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);
+    }
+
+    static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP;
+    hash_map_compat<int, BigramMap> mBigramMaps;
+};
+} // namespace latinime
+#endif // LATINIME_MULTI_BIGRAM_MAP_H
diff --git a/src/aosp/suggest/core/dictionary/shortcut_utils.h b/src/aosp/suggest/core/dictionary/shortcut_utils.h new file mode 100644 index 0000000..9ccef02 --- /dev/null +++ b/src/aosp/suggest/core/dictionary/shortcut_utils.h
@@ -0,0 +1,64 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SHORTCUT_UTILS +#define LATINIME_SHORTCUT_UTILS + +#include "defines.h" +#include "suggest/core/dicnode/dic_node_utils.h" +#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" + +namespace latinime { + +class ShortcutUtils { + public: + static int outputShortcuts(BinaryDictionaryShortcutIterator *const shortcutIt, + int outputWordIndex, const int finalScore, int *const outputCodePoints, + int *const frequencies, int *const outputTypes, const bool sameAsTyped) { + int shortcutTarget[MAX_WORD_LENGTH]; + while (shortcutIt->hasNextShortcutTarget() && outputWordIndex < MAX_RESULTS) { + bool isWhilelist; + int shortcutTargetStringLength; + shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget, + &shortcutTargetStringLength, &isWhilelist); + int shortcutScore; + int kind; + if (isWhilelist && sameAsTyped) { + shortcutScore = S_INT_MAX; + kind = Dictionary::KIND_WHITELIST; + } else { + // shortcut entry's score == its base entry's score - 1 + shortcutScore = finalScore; + // Protection against int underflow + shortcutScore = max(S_INT_MIN + 1, shortcutScore) - 1; + kind = Dictionary::KIND_SHORTCUT; + } + outputTypes[outputWordIndex] = kind; + frequencies[outputWordIndex] = shortcutScore; + frequencies[outputWordIndex] = max(S_INT_MIN + 1, shortcutScore) - 1; + const int startIndex2 = 
outputWordIndex * MAX_WORD_LENGTH; + DicNodeUtils::appendTwoWords(0, 0, shortcutTarget, shortcutTargetStringLength, + &outputCodePoints[startIndex2]); + ++outputWordIndex; + } + return outputWordIndex; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutUtils); +}; +} // namespace latinime +#endif // LATINIME_SHORTCUT_UTILS
diff --git a/src/aosp/suggest/core/layout/additional_proximity_chars.cpp b/src/aosp/suggest/core/layout/additional_proximity_chars.cpp new file mode 100644 index 0000000..34b8b37 --- /dev/null +++ b/src/aosp/suggest/core/layout/additional_proximity_chars.cpp
@@ -0,0 +1,43 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/layout/additional_proximity_chars.h" + +namespace latinime { +// TODO: Stop using hardcoded additional proximity characters. +// TODO: Have proximity character informations in each language's binary dictionary. +const char *AdditionalProximityChars::LOCALE_EN_US = "en"; + +const int AdditionalProximityChars::EN_US_ADDITIONAL_A[EN_US_ADDITIONAL_A_SIZE] = { + 'e', 'i', 'o', 'u' +}; + +const int AdditionalProximityChars::EN_US_ADDITIONAL_E[EN_US_ADDITIONAL_E_SIZE] = { + 'a', 'i', 'o', 'u' +}; + +const int AdditionalProximityChars::EN_US_ADDITIONAL_I[EN_US_ADDITIONAL_I_SIZE] = { + 'a', 'e', 'o', 'u' +}; + +const int AdditionalProximityChars::EN_US_ADDITIONAL_O[EN_US_ADDITIONAL_O_SIZE] = { + 'a', 'e', 'i', 'u' +}; + +const int AdditionalProximityChars::EN_US_ADDITIONAL_U[EN_US_ADDITIONAL_U_SIZE] = { + 'a', 'e', 'i', 'o' +}; +} // namespace latinime
diff --git a/src/aosp/suggest/core/layout/additional_proximity_chars.h b/src/aosp/suggest/core/layout/additional_proximity_chars.h new file mode 100644 index 0000000..a88fd6c --- /dev/null +++ b/src/aosp/suggest/core/layout/additional_proximity_chars.h
@@ -0,0 +1,89 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_ADDITIONAL_PROXIMITY_CHARS_H +#define LATINIME_ADDITIONAL_PROXIMITY_CHARS_H + +#include <cstring> + +#include "defines.h" + +namespace latinime { + +class AdditionalProximityChars { + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(AdditionalProximityChars); + static const char *LOCALE_EN_US; + static const int EN_US_ADDITIONAL_A_SIZE = 4; + static const int EN_US_ADDITIONAL_A[]; + static const int EN_US_ADDITIONAL_E_SIZE = 4; + static const int EN_US_ADDITIONAL_E[]; + static const int EN_US_ADDITIONAL_I_SIZE = 4; + static const int EN_US_ADDITIONAL_I[]; + static const int EN_US_ADDITIONAL_O_SIZE = 4; + static const int EN_US_ADDITIONAL_O[]; + static const int EN_US_ADDITIONAL_U_SIZE = 4; + static const int EN_US_ADDITIONAL_U[]; + + AK_FORCE_INLINE static bool isEnLocale(const char *localeStr) { + const size_t LOCALE_EN_US_SIZE = strlen(LOCALE_EN_US); + return localeStr && strlen(localeStr) >= LOCALE_EN_US_SIZE + && strncmp(localeStr, LOCALE_EN_US, LOCALE_EN_US_SIZE) == 0; + } + + public: + static int getAdditionalCharsSize(const char *const localeStr, const int c) { + if (!isEnLocale(localeStr)) { + return 0; + } + switch (c) { + case 'a': + return EN_US_ADDITIONAL_A_SIZE; + case 'e': + return EN_US_ADDITIONAL_E_SIZE; + case 'i': + return EN_US_ADDITIONAL_I_SIZE; + case 'o': + return EN_US_ADDITIONAL_O_SIZE; + case 
'u': + return EN_US_ADDITIONAL_U_SIZE; + default: + return 0; + } + } + + static const int *getAdditionalChars(const char *const localeStr, const int c) { + if (!isEnLocale(localeStr)) { + return 0; + } + switch (c) { + case 'a': + return EN_US_ADDITIONAL_A; + case 'e': + return EN_US_ADDITIONAL_E; + case 'i': + return EN_US_ADDITIONAL_I; + case 'o': + return EN_US_ADDITIONAL_O; + case 'u': + return EN_US_ADDITIONAL_U; + default: + return 0; + } + } +}; +} // namespace latinime +#endif // LATINIME_ADDITIONAL_PROXIMITY_CHARS_H
diff --git a/src/aosp/suggest/core/layout/geometry_utils.h b/src/aosp/suggest/core/layout/geometry_utils.h new file mode 100644 index 0000000..b667df6 --- /dev/null +++ b/src/aosp/suggest/core/layout/geometry_utils.h
@@ -0,0 +1,59 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_GEOMETRY_UTILS_H +#define LATINIME_GEOMETRY_UTILS_H + +#include <cmath> + +#include "defines.h" + +#define ROUND_FLOAT_10000(f) ((f) < 1000.0f && (f) > 0.001f) \ + ? (floorf((f) * 10000.0f) / 10000.0f) : (f) + +namespace latinime { + +class GeometryUtils { + public: + static inline float SQUARE_FLOAT(const float x) { return x * x; } + + static AK_FORCE_INLINE float getAngle(const int x1, const int y1, const int x2, const int y2) { + const int dx = x1 - x2; + const int dy = y1 - y2; + if (dx == 0 && dy == 0) return 0.0f; + return atan2f(static_cast<float>(dy), static_cast<float>(dx)); + } + + static AK_FORCE_INLINE float getAngleDiff(const float a1, const float a2) { + const float deltaA = fabsf(a1 - a2); + const float diff = ROUND_FLOAT_10000(deltaA); + if (diff > M_PI_F) { + const float normalizedDiff = 2.0f * M_PI_F - diff; + return ROUND_FLOAT_10000(normalizedDiff); + } + return diff; + } + + static AK_FORCE_INLINE int getDistanceInt(const int x1, const int y1, const int x2, + const int y2) { + return static_cast<int>(hypotf(static_cast<float>(x1 - x2), static_cast<float>(y1 - y2))); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(GeometryUtils); +}; +} // namespace latinime +#endif // LATINIME_GEOMETRY_UTILS_H
diff --git a/src/aosp/suggest/core/layout/proximity_info.cpp b/src/aosp/suggest/core/layout/proximity_info.cpp new file mode 100644 index 0000000..897de1c --- /dev/null +++ b/src/aosp/suggest/core/layout/proximity_info.cpp
@@ -0,0 +1,266 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "LatinIME: proximity_info.cpp" + +#include "suggest/core/layout/proximity_info.h" + +#include <cstring> +#include <cmath> + +#include "defines.h" +#include "jni.h" +#include "suggest/core/layout/additional_proximity_chars.h" +#include "suggest/core/layout/geometry_utils.h" +#include "suggest/core/layout/proximity_info_params.h" +#include "utils/char_utils.h" + +namespace latinime { + +static AK_FORCE_INLINE void safeGetOrFillZeroIntArrayRegion(JNIEnv *env, jintArray jArray, + jsize len, jint *buffer) { + if (jArray && buffer) { + env->GetIntArrayRegion(jArray, 0, len, buffer); + } else if (buffer) { + memset(buffer, 0, len * sizeof(buffer[0])); + } +} + +static AK_FORCE_INLINE void safeGetOrFillZeroFloatArrayRegion(JNIEnv *env, jfloatArray jArray, + jsize len, jfloat *buffer) { + if (jArray && buffer) { + env->GetFloatArrayRegion(jArray, 0, len, buffer); + } else if (buffer) { + memset(buffer, 0, len * sizeof(buffer[0])); + } +} + +ProximityInfo::ProximityInfo(JNIEnv *env, const jstring localeJStr, + const int keyboardWidth, const int keyboardHeight, const int gridWidth, + const int gridHeight, const int mostCommonKeyWidth, const int mostCommonKeyHeight, + const jintArray proximityChars, const int keyCount, const jintArray keyXCoordinates, + const jintArray keyYCoordinates, const jintArray keyWidths, const 
jintArray keyHeights, + const jintArray keyCharCodes, const jfloatArray sweetSpotCenterXs, + const jfloatArray sweetSpotCenterYs, const jfloatArray sweetSpotRadii) + : GRID_WIDTH(gridWidth), GRID_HEIGHT(gridHeight), MOST_COMMON_KEY_WIDTH(mostCommonKeyWidth), + MOST_COMMON_KEY_WIDTH_SQUARE(mostCommonKeyWidth * mostCommonKeyWidth), + MOST_COMMON_KEY_HEIGHT(mostCommonKeyHeight), + NORMALIZED_SQUARED_MOST_COMMON_KEY_HYPOTENUSE(1.0f + + GeometryUtils::SQUARE_FLOAT(static_cast<float>(mostCommonKeyHeight) / + static_cast<float>(mostCommonKeyWidth))), + CELL_WIDTH((keyboardWidth + gridWidth - 1) / gridWidth), + CELL_HEIGHT((keyboardHeight + gridHeight - 1) / gridHeight), + KEY_COUNT(min(keyCount, MAX_KEY_COUNT_IN_A_KEYBOARD)), + KEYBOARD_WIDTH(keyboardWidth), KEYBOARD_HEIGHT(keyboardHeight), + KEYBOARD_HYPOTENUSE(hypotf(KEYBOARD_WIDTH, KEYBOARD_HEIGHT)), + HAS_TOUCH_POSITION_CORRECTION_DATA(keyCount > 0 && keyXCoordinates && keyYCoordinates + && keyWidths && keyHeights && keyCharCodes && sweetSpotCenterXs + && sweetSpotCenterYs && sweetSpotRadii), + mProximityCharsArray(new int[GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE + /* proximityCharsLength */]), + mCodeToKeyMap() { + /* Let's check the input array length here to make sure */ + const jsize proximityCharsLength = env->GetArrayLength(proximityChars); + if (proximityCharsLength != GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE) { + AKLOGE("Invalid proximityCharsLength: %d", proximityCharsLength); + ASSERT(false); + return; + } + + + if (DEBUG_PROXIMITY_INFO) { + AKLOGI("Create proximity info array %d", proximityCharsLength); + } + const jsize localeCStrUtf8Length = env->GetStringUTFLength(localeJStr); + if (localeCStrUtf8Length >= MAX_LOCALE_STRING_LENGTH) { + AKLOGI("Locale string length too long: length=%d", localeCStrUtf8Length); + ASSERT(false); + } + memset(mLocaleStr, 0, sizeof(mLocaleStr)); + env->GetStringUTFRegion(localeJStr, 0, env->GetStringLength(localeJStr), mLocaleStr); + 
safeGetOrFillZeroIntArrayRegion(env, proximityChars, proximityCharsLength, + mProximityCharsArray); + + for (int x=0; x<GRID_WIDTH; ++x) { + for (int y=0; y<GRID_HEIGHT; ++y) { + AKLOGI("(%d, %d) = %c %c %c %c %c %c %c %c", x, y, + (char)mProximityCharsArray[((y * GRID_WIDTH) + x) * MAX_PROXIMITY_CHARS_SIZE + 0], + (char)mProximityCharsArray[((y * GRID_WIDTH) + x) * MAX_PROXIMITY_CHARS_SIZE + 1], + (char)mProximityCharsArray[((y * GRID_WIDTH) + x) * MAX_PROXIMITY_CHARS_SIZE + 2], + (char)mProximityCharsArray[((y * GRID_WIDTH) + x) * MAX_PROXIMITY_CHARS_SIZE + 3], + (char)mProximityCharsArray[((y * GRID_WIDTH) + x) * MAX_PROXIMITY_CHARS_SIZE + 4], + (char)mProximityCharsArray[((y * GRID_WIDTH) + x) * MAX_PROXIMITY_CHARS_SIZE + 5], + (char)mProximityCharsArray[((y * GRID_WIDTH) + x) * MAX_PROXIMITY_CHARS_SIZE + 6], + (char)mProximityCharsArray[((y * GRID_WIDTH) + x) * MAX_PROXIMITY_CHARS_SIZE + 7]); + } + } + safeGetOrFillZeroIntArrayRegion(env, keyXCoordinates, KEY_COUNT, mKeyXCoordinates); + safeGetOrFillZeroIntArrayRegion(env, keyYCoordinates, KEY_COUNT, mKeyYCoordinates); + safeGetOrFillZeroIntArrayRegion(env, keyWidths, KEY_COUNT, mKeyWidths); + safeGetOrFillZeroIntArrayRegion(env, keyHeights, KEY_COUNT, mKeyHeights); + safeGetOrFillZeroIntArrayRegion(env, keyCharCodes, KEY_COUNT, mKeyCodePoints); + safeGetOrFillZeroFloatArrayRegion(env, sweetSpotCenterXs, KEY_COUNT, mSweetSpotCenterXs); + safeGetOrFillZeroFloatArrayRegion(env, sweetSpotCenterYs, KEY_COUNT, mSweetSpotCenterYs); + safeGetOrFillZeroFloatArrayRegion(env, sweetSpotRadii, KEY_COUNT, mSweetSpotRadii); + initializeG(); +} + +ProximityInfo::~ProximityInfo() { + delete[] mProximityCharsArray; +} + +bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { + if (x < 0 || y < 0) { + if (DEBUG_DICT) { + AKLOGI("HasSpaceProximity: Illegal coordinates (%d, %d)", x, y); + // TODO: Enable this assertion. 
+ //ASSERT(false); + } + return false; + } + + const int startIndex = ProximityInfoUtils::getStartIndexFromCoordinates(x, y, + CELL_HEIGHT, CELL_WIDTH, GRID_WIDTH); + if (DEBUG_PROXIMITY_INFO) { + AKLOGI("hasSpaceProximity: index %d, %d, %d", startIndex, x, y); + } + int *proximityCharsArray = mProximityCharsArray; + for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { + if (DEBUG_PROXIMITY_INFO) { + AKLOGI("Index: %d", mProximityCharsArray[startIndex + i]); + } + if (proximityCharsArray[startIndex + i] == KEYCODE_SPACE) { + return true; + } + } + return false; +} + +float ProximityInfo::getNormalizedSquaredDistanceFromCenterFloatG( + const int keyId, const int x, const int y, const bool isGeometric) const { + const float centerX = static_cast<float>(getKeyCenterXOfKeyIdG(keyId, x, isGeometric)); + const float centerY = static_cast<float>(getKeyCenterYOfKeyIdG(keyId, y, isGeometric)); + const float touchX = static_cast<float>(x); + const float touchY = static_cast<float>(y); + return ProximityInfoUtils::getSquaredDistanceFloat(centerX, centerY, touchX, touchY) + / GeometryUtils::SQUARE_FLOAT(static_cast<float>(getMostCommonKeyWidth())); +} + +int ProximityInfo::getCodePointOf(const int keyIndex) const { + if (keyIndex < 0 || keyIndex >= KEY_COUNT) { + return NOT_A_CODE_POINT; + } + return mKeyIndexToCodePointG[keyIndex]; +} + +void ProximityInfo::initializeG() { + // TODO: Optimize + for (int i = 0; i < KEY_COUNT; ++i) { + const int code = mKeyCodePoints[i]; + const int lowerCode = CharUtils::toLowerCase(code); + mCenterXsG[i] = mKeyXCoordinates[i] + mKeyWidths[i] / 2; + mCenterYsG[i] = mKeyYCoordinates[i] + mKeyHeights[i] / 2; + if (hasTouchPositionCorrectionData()) { + // Computes sweet spot center points for geometric input. 
+ const float verticalScale = ProximityInfoParams::VERTICAL_SWEET_SPOT_SCALE_G; + const float sweetSpotCenterY = static_cast<float>(mSweetSpotCenterYs[i]); + const float gapY = sweetSpotCenterY - mCenterYsG[i]; + mSweetSpotCenterYsG[i] = static_cast<int>(mCenterYsG[i] + gapY * verticalScale); + } + mCodeToKeyMap[lowerCode] = i; + mKeyIndexToCodePointG[i] = lowerCode; + } + for (int i = 0; i < KEY_COUNT; i++) { + mKeyKeyDistancesG[i][i] = 0; + for (int j = i + 1; j < KEY_COUNT; j++) { + if (hasTouchPositionCorrectionData()) { + // Computes distances using sweet spots if they exist. + // We have two types of Y coordinate sweet spots, for geometric and for the others. + // The sweet spots for geometric input are used for calculating key-key distances + // here. + mKeyKeyDistancesG[i][j] = GeometryUtils::getDistanceInt( + mSweetSpotCenterXs[i], mSweetSpotCenterYsG[i], + mSweetSpotCenterXs[j], mSweetSpotCenterYsG[j]); + } else { + mKeyKeyDistancesG[i][j] = GeometryUtils::getDistanceInt( + mCenterXsG[i], mCenterYsG[i], mCenterXsG[j], mCenterYsG[j]); + } + mKeyKeyDistancesG[j][i] = mKeyKeyDistancesG[i][j]; + } + } +} + +// referencePointX is used only for keys wider than most common key width. When the referencePointX +// is NOT_A_COORDINATE, this method calculates the return value without using the line segment. +// isGeometric is currently not used because we don't have extra X coordinates sweet spots for +// geometric input. +int ProximityInfo::getKeyCenterXOfKeyIdG( + const int keyId, const int referencePointX, const bool isGeometric) const { + if (keyId < 0) { + return 0; + } + int centerX = (hasTouchPositionCorrectionData()) ? 
static_cast<int>(mSweetSpotCenterXs[keyId]) + : mCenterXsG[keyId]; + const int keyWidth = mKeyWidths[keyId]; + if (referencePointX != NOT_A_COORDINATE + && keyWidth > getMostCommonKeyWidth()) { + // For keys wider than most common keys, we use a line segment instead of the center point; + // thus, centerX is adjusted depending on referencePointX. + const int keyWidthHalfDiff = (keyWidth - getMostCommonKeyWidth()) / 2; + if (referencePointX < centerX - keyWidthHalfDiff) { + centerX -= keyWidthHalfDiff; + } else if (referencePointX > centerX + keyWidthHalfDiff) { + centerX += keyWidthHalfDiff; + } else { + centerX = referencePointX; + } + } + return centerX; +} + +// When the referencePointY is NOT_A_COORDINATE, this method calculates the return value without +// using the line segment. +int ProximityInfo::getKeyCenterYOfKeyIdG( + const int keyId, const int referencePointY, const bool isGeometric) const { + // TODO: Remove "isGeometric" and have separate "proximity_info"s for gesture and typing. + if (keyId < 0) { + return 0; + } + int centerY; + if (!hasTouchPositionCorrectionData()) { + centerY = mCenterYsG[keyId]; + } else if (isGeometric) { + centerY = static_cast<int>(mSweetSpotCenterYsG[keyId]); + } else { + centerY = static_cast<int>(mSweetSpotCenterYs[keyId]); + } + if (referencePointY != NOT_A_COORDINATE && + centerY + mKeyHeights[keyId] > KEYBOARD_HEIGHT && centerY < referencePointY) { + // When the distance between center point and bottom edge of the keyboard is shorter than + // the key height, we assume the key is located at the bottom row of the keyboard. + // The center point is extended to the bottom edge for such keys. + return referencePointY; + } + return centerY; +} + +int ProximityInfo::getKeyKeyDistanceG(const int keyId0, const int keyId1) const { + if (keyId0 >= 0 && keyId1 >= 0) { + return mKeyKeyDistancesG[keyId0][keyId1]; + } + return MAX_VALUE_FOR_WEIGHTING; +} +} // namespace latinime
diff --git a/src/aosp/suggest/core/layout/proximity_info.h b/src/aosp/suggest/core/layout/proximity_info.h new file mode 100644 index 0000000..f259490 --- /dev/null +++ b/src/aosp/suggest/core/layout/proximity_info.h
@@ -0,0 +1,129 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROXIMITY_INFO_H +#define LATINIME_PROXIMITY_INFO_H + +#include "defines.h" +#include "jni.h" +#include "suggest/core/layout/proximity_info_utils.h" +#include "utils/hash_map_compat.h" + +namespace latinime { + +class ProximityInfo { + public: + ProximityInfo(JNIEnv *env, const jstring localeJStr, + const int keyboardWidth, const int keyboardHeight, const int gridWidth, + const int gridHeight, const int mostCommonKeyWidth, const int mostCommonKeyHeight, + const jintArray proximityChars, const int keyCount, const jintArray keyXCoordinates, + const jintArray keyYCoordinates, const jintArray keyWidths, const jintArray keyHeights, + const jintArray keyCharCodes, const jfloatArray sweetSpotCenterXs, + const jfloatArray sweetSpotCenterYs, const jfloatArray sweetSpotRadii); + ~ProximityInfo(); + bool hasSpaceProximity(const int x, const int y) const; + int getNormalizedSquaredDistance(const int inputIndex, const int proximityIndex) const; + float getNormalizedSquaredDistanceFromCenterFloatG( + const int keyId, const int x, const int y, const bool isGeometric) const; + int getCodePointOf(const int keyIndex) const; + bool hasSweetSpotData(const int keyIndex) const { + // When there are no calibration data for a key, + // the radius of the key is assigned to zero. 
+ return mSweetSpotRadii[keyIndex] > 0.0f; + } + float getSweetSpotRadiiAt(int keyIndex) const { return mSweetSpotRadii[keyIndex]; } + float getSweetSpotCenterXAt(int keyIndex) const { return mSweetSpotCenterXs[keyIndex]; } + float getSweetSpotCenterYAt(int keyIndex) const { return mSweetSpotCenterYs[keyIndex]; } + void calculateNearbyKeyCodes( + const int x, const int y, const int primaryKey, int *inputCodes) const; + bool hasTouchPositionCorrectionData() const { return HAS_TOUCH_POSITION_CORRECTION_DATA; } + int getMostCommonKeyWidth() const { return MOST_COMMON_KEY_WIDTH; } + int getMostCommonKeyWidthSquare() const { return MOST_COMMON_KEY_WIDTH_SQUARE; } + float getNormalizedSquaredMostCommonKeyHypotenuse() const { + return NORMALIZED_SQUARED_MOST_COMMON_KEY_HYPOTENUSE; + } + int getKeyCount() const { return KEY_COUNT; } + int getCellHeight() const { return CELL_HEIGHT; } + int getCellWidth() const { return CELL_WIDTH; } + int getGridWidth() const { return GRID_WIDTH; } + int getGridHeight() const { return GRID_HEIGHT; } + int getKeyboardWidth() const { return KEYBOARD_WIDTH; } + int getKeyboardHeight() const { return KEYBOARD_HEIGHT; } + float getKeyboardHypotenuse() const { return KEYBOARD_HYPOTENUSE; } + + int getKeyCenterXOfKeyIdG( + const int keyId, const int referencePointX, const bool isGeometric) const; + int getKeyCenterYOfKeyIdG( + const int keyId, const int referencePointY, const bool isGeometric) const; + int getKeyKeyDistanceG(int keyId0, int keyId1) const; + + AK_FORCE_INLINE void initializeProximities(const int *const inputCodes, + const int *const inputXCoordinates, const int *const inputYCoordinates, + const int inputSize, int *allInputCodes) const { + ProximityInfoUtils::initializeProximities(inputCodes, inputXCoordinates, inputYCoordinates, + inputSize, mKeyXCoordinates, mKeyYCoordinates, mKeyWidths, mKeyHeights, + mProximityCharsArray, CELL_HEIGHT, CELL_WIDTH, GRID_WIDTH, MOST_COMMON_KEY_WIDTH, + KEY_COUNT, mLocaleStr, &mCodeToKeyMap, 
allInputCodes); + } + + AK_FORCE_INLINE int getKeyIndexOf(const int c) const { + return ProximityInfoUtils::getKeyIndexOf(KEY_COUNT, c, &mCodeToKeyMap); + } + + AK_FORCE_INLINE bool isCodePointOnKeyboard(const int codePoint) const { + return getKeyIndexOf(codePoint) != NOT_AN_INDEX; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProximityInfo); + + void initializeG(); + + const int GRID_WIDTH; + const int GRID_HEIGHT; + const int MOST_COMMON_KEY_WIDTH; + const int MOST_COMMON_KEY_WIDTH_SQUARE; + const int MOST_COMMON_KEY_HEIGHT; + const float NORMALIZED_SQUARED_MOST_COMMON_KEY_HYPOTENUSE; + const int CELL_WIDTH; + const int CELL_HEIGHT; + const int KEY_COUNT; + const int KEYBOARD_WIDTH; + const int KEYBOARD_HEIGHT; + const float KEYBOARD_HYPOTENUSE; + const bool HAS_TOUCH_POSITION_CORRECTION_DATA; + char mLocaleStr[MAX_LOCALE_STRING_LENGTH]; + int *mProximityCharsArray; + int mKeyXCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyYCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyWidths[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyHeights[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyCodePoints[MAX_KEY_COUNT_IN_A_KEYBOARD]; + float mSweetSpotCenterXs[MAX_KEY_COUNT_IN_A_KEYBOARD]; + float mSweetSpotCenterYs[MAX_KEY_COUNT_IN_A_KEYBOARD]; + // Sweet spots for geometric input. Note that we have extra sweet spots only for Y coordinates. + float mSweetSpotCenterYsG[MAX_KEY_COUNT_IN_A_KEYBOARD]; + float mSweetSpotRadii[MAX_KEY_COUNT_IN_A_KEYBOARD]; + hash_map_compat<int, int> mCodeToKeyMap; + + int mKeyIndexToCodePointG[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mCenterXsG[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mCenterYsG[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyKeyDistancesG[MAX_KEY_COUNT_IN_A_KEYBOARD][MAX_KEY_COUNT_IN_A_KEYBOARD]; + // TODO: move to correction.h +}; +} // namespace latinime +#endif // LATINIME_PROXIMITY_INFO_H
diff --git a/src/aosp/suggest/core/layout/proximity_info_params.cpp b/src/aosp/suggest/core/layout/proximity_info_params.cpp new file mode 100644 index 0000000..49df103 --- /dev/null +++ b/src/aosp/suggest/core/layout/proximity_info_params.cpp
@@ -0,0 +1,104 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "defines.h" +#include "suggest/core/layout/proximity_info_params.h" + +namespace latinime { +const float ProximityInfoParams::NOT_A_DISTANCE_FLOAT = -1.0f; +const int ProximityInfoParams::MIN_DOUBLE_LETTER_BEELINE_SPEED_PERCENTILE = 5; +const float ProximityInfoParams::VERTICAL_SWEET_SPOT_SCALE = 1.0f; +const float ProximityInfoParams::VERTICAL_SWEET_SPOT_SCALE_G = 0.5f; + +/* Per method constants */ +// Used by ProximityInfoStateUtils::initGeometricDistanceInfos() +const float ProximityInfoParams::NEAR_KEY_NORMALIZED_SQUARED_THRESHOLD = 4.0f; + +// Used by ProximityInfoStateUtils::updateNearKeysDistances() +const float ProximityInfoParams::NEAR_KEY_THRESHOLD_FOR_DISTANCE = 2.0f; + +// Used by ProximityInfoStateUtils::isPrevLocalMin() +const float ProximityInfoParams::MARGIN_FOR_PREV_LOCAL_MIN = 0.01f; + +// Used by ProximityInfoStateUtils::getPointScore() +const int ProximityInfoParams::DISTANCE_BASE_SCALE = 100; +const float ProximityInfoParams::NEAR_KEY_THRESHOLD_FOR_POINT_SCORE = 0.6f; +const int ProximityInfoParams::CORNER_CHECK_DISTANCE_THRESHOLD_SCALE = 25; +const float ProximityInfoParams::NOT_LOCALMIN_DISTANCE_SCORE = -1.0f; +const float ProximityInfoParams::LOCALMIN_DISTANCE_AND_NEAR_TO_KEY_SCORE = 1.0f; +const float ProximityInfoParams::CORNER_ANGLE_THRESHOLD_FOR_POINT_SCORE = M_PI_F * 2.0f / 3.0f; +const 
float ProximityInfoParams::CORNER_SUM_ANGLE_THRESHOLD = M_PI_F / 4.0f; +const float ProximityInfoParams::CORNER_SCORE = 1.0f; + +// Used by ProximityInfoStateUtils::refreshSpeedRates() +const int ProximityInfoParams::NUM_POINTS_FOR_SPEED_CALCULATION = 2; + +// Used by ProximityInfoStateUtils::pushTouchPoint() +const int ProximityInfoParams::LAST_POINT_SKIP_DISTANCE_SCALE = 4; + +// Used by ProximityInfoStateUtils::updateAlignPointProbabilities() +const float ProximityInfoParams::MIN_PROBABILITY = 0.000001f; +const float ProximityInfoParams::MAX_SKIP_PROBABILITY = 0.95f; +const float ProximityInfoParams::SKIP_FIRST_POINT_PROBABILITY = 0.01f; +const float ProximityInfoParams::SKIP_LAST_POINT_PROBABILITY = 0.1f; +const float ProximityInfoParams::MIN_SPEED_RATE_FOR_SKIP_PROBABILITY = 0.15f; +const float ProximityInfoParams::SPEED_WEIGHT_FOR_SKIP_PROBABILITY = 0.9f; +const float ProximityInfoParams::SLOW_STRAIGHT_WEIGHT_FOR_SKIP_PROBABILITY = 0.6f; +const float ProximityInfoParams::NEAREST_DISTANCE_WEIGHT = 0.5f; +const float ProximityInfoParams::NEAREST_DISTANCE_BIAS = 0.5f; +const float ProximityInfoParams::NEAREST_DISTANCE_WEIGHT_FOR_LAST = 0.6f; +const float ProximityInfoParams::NEAREST_DISTANCE_BIAS_FOR_LAST = 0.4f; +const float ProximityInfoParams::ANGLE_WEIGHT = 0.90f; +const float ProximityInfoParams::DEEP_CORNER_ANGLE_THRESHOLD = M_PI_F * 60.0f / 180.0f; +const float ProximityInfoParams::SKIP_DEEP_CORNER_PROBABILITY = 0.1f; +const float ProximityInfoParams::CORNER_ANGLE_THRESHOLD = M_PI_F * 30.0f / 180.0f; +const float ProximityInfoParams::STRAIGHT_ANGLE_THRESHOLD = M_PI_F * 15.0f / 180.0f; +const float ProximityInfoParams::SKIP_CORNER_PROBABILITY = 0.4f; +const float ProximityInfoParams::SPEED_MARGIN = 0.1f; +const float ProximityInfoParams::CENTER_VALUE_OF_NORMALIZED_DISTRIBUTION = 0.0f; +// TODO: The variance is critical for accuracy; thus, adjusting these parameters by machine +// learning or something would be efficient. 
+const float ProximityInfoParams::SPEEDxANGLE_WEIGHT_FOR_STANDARD_DEVIATION = 0.3f; +const float ProximityInfoParams::MAX_SPEEDxANGLE_RATE_FOR_STANDARD_DEVIATION = 0.25f; +const float ProximityInfoParams::SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DEVIATION = 0.5f; +const float ProximityInfoParams::MAX_SPEEDxNEAREST_RATE_FOR_STANDARD_DEVIATION = 0.15f; +const float ProximityInfoParams::MIN_STANDARD_DEVIATION = 0.37f; +const float ProximityInfoParams::PREV_DISTANCE_WEIGHT = 0.5f; +const float ProximityInfoParams::NEXT_DISTANCE_WEIGHT = 0.6f; + +// Used by ProximityInfoStateUtils::suppressCharProbabilities() +const float ProximityInfoParams::SUPPRESSION_LENGTH_WEIGHT = 1.5f; +const float ProximityInfoParams::MIN_SUPPRESSION_RATE = 0.1f; +const float ProximityInfoParams::SUPPRESSION_WEIGHT = 0.5f; +const float ProximityInfoParams::SUPPRESSION_WEIGHT_FOR_PROBABILITY_GAIN = 0.1f; +const float ProximityInfoParams::SKIP_PROBABALITY_WEIGHT_FOR_PROBABILITY_GAIN = 0.3f; + +// Used by ProximityInfoStateUtils::getMostProbableString() +const float ProximityInfoParams::DEMOTION_LOG_PROBABILITY = 0.3f; + +// Used by ProximityInfoStateUtils::updateSampledSearchKeySets() +// TODO: Investigate if this is required +const float ProximityInfoParams::SEARCH_KEY_RADIUS_RATIO = 0.95f; + +// Used by ProximityInfoStateUtils::calculateBeelineSpeedRate() +const int ProximityInfoParams::LOOKUP_RADIUS_PERCENTILE = 50; +const int ProximityInfoParams::FIRST_POINT_TIME_OFFSET_MILLIS = 150; +const int ProximityInfoParams::STRONG_DOUBLE_LETTER_TIME_MILLIS = 600; + +// Used by ProximityInfoStateUtils::calculateNormalizedSquaredDistance() +const int ProximityInfoParams::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR = 1 << 10; + +} // namespace latinime
diff --git a/src/aosp/suggest/core/layout/proximity_info_params.h b/src/aosp/suggest/core/layout/proximity_info_params.h new file mode 100644 index 0000000..ae1f82c --- /dev/null +++ b/src/aosp/suggest/core/layout/proximity_info_params.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PROXIMITY_INFO_PARAMS_H
+#define LATINIME_PROXIMITY_INFO_PARAMS_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// Holder for the tuning constants of the proximity / gesture-sampling code. Every member is
+// a static constant that is only declared here; the values are supplied by out-of-line
+// "ProximityInfoParams::NAME = value;" definitions (presumably in proximity_info_params.cpp).
+// The constants are grouped by the function that consumes them.
+class ProximityInfoParams {
+ public:
+    static const float NOT_A_DISTANCE_FLOAT;
+    // Compared against the beeline speed percentile in
+    // ProximityInfoState::getDoubleLetterLevel().
+    static const int MIN_DOUBLE_LETTER_BEELINE_SPEED_PERCENTILE;
+    static const float VERTICAL_SWEET_SPOT_SCALE;
+    static const float VERTICAL_SWEET_SPOT_SCALE_G;
+
+    // Used by ProximityInfoStateUtils::initGeometricDistanceInfos()
+    static const float NEAR_KEY_NORMALIZED_SQUARED_THRESHOLD;
+
+    // Used by ProximityInfoStateUtils::updateNearKeysDistances()
+    static const float NEAR_KEY_THRESHOLD_FOR_DISTANCE;
+
+    // Used by ProximityInfoStateUtils::isPrevLocalMin()
+    static const float MARGIN_FOR_PREV_LOCAL_MIN;
+
+    // Used by ProximityInfoStateUtils::getPointScore()
+    static const int DISTANCE_BASE_SCALE;
+    static const float NEAR_KEY_THRESHOLD_FOR_POINT_SCORE;
+    static const int CORNER_CHECK_DISTANCE_THRESHOLD_SCALE;
+    static const float NOT_LOCALMIN_DISTANCE_SCORE;
+    static const float LOCALMIN_DISTANCE_AND_NEAR_TO_KEY_SCORE;
+    static const float CORNER_ANGLE_THRESHOLD_FOR_POINT_SCORE;
+    static const float CORNER_SUM_ANGLE_THRESHOLD;
+    static const float CORNER_SCORE;
+
+    // Used by ProximityInfoStateUtils::refreshSpeedRates()
+    static const int NUM_POINTS_FOR_SPEED_CALCULATION;
+
+    // Used by ProximityInfoStateUtils::pushTouchPoint()
+    static const int LAST_POINT_SKIP_DISTANCE_SCALE;
+
+    // Used by ProximityInfoStateUtils::updateAlignPointProbabilities()
+    static const float MIN_PROBABILITY;
+    static const float MAX_SKIP_PROBABILITY;
+    static const float SKIP_FIRST_POINT_PROBABILITY;
+    static const float SKIP_LAST_POINT_PROBABILITY;
+    static const float MIN_SPEED_RATE_FOR_SKIP_PROBABILITY;
+    static const float SPEED_WEIGHT_FOR_SKIP_PROBABILITY;
+    static const float SLOW_STRAIGHT_WEIGHT_FOR_SKIP_PROBABILITY;
+    static const float NEAREST_DISTANCE_WEIGHT;
+    static const float NEAREST_DISTANCE_BIAS;
+    static const float NEAREST_DISTANCE_WEIGHT_FOR_LAST;
+    static const float NEAREST_DISTANCE_BIAS_FOR_LAST;
+    static const float ANGLE_WEIGHT;
+    static const float DEEP_CORNER_ANGLE_THRESHOLD;
+    static const float SKIP_DEEP_CORNER_PROBABILITY;
+    static const float CORNER_ANGLE_THRESHOLD;
+    static const float STRAIGHT_ANGLE_THRESHOLD;
+    static const float SKIP_CORNER_PROBABILITY;
+    static const float SPEED_MARGIN;
+    static const float CENTER_VALUE_OF_NORMALIZED_DISTRIBUTION;
+    static const float SPEEDxANGLE_WEIGHT_FOR_STANDARD_DEVIATION;
+    static const float MAX_SPEEDxANGLE_RATE_FOR_STANDARD_DEVIATION;
+    static const float SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DEVIATION;
+    static const float MAX_SPEEDxNEAREST_RATE_FOR_STANDARD_DEVIATION;
+    static const float MIN_STANDARD_DEVIATION;
+    static const float PREV_DISTANCE_WEIGHT;
+    static const float NEXT_DISTANCE_WEIGHT;
+
+    // Used by ProximityInfoStateUtils::suppressCharProbabilities()
+    static const float SUPPRESSION_LENGTH_WEIGHT;
+    static const float MIN_SUPPRESSION_RATE;
+    static const float SUPPRESSION_WEIGHT;
+    static const float SUPPRESSION_WEIGHT_FOR_PROBABILITY_GAIN;
+    // NOTE: "PROBABALITY" is a misspelling of "PROBABILITY"; it is part of the public
+    // identifier, so renaming it requires updating every user at the same time.
+    static const float SKIP_PROBABALITY_WEIGHT_FOR_PROBABILITY_GAIN;
+
+    // Used by ProximityInfoStateUtils::getMostProbableString()
+    static const float DEMOTION_LOG_PROBABILITY;
+
+    // Used by ProximityInfoStateUtils::updateSampledSearchKeySets()
+    static const float SEARCH_KEY_RADIUS_RATIO;
+
+    // Used by ProximityInfoStateUtils::calculateBeelineSpeedRate()
+    static const int LOOKUP_RADIUS_PERCENTILE;
+    static const int FIRST_POINT_TIME_OFFSET_MILLIS;
+    static const int STRONG_DOUBLE_LETTER_TIME_MILLIS;
+
+    // Used by ProximityInfoStateUtils::calculateNormalizedSquaredDistance()
+    static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR;
+
+ private:
+    // NOTE(review): presumably suppresses the implicitly generated constructors so this
+    // constants-only class cannot be instantiated -- confirm against the macro in defines.h.
+    DISALLOW_IMPLICIT_CONSTRUCTORS(ProximityInfoParams);
+};
+} // namespace latinime
+#endif // LATINIME_PROXIMITY_INFO_PARAMS_H
diff --git a/src/aosp/suggest/core/layout/proximity_info_state.cpp b/src/aosp/suggest/core/layout/proximity_info_state.cpp new file mode 100644 index 0000000..fbabd92 --- /dev/null +++ b/src/aosp/suggest/core/layout/proximity_info_state.cpp
@@ -0,0 +1,292 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "LatinIME: proximity_info_state.cpp"
+
+#include "suggest/core/layout/proximity_info_state.h"
+
+#include <cstring> // for memset() and memcpy()
+#include <sstream> // for debug prints
+#include <vector>
+
+#include "defines.h"
+#include "suggest/core/layout/geometry_utils.h"
+#include "suggest/core/layout/proximity_info.h"
+#include "suggest/core/layout/proximity_info_state_utils.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+// (Re)initializes this state from one batch of input.
+//
+// Steps, in order:
+//  1. Decide whether the previous sampled points can be reused. Reuse is refused when the
+//     input mode (geometric or not) differs from the previous call; otherwise the decision is
+//     delegated to ProximityInfoStateUtils::checkAndReturnIsContinuousSuggestionPossible().
+//  2. Cache layout metrics from proximityInfo and reset mInputProximities. For non-geometric
+//     input of pointer 0 the proximities are filled from inputCodes and the coordinates.
+//  3. Either drop the last two sampled points (reuse case) or clear the sampled data.
+//  4. Sample the touch points when both coordinate arrays are non-null, then refresh the
+//     derived data: speed rates and beeline speeds (geometric only), distance caches, and,
+//     for geometric input, char probabilities, search key sets and the most probable string.
+//  5. Record whether touch position correction applies, build the primary input word
+//     (non-geometric input of pointer 0) and remember the input mode for the next call.
+//
+// pointerId: only input points whose pointer id equals this value are sampled.
+// maxPointToKeyLength: stored and later used as the cap returned by getPointToKeyLength().
+// proximityInfo: keyboard layout; dereferenced unconditionally, so it must not be null.
+// inputCodes: typed code points, forwarded to ProximityInfo::initializeProximities().
+// inputSize: number of input points; must be < MAX_WORD_LENGTH for non-geometric input.
+// xCoordinates, yCoordinates: point coordinates; if either is null no points are sampled.
+// times, pointerIds: per-point arrays forwarded to the sampling helpers.
+// isGeometric: true for gesture input, false for typing input.
+//
+// TODO: Remove the dependency of "isGeometric"
+void ProximityInfoState::initInputParams(const int pointerId, const float maxPointToKeyLength,
+        const ProximityInfo *proximityInfo, const int *const inputCodes, const int inputSize,
+        const int *const xCoordinates, const int *const yCoordinates, const int *const times,
+        const int *const pointerIds, const bool isGeometric) {
+    ASSERT(isGeometric || (inputSize < MAX_WORD_LENGTH));
+    mIsContinuousSuggestionPossible = (mHasBeenUpdatedByGeometricInput != isGeometric) ?
+            false : ProximityInfoStateUtils::checkAndReturnIsContinuousSuggestionPossible(
+                    inputSize, xCoordinates, yCoordinates, times, mSampledInputSize,
+                    &mSampledInputXs, &mSampledInputYs, &mSampledTimes, &mSampledInputIndice);
+    if (DEBUG_DICT) {
+        AKLOGI("isContinuousSuggestionPossible = %s",
+                (mIsContinuousSuggestionPossible ? "true" : "false"));
+    }
+
+    mProximityInfo = proximityInfo;
+    mHasTouchPositionCorrectionData = proximityInfo->hasTouchPositionCorrectionData();
+    mMostCommonKeyWidthSquare = proximityInfo->getMostCommonKeyWidthSquare();
+    mKeyCount = proximityInfo->getKeyCount();
+    mCellHeight = proximityInfo->getCellHeight();
+    mCellWidth = proximityInfo->getCellWidth();
+    // Each grid dimension is read from the getter of the same name. The two getters used to be
+    // swapped, which stored the grid width in mGridHeight and the grid height in mGridWidth.
+    mGridHeight = proximityInfo->getGridHeight();
+    mGridWidth = proximityInfo->getGridWidth();
+
+    memset(mInputProximities, 0, sizeof(mInputProximities));
+
+    if (!isGeometric && pointerId == 0) {
+        mProximityInfo->initializeProximities(inputCodes, xCoordinates, yCoordinates,
+                inputSize, mInputProximities);
+    }
+
+    ///////////////////////
+    // Setup touch points
+    int pushTouchPointStartIndex = 0;
+    int lastSavedInputSize = 0;
+    mMaxPointToKeyLength = maxPointToKeyLength;
+    mSampledInputSize = 0;
+    mMostProbableStringProbability = 0.0f;
+
+    if (mIsContinuousSuggestionPossible && mSampledInputIndice.size() > 1) {
+        // Just update difference.
+        // Previous two points are never skipped. Thus, we pop 2 input point data here.
+        pushTouchPointStartIndex = ProximityInfoStateUtils::trimLastTwoTouchPoints(
+                &mSampledInputXs, &mSampledInputYs, &mSampledTimes, &mSampledLengthCache,
+                &mSampledInputIndice);
+        lastSavedInputSize = mSampledInputXs.size();
+    } else {
+        // Clear all data.
+        mSampledInputXs.clear();
+        mSampledInputYs.clear();
+        mSampledTimes.clear();
+        mSampledInputIndice.clear();
+        mSampledLengthCache.clear();
+        mSampledNormalizedSquaredLengthCache.clear();
+        mSampledNearKeySets.clear();
+        mSampledSearchKeySets.clear();
+        mSpeedRates.clear();
+        mBeelineSpeedPercentiles.clear();
+        mCharProbabilities.clear();
+        mDirections.clear();
+    }
+
+    if (DEBUG_GEO_FULL) {
+        AKLOGI("Init ProximityInfoState: reused points = %d, last input size = %d",
+                pushTouchPointStartIndex, lastSavedInputSize);
+    }
+
+    // Without coordinates nothing is sampled and mSampledInputSize stays 0.
+    if (xCoordinates && yCoordinates) {
+        mSampledInputSize = ProximityInfoStateUtils::updateTouchPoints(mProximityInfo,
+                mMaxPointToKeyLength, mInputProximities, xCoordinates, yCoordinates, times,
+                pointerIds, inputSize, isGeometric, pointerId,
+                pushTouchPointStartIndex, &mSampledInputXs, &mSampledInputYs, &mSampledTimes,
+                &mSampledLengthCache, &mSampledInputIndice);
+    }
+
+    if (mSampledInputSize > 0 && isGeometric) {
+        mAverageSpeed = ProximityInfoStateUtils::refreshSpeedRates(inputSize, xCoordinates,
+                yCoordinates, times, lastSavedInputSize, mSampledInputSize, &mSampledInputXs,
+                &mSampledInputYs, &mSampledTimes, &mSampledLengthCache, &mSampledInputIndice,
+                &mSpeedRates, &mDirections);
+        ProximityInfoStateUtils::refreshBeelineSpeedRates(mProximityInfo->getMostCommonKeyWidth(),
+                mAverageSpeed, inputSize, xCoordinates, yCoordinates, times, mSampledInputSize,
+                &mSampledInputXs, &mSampledInputYs, &mSampledInputIndice,
+                &mBeelineSpeedPercentiles);
+    }
+
+    if (mSampledInputSize > 0) {
+        ProximityInfoStateUtils::initGeometricDistanceInfos(mProximityInfo, mSampledInputSize,
+                lastSavedInputSize, isGeometric, &mSampledInputXs, &mSampledInputYs,
+                &mSampledNearKeySets, &mSampledNormalizedSquaredLengthCache);
+        if (isGeometric) {
+            // updates probabilities of skipping or mapping each key for all points.
+            ProximityInfoStateUtils::updateAlignPointProbabilities(
+                    mMaxPointToKeyLength, mProximityInfo->getMostCommonKeyWidth(),
+                    mProximityInfo->getKeyCount(), lastSavedInputSize, mSampledInputSize,
+                    &mSampledInputXs, &mSampledInputYs, &mSpeedRates, &mSampledLengthCache,
+                    &mSampledNormalizedSquaredLengthCache, &mSampledNearKeySets,
+                    &mCharProbabilities);
+            ProximityInfoStateUtils::updateSampledSearchKeySets(mProximityInfo,
+                    mSampledInputSize, lastSavedInputSize, &mSampledLengthCache,
+                    &mSampledNearKeySets, &mSampledSearchKeySets,
+                    &mSampledSearchKeyVectors);
+            mMostProbableStringProbability = ProximityInfoStateUtils::getMostProbableString(
+                    mProximityInfo, mSampledInputSize, &mCharProbabilities, mMostProbableString);
+        }
+    }
+
+    if (DEBUG_SAMPLING_POINTS) {
+        ProximityInfoStateUtils::dump(isGeometric, inputSize, xCoordinates, yCoordinates,
+                mSampledInputSize, &mSampledInputXs, &mSampledInputYs, &mSampledTimes, &mSpeedRates,
+                &mBeelineSpeedPercentiles);
+    }
+    // end
+    ///////////////////////
+
+    mTouchPositionCorrectionEnabled = mSampledInputSize > 0 && mHasTouchPositionCorrectionData
+            && xCoordinates && yCoordinates;
+    if (!isGeometric && pointerId == 0) {
+        ProximityInfoStateUtils::initPrimaryInputWord(
+                inputSize, mInputProximities, mPrimaryInputWord);
+    }
+    if (DEBUG_GEO_FULL) {
+        AKLOGI("ProximityState init finished: %d points out of %d", mSampledInputSize, inputSize);
+    }
+    // Remembered so the next call can tell whether the input mode changed (see step 1).
+    mHasBeenUpdatedByGeometricInput = isGeometric;
+}
+
+// This function basically converts from a length to an edit distance. Accordingly, it's obviously
+// wrong to compare with mMaxPointToKeyLength.
+float ProximityInfoState::getPointToKeyLength(
+        const int inputIndex, const int codePoint) const {
+    const int keyIndex = mProximityInfo->getKeyIndexOf(codePoint);
+    if (keyIndex == NOT_AN_INDEX) {
+        // The code point is not a key on this keyboard: an intentional omission costs
+        // nothing, anything else gets the max length.
+        return CharUtils::isIntentionalOmissionCodePoint(codePoint)
+                ? 0.0f : static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+    }
+    // The cache holds one row of getKeyCount() entries per sampled input point.
+    const int cacheIndex = inputIndex * mProximityInfo->getKeyCount() + keyIndex;
+    // The cached value is capped at mMaxPointToKeyLength.
+    return min(mSampledNormalizedSquaredLengthCache[cacheIndex], mMaxPointToKeyLength);
+}
+
+// Same lookup as above for a caller that already has the key id; the work is done by
+// ProximityInfoStateUtils::getPointToKeyByIdLength().
+float ProximityInfoState::getPointToKeyByIdLength(
+        const int inputIndex, const int keyId) const {
+    const int keyCount = mProximityInfo->getKeyCount();
+    return ProximityInfoStateUtils::getPointToKeyByIdLength(mMaxPointToKeyLength,
+            &mSampledNormalizedSquaredLengthCache, keyCount, inputIndex, keyId);
+}
+
+// In the following function, c is the current character of the dictionary word currently examined.
+// currentChars is an array containing the keys close to the character the user actually typed at
+// the same position. We want to see if c is in it: if so, then the word contains at that position
+// a character close to what the user typed.
+// What the user typed is actually the first character of the array.
+// proximityIndex is a pointer to the variable where getProximityType returns the index of c
+// in the proximity chars of the input index.
+// Notice : accented characters do not have a proximity list, so they are alone in their list. The
+// non-accented version of the character should be considered "close", but not the other keys close
+// to the non-accented version.
+ProximityType ProximityInfoState::getProximityType(const int index, const int codePoint,
+        const bool checkProximityChars, int *proximityIndex) const {
+    const int *const proximityCodePoints = getProximityCodePointsAt(index);
+    const int typedCodePoint = proximityCodePoints[0];
+    const int normalizedCodePoint = CharUtils::toBaseLowerCase(codePoint);
+
+    // Slot 0 is what the user typed. A direct hit, either as given or after base/lower-case
+    // normalization of codePoint, is an exact match.
+    if (typedCodePoint == normalizedCodePoint || typedCodePoint == codePoint) {
+        return MATCH_CHAR;
+    }
+
+    if (!checkProximityChars) {
+        return SUBSTITUTION_CHAR;
+    }
+
+    // The typed character only differs from codePoint by accent and/or case: treat it as a
+    // close character. proximityIndex is left untouched in this case.
+    if (CharUtils::toBaseLowerCase(typedCodePoint) == normalizedCodePoint) {
+        return PROXIMITY_CHAR;
+    }
+
+    // First section of the row: regular proximity characters. The section ends at the first
+    // entry that is not greater than the delimiter code, or at the end of the row.
+    int pos = 1;
+    for (; pos < MAX_PROXIMITY_CHARS_SIZE
+            && proximityCodePoints[pos] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE; ++pos) {
+        const int candidate = proximityCodePoints[pos];
+        if (candidate != normalizedCodePoint && candidate != codePoint) {
+            continue;
+        }
+        if (proximityIndex) {
+            *proximityIndex = pos;
+        }
+        return PROXIMITY_CHAR;
+    }
+
+    // A second section exists only if the first one stopped exactly on the delimiter.
+    const bool stoppedOnDelimiter = pos < MAX_PROXIMITY_CHARS_SIZE
+            && proximityCodePoints[pos] == ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE;
+    if (!stoppedOnDelimiter) {
+        return SUBSTITUTION_CHAR;
+    }
+
+    // Second section: additional proximity characters, scanned with the same rules.
+    for (++pos; pos < MAX_PROXIMITY_CHARS_SIZE
+            && proximityCodePoints[pos] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE; ++pos) {
+        const int candidate = proximityCodePoints[pos];
+        if (candidate != normalizedCodePoint && candidate != codePoint) {
+            continue;
+        }
+        if (proximityIndex) {
+            *proximityIndex = pos;
+        }
+        return ADDITIONAL_PROXIMITY_CHAR;
+    }
+
+    // Found in neither section: the character is a substitution.
+    return SUBSTITUTION_CHAR;
+}
+
+// Gesture variant: MATCH_CHAR when the lower-cased code point, or its base form, is one of the
+// search keys stored for the sampled point `index`; UNRELATED_CHAR otherwise, including when
+// no input has been sampled.
+ProximityType ProximityInfoState::getProximityTypeG(const int index, const int codePoint) const {
+    if (!isUsed()) {
+        return UNRELATED_CHAR;
+    }
+    const int lowered = CharUtils::toLowerCase(codePoint);
+    const int loweredBase = CharUtils::toBaseCodePoint(lowered);
+    const std::vector<int> &searchKeys = mSampledSearchKeyVectors[index];
+    for (std::vector<int>::const_iterator key = searchKeys.begin();
+            key != searchKeys.end(); ++key) {
+        if (*key == lowered || *key == loweredBase) {
+            return MATCH_CHAR;
+        }
+    }
+    return UNRELATED_CHAR;
+}
+
+// Tests whether `keyId` is set in the search key set stored for the sampled point `index`.
+bool ProximityInfoState::isKeyInSerchKeysAfterIndex(const int index, const int keyId) const {
+    ASSERT(keyId >= 0 && index >= 0 && index < mSampledInputSize);
+    const ProximityInfoStateUtils::NearKeycodesSet &searchKeySet = mSampledSearchKeySets[index];
+    return searchKeySet.test(keyId);
+}
+
+// Direction between the sampled points `index0` and `index1`, computed by
+// ProximityInfoStateUtils::getDirection() from the sampled coordinates.
+float ProximityInfoState::getDirection(const int index0, const int index1) const {
+    const std::vector<int> *const sampledXs = &mSampledInputXs;
+    const std::vector<int> *const sampledYs = &mSampledInputYs;
+    return ProximityInfoStateUtils::getDirection(sampledXs, sampledYs, index0, index1);
+}
+
+// Copies the whole mMostProbableString array into codePointBuf, which therefore must have room
+// for sizeof(mMostProbableString) bytes, and returns the probability stored with it.
+float ProximityInfoState::getMostProbableString(int *const codePointBuf) const {
+    const size_t byteCount = sizeof(mMostProbableString);
+    memcpy(codePointBuf, mMostProbableString, byteCount);
+    return mMostProbableStringProbability;
+}
+
+// Asks the keyboard layout whether the sampled point `index` has space proximity.
+bool ProximityInfoState::hasSpaceProximity(const int index) const {
+    ASSERT(0 <= index && index < mSampledInputSize);
+    const int x = getInputX(index);
+    const int y = getInputY(index);
+    return mProximityInfo->hasSpaceProximity(x, y);
+}
+
+// Returns a probability of mapping index to keyIndex, or MAX_VALUE_FOR_WEIGHTING when no
+// entry is stored for keyIndex at that point.
+float ProximityInfoState::getProbability(const int index, const int keyIndex) const {
+    ASSERT(0 <= index && index < mSampledInputSize);
+    const hash_map_compat<int, float> &probabilities = mCharProbabilities[index];
+    const hash_map_compat<int, float>::const_iterator found = probabilities.find(keyIndex);
+    if (found == probabilities.end()) {
+        return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+    }
+    return found->second;
+}
+} // namespace latinime
diff --git a/src/aosp/suggest/core/layout/proximity_info_state.h b/src/aosp/suggest/core/layout/proximity_info_state.h new file mode 100644 index 0000000..c94060f --- /dev/null +++ b/src/aosp/suggest/core/layout/proximity_info_state.h
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PROXIMITY_INFO_STATE_H
+#define LATINIME_PROXIMITY_INFO_STATE_H
+
+#include <cstring> // for memset()
+#include <vector>
+
+#include "defines.h"
+#include "suggest/core/layout/proximity_info_params.h"
+#include "suggest/core/layout/proximity_info_state_utils.h"
+#include "utils/hash_map_compat.h"
+
+namespace latinime {
+
+class ProximityInfo;
+
+// Per-input state derived from a keyboard layout (ProximityInfo) and one batch of input: the
+// sampled touch points plus the per-point caches (lengths, speeds, directions, near/search key
+// sets, char probabilities) that are filled by initInputParams(). The accessors that take an
+// index do not range-check it unless stated otherwise.
+class ProximityInfoState {
+ public:
+    /////////////////////////////////////////
+    // Defined in proximity_info_state.cpp //
+    /////////////////////////////////////////
+    // (Re)initializes every field of this state from the given layout and input.
+    void initInputParams(const int pointerId, const float maxPointToKeyLength,
+            const ProximityInfo *proximityInfo, const int *const inputCodes,
+            const int inputSize, const int *xCoordinates, const int *yCoordinates,
+            const int *const times, const int *const pointerIds, const bool isGeometric);
+
+    /////////////////////////////////////////
+    // Defined here                        //
+    /////////////////////////////////////////
+    // Creates an empty state: scalars zeroed, vectors empty, fixed-size arrays zero-filled.
+    AK_FORCE_INLINE ProximityInfoState()
+            : mProximityInfo(0), mMaxPointToKeyLength(0.0f), mAverageSpeed(0.0f),
+              mHasTouchPositionCorrectionData(false), mMostCommonKeyWidthSquare(0),
+              mKeyCount(0), mCellHeight(0), mCellWidth(0), mGridHeight(0), mGridWidth(0),
+              mIsContinuousSuggestionPossible(false), mHasBeenUpdatedByGeometricInput(false),
+              mSampledInputXs(), mSampledInputYs(), mSampledTimes(), mSampledInputIndice(),
+              mSampledLengthCache(), mBeelineSpeedPercentiles(),
+              mSampledNormalizedSquaredLengthCache(), mSpeedRates(), mDirections(),
+              mCharProbabilities(), mSampledNearKeySets(), mSampledSearchKeySets(),
+              mSampledSearchKeyVectors(), mTouchPositionCorrectionEnabled(false),
+              mSampledInputSize(0), mMostProbableStringProbability(0.0f) {
+        memset(mInputProximities, 0, sizeof(mInputProximities));
+        memset(mPrimaryInputWord, 0, sizeof(mPrimaryInputWord));
+        memset(mMostProbableString, 0, sizeof(mMostProbableString));
+    }
+
+    // Non virtual inline destructor -- never inherit this class
+    AK_FORCE_INLINE ~ProximityInfoState() {}
+
+    // First entry of the proximity row for `index`, i.e. the code point the user typed.
+    inline int getPrimaryCodePointAt(const int index) const {
+        return getProximityCodePointsAt(index)[0];
+    }
+
+    // True when `word` has exactly as many code points as there are sampled inputs and each
+    // one equals the primary (first) code point of the corresponding proximity row.
+    inline bool sameAsTyped(const int *word, int length) const {
+        if (length != mSampledInputSize) {
+            return false;
+        }
+        const int *inputProximities = mInputProximities;
+        while (length--) {
+            if (*inputProximities != *word) {
+                return false;
+            }
+            // Rows are MAX_PROXIMITY_CHARS_SIZE wide; step to the next row's first entry.
+            inputProximities += MAX_PROXIMITY_CHARS_SIZE;
+            word++;
+        }
+        return true;
+    }
+
+    // True when `c` occurs in the proximity row for `index`. The scan stops at the first
+    // non-positive entry or at the end of the row, whichever comes first.
+    AK_FORCE_INLINE bool existsCodePointInProximityAt(const int index, const int c) const {
+        const int *codePoints = getProximityCodePointsAt(index);
+        int i = 0;
+        // The bound must be tested before the element is read: with the operands the other
+        // way round, a row whose slots are all positive made the loop read
+        // codePoints[MAX_PROXIMITY_CHARS_SIZE], one element past the row (and past
+        // mInputProximities itself for the last row).
+        while (i < MAX_PROXIMITY_CHARS_SIZE && codePoints[i] > 0) {
+            if (codePoints[i++] == c) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // True when the primary code point at `index` appears in the proximity row of the
+    // previous or the next sampled input. Out-of-range indices yield false.
+    AK_FORCE_INLINE bool existsAdjacentProximityChars(const int index) const {
+        if (index < 0 || index >= mSampledInputSize) return false;
+        const int currentCodePoint = getPrimaryCodePointAt(index);
+        const int leftIndex = index - 1;
+        if (leftIndex >= 0 && existsCodePointInProximityAt(leftIndex, currentCodePoint)) {
+            return true;
+        }
+        const int rightIndex = index + 1;
+        if (rightIndex < mSampledInputSize
+                && existsCodePointInProximityAt(rightIndex, currentCodePoint)) {
+            return true;
+        }
+        return false;
+    }
+
+    inline const int *getPrimaryInputWord() const {
+        return mPrimaryInputWord;
+    }
+
+    inline bool touchPositionCorrectionEnabled() const {
+        return mTouchPositionCorrectionEnabled;
+    }
+
+    // True once at least one input point has been sampled.
+    bool isUsed() const {
+        return mSampledInputSize > 0;
+    }
+
+    // Number of sampled input points.
+    int size() const {
+        return mSampledInputSize;
+    }
+
+    int getInputX(const int index) const {
+        return mSampledInputXs[index];
+    }
+
+    int getInputY(const int index) const {
+        return mSampledInputYs[index];
+    }
+
+    int getInputIndexOfSampledPoint(const int sampledIndex) const {
+        return mSampledInputIndice[sampledIndex];
+    }
+
+    bool hasSpaceProximity(const int index) const;
+
+    int getLengthCache(const int index) const {
+        return mSampledLengthCache[index];
+    }
+
+    bool isContinuousSuggestionPossible() const {
+        return mIsContinuousSuggestionPossible;
+    }
+
+    // TODO: Rename s/Length/NormalizedSquaredLength/
+    float getPointToKeyByIdLength(const int inputIndex, const int keyId) const;
+    // TODO: Rename s/Length/NormalizedSquaredLength/
+    float getPointToKeyLength(const int inputIndex, const int codePoint) const;
+
+    // Classifies `codePoint` against the proximity row of input `index`. When the match is
+    // found while scanning the row, its position is written to *proximityIndex (if non-null).
+    ProximityType getProximityType(const int index, const int codePoint,
+            const bool checkProximityChars, int *proximityIndex = 0) const;
+
+    ProximityType getProximityTypeG(const int index, const int codePoint) const;
+
+    const std::vector<int> *getSearchKeyVector(const int index) const {
+        return &mSampledSearchKeyVectors[index];
+    }
+
+    float getSpeedRate(const int index) const {
+        return mSpeedRates[index];
+    }
+
+    AK_FORCE_INLINE int getBeelineSpeedPercentile(const int id) const {
+        return mBeelineSpeedPercentiles[id];
+    }
+
+    // Maps the beeline speed percentile of point `id` to a double-letter level: 0 is a strong
+    // double letter, anything below MIN_DOUBLE_LETTER_BEELINE_SPEED_PERCENTILE is a double
+    // letter, everything else is not.
+    AK_FORCE_INLINE DoubleLetterLevel getDoubleLetterLevel(const int id) const {
+        const int beelineSpeedRate = getBeelineSpeedPercentile(id);
+        if (beelineSpeedRate == 0) {
+            return A_STRONG_DOUBLE_LETTER;
+        } else if (beelineSpeedRate
+                < ProximityInfoParams::MIN_DOUBLE_LETTER_BEELINE_SPEED_PERCENTILE) {
+            return A_DOUBLE_LETTER;
+        } else {
+            return NOT_A_DOUBLE_LETTER;
+        }
+    }
+
+    // Cached direction stored for the sampled point `index`.
+    float getDirection(const int index) const {
+        return mDirections[index];
+    }
+    // Direction between two sampled points. Despite their names, `x` and `y` are indices of
+    // sampled points (the definition names them index0 and index1), not coordinates.
+    float getDirection(const int x, const int y) const;
+
+    // Copies the most probable string into codePointBuf and returns its probability. The
+    // definition copies sizeof(mMostProbableString) bytes, so the buffer must hold
+    // MAX_WORD_LENGTH ints.
+    float getMostProbableString(int *const codePointBuf) const;
+
+    // Probability of mapping the sampled point `index` to a key; the second argument is a
+    // key index (the definition names it keyIndex).
+    float getProbability(const int index, const int charCode) const;
+
+    // NOTE: "Serch" is a misspelling of "Search" that is part of the public name.
+    bool isKeyInSerchKeysAfterIndex(const int index, const int keyId) const;
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(ProximityInfoState);
+
+    // Start of the proximity row for input `index` inside mInputProximities.
+    inline const int *getProximityCodePointsAt(const int index) const {
+        return ProximityInfoStateUtils::getProximityCodePointsAt(mInputProximities, index);
+    }
+
+    // Keyboard layout given to initInputParams(). The pointee is const and is not deleted by
+    // this class (the destructor is empty).
+    const ProximityInfo *mProximityInfo;
+    float mMaxPointToKeyLength;
+    float mAverageSpeed;
+    bool mHasTouchPositionCorrectionData;
+    int mMostCommonKeyWidthSquare;
+    int mKeyCount;
+    int mCellHeight;
+    int mCellWidth;
+    int mGridHeight;
+    int mGridWidth;
+    bool mIsContinuousSuggestionPossible;
+    bool mHasBeenUpdatedByGeometricInput;
+
+    std::vector<int> mSampledInputXs;
+    std::vector<int> mSampledInputYs;
+    std::vector<int> mSampledTimes;
+    std::vector<int> mSampledInputIndice;
+    std::vector<int> mSampledLengthCache;
+    std::vector<int> mBeelineSpeedPercentiles;
+    std::vector<float> mSampledNormalizedSquaredLengthCache;
+    std::vector<float> mSpeedRates;
+    std::vector<float> mDirections;
+    // probabilities of skipping or mapping to a key for each point.
+    std::vector<hash_map_compat<int, float> > mCharProbabilities;
+    // The vector for the key code set which holds nearby keys for each sampled input point
+    // 1. Used to calculate the probability of the key
+    // 2. Used to calculate mSampledSearchKeySets
+    std::vector<ProximityInfoStateUtils::NearKeycodesSet> mSampledNearKeySets;
+    // The vector for the key code set which holds nearby keys of some trailing sampled input points
+    // for each sampled input point. These nearby keys contain the next characters which can be in
+    // the dictionary. Specifically, currently we are looking for keys nearby trailing sampled
+    // inputs including the current input point.
+    std::vector<ProximityInfoStateUtils::NearKeycodesSet> mSampledSearchKeySets;
+    std::vector<std::vector<int> > mSampledSearchKeyVectors;
+    bool mTouchPositionCorrectionEnabled;
+    // MAX_WORD_LENGTH rows of MAX_PROXIMITY_CHARS_SIZE code points, one row per input.
+    int mInputProximities[MAX_PROXIMITY_CHARS_SIZE * MAX_WORD_LENGTH];
+    int mSampledInputSize;
+    int mPrimaryInputWord[MAX_WORD_LENGTH];
+    float mMostProbableStringProbability;
+    int mMostProbableString[MAX_WORD_LENGTH];
+};
+} // namespace latinime
+#endif // LATINIME_PROXIMITY_INFO_STATE_H
diff --git a/src/aosp/suggest/core/layout/proximity_info_state_utils.cpp b/src/aosp/suggest/core/layout/proximity_info_state_utils.cpp new file mode 100644 index 0000000..e1b3534 --- /dev/null +++ b/src/aosp/suggest/core/layout/proximity_info_state_utils.cpp
@@ -0,0 +1,1045 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/core/layout/proximity_info_state_utils.h"
+
+#include <cmath>
+#include <cstring> // for memset()
+#include <sstream> // for debug prints
+#include <vector>
+
+#include "defines.h"
+#include "suggest/core/layout/geometry_utils.h"
+#include "suggest/core/layout/proximity_info.h"
+#include "suggest/core/layout/proximity_info_params.h"
+
+namespace latinime {
+
+// Drops the two most recently sampled points from all five parallel vectors and returns the
+// raw input index that was recorded for the second-to-last sample.
+// Precondition: every vector holds at least two entries.
+/* static */ int ProximityInfoStateUtils::trimLastTwoTouchPoints(std::vector<int> *sampledInputXs,
+        std::vector<int> *sampledInputYs, std::vector<int> *sampledInputTimes,
+        std::vector<int> *sampledLengthCache, std::vector<int> *sampledInputIndice) {
+    // Indexing size() - 2 and popping twice is undefined with fewer than two samples.
+    ASSERT(sampledInputIndice->size() >= 2);
+    const int nextStartIndex = (*sampledInputIndice)[sampledInputIndice->size() - 2];
+    popInputData(sampledInputXs, sampledInputYs, sampledInputTimes, sampledLengthCache,
+            sampledInputIndice);
+    popInputData(sampledInputXs, sampledInputYs, sampledInputTimes, sampledLengthCache,
+            sampledInputIndice);
+    return nextStartIndex;
+}
+
+// Feeds the raw input points in [pushTouchPointStartIndex, inputSize) that belong to pointerId
+// into pushTouchPoint(), appending to the sampled* vectors. Sampling is requested only for
+// geometric input. A null pointerIds is treated as "every point has pointer id 0" and a null
+// times yields a time of -1 for every point.
+// Returns the number of sampled points held in sampledInputXs afterwards.
+// NOTE(review): maxPointToKeyLength is an int here but a float in updateNearKeysDistances();
+// confirm whether the truncation is intended.
+/* static */ int ProximityInfoStateUtils::updateTouchPoints(
+        const ProximityInfo *const proximityInfo, const int maxPointToKeyLength,
+        const int *const inputProximities, const int *const inputXCoordinates,
+        const int *const inputYCoordinates, const int *const times, const int *const pointerIds,
+        const int inputSize, const bool isGeometric, const int pointerId,
+        const int pushTouchPointStartIndex, std::vector<int> *sampledInputXs,
+        std::vector<int> *sampledInputYs, std::vector<int> *sampledInputTimes,
+        std::vector<int> *sampledLengthCache, std::vector<int> *sampledInputIndice) {
+    if (DEBUG_SAMPLING_POINTS) {
+        if (times) {
+            for (int i = 0; i < inputSize; ++i) {
+                AKLOGI("(%d) x %d, y %d, time %d",
+                        i, inputXCoordinates[i], inputYCoordinates[i], times[i]);
+            }
+        }
+    }
+#ifdef DO_ASSERT_TEST
+    if (times) {
+        for (int i = 1; i < inputSize; ++i) {
+            if (times[i] < times[i - 1]) {
+                AKLOGI("Invalid time sequence. %d, %d", times[i - 1], times[i]);
+                ASSERT(false);
+            }
+        }
+    }
+#endif
+    // Nothing to push. Without this guard the code below would read element 0 and element
+    // pushTouchPointStartIndex of the input arrays even though they lie outside the input.
+    if (inputSize <= 0 || pushTouchPointStartIndex >= inputSize) {
+        return static_cast<int>(sampledInputXs->size());
+    }
+    const bool proximityOnly = !isGeometric
+            && (inputXCoordinates[0] < 0 || inputYCoordinates[0] < 0);
+    // Find the last raw index that belongs to the requested pointer.
+    int lastInputIndex = pushTouchPointStartIndex;
+    for (int i = lastInputIndex; i < inputSize; ++i) {
+        const int pid = pointerIds ? pointerIds[i] : 0;
+        if (pointerId == pid) {
+            lastInputIndex = i;
+        }
+    }
+    if (DEBUG_GEO_FULL) {
+        AKLOGI("Init ProximityInfoState: last input index = %d", lastInputIndex);
+    }
+    // Working space to save near keys distances for current, prev and prevprev input point.
+    NearKeysDistanceMap nearKeysDistances[3];
+    // These pointers are swapped for each inputs points.
+    NearKeysDistanceMap *currentNearKeysDistances = &nearKeysDistances[0];
+    NearKeysDistanceMap *prevNearKeysDistances = &nearKeysDistances[1];
+    NearKeysDistanceMap *prevPrevNearKeysDistances = &nearKeysDistances[2];
+    // "sumAngle" is accumulated by each angle of input points. And when "sumAngle" exceeds
+    // the threshold we save that point, reset sumAngle. This aims to keep the figure of
+    // the curve.
+    float sumAngle = 0.0f;
+
+    for (int i = pushTouchPointStartIndex; i <= lastInputIndex; ++i) {
+        // Assuming pointerId == 0 if pointerIds is null.
+        const int pid = pointerIds ? pointerIds[i] : 0;
+        if (DEBUG_GEO_FULL) {
+            AKLOGI("Init ProximityInfoState: (%d)PID = %d", i, pid);
+        }
+        if (pointerId == pid) {
+            const int c = isGeometric ?
+                    NOT_A_COORDINATE : getPrimaryCodePointAt(inputProximities, i);
+            const int x = proximityOnly ? NOT_A_COORDINATE : inputXCoordinates[i];
+            const int y = proximityOnly ? NOT_A_COORDINATE : inputYCoordinates[i];
+            const int time = times ? times[i] : -1;
+
+            if (i > 1) {
+                const float prevAngle = GeometryUtils::getAngle(
+                        inputXCoordinates[i - 2], inputYCoordinates[i - 2],
+                        inputXCoordinates[i - 1], inputYCoordinates[i - 1]);
+                const float currentAngle = GeometryUtils::getAngle(
+                        inputXCoordinates[i - 1], inputYCoordinates[i - 1], x, y);
+                sumAngle += GeometryUtils::getAngleDiff(prevAngle, currentAngle);
+            }
+
+            if (pushTouchPoint(proximityInfo, maxPointToKeyLength, i, c, x, y, time,
+                    isGeometric, isGeometric /* doSampling */, i == lastInputIndex,
+                    sumAngle, currentNearKeysDistances, prevNearKeysDistances,
+                    prevPrevNearKeysDistances, sampledInputXs, sampledInputYs, sampledInputTimes,
+                    sampledLengthCache, sampledInputIndice)) {
+                // Previous point information was popped.
+                NearKeysDistanceMap *tmp = prevNearKeysDistances;
+                prevNearKeysDistances = currentNearKeysDistances;
+                currentNearKeysDistances = tmp;
+            } else {
+                // Rotate the three work buffers: current -> prev -> prevPrev -> current.
+                NearKeysDistanceMap *tmp = prevPrevNearKeysDistances;
+                prevPrevNearKeysDistances = prevNearKeysDistances;
+                prevNearKeysDistances = currentNearKeysDistances;
+                currentNearKeysDistances = tmp;
+                sumAngle = 0.0f;
+            }
+        }
+    }
+    return static_cast<int>(sampledInputXs->size());
+}
+
+// Returns a pointer to the first of the MAX_PROXIMITY_CHARS_SIZE code points stored for the
+// input at the given index.
+/* static */ const int *ProximityInfoStateUtils::getProximityCodePointsAt(
+        const int *const inputProximities, const int index) {
+    return inputProximities + (index * MAX_PROXIMITY_CHARS_SIZE);
+}
+
+// Returns the first code point stored for the input at the given index.
+/* static */ int ProximityInfoStateUtils::getPrimaryCodePointAt(const int *const inputProximities,
+        const int index) {
+    return getProximityCodePointsAt(inputProximities, index)[0];
+}
+
+// Zero-fills MAX_WORD_LENGTH entries of primaryInputWord and then stores the primary code
+// point of each input. At most MAX_WORD_LENGTH entries are written.
+/* static */ void ProximityInfoStateUtils::initPrimaryInputWord(const int inputSize,
+        const int *const inputProximities, int *primaryInputWord) {
+    memset(primaryInputWord, 0, sizeof(primaryInputWord[0]) * MAX_WORD_LENGTH);
+    // Never write past the MAX_WORD_LENGTH entries that were cleared above.
+    const int copySize = (inputSize < MAX_WORD_LENGTH) ? inputSize : MAX_WORD_LENGTH;
+    for (int i = 0; i < copySize; ++i) {
+        primaryInputWord[i] = getPrimaryCodePointAt(inputProximities, i);
+    }
+}
+
+// Returns the squared distance between the sampled point at inputIndex and the sweet spot
+// center of the key at keyIndex.
+/* static */ float ProximityInfoStateUtils::calculateSquaredDistanceFromSweetSpotCenter(
+        const ProximityInfo *const proximityInfo, const std::vector<int> *const sampledInputXs,
+        const std::vector<int> *const sampledInputYs, const int keyIndex, const int inputIndex) {
+    const float sweetSpotCenterX = proximityInfo->getSweetSpotCenterXAt(keyIndex);
+    const float sweetSpotCenterY = proximityInfo->getSweetSpotCenterYAt(keyIndex);
+    const float inputX = static_cast<float>((*sampledInputXs)[inputIndex]);
+    const float inputY = static_cast<float>((*sampledInputYs)[inputIndex]);
+    return GeometryUtils::SQUARE_FLOAT(inputX - sweetSpotCenterX)
+            + GeometryUtils::SQUARE_FLOAT(inputY - sweetSpotCenterY);
+}
+
+// Returns the squared distance from the sampled point to the key's sweet spot center divided
+// by the squared sweet spot radius. Returns NOT_A_DISTANCE_FLOAT when keyIndex is NOT_AN_INDEX,
+// when the key has no sweet spot data, or when the sampled x is NOT_A_COORDINATE.
+/* static */ float ProximityInfoStateUtils::calculateNormalizedSquaredDistance(
+        const ProximityInfo *const proximityInfo, const std::vector<int> *const sampledInputXs,
+        const std::vector<int> *const sampledInputYs, const int keyIndex, const int inputIndex) {
+    if (keyIndex == NOT_AN_INDEX) {
+        return ProximityInfoParams::NOT_A_DISTANCE_FLOAT;
+    }
+    if (!proximityInfo->hasSweetSpotData(keyIndex)) {
+        return ProximityInfoParams::NOT_A_DISTANCE_FLOAT;
+    }
+    if (NOT_A_COORDINATE == (*sampledInputXs)[inputIndex]) {
+        return ProximityInfoParams::NOT_A_DISTANCE_FLOAT;
+    }
+    const float squaredDistance = calculateSquaredDistanceFromSweetSpotCenter(proximityInfo,
+            sampledInputXs, sampledInputYs, keyIndex, inputIndex);
+    const float squaredRadius = GeometryUtils::SQUARE_FLOAT(
+            proximityInfo->getSweetSpotRadiiAt(keyIndex));
+    return squaredDistance / squaredRadius;
+}
+
+// For every sampled point from lastSavedInputSize onwards, stores the normalized squared
+// distance to each key at [pointIndex * keyCount + keyIndex] and flags the key in the point's
+// near-key set when that distance is below NEAR_KEY_NORMALIZED_SQUARED_THRESHOLD.
+/* static */ void ProximityInfoStateUtils::initGeometricDistanceInfos(
+        const ProximityInfo *const proximityInfo, const int sampledInputSize,
+        const int lastSavedInputSize, const bool isGeometric,
+        const std::vector<int> *const sampledInputXs,
+        const std::vector<int> *const sampledInputYs,
+        std::vector<NearKeycodesSet> *sampledNearKeySets,
+        std::vector<float> *sampledNormalizedSquaredLengthCache) {
+    sampledNearKeySets->resize(sampledInputSize);
+    const int keyCount = proximityInfo->getKeyCount();
+    sampledNormalizedSquaredLengthCache->resize(sampledInputSize * keyCount);
+    for (int i = lastSavedInputSize; i < sampledInputSize; ++i) {
+        (*sampledNearKeySets)[i].reset();
+        // The point coordinates do not depend on the key, so read them once per point.
+        const int x = (*sampledInputXs)[i];
+        const int y = (*sampledInputYs)[i];
+        for (int k = 0; k < keyCount; ++k) {
+            const int index = i * keyCount + k;
+            const float normalizedSquaredDistance =
+                    proximityInfo->getNormalizedSquaredDistanceFromCenterFloatG(
+                            k, x, y, isGeometric);
+            (*sampledNormalizedSquaredLengthCache)[index] = normalizedSquaredDistance;
+            if (normalizedSquaredDistance
+                    < ProximityInfoParams::NEAR_KEY_NORMALIZED_SQUARED_THRESHOLD) {
+                (*sampledNearKeySets)[i][k] = true;
+            }
+        }
+    }
+}
+
+// Removes the last element from each of the five parallel sampled-point vectors.
+/* static */ void ProximityInfoStateUtils::popInputData(std::vector<int> *sampledInputXs,
+        std::vector<int> *sampledInputYs, std::vector<int> *sampledInputTimes,
+        std::vector<int> *sampledLengthCache, std::vector<int> *sampledInputIndice) {
+    sampledInputXs->pop_back();
+    sampledInputYs->pop_back();
+    sampledInputTimes->pop_back();
+    sampledLengthCache->pop_back();
+    sampledInputIndice->pop_back();
+}
+
+// Fills sampledSpeedRates (speed around each sampled point relative to the average speed) for
+// the points from lastSavedInputSize onwards, and sampledDirections (angle from each sampled
+// point to the next one). Returns the average speed, or 0.0f when it cannot be computed
+// because there are no sampled points or the sampled points span no time.
+/* static */ float ProximityInfoStateUtils::refreshSpeedRates(const int inputSize,
+        const int *const xCoordinates, const int *const yCoordinates, const int *const times,
+        const int lastSavedInputSize, const int sampledInputSize,
+        const std::vector<int> *const sampledInputXs, const std::vector<int> *const sampledInputYs,
+        const std::vector<int> *const sampledInputTimes,
+        const std::vector<int> *const sampledLengthCache,
+        const std::vector<int> *const sampledInputIndice, std::vector<float> *sampledSpeedRates,
+        std::vector<float> *sampledDirections) {
+    if (sampledInputSize <= 0) {
+        // Without samples, back()/front() below would be undefined and
+        // resize(sampledInputSize - 1) would be asked for a negative size.
+        sampledSpeedRates->clear();
+        sampledDirections->clear();
+        return 0.0f;
+    }
+    // Relative speed calculation.
+    const int sumDuration = sampledInputTimes->back() - sampledInputTimes->front();
+    const int sumLength = sampledLengthCache->back() - sampledLengthCache->front();
+    // Avoid dividing by zero: a zero duration would otherwise produce inf or NaN.
+    const float averageSpeed = (sumDuration == 0) ? 0.0f
+            : static_cast<float>(sumLength) / static_cast<float>(sumDuration);
+    sampledSpeedRates->resize(sampledInputSize);
+    for (int i = lastSavedInputSize; i < sampledInputSize; ++i) {
+        const int index = (*sampledInputIndice)[i];
+        int length = 0;
+        int duration = 0;
+
+        // Calculate velocity by using distances and durations of
+        // ProximityInfoParams::NUM_POINTS_FOR_SPEED_CALCULATION points for both forward and
+        // backward.
+        const int forwardNumPoints = min(inputSize - 1,
+                index + ProximityInfoParams::NUM_POINTS_FOR_SPEED_CALCULATION);
+        for (int j = index; j < forwardNumPoints; ++j) {
+            // Do not look past the raw point of the next sample.
+            if (i < sampledInputSize - 1 && j >= (*sampledInputIndice)[i + 1]) {
+                break;
+            }
+            length += GeometryUtils::getDistanceInt(xCoordinates[j], yCoordinates[j],
+                    xCoordinates[j + 1], yCoordinates[j + 1]);
+            duration += times[j + 1] - times[j];
+        }
+        const int backwardNumPoints = max(0,
+                index - ProximityInfoParams::NUM_POINTS_FOR_SPEED_CALCULATION);
+        for (int j = index - 1; j >= backwardNumPoints; --j) {
+            // Do not look before the raw point of the previous sample.
+            if (i > 0 && j < (*sampledInputIndice)[i - 1]) {
+                break;
+            }
+            // TODO: use mSampledLengthCache instead?
+            length += GeometryUtils::getDistanceInt(xCoordinates[j], yCoordinates[j],
+                    xCoordinates[j + 1], yCoordinates[j + 1]);
+            duration += times[j + 1] - times[j];
+        }
+        if (duration == 0 || sumDuration == 0) {
+            // Cannot calculate speed; thus, it gives an average value (1.0);
+            (*sampledSpeedRates)[i] = 1.0f;
+        } else {
+            const float speed = static_cast<float>(length) / static_cast<float>(duration);
+            (*sampledSpeedRates)[i] = speed / averageSpeed;
+        }
+    }
+
+    // Direction calculation.
+    sampledDirections->resize(sampledInputSize - 1);
+    for (int i = max(0, lastSavedInputSize - 1); i < sampledInputSize - 1; ++i) {
+        (*sampledDirections)[i] = getDirection(sampledInputXs, sampledInputYs, i, i + 1);
+    }
+    return averageSpeed;
+}
+
+// Stores, for every sampled point, calculateBeelineSpeedRate() scaled by MAX_PERCENTILE and
+// truncated to int.
+/* static */ void ProximityInfoStateUtils::refreshBeelineSpeedRates(const int mostCommonKeyWidth,
+        const float averageSpeed, const int inputSize, const int *const xCoordinates,
+        const int *const yCoordinates, const int *times, const int sampledInputSize,
+        const std::vector<int> *const sampledInputXs,
+        const std::vector<int> *const sampledInputYs, const std::vector<int> *const inputIndice,
+        std::vector<int> *beelineSpeedPercentiles) {
+    if (DEBUG_SAMPLING_POINTS) {
+        AKLOGI("--- refresh beeline speed rates");
+    }
+    beelineSpeedPercentiles->resize(sampledInputSize);
+    for (int i = 0; i < sampledInputSize; ++i) {
+        (*beelineSpeedPercentiles)[i] = static_cast<int>(calculateBeelineSpeedRate(
+                mostCommonKeyWidth, averageSpeed, i, inputSize, xCoordinates, yCoordinates, times,
+                sampledInputSize, sampledInputXs, sampledInputYs, inputIndice) * MAX_PERCENTILE);
+    }
+}
+
+// Returns the angle, as computed by GeometryUtils::getAngle(), from the sampled point at
+// index0 to the sampled point at index1, or 0.0f when either index is out of range.
+/* static */ float ProximityInfoStateUtils::getDirection(
+        const std::vector<int> *const sampledInputXs,
+        const std::vector<int> *const sampledInputYs, const int index0, const int index1) {
+    ASSERT(sampledInputXs && sampledInputYs);
+    const int sampledInputSize = static_cast<int>(sampledInputXs->size());
+    if (index0 < 0 || index0 > sampledInputSize - 1) {
+        return 0.0f;
+    }
+    if (index1 < 0 || index1 > sampledInputSize - 1) {
+        return 0.0f;
+    }
+    const int x1 = (*sampledInputXs)[index0];
+    const int y1 = (*sampledInputYs)[index0];
+    const int x2 = (*sampledInputXs)[index1];
+    const int y2 = (*sampledInputYs)[index1];
+    return GeometryUtils::getAngle(x1, y1, x2, y2);
+}
+
+// Calculating point to key distance for all near keys and returning the distance between
+// the given point and the nearest key position.
+/* static */ float ProximityInfoStateUtils::updateNearKeysDistances(
+        const ProximityInfo *const proximityInfo, const float maxPointToKeyLength, const int x,
+        const int y, const bool isGeometric, NearKeysDistanceMap *const currentNearKeysDistances) {
+    // The map is rebuilt from scratch for the point (x, y).
+    currentNearKeysDistances->clear();
+    const int numKeys = proximityInfo->getKeyCount();
+    // The result never exceeds maxPointToKeyLength.
+    float closest = maxPointToKeyLength;
+    for (int keyId = 0; keyId < numKeys; ++keyId) {
+        const float keyDistance = proximityInfo->getNormalizedSquaredDistanceFromCenterFloatG(
+                keyId, x, y, isGeometric);
+        // Only keys below the threshold are remembered as "near".
+        if (keyDistance < ProximityInfoParams::NEAR_KEY_THRESHOLD_FOR_DISTANCE) {
+            currentNearKeysDistances->insert(std::pair<int, float>(keyId, keyDistance));
+        }
+        if (keyDistance < closest) {
+            closest = keyDistance;
+        }
+    }
+    return closest;
+}
+
+// Returns true when, for at least one key near the previous point, both the point before it
+// and the current point are either not near that key at all or farther from it than the
+// previous point by more than MARGIN_FOR_PREV_LOCAL_MIN.
+/* static */ bool ProximityInfoStateUtils::isPrevLocalMin(
+        const NearKeysDistanceMap *const currentNearKeysDistances,
+        const NearKeysDistanceMap *const prevNearKeysDistances,
+        const NearKeysDistanceMap *const prevPrevNearKeysDistances) {
+    NearKeysDistanceMap::const_iterator prevIt = prevNearKeysDistances->begin();
+    for (; prevIt != prevNearKeysDistances->end(); ++prevIt) {
+        const NearKeysDistanceMap::const_iterator beforeIt =
+                prevPrevNearKeysDistances->find(prevIt->first);
+        const NearKeysDistanceMap::const_iterator afterIt =
+                currentNearKeysDistances->find(prevIt->first);
+        // A neighbouring point has to exceed this value to count as farther away.
+        const float limit = prevIt->second + ProximityInfoParams::MARGIN_FOR_PREV_LOCAL_MIN;
+        if (beforeIt != prevPrevNearKeysDistances->end() && !(beforeIt->second > limit)) {
+            // The point before the previous one is not farther from this key.
+            continue;
+        }
+        if (afterIt != currentNearKeysDistances->end() && !(afterIt->second > limit)) {
+            // The current point is not farther from this key.
+            continue;
+        }
+        return true;
+    }
+    return false;
+}
+
+// Calculating a point score that indicates usefulness of the point.
+/* static */ float ProximityInfoStateUtils::getPointScore(const int mostCommonKeyWidth,
+        const int x, const int y, const int time, const bool lastPoint, const float nearest,
+        const float sumAngle, const NearKeysDistanceMap *const currentNearKeysDistances,
+        const NearKeysDistanceMap *const prevNearKeysDistances,
+        const NearKeysDistanceMap *const prevPrevNearKeysDistances,
+        std::vector<int> *sampledInputXs, std::vector<int> *sampledInputYs) {
+    // The score is the sum of a location term and a corner term, or 0.0f when fewer than two
+    // points have been sampled. The time and lastPoint arguments are not used.
+    const size_t size = sampledInputXs->size();
+    // If there is only one point, add this point. Besides, if the previous point's distance map
+    // is empty, we re-compute nearby keys distances from the current point.
+    // Note that the current point is the first point in the incremental input that needs to
+    // be re-computed.
+    if (size <= 1 || prevNearKeysDistances->empty()) {
+        return 0.0f;
+    }
+
+    const int baseSampleRate = mostCommonKeyWidth;
+    // Scaled distance between the last two sampled points.
+    const int distPrev = GeometryUtils::getDistanceInt(sampledInputXs->back(),
+            sampledInputYs->back(), (*sampledInputXs)[size - 2],
+            (*sampledInputYs)[size - 2]) * ProximityInfoParams::DISTANCE_BASE_SCALE;
+    float score = 0.0f;
+
+    // Location
+    if (!isPrevLocalMin(currentNearKeysDistances, prevNearKeysDistances,
+            prevPrevNearKeysDistances)) {
+        score += ProximityInfoParams::NOT_LOCALMIN_DISTANCE_SCORE;
+    } else if (nearest < ProximityInfoParams::NEAR_KEY_THRESHOLD_FOR_POINT_SCORE) {
+        // Promote points nearby keys
+        score += ProximityInfoParams::LOCALMIN_DISTANCE_AND_NEAR_TO_KEY_SCORE;
+    }
+    // Angle
+    const float angle1 = GeometryUtils::getAngle(x, y, sampledInputXs->back(),
+            sampledInputYs->back());
+    const float angle2 = GeometryUtils::getAngle(sampledInputXs->back(), sampledInputYs->back(),
+            (*sampledInputXs)[size - 2], (*sampledInputYs)[size - 2]);
+    const float angleDiff = GeometryUtils::getAngleDiff(angle1, angle2);
+
+    // Save corner
+    if (distPrev > baseSampleRate * ProximityInfoParams::CORNER_CHECK_DISTANCE_THRESHOLD_SCALE
+            && (sumAngle > ProximityInfoParams::CORNER_SUM_ANGLE_THRESHOLD
+                    || angleDiff > ProximityInfoParams::CORNER_ANGLE_THRESHOLD_FOR_POINT_SCORE)) {
+        score += ProximityInfoParams::CORNER_SCORE;
+    }
+    return score;
+}
+
+// Sampling touch point and pushing information to vectors.
+// Returning if previous point is popped or not.
+// When nodeCodePoint is negative and doSampling is set, the previously sampled point is popped
+// if getPointScore() is negative, and a last point that lies too close to the previous sample
+// is dropped without being pushed. When nodeCodePoint is non-negative and x or y is negative,
+// the coordinates are replaced by the center of the key for that code point, if it has one.
+// NOTE(review): maxPointToKeyLength is an int here but a float in updateNearKeysDistances();
+// confirm whether the truncation is intended.
+/* static */ bool ProximityInfoStateUtils::pushTouchPoint(const ProximityInfo *const proximityInfo,
+        const int maxPointToKeyLength, const int inputIndex, const int nodeCodePoint, int x, int y,
+        const int time, const bool isGeometric, const bool doSampling,
+        const bool isLastPoint, const float sumAngle,
+        NearKeysDistanceMap *const currentNearKeysDistances,
+        const NearKeysDistanceMap *const prevNearKeysDistances,
+        const NearKeysDistanceMap *const prevPrevNearKeysDistances,
+        std::vector<int> *sampledInputXs, std::vector<int> *sampledInputYs,
+        std::vector<int> *sampledInputTimes, std::vector<int> *sampledLengthCache,
+        std::vector<int> *sampledInputIndice) {
+    const int mostCommonKeyWidth = proximityInfo->getMostCommonKeyWidth();
+
+    size_t size = sampledInputXs->size();
+    bool popped = false;
+    if (nodeCodePoint < 0 && doSampling) {
+        const float nearest = updateNearKeysDistances(proximityInfo, maxPointToKeyLength, x, y,
+                isGeometric, currentNearKeysDistances);
+        const float score = getPointScore(mostCommonKeyWidth, x, y, time, isLastPoint, nearest,
+                sumAngle, currentNearKeysDistances, prevNearKeysDistances,
+                prevPrevNearKeysDistances, sampledInputXs, sampledInputYs);
+        if (score < 0) {
+            // Pop previous point because it would be useless.
+            popInputData(sampledInputXs, sampledInputYs, sampledInputTimes, sampledLengthCache,
+                    sampledInputIndice);
+            size = sampledInputXs->size();
+            popped = true;
+        }
+        // Check if the last point should be skipped.
+        if (isLastPoint && size > 0) {
+            if (GeometryUtils::getDistanceInt(x, y, sampledInputXs->back(), sampledInputYs->back())
+                    * ProximityInfoParams::LAST_POINT_SKIP_DISTANCE_SCALE < mostCommonKeyWidth) {
+                // This point is not used because it's too close to the previous point.
+                if (DEBUG_GEO_FULL) {
+                    // size is a size_t, so it is printed with %zu.
+                    AKLOGI("p0: size = %zu, x = %d, y = %d, lx = %d, ly = %d, dist = %d, "
+                            "width = %d", size, x, y, sampledInputXs->back(),
+                            sampledInputYs->back(), GeometryUtils::getDistanceInt(
+                                    x, y, sampledInputXs->back(), sampledInputYs->back()),
+                            mostCommonKeyWidth
+                                    / ProximityInfoParams::LAST_POINT_SKIP_DISTANCE_SCALE);
+                }
+                return popped;
+            }
+        }
+    }
+
+    if (nodeCodePoint >= 0 && (x < 0 || y < 0)) {
+        const int keyId = proximityInfo->getKeyIndexOf(nodeCodePoint);
+        if (keyId >= 0) {
+            x = proximityInfo->getKeyCenterXOfKeyIdG(keyId, NOT_AN_INDEX, isGeometric);
+            y = proximityInfo->getKeyCenterYOfKeyIdG(keyId, NOT_AN_INDEX, isGeometric);
+        }
+    }
+
+    // Pushing point information.
+    if (size > 0) {
+        // The length cache is cumulative: previous total plus the distance to this point.
+        sampledLengthCache->push_back(
+                sampledLengthCache->back() + GeometryUtils::getDistanceInt(
+                        x, y, sampledInputXs->back(), sampledInputYs->back()));
+    } else {
+        sampledLengthCache->push_back(0);
+    }
+    sampledInputXs->push_back(x);
+    sampledInputYs->push_back(y);
+    sampledInputTimes->push_back(time);
+    sampledInputIndice->push_back(inputIndex);
+    if (DEBUG_GEO_FULL) {
+        AKLOGI("pushTouchPoint: x = %03d, y = %03d, time = %d, index = %d, popped ? %01d",
+                x, y, time, inputIndex, popped);
+    }
+    return popped;
+}
+
+// Returns 0.01f plus the ratio of the straight-line ("beeline") speed around the sampled point
+// id to averageSpeed. The beeline runs between the raw input points found by walking away
+// from the sampled point in both directions until they leave the lookup radius.
+// Returns 1.0f when there are no sampled points, when averageSpeed is below 0.001f or NaN,
+// when the walk yields no span, or when the adjusted elapsed time is not positive. Returns
+// 0.0f when the elapsed time reaches STRONG_DOUBLE_LETTER_TIME_MILLIS.
+/* static */ float ProximityInfoStateUtils::calculateBeelineSpeedRate(const int mostCommonKeyWidth,
+        const float averageSpeed, const int id, const int inputSize, const int *const xCoordinates,
+        const int *const yCoordinates, const int *times, const int sampledInputSize,
+        const std::vector<int> *const sampledInputXs,
+        const std::vector<int> *const sampledInputYs,
+        const std::vector<int> *const sampledInputIndices) {
+    // Written as a negated ">=" so that a NaN average speed is rejected as well; otherwise the
+    // NaN would flow into the returned rate.
+    if (sampledInputSize <= 0 || !(averageSpeed >= 0.001f)) {
+        if (DEBUG_SAMPLING_POINTS) {
+            AKLOGI("--- invalid state: cancel. size = %d, ave = %f",
+                    sampledInputSize, averageSpeed);
+        }
+        return 1.0f;
+    }
+    const int lookupRadius = mostCommonKeyWidth
+            * ProximityInfoParams::LOOKUP_RADIUS_PERCENTILE / MAX_PERCENTILE;
+    const int x0 = (*sampledInputXs)[id];
+    const int y0 = (*sampledInputYs)[id];
+    const int actualInputIndex = (*sampledInputIndices)[id];
+    int tempBeelineDistance = 0;
+    int start = actualInputIndex;
+    // lookup backward (towards earlier raw input points)
+    while (start > 0 && tempBeelineDistance < lookupRadius) {
+        --start;
+        tempBeelineDistance = GeometryUtils::getDistanceInt(x0, y0, xCoordinates[start],
+                yCoordinates[start]);
+    }
+    // Exclusive unless this is an edge point
+    if (start > 0 && start < actualInputIndex) {
+        ++start;
+    }
+    tempBeelineDistance = 0;
+    int end = actualInputIndex;
+    // lookup forward (towards later raw input points)
+    while (end < (inputSize - 1) && tempBeelineDistance < lookupRadius) {
+        ++end;
+        tempBeelineDistance = GeometryUtils::getDistanceInt(x0, y0, xCoordinates[end],
+                yCoordinates[end]);
+    }
+    // Exclusive unless this is an edge point
+    if (end > actualInputIndex && end < (inputSize - 1)) {
+        --end;
+    }
+
+    if (start >= end) {
+        if (DEBUG_DOUBLE_LETTER) {
+            AKLOGI("--- double letter: start == end %d", start);
+        }
+        return 1.0f;
+    }
+
+    const int x2 = xCoordinates[start];
+    const int y2 = yCoordinates[start];
+    const int x3 = xCoordinates[end];
+    const int y3 = yCoordinates[end];
+    const int beelineDistance = GeometryUtils::getDistanceInt(x2, y2, x3, y3);
+    // The time span is shortened by FIRST_POINT_TIME_OFFSET_MILLIS at either end of the input.
+    int adjustedStartTime = times[start];
+    if (start == 0 && actualInputIndex == 0 && inputSize > 1) {
+        adjustedStartTime += ProximityInfoParams::FIRST_POINT_TIME_OFFSET_MILLIS;
+    }
+    int adjustedEndTime = times[end];
+    if (end == (inputSize - 1) && inputSize > 1) {
+        adjustedEndTime -= ProximityInfoParams::FIRST_POINT_TIME_OFFSET_MILLIS;
+    }
+    const int time = adjustedEndTime - adjustedStartTime;
+    if (time <= 0) {
+        return 1.0f;
+    }
+
+    if (time >= ProximityInfoParams::STRONG_DOUBLE_LETTER_TIME_MILLIS) {
+        return 0.0f;
+    }
+    if (DEBUG_DOUBLE_LETTER) {
+        AKLOGI("--- (%d, %d) double letter: start = %d, end = %d, dist = %d, time = %d,"
+                " speed = %f, ave = %f, val = %f, start time = %d, end time = %d",
+                id, (*sampledInputIndices)[id], start, end, beelineDistance, time,
+                (static_cast<float>(beelineDistance) / static_cast<float>(time)), averageSpeed,
+                ((static_cast<float>(beelineDistance) / static_cast<float>(time))
+                        / averageSpeed), adjustedStartTime, adjustedEndTime);
+    }
+    // Offset 1%
+    // TODO: Detect double letter more smartly
+    return 0.01f + static_cast<float>(beelineDistance) / static_cast<float>(time) / averageSpeed;
+}
+
+// Returns the difference between the direction into and the direction out of the sampled
+// point at index. Returns 0.0f for null vectors and for the first and last sampled points.
+/* static */ float ProximityInfoStateUtils::getPointAngle(
+        const std::vector<int> *const sampledInputXs,
+        const std::vector<int> *const sampledInputYs, const int index) {
+    if (!sampledInputXs || !sampledInputYs) {
+        return 0.0f;
+    }
+    const int sampledInputSize = static_cast<int>(sampledInputXs->size());
+    if (index <= 0 || index >= sampledInputSize - 1) {
+        return 0.0f;
+    }
+    const float previousDirection = getDirection(sampledInputXs, sampledInputYs, index - 1, index);
+    const float nextDirection = getDirection(sampledInputXs, sampledInputYs, index, index + 1);
+    return GeometryUtils::getAngleDiff(previousDirection, nextDirection);
+}
+
+// Returns the difference between the direction index0 -> index1 and the direction
+// index1 -> index2. Returns 0.0f for null vectors or when any index is out of range.
+/* static */ float ProximityInfoStateUtils::getPointsAngle(
+        const std::vector<int> *const sampledInputXs,
+        const std::vector<int> *const sampledInputYs,
+        const int index0, const int index1, const int index2) {
+    if (!sampledInputXs || !sampledInputYs) {
+        return 0.0f;
+    }
+    const int sampledInputSize = static_cast<int>(sampledInputXs->size());
+    if (index0 < 0 || index0 > sampledInputSize - 1) {
+        return 0.0f;
+    }
+    if (index1 < 0 || index1 > sampledInputSize - 1) {
+        return 0.0f;
+    }
+    if (index2 < 0 || index2 > sampledInputSize - 1) {
+        return 0.0f;
+    }
+    const float previousDirection = getDirection(sampledInputXs, sampledInputYs, index0, index1);
+    const float nextDirection = getDirection(sampledInputXs, sampledInputYs, index1, index2);
+    return GeometryUtils::getAngleDiff(previousDirection, nextDirection);
+}
+
+// This function basically converts from a length to an edit distance. Accordingly, it's obviously
+// wrong to compare with mMaxPointToKeyLength.
+// The cached value is read from [inputIndex * keyCount + keyId] and capped at
+// maxPointToKeyLength.
+/* static */ float ProximityInfoStateUtils::getPointToKeyByIdLength(const float maxPointToKeyLength,
+        const std::vector<float> *const sampledNormalizedSquaredLengthCache, const int keyCount,
+        const int inputIndex, const int keyId) {
+    if (keyId != NOT_AN_INDEX) {
+        const int index = inputIndex * keyCount + keyId;
+        return min((*sampledNormalizedSquaredLengthCache)[index], maxPointToKeyLength);
+    }
+    // If the char is not a key on the keyboard then return the max length.
+    return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+}
+
+// Updates probabilities of aligning to some keys and skipping.
+// Word suggestion should be based on these probabilities.
+/* static */ void ProximityInfoStateUtils::updateAlignPointProbabilities( + const float maxPointToKeyLength, const int mostCommonKeyWidth, const int keyCount, + const int start, const int sampledInputSize, const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, + const std::vector<float> *const sampledSpeedRates, + const std::vector<int> *const sampledLengthCache, + const std::vector<float> *const sampledNormalizedSquaredLengthCache, + std::vector<NearKeycodesSet> *sampledNearKeySets, + std::vector<hash_map_compat<int, float> > *charProbabilities) { + charProbabilities->resize(sampledInputSize); + // Calculates probabilities of using a point as a correlated point with the character + // for each point. + for (int i = start; i < sampledInputSize; ++i) { + (*charProbabilities)[i].clear(); + // First, calculates skip probability. Starts from MAX_SKIP_PROBABILITY. + // Note that all values that are multiplied to this probability should be in [0.0, 1.0]; + float skipProbability = ProximityInfoParams::MAX_SKIP_PROBABILITY; + + const float currentAngle = getPointAngle(sampledInputXs, sampledInputYs, i); + const float speedRate = (*sampledSpeedRates)[i]; + + float nearestKeyDistance = static_cast<float>(MAX_VALUE_FOR_WEIGHTING); + for (int j = 0; j < keyCount; ++j) { + if ((*sampledNearKeySets)[i].test(j)) { + const float distance = getPointToKeyByIdLength( + maxPointToKeyLength, sampledNormalizedSquaredLengthCache, keyCount, i, j); + if (distance < nearestKeyDistance) { + nearestKeyDistance = distance; + } + } + } + + if (i == 0) { + skipProbability *= min(1.0f, + nearestKeyDistance * ProximityInfoParams::NEAREST_DISTANCE_WEIGHT + + ProximityInfoParams::NEAREST_DISTANCE_BIAS); + // Promote the first point + skipProbability *= ProximityInfoParams::SKIP_FIRST_POINT_PROBABILITY; + } else if (i == sampledInputSize - 1) { + skipProbability *= min(1.0f, + nearestKeyDistance * ProximityInfoParams::NEAREST_DISTANCE_WEIGHT_FOR_LAST + + 
ProximityInfoParams::NEAREST_DISTANCE_BIAS_FOR_LAST); + // Promote the last point + skipProbability *= ProximityInfoParams::SKIP_LAST_POINT_PROBABILITY; + } else { + // If the current speed is relatively slower than adjacent keys, we promote this point. + if ((*sampledSpeedRates)[i - 1] - ProximityInfoParams::SPEED_MARGIN > speedRate + && speedRate + < (*sampledSpeedRates)[i + 1] - ProximityInfoParams::SPEED_MARGIN) { + if (currentAngle < ProximityInfoParams::CORNER_ANGLE_THRESHOLD) { + skipProbability *= min(1.0f, speedRate + * ProximityInfoParams::SLOW_STRAIGHT_WEIGHT_FOR_SKIP_PROBABILITY); + } else { + // If the angle is small enough, we promote this point more. (e.g. pit vs put) + skipProbability *= min(1.0f, + speedRate * ProximityInfoParams::SPEED_WEIGHT_FOR_SKIP_PROBABILITY + + ProximityInfoParams::MIN_SPEED_RATE_FOR_SKIP_PROBABILITY); + } + } + + skipProbability *= min(1.0f, + speedRate * nearestKeyDistance * ProximityInfoParams::NEAREST_DISTANCE_WEIGHT + + ProximityInfoParams::NEAREST_DISTANCE_BIAS); + + // Adjusts skip probability by a rate depending on angle. + // ANGLE_RATE of skipProbability is adjusted by current angle. + skipProbability *= (M_PI_F - currentAngle) / M_PI_F * ProximityInfoParams::ANGLE_WEIGHT + + (1.0f - ProximityInfoParams::ANGLE_WEIGHT); + if (currentAngle > ProximityInfoParams::DEEP_CORNER_ANGLE_THRESHOLD) { + skipProbability *= ProximityInfoParams::SKIP_DEEP_CORNER_PROBABILITY; + } + // We assume the angle of this point is the angle for point[i], point[i - 2] + // and point[i - 3]. The reason why we don't use the angle for point[i], point[i - 1] + // and point[i - 2] is this angle can be more affected by the noise. 
+ const float prevAngle = getPointsAngle(sampledInputXs, sampledInputYs, i, i - 2, i - 3); + if (i >= 3 && prevAngle < ProximityInfoParams::STRAIGHT_ANGLE_THRESHOLD + && currentAngle > ProximityInfoParams::CORNER_ANGLE_THRESHOLD) { + skipProbability *= ProximityInfoParams::SKIP_CORNER_PROBABILITY; + } + } + + // probabilities must be in [0.0, ProximityInfoParams::MAX_SKIP_PROBABILITY]; + ASSERT(skipProbability >= 0.0f); + ASSERT(skipProbability <= ProximityInfoParams::MAX_SKIP_PROBABILITY); + (*charProbabilities)[i][NOT_AN_INDEX] = skipProbability; + + // Second, calculates key probabilities by dividing the rest probability + // (1.0f - skipProbability). + const float inputCharProbability = 1.0f - skipProbability; + + const float speedxAngleRate = min(speedRate * currentAngle / M_PI_F + * ProximityInfoParams::SPEEDxANGLE_WEIGHT_FOR_STANDARD_DEVIATION, + ProximityInfoParams::MAX_SPEEDxANGLE_RATE_FOR_STANDARD_DEVIATION); + const float speedxNearestKeyDistanceRate = min(speedRate * nearestKeyDistance + * ProximityInfoParams::SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DEVIATION, + ProximityInfoParams::MAX_SPEEDxNEAREST_RATE_FOR_STANDARD_DEVIATION); + const float sigma = speedxAngleRate + speedxNearestKeyDistanceRate + + ProximityInfoParams::MIN_STANDARD_DEVIATION; + + ProximityInfoUtils::NormalDistribution + distribution(ProximityInfoParams::CENTER_VALUE_OF_NORMALIZED_DISTRIBUTION, sigma); + // Summing up probability densities of all near keys. + float sumOfProbabilityDensities = 0.0f; + for (int j = 0; j < keyCount; ++j) { + if ((*sampledNearKeySets)[i].test(j)) { + float distance = sqrtf(getPointToKeyByIdLength( + maxPointToKeyLength, sampledNormalizedSquaredLengthCache, keyCount, i, j)); + if (i == 0 && i != sampledInputSize - 1) { + // For the first point, weighted average of distances from first point and the + // next point to the key is used as a point to key distance. 
+ const float nextDistance = sqrtf(getPointToKeyByIdLength( + maxPointToKeyLength, sampledNormalizedSquaredLengthCache, keyCount, + i + 1, j)); + if (nextDistance < distance) { + // The distance of the first point tends to bigger than continuing + // points because the first touch by the user can be sloppy. + // So we promote the first point if the distance of that point is larger + // than the distance of the next point. + distance = (distance + + nextDistance * ProximityInfoParams::NEXT_DISTANCE_WEIGHT) + / (1.0f + ProximityInfoParams::NEXT_DISTANCE_WEIGHT); + } + } else if (i != 0 && i == sampledInputSize - 1) { + // For the first point, weighted average of distances from last point and + // the previous point to the key is used as a point to key distance. + const float previousDistance = sqrtf(getPointToKeyByIdLength( + maxPointToKeyLength, sampledNormalizedSquaredLengthCache, keyCount, + i - 1, j)); + if (previousDistance < distance) { + // The distance of the last point tends to bigger than continuing points + // because the last touch by the user can be sloppy. So we promote the + // last point if the distance of that point is larger than the distance of + // the previous point. + distance = (distance + + previousDistance * ProximityInfoParams::PREV_DISTANCE_WEIGHT) + / (1.0f + ProximityInfoParams::PREV_DISTANCE_WEIGHT); + } + } + // TODO: Promote the first point when the extended line from the next input is near + // from a key. Also, promote the last point as well. + sumOfProbabilityDensities += distribution.getProbabilityDensity(distance); + } + } + + // Split the probability of an input point to keys that are close to the input point. 
+ for (int j = 0; j < keyCount; ++j) { + if ((*sampledNearKeySets)[i].test(j)) { + float distance = sqrtf(getPointToKeyByIdLength( + maxPointToKeyLength, sampledNormalizedSquaredLengthCache, keyCount, i, j)); + if (i == 0 && i != sampledInputSize - 1) { + // For the first point, weighted average of distances from the first point and + // the next point to the key is used as a point to key distance. + const float prevDistance = sqrtf(getPointToKeyByIdLength( + maxPointToKeyLength, sampledNormalizedSquaredLengthCache, keyCount, + i + 1, j)); + if (prevDistance < distance) { + distance = (distance + + prevDistance * ProximityInfoParams::NEXT_DISTANCE_WEIGHT) + / (1.0f + ProximityInfoParams::NEXT_DISTANCE_WEIGHT); + } + } else if (i != 0 && i == sampledInputSize - 1) { + // For the first point, weighted average of distances from last point and + // the previous point to the key is used as a point to key distance. + const float prevDistance = sqrtf(getPointToKeyByIdLength( + maxPointToKeyLength, sampledNormalizedSquaredLengthCache, keyCount, + i - 1, j)); + if (prevDistance < distance) { + distance = (distance + + prevDistance * ProximityInfoParams::PREV_DISTANCE_WEIGHT) + / (1.0f + ProximityInfoParams::PREV_DISTANCE_WEIGHT); + } + } + const float probabilityDensity = distribution.getProbabilityDensity(distance); + const float probability = inputCharProbability * probabilityDensity + / sumOfProbabilityDensities; + (*charProbabilities)[i][j] = probability; + } + } + } + + if (DEBUG_POINTS_PROBABILITY) { + for (int i = 0; i < sampledInputSize; ++i) { + std::stringstream sstream; + sstream << i << ", "; + sstream << "(" << (*sampledInputXs)[i] << ", " << (*sampledInputYs)[i] << "), "; + sstream << "Speed: "<< (*sampledSpeedRates)[i] << ", "; + sstream << "Angle: "<< getPointAngle(sampledInputXs, sampledInputYs, i) << ", \n"; + + for (hash_map_compat<int, float>::iterator it = (*charProbabilities)[i].begin(); + it != (*charProbabilities)[i].end(); ++it) { + if (it->first == 
NOT_AN_INDEX) { + sstream << it->first + << "(skip):" + << it->second + << "\n"; + } else { + sstream << it->first + << "(" + //<< static_cast<char>(mProximityInfo->getCodePointOf(it->first)) + << "):" + << it->second + << "\n"; + } + } + AKLOGI("%s", sstream.str().c_str()); + } + } + + // Decrease key probabilities of points which don't have the highest probability of that key + // among nearby points. Probabilities of the first point and the last point are not suppressed. + for (int i = max(start, 1); i < sampledInputSize; ++i) { + for (int j = i + 1; j < sampledInputSize; ++j) { + if (!suppressCharProbabilities( + mostCommonKeyWidth, sampledInputSize, sampledLengthCache, i, j, + charProbabilities)) { + break; + } + } + for (int j = i - 1; j >= max(start, 0); --j) { + if (!suppressCharProbabilities( + mostCommonKeyWidth, sampledInputSize, sampledLengthCache, i, j, + charProbabilities)) { + break; + } + } + } + + // Converting from raw probabilities to log probabilities to calculate spatial distance. + for (int i = start; i < sampledInputSize; ++i) { + for (int j = 0; j < keyCount; ++j) { + hash_map_compat<int, float>::iterator it = (*charProbabilities)[i].find(j); + if (it == (*charProbabilities)[i].end()){ + (*sampledNearKeySets)[i].reset(j); + } else if(it->second < ProximityInfoParams::MIN_PROBABILITY) { + // Erases from near keys vector because it has very low probability. 
+ (*sampledNearKeySets)[i].reset(j); + (*charProbabilities)[i].erase(j); + } else { + it->second = -logf(it->second); + } + } + (*charProbabilities)[i][NOT_AN_INDEX] = -logf((*charProbabilities)[i][NOT_AN_INDEX]); + } +} + +/* static */ void ProximityInfoStateUtils::updateSampledSearchKeySets( + const ProximityInfo *const proximityInfo, const int sampledInputSize, + const int lastSavedInputSize, + const std::vector<int> *const sampledLengthCache, + const std::vector<NearKeycodesSet> *const sampledNearKeySets, + std::vector<NearKeycodesSet> *sampledSearchKeySets, + std::vector<std::vector<int> > *sampledSearchKeyVectors) { + sampledSearchKeySets->resize(sampledInputSize); + sampledSearchKeyVectors->resize(sampledInputSize); + const int readForwordLength = static_cast<int>( + hypotf(proximityInfo->getKeyboardWidth(), proximityInfo->getKeyboardHeight()) + * ProximityInfoParams::SEARCH_KEY_RADIUS_RATIO); + for (int i = 0; i < sampledInputSize; ++i) { + if (i >= lastSavedInputSize) { + (*sampledSearchKeySets)[i].reset(); + } + for (int j = max(i, lastSavedInputSize); j < sampledInputSize; ++j) { + // TODO: Investigate if this is required. This may not fail. 
+ if ((*sampledLengthCache)[j] - (*sampledLengthCache)[i] >= readForwordLength) { + break; + } + (*sampledSearchKeySets)[i] |= (*sampledNearKeySets)[j]; + } + } + const int keyCount = proximityInfo->getKeyCount(); + for (int i = 0; i < sampledInputSize; ++i) { + std::vector<int> *searchKeyVector = &(*sampledSearchKeyVectors)[i]; + searchKeyVector->clear(); + for (int j = 0; j < keyCount; ++j) { + if ((*sampledSearchKeySets)[i].test(j)) { + const int keyCodePoint = proximityInfo->getCodePointOf(j); + if (std::find(searchKeyVector->begin(), searchKeyVector->end(), keyCodePoint) + == searchKeyVector->end()) { + searchKeyVector->push_back(keyCodePoint); + } + } + } + } +} + +// Decreases char probabilities of index0 by checking probabilities of a near point (index1) and +// increases char probabilities of index1 by checking probabilities of index0. +/* static */ bool ProximityInfoStateUtils::suppressCharProbabilities(const int mostCommonKeyWidth, + const int sampledInputSize, const std::vector<int> *const lengthCache, + const int index0, const int index1, + std::vector<hash_map_compat<int, float> > *charProbabilities) { + ASSERT(0 <= index0 && index0 < sampledInputSize); + ASSERT(0 <= index1 && index1 < sampledInputSize); + const float keyWidthFloat = static_cast<float>(mostCommonKeyWidth); + const float diff = fabsf(static_cast<float>((*lengthCache)[index0] - (*lengthCache)[index1])); + if (diff > keyWidthFloat * ProximityInfoParams::SUPPRESSION_LENGTH_WEIGHT) { + return false; + } + const float suppressionRate = ProximityInfoParams::MIN_SUPPRESSION_RATE + + diff / keyWidthFloat / ProximityInfoParams::SUPPRESSION_LENGTH_WEIGHT + * ProximityInfoParams::SUPPRESSION_WEIGHT; + for (hash_map_compat<int, float>::iterator it = (*charProbabilities)[index0].begin(); + it != (*charProbabilities)[index0].end(); ++it) { + hash_map_compat<int, float>::iterator it2 = (*charProbabilities)[index1].find(it->first); + if (it2 != (*charProbabilities)[index1].end() && it->second < 
it2->second) { + const float newProbability = it->second * suppressionRate; + const float suppression = it->second - newProbability; + it->second = newProbability; + // mCharProbabilities[index0][NOT_AN_INDEX] is the probability of skipping this point. + (*charProbabilities)[index0][NOT_AN_INDEX] += suppression; + + // Add the probability of the same key nearby index1 + const float probabilityGain = min(suppression + * ProximityInfoParams::SUPPRESSION_WEIGHT_FOR_PROBABILITY_GAIN, + (*charProbabilities)[index1][NOT_AN_INDEX] + * ProximityInfoParams::SKIP_PROBABALITY_WEIGHT_FOR_PROBABILITY_GAIN); + it2->second += probabilityGain; + (*charProbabilities)[index1][NOT_AN_INDEX] -= probabilityGain; + } + } + return true; +} + +/* static */ bool ProximityInfoStateUtils::checkAndReturnIsContinuousSuggestionPossible( + const int inputSize, const int *const xCoordinates, const int *const yCoordinates, + const int *const times, const int sampledInputSize, + const std::vector<int> *const sampledInputXs, const std::vector<int> *const sampledInputYs, + const std::vector<int> *const sampledTimes, + const std::vector<int> *const sampledInputIndices) { + if (inputSize < sampledInputSize) { + return false; + } + for (int i = 0; i < sampledInputSize; ++i) { + const int index = (*sampledInputIndices)[i]; + if (index >= inputSize) { + return false; + } + if (xCoordinates[index] != (*sampledInputXs)[i] + || yCoordinates[index] != (*sampledInputYs)[i]) { + return false; + } + if (!times) { + continue; + } + if (times[index] != (*sampledTimes)[i]) { + return false; + } + } + return true; +} + +// Get a word that is detected by tracing the most probable string into codePointBuf and +// returns probability of generating the word. 
+/* static */ float ProximityInfoStateUtils::getMostProbableString( + const ProximityInfo *const proximityInfo, const int sampledInputSize, + const std::vector<hash_map_compat<int, float> > *const charProbabilities, + int *const codePointBuf) { + ASSERT(sampledInputSize >= 0); + memset(codePointBuf, 0, sizeof(codePointBuf[0]) * MAX_WORD_LENGTH); + int index = 0; + float sumLogProbability = 0.0f; + // TODO: Current implementation is greedy algorithm. DP would be efficient for many cases. + for (int i = 0; i < sampledInputSize && index < MAX_WORD_LENGTH - 1; ++i) { + float minLogProbability = static_cast<float>(MAX_VALUE_FOR_WEIGHTING); + int character = NOT_AN_INDEX; + for (hash_map_compat<int, float>::const_iterator it = (*charProbabilities)[i].begin(); + it != (*charProbabilities)[i].end(); ++it) { + const float logProbability = (it->first != NOT_AN_INDEX) + ? it->second + ProximityInfoParams::DEMOTION_LOG_PROBABILITY : it->second; + if (logProbability < minLogProbability) { + minLogProbability = logProbability; + character = it->first; + } + } + if (character != NOT_AN_INDEX) { + codePointBuf[index] = proximityInfo->getCodePointOf(character); + index++; + } + sumLogProbability += minLogProbability; + } + codePointBuf[index] = '\0'; + return sumLogProbability; +} + +/* static */ void ProximityInfoStateUtils::dump(const bool isGeometric, const int inputSize, + const int *const inputXCoordinates, const int *const inputYCoordinates, + const int sampledInputSize, const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, + const std::vector<int> *const sampledTimes, + const std::vector<float> *const sampledSpeedRates, + const std::vector<int> *const sampledBeelineSpeedPercentiles) { + if (DEBUG_GEO_FULL) { + for (int i = 0; i < sampledInputSize; ++i) { + AKLOGI("Sampled(%d): x = %d, y = %d, time = %d", i, (*sampledInputXs)[i], + (*sampledInputYs)[i], sampledTimes ? 
(*sampledTimes)[i] : -1); + } + } + + std::stringstream originalX, originalY, sampledX, sampledY; + for (int i = 0; i < inputSize; ++i) { + originalX << inputXCoordinates[i]; + originalY << inputYCoordinates[i]; + if (i != inputSize - 1) { + originalX << ";"; + originalY << ";"; + } + } + AKLOGI("===== sampled points ====="); + for (int i = 0; i < sampledInputSize; ++i) { + if (isGeometric) { + AKLOGI("%d: x = %d, y = %d, time = %d, relative speed = %.4f, beeline speed = %d", + i, (*sampledInputXs)[i], (*sampledInputYs)[i], (*sampledTimes)[i], + (*sampledSpeedRates)[i], (*sampledBeelineSpeedPercentiles)[i]); + } + sampledX << (*sampledInputXs)[i]; + sampledY << (*sampledInputYs)[i]; + if (i != sampledInputSize - 1) { + sampledX << ";"; + sampledY << ";"; + } + } + AKLOGI("original points:\n%s, %s,\nsampled points:\n%s, %s,\n", + originalX.str().c_str(), originalY.str().c_str(), sampledX.str().c_str(), + sampledY.str().c_str()); +} +} // namespace latinime
diff --git a/src/aosp/suggest/core/layout/proximity_info_state_utils.h b/src/aosp/suggest/core/layout/proximity_info_state_utils.h new file mode 100644 index 0000000..6de9700 --- /dev/null +++ b/src/aosp/suggest/core/layout/proximity_info_state_utils.h
@@ -0,0 +1,165 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROXIMITY_INFO_STATE_UTILS_H +#define LATINIME_PROXIMITY_INFO_STATE_UTILS_H + +#include <bitset> +#include <vector> + +#include "defines.h" +#include "utils/hash_map_compat.h" + +namespace latinime { +class ProximityInfo; +class ProximityInfoParams; + +class ProximityInfoStateUtils { + public: + typedef hash_map_compat<int, float> NearKeysDistanceMap; + typedef std::bitset<MAX_KEY_COUNT_IN_A_KEYBOARD> NearKeycodesSet; + + static int trimLastTwoTouchPoints(std::vector<int> *sampledInputXs, + std::vector<int> *sampledInputYs, std::vector<int> *sampledInputTimes, + std::vector<int> *sampledLengthCache, std::vector<int> *sampledInputIndice); + static int updateTouchPoints(const ProximityInfo *const proximityInfo, + const int maxPointToKeyLength, const int *const inputProximities, + const int *const inputXCoordinates, const int *const inputYCoordinates, + const int *const times, const int *const pointerIds, const int inputSize, + const bool isGeometric, const int pointerId, const int pushTouchPointStartIndex, + std::vector<int> *sampledInputXs, std::vector<int> *sampledInputYs, + std::vector<int> *sampledInputTimes, std::vector<int> *sampledLengthCache, + std::vector<int> *sampledInputIndice); + static const int *getProximityCodePointsAt(const int *const inputProximities, const int index); + static int 
getPrimaryCodePointAt(const int *const inputProximities, const int index); + static void popInputData(std::vector<int> *sampledInputXs, std::vector<int> *sampledInputYs, + std::vector<int> *sampledInputTimes, std::vector<int> *sampledLengthCache, + std::vector<int> *sampledInputIndice); + static float refreshSpeedRates(const int inputSize, const int *const xCoordinates, + const int *const yCoordinates, const int *const times, const int lastSavedInputSize, + const int sampledInputSize, const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, + const std::vector<int> *const sampledInputTimes, + const std::vector<int> *const sampledLengthCache, + const std::vector<int> *const sampledInputIndice, + std::vector<float> *sampledSpeedRates, std::vector<float> *sampledDirections); + static void refreshBeelineSpeedRates(const int mostCommonKeyWidth, const float averageSpeed, + const int inputSize, const int *const xCoordinates, const int *const yCoordinates, + const int *times, const int sampledInputSize, + const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, const std::vector<int> *const inputIndice, + std::vector<int> *beelineSpeedPercentiles); + static float getDirection(const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, const int index0, const int index1); + static void updateAlignPointProbabilities(const float maxPointToKeyLength, + const int mostCommonKeyWidth, const int keyCount, const int start, + const int sampledInputSize, const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, + const std::vector<float> *const sampledSpeedRates, + const std::vector<int> *const sampledLengthCache, + const std::vector<float> *const sampledNormalizedSquaredLengthCache, + std::vector<NearKeycodesSet> *sampledNearKeySets, + std::vector<hash_map_compat<int, float> > *charProbabilities); + static void 
updateSampledSearchKeySets(const ProximityInfo *const proximityInfo, + const int sampledInputSize, const int lastSavedInputSize, + const std::vector<int> *const sampledLengthCache, + const std::vector<NearKeycodesSet> *const sampledNearKeySets, + std::vector<NearKeycodesSet> *sampledSearchKeySets, + std::vector<std::vector<int> > *sampledSearchKeyVectors); + static float getPointToKeyByIdLength(const float maxPointToKeyLength, + const std::vector<float> *const sampledNormalizedSquaredLengthCache, const int keyCount, + const int inputIndex, const int keyId); + static void initGeometricDistanceInfos(const ProximityInfo *const proximityInfo, + const int sampledInputSize, const int lastSavedInputSize, const bool isGeometric, + const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, + std::vector<NearKeycodesSet> *sampledNearKeySets, + std::vector<float> *sampledNormalizedSquaredLengthCache); + static void initPrimaryInputWord(const int inputSize, const int *const inputProximities, + int *primaryInputWord); + static void initNormalizedSquaredDistances(const ProximityInfo *const proximityInfo, + const int inputSize, const int *inputXCoordinates, const int *inputYCoordinates, + const int *const inputProximities, const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, int *normalizedSquaredDistances); + static void dump(const bool isGeometric, const int inputSize, + const int *const inputXCoordinates, const int *const inputYCoordinates, + const int sampledInputSize, const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, + const std::vector<int> *const sampledTimes, + const std::vector<float> *const sampledSpeedRates, + const std::vector<int> *const sampledBeelineSpeedPercentiles); + static bool checkAndReturnIsContinuousSuggestionPossible(const int inputSize, + const int *const xCoordinates, const int *const yCoordinates, const int *const times, + const int 
sampledInputSize, const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, + const std::vector<int> *const sampledTimes, + const std::vector<int> *const sampledInputIndices); + // TODO: Move to most_probable_string_utils.h + static float getMostProbableString(const ProximityInfo *const proximityInfo, + const int sampledInputSize, + const std::vector<hash_map_compat<int, float> > *const charProbabilities, + int *const codePointBuf); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProximityInfoStateUtils); + + static float updateNearKeysDistances(const ProximityInfo *const proximityInfo, + const float maxPointToKeyLength, const int x, const int y, + const bool isGeometric, + NearKeysDistanceMap *const currentNearKeysDistances); + static bool isPrevLocalMin(const NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances); + static float getPointScore(const int mostCommonKeyWidth, const int x, const int y, + const int time, const bool lastPoint, const float nearest, const float sumAngle, + const NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances, + std::vector<int> *sampledInputXs, std::vector<int> *sampledInputYs); + static bool pushTouchPoint(const ProximityInfo *const proximityInfo, + const int maxPointToKeyLength, const int inputIndex, const int nodeCodePoint, int x, + int y, const int time, const bool isGeometric, + const bool doSampling, const bool isLastPoint, + const float sumAngle, NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances, + std::vector<int> *sampledInputXs, std::vector<int> *sampledInputYs, + std::vector<int> *sampledInputTimes, std::vector<int> *sampledLengthCache, 
+ std::vector<int> *sampledInputIndice); + static float calculateBeelineSpeedRate(const int mostCommonKeyWidth, const float averageSpeed, + const int id, const int inputSize, const int *const xCoordinates, + const int *const yCoordinates, const int *times, const int sampledInputSize, + const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, + const std::vector<int> *const inputIndice); + static float getPointAngle(const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, const int index); + static float getPointsAngle(const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, const int index0, const int index1, + const int index2); + static bool suppressCharProbabilities(const int mostCommonKeyWidth, + const int sampledInputSize, const std::vector<int> *const lengthCache, const int index0, + const int index1, std::vector<hash_map_compat<int, float> > *charProbabilities); + static float calculateSquaredDistanceFromSweetSpotCenter( + const ProximityInfo *const proximityInfo, const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, const int keyIndex, + const int inputIndex); + static float calculateNormalizedSquaredDistance(const ProximityInfo *const proximityInfo, + const std::vector<int> *const sampledInputXs, + const std::vector<int> *const sampledInputYs, const int keyIndex, const int inputIndex); +}; +} // namespace latinime +#endif // LATINIME_PROXIMITY_INFO_STATE_UTILS_H
diff --git a/src/aosp/suggest/core/layout/proximity_info_utils.h b/src/aosp/suggest/core/layout/proximity_info_utils.h new file mode 100644 index 0000000..0e28560 --- /dev/null +++ b/src/aosp/suggest/core/layout/proximity_info_utils.h
@@ -0,0 +1,249 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROXIMITY_INFO_UTILS_H +#define LATINIME_PROXIMITY_INFO_UTILS_H + +#include <cmath> + +#include "defines.h" +#include "suggest/core/layout/additional_proximity_chars.h" +#include "suggest/core/layout/geometry_utils.h" +#include "utils/char_utils.h" +#include "utils/hash_map_compat.h" + +namespace latinime { +class ProximityInfoUtils { + public: + static AK_FORCE_INLINE int getKeyIndexOf(const int keyCount, const int c, + const hash_map_compat<int, int> *const codeToKeyMap) { + if (keyCount == 0) { + // We do not have the coordinate data + return NOT_AN_INDEX; + } + if (c == NOT_A_CODE_POINT) { + return NOT_AN_INDEX; + } + const int lowerCode = CharUtils::toLowerCase(c); + hash_map_compat<int, int>::const_iterator mapPos = codeToKeyMap->find(lowerCode); + if (mapPos != codeToKeyMap->end()) { + return mapPos->second; + } + return NOT_AN_INDEX; + } + + static AK_FORCE_INLINE void initializeProximities(const int *const inputCodes, + const int *const inputXCoordinates, const int *const inputYCoordinates, + const int inputSize, const int *const keyXCoordinates, + const int *const keyYCoordinates, const int *const keyWidths, const int *keyHeights, + const int *const proximityCharsArray, const int cellHeight, const int cellWidth, + const int gridWidth, const int mostCommonKeyWidth, const int keyCount, + const char 
*const localeStr, + const hash_map_compat<int, int> *const codeToKeyMap, int *inputProximities) { + // Initialize + // - mInputCodes + // - mNormalizedSquaredDistances + // TODO: Merge + for (int i = 0; i < inputSize; ++i) { + const int primaryKey = inputCodes[i]; + const int x = inputXCoordinates[i]; + const int y = inputYCoordinates[i]; + int *proximities = &inputProximities[i * MAX_PROXIMITY_CHARS_SIZE]; + calculateProximities(keyXCoordinates, keyYCoordinates, keyWidths, keyHeights, + proximityCharsArray, cellHeight, cellWidth, gridWidth, mostCommonKeyWidth, + keyCount, x, y, primaryKey, localeStr, codeToKeyMap, proximities); + } + + if (DEBUG_PROXIMITY_CHARS) { + for (int i = 0; i < inputSize; ++i) { + AKLOGI("---"); + for (int j = 0; j < MAX_PROXIMITY_CHARS_SIZE; ++j) { + int proximityChar = + inputProximities[i * MAX_PROXIMITY_CHARS_SIZE + j]; + proximityChar += 0; + AKLOGI("--- (%d)%c", i, proximityChar); + } + } + } + } + + static AK_FORCE_INLINE int getStartIndexFromCoordinates(const int x, const int y, + const int cellHeight, const int cellWidth, const int gridWidth) { + return ((y / cellHeight) * gridWidth + (x / cellWidth)) * MAX_PROXIMITY_CHARS_SIZE; + } + + static inline float getSquaredDistanceFloat(const float x1, const float y1, const float x2, + const float y2) { + return GeometryUtils::SQUARE_FLOAT(x1 - x2) + GeometryUtils::SQUARE_FLOAT(y1 - y2); + } + + static inline float pointToLineSegSquaredDistanceFloat(const float x, const float y, + const float x1, const float y1, const float x2, const float y2, const bool extend) { + const float ray1x = x - x1; + const float ray1y = y - y1; + const float ray2x = x2 - x1; + const float ray2y = y2 - y1; + + const float dotProduct = ray1x * ray2x + ray1y * ray2y; + const float lineLengthSqr = GeometryUtils::SQUARE_FLOAT(ray2x) + + GeometryUtils::SQUARE_FLOAT(ray2y); + const float projectionLengthSqr = dotProduct / lineLengthSqr; + + float projectionX; + float projectionY; + if (!extend && projectionLengthSqr 
< 0.0f) { + projectionX = x1; + projectionY = y1; + } else if (!extend && projectionLengthSqr > 1.0f) { + projectionX = x2; + projectionY = y2; + } else { + projectionX = x1 + projectionLengthSqr * ray2x; + projectionY = y1 + projectionLengthSqr * ray2y; + } + return getSquaredDistanceFloat(x, y, projectionX, projectionY); + } + + static AK_FORCE_INLINE bool isMatchOrProximityChar(const ProximityType type) { + return type == MATCH_CHAR || type == PROXIMITY_CHAR || type == ADDITIONAL_PROXIMITY_CHAR; + } + + // Normal distribution N(u, sigma^2). + struct NormalDistribution { + public: + NormalDistribution(const float u, const float sigma) + : mU(u), mSigma(sigma), + mPreComputedNonExpPart(1.0f / sqrtf(2.0f * M_PI_F + * GeometryUtils::SQUARE_FLOAT(sigma))), + mPreComputedExponentPart(-1.0f / (2.0f * GeometryUtils::SQUARE_FLOAT(sigma))) {} + + float getProbabilityDensity(const float x) const { + const float shiftedX = x - mU; + return mPreComputedNonExpPart + * expf(mPreComputedExponentPart * GeometryUtils::SQUARE_FLOAT(shiftedX)); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(NormalDistribution); + const float mU; // mean value + const float mSigma; // standard deviation + const float mPreComputedNonExpPart; // = 1 / sqrt(2 * PI * sigma^2) + const float mPreComputedExponentPart; // = -1 / (2 * sigma^2) + }; // struct NormalDistribution + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProximityInfoUtils); + + static bool isOnKey(const int *const keyXCoordinates, const int *const keyYCoordinates, + const int *const keyWidths, const int *keyHeights, const int keyId, const int x, + const int y) { + if (keyId < 0) return true; // NOT_A_ID is -1, but return whenever < 0 just in case + const int left = keyXCoordinates[keyId]; + const int top = keyYCoordinates[keyId]; + const int right = left + keyWidths[keyId] + 1; + const int bottom = top + keyHeights[keyId]; + return left < right && top < bottom && x >= left && x < right && y >= top && y < bottom; + } + + static 
AK_FORCE_INLINE void calculateProximities(const int *const keyXCoordinates, + const int *const keyYCoordinates, const int *const keyWidths, const int *keyHeights, + const int *const proximityCharsArray, const int cellHeight, const int cellWidth, + const int gridWidth, const int mostCommonKeyWidth, const int keyCount, + const int x, const int y, const int primaryKey, const char *const localeStr, + const hash_map_compat<int, int> *const codeToKeyMap, int *proximities) { + const int mostCommonKeyWidthSquare = mostCommonKeyWidth * mostCommonKeyWidth; + int insertPos = 0; + proximities[insertPos++] = primaryKey; + const int startIndex = getStartIndexFromCoordinates(x, y, cellHeight, cellWidth, gridWidth); + if (startIndex >= 0) { + for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { + const int c = proximityCharsArray[startIndex + i]; + if (c < KEYCODE_SPACE || c == primaryKey) { + continue; + } + const int keyIndex = getKeyIndexOf(keyCount, c, codeToKeyMap); + const bool onKey = isOnKey(keyXCoordinates, keyYCoordinates, keyWidths, keyHeights, + keyIndex, x, y); + const int distance = squaredLengthToEdge(keyXCoordinates, keyYCoordinates, + keyWidths, keyHeights, keyIndex, x, y); + if (onKey || distance < mostCommonKeyWidthSquare) { + proximities[insertPos++] = c; + if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) { + if (DEBUG_DICT) { + ASSERT(false); + } + return; + } + } + } + const int additionalProximitySize = + AdditionalProximityChars::getAdditionalCharsSize(localeStr, primaryKey); + if (additionalProximitySize > 0) { + proximities[insertPos++] = ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE; + if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) { + if (DEBUG_DICT) { + ASSERT(false); + } + return; + } + + const int *additionalProximityChars = + AdditionalProximityChars::getAdditionalChars(localeStr, primaryKey); + for (int j = 0; j < additionalProximitySize; ++j) { + const int ac = additionalProximityChars[j]; + int k = 0; + for (; k < insertPos; ++k) { + if (ac == proximities[k]) { 
+ break; + } + } + if (k < insertPos) { + continue; + } + proximities[insertPos++] = ac; + if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) { + if (DEBUG_DICT) { + ASSERT(false); + } + return; + } + } + } + } + // Add a delimiter for the proximity characters + for (int i = insertPos; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { + proximities[i] = NOT_A_CODE_POINT; + } + } + + static int squaredLengthToEdge(const int *const keyXCoordinates, + const int *const keyYCoordinates, const int *const keyWidths, const int *keyHeights, + const int keyId, const int x, const int y) { + // NOT_A_ID is -1, but return whenever < 0 just in case + if (keyId < 0) return MAX_VALUE_FOR_WEIGHTING; + const int left = keyXCoordinates[keyId]; + const int top = keyYCoordinates[keyId]; + const int right = left + keyWidths[keyId]; + const int bottom = top + keyHeights[keyId]; + const int edgeX = x < left ? left : (x > right ? right : x); + const int edgeY = y < top ? top : (y > bottom ? bottom : y); + const int dx = x - edgeX; + const int dy = y - edgeY; + return dx * dx + dy * dy; + } +}; +} // namespace latinime +#endif // LATINIME_PROXIMITY_INFO_UTILS_H
diff --git a/src/aosp/suggest/core/layout/touch_position_correction_utils.h b/src/aosp/suggest/core/layout/touch_position_correction_utils.h new file mode 100644 index 0000000..9130e87 --- /dev/null +++ b/src/aosp/suggest/core/layout/touch_position_correction_utils.h
@@ -0,0 +1,63 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TOUCH_POSITION_CORRECTION_UTILS_H +#define LATINIME_TOUCH_POSITION_CORRECTION_UTILS_H + +#include "defines.h" +#include "suggest/core/layout/proximity_info_params.h" + +namespace latinime { +class TouchPositionCorrectionUtils { + public: + static float getSweetSpotFactor(const bool isTouchPositionCorrectionEnabled, + const float normalizedSquaredDistance) { + // Promote or demote the score according to the distance from the sweet spot + static const float A = 0.0f; + static const float B = 0.24f; + static const float C = 1.20f; + static const float R0 = 0.0f; + static const float R1 = 0.25f; // Sweet spot + static const float R2 = 1.0f; + const float x = normalizedSquaredDistance; + if (!isTouchPositionCorrectionEnabled) { + return min(C, x); + } + + // factor is a piecewise linear function like: + // C -------------. + // / . + // B / . + // -/ . + // A _-^ . + // . + // R0 R1 R2 . + + if (x < R0) { + return A; + } else if (x < R1) { + return (A * (R1 - x) + B * (x - R0)) / (R1 - R0); + } else if (x < R2) { + return (B * (R2 - x) + C * (x - R1)) / (R2 - R1); + } else { + return C; + } + } + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TouchPositionCorrectionUtils); +}; +} // namespace latinime +#endif // LATINIME_TOUCH_POSITION_CORRECTION_UTILS_H
diff --git a/src/aosp/suggest/core/policy/dictionary_bigrams_structure_policy.h b/src/aosp/suggest/core/policy/dictionary_bigrams_structure_policy.h new file mode 100644 index 0000000..661ef1b --- /dev/null +++ b/src/aosp/suggest/core/policy/dictionary_bigrams_structure_policy.h
@@ -0,0 +1,42 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H +#define LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H + +#include "defines.h" + +namespace latinime { + +/* + * This class abstracts structure of bigrams. + */ +class DictionaryBigramsStructurePolicy { + public: + virtual ~DictionaryBigramsStructurePolicy() {} + + virtual void getNextBigram(int *const outBigramPos, int *const outProbability, + bool *const outHasNext, int *const pos) const = 0; + virtual void skipAllBigrams(int *const pos) const = 0; + + protected: + DictionaryBigramsStructurePolicy() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictionaryBigramsStructurePolicy); +}; +} // namespace latinime +#endif /* LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H */
diff --git a/src/aosp/suggest/core/policy/dictionary_header_structure_policy.h b/src/aosp/suggest/core/policy/dictionary_header_structure_policy.h new file mode 100644 index 0000000..5492c60 --- /dev/null +++ b/src/aosp/suggest/core/policy/dictionary_header_structure_policy.h
@@ -0,0 +1,52 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H +#define LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H + +#include "defines.h" + +namespace latinime { + +/* + * This class abstracts structure of dictionaries. + * Implement this policy to support additional dictionaries. + */ +class DictionaryHeaderStructurePolicy { + public: + virtual ~DictionaryHeaderStructurePolicy() {} + + virtual bool supportsDynamicUpdate() const = 0; + + virtual bool requiresGermanUmlautProcessing() const = 0; + + virtual bool requiresFrenchLigatureProcessing() const = 0; + + virtual float getMultiWordCostMultiplier() const = 0; + + virtual int getLastDecayedTime() const = 0; + + virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue, + int outValueSize) const = 0; + + protected: + DictionaryHeaderStructurePolicy() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictionaryHeaderStructurePolicy); +}; +} // namespace latinime +#endif /* LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H */
diff --git a/src/aosp/suggest/core/policy/dictionary_shortcuts_structure_policy.h b/src/aosp/suggest/core/policy/dictionary_shortcuts_structure_policy.h new file mode 100644 index 0000000..40b6c2d --- /dev/null +++ b/src/aosp/suggest/core/policy/dictionary_shortcuts_structure_policy.h
@@ -0,0 +1,46 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H +#define LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H + +#include "defines.h" + +namespace latinime { + +/* + * This class abstracts structure of shortcuts. + */ +class DictionaryShortcutsStructurePolicy { + public: + virtual ~DictionaryShortcutsStructurePolicy() {} + + virtual int getStartPos(const int pos) const = 0; + + virtual void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, + int *const pos) const = 0; + + virtual void skipAllShortcuts(int *const pos) const = 0; + + protected: + DictionaryShortcutsStructurePolicy() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictionaryShortcutsStructurePolicy); +}; +} // namespace latinime +#endif /* LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H */
diff --git a/src/aosp/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/src/aosp/suggest/core/policy/dictionary_structure_with_buffer_policy.h new file mode 100644 index 0000000..41f8204 --- /dev/null +++ b/src/aosp/suggest/core/policy/dictionary_structure_with_buffer_policy.h
@@ -0,0 +1,95 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_STRUCTURE_POLICY_H +#define LATINIME_DICTIONARY_STRUCTURE_POLICY_H + +#include "defines.h" + +namespace latinime { + +class DicNode; +class DicNodeVector; +class DictionaryBigramsStructurePolicy; +class DictionaryHeaderStructurePolicy; +class DictionaryShortcutsStructurePolicy; + +/* + * This class abstracts structure of dictionaries. + * Implement this policy to support additional dictionaries. 
+ */ +class DictionaryStructureWithBufferPolicy { + public: + virtual ~DictionaryStructureWithBufferPolicy() {} + + virtual int getRootPosition() const = 0; + + virtual void createAndGetAllChildNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const = 0; + + virtual int getCodePointsAndProbabilityAndReturnCodePointCount( + const int nodePos, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const = 0; + + virtual int getTerminalNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const = 0; + + virtual int getProbability(const int unigramProbability, + const int bigramProbability) const = 0; + + virtual int getUnigramProbabilityOfPtNode(const int nodePos) const = 0; + + virtual int getShortcutPositionOfPtNode(const int nodePos) const = 0; + + virtual int getBigramsPositionOfPtNode(const int nodePos) const = 0; + + virtual const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const = 0; + + virtual const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const = 0; + + virtual const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const = 0; + + // Returns whether the update was success or not. + virtual bool addUnigramWord(const int *const word, const int length, + const int probability) = 0; + + // Returns whether the update was success or not. + virtual bool addBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1, const int probability) = 0; + + // Returns whether the update was success or not. + virtual bool removeBigramWords(const int *const word0, const int length0, + const int *const word1, const int length1) = 0; + + virtual void flush(const char *const filePath) = 0; + + virtual void flushWithGC(const char *const filePath) = 0; + + virtual bool needsToRunGC(const bool mindsBlockByGC) const = 0; + + // Currently, this method is used only for testing. 
You may want to consider creating new + // dedicated method instead of this if you want to use this in the production. + virtual void getProperty(const char *const query, char *const outResult, + const int maxResultLength) = 0; + + protected: + DictionaryStructureWithBufferPolicy() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictionaryStructureWithBufferPolicy); +}; +} // namespace latinime +#endif /* LATINIME_DICTIONARY_STRUCTURE_POLICY_H */
diff --git a/src/aosp/suggest/core/policy/scoring.h b/src/aosp/suggest/core/policy/scoring.h new file mode 100644 index 0000000..102e856 --- /dev/null +++ b/src/aosp/suggest/core/policy/scoring.h
@@ -0,0 +1,55 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SCORING_H +#define LATINIME_SCORING_H + +#include "defines.h" + +namespace latinime { + +class DicNode; +class DicTraverseSession; + +// This class basically tweaks suggestions and distances apart from CompoundDistance +class Scoring { + public: + virtual int calculateFinalScore(const float compoundDistance, const int inputSize, + const bool forceCommit) const = 0; + virtual bool getMostProbableString(const DicTraverseSession *const traverseSession, + const int terminalSize, const float languageWeight, int *const outputCodePoints, + int *const type, int *const freq) const = 0; + virtual void safetyNetForMostProbableString(const int terminalSize, + const int maxScore, int *const outputCodePoints, int *const frequencies) const = 0; + // TODO: Make more generic + virtual void searchWordWithDoubleLetter(DicNode *terminals, const int terminalSize, + int *doubleLetterTerminalIndex, DoubleLetterLevel *doubleLetterLevel) const = 0; + virtual float getAdjustedLanguageWeight(DicTraverseSession *const traverseSession, + DicNode *const terminals, const int size) const = 0; + virtual float getDoubleLetterDemotionDistanceCost(const int terminalIndex, + const int doubleLetterTerminalIndex, + const DoubleLetterLevel doubleLetterLevel) const = 0; + virtual bool doesAutoCorrectValidWord() const = 0; + + protected: + Scoring() {} 
+ virtual ~Scoring() {} + + private: + DISALLOW_COPY_AND_ASSIGN(Scoring); +}; +} // namespace latinime +#endif // LATINIME_SCORING_H
diff --git a/src/aosp/suggest/core/policy/suggest_policy.h b/src/aosp/suggest/core/policy/suggest_policy.h new file mode 100644 index 0000000..5b6402c --- /dev/null +++ b/src/aosp/suggest/core/policy/suggest_policy.h
@@ -0,0 +1,40 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SUGGEST_POLICY_H +#define LATINIME_SUGGEST_POLICY_H + +#include "defines.h" + +namespace latinime { + +class Traversal; +class Scoring; +class Weighting; + +class SuggestPolicy { + public: + SuggestPolicy() {} + virtual ~SuggestPolicy() {} + virtual const Traversal *getTraversal() const = 0; + virtual const Scoring *getScoring() const = 0; + virtual const Weighting *getWeighting() const = 0; + + private: + DISALLOW_COPY_AND_ASSIGN(SuggestPolicy); +}; +} // namespace latinime +#endif // LATINIME_SUGGEST_POLICY_H
diff --git a/src/aosp/suggest/core/policy/traversal.h b/src/aosp/suggest/core/policy/traversal.h new file mode 100644 index 0000000..e935533 --- /dev/null +++ b/src/aosp/suggest/core/policy/traversal.h
@@ -0,0 +1,63 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TRAVERSAL_H +#define LATINIME_TRAVERSAL_H + +#include "defines.h" + +namespace latinime { + +class DicTraverseSession; + +class Traversal { + public: + virtual int getMaxPointerCount() const = 0; + virtual bool allowsErrorCorrections(const DicNode *const dicNode) const = 0; + virtual bool isOmission(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode, const DicNode *const childDicNode, + const bool allowsErrorCorrections) const = 0; + virtual bool isSpaceSubstitutionTerminal(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + virtual bool isSpaceOmissionTerminal(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + virtual bool shouldDepthLevelCache(const DicTraverseSession *const traverseSession) const = 0; + virtual bool shouldNodeLevelCache(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + virtual bool canDoLookAheadCorrection(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + virtual ProximityType getProximityType(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode, const DicNode *const childDicNode) const = 0; + virtual bool sameAsTyped(const DicTraverseSession *const traverseSession, 
+ const DicNode *const dicNode) const = 0; + virtual bool needsToTraverseAllUserInput() const = 0; + virtual float getMaxSpatialDistance() const = 0; + virtual bool autoCorrectsToMultiWordSuggestionIfTop() const = 0; + virtual int getDefaultExpandDicNodeSize() const = 0; + virtual int getMaxCacheSize(const int inputSize) const = 0; + virtual bool isPossibleOmissionChildNode(const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, const DicNode *const dicNode) const = 0; + virtual bool isGoodToTraverseNextWord(const DicNode *const dicNode) const = 0; + + protected: + Traversal() {} + virtual ~Traversal() {} + + private: + DISALLOW_COPY_AND_ASSIGN(Traversal); +}; +} // namespace latinime +#endif // LATINIME_TRAVERSAL_H
diff --git a/src/aosp/suggest/core/policy/weighting.cpp b/src/aosp/suggest/core/policy/weighting.cpp new file mode 100644 index 0000000..0c40168 --- /dev/null +++ b/src/aosp/suggest/core/policy/weighting.cpp
@@ -0,0 +1,201 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/policy/weighting.h" + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_profiler.h" +#include "suggest/core/dicnode/dic_node_utils.h" +#include "suggest/core/session/dic_traverse_session.h" + +namespace latinime { + +class MultiBigramMap; + +static inline void profile(const CorrectionType correctionType, DicNode *const node) { +#if DEBUG_DICT + switch (correctionType) { + case CT_OMISSION: + PROF_OMISSION(node->mProfiler); + return; + case CT_ADDITIONAL_PROXIMITY: + PROF_ADDITIONAL_PROXIMITY(node->mProfiler); + return; + case CT_SUBSTITUTION: + PROF_SUBSTITUTION(node->mProfiler); + return; + case CT_NEW_WORD_SPACE_OMISSION: + PROF_NEW_WORD(node->mProfiler); + return; + case CT_MATCH: + PROF_MATCH(node->mProfiler); + return; + case CT_COMPLETION: + PROF_COMPLETION(node->mProfiler); + return; + case CT_TERMINAL: + PROF_TERMINAL(node->mProfiler); + return; + case CT_TERMINAL_INSERTION: + PROF_TERMINAL_INSERTION(node->mProfiler); + return; + case CT_NEW_WORD_SPACE_SUBSTITUTION: + PROF_SPACE_SUBSTITUTION(node->mProfiler); + return; + case CT_INSERTION: + PROF_INSERTION(node->mProfiler); + return; + case CT_TRANSPOSITION: + PROF_TRANSPOSITION(node->mProfiler); + return; + default: + // do nothing + return; + } +#else + // do nothing +#endif +} + +/* 
static */ void Weighting::addCostAndForwardInputIndex(const Weighting *const weighting, + const CorrectionType correctionType, const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, DicNode *const dicNode, + MultiBigramMap *const multiBigramMap) { + const int inputSize = traverseSession->getInputSize(); + DicNode_InputStateG inputStateG; + inputStateG.mNeedsToUpdateInputStateG = false; // Don't use input info by default + const float spatialCost = Weighting::getSpatialCost(weighting, correctionType, + traverseSession, parentDicNode, dicNode, &inputStateG); + const float languageCost = Weighting::getLanguageCost(weighting, correctionType, + traverseSession, parentDicNode, dicNode, multiBigramMap); + const ErrorType errorType = weighting->getErrorType(correctionType, traverseSession, + parentDicNode, dicNode); + profile(correctionType, dicNode); + if (inputStateG.mNeedsToUpdateInputStateG) { + dicNode->updateInputIndexG(&inputStateG); + } else { + dicNode->forwardInputIndex(0, getForwardInputCount(correctionType), + (correctionType == CT_TRANSPOSITION)); + } + dicNode->addCost(spatialCost, languageCost, weighting->needsToNormalizeCompoundDistance(), + inputSize, errorType); + if (CT_NEW_WORD_SPACE_OMISSION == correctionType) { + // When we are on a terminal, we save the current distance for evaluating + // when to auto-commit partial suggestions. 
+ dicNode->saveNormalizedCompoundDistanceAfterFirstWordIfNoneYet(); + } +} + +/* static */ float Weighting::getSpatialCost(const Weighting *const weighting, + const CorrectionType correctionType, const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, const DicNode *const dicNode, + DicNode_InputStateG *const inputStateG) { + switch(correctionType) { + case CT_OMISSION: + return weighting->getOmissionCost(parentDicNode, dicNode); + case CT_ADDITIONAL_PROXIMITY: + // only used for typing + return weighting->getAdditionalProximityCost(); + case CT_SUBSTITUTION: + // only used for typing + return weighting->getSubstitutionCost(); + case CT_NEW_WORD_SPACE_OMISSION: + return weighting->getNewWordSpatialCost(traverseSession, dicNode, inputStateG); + case CT_MATCH: + return weighting->getMatchedCost(traverseSession, dicNode, inputStateG); + case CT_COMPLETION: + return weighting->getCompletionCost(traverseSession, dicNode); + case CT_TERMINAL: + return weighting->getTerminalSpatialCost(traverseSession, dicNode); + case CT_TERMINAL_INSERTION: + return weighting->getTerminalInsertionCost(traverseSession, dicNode); + case CT_NEW_WORD_SPACE_SUBSTITUTION: + return weighting->getSpaceSubstitutionCost(traverseSession, dicNode); + case CT_INSERTION: + return weighting->getInsertionCost(traverseSession, parentDicNode, dicNode); + case CT_TRANSPOSITION: + return weighting->getTranspositionCost(traverseSession, parentDicNode, dicNode); + default: + return 0.0f; + } +} + +/* static */ float Weighting::getLanguageCost(const Weighting *const weighting, + const CorrectionType correctionType, const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, const DicNode *const dicNode, + MultiBigramMap *const multiBigramMap) { + switch(correctionType) { + case CT_OMISSION: + return 0.0f; + case CT_SUBSTITUTION: + return 0.0f; + case CT_NEW_WORD_SPACE_OMISSION: + return weighting->getNewWordBigramLanguageCost( + traverseSession, 
parentDicNode, multiBigramMap); + case CT_MATCH: + return 0.0f; + case CT_COMPLETION: + return 0.0f; + case CT_TERMINAL: { + const float languageImprobability = + DicNodeUtils::getBigramNodeImprobability( + traverseSession->getDictionaryStructurePolicy(), dicNode, multiBigramMap); + return weighting->getTerminalLanguageCost(traverseSession, dicNode, languageImprobability); + } + case CT_TERMINAL_INSERTION: + return 0.0f; + case CT_NEW_WORD_SPACE_SUBSTITUTION: + return weighting->getNewWordBigramLanguageCost( + traverseSession, parentDicNode, multiBigramMap); + case CT_INSERTION: + return 0.0f; + case CT_TRANSPOSITION: + return 0.0f; + default: + return 0.0f; + } +} + +/* static */ int Weighting::getForwardInputCount(const CorrectionType correctionType) { + switch(correctionType) { + case CT_OMISSION: + return 0; + case CT_ADDITIONAL_PROXIMITY: + return 0; /* 0 because CT_MATCH will be called */ + case CT_SUBSTITUTION: + return 0; /* 0 because CT_MATCH will be called */ + case CT_NEW_WORD_SPACE_OMISSION: + return 0; + case CT_MATCH: + return 1; + case CT_COMPLETION: + return 1; + case CT_TERMINAL: + return 0; + case CT_TERMINAL_INSERTION: + return 1; + case CT_NEW_WORD_SPACE_SUBSTITUTION: + return 1; + case CT_INSERTION: + return 2; /* look ahead + skip the current char */ + case CT_TRANSPOSITION: + return 2; /* look ahead + skip the current char */ + default: + return 0; + } +} +} // namespace latinime
diff --git a/src/aosp/suggest/core/policy/weighting.h b/src/aosp/suggest/core/policy/weighting.h new file mode 100644 index 0000000..2d49e98 --- /dev/null +++ b/src/aosp/suggest/core/policy/weighting.h
@@ -0,0 +1,109 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_WEIGHTING_H +#define LATINIME_WEIGHTING_H + +#include "defines.h" + +namespace latinime { + +class DicNode; +class DicTraverseSession; +struct DicNode_InputStateG; +class MultiBigramMap; + +class Weighting { + public: + static void addCostAndForwardInputIndex(const Weighting *const weighting, + const CorrectionType correctionType, + const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, DicNode *const dicNode, + MultiBigramMap *const multiBigramMap); + + protected: + virtual float getTerminalSpatialCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + + virtual float getOmissionCost( + const DicNode *const parentDicNode, const DicNode *const dicNode) const = 0; + + virtual float getMatchedCost( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode, + DicNode_InputStateG *inputStateG) const = 0; + + virtual bool isProximityDicNode(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + + virtual float getTranspositionCost( + const DicTraverseSession *const traverseSession, const DicNode *const parentDicNode, + const DicNode *const dicNode) const = 0; + + virtual float getInsertionCost( + const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, 
const DicNode *const dicNode) const = 0; + + virtual float getNewWordSpatialCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode, DicNode_InputStateG *const inputStateG) const = 0; + + virtual float getNewWordBigramLanguageCost( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode, + MultiBigramMap *const multiBigramMap) const = 0; + + virtual float getCompletionCost( + const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + + virtual float getTerminalInsertionCost( + const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + + virtual float getTerminalLanguageCost( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode, + float dicNodeLanguageImprobability) const = 0; + + virtual bool needsToNormalizeCompoundDistance() const = 0; + + virtual float getAdditionalProximityCost() const = 0; + + virtual float getSubstitutionCost() const = 0; + + virtual float getSpaceSubstitutionCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + + virtual ErrorType getErrorType(const CorrectionType correctionType, + const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, const DicNode *const dicNode) const = 0; + + Weighting() {} + virtual ~Weighting() {} + + private: + DISALLOW_COPY_AND_ASSIGN(Weighting); + + static float getSpatialCost(const Weighting *const weighting, + const CorrectionType correctionType, const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, const DicNode *const dicNode, + DicNode_InputStateG *const inputStateG); + static float getLanguageCost(const Weighting *const weighting, + const CorrectionType correctionType, const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, const DicNode *const dicNode, + MultiBigramMap *const multiBigramMap); + // TODO: Move to 
TypingWeighting and GestureWeighting? + static int getForwardInputCount(const CorrectionType correctionType); +}; +} // namespace latinime +#endif // LATINIME_WEIGHTING_H
diff --git a/src/aosp/suggest/core/session/dic_traverse_session.cpp b/src/aosp/suggest/core/session/dic_traverse_session.cpp new file mode 100644 index 0000000..50f2bbd --- /dev/null +++ b/src/aosp/suggest/core/session/dic_traverse_session.cpp
@@ -0,0 +1,88 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/session/dic_traverse_session.h" + +#include "defines.h" +#include "suggest/core/dictionary/dictionary.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" + +namespace latinime { + +// 256K bytes threshold is heuristically used to distinguish dictionaries containing many unigrams +// (e.g. main dictionary) from small dictionaries (e.g. contacts...) +const int DicTraverseSession::DICTIONARY_SIZE_THRESHOLD_TO_USE_LARGE_CACHE_FOR_SUGGESTION = + 256 * 1024; + +void DicTraverseSession::init(const Dictionary *const dictionary, const int *prevWord, + int prevWordLength, const SuggestOptions *const suggestOptions) { + mDictionary = dictionary; + mMultiWordCostMultiplier = getDictionaryStructurePolicy()->getHeaderStructurePolicy() + ->getMultiWordCostMultiplier(); + mSuggestOptions = suggestOptions; + if (!prevWord) { + mPrevWordPos = NOT_A_DICT_POS; + return; + } + // TODO: merge following similar calls to getTerminalPosition into one case-insensitive call. + mPrevWordPos = getDictionaryStructurePolicy()->getTerminalNodePositionOfWord( + prevWord, prevWordLength, false /* forceLowerCaseSearch */); + if (mPrevWordPos == NOT_A_DICT_POS) { + // Check bigrams for lower-cased previous word if original was not found. 
Useful for + // auto-capitalized words like "The [current_word]". + mPrevWordPos = getDictionaryStructurePolicy()->getTerminalNodePositionOfWord( + prevWord, prevWordLength, true /* forceLowerCaseSearch */); + } +} + +void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo, + const int *inputCodePoints, const int inputSize, const int *const inputXs, + const int *const inputYs, const int *const times, const int *const pointerIds, + const float maxSpatialDistance, const int maxPointerCount) { + mProximityInfo = pInfo; + mMaxPointerCount = maxPointerCount; + initializeProximityInfoStates(inputCodePoints, inputXs, inputYs, times, pointerIds, inputSize, + maxSpatialDistance, maxPointerCount); +} + +const DictionaryStructureWithBufferPolicy *DicTraverseSession::getDictionaryStructurePolicy() + const { + return mDictionary->getDictionaryStructurePolicy(); +} + +void DicTraverseSession::resetCache(const int thresholdForNextActiveDicNodes, const int maxWords) { + mDicNodesCache.reset(thresholdForNextActiveDicNodes /* nextActiveSize */, + maxWords /* terminalSize */); + mMultiBigramMap.clear(); + mPartiallyCommited = false; +} + +void DicTraverseSession::initializeProximityInfoStates(const int *const inputCodePoints, + const int *const inputXs, const int *const inputYs, const int *const times, + const int *const pointerIds, const int inputSize, const float maxSpatialDistance, + const int maxPointerCount) { + ASSERT(1 <= maxPointerCount && maxPointerCount <= MAX_POINTER_COUNT_G); + mInputSize = 0; + for (int i = 0; i < maxPointerCount; ++i) { + mProximityInfoStates[i].initInputParams(i, maxSpatialDistance, getProximityInfo(), + inputCodePoints, inputSize, inputXs, inputYs, times, pointerIds, + maxPointerCount == MAX_POINTER_COUNT_G + /* TODO: this is a hack. fix proximity info state */); + mInputSize += mProximityInfoStates[i].size(); + } +} +} // namespace latinime
diff --git a/src/aosp/suggest/core/session/dic_traverse_session.h b/src/aosp/suggest/core/session/dic_traverse_session.h new file mode 100644 index 0000000..e0b1c67 --- /dev/null +++ b/src/aosp/suggest/core/session/dic_traverse_session.h
@@ -0,0 +1,215 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_TRAVERSE_SESSION_H +#define LATINIME_DIC_TRAVERSE_SESSION_H + +#include <stdint.h> +#include <vector> + +#include "defines.h" +#include "jni.h" +#include "suggest/core/dicnode/dic_nodes_cache.h" +#include "suggest/core/dictionary/multi_bigram_map.h" +#include "suggest/core/layout/proximity_info_state.h" + +namespace latinime { + +class Dictionary; +class DictionaryStructureWithBufferPolicy; +class ProximityInfo; +class SuggestOptions; + +/** + * Per-session state used while traversing a dictionary for suggestions: the previous word + * position, the proximity info and its per-pointer states, the dic node cache and a bigram cache. + */ +class DicTraverseSession { + public: + + // A factory method for DicTraverseSession + static AK_FORCE_INLINE void *getSessionInstance(JNIEnv *env, jstring localeStr, + jlong dictSize) { + // To deal with the trade-off between accuracy and memory space, large cache is used for + // dictionaries larger than the threshold + return new DicTraverseSession(env, localeStr, + dictSize >= DICTIONARY_SIZE_THRESHOLD_TO_USE_LARGE_CACHE_FOR_SUGGESTION); + } + + // Initializes the given session through init(); does nothing when traverseSession is null. + static AK_FORCE_INLINE void initSessionInstance(DicTraverseSession *traverseSession, + const Dictionary *const dictionary, const int *prevWord, const int prevWordLength, + const SuggestOptions *const suggestOptions) { + if (traverseSession) { + DicTraverseSession *tSession = static_cast<DicTraverseSession *>(traverseSession); + tSession->init(dictionary, prevWord, prevWordLength, suggestOptions); + } + } + + // Deletes the given session (the counterpart of the "new" in getSessionInstance()). + static 
AK_FORCE_INLINE void releaseSessionInstance(DicTraverseSession *traverseSession) { + delete traverseSession; + } + + /** + * Creates an empty session. env and localeStr are not referenced by this constructor; + * usesLargeCache is forwarded to the dic node cache. + */ + AK_FORCE_INLINE DicTraverseSession(JNIEnv *env, jstring localeStr, bool usesLargeCache) + : mPrevWordPos(NOT_A_DICT_POS), mProximityInfo(0), + mDictionary(0), mSuggestOptions(0), mDicNodesCache(usesLargeCache), + mMultiBigramMap(), mInputSize(0), mPartiallyCommited(false), mMaxPointerCount(1), + mMultiWordCostMultiplier(1.0f) { + // NOTE: mProximityInfoStates is an array of instances. + // No need to initialize it explicitly here. + } + + // Non virtual inline destructor -- never inherit this class + AK_FORCE_INLINE ~DicTraverseSession() {} + + void init(const Dictionary *dictionary, const int *prevWord, int prevWordLength, + const SuggestOptions *const suggestOptions); + // TODO: Remove and merge into init + void setupForGetSuggestions(const ProximityInfo *pInfo, const int *inputCodePoints, + const int inputSize, const int *const inputXs, const int *const inputYs, + const int *const times, const int *const pointerIds, const float maxSpatialDistance, + const int maxPointerCount); + void resetCache(const int thresholdForNextActiveDicNodes, const int maxWords); + + const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const; + + //-------------------- + // getters and setters + //-------------------- + const ProximityInfo *getProximityInfo() const { return mProximityInfo; } + const SuggestOptions *getSuggestOptions() const { return mSuggestOptions; } + int getPrevWordPos() const { return mPrevWordPos; } + // TODO: REMOVE + void setPrevWordPos(int pos) { mPrevWordPos = pos; } + // TODO: Use proper parameter when changed + int getDicRootPos() const { return 0; } + DicNodesCache *getDicTraverseCache() { return &mDicNodesCache; } + MultiBigramMap *getMultiBigramMap() { return &mMultiBigramMap; } + // No bounds check: id is used directly as an index into mProximityInfoStates. + const ProximityInfoState *getProximityInfoState(int id) const { + return &mProximityInfoStates[id]; + } + int getInputSize() const { return 
mInputSize; } + void setPartiallyCommited() { mPartiallyCommited = true; } + bool isPartiallyCommited() const { return mPartiallyCommited; } + + /** + * Returns true when exactly one of the first mMaxPointerCount proximity info states is used. + * If pointerId is non-null, the index of that state is stored in *pointerId. + */ + bool isOnlyOnePointerUsed(int *pointerId) const { + // Not in the dictionary word + int usedPointerCount = 0; + int usedPointerId = 0; + for (int i = 0; i < mMaxPointerCount; ++i) { + if (mProximityInfoStates[i].isUsed()) { + ++usedPointerCount; + usedPointerId = i; + } + } + if (usedPointerCount != 1) { + return false; + } + if (pointerId) { + *pointerId = usedPointerId; + } + return true; + } + + /** + * Appends to outputSearchKeyVector the search keys of every used proximity info state, looked + * up at the input index that node holds for that state. All MAX_POINTER_COUNT_G states are + * scanned, not only the first mMaxPointerCount. + */ + void getSearchKeys(const DicNode *node, std::vector<int> *const outputSearchKeyVector) const { + for (int i = 0; i < MAX_POINTER_COUNT_G; ++i) { + if (!mProximityInfoStates[i].isUsed()) { + continue; + } + const int pointerId = node->getInputIndex(i); + const std::vector<int> *const searchKeyVector = + mProximityInfoStates[i].getSearchKeyVector(pointerId); + outputSearchKeyVector->insert(outputSearchKeyVector->end(), searchKeyVector->begin(), + searchKeyVector->end()); + } + } + + /** + * Returns the proximity type of childCodePoint reported by the first used proximity info + * state that answers something other than UNRELATED_CHAR; returns UNRELATED_CHAR when no + * used state does. + */ + ProximityType getProximityTypeG(const DicNode *const node, const int childCodePoint) const { + ProximityType proximityType = UNRELATED_CHAR; + for (int i = 0; i < MAX_POINTER_COUNT_G; ++i) { + if (!mProximityInfoStates[i].isUsed()) { + continue; + } + const int pointerId = node->getInputIndex(i); + proximityType = mProximityInfoStates[i].getProximityTypeG(pointerId, childCodePoint); + ASSERT(proximityType == UNRELATED_CHAR || proximityType == MATCH_CHAR); + // TODO: Make this more generic + // Currently we assume there are only two types here -- UNRELATED_CHAR + // and MATCH_CHAR + if (proximityType != UNRELATED_CHAR) { + return proximityType; + } + } + return proximityType; + } + + AK_FORCE_INLINE bool isCacheBorderForTyping(const int inputSize) const { + return mDicNodesCache.isCacheBorderForTyping(inputSize); + } + + /** + * Returns whether or not it is possible to continue suggestion from the previous search. + */ + // TODO: Remove. 
No need to check once the session is fully implemented. + bool isContinuousSuggestionPossible() const { + if (!mDicNodesCache.hasCachedDicNodesForContinuousSuggestion()) { + return false; + } + ASSERT(mMaxPointerCount <= MAX_POINTER_COUNT_G); + for (int i = 0; i < mMaxPointerCount; ++i) { + const ProximityInfoState *const pInfoState = getProximityInfoState(i); + // If a proximity info state is not continuous suggestion possible, + // do not continue searching. + if (pInfoState->isUsed() && !pInfoState->isContinuousSuggestionPossible()) { + return false; + } + } + return true; + } + + // Only the first proximity info state (index 0) is consulted. + bool isTouchPositionCorrectionEnabled() const { + return mProximityInfoStates[0].touchPositionCorrectionEnabled(); + } + + float getMultiWordCostMultiplier() const { + return mMultiWordCostMultiplier; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DicTraverseSession); + // threshold to start caching + static const int CACHE_START_INPUT_LENGTH_THRESHOLD; + static const int DICTIONARY_SIZE_THRESHOLD_TO_USE_LARGE_CACHE_FOR_SUGGESTION; + void initializeProximityInfoStates(const int *const inputCodePoints, const int *const inputXs, + const int *const inputYs, const int *const times, const int *const pointerIds, + const int inputSize, const float maxSpatialDistance, const int maxPointerCount); + + int mPrevWordPos; + const ProximityInfo *mProximityInfo; + const Dictionary *mDictionary; + const SuggestOptions *mSuggestOptions; + + DicNodesCache mDicNodesCache; + // Temporary cache for bigram frequencies + MultiBigramMap mMultiBigramMap; + // One state per pointer; initializeProximityInfoStates() only (re)initializes the first + // maxPointerCount entries. + ProximityInfoState mProximityInfoStates[MAX_POINTER_COUNT_G]; + + int mInputSize; + bool mPartiallyCommited; + int mMaxPointerCount; + + ///////////////////////////////// + // Configuration per dictionary + float mMultiWordCostMultiplier; + +}; +} // namespace latinime +#endif // LATINIME_DIC_TRAVERSE_SESSION_H
diff --git a/src/aosp/suggest/core/suggest.cpp b/src/aosp/suggest/core/suggest.cpp new file mode 100644 index 0000000..73ccebc --- /dev/null +++ b/src/aosp/suggest/core/suggest.cpp
@@ -0,0 +1,647 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/suggest.h" + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_priority_queue.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" +#include "suggest/core/dictionary/dictionary.h" +#include "suggest/core/dictionary/digraph_utils.h" +#include "suggest/core/dictionary/shortcut_utils.h" +#include "suggest/core/layout/proximity_info.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "suggest/core/policy/scoring.h" +#include "suggest/core/policy/traversal.h" +#include "suggest/core/policy/weighting.h" +#include "suggest/core/session/dic_traverse_session.h" + +namespace latinime { + +// Initialization of class constants. +const int Suggest::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; +const int Suggest::MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE = 2; +const float Suggest::AUTOCORRECT_CLASSIFICATION_THRESHOLD = 0.33f; + +/** + * Returns a set of suggestions for the given input touch points. The commitPoint argument indicates + * whether to prematurely commit the suggested words up to the given point for sentence-level + * suggestion. + * + * Note: Currently does not support concurrent calls across threads. 
Continuous suggestion is + * automatically activated for sequential calls that share the same starting input. + * TODO: Stop detecting continuous suggestion. Start using traverseSession instead. + */ +int Suggest::getSuggestions(ProximityInfo *pInfo, void *traverseSession, + int *inputXs, int *inputYs, int *times, int *pointerIds, int *inputCodePoints, + int inputSize, int commitPoint, int *outWords, int *frequencies, int *outputIndices, + int *outputTypes, int *outputAutoCommitFirstWordConfidence) const { + PROF_OPEN; + PROF_START(0); + const float maxSpatialDistance = TRAVERSAL->getMaxSpatialDistance(); + DicTraverseSession *tSession = static_cast<DicTraverseSession *>(traverseSession); + tSession->setupForGetSuggestions(pInfo, inputCodePoints, inputSize, inputXs, inputYs, times, + pointerIds, maxSpatialDistance, TRAVERSAL->getMaxPointerCount()); + // TODO: Add the way to evaluate cache + + initializeSearch(tSession, commitPoint); + PROF_END(0); + PROF_START(1); + + // keep expanding search dicNodes until all have terminated. + while (tSession->getDicTraverseCache()->activeSize() > 0) { + expandCurrentDicNodes(tSession); + tSession->getDicTraverseCache()->advanceActiveDicNodes(); + tSession->getDicTraverseCache()->advanceInputIndex(inputSize); + } + PROF_END(1); + PROF_START(2); + // Write the collected terminals to the output arrays; the value returned by + // outputSuggestions() is the number of entries written and is what this function returns. + const int size = outputSuggestions(tSession, frequencies, outWords, outputIndices, outputTypes, + outputAutoCommitFirstWordConfidence); + PROF_END(2); + PROF_CLOSE; + return size; +} + +/** + * Initializes the search at the root of the lexicon trie. Note that when possible the search will + * continue suggestion from where it left off during the last call. + * Returns without touching the cache when the first proximity info state is unused. + */ +void Suggest::initializeSearch(DicTraverseSession *traverseSession, int commitPoint) const { + if (!traverseSession->getProximityInfoState(0)->isUsed()) { + return; + } + + // Never auto partial commit for now. 
+ commitPoint = 0; + + if (traverseSession->getInputSize() > MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE + && traverseSession->isContinuousSuggestionPossible()) { + if (commitPoint == 0) { + // Continue suggestion + traverseSession->getDicTraverseCache()->continueSearch(); + } else { + // Continue suggestion after partial commit. + // Unreachable for now: commitPoint is forced to 0 above. + DicNode *topDicNode = + traverseSession->getDicTraverseCache()->setCommitPoint(commitPoint); + traverseSession->setPrevWordPos(topDicNode->getPrevWordNodePos()); + traverseSession->getDicTraverseCache()->continueSearch(); + traverseSession->setPartiallyCommited(); + } + } else { + // Restart recognition at the root. + traverseSession->resetCache(TRAVERSAL->getMaxCacheSize(traverseSession->getInputSize()), + MAX_RESULTS); + // Create a new dic node here + DicNode rootNode; + DicNodeUtils::initAsRoot(traverseSession->getDictionaryStructurePolicy(), + traverseSession->getPrevWordPos(), &rootNode); + traverseSession->getDicTraverseCache()->copyPushActive(&rootNode); + } +} + +/** + * Outputs the final list of suggestions (i.e., terminal nodes). 
+ */ +int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequencies, + int *outputCodePoints, int *outputIndicesToPartialCommit, int *outputTypes, + int *outputAutoCommitFirstWordConfidence) const { +#if DEBUG_EVALUATE_MOST_PROBABLE_STRING + const int terminalSize = 0; +#else + const int terminalSize = min(MAX_RESULTS, + static_cast<int>(traverseSession->getDicTraverseCache()->terminalSize())); +#endif + DicNode terminals[MAX_RESULTS]; // Avoiding non-POD variable length array + + // Fill terminals from the back, so that the last terminal popped ends up at index 0. + for (int index = terminalSize - 1; index >= 0; --index) { + traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]); + } + + const float languageWeight = SCORING->getAdjustedLanguageWeight( + traverseSession, terminals, terminalSize); + + int outputWordIndex = 0; + // Insert most probable word at index == 0 as long as there is one terminal at least + const bool hasMostProbableString = + SCORING->getMostProbableString(traverseSession, terminalSize, languageWeight, + &outputCodePoints[0], &outputTypes[0], &frequencies[0]); + if (hasMostProbableString) { + outputIndicesToPartialCommit[outputWordIndex] = NOT_AN_INDEX; + ++outputWordIndex; + } + + // Initial value of the loop index for terminal nodes (words) + int doubleLetterTerminalIndex = -1; + DoubleLetterLevel doubleLetterLevel = NOT_A_DOUBLE_LETTER; + SCORING->searchWordWithDoubleLetter(terminals, terminalSize, + &doubleLetterTerminalIndex, &doubleLetterLevel); + + int maxScore = S_INT_MIN; + // Force autocorrection for obvious long multi-word suggestions when the top suggestion is + // a long multiple words suggestion. + // TODO: Implement a smarter auto-commit method for handling multi-word suggestions. + // traverseSession->isPartiallyCommited() always returns false because we never auto partial + // commit for now. + const bool forceCommitMultiWords = (terminalSize > 0) ? 
+ TRAVERSAL->autoCorrectsToMultiWordSuggestionIfTop() + && (traverseSession->isPartiallyCommited() + || (traverseSession->getInputSize() + >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT + && terminals[0].hasMultipleWords())) : false; + // TODO: have partial commit work even with multiple pointers. + const bool outputSecondWordFirstLetterInputIndex = + traverseSession->isOnlyOnePointerUsed(0 /* pointerId */); + if (terminalSize > 0) { + // If we have no suggestions, don't write this + outputAutoCommitFirstWordConfidence[0] = + computeFirstWordConfidence(&terminals[0]); + } + + // Output suggestion results here + for (int terminalIndex = 0; terminalIndex < terminalSize && outputWordIndex < MAX_RESULTS; + ++terminalIndex) { + DicNode *terminalDicNode = &terminals[terminalIndex]; + if (DEBUG_GEO_FULL) { + terminalDicNode->dump("OUT:"); + } + const float doubleLetterCost = SCORING->getDoubleLetterDemotionDistanceCost( + terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel); + const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight) + + doubleLetterCost; + const bool isPossiblyOffensiveWord = + traverseSession->getDictionaryStructurePolicy()->getProbability( + terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0; + const bool isExactMatch = terminalDicNode->isExactMatch(); + const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); + // Heuristic: We exclude freq=0 first-char-uppercase words from exact match. + // (e.g. "AMD" and "and") + const bool isSafeExactMatch = isExactMatch + && !(isPossiblyOffensiveWord && isFirstCharUppercase); + const int outputTypeFlags = + (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) + | (isSafeExactMatch ? Dictionary::KIND_FLAG_EXACT_MATCH : 0); + + // Entries that are blacklisted or do not represent a word should not be output. 
+ const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord(); + + // Increase output score of top typing suggestion to ensure autocorrection. + // TODO: Better integration with java side autocorrection logic. + const int finalScore = SCORING->calculateFinalScore( + compoundDistance, traverseSession->getInputSize(), + terminalDicNode->isExactMatch() + || (forceCommitMultiWords && terminalDicNode->hasMultipleWords()) + || (isValidWord && SCORING->doesAutoCorrectValidWord())); + if (maxScore < finalScore && isValidWord) { + maxScore = finalScore; + } + + // Don't output invalid words. However, we still need to submit their shortcuts if any. + if (isValidWord) { + outputTypes[outputWordIndex] = Dictionary::KIND_CORRECTION | outputTypeFlags; + frequencies[outputWordIndex] = finalScore; + if (outputSecondWordFirstLetterInputIndex) { + outputIndicesToPartialCommit[outputWordIndex] = + terminalDicNode->getSecondWordFirstInputIndex( + traverseSession->getProximityInfoState(0)); + } else { + outputIndicesToPartialCommit[outputWordIndex] = NOT_AN_INDEX; + } + // Populate the outputChars array with the suggested word. + const int startIndex = outputWordIndex * MAX_WORD_LENGTH; + terminalDicNode->outputResult(&outputCodePoints[startIndex]); + ++outputWordIndex; + } + + if (!terminalDicNode->hasMultipleWords()) { + BinaryDictionaryShortcutIterator shortcutIt( + traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(), + traverseSession->getDictionaryStructurePolicy() + ->getShortcutPositionOfPtNode(terminalDicNode->getPos())); + // Shortcut is not supported for multiple words suggestions. + // TODO: Check shortcuts during traversal for multiple words suggestions. 
+ const bool sameAsTyped = TRAVERSAL->sameAsTyped(traverseSession, terminalDicNode); + const int updatedOutputWordIndex = ShortcutUtils::outputShortcuts(&shortcutIt, + outputWordIndex, finalScore, outputCodePoints, frequencies, outputTypes, + sameAsTyped); + const int secondWordFirstInputIndex = terminalDicNode->getSecondWordFirstInputIndex( + traverseSession->getProximityInfoState(0)); + for (int i = outputWordIndex; i < updatedOutputWordIndex; ++i) { + if (outputSecondWordFirstLetterInputIndex) { + outputIndicesToPartialCommit[i] = secondWordFirstInputIndex; + } else { + outputIndicesToPartialCommit[i] = NOT_AN_INDEX; + } + } + outputWordIndex = updatedOutputWordIndex; + } + DicNode::managedDelete(terminalDicNode); + } + + if (hasMostProbableString) { + SCORING->safetyNetForMostProbableString(terminalSize, maxScore, + &outputCodePoints[0], &frequencies[0]); + } + return outputWordIndex; +} + +/** + * Computes the auto-commit confidence of the given terminal as the sum of three contributions + * derived from its normalized compound distance after the first word, its code point count and + * its space count. Returns NOT_A_FIRST_WORD_CONFIDENCE when the suggestion contains no space. + */ +int Suggest::computeFirstWordConfidence(const DicNode *const terminalDicNode) const { + // Get the number of spaces in the first suggestion + const int spaceCount = terminalDicNode->getTotalNodeSpaceCount(); + // Get the number of characters in the first suggestion + const int length = terminalDicNode->getTotalNodeCodePointCount(); + // Get the distance for the first word of the suggestion + const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord(); + + // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000. + // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or + // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means + // we are very confident. 
+ // Expected space count is 1 ~ 5 + static const int MIN_EXPECTED_SPACE_COUNT = 1; + static const int MAX_EXPECTED_SPACE_COUNT = 5; + // Expected length is about 4 ~ 30 + static const int MIN_EXPECTED_LENGTH = 4; + static const int MAX_EXPECTED_LENGTH = 30; + // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0 + static const float MIN_EXPECTED_DISTANCE = 0.0; + static const float MAX_EXPECTED_DISTANCE = 2.0; + // This is not strict: it's where most stuff will be falling, but it's still fine if it's + // outside these values. We want to output a value that reflects all of these. Each factor + // contributes a bit. + + // We need at least a space. + if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE; + + // The smaller the edit distance, the higher the contribution. MAX_EXPECTED_DISTANCE means 0 + // contribution, while MIN_EXPECTED_DISTANCE means full contribution according to the + // weight of the distance. Clamp to avoid overflows. + const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE + : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance; + const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT + * (MAX_EXPECTED_DISTANCE - clampedDistance) + / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE); + // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no + // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the + // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp. + const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT + * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH); + // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no + // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the + // weight of the space count. 
+ const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT + * (spaceCount - MIN_EXPECTED_SPACE_COUNT) + / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT); + + return distanceContribution + lengthContribution + spaceContribution; +} + +/** + * Expands the dicNodes in the current search priority queue by advancing to the possible child + * nodes based on the next touch point(s) (or no touch points for lookahead) + */ +void Suggest::expandCurrentDicNodes(DicTraverseSession *traverseSession) const { + const int inputSize = traverseSession->getInputSize(); + DicNodeVector childDicNodes(TRAVERSAL->getDefaultExpandDicNodeSize()); + DicNode correctionDicNode; + + // TODO: Find more efficient caching + const bool shouldDepthLevelCache = TRAVERSAL->shouldDepthLevelCache(traverseSession); + if (shouldDepthLevelCache) { + traverseSession->getDicTraverseCache()->updateLastCachedInputIndex(); + } + if (DEBUG_CACHE) { + AKLOGI("expandCurrentDicNodes depth level cache = %d, inputSize = %d", + shouldDepthLevelCache, inputSize); + } + while (traverseSession->getDicTraverseCache()->activeSize() > 0) { + DicNode dicNode; + traverseSession->getDicTraverseCache()->popActive(&dicNode); + if (dicNode.isTotalInputSizeExceedingLimit()) { + // Aborts the whole pass: any active nodes not yet popped are left unprocessed. + return; + } + childDicNodes.clear(); + const int point0Index = dicNode.getInputIndex(0); + const bool canDoLookAheadCorrection = + TRAVERSAL->canDoLookAheadCorrection(traverseSession, &dicNode); + const bool isLookAheadCorrection = canDoLookAheadCorrection + && traverseSession->getDicTraverseCache()-> + isLookAheadCorrectionInputIndex(static_cast<int>(point0Index)); + const bool isCompletion = dicNode.isCompletion(inputSize); + + const bool shouldNodeLevelCache = + TRAVERSAL->shouldNodeLevelCache(traverseSession, &dicNode); + if (shouldDepthLevelCache || shouldNodeLevelCache) { + if (DEBUG_CACHE) { + dicNode.dump("PUSH_CACHE"); + } + traverseSession->getDicTraverseCache()->copyPushContinue(&dicNode); + dicNode.setCached(); + } + + if 
(dicNode.isInDigraph()) { + // Finish digraph handling if the node is in the middle of a digraph expansion. + processDicNodeAsDigraph(traverseSession, &dicNode); + } else if (isLookAheadCorrection) { + // The algorithm maintains a small set of "deferred" nodes that have not consumed the + // latest touch point yet. These are needed to apply look-ahead correction operations + // that require special handling of the latest touch point. For example, with insertions + // (e.g., "thiis" -> "this") the latest touch point should not be consumed at all. + processDicNodeAsTransposition(traverseSession, &dicNode); + processDicNodeAsInsertion(traverseSession, &dicNode); + } else { // !isLookAheadCorrection + // Only consider typing error corrections if the normalized compound distance is + // below a spatial distance threshold. + // NOTE: the threshold may need to be updated if scoring model changes. + // TODO: Remove. Do not prune node here. + const bool allowsErrorCorrections = TRAVERSAL->allowsErrorCorrections(&dicNode); + // Process for handling space substitution (e.g., hevis => he is) + if (allowsErrorCorrections + && TRAVERSAL->isSpaceSubstitutionTerminal(traverseSession, &dicNode)) { + createNextWordDicNode(traverseSession, &dicNode, true /* spaceSubstitution */); + } + + DicNodeUtils::getAllChildDicNodes( + &dicNode, traverseSession->getDictionaryStructurePolicy(), &childDicNodes); + + const int childDicNodesSize = childDicNodes.getSizeAndLock(); + for (int i = 0; i < childDicNodesSize; ++i) { + DicNode *const childDicNode = childDicNodes[i]; + if (isCompletion) { + // Handle forward lookahead when the lexicon letter exceeds the input size. 
+ processDicNodeAsMatch(traverseSession, childDicNode); + continue; + } + if (DigraphUtils::hasDigraphForCodePoint( + traverseSession->getDictionaryStructurePolicy() + ->getHeaderStructurePolicy(), + childDicNode->getNodeCodePoint())) { + correctionDicNode.initByCopy(childDicNode); + correctionDicNode.advanceDigraphIndex(); + processDicNodeAsDigraph(traverseSession, &correctionDicNode); + } + if (TRAVERSAL->isOmission(traverseSession, &dicNode, childDicNode, + allowsErrorCorrections)) { + // TODO: (Gesture) Change weight between omission and substitution errors + // TODO: (Gesture) Terminal node should not be handled as omission + correctionDicNode.initByCopy(childDicNode); + processDicNodeAsOmission(traverseSession, &correctionDicNode); + } + const ProximityType proximityType = TRAVERSAL->getProximityType( + traverseSession, &dicNode, childDicNode); + switch (proximityType) { + // TODO: Consider the difference of proximityType here + case MATCH_CHAR: + case PROXIMITY_CHAR: + processDicNodeAsMatch(traverseSession, childDicNode); + break; + case ADDITIONAL_PROXIMITY_CHAR: + if (allowsErrorCorrections) { + processDicNodeAsAdditionalProximityChar(traverseSession, &dicNode, + childDicNode); + } + break; + case SUBSTITUTION_CHAR: + if (allowsErrorCorrections) { + processDicNodeAsSubstitution(traverseSession, &dicNode, childDicNode); + } + break; + case UNRELATED_CHAR: + // Just drop this node and do nothing. + break; + default: + // Just drop this node and do nothing. 
+ break; + } + } + + // Push the node for look-ahead correction + if (allowsErrorCorrections && canDoLookAheadCorrection) { + traverseSession->getDicTraverseCache()->copyPushNextActive(&dicNode); + } + } + } +} + +/** + * Pushes a weighted copy of dicNode to the terminal queue when dicNode is a terminal word node + * whose compound distance is below MAX_VALUE_FOR_WEIGHTING and which is not filtered out by the + * bigram safety net. dicNode itself is not modified here; all costs are added to the copy. + */ +void Suggest::processTerminalDicNode( + DicTraverseSession *traverseSession, DicNode *dicNode) const { + if (dicNode->getCompoundDistance() >= static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) { + return; + } + if (!dicNode->isTerminalWordNode()) { + return; + } + if (dicNode->shouldBeFilteredBySafetyNetForBigram()) { + return; + } + // Create a non-cached node here. + DicNode terminalDicNode; + DicNodeUtils::initByCopy(dicNode, &terminalDicNode); + if (TRAVERSAL->needsToTraverseAllUserInput() + && dicNode->getInputIndex(0) < traverseSession->getInputSize()) { + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_TERMINAL_INSERTION, traverseSession, 0, + &terminalDicNode, traverseSession->getMultiBigramMap()); + } + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_TERMINAL, traverseSession, 0, + &terminalDicNode, traverseSession->getMultiBigramMap()); + traverseSession->getDicTraverseCache()->copyPushTerminal(&terminalDicNode); +} + +/** + * Adds the expanded dicNode to the next search priority queue. Also creates an additional next word + * (by the space omission error correction) search path if input dicNode is on a terminal node. 
+ */ +void Suggest::processExpandedDicNode( + DicTraverseSession *traverseSession, DicNode *dicNode) const { + processTerminalDicNode(traverseSession, dicNode); + if (dicNode->getCompoundDistance() < static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) { + if (TRAVERSAL->isSpaceOmissionTerminal(traverseSession, dicNode)) { + createNextWordDicNode(traverseSession, dicNode, false /* spaceSubstitution */); + } + /* A multi-word node that is already a completion is not pushed to the next active queue. */ + const int allowsLookAhead = !(dicNode->hasMultipleWords() + && dicNode->isCompletion(traverseSession->getInputSize())); + if (dicNode->hasChildren() && allowsLookAhead) { + traverseSession->getDicTraverseCache()->copyPushNextActive(dicNode); + } + } + DicNode::managedDelete(dicNode); +} + +/** + * Handle the child as a regular match: weight it, then process it as an expanded node. + */ +void Suggest::processDicNodeAsMatch(DicTraverseSession *traverseSession, + DicNode *childDicNode) const { + weightChildNode(traverseSession, childDicNode); + processExpandedDicNode(traverseSession, childDicNode); +} + +/** + * Handle the child as an additional proximity character: add the CT_ADDITIONAL_PROXIMITY cost, + * then weight and process it the same way as a match. + */ +void Suggest::processDicNodeAsAdditionalProximityChar(DicTraverseSession *traverseSession, + DicNode *dicNode, DicNode *childDicNode) const { + // Note: Most types of corrections don't need to look up the bigram information since they do + // not treat the node as a terminal. There is no need to pass the bigram map in these cases. + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_ADDITIONAL_PROXIMITY, + traverseSession, dicNode, childDicNode, 0 /* multiBigramMap */); + weightChildNode(traverseSession, childDicNode); + processExpandedDicNode(traverseSession, childDicNode); +} + +/** + * Handle the child as a substitution error: add the CT_SUBSTITUTION cost, then weight and + * process it the same way as a match. + */ +void Suggest::processDicNodeAsSubstitution(DicTraverseSession *traverseSession, + DicNode *dicNode, DicNode *childDicNode) const { + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_SUBSTITUTION, traverseSession, + dicNode, childDicNode, 0 /* multiBigramMap */); + weightChildNode(traverseSession, childDicNode); + processExpandedDicNode(traverseSession, childDicNode); +} + +// Process the node codepoint as a digraph. 
This means that composite glyphs like the German +// u-umlaut are expanded to the transliteration "ue". Note that this happens in parallel with +// the normal non-digraph traversal, so both "uber" and "ueber" can be corrected to "[u-umlaut]ber". +void Suggest::processDicNodeAsDigraph(DicTraverseSession *traverseSession, + DicNode *childDicNode) const { + weightChildNode(traverseSession, childDicNode); + childDicNode->advanceDigraphIndex(); + processExpandedDicNode(traverseSession, childDicNode); +} + +/** + * Handle the dicNode as an omission error (e.g., ths => this). Skip the current letter and consider + * matches for all possible next letters. Note that just skipping the current letter without any + * other conditions tends to flood the search dic nodes cache with omission nodes. Instead, check + * the possible *next* letters after the omission to better limit search to plausible omissions. + * Note that apostrophes are handled as omissions. + */ +void Suggest::processDicNodeAsOmission( + DicTraverseSession *traverseSession, DicNode *dicNode) const { + DicNodeVector childDicNodes; + DicNodeUtils::getAllChildDicNodes( + dicNode, traverseSession->getDictionaryStructurePolicy(), &childDicNodes); + + const int size = childDicNodes.getSizeAndLock(); + for (int i = 0; i < size; i++) { + DicNode *const childDicNode = childDicNodes[i]; + // Treat this word as omission + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_OMISSION, traverseSession, + dicNode, childDicNode, 0 /* multiBigramMap */); + weightChildNode(traverseSession, childDicNode); + // The plausibility check runs after the child has been weighted; children rejected here + // are skipped without being passed to processExpandedDicNode(). + if (!TRAVERSAL->isPossibleOmissionChildNode(traverseSession, dicNode, childDicNode)) { + continue; + } + processExpandedDicNode(traverseSession, childDicNode); + } +} + +/** + * Handle the dicNode as an insertion error (e.g., thiis => this). Skip the current touch point and + * consider matches for the next touch point. 
+ */ +void Suggest::processDicNodeAsInsertion(DicTraverseSession *traverseSession, + DicNode *dicNode) const { + const int16_t pointIndex = dicNode->getInputIndex(0); + DicNodeVector childDicNodes; + DicNodeUtils::getAllChildDicNodes(dicNode, traverseSession->getDictionaryStructurePolicy(), + &childDicNodes); + const int size = childDicNodes.getSizeAndLock(); + for (int i = 0; i < size; i++) { + if (traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt(pointIndex + 1) + != childDicNodes[i]->getNodeCodePoint()) { + continue; + } + DicNode *const childDicNode = childDicNodes[i]; + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_INSERTION, traverseSession, + dicNode, childDicNode, 0 /* multiBigramMap */); + processExpandedDicNode(traverseSession, childDicNode); + } +} + +/** + * Handle the dicNode as a transposition error (e.g., thsi => this). Swap the next two touch points. + */ +void Suggest::processDicNodeAsTransposition(DicTraverseSession *traverseSession, + DicNode *dicNode) const { + const int16_t pointIndex = dicNode->getInputIndex(0); + DicNodeVector childDicNodes1; + DicNodeUtils::getAllChildDicNodes(dicNode, traverseSession->getDictionaryStructurePolicy(), + &childDicNodes1); + const int childSize1 = childDicNodes1.getSizeAndLock(); + for (int i = 0; i < childSize1; i++) { + const ProximityType matchedId1 = traverseSession->getProximityInfoState(0) + ->getProximityType(pointIndex + 1, childDicNodes1[i]->getNodeCodePoint(), + true /* checkProximityChars */); + if (!ProximityInfoUtils::isMatchOrProximityChar(matchedId1)) { + continue; + } + if (childDicNodes1[i]->hasChildren()) { + DicNodeVector childDicNodes2; + DicNodeUtils::getAllChildDicNodes(childDicNodes1[i], + traverseSession->getDictionaryStructurePolicy(), &childDicNodes2); + const int childSize2 = childDicNodes2.getSizeAndLock(); + for (int j = 0; j < childSize2; j++) { + DicNode *const childDicNode2 = childDicNodes2[j]; + const ProximityType matchedId2 = 
traverseSession->getProximityInfoState(0) + ->getProximityType(pointIndex, childDicNode2->getNodeCodePoint(), + true /* checkProximityChars */); + if (!ProximityInfoUtils::isMatchOrProximityChar(matchedId2)) { + continue; + } + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_TRANSPOSITION, + traverseSession, childDicNodes1[i], childDicNode2, 0 /* multiBigramMap */); + processExpandedDicNode(traverseSession, childDicNode2); + } + } + DicNode::managedDelete(childDicNodes1[i]); + } +} + +/** + * Weight child node by aligning it to the key + */ +void Suggest::weightChildNode(DicTraverseSession *traverseSession, DicNode *dicNode) const { + const int inputSize = traverseSession->getInputSize(); + if (dicNode->isCompletion(inputSize)) { + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_COMPLETION, traverseSession, + 0 /* parentDicNode */, dicNode, 0 /* multiBigramMap */); + } else { // completion + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_MATCH, traverseSession, + 0 /* parentDicNode */, dicNode, 0 /* multiBigramMap */); + } +} + +/** + * Creates a new dicNode that represents a space insertion at the end of the input dicNode. Also + * incorporates the unigram / bigram score for the ending word into the new dicNode. + */ +void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode *dicNode, + const bool spaceSubstitution) const { + if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode)) { + return; + } + + // Create a non-cached node here. + DicNode newDicNode; + DicNodeUtils::initAsRootWithPreviousWord( + traverseSession->getDictionaryStructurePolicy(), dicNode, &newDicNode); + const CorrectionType correctionType = spaceSubstitution ? 
+ CT_NEW_WORD_SPACE_SUBSTITUTION : CT_NEW_WORD_SPACE_OMISSION; + Weighting::addCostAndForwardInputIndex(WEIGHTING, correctionType, traverseSession, dicNode, + &newDicNode, traverseSession->getMultiBigramMap()); + if (newDicNode.getCompoundDistance() < static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) { + // newDicNode is worth continuing to traverse. + // CAVEAT: This pruning is important for speed. Remove this when we can afford not to prune + // here because here is not the right place to do pruning. Pruning should take place only + // in DicNodePriorityQueue. + traverseSession->getDicTraverseCache()->copyPushNextActive(&newDicNode); + } +} +} // namespace latinime
diff --git a/src/aosp/suggest/core/suggest.h b/src/aosp/suggest/core/suggest.h new file mode 100644 index 0000000..b20343d --- /dev/null +++ b/src/aosp/suggest/core/suggest.h
@@ -0,0 +1,94 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SUGGEST_IMPL_H +#define LATINIME_SUGGEST_IMPL_H + +#include "defines.h" +#include "suggest/core/suggest_interface.h" +#include "suggest/core/policy/suggest_policy.h" + +namespace latinime { + +// Naming convention +// - Distance: "Weighted" edit distance -- used both for spatial and language. +// - Compound Distance: Spatial Distance + Language Distance -- used for pruning and scoring +// - Cost: delta/diff for Distance -- used both for spatial and language +// - Length: "Non-weighted" -- used only for spatial +// - Probability: "Non-weighted" -- used only for language +// - Score: Final calibrated score based on the compound distance, which is sent to Java as the +// priority of a suggested word + +class DicNode; +class DicTraverseSession; +class ProximityInfo; +class Scoring; +class Traversal; +class Weighting; + +// SuggestInterface implementation whose Traversal, Scoring and Weighting components are taken from +// the SuggestPolicy passed to the constructor. +class Suggest : public SuggestInterface { + public: + // A null suggestPolicy leaves TRAVERSAL, SCORING and WEIGHTING null. + AK_FORCE_INLINE Suggest(const SuggestPolicy *const suggestPolicy) + : TRAVERSAL(suggestPolicy ? suggestPolicy->getTraversal() : 0), + SCORING(suggestPolicy ? suggestPolicy->getScoring() : 0), + WEIGHTING(suggestPolicy ? 
suggestPolicy->getWeighting() : 0) {} + AK_FORCE_INLINE virtual ~Suggest() {} + // Implements SuggestInterface::getSuggestions(). + int getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, int *inputYs, + int *times, int *pointerIds, int *inputCodePoints, int inputSize, int commitPoint, + int *outWords, int *frequencies, int *outputIndices, int *outputTypes, + int *outputAutoCommitFirstWordConfidence) const; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Suggest); + void createNextWordDicNode(DicTraverseSession *traverseSession, DicNode *dicNode, + const bool spaceSubstitution) const; + int outputSuggestions(DicTraverseSession *traverseSession, int *frequencies, + int *outputCodePoints, int *outputIndicesToPartialCommit, int *outputTypes, + int *outputAutoCommitFirstWordConfidence) const; + int computeFirstWordConfidence(const DicNode *const terminalDicNode) const; + void initializeSearch(DicTraverseSession *traverseSession, int commitPoint) const; + void expandCurrentDicNodes(DicTraverseSession *traverseSession) const; + void processTerminalDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const; + void processExpandedDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const; + void weightChildNode(DicTraverseSession *traverseSession, DicNode *dicNode) const; + float getAutocorrectScore(DicTraverseSession *traverseSession, DicNode *dicNode) const; + void generateFeatures( + DicTraverseSession *traverseSession, DicNode *dicNode, float *features) const; + void processDicNodeAsOmission(DicTraverseSession *traverseSession, DicNode *dicNode) const; + void processDicNodeAsDigraph(DicTraverseSession *traverseSession, DicNode *dicNode) const; + void processDicNodeAsTransposition(DicTraverseSession *traverseSession, + DicNode *dicNode) const; + void processDicNodeAsInsertion(DicTraverseSession *traverseSession, DicNode *dicNode) const; + void processDicNodeAsAdditionalProximityChar(DicTraverseSession *traverseSession, + DicNode *dicNode, DicNode *childDicNode) const; + void 
processDicNodeAsSubstitution(DicTraverseSession *traverseSession, DicNode *dicNode, + DicNode *childDicNode) const; + void processDicNodeAsMatch(DicTraverseSession *traverseSession, + DicNode *childDicNode) const; + + // Inputs longer than this will autocorrect if the suggestion is multi-word + static const int MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT; + static const int MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE; + + // Threshold for autocorrection classifier + static const float AUTOCORRECT_CLASSIFICATION_THRESHOLD; + + // Policy components taken from the SuggestPolicy in the constructor; null if none was given. + const Traversal *const TRAVERSAL; + const Scoring *const SCORING; + const Weighting *const WEIGHTING; +}; +} // namespace latinime +#endif // LATINIME_SUGGEST_IMPL_H
diff --git a/src/aosp/suggest/core/suggest_interface.h b/src/aosp/suggest/core/suggest_interface.h new file mode 100644 index 0000000..4deb4d9 --- /dev/null +++ b/src/aosp/suggest/core/suggest_interface.h
@@ -0,0 +1,38 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SUGGEST_INTERFACE_H +#define LATINIME_SUGGEST_INTERFACE_H + +#include "defines.h" + +namespace latinime { + +class ProximityInfo; + +// Abstract interface of a suggestion engine: implementations override the pure virtual +// getSuggestions(). +class SuggestInterface { + public: + virtual int getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, + int *inputYs, int *times, int *pointerIds, int *inputCodePoints, int inputSize, + int commitPoint, int *outWords, int *frequencies, int *outputIndices, + int *outputTypes, int *outputAutoCommitFirstWordConfidence) const = 0; + SuggestInterface() {} + // Virtual so that implementations can be destroyed through a SuggestInterface pointer. + virtual ~SuggestInterface() {} + private: + DISALLOW_COPY_AND_ASSIGN(SuggestInterface); +}; +} // namespace latinime +#endif // LATINIME_SUGGEST_INTERFACE_H
diff --git a/src/aosp/suggest/core/suggest_options.h b/src/aosp/suggest/core/suggest_options.h new file mode 100644 index 0000000..1b21aaf --- /dev/null +++ b/src/aosp/suggest/core/suggest_options.h
@@ -0,0 +1,74 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SUGGEST_OPTIONS_H +#define LATINIME_SUGGEST_OPTIONS_H + +#include "defines.h" + +namespace latinime { + +// Read-only view over an int array of suggest options, where a key is an index into the array. +// Only the pointer and the length are stored; keys outside [0, length) read as false / 0. +class SuggestOptions{ + public: + SuggestOptions(const int *const options, const int length) + : mOptions(options), mLength(length) {} + + AK_FORCE_INLINE bool isGesture() const { + return getBoolOption(IS_GESTURE); + } + + AK_FORCE_INLINE bool useFullEditDistance() const { + return getBoolOption(USE_FULL_EDIT_DISTANCE); + } + + // key is relative to ADDITIONAL_FEATURES_OPTIONS: key k reads index k + 2 of the array. + AK_FORCE_INLINE bool getAdditionalFeaturesBoolOption(const int key) const { + return getBoolOption(key + ADDITIONAL_FEATURES_OPTIONS); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(SuggestOptions); + + // Need to update com.android.inputmethod.latin.NativeSuggestOptions when you add, remove or + // reorder options. + static const int IS_GESTURE = 0; + static const int USE_FULL_EDIT_DISTANCE = 1; + // Additional features options are stored after the other options and used as setting values of + // experimental features. 
+ static const int ADDITIONAL_FEATURES_OPTIONS = 2; + + const int *const mOptions; + const int mLength; + + AK_FORCE_INLINE bool isValidKey(const int key) const { + return 0 <= key && key < mLength; + } + + // Returns whether the option at key is non-zero; false when key is out of range. + AK_FORCE_INLINE bool getBoolOption(const int key) const { + if (isValidKey(key)) { + return mOptions[key] != 0; + } + return false; + } + + // Returns the raw option value at key; 0 when key is out of range. + AK_FORCE_INLINE int getIntOption(const int key) const { + if (isValidKey(key)) { + return mOptions[key]; + } + return 0; + } +}; +} // namespace latinime +#endif // LATINIME_SUGGEST_OPTIONS_H
diff --git a/src/aosp/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h b/src/aosp/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h new file mode 100644 index 0000000..6ff95ca --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h
@@ -0,0 +1,53 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BIGRAM_LIST_POLICY_H +#define LATINIME_BIGRAM_LIST_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" + +namespace latinime { + +// DictionaryBigramsStructurePolicy that reads bigram lists from a byte buffer through +// BigramListReadWriteUtils. Only the buffer pointer is stored. +class BigramListPolicy : public DictionaryBigramsStructurePolicy { + public: + explicit BigramListPolicy(const uint8_t *const bigramsBuf) : mBigramsBuf(bigramsBuf) {} + + ~BigramListPolicy() {} + + // Reads the bigram entry at *pos and advances *pos past it. outProbability and outHasNext are + // written unconditionally and must not be null; outBigramPos may be null. + void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, + int *const pos) const { + BigramListReadWriteUtils::BigramFlags flags; + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(mBigramsBuf, &flags, + outBigramPos, pos); + *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(flags); + *outHasNext = BigramListReadWriteUtils::hasNext(flags); + } + + // Advances *pos past every entry of the bigram list that starts at *pos. + void skipAllBigrams(int *const pos) const { + BigramListReadWriteUtils::skipExistingBigrams(mBigramsBuf, pos); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListPolicy); + + const uint8_t *const mBigramsBuf; +}; +} // namespace latinime +#endif // LATINIME_BIGRAM_LIST_POLICY_H
diff --git a/src/aosp/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp b/src/aosp/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp new file mode 100644 index 0000000..1926b98 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp
@@ -0,0 +1,182 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::MASK_ATTRIBUTE_ADDRESS_TYPE = + 0x30; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; +// Flag for presence of more attributes +const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::FLAG_ATTRIBUTE_HAS_NEXT = + 0x80; +// Mask for attribute probability, stored on 4 bits inside the flags byte. 
+const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; +const int BigramListReadWriteUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; + +// Reads the flags byte and the target address of the entry at *bigramEntryPos and advances the +// position past both. Either output pointer may be null. +/* static */ void BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + const uint8_t *const bigramsBuf, BigramFlags *const outBigramFlags, + int *const outTargetPtNodePos, int *const bigramEntryPos) { + const BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition(bigramsBuf, + bigramEntryPos); + if (outBigramFlags) { + *outBigramFlags = bigramFlags; + } + const int targetPos = getBigramAddressAndAdvancePosition(bigramsBuf, bigramFlags, + bigramEntryPos); + if (outTargetPtNodePos) { + *outTargetPtNodePos = targetPos; + } +} + +// Advances *bigramListPos past all entries of the list, stopping after the first entry whose +// flags do not have FLAG_ATTRIBUTE_HAS_NEXT set. +/* static */ void BigramListReadWriteUtils::skipExistingBigrams(const uint8_t *const bigramsBuf, + int *const bigramListPos) { + BigramFlags flags; + do { + getBigramEntryPropertiesAndAdvancePosition(bigramsBuf, &flags, 0 /* outTargetPtNodePos */, + bigramListPos); + } while(hasNext(flags)); +} + +// Reads the 1-, 2- or 3-byte offset selected by the address-type bits of flags and returns the +// target position relative to the start of the offset field (the value of *pos on entry). +// Returns NOT_A_DICT_POS when the stored offset is DICT_OFFSET_INVALID. +/* static */ int BigramListReadWriteUtils::getBigramAddressAndAdvancePosition( + const uint8_t *const bigramsBuf, const BigramFlags flags, int *const pos) { + int offset = 0; + const int origin = *pos; + switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + offset = ByteArrayUtils::readUint8AndAdvancePosition(bigramsBuf, pos); + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + offset = ByteArrayUtils::readUint16AndAdvancePosition(bigramsBuf, pos); + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + offset = ByteArrayUtils::readUint24AndAdvancePosition(bigramsBuf, pos); + break; + } + if (offset == DynamicPatriciaTrieReadingUtils::DICT_OFFSET_INVALID) { + return NOT_A_DICT_POS; + } else if (offset == DynamicPatriciaTrieReadingUtils::DICT_OFFSET_ZERO_OFFSET) { + return origin; + } + if (isOffsetNegative(flags)) { + return origin - offset; + } else { + return origin + offset; + } +} + +// Rewrites the flags byte at entryPos with FLAG_ATTRIBUTE_HAS_NEXT set or cleared according to +// hasNext. Returns the result of the buffer write. +/* static 
*/ bool BigramListReadWriteUtils::setHasNextFlag( + BufferWithExtendableBuffer *const buffer, const bool hasNext, const int entryPos) { + const bool usesAdditionalBuffer = buffer->isInAdditionalBuffer(entryPos); + int readingPos = entryPos; + if (usesAdditionalBuffer) { + readingPos -= buffer->getOriginalBufferSize(); + } + BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition( + buffer->getBuffer(usesAdditionalBuffer), &readingPos); + if (hasNext) { + bigramFlags = bigramFlags | FLAG_ATTRIBUTE_HAS_NEXT; + } else { + bigramFlags = bigramFlags & (~FLAG_ATTRIBUTE_HAS_NEXT); + } + int writingPos = entryPos; + return buffer->writeUintAndAdvancePosition(bigramFlags, 1 /* size */, &writingPos); +} + +// Builds the flags for a new entry and writes the entry at *writingPos. Returns false if the +// flags cannot be created or a write fails. +/* static */ bool BigramListReadWriteUtils::createAndWriteBigramEntry( + BufferWithExtendableBuffer *const buffer, const int targetPos, const int probability, + const bool hasNext, int *const writingPos) { + BigramFlags flags; + if (!createAndGetBigramFlags(*writingPos, targetPos, probability, hasNext, &flags)) { + return false; + } + return writeBigramEntry(buffer, flags, targetPos, writingPos); +} + +// Writes the flags byte, with the negative-offset bit recomputed from the actual offset, followed +// by the absolute offset in the field size encoded in flags. +/* static */ bool BigramListReadWriteUtils::writeBigramEntry( + BufferWithExtendableBuffer *const bufferToWrite, const BigramFlags flags, + const int targetPtNodePos, int *const writingPos) { + const int offset = getBigramTargetOffset(targetPtNodePos, *writingPos); + const BigramFlags flagsToWrite = (offset < 0) ? + (flags | FLAG_ATTRIBUTE_OFFSET_NEGATIVE) : (flags & ~FLAG_ATTRIBUTE_OFFSET_NEGATIVE); + if (!bufferToWrite->writeUintAndAdvancePosition(flagsToWrite, 1 /* size */, writingPos)) { + return false; + } + const uint32_t absOffest = abs(offset); + const int bigramTargetFieldSize = attributeAddressSize(flags); + return bufferToWrite->writeUintAndAdvancePosition(absOffest, bigramTargetFieldSize, + writingPos); +} + +// Returns true if the bigram entry is valid and stores the entry flags into outBigramFlags. 
+/* static */ bool BigramListReadWriteUtils::createAndGetBigramFlags(const int entryPos, + const int targetPtNodePos, const int probability, const bool hasNext, + BigramFlags *const outBigramFlags) { + BigramFlags flags = probability & MASK_ATTRIBUTE_PROBABILITY; + if (hasNext) { + flags |= FLAG_ATTRIBUTE_HAS_NEXT; + } + const int offset = getBigramTargetOffset(targetPtNodePos, entryPos); + if (offset < 0) { + flags |= FLAG_ATTRIBUTE_OFFSET_NEGATIVE; + } + const uint32_t absOffest = abs(offset); + if ((absOffest >> 24) != 0) { + // Offset is too large. + return false; + } else if ((absOffest >> 16) != 0) { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + } else if ((absOffest >> 8) != 0) { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + } else { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + } + // Currently, all newly written bigram position fields are 3 bytes to simplify dictionary + // writing. + // TODO: Remove following 2 lines and optimize memory space. + flags = (flags & (~MASK_ATTRIBUTE_ADDRESS_TYPE)) | FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + *outBigramFlags = flags; + return true; +} + +// Returns the offset from the byte following the flags field at entryPos to targetPtNodePos. +// NOT_A_DICT_POS maps to DICT_OFFSET_INVALID and a zero offset to DICT_OFFSET_ZERO_OFFSET. +/* static */ int BigramListReadWriteUtils::getBigramTargetOffset(const int targetPtNodePos, + const int entryPos) { + if (targetPtNodePos == NOT_A_DICT_POS) { + return DynamicPatriciaTrieReadingUtils::DICT_OFFSET_INVALID; + } else { + const int offset = targetPtNodePos - (entryPos + 1 /* bigramFlagsField */); + if (offset == 0) { + return DynamicPatriciaTrieReadingUtils::DICT_OFFSET_ZERO_OFFSET; + } else { + return offset; + } + } +} + +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h b/src/aosp/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h new file mode 100644 index 0000000..eabe4e0 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h
@@ -0,0 +1,102 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H +#define LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H + +#include <cstdlib> +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class BigramListReadWriteUtils { +public: + typedef uint8_t BigramFlags; + + static void getBigramEntryPropertiesAndAdvancePosition(const uint8_t *const bigramsBuf, + BigramFlags *const outBigramFlags, int *const outTargetPtNodePos, + int *const bigramEntryPos); + + static AK_FORCE_INLINE int getProbabilityFromFlags(const BigramFlags flags) { + return flags & MASK_ATTRIBUTE_PROBABILITY; + } + + static AK_FORCE_INLINE bool hasNext(const BigramFlags flags) { + return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0; + } + + // Bigrams reading methods + static void skipExistingBigrams(const uint8_t *const bigramsBuf, int *const bigramListPos); + + // Returns the size in bytes of the bigram position field, as encoded in the bigram flags. 
+ static AK_FORCE_INLINE int attributeAddressSize(const BigramFlags flags) { + return (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; + /* Note: this is a value-dependent optimization of what may probably be + more readably written this way: + switch (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: return 3; + default: return 0; + } + */ + } + + static bool setHasNextFlag(BufferWithExtendableBuffer *const buffer, + const bool hasNext, const int entryPos); + + static AK_FORCE_INLINE BigramFlags setProbabilityInFlags(const BigramFlags flags, + const int probability) { + return (flags & (~MASK_ATTRIBUTE_PROBABILITY)) | (probability & MASK_ATTRIBUTE_PROBABILITY); + } + + static bool createAndWriteBigramEntry(BufferWithExtendableBuffer *const buffer, + const int targetPos, const int probability, const bool hasNext, int *const writingPos); + + // Note: despite its name here, the third argument is a target PtNode position (the definition + // names it targetPtNodePos); the offset is computed inside the function. + static bool writeBigramEntry(BufferWithExtendableBuffer *const buffer, const BigramFlags flags, + const int targetOffset, int *const writingPos); + +private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadWriteUtils); + + static const BigramFlags MASK_ATTRIBUTE_ADDRESS_TYPE; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + static const BigramFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE; + static const BigramFlags FLAG_ATTRIBUTE_HAS_NEXT; + static const BigramFlags MASK_ATTRIBUTE_PROBABILITY; + static const int ATTRIBUTE_ADDRESS_SHIFT; + + // Returns true if the bigram entry is valid and stores the entry flags into outBigramFlags. 
+ static bool createAndGetBigramFlags(const int entryPos, const int targetPos, + const int probability, const bool hasNext, BigramFlags *const outBigramFlags); + + static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) { + return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0; + } + + static int getBigramAddressAndAdvancePosition(const uint8_t *const bigramsBuf, + const BigramFlags flags, int *const pos); + + static int getBigramTargetOffset(const int targetPtNodePos, const int entryPos); +}; +} // namespace latinime +#endif // LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H
diff --git a/src/aosp/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/src/aosp/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp new file mode 100644 index 0000000..b1170e2 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
@@ -0,0 +1,391 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" + +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { + +const int DynamicBigramListPolicy::CONTINUING_BIGRAM_LINK_COUNT_LIMIT = 10000; +const int DynamicBigramListPolicy::BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT = 100000; + +void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability, + bool *const outHasNext, int *const bigramEntryPos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramEntryPos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *bigramEntryPos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags bigramFlags; + int originalBigramPos; + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(buffer, &bigramFlags, + &originalBigramPos, bigramEntryPos); + if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { + 
originalBigramPos += mBuffer->getOriginalBufferSize(); + } + *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags); + *outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags); + if (mIsDecayingDict && !ForgettingCurveUtils::isValidEncodedProbability(*outProbability)) { + // This bigram is too weak to output. + *outBigramPos = NOT_A_DICT_POS; + } else { + *outBigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); + } + if (usesAdditionalBuffer) { + *bigramEntryPos += mBuffer->getOriginalBufferSize(); + } +} + +void DynamicBigramListPolicy::skipAllBigrams(int *const bigramListPos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *bigramListPos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::skipExistingBigrams(buffer, bigramListPos); + if (usesAdditionalBuffer) { + *bigramListPos += mBuffer->getOriginalBufferSize(); + } +} + +bool DynamicBigramListPolicy::copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, + int *const fromPos, int *const toPos, int *const outBigramsCount) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos); + if (usesAdditionalBuffer) { + *fromPos -= mBuffer->getOriginalBufferSize(); + } + *outBigramsCount = 0; + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + int lastWrittenEntryPos = NOT_A_DICT_POS; + do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + // The buffer address can be changed after calling buffer writing methods. 
+ int originalBigramPos; + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, + fromPos); + if (originalBigramPos == NOT_A_DICT_POS) { + // skip invalid bigram entry. + continue; + } + if (usesAdditionalBuffer) { + originalBigramPos += mBuffer->getOriginalBufferSize(); + } + const int bigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); + if (bigramPos == NOT_A_DICT_POS) { + // Target PtNode has been invalidated. + continue; + } + lastWrittenEntryPos = *toPos; + if (!BigramListReadWriteUtils::createAndWriteBigramEntry(bufferToWrite, bigramPos, + BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags), + BigramListReadWriteUtils::hasNext(bigramFlags), toPos)) { + return false; + } + (*outBigramsCount)++; + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + // Makes the last entry the terminal of the list. Updates the flags. + if (lastWrittenEntryPos != NOT_A_DICT_POS) { + if (!BigramListReadWriteUtils::setHasNextFlag(bufferToWrite, false /* hasNext */, + lastWrittenEntryPos)) { + return false; + } + } + if (usesAdditionalBuffer) { + *fromPos += mBuffer->getOriginalBufferSize(); + } + return true; +} + +// Finding useless bigram entries and remove them. Bigram entry is useless when the target PtNode +// has been deleted or is not a valid terminal. 
+bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries( + int *const bigramListPos, int *const outValidBigramEntryCount) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); + if (usesAdditionalBuffer) { + *bigramListPos -= mBuffer->getOriginalBufferSize(); + } + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, this /* bigramsPolicy */, mShortcutPolicy); + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + int bigramEntryPos = *bigramListPos; + int originalBigramPos; + // The buffer address can be changed after calling buffer writing methods. + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, + bigramListPos); + if (usesAdditionalBuffer) { + bigramEntryPos += mBuffer->getOriginalBufferSize(); + } + if (originalBigramPos == NOT_A_DICT_POS) { + // This entry has already been removed. + continue; + } + if (usesAdditionalBuffer) { + originalBigramPos += mBuffer->getOriginalBufferSize(); + } + const int bigramTargetNodePos = + followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(bigramTargetNodePos); + if (nodeReader.isDeleted() || !nodeReader.isTerminal() + || bigramTargetNodePos == NOT_A_DICT_POS) { + // The target is no longer valid terminal. Invalidate the current bigram entry. 
+ if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, + NOT_A_DICT_POS /* targetPtNodePos */, &bigramEntryPos)) { + return false; + } + continue; + } + bool isRemoved = false; + if (!updateProbabilityForDecay(bigramFlags, bigramTargetNodePos, &bigramEntryPos, + &isRemoved)) { + return false; + } + if (!isRemoved) { + (*outValidBigramEntryCount) += 1; + } + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + return true; +} + +// Updates bigram target PtNode positions in the list after the placing step in GC. +bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos, + const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const + ptNodePositionRelocationMap, int *const outBigramEntryCount) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); + if (usesAdditionalBuffer) { + *bigramListPos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + int bigramEntryPos = *bigramListPos; + if (usesAdditionalBuffer) { + bigramEntryPos += mBuffer->getOriginalBufferSize(); + } + int bigramTargetPtNodePos; + // The buffer address can be changed after calling buffer writing methods. 
+ BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &bigramTargetPtNodePos, + bigramListPos); + if (bigramTargetPtNodePos == NOT_A_DICT_POS) { + continue; + } + if (usesAdditionalBuffer) { + bigramTargetPtNodePos += mBuffer->getOriginalBufferSize(); + } + + DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::const_iterator it = + ptNodePositionRelocationMap->find(bigramTargetPtNodePos); + if (it != ptNodePositionRelocationMap->end()) { + bigramTargetPtNodePos = it->second; + } else { + bigramTargetPtNodePos = NOT_A_DICT_POS; + } + if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, + bigramTargetPtNodePos, &bigramEntryPos)) { + return false; + } + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + (*outBigramEntryCount) = bigramEntryCount; + return true; +} + +bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos, + const int probability, int *const bigramListPos, bool *const outAddedNewBigram) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); + if (usesAdditionalBuffer) { + *bigramListPos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + int entryPos = *bigramListPos; + if (usesAdditionalBuffer) { + entryPos += mBuffer->getOriginalBufferSize(); + } + int originalBigramPos; + // The buffer address can be changed after calling buffer writing methods. 
+ BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, + bigramListPos); + if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { + originalBigramPos += mBuffer->getOriginalBufferSize(); + } + if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) { + // Update this bigram entry. + *outAddedNewBigram = false; + const int originalProbability = BigramListReadWriteUtils::getProbabilityFromFlags( + bigramFlags); + const int probabilityToWrite = mIsDecayingDict ? + ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, + probability) : probability; + const BigramListReadWriteUtils::BigramFlags updatedFlags = + BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, + probabilityToWrite); + return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags, + originalBigramPos, &entryPos); + } + if (BigramListReadWriteUtils::hasNext(bigramFlags)) { + continue; + } + // The current last entry is found. + // First, update the flags of the last entry. + if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) { + *outAddedNewBigram = false; + return false; + } + if (usesAdditionalBuffer) { + *bigramListPos += mBuffer->getOriginalBufferSize(); + } + // Then, add a new entry after the last entry. + *outAddedNewBigram = true; + return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos); + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + // We return directly from the while loop. + ASSERT(false); + return false; +} + +bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, const int probability, + int *const writingPos) { + // hasNext is false because we are adding a new bigram entry at the end of the bigram list. + const int probabilityToWrite = mIsDecayingDict ? 
+ ForgettingCurveUtils::getUpdatedEncodedProbability(NOT_A_PROBABILITY, probability) : + probability; + return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos, + probabilityToWrite, false /* hasNext */, writingPos); +} + +bool DynamicBigramListPolicy::removeBigram(const int bigramListPos, const int bigramTargetPos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(bigramListPos); + int pos = bigramListPos; + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + int bigramEntryPos = pos; + int originalBigramPos; + // The buffer address can be changed after calling buffer writing methods. + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, &pos); + if (usesAdditionalBuffer) { + bigramEntryPos += mBuffer->getOriginalBufferSize(); + } + if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { + originalBigramPos += mBuffer->getOriginalBufferSize(); + } + const int bigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); + if (bigramPos != bigramTargetPos) { + continue; + } + // Target entry is found. Write an invalid target position to mark the bigram invalid. 
+ return BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, + NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos); + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + return false; +} + +int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos( + const int originalBigramPos) const { + if (originalBigramPos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + int currentPos = originalBigramPos; + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, this /* bigramsPolicy */, mShortcutPolicy); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(currentPos); + int bigramLinkCount = 0; + while (nodeReader.getBigramLinkedNodePos() != NOT_A_DICT_POS) { + currentPos = nodeReader.getBigramLinkedNodePos(); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(currentPos); + bigramLinkCount++; + if (bigramLinkCount > CONTINUING_BIGRAM_LINK_COUNT_LIMIT) { + AKLOGE("Bigram link is invalid. start position: %d", originalBigramPos); + ASSERT(false); + return NOT_A_DICT_POS; + } + } + return currentPos; +} + +bool DynamicBigramListPolicy::updateProbabilityForDecay( + const BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos, + int *const bigramEntryPos, bool *const outRemoved) const { + *outRemoved = false; + if (mIsDecayingDict) { + // Update bigram probability for decaying. + const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave( + BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags), mHeaderPolicy); + if (ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { + // Write new probability. + const BigramListReadWriteUtils::BigramFlags updatedBigramFlags = + BigramListReadWriteUtils::setProbabilityInFlags( + bigramFlags, newProbability); + if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedBigramFlags, + targetPtNodePos, bigramEntryPos)) { + return false; + } + } else { + // Remove current bigram entry. 
+ *outRemoved = true; + if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, + NOT_A_DICT_POS /* targetPtNodePos */, bigramEntryPos)) { + return false; + } + } + } + return true; +} + +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/src/aosp/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h new file mode 100644 index 0000000..0504b59 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h
@@ -0,0 +1,92 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H +#define LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class DictionaryHeaderStructurePolicy; +class DictionaryShortcutsStructurePolicy; + +/* + * This is a dynamic version of BigramListPolicy and supports an additional buffer. 
+ */ +class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { + public: + DynamicBigramListPolicy(const DictionaryHeaderStructurePolicy *const headerPolicy, + BufferWithExtendableBuffer *const buffer, + const DictionaryShortcutsStructurePolicy *const shortcutPolicy, + const bool isDecayingDict) + : mHeaderPolicy(headerPolicy), mBuffer(buffer), mShortcutPolicy(shortcutPolicy), + mIsDecayingDict(isDecayingDict) {} + + ~DynamicBigramListPolicy() {} + + void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, + int *const bigramEntryPos) const; + + void skipAllBigrams(int *const bigramListPos) const; + + // Copy bigrams from the bigram list that starts at fromPos in mBuffer to toPos in + // bufferToWrite and advance these positions after bigram lists. This method skips invalid + // bigram entries and write the valid bigram entry count to outBigramsCount. + bool copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, int *const fromPos, + int *const toPos, int *const outBigramsCount) const; + + bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos, + int *const outBigramEntryCount); + + bool updateAllBigramTargetPtNodePositions(int *const bigramListPos, + const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const + ptNodePositionRelocationMap, int *const outValidBigramEntryCount); + + bool addNewBigramEntryToBigramList(const int bigramTargetPos, const int probability, + int *const bigramListPos, bool *const outAddedNewBigram); + + bool writeNewBigramEntry(const int bigramTargetPos, const int probability, + int *const writingPos); + + // Return whether or not targetBigramPos is found. 
+ bool removeBigram(const int bigramListPos, const int bigramTargetPos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicBigramListPolicy); + + static const int CONTINUING_BIGRAM_LINK_COUNT_LIMIT; + static const int BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT; + + const DictionaryHeaderStructurePolicy *const mHeaderPolicy; + BufferWithExtendableBuffer *const mBuffer; + const DictionaryShortcutsStructurePolicy *const mShortcutPolicy; + const bool mIsDecayingDict; + + // Follow bigram link and return the position of bigram target PtNode that is currently valid. + int followBigramLinkAndGetCurrentBigramPtNodePos(const int originalBigramPos) const; + + bool updateProbabilityForDecay(const BigramListReadWriteUtils::BigramFlags bigramFlags, + const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const; +}; +} // namespace latinime +#endif // LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H
diff --git a/src/aosp/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp b/src/aosp/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp new file mode 100644 index 0000000..ff80dd2 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp
@@ -0,0 +1,53 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h" + +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h" +#include "suggest/policyimpl/dictionary/patricia_trie_policy.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" + +namespace latinime { + +/* static */ DictionaryStructureWithBufferPolicy *DictionaryStructureWithBufferPolicyFactory + ::newDictionaryStructureWithBufferPolicy(const char *const path, const int bufOffset, + const int size, const bool isUpdatable) { + // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of + // impl classes of DictionaryStructureWithBufferPolicy. 
+ const MmappedBuffer *const mmapedBuffer = MmappedBuffer::openBuffer(path, bufOffset, size, + isUpdatable); + if (!mmapedBuffer) { + return 0; + } + switch (FormatUtils::detectFormatVersion(mmapedBuffer->getBuffer(), + mmapedBuffer->getBufferSize())) { + case FormatUtils::VERSION_2: + return new PatriciaTriePolicy(mmapedBuffer); + case FormatUtils::VERSION_3: + return new DynamicPatriciaTriePolicy(mmapedBuffer); + default: + AKLOGE("DICT: dictionary format is unknown, bad magic number"); + delete mmapedBuffer; + ASSERT(false); + return 0; + } +} + +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h b/src/aosp/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h new file mode 100644 index 0000000..8cebc3b --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h
@@ -0,0 +1,36 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H +#define LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" + +namespace latinime { + +class DictionaryStructureWithBufferPolicyFactory { + public: + static DictionaryStructureWithBufferPolicy *newDictionaryStructureWithBufferPolicy( + const char *const path, const int bufOffset, const int size, const bool isUpdatable); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryStructureWithBufferPolicyFactory); +}; +} // namespace latinime +#endif // LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp new file mode 100644 index 0000000..5724c5d --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
@@ -0,0 +1,191 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h" + +#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { + +bool DynamicPatriciaTrieGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) { + // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless + // children. + bool isUselessPtNode = !node->isTerminal(); + if (node->isTerminal() && mIsDecayingDict) { + const int newProbability = + ForgettingCurveUtils::getEncodedProbabilityToSave(node->getProbability(), + mHeaderPolicy); + int writingPos = node->getProbabilityFieldPos(); + // Update probability. + if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition( + mBuffer, newProbability, &writingPos)) { + return false; + } + if (!ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { + isUselessPtNode = true; + } + } + if (mChildrenValue > 0) { + isUselessPtNode = false; + } else if (node->isTerminal()) { + // Remove children as all children are useless. 
+ int writingPos = node->getChildrenPosFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition( + mBuffer, NOT_A_DICT_POS /* childrenPosition */, &writingPos)) { + return false; + } + } + if (isUselessPtNode) { + // Current PtNode is no longer needed. Mark it as deleted. + if (!mWritingHelper->markNodeAsDeleted(node)) { + return false; + } + } else { + mValueStack.back() += 1; + if (node->isTerminal()) { + mValidUnigramCount += 1; + } + } + return true; +} + +bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability + ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) { + if (!node->isDeleted()) { + int pos = node->getBigramsPos(); + if (pos != NOT_A_DICT_POS) { + int bigramEntryCount = 0; + if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos, + &bigramEntryCount)) { + return false; + } + mValidBigramEntryCount += bigramEntryCount; + } + } + return true; +} + +// Writes dummy PtNode array size when the head of PtNode array is read. +bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onDescend(const int ptNodeArrayPos) { + mValidPtNodeCount = 0; + int writingPos = mBufferToWrite->getTailPosition(); + mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.insert( + DynamicPatriciaTrieWritingHelper::PtNodeArrayPositionRelocationMap::value_type( + ptNodeArrayPos, writingPos)); + // Writes dummy PtNode array size because arrays can have a forward link or needles PtNodes. + // This field will be updated later in onReadingPtNodeArrayTail() with actual PtNode count. + mPtNodeArraySizeFieldPos = writingPos; + return DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition( + mBufferToWrite, 0 /* arraySize */, &writingPos); +} + +// Write PtNode array terminal and actual PtNode array size. 
+bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onReadingPtNodeArrayTail() { + int writingPos = mBufferToWrite->getTailPosition(); + // Write PtNode array terminal. + if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition( + mBufferToWrite, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + // Write actual PtNode array size. + if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition( + mBufferToWrite, mValidPtNodeCount, &mPtNodeArraySizeFieldPos)) { + return false; + } + return true; +} + +// Write valid PtNode to buffer and memorize mapping from the old position to the new position. +bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) { + if (node->isDeleted()) { + // Current PtNode is not written in new buffer because it has been deleted. + mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( + DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::value_type( + node->getHeadPos(), NOT_A_DICT_POS)); + return true; + } + int writingPos = mBufferToWrite->getTailPosition(); + mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( + DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::value_type( + node->getHeadPos(), writingPos)); + mValidPtNodeCount++; + // Writes current PtNode. + return mWritingHelper->writePtNodeToBufferByCopyingPtNodeInfo(mBufferToWrite, node, + node->getParentPos(), nodeCodePoints, node->getCodePointCount(), + node->getProbability(), &writingPos); +} + +bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields + ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) { + // Updates parent position. 
+ int parentPos = node->getParentPos(); + if (parentPos != NOT_A_DICT_POS) { + DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::const_iterator it = + mDictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos); + if (it != mDictPositionRelocationMap->mPtNodePositionRelocationMap.end()) { + parentPos = it->second; + } + } + int writingPos = node->getHeadPos() + DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE; + // Write updated parent offset. + if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(mBufferToWrite, + parentPos, node->getHeadPos(), &writingPos)) { + return false; + } + + // Updates children position. + int childrenPos = node->getChildrenPos(); + if (childrenPos != NOT_A_DICT_POS) { + DynamicPatriciaTrieWritingHelper::PtNodeArrayPositionRelocationMap::const_iterator it = + mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos); + if (it != mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) { + childrenPos = it->second; + } + } + writingPos = node->getChildrenPosFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(mBufferToWrite, + childrenPos, &writingPos)) { + return false; + } + + // Updates bigram target PtNode positions in the bigram list. + int bigramsPos = node->getBigramsPos(); + if (bigramsPos != NOT_A_DICT_POS) { + int bigramEntryCount; + if (!mBigramPolicy->updateAllBigramTargetPtNodePositions(&bigramsPos, + &mDictPositionRelocationMap->mPtNodePositionRelocationMap, &bigramEntryCount)) { + return false; + } + mBigramCount += bigramEntryCount; + } + if (node->isTerminal()) { + mUnigramCount++; + } + + return true; +} + +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h new file mode 100644 index 0000000..9755120 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_GC_EVENT_LISTENERS_H
+#define LATINIME_DYNAMIC_PATRICIA_TRIE_GC_EVENT_LISTENERS_H
+
+#include <vector>
+
+#include "defines.h"
+#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h"
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h"
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h"
+#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "utils/hash_map_compat.h"
+
+namespace latinime {
+
+class DictionaryHeaderStructurePolicy;
+
+// Groups the traversing event listeners declared below; every nested class derives from
+// DynamicPatriciaTrieReadingHelper::TraversingEventListener.
+class DynamicPatriciaTrieGcEventListeners {
+ public:
+    // Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or
+    // not and marks useless PtNodes as deleted. Such deleted PtNodes will be discarded in the GC.
+    // TODO: Concatenate non-terminal PtNodes.
+    class TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+        : public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
+     public:
+        TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
+                const DictionaryHeaderStructurePolicy *const headerPolicy,
+                DynamicPatriciaTrieWritingHelper *const writingHelper,
+                BufferWithExtendableBuffer *const buffer, const bool isDecayingDict)
+                : mHeaderPolicy(headerPolicy), mWritingHelper(writingHelper), mBuffer(buffer),
+                  mIsDecayingDict(isDecayingDict), mValueStack(), mChildrenValue(0),
+                  mValidUnigramCount(0) {}
+
+        ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
+
+        // Restores mChildrenValue from the top of the value stack and pops it. Returns false
+        // when the stack is empty, i.e. there is no entry pushed by onDescend() left to pop.
+        bool onAscend() {
+            if (mValueStack.empty()) {
+                return false;
+            }
+            mChildrenValue = mValueStack.back();
+            mValueStack.pop_back();
+            return true;
+        }
+
+        // Pushes a 0 entry onto the value stack and resets mChildrenValue. ptNodeArrayPos is
+        // not used.
+        bool onDescend(const int ptNodeArrayPos) {
+            mValueStack.push_back(0);
+            mChildrenValue = 0;
+            return true;
+        }
+
+        bool onReadingPtNodeArrayTail() { return true; }
+
+        bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
+                const int *const nodeCodePoints);
+
+        int getValidUnigramCount() const {
+            return mValidUnigramCount;
+        }
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(
+                TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
+
+        const DictionaryHeaderStructurePolicy *const mHeaderPolicy;
+        DynamicPatriciaTrieWritingHelper *const mWritingHelper;
+        BufferWithExtendableBuffer *const mBuffer;
+        const bool mIsDecayingDict;
+        std::vector<int> mValueStack;
+        int mChildrenValue;
+        int mValidUnigramCount;
+    };
+
+    // Updates all bigram entries that are held by valid PtNodes. This removes useless bigram
+    // entries.
+    class TraversePolicyToUpdateBigramProbability
+        : public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
+     public:
+        TraversePolicyToUpdateBigramProbability(
+                DynamicBigramListPolicy *const bigramPolicy)
+                : mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {}
+
+        bool onAscend() { return true; }
+
+        bool onDescend(const int ptNodeArrayPos) { return true; }
+
+        bool onReadingPtNodeArrayTail() { return true; }
+
+        bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
+                const int *const nodeCodePoints);
+
+        int getValidBigramEntryCount() const {
+            return mValidBigramEntryCount;
+        }
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability);
+
+        DynamicBigramListPolicy *const mBigramPolicy;
+        int mValidBigramEntryCount;
+    };
+
+    // NOTE(review): judging by its name and members, this listener writes valid PtNodes to
+    // mBufferToWrite and records where they land in the relocation map; its callbacks are
+    // defined out of line -- confirm against the .cpp file.
+    class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+        : public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
+     public:
+        TraversePolicyToPlaceAndWriteValidPtNodesToBuffer(
+                DynamicPatriciaTrieWritingHelper *const writingHelper,
+                BufferWithExtendableBuffer *const bufferToWrite,
+                DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const
+                        dictPositionRelocationMap)
+                : mWritingHelper(writingHelper), mBufferToWrite(bufferToWrite),
+                  mDictPositionRelocationMap(dictPositionRelocationMap), mValidPtNodeCount(0),
+                  mPtNodeArraySizeFieldPos(NOT_A_DICT_POS) {};
+
+        bool onAscend() { return true; }
+
+        bool onDescend(const int ptNodeArrayPos);
+
+        bool onReadingPtNodeArrayTail();
+
+        bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
+                const int *const nodeCodePoints);
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToPlaceAndWriteValidPtNodesToBuffer);
+
+        DynamicPatriciaTrieWritingHelper *const mWritingHelper;
+        BufferWithExtendableBuffer *const mBufferToWrite;
+        DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const
+                mDictPositionRelocationMap;
+        int mValidPtNodeCount;
+        int mPtNodeArraySizeFieldPos;
+    };
+
+    // NOTE(review): judging by its name and members, this listener rewrites position fields
+    // using the relocation map while counting unigrams and bigrams; onVisitingPtNode() is
+    // defined out of line -- confirm against the .cpp file.
+    class TraversePolicyToUpdateAllPositionFields
+        : public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
+     public:
+        TraversePolicyToUpdateAllPositionFields(
+                DynamicPatriciaTrieWritingHelper *const writingHelper,
+                DynamicBigramListPolicy *const bigramPolicy,
+                BufferWithExtendableBuffer *const bufferToWrite,
+                const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const
+                        dictPositionRelocationMap)
+                : mWritingHelper(writingHelper), mBigramPolicy(bigramPolicy),
+                  mBufferToWrite(bufferToWrite),
+                  mDictPositionRelocationMap(dictPositionRelocationMap), mUnigramCount(0),
+                  mBigramCount(0) {};
+
+        bool onAscend() { return true; }
+
+        bool onDescend(const int ptNodeArrayPos) { return true; }
+
+        bool onReadingPtNodeArrayTail() { return true; }
+
+        bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
+                const int *const nodeCodePoints);
+
+        int getUnigramCount() const {
+            return mUnigramCount;
+        }
+
+        int getBigramCount() const {
+            return mBigramCount;
+        }
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields);
+
+        DynamicPatriciaTrieWritingHelper *const mWritingHelper;
+        DynamicBigramListPolicy *const mBigramPolicy;
+        BufferWithExtendableBuffer *const mBufferToWrite;
+        const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const
+                mDictPositionRelocationMap;
+        int mUnigramCount;
+        int mBigramCount;
+    };
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieGcEventListeners);
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_GC_EVENT_LISTENERS_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp new file mode 100644 index 0000000..2fa3111 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
+
+#include "suggest/core/policy/dictionary_bigrams_structure_policy.h"
+#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
+#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+// Reads the PtNode at ptNodePos from mBuffer and caches its fields in the members. When
+// outCodePoints is non-null, the PtNode's code points are read into it (bounded by
+// maxCodePointCount); otherwise they are only skipped. If the PtNode is flagged as moved, the
+// PtNode at the position stored in its parent position field is read recursively, so the
+// members finally describe that PtNode, except mSiblingPos and mBigramLinkedNodePos, which
+// keep the values set for the first PtNode read. An out-of-range ptNodePos is logged and all
+// members are reset through invalidatePtNodeInfo().
+void DynamicPatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode(
+        const int ptNodePos, const int maxCodePointCount, int *const outCodePoints) {
+    if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) {
+        // Reading invalid position because of bug or broken dictionary.
+        AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d",
+                ptNodePos, mBuffer->getTailPosition());
+        ASSERT(false);
+        invalidatePtNodeInfo();
+        return;
+    }
+    const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos);
+    const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+    int pos = ptNodePos;
+    mHeadPos = ptNodePos;
+    if (usesAdditionalBuffer) {
+        // pos indexes dictBuf from here on, so drop the original buffer size from it.
+        pos -= mBuffer->getOriginalBufferSize();
+    }
+    mFlags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+    const int parentPosOffset =
+            DynamicPatriciaTrieReadingUtils::getParentPtNodePosOffsetAndAdvancePosition(dictBuf,
+                    &pos);
+    mParentPos = DynamicPatriciaTrieReadingUtils::getParentPtNodePos(parentPosOffset, mHeadPos);
+    if (outCodePoints != 0) {
+        mCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
+                dictBuf, mFlags, maxCodePointCount, outCodePoints, &pos);
+    } else {
+        mCodePointCount = PatriciaTrieReadingUtils::skipCharacters(
+                dictBuf, mFlags, MAX_WORD_LENGTH, &pos);
+    }
+    if (isTerminal()) {
+        mProbabilityFieldPos = pos;
+        if (usesAdditionalBuffer) {
+            mProbabilityFieldPos += mBuffer->getOriginalBufferSize();
+        }
+        mProbability = PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictBuf, &pos);
+    } else {
+        mProbabilityFieldPos = NOT_A_DICT_POS;
+        mProbability = NOT_A_PROBABILITY;
+    }
+    mChildrenPosFieldPos = pos;
+    if (usesAdditionalBuffer) {
+        mChildrenPosFieldPos += mBuffer->getOriginalBufferSize();
+    }
+    mChildrenPos = DynamicPatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
+            dictBuf, &pos);
+    if (usesAdditionalBuffer && mChildrenPos != NOT_A_DICT_POS) {
+        mChildrenPos += mBuffer->getOriginalBufferSize();
+    }
+    // mSiblingPos stays NOT_A_DICT_POS only until the first PtNode of a fetch has been read (it
+    // is assigned near the end of this function), so this is skipped for moved-to PtNodes.
+    if (mSiblingPos == NOT_A_DICT_POS) {
+        if (DynamicPatriciaTrieReadingUtils::isMoved(mFlags)) {
+            mBigramLinkedNodePos = mChildrenPos;
+        } else {
+            mBigramLinkedNodePos = NOT_A_DICT_POS;
+        }
+    }
+    if (usesAdditionalBuffer) {
+        // Undo the earlier subtraction by adding the original buffer size back to pos.
+        pos += mBuffer->getOriginalBufferSize();
+    }
+    if (PatriciaTrieReadingUtils::hasShortcutTargets(mFlags)) {
+        mShortcutPos = pos;
+        mShortcutsPolicy->skipAllShortcuts(&pos);
+    } else {
+        mShortcutPos = NOT_A_DICT_POS;
+    }
+    if (PatriciaTrieReadingUtils::hasBigrams(mFlags)) {
+        mBigramPos = pos;
+        mBigramsPolicy->skipAllBigrams(&pos);
+    } else {
+        mBigramPos = NOT_A_DICT_POS;
+    }
+    // Update siblingPos if needed.
+    if (mSiblingPos == NOT_A_DICT_POS) {
+        // Sibling position is the tail position of current node.
+        mSiblingPos = pos;
+    }
+    // Read destination node if the read node is a moved node.
+    if (DynamicPatriciaTrieReadingUtils::isMoved(mFlags)) {
+        // The destination position is stored at the same place as the parent position.
+        // NOTE(review): nothing visible here bounds this recursion; a cycle of moved PtNodes
+        // in a broken dictionary would presumably recurse without end -- confirm that the
+        // writer can never produce such a chain.
+        fetchPtNodeInfoFromBufferAndProcessMovedPtNode(mParentPos, maxCodePointCount,
+                outCodePoints);
+    }
+}
+
+// Resets every cached PtNode field to its "no PtNode" value.
+void DynamicPatriciaTrieNodeReader::invalidatePtNodeInfo() {
+    mHeadPos = NOT_A_DICT_POS;
+    mFlags = 0;
+    mParentPos = NOT_A_DICT_POS;
+    mCodePointCount = 0;
+    mProbabilityFieldPos = NOT_A_DICT_POS;
+    mProbability = NOT_A_PROBABILITY;
+    mChildrenPosFieldPos = NOT_A_DICT_POS;
+    mChildrenPos = NOT_A_DICT_POS;
+    mBigramLinkedNodePos = NOT_A_DICT_POS;
+    mShortcutPos = NOT_A_DICT_POS;
+    mBigramPos = NOT_A_DICT_POS;
+    mSiblingPos = NOT_A_DICT_POS;
+}
+
+}
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h new file mode 100644 index 0000000..3b36d42 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_NODE_READER_H
+#define LATINIME_DYNAMIC_PATRICIA_TRIE_NODE_READER_H
+
+#include <stdint.h>
+
+#include "defines.h"
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
+#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+class DictionaryBigramsStructurePolicy;
+class DictionaryShortcutsStructurePolicy;
+
+/*
+ * This class is used for helping to read nodes of dynamic patricia trie. This class handles moved
+ * node and reads node attributes.
+ */
+class DynamicPatriciaTrieNodeReader {
+ public:
+    DynamicPatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer,
+            const DictionaryBigramsStructurePolicy *const bigramsPolicy,
+            const DictionaryShortcutsStructurePolicy *const shortcutsPolicy)
+            : mBuffer(buffer), mBigramsPolicy(bigramsPolicy),
+              mShortcutsPolicy(shortcutsPolicy), mHeadPos(NOT_A_DICT_POS), mFlags(0),
+              mParentPos(NOT_A_DICT_POS), mCodePointCount(0), mProbabilityFieldPos(NOT_A_DICT_POS),
+              mProbability(NOT_A_PROBABILITY), mChildrenPosFieldPos(NOT_A_DICT_POS),
+              mChildrenPos(NOT_A_DICT_POS), mBigramLinkedNodePos(NOT_A_DICT_POS),
+              mShortcutPos(NOT_A_DICT_POS), mBigramPos(NOT_A_DICT_POS),
+              mSiblingPos(NOT_A_DICT_POS) {}
+
+    ~DynamicPatriciaTrieNodeReader() {}
+
+    // Reads PtNode information from dictionary buffer and updates members with the information.
+    AK_FORCE_INLINE void fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) {
+        fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(ptNodePos ,
+                0 /* maxCodePointCount */, 0 /* outCodePoints */);
+    }
+
+    // Same as above, and additionally hands outCodePoints and maxCodePointCount to the private
+    // fetch routine so that it can return the PtNode's code points. mSiblingPos and
+    // mBigramLinkedNodePos are reset to NOT_A_DICT_POS before every fetch.
+    AK_FORCE_INLINE void fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(
+            const int ptNodePos, const int maxCodePointCount, int *const outCodePoints) {
+        mSiblingPos = NOT_A_DICT_POS;
+        mBigramLinkedNodePos = NOT_A_DICT_POS;
+        fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos, maxCodePointCount, outCodePoints);
+    }
+
+    // HeadPos is different from NodePos when the current PtNode is a moved PtNode.
+    AK_FORCE_INLINE int getHeadPos() const {
+        return mHeadPos;
+    }
+
+    // Flags
+    AK_FORCE_INLINE bool isDeleted() const {
+        return DynamicPatriciaTrieReadingUtils::isDeleted(mFlags);
+    }
+
+    // True when the children position held for this PtNode is not NOT_A_DICT_POS.
+    AK_FORCE_INLINE bool hasChildren() const {
+        return mChildrenPos != NOT_A_DICT_POS;
+    }
+
+    AK_FORCE_INLINE bool isTerminal() const {
+        return PatriciaTrieReadingUtils::isTerminal(mFlags);
+    }
+
+    AK_FORCE_INLINE bool isBlacklisted() const {
+        return PatriciaTrieReadingUtils::isBlacklisted(mFlags);
+    }
+
+    AK_FORCE_INLINE bool isNotAWord() const {
+        return PatriciaTrieReadingUtils::isNotAWord(mFlags);
+    }
+
+    // Parent node position
+    AK_FORCE_INLINE int getParentPos() const {
+        return mParentPos;
+    }
+
+    // Number of code points
+    AK_FORCE_INLINE uint8_t getCodePointCount() const {
+        return mCodePointCount;
+    }
+
+    // Probability
+    AK_FORCE_INLINE int getProbabilityFieldPos() const {
+        return mProbabilityFieldPos;
+    }
+
+    AK_FORCE_INLINE int getProbability() const {
+        return mProbability;
+    }
+
+    // Children PtNode array position
+    AK_FORCE_INLINE int getChildrenPosFieldPos() const {
+        return mChildrenPosFieldPos;
+    }
+
+    AK_FORCE_INLINE int getChildrenPos() const {
+        return mChildrenPos;
+    }
+
+    // Bigram linked node position.
+    AK_FORCE_INLINE int getBigramLinkedNodePos() const {
+        return mBigramLinkedNodePos;
+    }
+
+    // Shortcutlist position
+    AK_FORCE_INLINE int getShortcutPos() const {
+        return mShortcutPos;
+    }
+
+    // Bigrams position
+    AK_FORCE_INLINE int getBigramsPos() const {
+        return mBigramPos;
+    }
+
+    // Sibling node position
+    AK_FORCE_INLINE int getSiblingNodePos() const {
+        return mSiblingPos;
+    }
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieNodeReader);
+
+    const BufferWithExtendableBuffer *const mBuffer;
+    const DictionaryBigramsStructurePolicy *const mBigramsPolicy;
+    const DictionaryShortcutsStructurePolicy *const mShortcutsPolicy;
+    int mHeadPos;
+    DynamicPatriciaTrieReadingUtils::NodeFlags mFlags;
+    int mParentPos;
+    uint8_t mCodePointCount;
+    int mProbabilityFieldPos;
+    int mProbability;
+    int mChildrenPosFieldPos;
+    int mChildrenPos;
+    int mBigramLinkedNodePos;
+    int mShortcutPos;
+    int mBigramPos;
+    int mSiblingPos;
+
+    // Worker behind the public fetch methods; declared here, defined out of line.
+    void fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos,
+            const int maxCodePointCount, int *const outCodePoints);
+
+    // NOTE(review): presumably resets the cached PtNode fields -- defined out of line, confirm.
+    void invalidatePtNodeInfo();
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_NODE_READER_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp new file mode 100644 index 0000000..a8ea69f --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
@@ -0,0 +1,373 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h" + +#include <cstdio> +#include <cstring> +#include <ctime> + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" +#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" + +namespace latinime { + +// Note that these are corresponding definitions in Java side in BinaryDictionaryTests and +// BinaryDictionaryDecayingTests. 
+const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; +const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; +const char *const DynamicPatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; +const char *const DynamicPatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; +const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY = + "SET_NEEDS_TO_DECAY_FOR_TESTING"; +const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024; +const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = + DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024; + +void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPos()); + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); + while (!readingHelper.isEnd()) { + childDicNodes->pushLeavingChild(dicNode, nodeReader->getHeadPos(), + nodeReader->getChildrenPos(), nodeReader->getProbability(), + nodeReader->isTerminal() && !nodeReader->isDeleted(), + nodeReader->hasChildren(), nodeReader->isBlacklisted() || nodeReader->isNotAWord(), + nodeReader->getCodePointCount(), readingHelper.getMergedNodeCodePoints()); + readingHelper.readNextSiblingNode(); + } +} + +int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( + const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const { + // This method traverses parent nodes from the terminal by following parent pointers; thus, + // node code points are stored in the buffer in the reverse order. 
+ int reverseCodePoints[maxCodePointCount]; + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + // First, read the terminal node and get its probability. + readingHelper.initWithPtNodePos(ptNodePos); + if (!readingHelper.isValidTerminalNode()) { + // Node at the ptNodePos is not a valid terminal node. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; + } + // Store terminal node probability. + *outUnigramProbability = readingHelper.getNodeReader()->getProbability(); + // Then, following parent node link to the dictionary root and fetch node code points. + while (!readingHelper.isEnd()) { + if (readingHelper.getTotalCodePointCount() > maxCodePointCount) { + // The ptNodePos is not a valid terminal node position in the dictionary. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; + } + // Store node code points to buffer in the reverse order. + readingHelper.fetchMergedNodeCodePointsInReverseOrder( + readingHelper.getPrevTotalCodePointCount(), reverseCodePoints); + // Follow parent node toward the root node. + readingHelper.readParentNode(); + } + if (readingHelper.isError()) { + // The node position or the dictionary is invalid. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; + } + // Reverse the stored code points to output them. + const int codePointCount = readingHelper.getTotalCodePointCount(); + for (int i = 0; i < codePointCount; ++i) { + outCodePoints[i] = reverseCodePoints[codePointCount - i - 1]; + } + return codePointCount; +} + +int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const { + int searchCodePoints[length]; + for (int i = 0; i < length; ++i) { + searchCodePoints[i] = forceLowerCaseSearch ? 
CharUtils::toLowerCase(inWord[i]) : inWord[i]; + } + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); + while (!readingHelper.isEnd()) { + const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount(); + if (readingHelper.getTotalCodePointCount() > length + || !readingHelper.isMatchedCodePoint(0 /* index */, + searchCodePoints[matchedCodePointCount])) { + // Current node has too many code points or its first code point is different from + // target code point. Skip this node and read the next sibling node. + readingHelper.readNextSiblingNode(); + continue; + } + // Check following merged node code points. + const int nodeCodePointCount = nodeReader->getCodePointCount(); + for (int j = 1; j < nodeCodePointCount; ++j) { + if (!readingHelper.isMatchedCodePoint( + j, searchCodePoints[matchedCodePointCount + j])) { + // Different code point is found. The given word is not included in the dictionary. + return NOT_A_DICT_POS; + } + } + // All characters are matched. + if (length == readingHelper.getTotalCodePointCount()) { + // Terminal position is found. + return nodeReader->getHeadPos(); + } + if (!nodeReader->hasChildren()) { + return NOT_A_DICT_POS; + } + // Advance to the children nodes. + readingHelper.readChildNode(); + } + // If we already traversed the tree further than the word is long, there means + // there was no match (or we would have found it). 
+ return NOT_A_DICT_POS; +} + +int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability, + const int bigramProbability) const { + if (mHeaderPolicy.isDecayingDict()) { + return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability); + } else { + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (bigramProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } else { + return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, + bigramProbability); + } + } +} + +int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_PROBABILITY; + } + DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) { + return NOT_A_PROBABILITY; + } + return getProbability(nodeReader.getProbability(), NOT_A_PROBABILITY); +} + +int DynamicPatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + if (nodeReader.isDeleted()) { + return NOT_A_DICT_POS; + } + return nodeReader.getShortcutPos(); +} + +int DynamicPatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + if (nodeReader.isDeleted()) { + return 
NOT_A_DICT_POS; + } + return nodeReader.getBigramsPos(); +} + +bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int length, + const int probability) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); + return false; + } + if (mBufferWithExtendableBuffer.getTailPosition() + >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update."); + return false; + } + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); + bool addedNewUnigram = false; + if (writingHelper.addUnigramWord(&readingHelper, word, length, probability, + &addedNewUnigram)) { + if (addedNewUnigram) { + mUnigramCount++; + } + return true; + } else { + return false; + } +} + +bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0, + const int *const word1, const int length1, const int probability) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); + return false; + } + if (mBufferWithExtendableBuffer.getTailPosition() + >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update."); + return false; + } + const int word0Pos = getTerminalNodePositionOfWord(word0, length0, + false /* forceLowerCaseSearch */); + if (word0Pos == NOT_A_DICT_POS) { + return false; + } + const int word1Pos = getTerminalNodePositionOfWord(word1, length1, + false /* forceLowerCaseSearch */); + if (word1Pos == NOT_A_DICT_POS) { + return false; + } + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, 
&mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); + bool addedNewBigram = false; + if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) { + if (addedNewBigram) { + mBigramCount++; + } + return true; + } else { + return false; + } +} + +bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0, + const int *const word1, const int length1) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); + return false; + } + if (mBufferWithExtendableBuffer.getTailPosition() + >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update."); + return false; + } + const int word0Pos = getTerminalNodePositionOfWord(word0, length0, + false /* forceLowerCaseSearch */); + if (word0Pos == NOT_A_DICT_POS) { + return false; + } + const int word1Pos = getTerminalNodePositionOfWord(word1, length1, + false /* forceLowerCaseSearch */); + if (word1Pos == NOT_A_DICT_POS) { + return false; + } + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); + if (writingHelper.removeBigramWords(word0Pos, word1Pos)) { + mBigramCount--; + return true; + } else { + return false; + } +} + +void DynamicPatriciaTriePolicy::flush(const char *const filePath) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: flush() is called for non-updatable dictionary."); + return; + } + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy, false /* needsToDecay */); + writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount); +} + +void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return; + } + const bool needsToDecay = 
mHeaderPolicy.isDecayingDict() + && (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( + false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy)); + DynamicBigramListPolicy bigramListPolicyForGC(&mHeaderPolicy, &mBufferWithExtendableBuffer, + &mShortcutListPolicy, needsToDecay); + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &bigramListPolicyForGC, &mShortcutListPolicy, needsToDecay); + writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy); + mNeedsToDecayForTesting = false; +} + +bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); + return false; + } + if (mBufferWithExtendableBuffer.isNearSizeLimit()) { + // Additional buffer size is near the limit. + return true; + } else if (mHeaderPolicy.getExtendedRegionSize() + + mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() + > MAX_DICT_EXTENDED_REGION_SIZE) { + // Total extended region size exceeds the limit. + return true; + } else if (mBufferWithExtendableBuffer.getTailPosition() + >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS + && mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() > 0) { + // Needs to reduce dictionary size. 
+ return true; + } else if (mHeaderPolicy.isDecayingDict()) { + return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( + mindsBlockByGC, mUnigramCount, mBigramCount, &mHeaderPolicy); + } + return false; +} + +void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult, + const int maxResultLength) { + if (strncmp(query, UNIGRAM_COUNT_QUERY, maxResultLength) == 0) { + snprintf(outResult, maxResultLength, "%d", mUnigramCount); + } else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) { + snprintf(outResult, maxResultLength, "%d", mBigramCount); + } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, maxResultLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT : + static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE)); + } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, maxResultLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT : + static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE)); + } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, maxResultLength) == 0) { + mNeedsToDecayForTesting = true; + } +} + +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h new file mode 100644 index 0000000..be97ee1 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H
+#define LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H
+
+#include "defines.h"
+#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
+#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h"
+#include "suggest/policyimpl/dictionary/header/header_policy.h"
+#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h"
+#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h"
+
+namespace latinime {
+
+class DicNode;
+class DicNodeVector;
+
+/*
+ * Dictionary structure policy for a dynamic patricia trie stored in an MmappedBuffer. The header
+ * part of the buffer is handled by HeaderPolicy and the rest is wrapped in a
+ * BufferWithExtendableBuffer that the shortcut and bigram list policies operate on.
+ */
+class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
+ public:
+    // Takes ownership of buffer; it is deleted in the destructor. Note that the initializer list
+    // relies on the member declaration order below (mBuffer and mHeaderPolicy come first).
+    DynamicPatriciaTriePolicy(const MmappedBuffer *const buffer)
+            : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer(), buffer->getBufferSize()),
+              mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(),
+                      mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
+              mShortcutListPolicy(&mBufferWithExtendableBuffer),
+              mBigramListPolicy(&mHeaderPolicy, &mBufferWithExtendableBuffer, &mShortcutListPolicy,
+                      mHeaderPolicy.isDecayingDict()),
+              mUnigramCount(mHeaderPolicy.getUnigramCount()),
+              mBigramCount(mHeaderPolicy.getBigramCount()), mNeedsToDecayForTesting(false) {}
+
+    ~DynamicPatriciaTriePolicy() {
+        delete mBuffer;
+    }
+
+    // The root position is fixed at 0.
+    AK_FORCE_INLINE int getRootPosition() const {
+        return 0;
+    }
+
+    void createAndGetAllChildNodes(const DicNode *const dicNode,
+            DicNodeVector *const childDicNodes) const;
+
+    int getCodePointsAndProbabilityAndReturnCodePointCount(
+            const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints,
+            int *const outUnigramProbability) const;
+
+    int getTerminalNodePositionOfWord(const int *const inWord,
+            const int length, const bool forceLowerCaseSearch) const;
+
+    int getProbability(const int unigramProbability, const int bigramProbability) const;
+
+    int getUnigramProbabilityOfPtNode(const int ptNodePos) const;
+
+    int getShortcutPositionOfPtNode(const int ptNodePos) const;
+
+    int getBigramsPositionOfPtNode(const int ptNodePos) const;
+
+    const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
+        return &mHeaderPolicy;
+    }
+
+    const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const {
+        return &mBigramListPolicy;
+    }
+
+    const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
+        return &mShortcutListPolicy;
+    }
+
+    bool addUnigramWord(const int *const word, const int length, const int probability);
+
+    bool addBigramWords(const int *const word0, const int length0, const int *const word1,
+            const int length1, const int probability);
+
+    bool removeBigramWords(const int *const word0, const int length0, const int *const word1,
+            const int length1);
+
+    void flush(const char *const filePath);
+
+    void flushWithGC(const char *const filePath);
+
+    bool needsToRunGC(const bool mindsBlockByGC) const;
+
+    // Handles property queries such as unigram/bigram counts and their maximum values, writing
+    // the answer into outResult. One query only sets the needs-to-decay flag used for testing.
+    void getProperty(const char *const query, char *const outResult,
+            const int maxResultLength);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy);
+
+    static const char *const UNIGRAM_COUNT_QUERY;
+    static const char *const BIGRAM_COUNT_QUERY;
+    static const char *const MAX_UNIGRAM_COUNT_QUERY;
+    static const char *const MAX_BIGRAM_COUNT_QUERY;
+    static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY;
+    static const int MAX_DICT_EXTENDED_REGION_SIZE;
+    static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
+
+    const MmappedBuffer *const mBuffer;
+    const HeaderPolicy mHeaderPolicy;
+    BufferWithExtendableBuffer mBufferWithExtendableBuffer;
+    DynamicShortcutListPolicy mShortcutListPolicy;
+    DynamicBigramListPolicy mBigramListPolicy;
+    int mUnigramCount;
+    int mBigramCount;
+    // Set through SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY and cleared at the end of flushWithGC().
+    bool mNeedsToDecayForTesting;
+};
+} // namespace latinime
+#endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp new file mode 100644 index 0000000..f108c21 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h"
+
+#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+// To avoid infinite loop caused by invalid or malicious forward links.
+const int DynamicPatriciaTrieReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
+const int DynamicPatriciaTrieReadingHelper::MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
+const size_t DynamicPatriciaTrieReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH;
+
+// Visits all PtNodes in post-order depth first manner.
+// For example, visits c -> b -> y -> x -> a for the following dictionary:
+// a _ b _ c
+//   \ x _ y
+// Returns false if a listener callback fails or a reading error has been detected.
+bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPostorderDepthFirstManner(
+        TraversingEventListener *const listener) {
+    bool alreadyVisitedChildren = false;
+    // Descend from the root to the root PtNode array.
+    if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
+        return false;
+    }
+    while (!isEnd()) {
+        if (!alreadyVisitedChildren) {
+            if (mNodeReader.hasChildren()) {
+                // Move to the first child.
+                if (!listener->onDescend(mNodeReader.getChildrenPos())) {
+                    return false;
+                }
+                pushReadingStateToStack();
+                readChildNode();
+            } else {
+                alreadyVisitedChildren = true;
+            }
+        } else {
+            if (!listener->onVisitingPtNode(&mNodeReader, mMergedNodeCodePoints)) {
+                return false;
+            }
+            readNextSiblingNode();
+            if (isEnd()) {
+                // All PtNodes in current linked PtNode arrays have been visited.
+                // Return to the parent.
+                if (!listener->onReadingPtNodeArrayTail()) {
+                    return false;
+                }
+                if (mReadingStateStack.empty()) {
+                    break;
+                }
+                if (!listener->onAscend()) {
+                    return false;
+                }
+                popReadingStateFromStack();
+                alreadyVisitedChildren = true;
+            } else {
+                // Process sibling PtNode.
+                alreadyVisitedChildren = false;
+            }
+        }
+    }
+    // Ascend from the root PtNode array to the root.
+    if (!listener->onAscend()) {
+        return false;
+    }
+    return !isError();
+}
+
+// Visits all PtNodes in PtNode array level pre-order depth first manner, which is the same order
+// that PtNodes are written in the dictionary buffer.
+// For example, visits a -> b -> x -> c -> y for the following dictionary:
+// a _ b _ c
+//   \ x _ y
+// Returns false if a listener callback fails or a reading error has been detected.
+bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+        TraversingEventListener *const listener) {
+    bool alreadyVisitedAllPtNodesInArray = false;
+    bool alreadyVisitedChildren = false;
+    // Descend from the root to the root PtNode array.
+    if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
+        return false;
+    }
+    if (isEnd()) {
+        // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array.
+        if (!listener->onReadingPtNodeArrayTail()) {
+            return false;
+        }
+    }
+    pushReadingStateToStack();
+    while (!isEnd()) {
+        if (alreadyVisitedAllPtNodesInArray) {
+            if (alreadyVisitedChildren) {
+                // Move to next sibling PtNode's children.
+                readNextSiblingNode();
+                if (isEnd()) {
+                    // Return to the parent PTNode.
+                    if (!listener->onAscend()) {
+                        return false;
+                    }
+                    if (mReadingStateStack.empty()) {
+                        break;
+                    }
+                    popReadingStateFromStack();
+                    alreadyVisitedChildren = true;
+                    alreadyVisitedAllPtNodesInArray = true;
+                } else {
+                    alreadyVisitedChildren = false;
+                }
+            } else {
+                if (mNodeReader.hasChildren()) {
+                    // Move to the first child.
+                    if (!listener->onDescend(mNodeReader.getChildrenPos())) {
+                        return false;
+                    }
+                    pushReadingStateToStack();
+                    readChildNode();
+                    // Push state to return the head of PtNode array.
+                    pushReadingStateToStack();
+                    alreadyVisitedAllPtNodesInArray = false;
+                    alreadyVisitedChildren = false;
+                } else {
+                    alreadyVisitedChildren = true;
+                }
+            }
+        } else {
+            if (!listener->onVisitingPtNode(&mNodeReader, mMergedNodeCodePoints)) {
+                return false;
+            }
+            readNextSiblingNode();
+            if (isEnd()) {
+                if (!listener->onReadingPtNodeArrayTail()) {
+                    return false;
+                }
+                // Return to the head of current PtNode array.
+                popReadingStateFromStack();
+                alreadyVisitedAllPtNodesInArray = true;
+            }
+        }
+    }
+    popReadingStateFromStack();
+    // Ascend from the root PtNode array to the root.
+    if (!listener->onAscend()) {
+        return false;
+    }
+    return !isError();
+}
+
+// Read node array size and process empty node arrays. Nodes and arrays are counted up in this
+// method to avoid an infinite loop.
+// On failure, mIsError is set and the reading position is reset to NOT_A_DICT_POS.
+void DynamicPatriciaTrieReadingHelper::nextPtNodeArray() {
+    if (mReadingState.mPos < 0 || mReadingState.mPos >= mBuffer->getTailPosition()) {
+        // Reading invalid position because of a bug or a broken dictionary.
+        AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d",
+                mReadingState.mPos, mBuffer->getTailPosition());
+        ASSERT(false);
+        mIsError = true;
+        mReadingState.mPos = NOT_A_DICT_POS;
+        return;
+    }
+    mReadingState.mPosOfLastPtNodeArrayHead = mReadingState.mPos;
+    const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mReadingState.mPos);
+    const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+    // While reading from the additional buffer the position is made relative to its head.
+    if (usesAdditionalBuffer) {
+        mReadingState.mPos -= mBuffer->getOriginalBufferSize();
+    }
+    mReadingState.mNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
+            dictBuf, &mReadingState.mPos);
+    if (usesAdditionalBuffer) {
+        mReadingState.mPos += mBuffer->getOriginalBufferSize();
+    }
+    // Count up nodes and node arrays to avoid infinite loop.
+    mReadingState.mTotalNodeCount += mReadingState.mNodeCount;
+    mReadingState.mNodeArrayCount++;
+    if (mReadingState.mNodeCount < 0
+            || mReadingState.mTotalNodeCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP
+            || mReadingState.mNodeArrayCount > MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) {
+        // Invalid dictionary.
+        AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d"
+                ", nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d",
+                mReadingState.mNodeCount, mReadingState.mTotalNodeCount,
+                MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, mReadingState.mNodeArrayCount,
+                MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP);
+        ASSERT(false);
+        mIsError = true;
+        mReadingState.mPos = NOT_A_DICT_POS;
+        return;
+    }
+    if (mReadingState.mNodeCount == 0) {
+        // Empty node array. Try following forward link.
+        followForwardLink();
+    }
+}
+
+// Follow the forward link and read the next node array if exists.
+// The link value is an offset that is added to the position of the forward link field.
+void DynamicPatriciaTrieReadingHelper::followForwardLink() {
+    if (mReadingState.mPos < 0 || mReadingState.mPos >= mBuffer->getTailPosition()) {
+        // Reading invalid position because of bug or broken dictionary.
+        AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d",
+                mReadingState.mPos, mBuffer->getTailPosition());
+        ASSERT(false);
+        mIsError = true;
+        mReadingState.mPos = NOT_A_DICT_POS;
+        return;
+    }
+    const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mReadingState.mPos);
+    const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+    // While reading from the additional buffer the position is made relative to its head.
+    if (usesAdditionalBuffer) {
+        mReadingState.mPos -= mBuffer->getOriginalBufferSize();
+    }
+    const int forwardLinkPosition =
+            DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(dictBuf, mReadingState.mPos);
+    if (usesAdditionalBuffer) {
+        mReadingState.mPos += mBuffer->getOriginalBufferSize();
+    }
+    mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos;
+    if (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(forwardLinkPosition)) {
+        // Follow the forward link.
+        mReadingState.mPos += forwardLinkPosition;
+        nextPtNodeArray();
+    } else {
+        // All node arrays have been read.
+        mReadingState.mPos = NOT_A_DICT_POS;
+    }
+}
+
+} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h new file mode 100644 index 0000000..a71c069 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h
@@ -0,0 +1,295 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H
+#define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H
+
+#include <cstddef>
+#include <vector>
+
+#include "defines.h"
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
+#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+class DictionaryBigramsStructurePolicy;
+class DictionaryShortcutsStructurePolicy;
+
+/*
+ * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and
+ * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop.
+ */
+class DynamicPatriciaTrieReadingHelper {
+ public:
+    class TraversingEventListener {
+     public:
+        virtual ~TraversingEventListener() {}
+
+        // Returns whether the event was handled successfully.
+        virtual bool onAscend() = 0;
+
+        // Returns whether the event was handled successfully.
+        virtual bool onDescend(const int ptNodeArrayPos) = 0;
+
+        // Returns whether the event was handled successfully.
+        virtual bool onReadingPtNodeArrayTail() = 0;
+
+        // Returns whether the event was handled successfully.
+        virtual bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
+                const int *const nodeCodePoints) = 0;
+
+     protected:
+        TraversingEventListener() {}
+
+     private:
+        DISALLOW_COPY_AND_ASSIGN(TraversingEventListener);
+    };
+
+    DynamicPatriciaTrieReadingHelper(const BufferWithExtendableBuffer *const buffer,
+            const DictionaryBigramsStructurePolicy *const bigramsPolicy,
+            const DictionaryShortcutsStructurePolicy *const shortcutsPolicy)
+            : mIsError(false), mReadingState(), mBuffer(buffer),
+              mNodeReader(mBuffer, bigramsPolicy, shortcutsPolicy), mReadingStateStack() {}
+
+    ~DynamicPatriciaTrieReadingHelper() {}
+
+    AK_FORCE_INLINE bool isError() const {
+        return mIsError;
+    }
+
+    // True when there is no current PtNode, i.e. the reading position is NOT_A_DICT_POS.
+    AK_FORCE_INLINE bool isEnd() const {
+        return mReadingState.mPos == NOT_A_DICT_POS;
+    }
+
+    // Initialize reading state with the head position of a PtNode array.
+    AK_FORCE_INLINE void initWithPtNodeArrayPos(const int ptNodeArrayPos) {
+        if (ptNodeArrayPos == NOT_A_DICT_POS) {
+            mReadingState.mPos = NOT_A_DICT_POS;
+        } else {
+            mIsError = false;
+            mReadingState.mPos = ptNodeArrayPos;
+            mReadingState.mPrevTotalCodePointCount = 0;
+            mReadingState.mTotalNodeCount = 0;
+            mReadingState.mNodeArrayCount = 0;
+            mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS;
+            mReadingStateStack.clear();
+            nextPtNodeArray();
+            if (!isEnd()) {
+                fetchPtNodeInfo();
+            }
+        }
+    }
+
+    // Initialize reading state with the head position of a node.
+    AK_FORCE_INLINE void initWithPtNodePos(const int ptNodePos) {
+        if (ptNodePos == NOT_A_DICT_POS) {
+            mReadingState.mPos = NOT_A_DICT_POS;
+        } else {
+            mIsError = false;
+            mReadingState.mPos = ptNodePos;
+            mReadingState.mNodeCount = 1;
+            mReadingState.mPrevTotalCodePointCount = 0;
+            mReadingState.mTotalNodeCount = 1;
+            mReadingState.mNodeArrayCount = 1;
+            mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS;
+            mReadingState.mPosOfLastPtNodeArrayHead = NOT_A_DICT_POS;
+            mReadingStateStack.clear();
+            fetchPtNodeInfo();
+        }
+    }
+
+    AK_FORCE_INLINE const DynamicPatriciaTrieNodeReader* getNodeReader() const {
+        return &mNodeReader;
+    }
+
+    AK_FORCE_INLINE bool isValidTerminalNode() const {
+        return !isEnd() && !mNodeReader.isDeleted() && mNodeReader.isTerminal();
+    }
+
+    AK_FORCE_INLINE bool isMatchedCodePoint(const int index, const int codePoint) const {
+        return mMergedNodeCodePoints[index] == codePoint;
+    }
+
+    // Returns the code point count excluding the last read node's code points.
+    AK_FORCE_INLINE int getPrevTotalCodePointCount() const {
+        return mReadingState.mPrevTotalCodePointCount;
+    }
+
+    // Returns the code point count including the last read node's code points.
+    AK_FORCE_INLINE int getTotalCodePointCount() const {
+        return mReadingState.mPrevTotalCodePointCount + mNodeReader.getCodePointCount();
+    }
+
+    // Copies the current node's code points into outCodePoints from index, last code point first.
+    AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder(
+            const int index, int *const outCodePoints) const {
+        const int nodeCodePointCount = mNodeReader.getCodePointCount();
+        for (int i = 0; i < nodeCodePointCount; ++i) {
+            outCodePoints[index + i] = mMergedNodeCodePoints[nodeCodePointCount - 1 - i];
+        }
+    }
+
+    AK_FORCE_INLINE const int *getMergedNodeCodePoints() const {
+        return mMergedNodeCodePoints;
+    }
+
+    // Moves to the next PtNode, following the forward link once the current array is exhausted.
+    AK_FORCE_INLINE void readNextSiblingNode() {
+        mReadingState.mNodeCount -= 1;
+        mReadingState.mPos = mNodeReader.getSiblingNodePos();
+        if (mReadingState.mNodeCount <= 0) {
+            // All nodes in the current node array have been read.
+            followForwardLink();
+            if (!isEnd()) {
+                fetchPtNodeInfo();
+            }
+        } else {
+            fetchPtNodeInfo();
+        }
+    }
+
+    // Read the first child node of the current node.
+    AK_FORCE_INLINE void readChildNode() {
+        if (mNodeReader.hasChildren()) {
+            mReadingState.mPrevTotalCodePointCount += mNodeReader.getCodePointCount();
+            mReadingState.mTotalNodeCount = 0;
+            mReadingState.mNodeArrayCount = 0;
+            mReadingState.mPos = mNodeReader.getChildrenPos();
+            mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS;
+            // Read children node array.
+            nextPtNodeArray();
+            if (!isEnd()) {
+                fetchPtNodeInfo();
+            }
+        } else {
+            mReadingState.mPos = NOT_A_DICT_POS;
+        }
+    }
+
+    // Read the parent node of the current node.
+    AK_FORCE_INLINE void readParentNode() {
+        if (mNodeReader.getParentPos() != NOT_A_DICT_POS) {
+            mReadingState.mPrevTotalCodePointCount += mNodeReader.getCodePointCount();
+            mReadingState.mTotalNodeCount = 1;
+            mReadingState.mNodeArrayCount = 1;
+            mReadingState.mNodeCount = 1;
+            mReadingState.mPos = mNodeReader.getParentPos();
+            mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS;
+            mReadingState.mPosOfLastPtNodeArrayHead = NOT_A_DICT_POS;
+            fetchPtNodeInfo();
+        } else {
+            mReadingState.mPos = NOT_A_DICT_POS;
+        }
+    }
+
+    AK_FORCE_INLINE int getPosOfLastForwardLinkField() const {
+        return mReadingState.mPosOfLastForwardLinkField;
+    }
+
+    AK_FORCE_INLINE int getPosOfLastPtNodeArrayHead() const {
+        return mReadingState.mPosOfLastPtNodeArrayHead;
+    }
+
+    AK_FORCE_INLINE void reloadCurrentPtNodeInfo() {
+        if (!isEnd()) {
+            fetchPtNodeInfo();
+        }
+    }
+
+    bool traverseAllPtNodesInPostorderDepthFirstManner(TraversingEventListener *const listener);
+
+    bool traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+            TraversingEventListener *const listener);
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieReadingHelper);
+
+    class ReadingState {
+     public:
+        // Note that copy constructor and assignment operator are used for this class to use
+        // std::vector.
+        ReadingState() : mPos(NOT_A_DICT_POS), mNodeCount(0), mPrevTotalCodePointCount(0),
+                mTotalNodeCount(0), mNodeArrayCount(0), mPosOfLastForwardLinkField(NOT_A_DICT_POS),
+                mPosOfLastPtNodeArrayHead(NOT_A_DICT_POS) {}
+
+        int mPos;
+        // Node count of a node array.
+        int mNodeCount;
+        int mPrevTotalCodePointCount;
+        int mTotalNodeCount;
+        int mNodeArrayCount;
+        int mPosOfLastForwardLinkField;
+        int mPosOfLastPtNodeArrayHead;
+    };
+
+    static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP;
+    static const int MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP;
+    static const size_t MAX_READING_STATE_STACK_SIZE;
+
+    // TODO: Introduce error code to track what caused the error.
+    bool mIsError;
+    ReadingState mReadingState;
+    const BufferWithExtendableBuffer *const mBuffer;
+    DynamicPatriciaTrieNodeReader mNodeReader;
+    // Code points of the PtNode most recently read by fetchPtNodeInfo().
+    int mMergedNodeCodePoints[MAX_WORD_LENGTH];
+    std::vector<ReadingState> mReadingStateStack;
+
+    void nextPtNodeArray();
+
+    void followForwardLink();
+
+    AK_FORCE_INLINE void fetchPtNodeInfo() {
+        mNodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(mReadingState.mPos,
+                MAX_WORD_LENGTH, mMergedNodeCodePoints);
+        if (mNodeReader.getCodePointCount() <= 0) {
+            // Empty node is not allowed.
+            mIsError = true;
+            mReadingState.mPos = NOT_A_DICT_POS;
+        }
+    }
+
+    // Saves the current reading state; flags an error and ends reading if the stack is too deep.
+    AK_FORCE_INLINE void pushReadingStateToStack() {
+        if (mReadingStateStack.size() > MAX_READING_STATE_STACK_SIZE) {
+            AKLOGI("Reading state stack overflow. Max size: %zu", MAX_READING_STATE_STACK_SIZE);
+            ASSERT(false);
+            mIsError = true;
+            mReadingState.mPos = NOT_A_DICT_POS;
+        } else {
+            mReadingStateStack.push_back(mReadingState);
+        }
+    }
+
+    // Restores the most recently pushed state, or ends reading when the stack is empty.
+    AK_FORCE_INLINE void popReadingStateFromStack() {
+        if (mReadingStateStack.empty()) {
+            mReadingState.mPos = NOT_A_DICT_POS;
+        } else {
+            mReadingState = mReadingStateStack.back();
+            mReadingStateStack.pop_back();
+            if (!isEnd()) {
+                fetchPtNodeInfo();
+            }
+        }
+    }
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp new file mode 100644 index 0000000..d68446d --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
+
+#include "defines.h"
+#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+typedef DynamicPatriciaTrieReadingUtils DptReadingUtils;
+
+// The moved / deleted state of a PtNode is kept in the bits selected by MASK_MOVED.
+const DptReadingUtils::NodeFlags DptReadingUtils::MASK_MOVED = 0xC0;
+const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_NOT_MOVED = 0xC0;
+const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_MOVED = 0x40;
+const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_DELETED = 0x80;
+
+// TODO: Make DICT_OFFSET_ZERO_OFFSET = 0.
+// Currently, DICT_OFFSET_INVALID is 0 in Java side but offset can be 0 during GC. So, the maximum
+// value of offsets, which is 0x7FFFFF is used to represent 0 offset.
+const int DptReadingUtils::DICT_OFFSET_INVALID = 0;
+const int DptReadingUtils::DICT_OFFSET_ZERO_OFFSET = 0x7FFFFF;
+
+// Reads the forward link stored at pos. A local copy of pos is advanced, not the caller's value.
+/* static */ int DptReadingUtils::getForwardLinkPosition(const uint8_t *const buffer,
+        const int pos) {
+    int readingPos = pos;
+    return ByteArrayUtils::readSint24AndAdvancePosition(buffer, &readingPos);
+}
+
+// Reads the parent offset field at *pos and advances *pos past it.
+/* static */ int DptReadingUtils::getParentPtNodePosOffsetAndAdvancePosition(
+        const uint8_t *const buffer, int *const pos) {
+    return ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos);
+}
+
+// Converts a parent offset into the parent position, relative to ptNodePos.
+/* static */ int DptReadingUtils::getParentPtNodePos(const int parentOffset, const int ptNodePos) {
+    if (parentOffset == DICT_OFFSET_INVALID) {
+        // Invalid offset; there is no parent position to return.
+        return NOT_A_DICT_POS;
+    }
+    // DICT_OFFSET_ZERO_OFFSET is the stored representation of an offset of 0.
+    const int actualOffset = (parentOffset == DICT_OFFSET_ZERO_OFFSET) ? 0 : parentOffset;
+    return actualOffset + ptNodePos;
+}
+
+// Reads the children position field at *pos, advances *pos past it and returns the children
+// position, which is computed relative to the position where the field starts.
+/* static */ int DptReadingUtils::readChildrenPositionAndAdvancePosition(
+        const uint8_t *const buffer, int *const pos) {
+    const int fieldPos = *pos;
+    const int childrenOffset = ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos);
+    if (childrenOffset == DICT_OFFSET_INVALID) {
+        // The PtNode does not have children.
+        return NOT_A_DICT_POS;
+    }
+    if (childrenOffset == DICT_OFFSET_ZERO_OFFSET) {
+        return fieldPos;
+    }
+    return fieldPos + childrenOffset;
+}
+
+} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h new file mode 100644 index 0000000..67c3cc5 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_UTILS_H
+#define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_UTILS_H
+
+#include <stdint.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+// Static helpers for reading forward links, parent / children positions and PtNode flags.
+class DynamicPatriciaTrieReadingUtils {
+ public:
+    typedef uint8_t NodeFlags;
+
+    static const int DICT_OFFSET_INVALID;
+    static const int DICT_OFFSET_ZERO_OFFSET;
+
+    static int getForwardLinkPosition(const uint8_t *const buffer, const int pos);
+
+    // A forward link value of 0 is treated as "no link".
+    static AK_FORCE_INLINE bool isValidForwardLinkPosition(const int forwardLinkAddress) {
+        return forwardLinkAddress != 0;
+    }
+
+    static int getParentPtNodePosOffsetAndAdvancePosition(const uint8_t *const buffer,
+            int *const pos);
+
+    static int getParentPtNodePos(const int parentOffset, const int ptNodePos);
+
+    static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+    /**
+     * Node Flags
+     */
+    static AK_FORCE_INLINE bool isMoved(const NodeFlags flags) {
+        return (flags & MASK_MOVED) == FLAG_IS_MOVED;
+    }
+
+    static AK_FORCE_INLINE bool isDeleted(const NodeFlags flags) {
+        return (flags & MASK_MOVED) == FLAG_IS_DELETED;
+    }
+
+    // Returns originalFlags with the bits under MASK_MOVED replaced. isDeleted takes precedence
+    // over isMoved; when neither is set the bits are set to FLAG_IS_NOT_MOVED.
+    static AK_FORCE_INLINE NodeFlags updateAndGetFlags(const NodeFlags originalFlags,
+            const bool isMoved, const bool isDeleted) {
+        const NodeFlags otherFlags = originalFlags & (~MASK_MOVED);
+        if (isDeleted) {
+            return otherFlags | FLAG_IS_DELETED;
+        }
+        if (isMoved) {
+            return otherFlags | FLAG_IS_MOVED;
+        }
+        return otherFlags | FLAG_IS_NOT_MOVED;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieReadingUtils);
+
+    static const NodeFlags MASK_MOVED;
+    static const NodeFlags FLAG_IS_NOT_MOVED;
+    static const NodeFlags FLAG_IS_MOVED;
+    static const NodeFlags FLAG_IS_DELETED;
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_UTILS_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp new file mode 100644 index 0000000..052558b --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
@@ -0,0 +1,558 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" + +#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "utils/hash_map_compat.h" + +namespace latinime { + +const int DynamicPatriciaTrieWritingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; +// TODO: Make MAX_DICTIONARY_SIZE 8MB. 
+const size_t DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE = 2 * 1024 * 1024; + +bool DynamicPatriciaTrieWritingHelper::addUnigramWord( + DynamicPatriciaTrieReadingHelper *const readingHelper, + const int *const wordCodePoints, const int codePointCount, const int probability, + bool *const outAddedNewUnigram) { + int parentPos = NOT_A_DICT_POS; + while (!readingHelper->isEnd()) { + const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); + if (!readingHelper->isMatchedCodePoint(0 /* index */, + wordCodePoints[matchedCodePointCount])) { + // The first code point is different from target code point. Skip this node and read + // the next sibling node. + readingHelper->readNextSiblingNode(); + continue; + } + // Check following merged node code points. + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper->getNodeReader(); + const int nodeCodePointCount = nodeReader->getCodePointCount(); + for (int j = 1; j < nodeCodePointCount; ++j) { + const int nextIndex = matchedCodePointCount + j; + if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(j, + wordCodePoints[matchedCodePointCount + j])) { + *outAddedNewUnigram = true; + return reallocatePtNodeAndAddNewPtNodes(nodeReader, + readingHelper->getMergedNodeCodePoints(), j, + getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, + probability), + wordCodePoints + matchedCodePointCount, + codePointCount - matchedCodePointCount); + } + } + // All characters are matched. 
+ if (codePointCount == readingHelper->getTotalCodePointCount()) { + return setPtNodeProbability(nodeReader, probability, + readingHelper->getMergedNodeCodePoints(), outAddedNewUnigram); + } + if (!nodeReader->hasChildren()) { + *outAddedNewUnigram = true; + return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, + getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), + wordCodePoints + readingHelper->getTotalCodePointCount(), + codePointCount - readingHelper->getTotalCodePointCount()); + } + // Advance to the children nodes. + parentPos = nodeReader->getHeadPos(); + readingHelper->readChildNode(); + } + if (readingHelper->isError()) { + // The dictionary is invalid. + return false; + } + int pos = readingHelper->getPosOfLastForwardLinkField(); + *outAddedNewUnigram = true; + return createAndInsertNodeIntoPtNodeArray(parentPos, + wordCodePoints + readingHelper->getPrevTotalCodePointCount(), + codePointCount - readingHelper->getPrevTotalCodePointCount(), + getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), &pos); +} + +bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos, + const int probability, bool *const outAddedNewBigram) { + int mMergedNodeCodePoints[MAX_WORD_LENGTH]; + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); + nodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH, + mMergedNodeCodePoints); + // Move node to add bigram entry. + const int newNodePos = mBuffer->getTailPosition(); + if (!markNodeAsMovedAndSetPosition(&nodeReader, newNodePos, newNodePos)) { + return false; + } + int writingPos = newNodePos; + // Write a new PtNode using original PtNode's info to the tail of the dictionary in mBuffer. 
+ if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, &nodeReader, nodeReader.getParentPos(), + mMergedNodeCodePoints, nodeReader.getCodePointCount(), nodeReader.getProbability(), + &writingPos)) { + return false; + } + nodeReader.fetchNodeInfoInBufferFromPtNodePos(newNodePos); + if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) { + // Insert a new bigram entry into the existing bigram list. + int bigramListPos = nodeReader.getBigramsPos(); + return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos, + outAddedNewBigram); + } else { + // The PtNode doesn't have a bigram list. + *outAddedNewBigram = true; + // First, Write a bigram entry at the tail position of the PtNode. + if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) { + return false; + } + // Then, Mark as the PtNode having bigram list in the flags. + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + PatriciaTrieReadingUtils::createAndGetFlags(nodeReader.isBlacklisted(), + nodeReader.isNotAWord(), nodeReader.getProbability() != NOT_A_PROBABILITY, + nodeReader.getShortcutPos() != NOT_A_DICT_POS, true /* hasBigrams */, + nodeReader.getCodePointCount() > 1, CHILDREN_POSITION_FIELD_SIZE); + writingPos = newNodePos; + // Write updated flags into the moved PtNode's flags field. + return DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, updatedFlags, + &writingPos); + } +} + +// Remove a bigram relation from word0Pos to word1Pos. 
+bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, const int word1Pos) { + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(word0Pos); + if (nodeReader.getBigramsPos() == NOT_A_DICT_POS) { + return false; + } + return mBigramPolicy->removeBigram(nodeReader.getBigramsPos(), word1Pos); +} + +void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName, + const HeaderPolicy *const headerPolicy, const int unigramCount, const int bigramCount) { + BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); + const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + + mBuffer->getUsedAdditionalBufferSize(); + if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */, + false /* updatesLastDecayedTime */, unigramCount, bigramCount, extendedRegionSize)) { + return; + } + DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer); +} + +void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, + const char *const fileName, const HeaderPolicy *const headerPolicy) { + BufferWithExtendableBuffer newDictBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */, + MAX_DICTIONARY_SIZE); + int unigramCount = 0; + int bigramCount = 0; + if (mNeedsToDecay) { + ForgettingCurveUtils::sTimeKeeper.setCurrentTime(); + } + if (!runGC(rootPtNodeArrayPos, headerPolicy, &newDictBuffer, &unigramCount, &bigramCount)) { + return; + } + BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); + if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */, + mNeedsToDecay, unigramCount, bigramCount, 0 /* extendedRegionSize */)) { + return; + } + DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer); +} + +bool 
DynamicPatriciaTrieWritingHelper::markNodeAsDeleted( + const DynamicPatriciaTrieNodeReader *const nodeToUpdate) { + int pos = nodeToUpdate->getHeadPos(); + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPatriciaTrieReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, + true /* isDeleted */); + int writingPos = nodeToUpdate->getHeadPos(); + // Update flags. + return DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, updatedFlags, + &writingPos); +} + +bool DynamicPatriciaTrieWritingHelper::markNodeAsMovedAndSetPosition( + const DynamicPatriciaTrieNodeReader *const originalNode, const int movedPos, + const int bigramLinkedNodePos) { + int pos = originalNode->getHeadPos(); + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPatriciaTrieReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */, + false /* isDeleted */); + int writingPos = originalNode->getHeadPos(); + // Update flags. + if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, updatedFlags, + &writingPos)) { + return false; + } + // Update moved position, which is stored in the parent offset field. 
+ if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( + mBuffer, movedPos, originalNode->getHeadPos(), &writingPos)) { + return false; + } + // Update bigram linked node position, which is stored in the children position field. + int childrenPosFieldPos = originalNode->getChildrenPosFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition( + mBuffer, bigramLinkedNodePos, &childrenPosFieldPos)) { + return false; + } + if (originalNode->hasChildren()) { + // Update children's parent position. + DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); + readingHelper.initWithPtNodeArrayPos(originalNode->getChildrenPos()); + while (!readingHelper.isEnd()) { + int parentOffsetFieldPos = nodeReader->getHeadPos() + + DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE; + if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( + mBuffer, bigramLinkedNodePos, nodeReader->getHeadPos(), + &parentOffsetFieldPos)) { + // Parent offset cannot be written because of a bug or a broken dictionary; thus, + // we give up to update dictionary. + return false; + } + readingHelper.readNextSiblingNode(); + } + } + return true; +} + +// Write new PtNode at writingPos. +bool DynamicPatriciaTrieWritingHelper::writePtNodeWithFullInfoToBuffer( + BufferWithExtendableBuffer *const bufferToWrite, const bool isBlacklisted, + const bool isNotAWord, const int parentPos, const int *const codePoints, + const int codePointCount, const int probability, const int childrenPos, + const int originalBigramListPos, const int originalShortcutListPos, + int *const writingPos) { + const int nodePos = *writingPos; + // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the + // PtNode writing. 
+ if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(bufferToWrite, + 0 /* nodeFlags */, writingPos)) { + return false; + } + // Calculate a parent offset and write the offset. + if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(bufferToWrite, + parentPos, nodePos, writingPos)) { + return false; + } + // Write code points + if (!DynamicPatriciaTrieWritingUtils::writeCodePointsAndAdvancePosition(bufferToWrite, + codePoints, codePointCount, writingPos)) { + return false; + } + // Write probability when the probability is a valid probability, which means this node is + // terminal. + if (probability != NOT_A_PROBABILITY) { + if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(bufferToWrite, + probability, writingPos)) { + return false; + } + } + // Write children position + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(bufferToWrite, + childrenPos, writingPos)) { + return false; + } + // Copy shortcut list when the originalShortcutListPos is valid dictionary position. + if (originalShortcutListPos != NOT_A_DICT_POS) { + int fromPos = originalShortcutListPos; + if (!mShortcutPolicy->copyAllShortcutsAndReturnIfSucceededOrNot(bufferToWrite, &fromPos, + writingPos)) { + return false; + } + } + // Copy bigram list when the originalBigramListPos is valid dictionary position. + int bigramCount = 0; + if (originalBigramListPos != NOT_A_DICT_POS) { + int fromPos = originalBigramListPos; + if (!mBigramPolicy->copyAllBigrams(bufferToWrite, &fromPos, writingPos, &bigramCount)) { + return false; + } + } + // Create node flags and write them. 
+ PatriciaTrieReadingUtils::NodeFlags nodeFlags = + PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, + probability != NOT_A_PROBABILITY /* isTerminal */, + originalShortcutListPos != NOT_A_DICT_POS /* hasShortcutTargets */, + bigramCount > 0 /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */, + CHILDREN_POSITION_FIELD_SIZE); + int flagsFieldPos = nodePos; + if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(bufferToWrite, nodeFlags, + &flagsFieldPos)) { + return false; + } + return true; +} + +bool DynamicPatriciaTrieWritingHelper::writePtNodeToBuffer( + BufferWithExtendableBuffer *const bufferToWrite, const int parentPos, + const int *const codePoints, const int codePointCount, const int probability, + int *const writingPos) { + return writePtNodeWithFullInfoToBuffer(bufferToWrite, false /* isBlacklisted */, + false /* isNotAWord */, parentPos, codePoints, codePointCount, probability, + NOT_A_DICT_POS /* childrenPos */, NOT_A_DICT_POS /* originalBigramsPos */, + NOT_A_DICT_POS /* originalShortcutPos */, writingPos); +} + +bool DynamicPatriciaTrieWritingHelper::writePtNodeToBufferByCopyingPtNodeInfo( + BufferWithExtendableBuffer *const bufferToWrite, + const DynamicPatriciaTrieNodeReader *const originalNode, const int parentPos, + const int *const codePoints, const int codePointCount, const int probability, + int *const writingPos) { + return writePtNodeWithFullInfoToBuffer(bufferToWrite, originalNode->isBlacklisted(), + originalNode->isNotAWord(), parentPos, codePoints, codePointCount, probability, + originalNode->getChildrenPos(), originalNode->getBigramsPos(), + originalNode->getShortcutPos(), writingPos); +} + +bool DynamicPatriciaTrieWritingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, + const int *const nodeCodePoints, const int nodeCodePointCount, const int probability, + int *const forwardLinkFieldPos) { + const int newPtNodeArrayPos = mBuffer->getTailPosition(); + if 
(!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + newPtNodeArrayPos, forwardLinkFieldPos)) { + return false; + } + return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount, + probability); +} + +bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability( + const DynamicPatriciaTrieNodeReader *const originalPtNode, const int probability, + const int *const codePoints, bool *const outAddedNewUnigram) { + if (originalPtNode->isTerminal()) { + // Overwrites the probability. + *outAddedNewUnigram = false; + const int probabilityToWrite = getUpdatedProbability(originalPtNode->getProbability(), + probability); + int probabilityFieldPos = originalPtNode->getProbabilityFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer, + probabilityToWrite, &probabilityFieldPos)) { + return false; + } + } else { + // Make the node terminal and write the probability. + *outAddedNewUnigram = true; + int movedPos = mBuffer->getTailPosition(); + if (!markNodeAsMovedAndSetPosition(originalPtNode, movedPos, movedPos)) { + return false; + } + if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, originalPtNode, + originalPtNode->getParentPos(), codePoints, originalPtNode->getCodePointCount(), + getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), + &movedPos)) { + return false; + } + } + return true; +} + +bool DynamicPatriciaTrieWritingHelper::createChildrenPtNodeArrayAndAChildPtNode( + const DynamicPatriciaTrieNodeReader *const parentNode, const int probability, + const int *const codePoints, const int codePointCount) { + const int newPtNodeArrayPos = mBuffer->getTailPosition(); + int childrenPosFieldPos = parentNode->getChildrenPosFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(mBuffer, + newPtNodeArrayPos, &childrenPosFieldPos)) { + return false; + } + return 
createNewPtNodeArrayWithAChildPtNode(parentNode->getHeadPos(), codePoints, + codePointCount, probability); +} + +bool DynamicPatriciaTrieWritingHelper::createNewPtNodeArrayWithAChildPtNode( + const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount, + const int probability) { + int writingPos = mBuffer->getTailPosition(); + if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, + 1 /* arraySize */, &writingPos)) { + return false; + } + if (!writePtNodeToBuffer(mBuffer, parentPtNodePos, nodeCodePoints, nodeCodePointCount, + probability, &writingPos)) { + return false; + } + if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + return true; +} + +// Returns whether the dictionary updating was succeeded or not. +bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes( + const DynamicPatriciaTrieNodeReader *const reallocatingPtNode, + const int *const reallocatingPtNodeCodePoints, const int overlappingCodePointCount, + const int probabilityOfNewPtNode, const int *const newNodeCodePoints, + const int newNodeCodePointCount) { + // When addsExtraChild is true, split the reallocating PtNode and add new child. + // Reallocating PtNode: abcde, newNode: abcxy. + // abc (1st, not terminal) __ de (2nd) + // \_ xy (extra child, terminal) + // Otherwise, this method makes 1st part terminal and write probabilityOfNewPtNode. + // Reallocating PtNode: abcde, newNode: abc. + // abc (1st, terminal) __ de (2nd) + const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount; + const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); + int writingPos = firstPartOfReallocatedPtNodePos; + // Write the 1st part of the reallocating node. The children position will be updated later + // with actual children position. + const int newProbability = addsExtraChild ? 
NOT_A_PROBABILITY : probabilityOfNewPtNode; + if (!writePtNodeToBuffer(mBuffer, reallocatingPtNode->getParentPos(), + reallocatingPtNodeCodePoints, overlappingCodePointCount, newProbability, + &writingPos)) { + return false; + } + const int actualChildrenPos = writingPos; + // Create new children PtNode array. + const size_t newPtNodeCount = addsExtraChild ? 2 : 1; + if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, + newPtNodeCount, &writingPos)) { + return false; + } + // Write the 2nd part of the reallocating node. + const int secondPartOfReallocatedPtNodePos = writingPos; + if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, reallocatingPtNode, + firstPartOfReallocatedPtNodePos, + reallocatingPtNodeCodePoints + overlappingCodePointCount, + reallocatingPtNode->getCodePointCount() - overlappingCodePointCount, + reallocatingPtNode->getProbability(), &writingPos)) { + return false; + } + if (addsExtraChild) { + if (!writePtNodeToBuffer(mBuffer, firstPartOfReallocatedPtNodePos, + newNodeCodePoints + overlappingCodePointCount, + newNodeCodePointCount - overlappingCodePointCount, probabilityOfNewPtNode, + &writingPos)) { + return false; + } + } + if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + // Update original reallocatingPtNode as moved. + if (!markNodeAsMovedAndSetPosition(reallocatingPtNode, firstPartOfReallocatedPtNodePos, + secondPartOfReallocatedPtNodePos)) { + return false; + } + // Load node info. Information of the 1st part will be fetched. + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos); + // Update children position. 
+ int childrenPosFieldPos = nodeReader.getChildrenPosFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(mBuffer, + actualChildrenPos, &childrenPosFieldPos)) { + return false; + } + return true; +} + +bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, + const HeaderPolicy *const headerPolicy, BufferWithExtendableBuffer *const bufferToWrite, + int *const outUnigramCount, int *const outBigramCount) { + DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPatriciaTrieGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + headerPolicy, this, mBuffer, mNeedsToDecay); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { + return false; + } + if (mNeedsToDecay && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + .getValidUnigramCount() > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) { + // TODO: Remove more unigrams. + } + + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability + traversePolicyToUpdateBigramProbability(mBigramPolicy); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateBigramProbability)) { + return false; + } + if (mNeedsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount() + > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) { + // TODO: Remove more bigrams. + } + + // Mapping from positions in mBuffer to positions in bufferToWrite. 
+ DictPositionRelocationMap dictPositionRelocationMap; + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + traversePolicyToPlaceAndWriteValidPtNodesToBuffer(this, bufferToWrite, + &dictPositionRelocationMap); + if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { + return false; + } + + // Create policy instance for the GCed dictionary. + DynamicShortcutListPolicy newDictShortcutPolicy(bufferToWrite); + DynamicBigramListPolicy newDictBigramPolicy(headerPolicy, bufferToWrite, &newDictShortcutPolicy, + mNeedsToDecay); + // Create reading helper for the GCed dictionary. + DynamicPatriciaTrieReadingHelper newDictReadingHelper(bufferToWrite, &newDictBigramPolicy, + &newDictShortcutPolicy); + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields + traversePolicyToUpdateAllPositionFields(this, &newDictBigramPolicy, bufferToWrite, + &dictPositionRelocationMap); + if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToUpdateAllPositionFields)) { + return false; + } + *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount(); + *outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount(); + return true; +} + +int DynamicPatriciaTrieWritingHelper::getUpdatedProbability(const int originalProbability, + const int newProbability) { + if (mNeedsToDecay) { + return ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, + newProbability); + } else { + return newProbability; + } +} + +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h new file mode 100644 index 0000000..ca86647 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h
@@ -0,0 +1,138 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H + +#include <stdint.h> + +#include "defines.h" +#include "utils/hash_map_compat.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class DynamicBigramListPolicy; +class DynamicPatriciaTrieNodeReader; +class DynamicPatriciaTrieReadingHelper; +class DynamicShortcutListPolicy; +class HeaderPolicy; + +class DynamicPatriciaTrieWritingHelper { + public: + typedef hash_map_compat<int, int> PtNodeArrayPositionRelocationMap; + typedef hash_map_compat<int, int> PtNodePositionRelocationMap; + struct DictPositionRelocationMap { + public: + DictPositionRelocationMap() + : mPtNodeArrayPositionRelocationMap(), mPtNodePositionRelocationMap() {} + + PtNodeArrayPositionRelocationMap mPtNodeArrayPositionRelocationMap; + PtNodePositionRelocationMap mPtNodePositionRelocationMap; + + private: + DISALLOW_COPY_AND_ASSIGN(DictPositionRelocationMap); + }; + + static const size_t MAX_DICTIONARY_SIZE; + + DynamicPatriciaTrieWritingHelper(BufferWithExtendableBuffer *const buffer, + DynamicBigramListPolicy *const bigramPolicy, + DynamicShortcutListPolicy *const shortcutPolicy, const bool needsToDecay) + : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy), + mNeedsToDecay(needsToDecay) {} + + 
~DynamicPatriciaTrieWritingHelper() {} + + // Add a word to the dictionary. If the word already exists, update the probability. + bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper, + const int *const wordCodePoints, const int codePointCount, const int probability, + bool *const outAddedNewUnigram); + + // Add a bigram relation from word0Pos to word1Pos. + bool addBigramWords(const int word0Pos, const int word1Pos, const int probability, + bool *const outAddedNewBigram); + + // Remove a bigram relation from word0Pos to word1Pos. + bool removeBigramWords(const int word0Pos, const int word1Pos); + + void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy, + const int unigramCount, const int bigramCount); + + void writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName, + const HeaderPolicy *const headerPolicy); + + // CAVEAT: This method must be called only from inner classes of + // DynamicPatriciaTrieGcEventListeners. + bool markNodeAsDeleted(const DynamicPatriciaTrieNodeReader *const nodeToUpdate); + + // CAVEAT: This method must be called only from this class or inner classes of + // DynamicPatriciaTrieGcEventListeners. 
+ bool writePtNodeToBufferByCopyingPtNodeInfo(BufferWithExtendableBuffer *const bufferToWrite, + const DynamicPatriciaTrieNodeReader *const originalNode, const int parentPos, + const int *const codePoints, const int codePointCount, const int probability, + int *const writingPos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper); + + static const int CHILDREN_POSITION_FIELD_SIZE; + + BufferWithExtendableBuffer *const mBuffer; + DynamicBigramListPolicy *const mBigramPolicy; + DynamicShortcutListPolicy *const mShortcutPolicy; + const bool mNeedsToDecay; + + bool markNodeAsMovedAndSetPosition(const DynamicPatriciaTrieNodeReader *const nodeToUpdate, + const int movedPos, const int bigramLinkedNodePos); + + bool writePtNodeWithFullInfoToBuffer(BufferWithExtendableBuffer *const bufferToWrite, + const bool isBlacklisted, const bool isNotAWord, + const int parentPos, const int *const codePoints, const int codePointCount, + const int probability, const int childrenPos, const int originalBigramListPos, + const int originalShortcutListPos, int *const writingPos); + + bool writePtNodeToBuffer(BufferWithExtendableBuffer *const bufferToWrite, + const int parentPos, const int *const codePoints, const int codePointCount, + const int probability, int *const writingPos); + + bool createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, + const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos); + + bool setPtNodeProbability(const DynamicPatriciaTrieNodeReader *const originalNode, + const int probability, const int *const codePoints, bool *const outAddedNewUnigram); + + bool createChildrenPtNodeArrayAndAChildPtNode( + const DynamicPatriciaTrieNodeReader *const parentNode, const int probability, + const int *const codePoints, const int codePointCount); + + bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, const int *const nodeCodePoints, + const int nodeCodePointCount, const int 
probability); + + bool reallocatePtNodeAndAddNewPtNodes( + const DynamicPatriciaTrieNodeReader *const reallocatingPtNode, + const int *const reallocatingPtNodeCodePoints, const int overlappingCodePointCount, + const int probabilityOfNewPtNode, const int *const newNodeCodePoints, + const int newNodeCodePointCount); + + bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, + BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount, + int *const outBigramCount); + + int getUpdatedProbability(const int originalProbability, const int newProbability); +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp new file mode 100644 index 0000000..30ff10c --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h"
+
+#include <cstddef>
+#include <cstdlib>
+#include <stdint.h>
+
+#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+const size_t DynamicPatriciaTrieWritingUtils::MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD = 0x7F;
+const size_t DynamicPatriciaTrieWritingUtils::MAX_PTNODE_ARRAY_SIZE = 0x7FFF;
+const int DynamicPatriciaTrieWritingUtils::SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE = 1;
+const int DynamicPatriciaTrieWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE = 2;
+const int DynamicPatriciaTrieWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG = 0x8000;
+const int DynamicPatriciaTrieWritingUtils::DICT_OFFSET_FIELD_SIZE = 3;
+const int DynamicPatriciaTrieWritingUtils::MAX_DICT_OFFSET_VALUE = 0x7FFFFF;
+const int DynamicPatriciaTrieWritingUtils::MIN_DICT_OFFSET_VALUE = -0x7FFFFF;
+const int DynamicPatriciaTrieWritingUtils::DICT_OFFSET_NEGATIVE_FLAG = 0x800000;
+const int DynamicPatriciaTrieWritingUtils::PROBABILITY_FIELD_SIZE = 1;
+const int DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE = 1;
+
+// Writes an empty PtNode array (size 0) followed by an invalid forward link, starting at rootPos.
+/* static */ bool DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(
+        BufferWithExtendableBuffer *const buffer, const int rootPos) {
+    int writingPos = rootPos;
+    if (!writePtNodeArraySizeAndAdvancePosition(buffer, 0 /* arraySize */, &writingPos)) {
+        return false;
+    }
+    return writeForwardLinkPositionAndAdvancePosition(buffer, NOT_A_DICT_POS /* forwardLinkPos */,
+            &writingPos);
+}
+
+// Writes forwardLinkPos as an offset relative to the position of the field being written.
+/* static */ bool DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(
+        BufferWithExtendableBuffer *const buffer, const int forwardLinkPos,
+        int *const forwardLinkFieldPos) {
+    return writeDictOffset(buffer, forwardLinkPos, (*forwardLinkFieldPos), forwardLinkFieldPos);
+}
+
+// Writes arraySize into a PtNode array size field and advances *arraySizeFieldPos.
+/* static */ bool DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition(
+        BufferWithExtendableBuffer *const buffer, const size_t arraySize,
+        int *const arraySizeFieldPos) {
+    // Currently, every array size field is created with LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE to
+    // simplify the updating process.
+    // TODO: Use SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE for small arrays.
+    /*if (arraySize <= MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD) {
+        return buffer->writeUintAndAdvancePosition(arraySize, SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE,
+                arraySizeFieldPos);
+    } else */
+    if (arraySize <= MAX_PTNODE_ARRAY_SIZE) {
+        uint32_t data = arraySize | LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG;
+        return buffer->writeUintAndAdvancePosition(data, LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE,
+                arraySizeFieldPos);
+    } else {
+        AKLOGI("PtNode array size cannot be written because arraySize is too large: %zu",
+                arraySize);
+        ASSERT(false);
+        return false;
+    }
+}
+
+// Writes nodeFlags into a field of NODE_FLAG_FIELD_SIZE byte(s).
+/* static */ bool DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(
+        BufferWithExtendableBuffer *const buffer,
+        const DynamicPatriciaTrieReadingUtils::NodeFlags nodeFlags, int *const nodeFlagsFieldPos) {
+    return buffer->writeUintAndAdvancePosition(nodeFlags, NODE_FLAG_FIELD_SIZE, nodeFlagsFieldPos);
+}
+
+// Note that the parent offset is relative to the node's head position, passed as basePos.
+/* static */ bool DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(
+        BufferWithExtendableBuffer *const buffer, const int parentPos, const int basePos,
+        int *const parentPosFieldPos) {
+    return writeDictOffset(buffer, parentPos, basePos, parentPosFieldPos);
+}
+
+// Writes the code points. A terminator is written only when there are multiple code points.
+/* static */ bool DynamicPatriciaTrieWritingUtils::writeCodePointsAndAdvancePosition(
+        BufferWithExtendableBuffer *const buffer, const int *const codePoints,
+        const int codePointCount, int *const codePointFieldPos) {
+    if (codePointCount <= 0) {
+        AKLOGI("code points cannot be written because codePointCount is invalid: %d",
+                codePointCount);
+        ASSERT(false);
+        return false;
+    }
+    const bool hasMultipleCodePoints = codePointCount > 1;
+    return buffer->writeCodePointsAndAdvancePosition(codePoints, codePointCount,
+            hasMultipleCodePoints, codePointFieldPos);
+}
+
+// Writes probability after checking that it is in the range [0, MAX_PROBABILITY].
+/* static */ bool DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(
+        BufferWithExtendableBuffer *const buffer, const int probability,
+        int *const probabilityFieldPos) {
+    if (probability < 0 || probability > MAX_PROBABILITY) {
+        AKLOGI("probability cannot be written because the probability is invalid: %d",
+                probability);
+        ASSERT(false);
+        return false;
+    }
+    return buffer->writeUintAndAdvancePosition(probability, PROBABILITY_FIELD_SIZE,
+            probabilityFieldPos);
+}
+
+// Writes childrenPosition as an offset relative to the position of the field being written.
+/* static */ bool DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(
+        BufferWithExtendableBuffer *const buffer, const int childrenPosition,
+        int *const childrenPositionFieldPos) {
+    return writeDictOffset(buffer, childrenPosition, (*childrenPositionFieldPos),
+            childrenPositionFieldPos);
+}
+
+// Encodes (targetPos - basePos) as magnitude plus DICT_OFFSET_NEGATIVE_FLAG for negative values.
+// NOT_A_DICT_POS and a zero offset are replaced by the dedicated marker values before encoding.
+/* static */ bool DynamicPatriciaTrieWritingUtils::writeDictOffset(
+        BufferWithExtendableBuffer *const buffer, const int targetPos, const int basePos,
+        int *const offsetFieldPos) {
+    int offset = targetPos - basePos;
+    if (targetPos == NOT_A_DICT_POS) {
+        offset = DynamicPatriciaTrieReadingUtils::DICT_OFFSET_INVALID;
+    } else if (offset == 0) {
+        offset = DynamicPatriciaTrieReadingUtils::DICT_OFFSET_ZERO_OFFSET;
+    }
+    if (offset > MAX_DICT_OFFSET_VALUE || offset < MIN_DICT_OFFSET_VALUE) {
+        AKLOGI("offset cannot be written because the offset is too large or too small: %d",
+                offset);
+        ASSERT(false);
+        return false;
+    }
+    uint32_t data = 0;
+    if (offset >= 0) {
+        data = offset;
+    } else {
+        data = abs(offset) | DICT_OFFSET_NEGATIVE_FLAG;
+    }
+    return buffer->writeUintAndAdvancePosition(data, DICT_OFFSET_FIELD_SIZE, offsetFieldPos);
+}
+} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h new file mode 100644 index 0000000..af76bc6 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_UTILS_H
+#define LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_UTILS_H
+
+#include <cstddef>
+
+#include "defines.h"
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+// Static helpers that write dynamic patricia trie fields into a BufferWithExtendableBuffer.
+class DynamicPatriciaTrieWritingUtils {
+ public:
+    static const int NODE_FLAG_FIELD_SIZE;
+
+    // Each writer below returns false when the value cannot be written.
+    static bool writeEmptyDictionary(BufferWithExtendableBuffer *const buffer, const int rootPos);
+
+    static bool writeForwardLinkPositionAndAdvancePosition(
+            BufferWithExtendableBuffer *const buffer, const int forwardLinkPos,
+            int *const forwardLinkFieldPos);
+
+    static bool writePtNodeArraySizeAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+            const size_t arraySize, int *const arraySizeFieldPos);
+
+    static bool writeFlagsAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+            const DynamicPatriciaTrieReadingUtils::NodeFlags nodeFlags,
+            int *const nodeFlagsFieldPos);
+
+    // The parent position is written as an offset relative to basePos.
+    static bool writeParentPosOffsetAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+            const int parentPos, const int basePos, int *const parentPosFieldPos);
+
+    static bool writeCodePointsAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+            const int *const codePoints, const int codePointCount, int *const codePointFieldPos);
+
+    static bool writeProbabilityAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+            const int probability, int *const probabilityFieldPos);
+
+    static bool writeChildrenPositionAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+            const int childrenPosition, int *const childrenPositionFieldPos);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingUtils);
+
+    static const size_t MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD;
+    static const size_t MAX_PTNODE_ARRAY_SIZE;
+    static const int SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE;
+    static const int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE;
+    static const int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG;
+    static const int DICT_OFFSET_FIELD_SIZE;
+    static const int MAX_DICT_OFFSET_VALUE;
+    static const int MIN_DICT_OFFSET_VALUE;
+    static const int DICT_OFFSET_NEGATIVE_FLAG;
+    static const int PROBABILITY_FIELD_SIZE;
+
+    // Writes targetPos as an offset relative to basePos.
+    static bool writeDictOffset(BufferWithExtendableBuffer *const buffer, const int targetPos,
+            const int basePos, int *const offsetFieldPos);
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_UTILS_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/header/header_policy.cpp b/src/aosp/suggest/policyimpl/dictionary/header/header_policy.cpp new file mode 100644 index 0000000..eb072fb --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/header/header_policy.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/header/header_policy.h"
+
+namespace latinime {
+
+// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader.
+const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
+// TODO: Change attribute string to "IS_DECAYING_DICT".
+const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE";
+const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date";
+const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME";
+const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT";
+const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT";
+const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
+const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
+const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
+
+// Used for logging. Question mark is used to indicate that the key is not found.
+// The output is always '\0'-terminated and is truncated to fit in outValueSize elements.
+void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,
+        int outValueSize) const {
+    if (outValueSize <= 0) return;
+    if (outValueSize == 1) {
+        outValue[0] = '\0';
+        return;
+    }
+    std::vector<int> keyCodePointVector;
+    HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector);
+    HeaderReadWriteUtils::AttributeMap::const_iterator it = mAttributeMap.find(keyCodePointVector);
+    if (it == mAttributeMap.end()) {
+        // The key was not found.
+        outValue[0] = '?';
+        outValue[1] = '\0';
+        return;
+    }
+    const int terminalIndex = min(static_cast<int>(it->second.size()), outValueSize - 1);
+    for (int i = 0; i < terminalIndex; ++i) {
+        outValue[i] = it->second[i];
+    }
+    outValue[terminalIndex] = '\0';
+}
+
+// Returns MULTIPLE_WORD_COST_MULTIPLIER_SCALE divided by the demotion rate attribute, or
+// MAX_VALUE_FOR_WEIGHTING when the demotion rate is not positive.
+float HeaderPolicy::readMultipleWordCostMultiplier() const {
+    const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+            MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
+    if (demotionRate <= 0) {
+        return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+    }
+    return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
+}
+
+// Writes version, flags, header size and attributes. The size field is rewritten once it is known.
+bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
+        const bool updatesLastUpdatedTime, const bool updatesLastDecayedTime,
+        const int unigramCount, const int bigramCount, const int extendedRegionSize) const {
+    int writingPos = 0;
+    if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion,
+            &writingPos)) {
+        return false;
+    }
+    if (!HeaderReadWriteUtils::writeDictionaryFlags(bufferToWrite, mDictionaryFlags,
+            &writingPos)) {
+        return false;
+    }
+    // Temporarily writes a dummy header size.
+    int headerSizeFieldPos = writingPos;
+    if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, 0 /* size */,
+            &writingPos)) {
+        return false;
+    }
+    HeaderReadWriteUtils::AttributeMap attributeMapToWrite(mAttributeMap);
+    HeaderReadWriteUtils::setIntAttribute(&attributeMapToWrite, UNIGRAM_COUNT_KEY, unigramCount);
+    HeaderReadWriteUtils::setIntAttribute(&attributeMapToWrite, BIGRAM_COUNT_KEY, bigramCount);
+    HeaderReadWriteUtils::setIntAttribute(&attributeMapToWrite, EXTENDED_REGION_SIZE_KEY,
+            extendedRegionSize);
+    // Read the clock once so that both timestamps are identical when both are updated.
+    const int currentTime = static_cast<int>(time(0));
+    if (updatesLastUpdatedTime) {
+        // Set current time as a last updated time.
+        HeaderReadWriteUtils::setIntAttribute(&attributeMapToWrite, LAST_UPDATED_TIME_KEY,
+                currentTime);
+    }
+    if (updatesLastDecayedTime) {
+        // Set current time as a last decayed time.
+        HeaderReadWriteUtils::setIntAttribute(&attributeMapToWrite, LAST_DECAYED_TIME_KEY,
+                currentTime);
+    }
+    if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapToWrite,
+            &writingPos)) {
+        return false;
+    }
+    // Writes an actual header size.
+    if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos,
+            &headerSizeFieldPos)) {
+        return false;
+    }
+    return true;
+}
+
+// Builds an attribute map from the header attributes stored in dictBuf.
+/* static */ HeaderReadWriteUtils::AttributeMap
+        HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) {
+    HeaderReadWriteUtils::AttributeMap attributeMap;
+    HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap);
+    return attributeMap;
+}
+
+} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/header/header_policy.h b/src/aosp/suggest/policyimpl/dictionary/header/header_policy.h new file mode 100644 index 0000000..a9c7805 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/header/header_policy.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_HEADER_POLICY_H
+#define LATINIME_HEADER_POLICY_H
+
+#include <ctime>
+#include <stdint.h>
+
+#include "defines.h"
+#include "suggest/core/policy/dictionary_header_structure_policy.h"
+#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h"
+#include "suggest/policyimpl/dictionary/utils/format_utils.h"
+
+namespace latinime {
+
+// Holds the header information of a dictionary: format version, flags, size and attributes.
+class HeaderPolicy : public DictionaryHeaderStructurePolicy {
+ public:
+    // Reads information from existing dictionary buffer.
+    HeaderPolicy(const uint8_t *const dictBuf, const int dictSize)
+            : mDictFormatVersion(FormatUtils::detectFormatVersion(dictBuf, dictSize)),
+              mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)),
+              mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
+              mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
+              mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
+              mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
+                      IS_DECAYING_DICT_KEY, false /* defaultValue */)),
+              mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+                      LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
+              mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+                      LAST_DECAYED_TIME_KEY, time(0) /* defaultValue */)),
+              mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+                      UNIGRAM_COUNT_KEY, 0 /* defaultValue */)),
+              mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+                      BIGRAM_COUNT_KEY, 0 /* defaultValue */)),
+              mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+                      EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)) {}
+
+    // Constructs header information using an attribute map.
+    // The header size, unigram count, bigram count and extended region size start at 0.
+    HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
+            const HeaderReadWriteUtils::AttributeMap *const attributeMap)
+            : mDictFormatVersion(dictFormatVersion),
+              mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
+                      attributeMap)), mSize(0), mAttributeMap(*attributeMap),
+              mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
+              mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
+                      IS_DECAYING_DICT_KEY, false /* defaultValue */)),
+              mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+                      LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
+              mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+                      LAST_DECAYED_TIME_KEY, time(0) /* defaultValue */)),
+              mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0) {}
+
+    ~HeaderPolicy() {}
+
+    AK_FORCE_INLINE int getSize() const {
+        return mSize;
+    }
+
+    AK_FORCE_INLINE bool supportsDynamicUpdate() const {
+        return HeaderReadWriteUtils::supportsDynamicUpdate(mDictionaryFlags);
+    }
+
+    AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const {
+        return HeaderReadWriteUtils::requiresGermanUmlautProcessing(mDictionaryFlags);
+    }
+
+    AK_FORCE_INLINE bool requiresFrenchLigatureProcessing() const {
+        return HeaderReadWriteUtils::requiresFrenchLigatureProcessing(mDictionaryFlags);
+    }
+
+    AK_FORCE_INLINE float getMultiWordCostMultiplier() const {
+        return mMultiWordCostMultiplier;
+    }
+
+    AK_FORCE_INLINE bool isDecayingDict() const {
+        return mIsDecayingDict;
+    }
+
+    AK_FORCE_INLINE int getLastUpdatedTime() const {
+        return mLastUpdatedTime;
+    }
+
+    AK_FORCE_INLINE int getLastDecayedTime() const {
+        return mLastDecayedTime;
+    }
+
+    AK_FORCE_INLINE int getUnigramCount() const {
+        return mUnigramCount;
+    }
+
+    AK_FORCE_INLINE int getBigramCount() const {
+        return mBigramCount;
+    }
+
+    AK_FORCE_INLINE int getExtendedRegionSize() const {
+        return mExtendedRegionSize;
+    }
+
+    void readHeaderValueOrQuestionMark(const char *const key,
+            int *outValue, int outValueSize) const;
+
+    bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
+            const bool updatesLastUpdatedTime, const bool updatesLastDecayedTime,
+            const int unigramCount, const int bigramCount, const int extendedRegionSize) const;
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy);
+
+    static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
+    static const char *const IS_DECAYING_DICT_KEY;
+    static const char *const LAST_UPDATED_TIME_KEY;
+    static const char *const LAST_DECAYED_TIME_KEY;
+    static const char *const UNIGRAM_COUNT_KEY;
+    static const char *const BIGRAM_COUNT_KEY;
+    static const char *const EXTENDED_REGION_SIZE_KEY;
+    static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
+    static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
+
+    // NOTE: Members are initialized in declaration order. mAttributeMap must stay declared before
+    // the members that the constructors compute from it.
+    const FormatUtils::FORMAT_VERSION mDictFormatVersion;
+    const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
+    const int mSize;
+    HeaderReadWriteUtils::AttributeMap mAttributeMap;
+    const float mMultiWordCostMultiplier;
+    const bool mIsDecayingDict;
+    const int mLastUpdatedTime;
+    const int mLastDecayedTime;
+    const int mUnigramCount;
+    const int mBigramCount;
+    const int mExtendedRegionSize;
+
+    float readMultipleWordCostMultiplier() const;
+
+    static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes(
+            const uint8_t *const dictBuf);
+};
+} // namespace latinime
+#endif /* LATINIME_HEADER_POLICY_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/src/aosp/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp new file mode 100644 index 0000000..5ded8f6 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h"
+
+#include <cctype>
+#include <cstdio>
+#include <vector>
+
+#include "defines.h"
+#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
+const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256;
+
+const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
+const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
+const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
+const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
+
+const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
+// Flags for special processing
+// Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAG) or
+// something very bad (like, the apocalypse) will happen. Please update both at the same time.
+const HeaderReadWriteUtils::DictionaryFlags
+        HeaderReadWriteUtils::GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
+const HeaderReadWriteUtils::DictionaryFlags
+        HeaderReadWriteUtils::SUPPORTS_DYNAMIC_UPDATE_FLAG = 0x2;
+const HeaderReadWriteUtils::DictionaryFlags
+        HeaderReadWriteUtils::FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
+
+// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader.
+const char *const HeaderReadWriteUtils::SUPPORTS_DYNAMIC_UPDATE_KEY = "SUPPORTS_DYNAMIC_UPDATE";
+const char *const HeaderReadWriteUtils::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY =
+        "REQUIRES_GERMAN_UMLAUT_PROCESSING";
+const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY =
+        "REQUIRES_FRENCH_LIGATURE_PROCESSING";
+
+/* static */ int HeaderReadWriteUtils::getHeaderSize(const uint8_t *const dictBuf) {
+    // See the format of the header in the comment in
+    // FormatUtils::detectFormatVersion()
+    return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE
+            + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE);
+}
+
+/* static */ HeaderReadWriteUtils::DictionaryFlags
+        HeaderReadWriteUtils::getFlags(const uint8_t *const dictBuf) {
+    return ByteArrayUtils::readUint16(dictBuf,
+            HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE);
+}
+
+// Builds the flag bits from the boolean attributes; missing attributes count as false.
+/* static */ HeaderReadWriteUtils::DictionaryFlags
+        HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
+                const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
+    const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap,
+            REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false /* defaultValue */);
+    const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap,
+            REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, false /* defaultValue */);
+    const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap,
+            SUPPORTS_DYNAMIC_UPDATE_KEY, false /* defaultValue */);
+    DictionaryFlags dictflags = NO_FLAGS;
+    dictflags |= requiresGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0;
+    dictflags |= requiresFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0;
+    dictflags |= supportsDynamicUpdate ? SUPPORTS_DYNAMIC_UPDATE_FLAG : 0;
+    return dictflags;
+}
+
+// Reads key/value string pairs from the header options position until headerSize is reached.
+/* static */ void HeaderReadWriteUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf,
+        AttributeMap *const headerAttributes) {
+    const int headerSize = getHeaderSize(dictBuf);
+    int pos = getHeaderOptionsPosition();
+    if (pos == NOT_A_DICT_POS) {
+        // The header doesn't have header options.
+        return;
+    }
+    int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH];
+    int valueBuffer[MAX_ATTRIBUTE_VALUE_LENGTH];
+    while (pos < headerSize) {
+        const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
+                MAX_ATTRIBUTE_KEY_LENGTH, keyBuffer, &pos);
+        std::vector<int> key;
+        key.insert(key.end(), keyBuffer, keyBuffer + keyLength);
+        const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
+                MAX_ATTRIBUTE_VALUE_LENGTH, valueBuffer, &pos);
+        std::vector<int> value;
+        value.insert(value.end(), valueBuffer, valueBuffer + valueLength);
+        headerAttributes->insert(AttributeMap::value_type(key, value));
+    }
+}
+
+// Writes the magic number and then the version field. Only version 3 can be written.
+/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
+        BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
+        int *const writingPos) {
+    if (!buffer->writeUintAndAdvancePosition(FormatUtils::MAGIC_NUMBER, HEADER_MAGIC_NUMBER_SIZE,
+            writingPos)) {
+        return false;
+    }
+    switch (version) {
+        case FormatUtils::VERSION_2:
+            // Version 2 dictionary writing is not supported.
+            return false;
+        case FormatUtils::VERSION_3:
+            return buffer->writeUintAndAdvancePosition(3 /* data */,
+                    HEADER_DICTIONARY_VERSION_SIZE, writingPos);
+        default:
+            return false;
+    }
+}
+
+/* static */ bool HeaderReadWriteUtils::writeDictionaryFlags(
+        BufferWithExtendableBuffer *const buffer, const DictionaryFlags flags,
+        int *const writingPos) {
+    return buffer->writeUintAndAdvancePosition(flags, HEADER_FLAG_SIZE, writingPos);
+}
+
+/* static */ bool HeaderReadWriteUtils::writeDictionaryHeaderSize(
+        BufferWithExtendableBuffer *const buffer, const int size, int *const writingPos) {
+    return buffer->writeUintAndAdvancePosition(size, HEADER_SIZE_FIELD_SIZE, writingPos);
+}
+
+// Writes every attribute as a terminated key followed by a terminated value. Entries with an
+// empty key or an empty value are skipped.
+/* static */ bool HeaderReadWriteUtils::writeHeaderAttributes(
+        BufferWithExtendableBuffer *const buffer, const AttributeMap *const headerAttributes,
+        int *const writingPos) {
+    for (AttributeMap::const_iterator it = headerAttributes->begin();
+            it != headerAttributes->end(); ++it) {
+        if (it->first.empty() || it->second.empty()) {
+            continue;
+        }
+        // Write a key.
+        if (!buffer->writeCodePointsAndAdvancePosition(&(it->first.at(0)), it->first.size(),
+                true /* writesTerminator */, writingPos)) {
+            return false;
+        }
+        // Write a value.
+        if (!buffer->writeCodePointsAndAdvancePosition(&(it->second.at(0)), it->second.size(),
+                true /* writesTerminator */, writingPos)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes,
+        const char *const key, const bool value) {
+    setIntAttribute(headerAttributes, key, value ? 1 : 0);
+}
+
+/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes,
+        const char *const key, const int value) {
+    AttributeMap::key_type keyVector;
+    insertCharactersIntoVector(key, &keyVector);
+    setIntAttributeInner(headerAttributes, &keyVector, value);
+}
+
+// Stores value under key as its decimal string representation.
+/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes,
+        const AttributeMap::key_type *const key, const int value) {
+    AttributeMap::mapped_type valueVector;
+    char charBuf[LARGEST_INT_DIGIT_COUNT + 1];
+    snprintf(charBuf, LARGEST_INT_DIGIT_COUNT + 1, "%d", value);
+    insertCharactersIntoVector(charBuf, &valueVector);
+    (*headerAttributes)[*key] = valueVector;
+}
+
+/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue(
+        const AttributeMap *const headerAttributes, const char *const key,
+        const bool defaultValue) {
+    const int intDefaultValue = defaultValue ? 1 : 0;
+    const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue);
+    return intValue != 0;
+}
+
+/* static */ int HeaderReadWriteUtils::readIntAttributeValue(
+        const AttributeMap *const headerAttributes, const char *const key,
+        const int defaultValue) {
+    AttributeMap::key_type keyVector;
+    insertCharactersIntoVector(key, &keyVector);
+    return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue);
+}
+
+// Parses the attribute as a decimal integer with an optional leading '-'. Returns defaultValue
+// when the key is missing or the value contains a non-digit character.
+/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner(
+        const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
+        const int defaultValue) {
+    AttributeMap::const_iterator it = headerAttributes->find(*key);
+    if (it != headerAttributes->end()) {
+        int value = 0;
+        bool isNegative = false;
+        for (size_t i = 0; i < it->second.size(); ++i) {
+            if (i == 0 && it->second.at(i) == '-') {
+                isNegative = true;
+            } else {
+                // Compare against the ASCII digit range directly: isdigit() is undefined for
+                // values that do not fit in unsigned char, and code points can be larger.
+                const int codePoint = it->second.at(i);
+                if (codePoint < '0' || codePoint > '9') {
+                    // If not a number.
+                    return defaultValue;
+                }
+                value *= 10;
+                value += codePoint - '0';
+            }
+        }
+        return isNegative ? -value : value;
+    }
+    return defaultValue;
+}
+
+// Appends each char of the NUL-terminated string characters to vector.
+/* static */ void HeaderReadWriteUtils::insertCharactersIntoVector(const char *const characters,
+        std::vector<int> *const vector) {
+    for (int i = 0; characters[i]; ++i) {
+        vector->push_back(characters[i]);
+    }
+}
+
+} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/header/header_read_write_utils.h b/src/aosp/suggest/policyimpl/dictionary/header/header_read_write_utils.h new file mode 100644 index 0000000..2259683 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/header/header_read_write_utils.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_HEADER_READ_WRITE_UTILS_H
+#define LATINIME_HEADER_READ_WRITE_UTILS_H
+
+#include <map>
+#include <stdint.h>
+#include <vector>
+
+#include "defines.h"
+#include "suggest/policyimpl/dictionary/utils/format_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+// Static helpers to read and write the fields and attributes of a dictionary header.
+class HeaderReadWriteUtils {
+ public:
+    typedef uint16_t DictionaryFlags;
+    // Attribute keys and values are both stored as vectors of int.
+    typedef std::map<std::vector<int>, std::vector<int> > AttributeMap;
+
+    static int getHeaderSize(const uint8_t *const dictBuf);
+
+    static DictionaryFlags getFlags(const uint8_t *const dictBuf);
+
+    static AK_FORCE_INLINE bool supportsDynamicUpdate(const DictionaryFlags flags) {
+        return (flags & SUPPORTS_DYNAMIC_UPDATE_FLAG) != 0;
+    }
+
+    static AK_FORCE_INLINE bool requiresGermanUmlautProcessing(const DictionaryFlags flags) {
+        return (flags & GERMAN_UMLAUT_PROCESSING_FLAG) != 0;
+    }
+
+    static AK_FORCE_INLINE bool requiresFrenchLigatureProcessing(const DictionaryFlags flags) {
+        return (flags & FRENCH_LIGATURE_PROCESSING_FLAG) != 0;
+    }
+
+    // The header options start after the magic number, version, flags and header size fields.
+    static AK_FORCE_INLINE int getHeaderOptionsPosition() {
+        return HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE
+                + HEADER_SIZE_FIELD_SIZE;
+    }
+
+    static DictionaryFlags createAndGetDictionaryFlagsUsingAttributeMap(
+            const HeaderReadWriteUtils::AttributeMap *const attributeMap);
+
+    static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
+            AttributeMap *const headerAttributes);
+
+    static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer,
+            const FormatUtils::FORMAT_VERSION version, int *const writingPos);
+
+    static bool writeDictionaryFlags(BufferWithExtendableBuffer *const buffer,
+            const DictionaryFlags flags, int *const writingPos);
+
+    static bool writeDictionaryHeaderSize(BufferWithExtendableBuffer *const buffer,
+            const int size, int *const writingPos);
+
+    static bool writeHeaderAttributes(BufferWithExtendableBuffer *const buffer,
+            const AttributeMap *const headerAttributes, int *const writingPos);
+
+    /**
+     * Methods for header attributes.
+     */
+    static void setBoolAttribute(AttributeMap *const headerAttributes,
+            const char *const key, const bool value);
+
+    static void setIntAttribute(AttributeMap *const headerAttributes,
+            const char *const key, const int value);
+
+    static bool readBoolAttributeValue(const AttributeMap *const headerAttributes,
+            const char *const key, const bool defaultValue);
+
+    static int readIntAttributeValue(const AttributeMap *const headerAttributes,
+            const char *const key, const int defaultValue);
+
+    static void insertCharactersIntoVector(const char *const characters,
+            std::vector<int> *const vector);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils);
+
+    static const int MAX_ATTRIBUTE_KEY_LENGTH;
+    static const int MAX_ATTRIBUTE_VALUE_LENGTH;
+
+    static const int HEADER_MAGIC_NUMBER_SIZE;
+    static const int HEADER_DICTIONARY_VERSION_SIZE;
+    static const int HEADER_FLAG_SIZE;
+    static const int HEADER_SIZE_FIELD_SIZE;
+
+    static const DictionaryFlags NO_FLAGS;
+    // Flags for special processing
+    // Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAG) or
+    // something very bad (like, the apocalypse) will happen. Please update both at the same time.
+    static const DictionaryFlags GERMAN_UMLAUT_PROCESSING_FLAG;
+    static const DictionaryFlags SUPPORTS_DYNAMIC_UPDATE_FLAG;
+    static const DictionaryFlags FRENCH_LIGATURE_PROCESSING_FLAG;
+
+    static const char *const SUPPORTS_DYNAMIC_UPDATE_KEY;
+    static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
+    static const char *const REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY;
+
+    static void setIntAttributeInner(AttributeMap *const headerAttributes,
+            const AttributeMap::key_type *const key, const int value);
+
+    static int readIntAttributeValueInner(const AttributeMap *const headerAttributes,
+            const AttributeMap::key_type *const key, const int defaultValue);
+};
+} // namespace latinime
+#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/patricia_trie_policy.cpp b/src/aosp/suggest/policyimpl/dictionary/patricia_trie_policy.cpp new file mode 100644 index 0000000..8a84bd2 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/patricia_trie_policy.cpp
@@ -0,0 +1,433 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include "suggest/policyimpl/dictionary/patricia_trie_policy.h" + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" + +namespace latinime { + +void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + int nextPos = dicNode->getChildrenPos(); + if (nextPos < 0 || nextPos >= mDictBufferSize) { + AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d", + nextPos, mDictBufferSize); + ASSERT(false); + return; + } + const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + mDictRoot, &nextPos); + for (int i = 0; i < childCount; i++) { + if (nextPos < 0 || nextPos >= mDictBufferSize) { + AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %d, childCount: %d / %d", + nextPos, mDictBufferSize, i, childCount); + ASSERT(false); + return; + } + nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes); + } +} + +// This retrieves code points and the probability of the word by its terminal position. 
+// Due to the fact that words are ordered in the dictionary in a strict breadth-first order, +// it is possible to check for this with advantageous complexity. For each node, we search +// for PtNodes with children and compare the children position with the position we look for. +// When we shoot the position we look for, it means the word we look for is in the children +// of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a +// PtNode array with the last PtNode's children position still less than what we are searching for, +// we must descend the last PtNode's children (for example, if the word we are searching for starts +// with a z, it's the last PtNode of the root array, so all children addresses will be smaller +// than the position we look for, and we have to descend the z node). +/* Parameters : + * ptNodePos: the byte position of the terminal PtNode of the word we are searching for (this is + * what is stored as the "bigram position" in each bigram) + * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size. + * outUnigramProbability: a pointer to an int to write the probability into. + * Return value : the code point count, of 0 if the word was not found. + */ +// TODO: Split this function to be more readable +int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( + const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const { + int pos = getRootPosition(); + int wordPos = 0; + // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will + // only traverse nodes that are actually a part of the terminal we are searching, so each time + // we enter this loop we are one depth level further than last time. + // The only reason we count nodes is because we want to reduce the probability of infinite + // looping in case there is a bug. 
Since we know there is an upper bound to the depth we are + // supposed to traverse, it does not hurt to count iterations. + for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) { + int lastCandidatePtNodePos = 0; + // Let's loop through PtNodes in this PtNode array searching for either the terminal + // or one of its ascendants. + for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + mDictRoot, &pos); ptNodeCount > 0; --ptNodeCount) { + const int startPos = pos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &pos); + if (ptNodePos == startPos) { + // We found the position. Copy the rest of the code points in the buffer and return + // the length. + outCodePoints[wordPos] = character; + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &pos); + // We count code points in order to avoid infinite loops if the file is broken + // or if there is some other bug + int charCount = maxCodePointCount; + while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { + outCodePoints[++wordPos] = nextChar; + nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &pos); + } + } + *outUnigramProbability = + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, + &pos); + return ++wordPos; + } + // We need to skip past this PtNode, so skip any remaining code points after the + // first and possibly the probability. 
+ if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); + } + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + // The fact that this PtNode has children is very important. Since we already know + // that this PtNode does not match, if it has no children we know it is irrelevant + // to what we are searching for. + const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags); + // We will write in `found' whether we have passed the children position we are + // searching for. For example if we search for "beer", the children of b are less + // than the address we are searching for and the children of c are greater. When we + // come here for c, we realize this is too big, and that we should descend b. + bool found; + if (hasChildren) { + int currentPos = pos; + // Here comes the tricky part. First, read the children position. + const int childrenPos = PatriciaTrieReadingUtils + ::readChildrenPositionAndAdvancePosition(mDictRoot, flags, ¤tPos); + if (childrenPos > ptNodePos) { + // If the children pos is greater than the position, it means the previous + // PtNode, which position is stored in lastCandidatePtNodePos, was the right + // one. + found = true; + } else if (1 >= ptNodeCount) { + // However if we are on the LAST PtNode of this array, and we have NOT shot the + // position we should descend THIS node. So we trick the lastCandidatePtNodePos + // so that we will descend this PtNode, not the previous one. + lastCandidatePtNodePos = startPos; + found = true; + } else { + // Else, we should continue looking. + found = false; + } + } else { + // Even if we don't have children here, we could still be on the last PtNode of / + // this array. If this is the case, we should descend the last PtNode that had + // children, and their position is already in lastCandidatePtNodePos. 
+ found = (1 >= ptNodeCount); + } + + if (found) { + // Okay, we found the PtNode we should descend. Its position is in + // the lastCandidatePtNodePos variable, so we just re-read it. + if (0 != lastCandidatePtNodePos) { + const PatriciaTrieReadingUtils::NodeFlags lastFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition( + mDictRoot, &lastCandidatePtNodePos); + const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &lastCandidatePtNodePos); + // We copy all the characters in this PtNode to the buffer + outCodePoints[wordPos] = lastChar; + if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) { + int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &lastCandidatePtNodePos); + int charCount = maxCodePointCount; + while (-1 != nextChar && --charCount > 0) { + outCodePoints[++wordPos] = nextChar; + nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &lastCandidatePtNodePos); + } + } + ++wordPos; + // Now we only need to branch to the children address. Skip the probability if + // it's there, read pos, and break to resume the search at pos. + if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, + &lastCandidatePtNodePos); + } + pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mDictRoot, lastFlags, &lastCandidatePtNodePos); + break; + } else { + // Here is a little tricky part: we come here if we found out that all children + // addresses in this PtNode are bigger than the address we are searching for. + // Should we conclude the word is not in the dictionary? No! It could still be + // one of the remaining PtNodes in this array, so we have to keep looking in + // this array until we find it (or we realize it's not there either, in which + // case it's actually not in the dictionary). Pass the end of this PtNode, + // ready to start the next one. 
+ if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mDictRoot, flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + mBigramListPolicy.skipAllBigrams(&pos); + } + } + } else { + // If we did not find it, we should record the last children address for the next + // iteration. + if (hasChildren) lastCandidatePtNodePos = startPos; + // Now skip the end of this PtNode (children pos and the attributes if any) so that + // our pos is after the end of this PtNode, at the start of the next one. + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mDictRoot, flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + mBigramListPolicy.skipAllBigrams(&pos); + } + } + + } + } + // If we have looked through all the PtNodes and found no match, the ptNodePos is + // not the position of a terminal in this dictionary. + return 0; +} + +// This function gets the position of the terminal node of the exact matching word in the +// dictionary. If no match is found, it returns NOT_A_DICT_POS. +int PatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const { + int pos = getRootPosition(); + int wordPos = 0; + + while (true) { + // If we already traversed the tree further than the word is long, there means + // there was no match (or we would have found it). + if (wordPos >= length) return NOT_A_DICT_POS; + int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(mDictRoot, + &pos); + const int wChar = forceLowerCaseSearch + ? 
CharUtils::toLowerCase(inWord[wordPos]) : inWord[wordPos]; + while (true) { + // If there are no more PtNodes in this array, it means we could not + // find a matching character for this depth, therefore there is no match. + if (0 >= ptNodeCount) return NOT_A_DICT_POS; + const int ptNodePos = pos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot, + &pos); + if (character == wChar) { + // This is the correct PtNode. Only one PtNode may start with the same char within + // a PtNode array, so either we found our match in this array, or there is + // no match and we can return NOT_A_DICT_POS. So we will check all the + // characters in this PtNode indeed does match. + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot, + &pos); + while (NOT_A_CODE_POINT != character) { + ++wordPos; + // If we shoot the length of the word we search for, or if we find a single + // character that does not match, as explained above, it means the word is + // not in the dictionary (by virtue of this PtNode being the only one to + // match the word on the first character, but not matching the whole word). + if (wordPos >= length) return NOT_A_DICT_POS; + if (inWord[wordPos] != character) return NOT_A_DICT_POS; + character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &pos); + } + } + // If we come here we know that so far, we do match. Either we are on a terminal + // and we match the length, in which case we found it, or we traverse children. + // If we don't match the length AND don't have children, then a word in the + // dictionary fully matches a prefix of the searched word but not the full word. 
+ ++wordPos; + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + if (wordPos == length) { + return ptNodePos; + } + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + if (!PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + return NOT_A_DICT_POS; + } + // We have children and we are still shorter than the word we are searching for, so + // we need to traverse children. Put the pointer on the children position, and + // break + pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, + flags, &pos); + break; + } else { + // This PtNode does not match, so skip the remaining part and go to the next. + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, + &pos); + } + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, + flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + mBigramListPolicy.skipAllBigrams(&pos); + } + } + --ptNodeCount; + } + } +} + +int PatriciaTriePolicy::getProbability(const int unigramProbability, + const int bigramProbability) const { + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (bigramProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } else { + return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, + bigramProbability); + } +} + +int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_PROBABILITY; + } + int pos = ptNodePos; + const PatriciaTrieReadingUtils::NodeFlags flags 
= + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + if (!PatriciaTrieReadingUtils::isTerminal(flags)) { + return NOT_A_PROBABILITY; + } + if (PatriciaTrieReadingUtils::isNotAWord(flags) + || PatriciaTrieReadingUtils::isBlacklisted(flags)) { + // If this is not a word, or if it's a blacklisted entry, it should behave as + // having no probability outside of the suggestion process (where it should be used + // for shortcuts). + return NOT_A_PROBABILITY; + } + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); + return getProbability(PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition( + mDictRoot, &pos), NOT_A_PROBABILITY); +} + +int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + int pos = ptNodePos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + if (!PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + return NOT_A_DICT_POS; + } + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos); + } + return pos; +} + +int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + int pos = ptNodePos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + if (!PatriciaTrieReadingUtils::hasBigrams(flags)) { + return NOT_A_DICT_POS; + } + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); + if (PatriciaTrieReadingUtils::isTerminal(flags)) 
{ + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos);; + } + return pos; +} + +int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode, + const int ptNodePos, DicNodeVector *childDicNodes) const { + int pos = ptNodePos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + int mergedNodeCodePoints[MAX_WORD_LENGTH]; + const int mergedNodeCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( + mDictRoot, flags, MAX_WORD_LENGTH, mergedNodeCodePoints, &pos); + const int probability = (PatriciaTrieReadingUtils::isTerminal(flags))? + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos) + : NOT_A_PROBABILITY; + const int childrenPos = PatriciaTrieReadingUtils::hasChildrenInFlags(flags) ? + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mDictRoot, flags, &pos) : NOT_A_DICT_POS; + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + getShortcutsStructurePolicy()->skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + getBigramsStructurePolicy()->skipAllBigrams(&pos); + } + if (mergedNodeCodePointCount <= 0) { + AKLOGE("Empty PtNode is not allowed. 
Code point count: %d", mergedNodeCodePointCount); + ASSERT(false); + return pos; + } + childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, + PatriciaTrieReadingUtils::isTerminal(flags), + PatriciaTrieReadingUtils::hasChildrenInFlags(flags), + PatriciaTrieReadingUtils::isBlacklisted(flags) || + PatriciaTrieReadingUtils::isNotAWord(flags), + mergedNodeCodePointCount, mergedNodeCodePoints); + return pos; +} + +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/patricia_trie_policy.h b/src/aosp/suggest/policyimpl/dictionary/patricia_trie_policy.h new file mode 100644 index 0000000..0f8662a --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/patricia_trie_policy.h
@@ -0,0 +1,138 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PATRICIA_TRIE_POLICY_H +#define LATINIME_PATRICIA_TRIE_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" + +namespace latinime { + +class DicNode; +class DicNodeVector; + +class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { + public: + PatriciaTriePolicy(const MmappedBuffer *const buffer) + : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer(), buffer->getBufferSize()), + mDictRoot(mBuffer->getBuffer() + mHeaderPolicy.getSize()), + mDictBufferSize(mBuffer->getBufferSize() - mHeaderPolicy.getSize()), + mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot) {} + + ~PatriciaTriePolicy() { + delete mBuffer; + } + + AK_FORCE_INLINE int getRootPosition() const { + return 0; + } + + void createAndGetAllChildNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const; + + int getCodePointsAndProbabilityAndReturnCodePointCount( + const int terminalNodePos, const int maxCodePointCount, int *const outCodePoints, + int *const 
outUnigramProbability) const; + + int getTerminalNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const; + + int getProbability(const int unigramProbability, const int bigramProbability) const; + + int getUnigramProbabilityOfPtNode(const int ptNodePos) const; + + int getShortcutPositionOfPtNode(const int ptNodePos) const; + + int getBigramsPositionOfPtNode(const int ptNodePos) const; + + const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { + return &mHeaderPolicy; + } + + const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const { + return &mBigramListPolicy; + } + + const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const { + return &mShortcutListPolicy; + } + + bool addUnigramWord(const int *const word, const int length, const int probability) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); + return false; + } + + bool addBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1, const int probability) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); + return false; + } + + bool removeBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); + return false; + } + + void flush(const char *const filePath) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: flush() is called for non-updatable dictionary."); + } + + void flushWithGC(const char *const filePath) { + // This method should not be called for non-updatable dictionary. 
+ AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + } + + bool needsToRunGC(const bool mindsBlockByGC) const { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); + return false; + } + + void getProperty(const char *const query, char *const outResult, + const int maxResultLength) { + // getProperty is not supported for this class. + if (maxResultLength > 0) { + outResult[0] = '\0'; + } + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy); + + const MmappedBuffer *const mBuffer; + const HeaderPolicy mHeaderPolicy; + const uint8_t *const mDictRoot; + const int mDictBufferSize; + const BigramListPolicy mBigramListPolicy; + const ShortcutListPolicy mShortcutListPolicy; + + int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos, + DicNodeVector *const childDicNodes) const; +}; +} // namespace latinime +#endif // LATINIME_PATRICIA_TRIE_POLICY_H
diff --git a/src/aosp/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp b/src/aosp/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp new file mode 100644 index 0000000..7df5581 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp
@@ -0,0 +1,133 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +typedef PatriciaTrieReadingUtils PtReadingUtils; + +const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0; + +// Flag for single/multiple char group +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20; +// Flag for terminal PtNodes +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10; +// Flag for shortcut targets presence +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08; +// Flag for bigram presence +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04; +// Flag for non-words (typically, shortcut only entries) +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02; +// Flag for blacklist +const PtReadingUtils::NodeFlags 
PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; + +/* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); + if (firstByte < 0x80) { + return firstByte; + } else { + return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition( + buffer, pos); + } +} + +/* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); +} + +/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer, + int *const pos) { + return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos); +} + +// Returns the number of read characters. +/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer, + const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) { + int length = 0; + if (hasMultipleChars(flags)) { + length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer, + pos); + } else { + const int codePoint = getCodePointAndAdvancePosition(buffer, pos); + if (codePoint == NOT_A_CODE_POINT) { + // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is + // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR + // when the PtNode has a single code point. + length = 0; + AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x", + *pos - 1, codePoint, buffer[*pos - 1]); + ASSERT(false); + } else if (maxLength > 0) { + outBuffer[0] = codePoint; + length = 1; + } + } + return length; +} + +// Returns the number of skipped characters. 
+/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags, + const int maxLength, int *const pos) { + if (hasMultipleChars(flags)) { + return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos); + } else { + if (maxLength > 0) { + getCodePointAndAdvancePosition(buffer, pos); + return 1; + } else { + return 0; + } + } +} + +/* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer, + int *const pos) { + return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); +} + +/* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition( + const uint8_t *const buffer, const NodeFlags flags, int *const pos) { + const int base = *pos; + int offset = 0; + switch (MASK_CHILDREN_POSITION_TYPE & flags) { + case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE: + offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); + break; + case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES: + offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos); + break; + case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES: + offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos); + break; + default: + // If we come here, it means we asked for the children of a word with + // no children. + return NOT_A_DICT_POS; + } + return base + offset; +} + +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/patricia_trie_reading_utils.h b/src/aosp/suggest/policyimpl/dictionary/patricia_trie_reading_utils.h new file mode 100644 index 0000000..8420ee9 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/patricia_trie_reading_utils.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PATRICIA_TRIE_READING_UTILS_H
+#define LATINIME_PATRICIA_TRIE_READING_UTILS_H
+
+#include <stdint.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Static helpers that decode the fields of a PtNode (patricia trie node) from a raw dictionary
+ * buffer. The reading methods start at *pos and leave *pos just past the bytes they consumed.
+ */
+class PatriciaTrieReadingUtils {
+ public:
+    typedef uint8_t NodeFlags;
+
+    // Reads the size of a PtNode array.
+    static int getPtNodeArraySizeAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+    // Reads the flags byte of a PtNode.
+    static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+    // Reads a single code point.
+    static int getCodePointAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+    // Reads the code points of a PtNode into outBuffer. Returns the number of read characters.
+    static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags,
+            const int maxLength, int *const outBuffer, int *const pos);
+
+    // Skips the code points of a PtNode. Returns the number of skipped characters.
+    static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
+            const int maxLength, int *const pos);
+
+    // Reads the probability field.
+    static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+    // Reads the children position field, whose width is selected by flags, and returns the
+    // position of the children. Returns NOT_A_DICT_POS when flags encode no children position.
+    static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer,
+            const NodeFlags flags, int *const pos);
+
+    /**
+     * Node Flags
+     */
+    static AK_FORCE_INLINE bool isBlacklisted(const NodeFlags flags) {
+        return (flags & FLAG_IS_BLACKLISTED) != 0;
+    }
+
+    static AK_FORCE_INLINE bool isNotAWord(const NodeFlags flags) {
+        return (flags & FLAG_IS_NOT_A_WORD) != 0;
+    }
+
+    static AK_FORCE_INLINE bool isTerminal(const NodeFlags flags) {
+        return (flags & FLAG_IS_TERMINAL) != 0;
+    }
+
+    static AK_FORCE_INLINE bool hasShortcutTargets(const NodeFlags flags) {
+        return (flags & FLAG_HAS_SHORTCUT_TARGETS) != 0;
+    }
+
+    static AK_FORCE_INLINE bool hasBigrams(const NodeFlags flags) {
+        return (flags & FLAG_HAS_BIGRAMS) != 0;
+    }
+
+    static AK_FORCE_INLINE bool hasMultipleChars(const NodeFlags flags) {
+        return (flags & FLAG_HAS_MULTIPLE_CHARS) != 0;
+    }
+
+    // Returns whether the children position type stored in flags is anything but "no position".
+    static AK_FORCE_INLINE bool hasChildrenInFlags(const NodeFlags flags) {
+        return FLAG_CHILDREN_POSITION_TYPE_NOPOSITION != (MASK_CHILDREN_POSITION_TYPE & flags);
+    }
+
+    // Packs the given attributes into a flags byte. A childrenPositionFieldSize of 1, 2 or 3
+    // selects the matching children position type; any other value selects "no position".
+    static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isBlacklisted,
+            const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets,
+            const bool hasBigrams, const bool hasMultipleChars,
+            const int childrenPositionFieldSize) {
+        NodeFlags nodeFlags = 0;
+        nodeFlags = isBlacklisted ? (nodeFlags | FLAG_IS_BLACKLISTED) : nodeFlags;
+        nodeFlags = isNotAWord ? (nodeFlags | FLAG_IS_NOT_A_WORD) : nodeFlags;
+        nodeFlags = isTerminal ? (nodeFlags | FLAG_IS_TERMINAL) : nodeFlags;
+        nodeFlags = hasShortcutTargets ? (nodeFlags | FLAG_HAS_SHORTCUT_TARGETS) : nodeFlags;
+        nodeFlags = hasBigrams ? (nodeFlags | FLAG_HAS_BIGRAMS) : nodeFlags;
+        nodeFlags = hasMultipleChars ? (nodeFlags | FLAG_HAS_MULTIPLE_CHARS) : nodeFlags;
+        if (childrenPositionFieldSize == 1) {
+            nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_ONEBYTE;
+        } else if (childrenPositionFieldSize == 2) {
+            nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_TWOBYTES;
+        } else if (childrenPositionFieldSize == 3) {
+            nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_THREEBYTES;
+        } else {
+            nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_NOPOSITION;
+        }
+        return nodeFlags;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils);
+
+    static const NodeFlags MASK_CHILDREN_POSITION_TYPE;
+    static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_NOPOSITION;
+    static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_ONEBYTE;
+    static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_TWOBYTES;
+    static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_THREEBYTES;
+
+    static const NodeFlags FLAG_HAS_MULTIPLE_CHARS;
+    static const NodeFlags FLAG_IS_TERMINAL;
+    static const NodeFlags FLAG_HAS_SHORTCUT_TARGETS;
+    static const NodeFlags FLAG_HAS_BIGRAMS;
+    static const NodeFlags FLAG_IS_NOT_A_WORD;
+    static const NodeFlags FLAG_IS_BLACKLISTED;
+};
+} // namespace latinime
+#endif /* LATINIME_PATRICIA_TRIE_READING_UTILS_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h b/src/aosp/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h new file mode 100644 index 0000000..bd3211f --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H
+#define LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H
+
+#include <stdint.h>
+
+#include "defines.h"
+#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
+#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h"
+#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+/*
+ * This is a dynamic version of ShortcutListPolicy and supports an additional buffer.
+ *
+ * Positions at or beyond the original buffer size refer to the additional buffer. Methods
+ * convert such a position to an offset inside the additional buffer while reading and convert
+ * it back before returning.
+ */
+class DynamicShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
+ public:
+    explicit DynamicShortcutListPolicy(const BufferWithExtendableBuffer *const buffer)
+            : mBuffer(buffer) {}
+
+    ~DynamicShortcutListPolicy() {}
+
+    // Returns the position just after the list size field at pos, or NOT_A_DICT_POS when pos is
+    // NOT_A_DICT_POS.
+    int getStartPos(const int pos) const {
+        if (pos == NOT_A_DICT_POS) {
+            return NOT_A_DICT_POS;
+        }
+        return pos + ShortcutListReadingUtils::getShortcutListSizeFieldSize();
+    }
+
+    // Reads the shortcut entry at *pos. Every out-parameter except pos may be null.
+    // CAVEAT: The shortcut target is consumed only when outCodePoint is non-null; otherwise
+    // *pos is left right after the flags byte.
+    void getNextShortcut(const int maxCodePointCount, int *const outCodePoint,
+            int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext,
+            int *const pos) const {
+        const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos);
+        const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer);
+        if (usesAdditionalBuffer) {
+            *pos -= mBuffer->getOriginalBufferSize();
+        }
+        const ShortcutListReadingUtils::ShortcutFlags flags =
+                ShortcutListReadingUtils::getFlagsAndForwardPointer(buffer, pos);
+        if (outHasNext) {
+            *outHasNext = ShortcutListReadingUtils::hasNext(flags);
+        }
+        if (outIsWhitelist) {
+            *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(flags);
+        }
+        if (outCodePoint) {
+            const int codePointCount = ShortcutListReadingUtils::readShortcutTarget(
+                    buffer, maxCodePointCount, outCodePoint, pos);
+            // Do not dereference the count pointer blindly: it is optional like the others.
+            if (outCodePointCount) {
+                *outCodePointCount = codePointCount;
+            }
+        }
+        if (usesAdditionalBuffer) {
+            *pos += mBuffer->getOriginalBufferSize();
+        }
+    }
+
+    // Moves *pos from the list size field to just past the whole shortcut list.
+    void skipAllShortcuts(int *const pos) const {
+        const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos);
+        const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer);
+        if (usesAdditionalBuffer) {
+            *pos -= mBuffer->getOriginalBufferSize();
+        }
+        const int shortcutListSize = ShortcutListReadingUtils
+                ::getShortcutListSizeAndForwardPointer(buffer, pos);
+        *pos += shortcutListSize;
+        if (usesAdditionalBuffer) {
+            *pos += mBuffer->getOriginalBufferSize();
+        }
+    }
+
+    // Copy shortcuts from the shortcut list that starts at fromPos in mBuffer to toPos in
+    // bufferToWrite and advance these positions after the shortcut lists. This returns whether
+    // the copy succeeded or not. When a write fails, copying stops there, but *fromPos is still
+    // converted back to a position in the whole buffer before returning.
+    bool copyAllShortcutsAndReturnIfSucceededOrNot(BufferWithExtendableBuffer *const bufferToWrite,
+            int *const fromPos, int *const toPos) const {
+        const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos);
+        if (usesAdditionalBuffer) {
+            *fromPos -= mBuffer->getOriginalBufferSize();
+        }
+        const int shortcutListSize = ShortcutListReadingUtils
+                ::getShortcutListSizeAndForwardPointer(mBuffer->getBuffer(usesAdditionalBuffer),
+                        fromPos);
+        // Copy shortcut list size. The value written counts the size field itself again.
+        bool succeeded = bufferToWrite->writeUintAndAdvancePosition(
+                shortcutListSize + ShortcutListReadingUtils::getShortcutListSizeFieldSize(),
+                ShortcutListReadingUtils::getShortcutListSizeFieldSize(), toPos);
+        // Copy shortcut list. The source pointer is re-fetched for every byte, not cached.
+        // NOTE(review): presumably because a write may move the additional buffer when
+        // bufferToWrite and mBuffer are the same object - confirm before hoisting it.
+        for (int i = 0; succeeded && i < shortcutListSize; ++i) {
+            const uint8_t data = ByteArrayUtils::readUint8AndAdvancePosition(
+                    mBuffer->getBuffer(usesAdditionalBuffer), fromPos);
+            succeeded = bufferToWrite->writeUintAndAdvancePosition(data, 1 /* size */, toPos);
+        }
+        // Convert the position back on every path, including failures.
+        if (usesAdditionalBuffer) {
+            *fromPos += mBuffer->getOriginalBufferSize();
+        }
+        return succeeded;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicShortcutListPolicy);
+
+    const BufferWithExtendableBuffer *const mBuffer;
+};
+} // namespace latinime
+#endif // LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H
diff --git a/src/aosp/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h b/src/aosp/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h new file mode 100644 index 0000000..d73f739 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_SHORTCUT_LIST_POLICY_H
+#define LATINIME_SHORTCUT_LIST_POLICY_H
+
+#include <stdint.h>
+
+#include "defines.h"
+#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
+#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h"
+
+namespace latinime {
+
+// Reads shortcut lists out of a single raw buffer.
+class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
+ public:
+    explicit ShortcutListPolicy(const uint8_t *const shortcutBuf)
+            : mShortcutsBuf(shortcutBuf) {}
+
+    ~ShortcutListPolicy() {}
+
+    // Returns the position just after the list size field at pos, or NOT_A_DICT_POS when pos is
+    // NOT_A_DICT_POS.
+    int getStartPos(const int pos) const {
+        if (pos == NOT_A_DICT_POS) {
+            return NOT_A_DICT_POS;
+        }
+        int listPos = pos;
+        ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mShortcutsBuf, &listPos);
+        return listPos;
+    }
+
+    // Reads the shortcut entry at *pos. Every out-parameter except pos may be null.
+    // CAVEAT: The shortcut target is consumed only when outCodePoint is non-null; otherwise
+    // *pos is left right after the flags byte.
+    void getNextShortcut(const int maxCodePointCount, int *const outCodePoint,
+            int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext,
+            int *const pos) const {
+        const ShortcutListReadingUtils::ShortcutFlags flags =
+                ShortcutListReadingUtils::getFlagsAndForwardPointer(mShortcutsBuf, pos);
+        if (outHasNext) {
+            *outHasNext = ShortcutListReadingUtils::hasNext(flags);
+        }
+        if (outIsWhitelist) {
+            *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(flags);
+        }
+        if (outCodePoint) {
+            const int codePointCount = ShortcutListReadingUtils::readShortcutTarget(
+                    mShortcutsBuf, maxCodePointCount, outCodePoint, pos);
+            // Do not dereference the count pointer blindly: it is optional like the others.
+            if (outCodePointCount) {
+                *outCodePointCount = codePointCount;
+            }
+        }
+    }
+
+    // Moves *pos from the list size field to just past the whole shortcut list.
+    void skipAllShortcuts(int *const pos) const {
+        const int shortcutListSize = ShortcutListReadingUtils
+                ::getShortcutListSizeAndForwardPointer(mShortcutsBuf, pos);
+        *pos += shortcutListSize;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListPolicy);
+
+    const uint8_t *const mShortcutsBuf;
+};
+} // namespace latinime
+#endif // LATINIME_SHORTCUT_LIST_POLICY_H
diff --git a/src/aosp/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp b/src/aosp/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp new file mode 100644 index 0000000..847dcde --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h"
+
+#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+// Set in the flags byte when another shortcut entry follows this one.
+const ShortcutListReadingUtils::ShortcutFlags
+        ShortcutListReadingUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
+// The low 4 bits of the flags byte hold the shortcut probability.
+const ShortcutListReadingUtils::ShortcutFlags
+        ShortcutListReadingUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F;
+// Width in bytes of the size field that starts a shortcut list.
+const int ShortcutListReadingUtils::SHORTCUT_LIST_SIZE_FIELD_SIZE = 2;
+// A shortcut whose probability equals this value is a whitelist entry.
+const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15;
+
+/* static */ ShortcutListReadingUtils::ShortcutFlags
+        ShortcutListReadingUtils::getFlagsAndForwardPointer(const uint8_t *const dictRoot,
+                int *const pos) {
+    const ShortcutFlags shortcutFlags = ByteArrayUtils::readUint8AndAdvancePosition(dictRoot, pos);
+    return shortcutFlags;
+}
+
+/* static */ int ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(
+        const uint8_t *const dictRoot, int *const pos) {
+    // The stored 16-bit value counts the size field itself, so take the field size back out.
+    const int storedSize = ByteArrayUtils::readUint16AndAdvancePosition(dictRoot, pos);
+    return storedSize - SHORTCUT_LIST_SIZE_FIELD_SIZE;
+}
+
+/* static */ int ShortcutListReadingUtils::readShortcutTarget(
+        const uint8_t *const dictRoot, const int maxLength, int *const outWord, int *const pos) {
+    const int codePointCount =
+            ByteArrayUtils::readStringAndAdvancePosition(dictRoot, maxLength, outWord, pos);
+    return codePointCount;
+}
+
+} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h b/src/aosp/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h new file mode 100644 index 0000000..a83ed5a --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_SHORTCUT_LIST_READING_UTILS_H
+#define LATINIME_SHORTCUT_LIST_READING_UTILS_H
+
+#include <stdint.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+// Static helpers that decode shortcut lists from a raw dictionary buffer.
+class ShortcutListReadingUtils {
+ public:
+    typedef uint8_t ShortcutFlags;
+
+    // Reads the flags byte of the shortcut entry at *pos and moves *pos past it.
+    static ShortcutFlags getFlagsAndForwardPointer(const uint8_t *const dictRoot, int *const pos);
+
+    static AK_FORCE_INLINE int getProbabilityFromFlags(const ShortcutFlags flags) {
+        const int probability = flags & MASK_ATTRIBUTE_PROBABILITY;
+        return probability;
+    }
+
+    static AK_FORCE_INLINE bool hasNext(const ShortcutFlags flags) {
+        return 0 != (flags & FLAG_ATTRIBUTE_HAS_NEXT);
+    }
+
+    // Reads the list size field at *pos and moves *pos past it. The returned size covers the
+    // shortcut list region only; the size field at the beginning is not counted.
+    static int getShortcutListSizeAndForwardPointer(const uint8_t *const dictRoot, int *const pos);
+
+    static AK_FORCE_INLINE int getShortcutListSizeFieldSize() {
+        return SHORTCUT_LIST_SIZE_FIELD_SIZE;
+    }
+
+    // Moves *pos from the list size field to just past the whole shortcut list.
+    static AK_FORCE_INLINE void skipShortcuts(const uint8_t *const dictRoot, int *const pos) {
+        const int listSize = getShortcutListSizeAndForwardPointer(dictRoot, pos);
+        *pos += listSize;
+    }
+
+    static AK_FORCE_INLINE bool isWhitelist(const ShortcutFlags flags) {
+        return WHITELIST_SHORTCUT_PROBABILITY == getProbabilityFromFlags(flags);
+    }
+
+    // Reads the shortcut target string at *pos into outWord and moves *pos past it.
+    static int readShortcutTarget(const uint8_t *const dictRoot, const int maxLength,
+            int *const outWord, int *const pos);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListReadingUtils);
+
+    static const ShortcutFlags FLAG_ATTRIBUTE_HAS_NEXT;
+    static const ShortcutFlags MASK_ATTRIBUTE_PROBABILITY;
+    static const int SHORTCUT_LIST_SIZE_FIELD_SIZE;
+    static const int WHITELIST_SHORTCUT_PROBABILITY;
+};
+} // namespace latinime
+#endif // LATINIME_SHORTCUT_LIST_READING_UTILS_H
diff --git a/src/aosp/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp b/src/aosp/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp new file mode 100644 index 0000000..f692882 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+const size_t BufferWithExtendableBuffer::MAX_ADDITIONAL_BUFFER_SIZE = 1024 * 1024;
+const int BufferWithExtendableBuffer::NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE = 90;
+// TODO: Needs to allocate larger memory corresponding to the current vector size.
+const size_t BufferWithExtendableBuffer::EXTEND_ADDITIONAL_BUFFER_SIZE_STEP = 128 * 1024;
+
+// Writes data as a size-byte (1 to 4) unsigned integer at *pos and advances *pos. Returns false,
+// leaving *pos untouched, when size is invalid or the write is not allowed at *pos.
+bool BufferWithExtendableBuffer::writeUintAndAdvancePosition(const uint32_t data, const int size,
+        int *const pos) {
+    if (!(size >= 1 && size <= 4)) {
+        AKLOGI("writeUintAndAdvancePosition() is called with invalid size: %d", size);
+        ASSERT(false);
+        return false;
+    }
+    if (!checkAndPrepareWriting(*pos, size)) {
+        return false;
+    }
+    // Take the raw pointer only now: checkAndPrepareWriting() may have resized the vector.
+    const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos);
+    uint8_t *const buffer = usesAdditionalBuffer ? &mAdditionalBuffer[0] : mOriginalBuffer;
+    if (usesAdditionalBuffer) {
+        *pos -= mOriginalBufferSize;
+    }
+    ByteArrayUtils::writeUintAndAdvancePosition(buffer, data, size, pos);
+    if (usesAdditionalBuffer) {
+        *pos += mOriginalBufferSize;
+    }
+    return true;
+}
+
+// Writes codePointCount code points, plus a terminator when writesTerminator is set, at *pos and
+// advances *pos. Returns false, leaving *pos untouched, when the write is not allowed at *pos.
+bool BufferWithExtendableBuffer::writeCodePointsAndAdvancePosition(const int *const codePoints,
+        const int codePointCount, const bool writesTerminator, int *const pos) {
+    const size_t size = ByteArrayUtils::calculateRequiredByteCountToStoreCodePoints(
+            codePoints, codePointCount, writesTerminator);
+    if (!checkAndPrepareWriting(*pos, size)) {
+        return false;
+    }
+    const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos);
+    uint8_t *const buffer = usesAdditionalBuffer ? &mAdditionalBuffer[0] : mOriginalBuffer;
+    if (usesAdditionalBuffer) {
+        *pos -= mOriginalBufferSize;
+    }
+    ByteArrayUtils::writeCodePointsAndAdvancePosition(buffer, codePoints, codePointCount,
+            writesTerminator, pos);
+    if (usesAdditionalBuffer) {
+        *pos += mOriginalBufferSize;
+    }
+    return true;
+}
+
+// Grows the additional buffer by one step unless that would exceed the configured maximum.
+bool BufferWithExtendableBuffer::extendBuffer() {
+    const size_t sizeAfterExtending =
+            mAdditionalBuffer.size() + EXTEND_ADDITIONAL_BUFFER_SIZE_STEP;
+    if (sizeAfterExtending > mMaxAdditionalBufferSize) {
+        return false;
+    }
+    mAdditionalBuffer.resize(sizeAfterExtending);
+    return true;
+}
+
+bool BufferWithExtendableBuffer::checkAndPrepareWriting(const int pos, const int size) {
+    if (isInAdditionalBuffer(pos)) {
+        const int tailPosition = getTailPosition();
+        if (pos == tailPosition) {
+            // Append data to the tail. One extension step is not guaranteed to make enough room,
+            // so keep extending until the data fits or the size limit is hit.
+            while (pos + size > static_cast<int>(mAdditionalBuffer.size()) + mOriginalBufferSize) {
+                if (!extendBuffer()) {
+                    return false;
+                }
+            }
+            mUsedAdditionalBufferSize += size;
+        } else if (pos + size > tailPosition) {
+            // The access would go beyond the tail of the used region.
+            return false;
+        }
+    } else {
+        if (pos < 0 || mOriginalBufferSize < pos + size) {
+            // Invalid position or violate the boundary.
+            return false;
+        }
+    }
+    return true;
+}
+
+} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h b/src/aosp/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h new file mode 100644 index 0000000..9dc3482 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H
+#define LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H
+
+#include <cstddef>
+#include <stdint.h>
+#include <vector>
+
+#include "defines.h"
+#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+// A fixed-size original buffer followed by a growable additional buffer, used for updatable
+// dictionaries. Positions below the original buffer size address the original buffer; positions
+// at or above it address the additional buffer.
+// Reading goes through a raw pointer for performance, so callers have to adjust the position
+// themselves to access the additional buffer. Writing goes through methods of this class that
+// check boundaries; no writable raw pointer is exposed.
+class BufferWithExtendableBuffer {
+ public:
+    BufferWithExtendableBuffer(uint8_t *const originalBuffer, const int originalBufferSize,
+            const int maxAdditionalBufferSize = MAX_ADDITIONAL_BUFFER_SIZE)
+            : mOriginalBuffer(originalBuffer), mOriginalBufferSize(originalBufferSize),
+              mAdditionalBuffer(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP), mUsedAdditionalBufferSize(0),
+              mMaxAdditionalBufferSize(maxAdditionalBufferSize) {}
+
+    // Position just past the used region of the additional buffer.
+    AK_FORCE_INLINE int getTailPosition() const {
+        const int tailPosition = mOriginalBufferSize + mUsedAdditionalBufferSize;
+        return tailPosition;
+    }
+
+    AK_FORCE_INLINE int getUsedAdditionalBufferSize() const {
+        return mUsedAdditionalBufferSize;
+    }
+
+    /**
+     * For reading.
+     */
+    AK_FORCE_INLINE bool isInAdditionalBuffer(const int position) const {
+        return mOriginalBufferSize <= position;
+    }
+
+    // TODO: Resolve the issue that the address can be changed when the vector is resized.
+    // CAVEAT!: Be careful about array out of bound access with buffers
+    AK_FORCE_INLINE const uint8_t *getBuffer(const bool usesAdditionalBuffer) const {
+        return usesAdditionalBuffer ? &mAdditionalBuffer[0] : mOriginalBuffer;
+    }
+
+    AK_FORCE_INLINE int getOriginalBufferSize() const {
+        return mOriginalBufferSize;
+    }
+
+    // Whether the allocated additional buffer has reached the threshold percentage of its limit.
+    AK_FORCE_INLINE bool isNearSizeLimit() const {
+        const size_t nearLimitSize =
+                (mMaxAdditionalBufferSize * NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE) / 100;
+        return mAdditionalBuffer.size() >= nearLimitSize;
+    }
+
+    /**
+     * For writing.
+     *
+     * Writing is allowed for original buffer, already written region of additional buffer and the
+     * tail of additional buffer.
+     */
+    bool writeUintAndAdvancePosition(const uint32_t data, const int size, int *const pos);
+
+    bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount,
+            const bool writesTerminator, int *const pos);
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer);
+
+    static const size_t MAX_ADDITIONAL_BUFFER_SIZE;
+    static const int NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE;
+    static const size_t EXTEND_ADDITIONAL_BUFFER_SIZE_STEP;
+
+    uint8_t *const mOriginalBuffer;
+    const int mOriginalBufferSize;
+    std::vector<uint8_t> mAdditionalBuffer;
+    int mUsedAdditionalBufferSize;
+    const size_t mMaxAdditionalBufferSize;
+
+    // Return if the buffer is successfully extended or not.
+    bool extendBuffer();
+
+    // Returns if it is possible to write size-bytes from pos. When pos is at the tail position of
+    // the additional buffer, try extending the buffer.
+    bool checkAndPrepareWriting(const int pos, const int size);
+};
+} // namespace latinime
+#endif /* LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp b/src/aosp/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp new file mode 100644 index 0000000..1833e88 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+// Byte value that terminates a character array.
+const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F;
+// NOTE(review): presumably the inclusive range of bytes that encode a character by themselves.
+const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20;
+const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF;
+
+} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/utils/byte_array_utils.h b/src/aosp/suggest/policyimpl/dictionary/utils/byte_array_utils.h new file mode 100644 index 0000000..0c15768 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/utils/byte_array_utils.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BYTE_ARRAY_UTILS_H
+#define LATINIME_BYTE_ARRAY_UTILS_H
+
+#include <stdint.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Utility methods for reading and writing big endian integers and code points in byte arrays.
+ *
+ * None of these methods checks buffer bounds; callers must make sure that every accessed
+ * position lies inside the buffer.
+ */
+class ByteArrayUtils {
+ public:
+    /**
+     * Integer writing
+     *
+     * Writes the lowest size bytes of data at *pos in a big endian manner and advances *pos by
+     * size. Apart from the ASSERT, a size outside of 1 to 4 writes nothing.
+     */
+    static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer,
+            const uint32_t data, const int size, int *const pos) {
+        // size must be in 1 to 4.
+        ASSERT(size >= 1 && size <= 4);
+        switch (size) {
+            case 1:
+                ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos);
+                return;
+            case 2:
+                ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos);
+                return;
+            case 3:
+                ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos);
+                return;
+            case 4:
+                ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos);
+                return;
+            default:
+                break;
+        }
+    }
+
+    /**
+     * Integer reading
+     *
+     * Each method reads a corresponding size integer in a big endian manner.
+     */
+    static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) {
+        // Widen each byte to uint32_t before shifting. A uint8_t operand is promoted to a signed
+        // int, so shifting a byte >= 0x80 left by 24 would move a bit into the sign bit.
+        return (static_cast<uint32_t>(buffer[pos]) << 24)
+                ^ (static_cast<uint32_t>(buffer[pos + 1]) << 16)
+                ^ (static_cast<uint32_t>(buffer[pos + 2]) << 8)
+                ^ static_cast<uint32_t>(buffer[pos + 3]);
+    }
+
+    static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) {
+        return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2];
+    }
+
+    static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) {
+        return (buffer[pos] << 8) ^ buffer[pos + 1];
+    }
+
+    static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) {
+        return buffer[pos];
+    }
+
+    static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint32_t value = readUint32(buffer, *pos);
+        *pos += 4;
+        return value;
+    }
+
+    // Reads a 3-byte sign-and-magnitude integer: the top bit of the first byte is the sign and
+    // the remaining 23 bits are the magnitude.
+    static AK_FORCE_INLINE int readSint24AndAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint8_t value = readUint8(buffer, *pos);
+        if (value < 0x80) {
+            return readUint24AndAdvancePosition(buffer, pos);
+        } else {
+            (*pos)++;
+            return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos));
+        }
+    }
+
+    static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint32_t value = readUint24(buffer, *pos);
+        *pos += 3;
+        return value;
+    }
+
+    static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint16_t value = readUint16(buffer, *pos);
+        *pos += 2;
+        return value;
+    }
+
+    static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        return buffer[(*pos)++];
+    }
+
+    /**
+     * Code Point Reading
+     *
+     * 1 byte = bbbbbbbb match
+     * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
+     * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
+     *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
+     *       00011111 would be outside unicode.
+     * else: iso-latin-1 code
+     * This allows for the whole unicode range to be encoded, including chars outside of
+     * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
+     * characters which should never happen anyway (and still work, but take 3 bytes).
+     */
+    static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
+        int p = pos;
+        return readCodePointAndAdvancePosition(buffer, &p);
+    }
+
+    // Returns NOT_A_CODE_POINT when the terminator is read.
+    static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint8_t firstByte = readUint8(buffer, *pos);
+        if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
+            if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
+                *pos += 1;
+                return NOT_A_CODE_POINT;
+            } else {
+                return readUint24AndAdvancePosition(buffer, pos);
+            }
+        } else {
+            *pos += 1;
+            return firstByte;
+        }
+    }
+
+    /**
+     * String (array of code points) Reading
+     *
+     * Reads code points until the terminator is found.
+     */
+    // Returns the length of the string.
+    // Note that when the string is longer than maxLength, the code point following the stored
+    // ones has already been consumed, so *pos ends up behind that dropped code point.
+    static int readStringAndAdvancePosition(const uint8_t *const buffer,
+            const int maxLength, int *const outBuffer, int *const pos) {
+        int length = 0;
+        int codePoint = readCodePointAndAdvancePosition(buffer, pos);
+        while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
+            outBuffer[length++] = codePoint;
+            codePoint = readCodePointAndAdvancePosition(buffer, pos);
+        }
+        return length;
+    }
+
+    // Advances the position and returns the length of the string.
+    static int advancePositionToBehindString(
+            const uint8_t *const buffer, const int maxLength, int *const pos) {
+        int length = 0;
+        int codePoint = readCodePointAndAdvancePosition(buffer, pos);
+        while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
+            codePoint = readCodePointAndAdvancePosition(buffer, pos);
+            length++;
+        }
+        return length;
+    }
+
+    /**
+     * String (array of code points) Writing
+     */
+    // Writing stops at the first NOT_A_CODE_POINT or terminator value found in codePoints.
+    static void writeCodePointsAndAdvancePosition(uint8_t *const buffer,
+            const int *const codePoints, const int codePointCount, const bool writesTerminator,
+            int *const pos) {
+        for (int i = 0; i < codePointCount; ++i) {
+            const int codePoint = codePoints[i];
+            if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
+                break;
+            } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
+                    || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
+                // three bytes character.
+                writeUint24AndAdvancePosition(buffer, codePoint, pos);
+            } else {
+                // one byte character.
+                writeUint8AndAdvancePosition(buffer, codePoint, pos);
+            }
+        }
+        if (writesTerminator) {
+            writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos);
+        }
+    }
+
+    // Returns the byte count that writeCodePointsAndAdvancePosition() writes for the same input.
+    static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints,
+            const int codePointCount, const bool writesTerminator) {
+        int byteCount = 0;
+        for (int i = 0; i < codePointCount; ++i) {
+            const int codePoint = codePoints[i];
+            if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
+                break;
+            } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
+                    || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
+                // three bytes character.
+                byteCount += 3;
+            } else {
+                // one byte character.
+                byteCount += 1;
+            }
+        }
+        if (writesTerminator) {
+            // The terminator is one byte.
+            byteCount += 1;
+        }
+        return byteCount;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils);
+
+    static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE;
+    static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE;
+    static const uint8_t CHARACTER_ARRAY_TERMINATOR;
+
+    static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer,
+            const uint32_t data, int *const pos) {
+        buffer[(*pos)++] = (data >> 24) & 0xFF;
+        buffer[(*pos)++] = (data >> 16) & 0xFF;
+        buffer[(*pos)++] = (data >> 8) & 0xFF;
+        buffer[(*pos)++] = data & 0xFF;
+    }
+
+    static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer,
+            const uint32_t data, int *const pos) {
+        buffer[(*pos)++] = (data >> 16) & 0xFF;
+        buffer[(*pos)++] = (data >> 8) & 0xFF;
+        buffer[(*pos)++] = data & 0xFF;
+    }
+
+    static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer,
+            const uint16_t data, int *const pos) {
+        buffer[(*pos)++] = (data >> 8) & 0xFF;
+        buffer[(*pos)++] = data & 0xFF;
+    }
+
+    static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer,
+            const uint8_t data, int *const pos) {
+        buffer[(*pos)++] = data & 0xFF;
+    }
+};
+} // namespace latinime
+#endif /* LATINIME_BYTE_ARRAY_UTILS_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/src/aosp/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp new file mode 100644 index 0000000..994826f --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
+
+#include <cstdio>
+#include <cstring>
+#include <string>
+
+#include "suggest/policyimpl/dictionary/header/header_policy.h"
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h"
+#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "suggest/policyimpl/dictionary/utils/format_utils.h"
+
+namespace latinime {
+
+const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = ".tmp";
+
+// Creates an empty dictionary file of the given format version at filePath. Returns false for
+// unsupported versions or when creating the version 3 file fails.
+/* static */ bool DictFileWritingUtils::createEmptyDictFile(const char *const filePath,
+        const int dictVersion, const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
+    switch (dictVersion) {
+        case 3:
+            return createEmptyV3DictFile(filePath, attributeMap);
+        default:
+            // Only version 3 dictionary is supported for now.
+            return false;
+    }
+}
+
+// Builds an empty header and an empty body in memory and flushes both to filePath.
+/* static */ bool DictFileWritingUtils::createEmptyV3DictFile(const char *const filePath,
+        const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
+    BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
+    HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap);
+    headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
+            true /* updatesLastDecayedTime */, 0 /* unigramCount */, 0 /* bigramCount */,
+            0 /* extendedRegionSize */);
+    BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
+    if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) {
+        return false;
+    }
+    return flushAllHeaderAndBodyToFile(filePath, &headerBuffer, &bodyBuffer);
+}
+
+// Writes dictHeader followed by dictBody to a temporary file and then renames it to filePath.
+// Returns false, after removing the temporary file, when any step after opening it fails.
+/* static */ bool DictFileWritingUtils::flushAllHeaderAndBodyToFile(const char *const filePath,
+        BufferWithExtendableBuffer *const dictHeader, BufferWithExtendableBuffer *const dictBody) {
+    // Name of a temporary file used for writing that is a connected string of original name and
+    // TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE. std::string is used because variable length arrays
+    // are not standard C++.
+    const std::string tmpFileNameString =
+            std::string(filePath) + TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE;
+    const char *const tmpFileName = tmpFileNameString.c_str();
+    FILE *const file = fopen(tmpFileName, "wb");
+    if (!file) {
+        AKLOGE("Dictionary file %s cannnot be opened.", tmpFileName);
+        ASSERT(false);
+        return false;
+    }
+    // Write the dictionary header. writeBufferToFile() closes the file when it fails.
+    if (!writeBufferToFile(file, dictHeader)) {
+        remove(tmpFileName);
+        AKLOGE("Dictionary header cannnot be written. size: %d", dictHeader->getTailPosition());
+        ASSERT(false);
+        return false;
+    }
+    // Write the dictionary body.
+    if (!writeBufferToFile(file, dictBody)) {
+        remove(tmpFileName);
+        AKLOGE("Dictionary body cannnot be written. size: %d", dictBody->getTailPosition());
+        ASSERT(false);
+        return false;
+    }
+    // fclose() flushes buffered data, so a failure means that the file may be incomplete.
+    if (fclose(file) != 0) {
+        remove(tmpFileName);
+        AKLOGE("Dictionary file %s cannot be closed.", tmpFileName);
+        return false;
+    }
+    if (rename(tmpFileName, filePath) != 0) {
+        remove(tmpFileName);
+        AKLOGE("Dictionary file %s cannot be renamed to %s.", tmpFileName, filePath);
+        return false;
+    }
+    return true;
+}
+
+// This closes file pointer when an error is caused and returns whether the writing was succeeded
+// or not.
+/* static */ bool DictFileWritingUtils::writeBufferToFile(FILE *const file,
+        const BufferWithExtendableBuffer *const buffer) {
+    const int originalBufSize = buffer->getOriginalBufferSize();
+    if (originalBufSize > 0 && fwrite(buffer->getBuffer(false /* usesAdditionalBuffer */),
+            originalBufSize, 1, file) < 1) {
+        fclose(file);
+        return false;
+    }
+    const int additionalBufSize = buffer->getUsedAdditionalBufferSize();
+    if (additionalBufSize > 0 && fwrite(buffer->getBuffer(true /* usesAdditionalBuffer */),
+            additionalBufSize, 1, file) < 1) {
+        fclose(file);
+        return false;
+    }
+    return true;
+}
+
+} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h b/src/aosp/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h new file mode 100644 index 0000000..bd4ac66 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICT_FILE_WRITING_UTILS_H
+#define LATINIME_DICT_FILE_WRITING_UTILS_H
+
+#include <cstdio>
+
+#include "defines.h"
+#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+// Static helpers for writing dictionary files.
+class DictFileWritingUtils {
+ public:
+    // Creates an empty dictionary file of format version dictVersion at filePath. Returns
+    // whether the file was created.
+    static bool createEmptyDictFile(const char *const filePath, const int dictVersion,
+            const HeaderReadWriteUtils::AttributeMap *const attributeMap);
+
+    // Writes dictHeader followed by dictBody to filePath through a temporary file.
+    static bool flushAllHeaderAndBodyToFile(const char *const filePath,
+            BufferWithExtendableBuffer *const dictHeader,
+            BufferWithExtendableBuffer *const dictBody);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DictFileWritingUtils);
+
+    // Suffix appended to the destination path to name the temporary file.
+    static const char *const TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE;
+
+    static bool createEmptyV3DictFile(const char *const filePath,
+            const HeaderReadWriteUtils::AttributeMap *const attributeMap);
+
+    // Writes the contents of buffer to file. Returns whether the writing succeeded.
+    static bool writeBufferToFile(FILE *const file,
+            const BufferWithExtendableBuffer *const buffer);
+};
+} // namespace latinime
+#endif /* LATINIME_DICT_FILE_WRITING_UTILS_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/src/aosp/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp new file mode 100644 index 0000000..1632fd0 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cmath>
+#include <ctime>
+#include <stdlib.h>
+
+#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
+
+#include "suggest/core/policy/dictionary_header_structure_policy.h"
+#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
+
+namespace latinime {
+
+const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT = 12000;
+const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000;
+const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000;
+const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
+
+const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127;
+const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15;
+const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3;
+const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1;
+// Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected
+// duration of the decay is approximately 66hours.
+const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f;
+const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
+
+const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable;
+ForgettingCurveUtils::TimeKeeper ForgettingCurveUtils::sTimeKeeper;
+
+void ForgettingCurveUtils::TimeKeeper::setCurrentTime() {
+    mCurrentTime = time(0);
+}
+
+// Returns NOT_A_PROBABILITY when encodedUnigramProbability is NOT_A_PROBABILITY. Without a bigram
+// probability, the decoded unigram probability is backed off. Otherwise the larger one of the
+// decoded probabilities is returned, capped at MAX_COMPUTED_PROBABILITY.
+/* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability,
+        const int encodedBigramProbability) {
+    if (encodedUnigramProbability == NOT_A_PROBABILITY) {
+        return NOT_A_PROBABILITY;
+    } else if (encodedBigramProbability == NOT_A_PROBABILITY) {
+        return backoff(decodeProbability(encodedUnigramProbability));
+    } else {
+        const int unigramProbability = decodeProbability(encodedUnigramProbability);
+        const int bigramProbability = decodeProbability(encodedBigramProbability);
+        return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY);
+    }
+}
+
+// Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding
+// (i.e. unigram probability + bigram probability delta).
+/* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability(
+        const int originalEncodedProbability, const int newProbability) {
+    if (originalEncodedProbability == NOT_A_PROBABILITY) {
+        // The bigram relation is not in this dictionary.
+        if (newProbability == NOT_A_PROBABILITY) {
+            // The bigram target is not in other dictionaries.
+            return 0;
+        } else {
+            return MIN_VALID_ENCODED_PROBABILITY;
+        }
+    } else {
+        if (newProbability != NOT_A_PROBABILITY
+                && originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) {
+            return MIN_VALID_ENCODED_PROBABILITY;
+        }
+        return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY);
+    }
+}
+
+/* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) {
+    return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY;
+}
+
+// Returns the encoded probability after applying one randomized decay step per
+// DECAY_INTERVAL_SECONDS elapsed since the last decayed time (at least one step).
+/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability,
+        const DictionaryHeaderStructurePolicy *const headerPolicy) {
+    const int elapsedTime = sTimeKeeper.peekCurrentTime() - headerPolicy->getLastDecayedTime();
+    const int decayIterationCount = max(elapsedTime / DECAY_INTERVAL_SECONDS, 1);
+    int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0);
+    // TODO: Implement the decay in more proper way.
+    for (int i = 0; i < decayIterationCount; ++i) {
+        const float currentRate = static_cast<float>(currentEncodedProbability)
+                / static_cast<float>(MAX_ENCODED_PROBABILITY);
+        const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate;
+        const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+        if (thresholdToDecay < randValue) {
+            currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP,
+                    0);
+        }
+    }
+    return currentEncodedProbability;
+}
+
+// Returns whether decay is needed: always when a count limit is reached; otherwise only when
+// mindsBlockByDecay is false and DECAY_INTERVAL_SECONDS have passed since the last decayed time.
+/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay,
+        const int unigramCount, const int bigramCount,
+        const DictionaryHeaderStructurePolicy *const headerPolicy) {
+    if (unigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) {
+        // Unigram count exceeds the limit.
+        return true;
+    } else if (bigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) {
+        // Bigram count exceeds the limit.
+        return true;
+    }
+    if (mindsBlockByDecay) {
+        return false;
+    }
+    if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS < time(0)) {
+        // Time to decay.
+        return true;
+    }
+    return false;
+}
+
+// Converts an encoded probability to a computed probability using sProbabilityTable. Returns
+// NOT_A_PROBABILITY for values below MIN_VALID_ENCODED_PROBABILITY.
+/* static */ int ForgettingCurveUtils::decodeProbability(const int encodedProbability) {
+    if (encodedProbability < MIN_VALID_ENCODED_PROBABILITY) {
+        return NOT_A_PROBABILITY;
+    } else {
+        // Clamp with the limit of the decoded range. Clamping with MAX_ENCODED_PROBABILITY would
+        // flatten every table entry above 15.
+        return min(sProbabilityTable.getProbability(encodedProbability), MAX_COMPUTED_PROBABILITY);
+    }
+}
+
+// See comments in ProbabilityUtils::backoff().
+/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) {
+    if (unigramProbability == NOT_A_PROBABILITY) {
+        return NOT_A_PROBABILITY;
+    } else {
+        return max(unigramProbability - 8, 0);
+    }
+}
+
+ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTable() {
+    // Table entry is as follows:
+    // 1, 1, 1, 2, 3, 5, 6, 9, 13, 18, 25, 34, 48, 66, 91, 127.
+    // Note that first MIN_VALID_ENCODED_PROBABILITY values are not used.
+    mTable.resize(MAX_ENCODED_PROBABILITY + 1);
+    for (int i = 0; i <= MAX_ENCODED_PROBABILITY; ++i) {
+        const int probability = static_cast<int>(powf(static_cast<float>(MAX_COMPUTED_PROBABILITY),
+                static_cast<float>(i) / static_cast<float>(MAX_ENCODED_PROBABILITY)));
+        mTable[i] = min(MAX_COMPUTED_PROBABILITY, max(0, probability));
+    }
+}
+
+} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/src/aosp/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h new file mode 100644 index 0000000..2ad4238 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_FORGETTING_CURVE_UTILS_H
+#define LATINIME_FORGETTING_CURVE_UTILS_H
+
+#include <vector>
+
+#include "defines.h"
+
+namespace latinime {
+
+class DictionaryHeaderStructurePolicy;
+
+// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is
+// required to introduced to each terminal PtNode and bigram entry.
+// TODO: Quit using bigram probability to indicate the delta.
+class ForgettingCurveUtils {
+ public:
+    // Keeps the timestamp taken by the last setCurrentTime() call so that it can be read
+    // repeatedly through peekCurrentTime().
+    class TimeKeeper {
+     public:
+        TimeKeeper() : mCurrentTime(0) {}
+        void setCurrentTime();
+        int peekCurrentTime() const { return mCurrentTime; }
+
+     private:
+        DISALLOW_COPY_AND_ASSIGN(TimeKeeper);
+
+        int mCurrentTime;
+    };
+
+    static const int MAX_UNIGRAM_COUNT;
+    static const int MAX_UNIGRAM_COUNT_AFTER_GC;
+    static const int MAX_BIGRAM_COUNT;
+    static const int MAX_BIGRAM_COUNT_AFTER_GC;
+
+    static TimeKeeper sTimeKeeper;
+
+    static int getProbability(const int encodedUnigramProbability,
+            const int encodedBigramProbability);
+
+    static int getUpdatedEncodedProbability(const int originalEncodedProbability,
+            const int newProbability);
+
+    // Returns non-zero when encodedProbability is at least MIN_VALID_ENCODED_PROBABILITY.
+    static int isValidEncodedProbability(const int encodedProbability);
+
+    static int getEncodedProbabilityToSave(const int encodedProbability,
+            const DictionaryHeaderStructurePolicy *const headerPolicy);
+
+    static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount,
+            const int bigramCount, const DictionaryHeaderStructurePolicy *const headerPolicy);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils);
+
+    // Lookup table that maps encoded probabilities to computed probabilities.
+    class ProbabilityTable {
+     public:
+        ProbabilityTable();
+
+        int getProbability(const int encodedProbability) const {
+            // Valid indices are 0 to size - 1, so an index equal to the table size must be
+            // rejected too.
+            if (encodedProbability < 0 || encodedProbability >= static_cast<int>(mTable.size())) {
+                return NOT_A_PROBABILITY;
+            }
+            return mTable[encodedProbability];
+        }
+
+     private:
+        DISALLOW_COPY_AND_ASSIGN(ProbabilityTable);
+
+        std::vector<int> mTable;
+    };
+
+    static const int MAX_COMPUTED_PROBABILITY;
+    static const int MAX_ENCODED_PROBABILITY;
+    static const int MIN_VALID_ENCODED_PROBABILITY;
+    static const int ENCODED_PROBABILITY_STEP;
+    static const float MIN_PROBABILITY_TO_DECAY;
+    static const int DECAY_INTERVAL_SECONDS;
+
+    static const ProbabilityTable sProbabilityTable;
+
+    static int decodeProbability(const int encodedProbability);
+
+    static int backoff(const int unigramProbability);
+};
+} // namespace latinime
+#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/utils/format_utils.cpp b/src/aosp/suggest/policyimpl/dictionary/utils/format_utils.cpp new file mode 100644 index 0000000..1d77d5c --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/utils/format_utils.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/utils/format_utils.h"
+
+#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE;
+
+// Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12
+const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
+
+// Detects the format version from the beginning of a dictionary buffer, which is laid out as:
+//   Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
+//   Dictionary format version number (2 bytes)
+//   Options (2 bytes)
+//   Header size (4 bytes) : integer, big endian
+// Returns UNKNOWN_VERSION when the buffer is too small, the magic number does not match or the
+// version number is neither 2 nor 3.
+/* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion(
+        const uint8_t *const dict, const int dictSize) {
+    // A buffer smaller than the fixed-size header fields cannot be a dictionary.
+    if (dictSize < DICTIONARY_MINIMUM_SIZE) {
+        return UNKNOWN_VERSION;
+    }
+    // The magic number is stored big-endian.
+    if (ByteArrayUtils::readUint32(dict, 0) != MAGIC_NUMBER) {
+        return UNKNOWN_VERSION;
+    }
+    switch (ByteArrayUtils::readUint16(dict, 4)) {
+        case 2:
+            return VERSION_2;
+        case 3:
+            return VERSION_3;
+        default:
+            return UNKNOWN_VERSION;
+    }
+}
+
+} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/dictionary/utils/format_utils.h b/src/aosp/suggest/policyimpl/dictionary/utils/format_utils.h new file mode 100644 index 0000000..79ed0de --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/utils/format_utils.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_FORMAT_UTILS_H
+#define LATINIME_FORMAT_UTILS_H
+
+#include <stdint.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Methods to handle binary dictionary format version.
+ */
+class FormatUtils {
+ public:
+    // Format versions that detectFormatVersion() can report.
+    enum FORMAT_VERSION {
+        VERSION_2,
+        VERSION_3,
+        UNKNOWN_VERSION
+    };
+
+    // 32 bit magic number is stored at the beginning of the dictionary header to reject
+    // unsupported or obsolete dictionary formats.
+    static const uint32_t MAGIC_NUMBER;
+
+    // Inspects the first bytes of dict and returns the detected version, or UNKNOWN_VERSION when
+    // the buffer cannot be identified as a supported dictionary.
+    static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils);
+
+    // Minimum size, in bytes, of a buffer that detectFormatVersion() inspects.
+    static const int DICTIONARY_MINIMUM_SIZE;
+};
+} // namespace latinime
+#endif /* LATINIME_FORMAT_UTILS_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/utils/mmapped_buffer.h b/src/aosp/suggest/policyimpl/dictionary/utils/mmapped_buffer.h new file mode 100644 index 0000000..6b69116 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/utils/mmapped_buffer.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_MMAPPED_BUFFER_H
+#define LATINIME_MMAPPED_BUFFER_H
+
+#include <cerrno>
+#include <fcntl.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+// Maps a region of a file into memory and keeps the mapping and the file descriptor until the
+// instance is destroyed.
+class MmappedBuffer {
+ public:
+    // Opens path and maps bufferSize bytes starting at bufferOffset. The file is opened and
+    // mapped with write permission only when isUpdatable is true. Returns 0 on failure; on
+    // success the caller receives a newly allocated instance.
+    static MmappedBuffer* openBuffer(const char *const path, const int bufferOffset,
+            const int bufferSize, const bool isUpdatable) {
+        const int openMode = isUpdatable ? O_RDWR : O_RDONLY;
+        const int mmapFd = open(path, openMode);
+        if (mmapFd < 0) {
+            AKLOGE("DICT: Can't open the source. path=%s errno=%d", path, errno);
+            return 0;
+        }
+        // The file offset passed to mmap is rounded down to a page boundary, so the mapping
+        // starts up to offset bytes before the requested position.
+        const int pagesize = getpagesize();
+        const int offset = bufferOffset % pagesize;
+        int alignedOffset = bufferOffset - offset;
+        int alignedSize = bufferSize + offset;
+        const int protMode = isUpdatable ? PROT_READ | PROT_WRITE : PROT_READ;
+        void *const mmappedBuffer = mmap(0, alignedSize, protMode, MAP_PRIVATE, mmapFd,
+                alignedOffset);
+        if (mmappedBuffer == MAP_FAILED) {
+            AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno);
+            close(mmapFd);
+            return 0;
+        }
+        uint8_t *const buffer = static_cast<uint8_t *>(mmappedBuffer) + offset;
+        if (!buffer) {
+            AKLOGE("DICT: buffer is null");
+            // Release the mapping as well; otherwise it would leak on this path.
+            munmap(mmappedBuffer, alignedSize);
+            close(mmapFd);
+            return 0;
+        }
+        return new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize, mmapFd,
+                isUpdatable);
+    }
+
+    ~MmappedBuffer() {
+        int ret = munmap(mMmappedBuffer, mAlignedSize);
+        if (ret != 0) {
+            AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno);
+        }
+        ret = close(mMmapFd);
+        if (ret != 0) {
+            AKLOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno);
+        }
+    }
+
+    AK_FORCE_INLINE uint8_t *getBuffer() const {
+        return mBuffer;
+    }
+
+    AK_FORCE_INLINE int getBufferSize() const {
+        return mBufferSize;
+    }
+
+    AK_FORCE_INLINE bool isUpdatable() const {
+        return mIsUpdatable;
+    }
+
+ private:
+    AK_FORCE_INLINE MmappedBuffer(uint8_t *const buffer, const int bufferSize,
+            void *const mmappedBuffer, const int alignedSize, const int mmapFd,
+            const bool isUpdatable)
+            : mBuffer(buffer), mBufferSize(bufferSize), mMmappedBuffer(mmappedBuffer),
+              mAlignedSize(alignedSize), mMmapFd(mmapFd), mIsUpdatable(isUpdatable) {}
+
+    DISALLOW_IMPLICIT_CONSTRUCTORS(MmappedBuffer);
+
+    uint8_t *const mBuffer;
+    const int mBufferSize;
+    void *const mMmappedBuffer;
+    const int mAlignedSize;
+    const int mMmapFd;
+    const bool mIsUpdatable;
+};
+} // namespace latinime
+#endif /* LATINIME_MMAPPED_BUFFER_H */
diff --git a/src/aosp/suggest/policyimpl/dictionary/utils/probability_utils.h b/src/aosp/suggest/policyimpl/dictionary/utils/probability_utils.h new file mode 100644 index 0000000..21fe355 --- /dev/null +++ b/src/aosp/suggest/policyimpl/dictionary/utils/probability_utils.h
@@ -0,0 +1,55 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROBABILITY_UTILS_H +#define LATINIME_PROBABILITY_UTILS_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class ProbabilityUtils { + public: + static AK_FORCE_INLINE int backoff(const int unigramProbability) { + return unigramProbability; + // For some reason, applying the backoff weight gives bad results in tests. To apply the + // backoff weight, we divide the probability by 2, which in our storing format means + // decreasing the score by 8. + // TODO: figure out what's wrong with this. + // return unigramProbability > 8 ? + // unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); + } + + static AK_FORCE_INLINE int computeProbabilityForBigram( + const int unigramProbability, const int bigramProbability) { + // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want + // the unigram probability to be the median value of the 17th step from the top. A value of + // 0 for the bigram probability represents the middle of the 16th step from the top, + // while a value of 15 represents the middle of the top step. + // See makedict.BinaryDictEncoder#makeBigramFlags for details. 
+ const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) + / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); + return unigramProbability + + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils); +}; +} +#endif /* LATINIME_PROBABILITY_UTILS_H */
diff --git a/src/aosp/suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp b/src/aosp/suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp new file mode 100644 index 0000000..6d31739 --- /dev/null +++ b/src/aosp/suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp
@@ -0,0 +1,21 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gesture_suggest_policy_factory.h" + +namespace latinime { + const SuggestPolicy *(*GestureSuggestPolicyFactory::sGestureSuggestFactoryMethod)() = 0; +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/gesture/gesture_suggest_policy_factory.h b/src/aosp/suggest/policyimpl/gesture/gesture_suggest_policy_factory.h new file mode 100644 index 0000000..509b01f --- /dev/null +++ b/src/aosp/suggest/policyimpl/gesture/gesture_suggest_policy_factory.h
@@ -0,0 +1,44 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_GESTURE_SUGGEST_POLICY_FACTORY_H +#define LATINIME_GESTURE_SUGGEST_POLICY_FACTORY_H + +#include "defines.h" + +namespace latinime { + +class SuggestPolicy; + +class GestureSuggestPolicyFactory { + public: + static void setGestureSuggestPolicyFactoryMethod(const SuggestPolicy *(*factoryMethod)()) { + sGestureSuggestFactoryMethod = factoryMethod; + } + + static const SuggestPolicy *getGestureSuggestPolicy() { + if (!sGestureSuggestFactoryMethod) { + return 0; + } + return sGestureSuggestFactoryMethod(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(GestureSuggestPolicyFactory); + static const SuggestPolicy *(*sGestureSuggestFactoryMethod)(); +}; +} // namespace latinime +#endif // LATINIME_GESTURE_SUGGEST_POLICY_FACTORY_H
diff --git a/src/aosp/suggest/policyimpl/typing/scoring_params.cpp b/src/aosp/suggest/policyimpl/typing/scoring_params.cpp new file mode 100644 index 0000000..104eb2a --- /dev/null +++ b/src/aosp/suggest/policyimpl/typing/scoring_params.cpp
@@ -0,0 +1,57 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/typing/scoring_params.h" + +namespace latinime { +// TODO: RENAME all +const float ScoringParams::MAX_SPATIAL_DISTANCE = 1.0f; +const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY = 40; +const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED = 120; +const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f; +// TODO: Unlimit max cache dic node size +const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE = 170; +const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT = 310; +const int ScoringParams::THRESHOLD_SHORT_WORD_LENGTH = 4; + +const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.1524f; +const float ScoringParams::PROXIMITY_COST = 0.0694f; +const float ScoringParams::FIRST_CHAR_PROXIMITY_COST = 0.072f; +const float ScoringParams::FIRST_PROXIMITY_COST = 0.07788f; +const float ScoringParams::OMISSION_COST = 0.4676f; +const float ScoringParams::OMISSION_COST_SAME_CHAR = 0.399f; +const float ScoringParams::OMISSION_COST_FIRST_CHAR = 0.5256f; +const float ScoringParams::INSERTION_COST = 0.7248f; +const float ScoringParams::TERMINAL_INSERTION_COST = 0.8128f; +const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.5508f; +const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.674f; +const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.639f; +const float 
ScoringParams::TRANSPOSITION_COST = 0.5608f; +const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.339f; +const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.4576f; +const float ScoringParams::SUBSTITUTION_COST = 0.3806f; +const float ScoringParams::COST_NEW_WORD = 0.0312f; +const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.3224f; +const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.1214f; +const float ScoringParams::COST_FIRST_LOOKAHEAD = 0.4836f; +const float ScoringParams::COST_LOOKAHEAD = 0.00624f; +const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.06836f; +const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.0362f; +const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.4182f; +const float ScoringParams::TYPING_BASE_OUTPUT_SCORE = 1.0f; +const float ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT = 0.1f; +const float ScoringParams::NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT = 0.045f; +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/typing/scoring_params.h b/src/aosp/suggest/policyimpl/typing/scoring_params.h new file mode 100644 index 0000000..7d4b5c3 --- /dev/null +++ b/src/aosp/suggest/policyimpl/typing/scoring_params.h
@@ -0,0 +1,70 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SCORING_PARAMS_H +#define LATINIME_SCORING_PARAMS_H + +#include "defines.h" + +namespace latinime { + +class ScoringParams { + public: + // Fixed model parameters + static const float MAX_SPATIAL_DISTANCE; + static const int THRESHOLD_NEXT_WORD_PROBABILITY; + static const int THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED; + static const float AUTOCORRECT_OUTPUT_THRESHOLD; + static const int MAX_CACHE_DIC_NODE_SIZE; + static const int MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT; + static const int THRESHOLD_SHORT_WORD_LENGTH; + + // Numerically optimized parameters (currently for tap typing only). + // TODO: add ability to modify these constants programmatically. + // TODO: explore optimization of gesture parameters. 
+ static const float DISTANCE_WEIGHT_LENGTH; + static const float PROXIMITY_COST; + static const float FIRST_CHAR_PROXIMITY_COST; + static const float FIRST_PROXIMITY_COST; + static const float OMISSION_COST; + static const float OMISSION_COST_SAME_CHAR; + static const float OMISSION_COST_FIRST_CHAR; + static const float INSERTION_COST; + static const float TERMINAL_INSERTION_COST; + static const float INSERTION_COST_SAME_CHAR; + static const float INSERTION_COST_PROXIMITY_CHAR; + static const float INSERTION_COST_FIRST_CHAR; + static const float TRANSPOSITION_COST; + static const float SPACE_SUBSTITUTION_COST; + static const float ADDITIONAL_PROXIMITY_COST; + static const float SUBSTITUTION_COST; + static const float COST_NEW_WORD; + static const float COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE; + static const float DISTANCE_WEIGHT_LANGUAGE; + static const float COST_FIRST_LOOKAHEAD; + static const float COST_LOOKAHEAD; + static const float HAS_PROXIMITY_TERMINAL_COST; + static const float HAS_EDIT_CORRECTION_TERMINAL_COST; + static const float HAS_MULTI_WORD_TERMINAL_COST; + static const float TYPING_BASE_OUTPUT_SCORE; + static const float TYPING_MAX_OUTPUT_SCORE_PER_INPUT; + static const float NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ScoringParams); +}; +} // namespace latinime +#endif // LATINIME_SCORING_PARAMS_H
diff --git a/src/aosp/suggest/policyimpl/typing/typing_scoring.cpp b/src/aosp/suggest/policyimpl/typing/typing_scoring.cpp new file mode 100644 index 0000000..d8c6175 --- /dev/null +++ b/src/aosp/suggest/policyimpl/typing/typing_scoring.cpp
@@ -0,0 +1,21 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/typing/typing_scoring.h" + +namespace latinime { +const TypingScoring TypingScoring::sInstance; +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/typing/typing_scoring.h b/src/aosp/suggest/policyimpl/typing/typing_scoring.h new file mode 100644 index 0000000..56ffcc9 --- /dev/null +++ b/src/aosp/suggest/policyimpl/typing/typing_scoring.h
@@ -0,0 +1,82 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TYPING_SCORING_H +#define LATINIME_TYPING_SCORING_H + +#include "defines.h" +#include "suggest/core/policy/scoring.h" +#include "suggest/policyimpl/typing/scoring_params.h" + +namespace latinime { + +class DicNode; +class DicTraverseSession; + +class TypingScoring : public Scoring { + public: + static const TypingScoring *getInstance() { return &sInstance; } + + AK_FORCE_INLINE bool getMostProbableString( + const DicTraverseSession *const traverseSession, const int terminalSize, + const float languageWeight, int *const outputCodePoints, int *const type, + int *const freq) const { + return false; + } + + AK_FORCE_INLINE void safetyNetForMostProbableString(const int terminalSize, + const int maxScore, int *const outputCodePoints, int *const frequencies) const { + } + + AK_FORCE_INLINE void searchWordWithDoubleLetter(DicNode *terminals, + const int terminalSize, int *doubleLetterTerminalIndex, + DoubleLetterLevel *doubleLetterLevel) const { + } + + AK_FORCE_INLINE float getAdjustedLanguageWeight(DicTraverseSession *const traverseSession, + DicNode *const terminals, const int size) const { + return 1.0f; + } + + AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, + const int inputSize, const bool forceCommit) const { + const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE + + 
static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT; + const float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE + - compoundDistance / maxDistance + + (forceCommit ? ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD : 0.0f); + return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE); + } + + AK_FORCE_INLINE float getDoubleLetterDemotionDistanceCost(const int terminalIndex, + const int doubleLetterTerminalIndex, + const DoubleLetterLevel doubleLetterLevel) const { + return 0.0f; + } + + AK_FORCE_INLINE bool doesAutoCorrectValidWord() const { + return false; + } + + private: + DISALLOW_COPY_AND_ASSIGN(TypingScoring); + static const TypingScoring sInstance; + + TypingScoring() {} + ~TypingScoring() {} +}; +} // namespace latinime +#endif // LATINIME_TYPING_SCORING_H
diff --git a/src/aosp/suggest/policyimpl/typing/typing_suggest_policy.cpp b/src/aosp/suggest/policyimpl/typing/typing_suggest_policy.cpp new file mode 100644 index 0000000..0c27639 --- /dev/null +++ b/src/aosp/suggest/policyimpl/typing/typing_suggest_policy.cpp
@@ -0,0 +1,21 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/typing/typing_suggest_policy.h" + +namespace latinime { +const TypingSuggestPolicy TypingSuggestPolicy::sInstance; +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/typing/typing_suggest_policy.h b/src/aosp/suggest/policyimpl/typing/typing_suggest_policy.h new file mode 100644 index 0000000..35f4809 --- /dev/null +++ b/src/aosp/suggest/policyimpl/typing/typing_suggest_policy.h
@@ -0,0 +1,55 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TYPING_SUGGEST_POLICY_H +#define LATINIME_TYPING_SUGGEST_POLICY_H + +#include "defines.h" +#include "suggest/core/policy/suggest_policy.h" +#include "suggest/policyimpl/typing/typing_scoring.h" +#include "suggest/policyimpl/typing/typing_traversal.h" +#include "suggest/policyimpl/typing/typing_weighting.h" + +namespace latinime { + +class Scoring; +class Traversal; +class Weighting; + +class TypingSuggestPolicy : public SuggestPolicy { + public: + static const TypingSuggestPolicy *getInstance() { return &sInstance; } + + TypingSuggestPolicy() {} + virtual ~TypingSuggestPolicy() {} + AK_FORCE_INLINE const Traversal *getTraversal() const { + return TypingTraversal::getInstance(); + } + + AK_FORCE_INLINE const Scoring *getScoring() const { + return TypingScoring::getInstance(); + } + + AK_FORCE_INLINE const Weighting *getWeighting() const { + return TypingWeighting::getInstance(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(TypingSuggestPolicy); + static const TypingSuggestPolicy sInstance; +}; +} // namespace latinime +#endif // LATINIME_TYPING_SUGGEST_POLICY_H
diff --git a/src/aosp/suggest/policyimpl/typing/typing_suggest_policy_factory.h b/src/aosp/suggest/policyimpl/typing/typing_suggest_policy_factory.h new file mode 100644 index 0000000..a67b45b --- /dev/null +++ b/src/aosp/suggest/policyimpl/typing/typing_suggest_policy_factory.h
@@ -0,0 +1,37 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TYPING_SUGGEST_POLICY_FACTORY_H +#define LATINIME_TYPING_SUGGEST_POLICY_FACTORY_H + +#include "defines.h" +#include "typing_suggest_policy.h" + +namespace latinime { + +class SuggestPolicy; + +class TypingSuggestPolicyFactory { + public: + static const SuggestPolicy *getTypingSuggestPolicy() { + return TypingSuggestPolicy::getInstance(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(TypingSuggestPolicyFactory); +}; +} // namespace latinime +#endif // LATINIME_TYPING_SUGGEST_POLICY_FACTORY_H
diff --git a/src/aosp/suggest/policyimpl/typing/typing_traversal.cpp b/src/aosp/suggest/policyimpl/typing/typing_traversal.cpp new file mode 100644 index 0000000..e7e40e3 --- /dev/null +++ b/src/aosp/suggest/policyimpl/typing/typing_traversal.cpp
@@ -0,0 +1,24 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/typing/typing_traversal.h" + +namespace latinime { +const bool TypingTraversal::CORRECT_OMISSION = true; +const bool TypingTraversal::CORRECT_NEW_WORD_SPACE_SUBSTITUTION = true; +const bool TypingTraversal::CORRECT_NEW_WORD_SPACE_OMISSION = true; +const TypingTraversal TypingTraversal::sInstance; +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/typing/typing_traversal.h b/src/aosp/suggest/policyimpl/typing/typing_traversal.h new file mode 100644 index 0000000..007c19e --- /dev/null +++ b/src/aosp/suggest/policyimpl/typing/typing_traversal.h
@@ -0,0 +1,193 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TYPING_TRAVERSAL_H +#define LATINIME_TYPING_TRAVERSAL_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/layout/proximity_info_state.h" +#include "suggest/core/layout/proximity_info_utils.h" +#include "suggest/core/policy/traversal.h" +#include "suggest/core/session/dic_traverse_session.h" +#include "suggest/policyimpl/typing/scoring_params.h" +#include "utils/char_utils.h" + +namespace latinime { +class TypingTraversal : public Traversal { + public: + static const TypingTraversal *getInstance() { return &sInstance; } + + AK_FORCE_INLINE int getMaxPointerCount() const { + return MAX_POINTER_COUNT; + } + + AK_FORCE_INLINE bool allowsErrorCorrections(const DicNode *const dicNode) const { + return dicNode->getNormalizedSpatialDistance() + < ScoringParams::NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT; + } + + AK_FORCE_INLINE bool isOmission(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode, const DicNode *const childDicNode, + const bool allowsErrorCorrections) const { + if (!CORRECT_OMISSION) { + return false; + } + // Note: Always consider intentional omissions (like apostrophes) since they are common. 
+ const bool canConsiderOmission = + allowsErrorCorrections || childDicNode->canBeIntentionalOmission(); + if (!canConsiderOmission) { + return false; + } + const int inputSize = traverseSession->getInputSize(); + // TODO: Don't refer to isCompletion? + if (dicNode->isCompletion(inputSize)) { + return false; + } + if (dicNode->canBeIntentionalOmission()) { + return true; + } + const int point0Index = dicNode->getInputIndex(0); + const int currentBaseLowerCodePoint = + CharUtils::toBaseLowerCase(childDicNode->getNodeCodePoint()); + const int typedBaseLowerCodePoint = + CharUtils::toBaseLowerCase(traverseSession->getProximityInfoState(0) + ->getPrimaryCodePointAt(point0Index)); + return (currentBaseLowerCodePoint != typedBaseLowerCodePoint); + } + + AK_FORCE_INLINE bool isSpaceSubstitutionTerminal( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { + if (!CORRECT_NEW_WORD_SPACE_SUBSTITUTION) { + return false; + } + if (!canDoLookAheadCorrection(traverseSession, dicNode)) { + return false; + } + const int point0Index = dicNode->getInputIndex(0); + return dicNode->isTerminalWordNode() + && traverseSession->getProximityInfoState(0)-> + hasSpaceProximity(point0Index); + } + + AK_FORCE_INLINE bool isSpaceOmissionTerminal( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { + if (!CORRECT_NEW_WORD_SPACE_OMISSION) { + return false; + } + const int inputSize = traverseSession->getInputSize(); + // TODO: Don't refer to isCompletion? 
+ if (dicNode->isCompletion(inputSize)) { + return false; + } + if (!dicNode->isTerminalWordNode()) { + return false; + } + const int16_t pointIndex = dicNode->getInputIndex(0); + return pointIndex <= inputSize && !dicNode->isTotalInputSizeExceedingLimit() + && !dicNode->shouldBeFilteredBySafetyNetForBigram(); + } + + AK_FORCE_INLINE bool shouldDepthLevelCache( + const DicTraverseSession *const traverseSession) const { + const int inputSize = traverseSession->getInputSize(); + return traverseSession->isCacheBorderForTyping(inputSize); + } + + AK_FORCE_INLINE bool shouldNodeLevelCache( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { + return false; + } + + AK_FORCE_INLINE bool canDoLookAheadCorrection( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { + const int inputSize = traverseSession->getInputSize(); + return dicNode->canDoLookAheadCorrection(inputSize); + } + + AK_FORCE_INLINE ProximityType getProximityType( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode, + const DicNode *const childDicNode) const { + return traverseSession->getProximityInfoState(0)->getProximityType( + dicNode->getInputIndex(0), childDicNode->getNodeCodePoint(), + true /* checkProximityChars */); + } + + AK_FORCE_INLINE bool needsToTraverseAllUserInput() const { + return true; + } + + AK_FORCE_INLINE float getMaxSpatialDistance() const { + return ScoringParams::MAX_SPATIAL_DISTANCE; + } + + AK_FORCE_INLINE bool autoCorrectsToMultiWordSuggestionIfTop() const { + return true; + } + + AK_FORCE_INLINE int getDefaultExpandDicNodeSize() const { + return DicNodeVector::DEFAULT_NODES_SIZE_FOR_OPTIMIZATION; + } + + AK_FORCE_INLINE bool sameAsTyped( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { + return traverseSession->getProximityInfoState(0)->sameAsTyped( + dicNode->getOutputWordBuf(), dicNode->getNodeCodePointCount()); + } + + AK_FORCE_INLINE 
int getMaxCacheSize(const int inputSize) const { + return (inputSize <= 1) ? ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT + : ScoringParams::MAX_CACHE_DIC_NODE_SIZE; + } + + AK_FORCE_INLINE bool isPossibleOmissionChildNode( + const DicTraverseSession *const traverseSession, const DicNode *const parentDicNode, + const DicNode *const dicNode) const { + const ProximityType proximityType = + getProximityType(traverseSession, parentDicNode, dicNode); + if (!ProximityInfoUtils::isMatchOrProximityChar(proximityType)) { + return false; + } + return true; + } + + AK_FORCE_INLINE bool isGoodToTraverseNextWord(const DicNode *const dicNode) const { + const int probability = dicNode->getProbability(); + if (probability < ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY) { + return false; + } + const int c = dicNode->getOutputWordBuf()[0]; + const bool shortCappedWord = dicNode->getNodeCodePointCount() + < ScoringParams::THRESHOLD_SHORT_WORD_LENGTH && CharUtils::isAsciiUpper(c); + return !shortCappedWord + || probability >= ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED; + } + + private: + DISALLOW_COPY_AND_ASSIGN(TypingTraversal); + static const bool CORRECT_OMISSION; + static const bool CORRECT_NEW_WORD_SPACE_SUBSTITUTION; + static const bool CORRECT_NEW_WORD_SPACE_OMISSION; + static const TypingTraversal sInstance; + + TypingTraversal() {} + ~TypingTraversal() {} +}; +} // namespace latinime +#endif // LATINIME_TYPING_TRAVERSAL_H
diff --git a/src/aosp/suggest/policyimpl/typing/typing_weighting.cpp b/src/aosp/suggest/policyimpl/typing/typing_weighting.cpp new file mode 100644 index 0000000..5b6b5e8 --- /dev/null +++ b/src/aosp/suggest/policyimpl/typing/typing_weighting.cpp
@@ -0,0 +1,61 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/typing/typing_weighting.h" + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/policyimpl/typing/scoring_params.h" + +namespace latinime { + +const TypingWeighting TypingWeighting::sInstance; + +ErrorType TypingWeighting::getErrorType(const CorrectionType correctionType, + const DicTraverseSession *const traverseSession, const DicNode *const parentDicNode, + const DicNode *const dicNode) const { + switch (correctionType) { + case CT_MATCH: + if (isProximityDicNode(traverseSession, dicNode)) { + return ET_PROXIMITY_CORRECTION; + } else { + return ET_NOT_AN_ERROR; + } + case CT_ADDITIONAL_PROXIMITY: + return ET_PROXIMITY_CORRECTION; + case CT_OMISSION: + if (parentDicNode->canBeIntentionalOmission()) { + return ET_INTENTIONAL_OMISSION; + } else { + return ET_EDIT_CORRECTION; + } + break; + case CT_SUBSTITUTION: + case CT_INSERTION: + case CT_TERMINAL_INSERTION: + case CT_TRANSPOSITION: + return ET_EDIT_CORRECTION; + case CT_NEW_WORD_SPACE_OMISSION: + case CT_NEW_WORD_SPACE_SUBSTITUTION: + return ET_NEW_WORD; + case CT_TERMINAL: + return ET_NOT_AN_ERROR; + case CT_COMPLETION: + return ET_COMPLETION; + default: + return ET_NOT_AN_ERROR; + } +} +} // namespace latinime
diff --git a/src/aosp/suggest/policyimpl/typing/typing_weighting.h b/src/aosp/suggest/policyimpl/typing/typing_weighting.h new file mode 100644 index 0000000..9f0a331 --- /dev/null +++ b/src/aosp/suggest/policyimpl/typing/typing_weighting.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_TYPING_WEIGHTING_H
+#define LATINIME_TYPING_WEIGHTING_H
+
+#include "defines.h"
+#include "suggest/core/dicnode/dic_node_utils.h"
+#include "suggest/core/layout/touch_position_correction_utils.h"
+#include "suggest/core/policy/weighting.h"
+#include "suggest/core/session/dic_traverse_session.h"
+#include "suggest/policyimpl/typing/scoring_params.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+class DicNode;
+struct DicNode_InputStateG;
+class MultiBigramMap;
+
+/**
+ * Weighting policy for typing input. Each cost is assembled from ScoringParams constants, in
+ * several cases combined with point-to-key lengths read from the traverse session. The class
+ * declares no data members and is only reachable through the shared instance of getInstance().
+ */
+class TypingWeighting : public Weighting {
+ public:
+    static const TypingWeighting *getInstance() { return &sInstance; }
+
+ protected:
+    // Adds up the terminal penalties that apply to the node: one for a multiple word
+    // suggestion, one for proximity corrections and one for edit corrections.
+    float getTerminalSpatialCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode) const {
+        float terminalCost = 0.0f;
+        if (dicNode->hasMultipleWords()) {
+            terminalCost += ScoringParams::HAS_MULTI_WORD_TERMINAL_COST;
+        }
+        if (dicNode->getProximityCorrectionCount() > 0) {
+            terminalCost += ScoringParams::HAS_PROXIMITY_TERMINAL_COST;
+        }
+        if (dicNode->getEditCorrectionCount() > 0) {
+            terminalCost += ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST;
+        }
+        return terminalCost;
+    }
+
+    // Cost of omitting a letter. A zero cost omission takes precedence over the first letter
+    // case, which in turn takes precedence over the same character case.
+    float getOmissionCost(const DicNode *const parentDicNode, const DicNode *const dicNode) const {
+        const bool zeroCost = parentDicNode->isZeroCostOmission();
+        const bool repeatsParentCodePoint = dicNode->isSameNodeCodePoint(parentDicNode);
+        // If the traversal omitted the first letter then the dicNode should now be on the second.
+        const bool firstLetterOmitted = dicNode->getNodeCodePointCount() == 2;
+        if (zeroCost) {
+            return 0.0f;
+        }
+        if (firstLetterOmitted) {
+            return ScoringParams::OMISSION_COST_FIRST_CHAR;
+        }
+        return repeatsParentCodePoint ? ScoringParams::OMISSION_COST_SAME_CHAR
+                : ScoringParams::OMISSION_COST;
+    }
+
+    // Cost of matching the node against the input point at its input index: the weighted
+    // sweet spot factor of the point-to-key length plus proximity and uppercase penalties.
+    float getMatchedCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode, DicNode_InputStateG *inputStateG) const {
+        const int inputIndex = dicNode->getInputIndex(0);
+        // The length can be MAX_POINT_TO_KEY_LENGTH for characters not on the keyboard (like
+        // accented letters).
+        // NOTE(review): the original comment here stated that min() is required for that
+        // reason, but no min() is applied to the length -- confirm whether clamping is needed.
+        const float normalizedSquaredLength = traverseSession->getProximityInfoState(0)
+                ->getPointToKeyLength(inputIndex,
+                        CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint()));
+        const float sweetSpotFactor = TouchPositionCorrectionUtils::getSweetSpotFactor(
+                traverseSession->isTouchPositionCorrectionEnabled(), normalizedSquaredLength);
+        const float weightedDistance = ScoringParams::DISTANCE_WEIGHT_LENGTH * sweetSpotFactor;
+
+        float penalty = 0.0f;
+        if (isProximityDicNode(traverseSession, dicNode)) {
+            penalty = (inputIndex == 0) ? ScoringParams::FIRST_CHAR_PROXIMITY_COST
+                    : ScoringParams::PROXIMITY_COST;
+            if (dicNode->getProximityCorrectionCount() == 0) {
+                penalty += ScoringParams::FIRST_PROXIMITY_COST;
+            }
+        }
+        // At the second character of the current word, we check if the first char is uppercase
+        // and the word is a second or later word of a multiple word suggestion. We demote it
+        // if so.
+        if (dicNode->getNodeCodePointCount() == 2 && dicNode->hasMultipleWords()
+                && dicNode->isFirstCharUppercase()) {
+            penalty += ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE;
+        }
+        return weightedDistance + penalty;
+    }
+
+    // Returns true when the node's code point differs from the primary code point of the input
+    // point at the node's input index, comparing both after CharUtils::toBaseLowerCase().
+    bool isProximityDicNode(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode) const {
+        const int inputIndex = dicNode->getInputIndex(0);
+        const int typedCodePoint =
+                traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt(inputIndex);
+        const int typedBaseLower = CharUtils::toBaseLowerCase(typedCodePoint);
+        const int nodeBaseLower = CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint());
+        return typedBaseLower != nodeBaseLower;
+    }
+
+    // Cost of treating the parent and the current node as two swapped letters. The parent's
+    // code point is measured against the point after the parent's input index and the current
+    // code point against the parent's own point.
+    float getTranspositionCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const parentDicNode, const DicNode *const dicNode) const {
+        const int16_t parentInputIndex = parentDicNode->getInputIndex(0);
+        const int parentCodePoint = parentDicNode->getNodeCodePoint();
+        const float parentLength = traverseSession->getProximityInfoState(0)->getPointToKeyLength(
+                parentInputIndex + 1, CharUtils::toBaseLowerCase(parentCodePoint));
+        const int nodeCodePoint = dicNode->getNodeCodePoint();
+        const float nodeLength = traverseSession->getProximityInfoState(0)->getPointToKeyLength(
+                parentInputIndex, CharUtils::toBaseLowerCase(nodeCodePoint));
+        const float swappedLength = parentLength + nodeLength;
+        const float weightedLength = swappedLength * ScoringParams::DISTANCE_WEIGHT_LENGTH;
+        return ScoringParams::TRANSPOSITION_COST + weightedLength;
+    }
+
+    // Cost of an inserted letter. The first character cost (for a node holding a single code
+    // point) is combined with one of three insertion costs: same code point as the typed one,
+    // an input point that has adjacent proximity characters, or the generic insertion cost.
+    float getInsertionCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const parentDicNode, const DicNode *const dicNode) const {
+        const int16_t insertedInputIndex = parentDicNode->getInputIndex(0);
+        const int typedCodePoint = traverseSession->getProximityInfoState(0)
+                ->getPrimaryCodePointAt(insertedInputIndex);
+        const int nodeCodePoint = dicNode->getNodeCodePoint();
+        const bool hasAdjacentProximityChars = traverseSession->getProximityInfoState(0)
+                ->existsAdjacentProximityChars(insertedInputIndex);
+        const float length = traverseSession->getProximityInfoState(0)->getPointToKeyLength(
+                insertedInputIndex + 1, CharUtils::toBaseLowerCase(nodeCodePoint));
+        const float weightedDistance = length * ScoringParams::DISTANCE_WEIGHT_LENGTH;
+
+        float cost = 0.0f;
+        if (dicNode->getNodeCodePointCount() == 1) {
+            cost = ScoringParams::INSERTION_COST_FIRST_CHAR;
+        }
+        if (typedCodePoint == nodeCodePoint) {
+            cost += ScoringParams::INSERTION_COST_SAME_CHAR;
+        } else if (hasAdjacentProximityChars) {
+            cost += ScoringParams::INSERTION_COST_PROXIMITY_CHAR;
+        } else {
+            cost += ScoringParams::INSERTION_COST;
+        }
+        return cost + weightedDistance;
+    }
+
+    // Spatial cost of starting a new word, scaled by the session's multi word cost multiplier.
+    float getNewWordSpatialCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode, DicNode_InputStateG *inputStateG) const {
+        return ScoringParams::COST_NEW_WORD * traverseSession->getMultiWordCostMultiplier();
+    }
+
+    // Language cost of starting a new word: the bigram improbability of the node weighted by
+    // DISTANCE_WEIGHT_LANGUAGE.
+    float getNewWordBigramLanguageCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode,
+            MultiBigramMap *const multiBigramMap) const {
+        return DicNodeUtils::getBigramNodeImprobability(
+                traverseSession->getDictionaryStructurePolicy(),
+                dicNode, multiBigramMap) * ScoringParams::DISTANCE_WEIGHT_LANGUAGE;
+    }
+
+    // Look-ahead cost, with a separate constant for the first completed character.
+    float getCompletionCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode) const {
+        // The auto completion starts when the input index is same as the input size
+        if (dicNode->getInputIndex(0) == traverseSession->getInputSize()) {
+            // TODO: Change the cost for the first completion for the gesture?
+            return ScoringParams::COST_FIRST_LOOKAHEAD;
+        }
+        return ScoringParams::COST_LOOKAHEAD;
+    }
+
+    // Terminal language cost: the given improbability weighted by DISTANCE_WEIGHT_LANGUAGE.
+    float getTerminalLanguageCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode, const float dicNodeLanguageImprobability) const {
+        return ScoringParams::DISTANCE_WEIGHT_LANGUAGE * dicNodeLanguageImprobability;
+    }
+
+    // Charges TERMINAL_INSERTION_COST once for every input index from the node's input index
+    // up to the input size. The input index is asserted to be smaller than the input size.
+    float getTerminalInsertionCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode) const {
+        const int inputIndex = dicNode->getInputIndex(0);
+        const int inputSize = traverseSession->getInputSize();
+        ASSERT(inputIndex < inputSize);
+        // TODO: Implement more efficient logic
+        const int remainingPointCount = inputSize - inputIndex;
+        return ScoringParams::TERMINAL_INSERTION_COST * remainingPointCount;
+    }
+
+    AK_FORCE_INLINE bool needsToNormalizeCompoundDistance() const {
+        return false;
+    }
+
+    AK_FORCE_INLINE float getAdditionalProximityCost() const {
+        return ScoringParams::ADDITIONAL_PROXIMITY_COST;
+    }
+
+    AK_FORCE_INLINE float getSubstitutionCost() const {
+        return ScoringParams::SUBSTITUTION_COST;
+    }
+
+    // Space substitution cost: SPACE_SUBSTITUTION_COST plus COST_NEW_WORD, scaled by the
+    // session's multi word cost multiplier.
+    AK_FORCE_INLINE float getSpaceSubstitutionCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode) const {
+        const float unscaledCost =
+                ScoringParams::SPACE_SUBSTITUTION_COST + ScoringParams::COST_NEW_WORD;
+        return unscaledCost * traverseSession->getMultiWordCostMultiplier();
+    }
+
+    // Declared here only; the definition is not inline.
+    ErrorType getErrorType(const CorrectionType correctionType,
+            const DicTraverseSession *const traverseSession,
+            const DicNode *const parentDicNode, const DicNode *const dicNode) const;
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(TypingWeighting);
+    static const TypingWeighting sInstance;
+
+    TypingWeighting() {}
+    ~TypingWeighting() {}
+};
+}  // namespace latinime
+#endif  // LATINIME_TYPING_WEIGHTING_H
diff --git a/src/aosp/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h b/src/aosp/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h new file mode 100644 index 0000000..81614bc --- /dev/null +++ b/src/aosp/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DAEMARU_LEVENSHTEIN_EDIT_DISTANCE_POLICY_H
+#define LATINIME_DAEMARU_LEVENSHTEIN_EDIT_DISTANCE_POLICY_H
+
+#include "suggest/policyimpl/utils/edit_distance_policy.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+/**
+ * Edit distance policy over two code point arrays in which every insertion and deletion costs
+ * 1.0f. Code points are compared after CharUtils::toBaseLowerCase(), so a substitution between
+ * code points with the same base lower case form costs nothing. The arrays are not copied; only
+ * the pointers and lengths given to the constructor are stored.
+ */
+class DamerauLevenshteinEditDistancePolicy : public EditDistancePolicy {
+ public:
+    DamerauLevenshteinEditDistancePolicy(const int *const string0, const int length0,
+            const int *const string1, const int length1)
+            : mString0(string0), mString0Length(length0), mString1(string1),
+              mString1Length(length1) {}
+    ~DamerauLevenshteinEditDistancePolicy() {}
+
+    // 0.0f when both code points have the same base lower case form, 1.0f otherwise.
+    AK_FORCE_INLINE float getSubstitutionCost(const int index0, const int index1) const {
+        const int lower0 = CharUtils::toBaseLowerCase(mString0[index0]);
+        const int lower1 = CharUtils::toBaseLowerCase(mString1[index1]);
+        if (lower0 == lower1) {
+            return 0.0f;
+        }
+        return 1.0f;
+    }
+
+    AK_FORCE_INLINE float getDeletionCost(const int index0, const int index1) const {
+        return 1.0f;
+    }
+
+    AK_FORCE_INLINE float getInsertionCost(const int index0, const int index1) const {
+        return 1.0f;
+    }
+
+    // True when both indices are greater than 0 and the code points at (index0, index1) equal
+    // the previous code points of the other string, i.e. the two adjacent pairs are swapped.
+    AK_FORCE_INLINE bool allowTransposition(const int index0, const int index1) const {
+        const int current0 = CharUtils::toBaseLowerCase(mString0[index0]);
+        const int current1 = CharUtils::toBaseLowerCase(mString1[index1]);
+        return index0 > 0 && index1 > 0
+                && current0 == CharUtils::toBaseLowerCase(mString1[index1 - 1])
+                && current1 == CharUtils::toBaseLowerCase(mString0[index0 - 1]);
+    }
+
+    // A transposition is charged like a substitution at the same pair of indices.
+    AK_FORCE_INLINE float getTranspositionCost(const int index0, const int index1) const {
+        return getSubstitutionCost(index0, index1);
+    }
+
+    AK_FORCE_INLINE int getString0Length() const {
+        return mString0Length;
+    }
+
+    AK_FORCE_INLINE int getString1Length() const {
+        return mString1Length;
+    }
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(DamerauLevenshteinEditDistancePolicy);
+
+    const int *const mString0;
+    const int mString0Length;
+    const int *const mString1;
+    const int mString1Length;
+};
+}  // namespace latinime
+
+#endif  // LATINIME_DAEMARU_LEVENSHTEIN_EDIT_DISTANCE_POLICY_H
diff --git a/src/aosp/suggest/policyimpl/utils/edit_distance.h b/src/aosp/suggest/policyimpl/utils/edit_distance.h new file mode 100644 index 0000000..0871c37 --- /dev/null +++ b/src/aosp/suggest/policyimpl/utils/edit_distance.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_EDIT_DISTANCE_H
+#define LATINIME_EDIT_DISTANCE_H
+
+#include <cstddef>
+#include <vector>
+
+#include "defines.h"
+#include "suggest/policyimpl/utils/edit_distance_policy.h"
+
+namespace latinime {
+
+/**
+ * Static helpers that compute and dump edit distance tables driven by an EditDistancePolicy.
+ */
+class EditDistance {
+ public:
+    /**
+     * Computes the edit distance between the two strings described by the policy, using a
+     * dynamic programming table of (string0 length + 1) x (string1 length + 1) entries.
+     *
+     * The table is held in a std::vector instead of a variable length array: variable length
+     * arrays are only a compiler extension in C++ and take their storage from the stack.
+     *
+     * CAVEAT: There may be performance penalty if you need the edit distance as an integer value.
+     *
+     * @param policy supplies both string lengths and every per-index cost; it is dereferenced
+     *         without a null check.
+     * @return the last entry of the table, i.e. the distance between the two whole strings.
+     */
+    AK_FORCE_INLINE static float getEditDistance(const EditDistancePolicy *const policy) {
+        const int beforeLength = policy->getString0Length();
+        const int afterLength = policy->getString1Length();
+        // Number of entries per table row; entry (i, j) is stored at rowSize * i + j.
+        const int rowSize = afterLength + 1;
+        std::vector<float> dp(static_cast<std::size_t>((beforeLength + 1) * rowSize));
+        // Fill the first column and the first row. The policy is queried with index -1 for the
+        // string that has not contributed a character yet.
+        for (int i = 0; i <= beforeLength; ++i) {
+            dp[rowSize * i] = i * policy->getInsertionCost(i - 1, -1);
+        }
+        for (int i = 0; i <= afterLength; ++i) {
+            dp[i] = i * policy->getDeletionCost(-1, i - 1);
+        }
+
+        for (int i = 0; i < beforeLength; ++i) {
+            for (int j = 0; j < afterLength; ++j) {
+                dp[rowSize * (i + 1) + (j + 1)] = min(
+                        dp[rowSize * i + (j + 1)] + policy->getInsertionCost(i, j),
+                        min(dp[rowSize * (i + 1) + j] + policy->getDeletionCost(i, j),
+                                dp[rowSize * i + j] + policy->getSubstitutionCost(i, j)));
+                if (policy->allowTransposition(i, j)) {
+                    // NOTE(review): this reads entry (i - 1, j - 1), so it relies on the policy
+                    // allowing a transposition only when both indices are greater than 0.
+                    dp[rowSize * (i + 1) + (j + 1)] = min(
+                            dp[rowSize * (i + 1) + (j + 1)],
+                            dp[rowSize * (i - 1) + (j - 1)]
+                                    + policy->getTranspositionCost(i, j));
+                }
+            }
+        }
+        if (DEBUG_EDIT_DISTANCE) {
+            AKLOGI("IN = %d, OUT = %d", beforeLength, afterLength);
+            for (int i = 0; i < beforeLength + 1; ++i) {
+                for (int j = 0; j < afterLength + 1; ++j) {
+                    AKLOGI("EDIT[%d][%d], %f", i, j, dp[rowSize * i + j]);
+                }
+            }
+        }
+        return dp[(beforeLength + 1) * rowSize - 1];
+    }
+
+    /**
+     * Logs the first 11 rows and 11 columns of an edit distance table when DEBUG_DICT is set.
+     * Cells outside of the table, as bounded by editDistanceTableWidth and outputLength, are
+     * logged as -1. Rows are read with a stride of editDistanceTableWidth + 1 entries.
+     */
+    AK_FORCE_INLINE static void dumpEditDistance10ForDebug(const float *const editDistanceTable,
+            const int editDistanceTableWidth, const int outputLength) {
+        if (DEBUG_DICT) {
+            AKLOGI("EditDistanceTable");
+            for (int i = 0; i <= 10; ++i) {
+                float c[11];
+                for (int j = 0; j <= 10; ++j) {
+                    if (j < editDistanceTableWidth + 1 && i < outputLength + 1) {
+                        c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j];
+                    } else {
+                        c[j] = -1.0f;
+                    }
+                }
+                AKLOGI("[ %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f ]",
+                        c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]);
+                (void)c;  // To suppress compiler warning
+            }
+        }
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(EditDistance);
+};
+}  // namespace latinime
+
+#endif  // LATINIME_EDIT_DISTANCE_H
diff --git a/src/aosp/suggest/policyimpl/utils/edit_distance_policy.h b/src/aosp/suggest/policyimpl/utils/edit_distance_policy.h new file mode 100644 index 0000000..e3d1792 --- /dev/null +++ b/src/aosp/suggest/policyimpl/utils/edit_distance_policy.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_EDIT_DISTANCE_POLICY_H
+#define LATINIME_EDIT_DISTANCE_POLICY_H
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Interface through which an edit distance computation obtains the lengths of the two strings
+ * it compares and the cost of each edit operation. In every method index0 refers to string 0
+ * and index1 to string 1.
+ *
+ * NOTE(review): EditDistance::getEditDistance() passes -1 as an index while it fills the first
+ * row and column of its table, so implementations presumably must not dereference the indices
+ * in getDeletionCost() and getInsertionCost() without checking -- confirm.
+ */
+class EditDistancePolicy {
+ public:
+    virtual float getSubstitutionCost(const int index0, const int index1) const = 0;
+    virtual float getDeletionCost(const int index0, const int index1) const = 0;
+    virtual float getInsertionCost(const int index0, const int index1) const = 0;
+    // Whether a transposition may be considered at the given pair of indices.
+    virtual bool allowTransposition(const int index0, const int index1) const = 0;
+    virtual float getTranspositionCost(const int index0, const int index1) const = 0;
+    virtual int getString0Length() const = 0;
+    virtual int getString1Length() const = 0;
+
+ protected:
+    // Construction and destruction are restricted to derived classes.
+    EditDistancePolicy() {}
+    virtual ~EditDistancePolicy() {}
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(EditDistancePolicy);
+};
+}  // namespace latinime
+
+#endif  // LATINIME_EDIT_DISTANCE_POLICY_H
diff --git a/src/aosp/utils/autocorrection_threshold_utils.cpp b/src/aosp/utils/autocorrection_threshold_utils.cpp new file mode 100644 index 0000000..1f8ee08 --- /dev/null +++ b/src/aosp/utils/autocorrection_threshold_utils.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/autocorrection_threshold_utils.h"
+
+#include <cmath>
+
+#include "defines.h"
+#include "suggest/policyimpl/utils/edit_distance.h"
+#include "suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h"
+
+namespace latinime {
+
+const int AutocorrectionThresholdUtils::MAX_INITIAL_SCORE = 255;
+const int AutocorrectionThresholdUtils::TYPED_LETTER_MULTIPLIER = 2;
+const int AutocorrectionThresholdUtils::FULL_WORD_MULTIPLIER = 2;
+
+// Damerau-Levenshtein edit distance between the two code point arrays, converted to int.
+/* static */ int AutocorrectionThresholdUtils::editDistance(const int *before,
+        const int beforeLength, const int *after, const int afterLength) {
+    const DamerauLevenshteinEditDistancePolicy policy(before, beforeLength, after, afterLength);
+    const float distance = EditDistance::getEditDistance(&policy);
+    return static_cast<int>(distance);
+}
+
+// In dictionary.cpp, getSuggestion() method,
+// When USE_SUGGEST_INTERFACE_FOR_TYPING is true:
+//
+//   // TODO: Revise the following logic thoroughly by referring to the logic
+//   // marked as "Otherwise" below.
+//   SUGGEST_INTERFACE_OUTPUT_SCALE was multiplied to the original suggestion scores to convert
+//   them to integers.
+//     score = (int)((original score) * SUGGEST_INTERFACE_OUTPUT_SCALE)
+//   Undo the scaling here to recover the original score.
+//     normalizedScore = ((float)score) / SUGGEST_INTERFACE_OUTPUT_SCALE
+//
+// Otherwise: suggestion scores are computed using the below formula.
+// original score
+//  := powf(mTypedLetterMultiplier (this is defined 2),
+//         (the number of matched characters between typed word and suggested word))
+//     * (individual word's score which defined in the unigram dictionary,
+//         and this score is defined in range [0, 255].)
+// Then, the following processing is applied.
+//     - If the dictionary word is matched up to the point of the user entry
+//       (full match up to min(before.length(), after.length())
+//       => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2)
+//     - If the word is a true full match except for differences in accents or
+//       capitalization, then treat it as if the score was 255.
+//     - If before.length() == after.length()
+//       => multiply by mFullWordMultiplier (this is defined 2))
+// So, maximum original score is powf(2, min(before.length(), after.length())) * 255 * 2 * 1.2
+// For historical reasons we ignore the 1.2 modifier (because the measure for a good
+// autocorrection threshold was done at a time when it didn't exist). This doesn't change
+// the result.
+// So, we can normalize original score by dividing powf(2, min(b.l(),a.l())) * 255 * 2.
+
+/* static */ float AutocorrectionThresholdUtils::calcNormalizedScore(const int *before,
+        const int beforeLength, const int *after, const int afterLength, const int score) {
+    if (beforeLength == 0 || afterLength == 0) {
+        return 0.0f;
+    }
+    const int distance = editDistance(before, beforeLength, after, afterLength);
+    int spaceCount = 0;
+    for (int index = 0; index < afterLength; ++index) {
+        spaceCount += (after[index] == KEYCODE_SPACE) ? 1 : 0;
+    }
+
+    // normalizedScore must be 0.0f (the minimum value) if after consists of spaces only, if the
+    // score is less than or equal to 0, or if the edit distance is larger than or equal to
+    // afterLength.
+    if (spaceCount == afterLength || score <= 0 || distance >= afterLength) {
+        return 0.0f;
+    }
+    // add a weight based on edit distance.
+    const float weight = 1.0f - static_cast<float>(distance) / static_cast<float>(afterLength);
+
+    // TODO: Revise the following logic thoroughly by referring to...
+    if (true /* USE_SUGGEST_INTERFACE_FOR_TYPING */) {
+        return (static_cast<float>(score) / SUGGEST_INTERFACE_OUTPUT_SCALE) * weight;
+    }
+    // ...this logic.
+    float maxScore = static_cast<float>(S_INT_MAX);
+    if (score < S_INT_MAX) {
+        const float matchedLength = static_cast<float>(min(beforeLength, afterLength - spaceCount));
+        maxScore = static_cast<float>(MAX_INITIAL_SCORE)
+                * powf(static_cast<float>(TYPED_LETTER_MULTIPLIER), matchedLength)
+                * static_cast<float>(FULL_WORD_MULTIPLIER);
+    }
+
+    return (static_cast<float>(score) / maxScore) * weight;
+}
+
+}  // namespace latinime
diff --git a/src/aosp/utils/autocorrection_threshold_utils.h b/src/aosp/utils/autocorrection_threshold_utils.h new file mode 100644 index 0000000..c7537a6 --- /dev/null +++ b/src/aosp/utils/autocorrection_threshold_utils.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_AUTOCORRECTION_THRESHOLD_UTILS_H
+#define LATINIME_AUTOCORRECTION_THRESHOLD_UTILS_H
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Static helpers that compute an integer edit distance between two code point arrays and a
+ * score weighted by that distance.
+ */
+class AutocorrectionThresholdUtils {
+ public:
+    // Returns 0.0f when either length is 0, when after consists of KEYCODE_SPACE only, when
+    // score is not positive or when the edit distance is at least afterLength. Otherwise
+    // returns the score divided by SUGGEST_INTERFACE_OUTPUT_SCALE and multiplied by
+    // (1 - editDistance / afterLength).
+    static float calcNormalizedScore(const int *before, const int beforeLength,
+            const int *after, const int afterLength, const int score);
+    // Returns the distance computed by EditDistance::getEditDistance() with a
+    // DamerauLevenshteinEditDistancePolicy over the two arrays, converted to int.
+    static int editDistance(const int *before, const int beforeLength, const int *after,
+            const int afterLength);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(AutocorrectionThresholdUtils);
+
+    // NOTE(review): these are only read by a branch of calcNormalizedScore() that sits behind an
+    // unconditional return -- confirm whether they are still needed.
+    static const int MAX_INITIAL_SCORE;
+    static const int TYPED_LETTER_MULTIPLIER;
+    static const int FULL_WORD_MULTIPLIER;
+};
+}  // namespace latinime
+#endif  // LATINIME_AUTOCORRECTION_THRESHOLD_UTILS_H
diff --git a/src/aosp/utils/char_utils.cpp b/src/aosp/utils/char_utils.cpp new file mode 100644 index 0000000..0e70396 --- /dev/null +++ b/src/aosp/utils/char_utils.cpp
@@ -0,0 +1,1276 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/char_utils.h" + +#include <cstdlib> + +#include "defines.h" + +namespace latinime { + +struct LatinCapitalSmallPair { + unsigned short capital; + unsigned short small; +}; + +/* + * How to update the SORTED_CHAR_MAP[] array. + * + * 1. Download http://unicode.org/Public/UNIDATA/UnicodeData.txt + * + * 2. Have a latest version of ICU4C dev package installed + * (Note: the current data has been generated with version 4.8) + * $ apt-get install libicu-dev + * + * 3. Build the following code + * $ g++ -o char_utils -I.. -DUPDATING_CHAR_UTILS char_utils.cpp -licuuc + */ +#ifdef UPDATING_CHAR_UTILS +#include <stdio.h> +#include <unicode/uchar.h> // ICU4C + +extern "C" int main() { + for (unsigned short c = 0; c < 0xFFFF; c++) { + if (c <= 0x7F) continue; + const unsigned short icu4cLowerC = u_tolower(c); + const unsigned short myLowerC = CharUtils::latin_tolower(c); + if (c != icu4cLowerC) { +#ifdef CONFIRMING_CHAR_UTILS + if (icu4cLowerC != myLowerC) { + fprintf(stderr, "icu4cLowerC != myLowerC, 0x%04X, 0x%04X\n", icu4cLowerC, myLowerC); + } +#else // CONFIRMING_CHAR_UTILS + printf("0x%04X, 0x%04X\n", c, icu4cLowerC); +#endif // CONFIRMING_CHAR_UTILS + } + } +} +#endif // UPDATING_CHAR_UTILS +/* + * 4. 
Process the list with UnicodeData.txt + * (You need UnicodeData.txt in the current directory) + * $ ./char_utils | sort -u | \ + * perl -e 'open(FH, "UnicodeData.txt"); @buf = <FH>; close(FH); \ + * while(<>){/0x(\w*), 0x(\w*)/; @lines = grep(/^$1/, @buf); @cols = split(/;/, $lines[0]); \ + * print " { 0x$1, 0x$cols[13] }, // $cols[1]\n";}' + * + * 5. Update the SORTED_CHAR_MAP[] array below with the output above. + * Then, rebuild with -DCONFIRMING_CHAR_UTILS and confirm the program exits successfully. + * $ g++ -o char_utils -I.. -DUPDATING_CHAR_UTILS -DCONFIRMING_CHAR_UTILS char_utils.cpp -licuuc + * $ ./char_utils + * $ + */ +static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { + { 0x00C0, 0x00E0 }, // LATIN CAPITAL LETTER A WITH GRAVE + { 0x00C1, 0x00E1 }, // LATIN CAPITAL LETTER A WITH ACUTE + { 0x00C2, 0x00E2 }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX + { 0x00C3, 0x00E3 }, // LATIN CAPITAL LETTER A WITH TILDE + { 0x00C4, 0x00E4 }, // LATIN CAPITAL LETTER A WITH DIAERESIS + { 0x00C5, 0x00E5 }, // LATIN CAPITAL LETTER A WITH RING ABOVE + { 0x00C6, 0x00E6 }, // LATIN CAPITAL LETTER AE + { 0x00C7, 0x00E7 }, // LATIN CAPITAL LETTER C WITH CEDILLA + { 0x00C8, 0x00E8 }, // LATIN CAPITAL LETTER E WITH GRAVE + { 0x00C9, 0x00E9 }, // LATIN CAPITAL LETTER E WITH ACUTE + { 0x00CA, 0x00EA }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX + { 0x00CB, 0x00EB }, // LATIN CAPITAL LETTER E WITH DIAERESIS + { 0x00CC, 0x00EC }, // LATIN CAPITAL LETTER I WITH GRAVE + { 0x00CD, 0x00ED }, // LATIN CAPITAL LETTER I WITH ACUTE + { 0x00CE, 0x00EE }, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX + { 0x00CF, 0x00EF }, // LATIN CAPITAL LETTER I WITH DIAERESIS + { 0x00D0, 0x00F0 }, // LATIN CAPITAL LETTER ETH + { 0x00D1, 0x00F1 }, // LATIN CAPITAL LETTER N WITH TILDE + { 0x00D2, 0x00F2 }, // LATIN CAPITAL LETTER O WITH GRAVE + { 0x00D3, 0x00F3 }, // LATIN CAPITAL LETTER O WITH ACUTE + { 0x00D4, 0x00F4 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX + { 0x00D5, 0x00F5 }, // LATIN CAPITAL 
LETTER O WITH TILDE + { 0x00D6, 0x00F6 }, // LATIN CAPITAL LETTER O WITH DIAERESIS + { 0x00D8, 0x00F8 }, // LATIN CAPITAL LETTER O WITH STROKE + { 0x00D9, 0x00F9 }, // LATIN CAPITAL LETTER U WITH GRAVE + { 0x00DA, 0x00FA }, // LATIN CAPITAL LETTER U WITH ACUTE + { 0x00DB, 0x00FB }, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX + { 0x00DC, 0x00FC }, // LATIN CAPITAL LETTER U WITH DIAERESIS + { 0x00DD, 0x00FD }, // LATIN CAPITAL LETTER Y WITH ACUTE + { 0x00DE, 0x00FE }, // LATIN CAPITAL LETTER THORN + { 0x0100, 0x0101 }, // LATIN CAPITAL LETTER A WITH MACRON + { 0x0102, 0x0103 }, // LATIN CAPITAL LETTER A WITH BREVE + { 0x0104, 0x0105 }, // LATIN CAPITAL LETTER A WITH OGONEK + { 0x0106, 0x0107 }, // LATIN CAPITAL LETTER C WITH ACUTE + { 0x0108, 0x0109 }, // LATIN CAPITAL LETTER C WITH CIRCUMFLEX + { 0x010A, 0x010B }, // LATIN CAPITAL LETTER C WITH DOT ABOVE + { 0x010C, 0x010D }, // LATIN CAPITAL LETTER C WITH CARON + { 0x010E, 0x010F }, // LATIN CAPITAL LETTER D WITH CARON + { 0x0110, 0x0111 }, // LATIN CAPITAL LETTER D WITH STROKE + { 0x0112, 0x0113 }, // LATIN CAPITAL LETTER E WITH MACRON + { 0x0114, 0x0115 }, // LATIN CAPITAL LETTER E WITH BREVE + { 0x0116, 0x0117 }, // LATIN CAPITAL LETTER E WITH DOT ABOVE + { 0x0118, 0x0119 }, // LATIN CAPITAL LETTER E WITH OGONEK + { 0x011A, 0x011B }, // LATIN CAPITAL LETTER E WITH CARON + { 0x011C, 0x011D }, // LATIN CAPITAL LETTER G WITH CIRCUMFLEX + { 0x011E, 0x011F }, // LATIN CAPITAL LETTER G WITH BREVE + { 0x0120, 0x0121 }, // LATIN CAPITAL LETTER G WITH DOT ABOVE + { 0x0122, 0x0123 }, // LATIN CAPITAL LETTER G WITH CEDILLA + { 0x0124, 0x0125 }, // LATIN CAPITAL LETTER H WITH CIRCUMFLEX + { 0x0126, 0x0127 }, // LATIN CAPITAL LETTER H WITH STROKE + { 0x0128, 0x0129 }, // LATIN CAPITAL LETTER I WITH TILDE + { 0x012A, 0x012B }, // LATIN CAPITAL LETTER I WITH MACRON + { 0x012C, 0x012D }, // LATIN CAPITAL LETTER I WITH BREVE + { 0x012E, 0x012F }, // LATIN CAPITAL LETTER I WITH OGONEK + { 0x0130, 0x0069 }, // LATIN CAPITAL LETTER I 
WITH DOT ABOVE + { 0x0132, 0x0133 }, // LATIN CAPITAL LIGATURE IJ + { 0x0134, 0x0135 }, // LATIN CAPITAL LETTER J WITH CIRCUMFLEX + { 0x0136, 0x0137 }, // LATIN CAPITAL LETTER K WITH CEDILLA + { 0x0139, 0x013A }, // LATIN CAPITAL LETTER L WITH ACUTE + { 0x013B, 0x013C }, // LATIN CAPITAL LETTER L WITH CEDILLA + { 0x013D, 0x013E }, // LATIN CAPITAL LETTER L WITH CARON + { 0x013F, 0x0140 }, // LATIN CAPITAL LETTER L WITH MIDDLE DOT + { 0x0141, 0x0142 }, // LATIN CAPITAL LETTER L WITH STROKE + { 0x0143, 0x0144 }, // LATIN CAPITAL LETTER N WITH ACUTE + { 0x0145, 0x0146 }, // LATIN CAPITAL LETTER N WITH CEDILLA + { 0x0147, 0x0148 }, // LATIN CAPITAL LETTER N WITH CARON + { 0x014A, 0x014B }, // LATIN CAPITAL LETTER ENG + { 0x014C, 0x014D }, // LATIN CAPITAL LETTER O WITH MACRON + { 0x014E, 0x014F }, // LATIN CAPITAL LETTER O WITH BREVE + { 0x0150, 0x0151 }, // LATIN CAPITAL LETTER O WITH DOUBLE ACUTE + { 0x0152, 0x0153 }, // LATIN CAPITAL LIGATURE OE + { 0x0154, 0x0155 }, // LATIN CAPITAL LETTER R WITH ACUTE + { 0x0156, 0x0157 }, // LATIN CAPITAL LETTER R WITH CEDILLA + { 0x0158, 0x0159 }, // LATIN CAPITAL LETTER R WITH CARON + { 0x015A, 0x015B }, // LATIN CAPITAL LETTER S WITH ACUTE + { 0x015C, 0x015D }, // LATIN CAPITAL LETTER S WITH CIRCUMFLEX + { 0x015E, 0x015F }, // LATIN CAPITAL LETTER S WITH CEDILLA + { 0x0160, 0x0161 }, // LATIN CAPITAL LETTER S WITH CARON + { 0x0162, 0x0163 }, // LATIN CAPITAL LETTER T WITH CEDILLA + { 0x0164, 0x0165 }, // LATIN CAPITAL LETTER T WITH CARON + { 0x0166, 0x0167 }, // LATIN CAPITAL LETTER T WITH STROKE + { 0x0168, 0x0169 }, // LATIN CAPITAL LETTER U WITH TILDE + { 0x016A, 0x016B }, // LATIN CAPITAL LETTER U WITH MACRON + { 0x016C, 0x016D }, // LATIN CAPITAL LETTER U WITH BREVE + { 0x016E, 0x016F }, // LATIN CAPITAL LETTER U WITH RING ABOVE + { 0x0170, 0x0171 }, // LATIN CAPITAL LETTER U WITH DOUBLE ACUTE + { 0x0172, 0x0173 }, // LATIN CAPITAL LETTER U WITH OGONEK + { 0x0174, 0x0175 }, // LATIN CAPITAL LETTER W WITH CIRCUMFLEX + { 
0x0176, 0x0177 }, // LATIN CAPITAL LETTER Y WITH CIRCUMFLEX + { 0x0178, 0x00FF }, // LATIN CAPITAL LETTER Y WITH DIAERESIS + { 0x0179, 0x017A }, // LATIN CAPITAL LETTER Z WITH ACUTE + { 0x017B, 0x017C }, // LATIN CAPITAL LETTER Z WITH DOT ABOVE + { 0x017D, 0x017E }, // LATIN CAPITAL LETTER Z WITH CARON + { 0x0181, 0x0253 }, // LATIN CAPITAL LETTER B WITH HOOK + { 0x0182, 0x0183 }, // LATIN CAPITAL LETTER B WITH TOPBAR + { 0x0184, 0x0185 }, // LATIN CAPITAL LETTER TONE SIX + { 0x0186, 0x0254 }, // LATIN CAPITAL LETTER OPEN O + { 0x0187, 0x0188 }, // LATIN CAPITAL LETTER C WITH HOOK + { 0x0189, 0x0256 }, // LATIN CAPITAL LETTER AFRICAN D + { 0x018A, 0x0257 }, // LATIN CAPITAL LETTER D WITH HOOK + { 0x018B, 0x018C }, // LATIN CAPITAL LETTER D WITH TOPBAR + { 0x018E, 0x01DD }, // LATIN CAPITAL LETTER REVERSED E + { 0x018F, 0x0259 }, // LATIN CAPITAL LETTER SCHWA + { 0x0190, 0x025B }, // LATIN CAPITAL LETTER OPEN E + { 0x0191, 0x0192 }, // LATIN CAPITAL LETTER F WITH HOOK + { 0x0193, 0x0260 }, // LATIN CAPITAL LETTER G WITH HOOK + { 0x0194, 0x0263 }, // LATIN CAPITAL LETTER GAMMA + { 0x0196, 0x0269 }, // LATIN CAPITAL LETTER IOTA + { 0x0197, 0x0268 }, // LATIN CAPITAL LETTER I WITH STROKE + { 0x0198, 0x0199 }, // LATIN CAPITAL LETTER K WITH HOOK + { 0x019C, 0x026F }, // LATIN CAPITAL LETTER TURNED M + { 0x019D, 0x0272 }, // LATIN CAPITAL LETTER N WITH LEFT HOOK + { 0x019F, 0x0275 }, // LATIN CAPITAL LETTER O WITH MIDDLE TILDE + { 0x01A0, 0x01A1 }, // LATIN CAPITAL LETTER O WITH HORN + { 0x01A2, 0x01A3 }, // LATIN CAPITAL LETTER OI + { 0x01A4, 0x01A5 }, // LATIN CAPITAL LETTER P WITH HOOK + { 0x01A6, 0x0280 }, // LATIN LETTER YR + { 0x01A7, 0x01A8 }, // LATIN CAPITAL LETTER TONE TWO + { 0x01A9, 0x0283 }, // LATIN CAPITAL LETTER ESH + { 0x01AC, 0x01AD }, // LATIN CAPITAL LETTER T WITH HOOK + { 0x01AE, 0x0288 }, // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK + { 0x01AF, 0x01B0 }, // LATIN CAPITAL LETTER U WITH HORN + { 0x01B1, 0x028A }, // LATIN CAPITAL LETTER UPSILON + { 
0x01B2, 0x028B }, // LATIN CAPITAL LETTER V WITH HOOK + { 0x01B3, 0x01B4 }, // LATIN CAPITAL LETTER Y WITH HOOK + { 0x01B5, 0x01B6 }, // LATIN CAPITAL LETTER Z WITH STROKE + { 0x01B7, 0x0292 }, // LATIN CAPITAL LETTER EZH + { 0x01B8, 0x01B9 }, // LATIN CAPITAL LETTER EZH REVERSED + { 0x01BC, 0x01BD }, // LATIN CAPITAL LETTER TONE FIVE + { 0x01C4, 0x01C6 }, // LATIN CAPITAL LETTER DZ WITH CARON + { 0x01C5, 0x01C6 }, // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON + { 0x01C7, 0x01C9 }, // LATIN CAPITAL LETTER LJ + { 0x01C8, 0x01C9 }, // LATIN CAPITAL LETTER L WITH SMALL LETTER J + { 0x01CA, 0x01CC }, // LATIN CAPITAL LETTER NJ + { 0x01CB, 0x01CC }, // LATIN CAPITAL LETTER N WITH SMALL LETTER J + { 0x01CD, 0x01CE }, // LATIN CAPITAL LETTER A WITH CARON + { 0x01CF, 0x01D0 }, // LATIN CAPITAL LETTER I WITH CARON + { 0x01D1, 0x01D2 }, // LATIN CAPITAL LETTER O WITH CARON + { 0x01D3, 0x01D4 }, // LATIN CAPITAL LETTER U WITH CARON + { 0x01D5, 0x01D6 }, // LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON + { 0x01D7, 0x01D8 }, // LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE + { 0x01D9, 0x01DA }, // LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON + { 0x01DB, 0x01DC }, // LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE + { 0x01DE, 0x01DF }, // LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON + { 0x01E0, 0x01E1 }, // LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON + { 0x01E2, 0x01E3 }, // LATIN CAPITAL LETTER AE WITH MACRON + { 0x01E4, 0x01E5 }, // LATIN CAPITAL LETTER G WITH STROKE + { 0x01E6, 0x01E7 }, // LATIN CAPITAL LETTER G WITH CARON + { 0x01E8, 0x01E9 }, // LATIN CAPITAL LETTER K WITH CARON + { 0x01EA, 0x01EB }, // LATIN CAPITAL LETTER O WITH OGONEK + { 0x01EC, 0x01ED }, // LATIN CAPITAL LETTER O WITH OGONEK AND MACRON + { 0x01EE, 0x01EF }, // LATIN CAPITAL LETTER EZH WITH CARON + { 0x01F1, 0x01F3 }, // LATIN CAPITAL LETTER DZ + { 0x01F2, 0x01F3 }, // LATIN CAPITAL LETTER D WITH SMALL LETTER Z + { 0x01F4, 0x01F5 }, // LATIN CAPITAL LETTER G WITH ACUTE + { 
0x01F6, 0x0195 }, // LATIN CAPITAL LETTER HWAIR + { 0x01F7, 0x01BF }, // LATIN CAPITAL LETTER WYNN + { 0x01F8, 0x01F9 }, // LATIN CAPITAL LETTER N WITH GRAVE + { 0x01FA, 0x01FB }, // LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE + { 0x01FC, 0x01FD }, // LATIN CAPITAL LETTER AE WITH ACUTE + { 0x01FE, 0x01FF }, // LATIN CAPITAL LETTER O WITH STROKE AND ACUTE + { 0x0200, 0x0201 }, // LATIN CAPITAL LETTER A WITH DOUBLE GRAVE + { 0x0202, 0x0203 }, // LATIN CAPITAL LETTER A WITH INVERTED BREVE + { 0x0204, 0x0205 }, // LATIN CAPITAL LETTER E WITH DOUBLE GRAVE + { 0x0206, 0x0207 }, // LATIN CAPITAL LETTER E WITH INVERTED BREVE + { 0x0208, 0x0209 }, // LATIN CAPITAL LETTER I WITH DOUBLE GRAVE + { 0x020A, 0x020B }, // LATIN CAPITAL LETTER I WITH INVERTED BREVE + { 0x020C, 0x020D }, // LATIN CAPITAL LETTER O WITH DOUBLE GRAVE + { 0x020E, 0x020F }, // LATIN CAPITAL LETTER O WITH INVERTED BREVE + { 0x0210, 0x0211 }, // LATIN CAPITAL LETTER R WITH DOUBLE GRAVE + { 0x0212, 0x0213 }, // LATIN CAPITAL LETTER R WITH INVERTED BREVE + { 0x0214, 0x0215 }, // LATIN CAPITAL LETTER U WITH DOUBLE GRAVE + { 0x0216, 0x0217 }, // LATIN CAPITAL LETTER U WITH INVERTED BREVE + { 0x0218, 0x0219 }, // LATIN CAPITAL LETTER S WITH COMMA BELOW + { 0x021A, 0x021B }, // LATIN CAPITAL LETTER T WITH COMMA BELOW + { 0x021C, 0x021D }, // LATIN CAPITAL LETTER YOGH + { 0x021E, 0x021F }, // LATIN CAPITAL LETTER H WITH CARON + { 0x0220, 0x019E }, // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG + { 0x0222, 0x0223 }, // LATIN CAPITAL LETTER OU + { 0x0224, 0x0225 }, // LATIN CAPITAL LETTER Z WITH HOOK + { 0x0226, 0x0227 }, // LATIN CAPITAL LETTER A WITH DOT ABOVE + { 0x0228, 0x0229 }, // LATIN CAPITAL LETTER E WITH CEDILLA + { 0x022A, 0x022B }, // LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON + { 0x022C, 0x022D }, // LATIN CAPITAL LETTER O WITH TILDE AND MACRON + { 0x022E, 0x022F }, // LATIN CAPITAL LETTER O WITH DOT ABOVE + { 0x0230, 0x0231 }, // LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON + { 0x0232, 
0x0233 }, // LATIN CAPITAL LETTER Y WITH MACRON + { 0x023A, 0x2C65 }, // LATIN CAPITAL LETTER A WITH STROKE + { 0x023B, 0x023C }, // LATIN CAPITAL LETTER C WITH STROKE + { 0x023D, 0x019A }, // LATIN CAPITAL LETTER L WITH BAR + { 0x023E, 0x2C66 }, // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE + { 0x0241, 0x0242 }, // LATIN CAPITAL LETTER GLOTTAL STOP + { 0x0243, 0x0180 }, // LATIN CAPITAL LETTER B WITH STROKE + { 0x0244, 0x0289 }, // LATIN CAPITAL LETTER U BAR + { 0x0245, 0x028C }, // LATIN CAPITAL LETTER TURNED V + { 0x0246, 0x0247 }, // LATIN CAPITAL LETTER E WITH STROKE + { 0x0248, 0x0249 }, // LATIN CAPITAL LETTER J WITH STROKE + { 0x024A, 0x024B }, // LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL + { 0x024C, 0x024D }, // LATIN CAPITAL LETTER R WITH STROKE + { 0x024E, 0x024F }, // LATIN CAPITAL LETTER Y WITH STROKE + { 0x0370, 0x0371 }, // GREEK CAPITAL LETTER HETA + { 0x0372, 0x0373 }, // GREEK CAPITAL LETTER ARCHAIC SAMPI + { 0x0376, 0x0377 }, // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA + { 0x0386, 0x03AC }, // GREEK CAPITAL LETTER ALPHA WITH TONOS + { 0x0388, 0x03AD }, // GREEK CAPITAL LETTER EPSILON WITH TONOS + { 0x0389, 0x03AE }, // GREEK CAPITAL LETTER ETA WITH TONOS + { 0x038A, 0x03AF }, // GREEK CAPITAL LETTER IOTA WITH TONOS + { 0x038C, 0x03CC }, // GREEK CAPITAL LETTER OMICRON WITH TONOS + { 0x038E, 0x03CD }, // GREEK CAPITAL LETTER UPSILON WITH TONOS + { 0x038F, 0x03CE }, // GREEK CAPITAL LETTER OMEGA WITH TONOS + { 0x0391, 0x03B1 }, // GREEK CAPITAL LETTER ALPHA + { 0x0392, 0x03B2 }, // GREEK CAPITAL LETTER BETA + { 0x0393, 0x03B3 }, // GREEK CAPITAL LETTER GAMMA + { 0x0394, 0x03B4 }, // GREEK CAPITAL LETTER DELTA + { 0x0395, 0x03B5 }, // GREEK CAPITAL LETTER EPSILON + { 0x0396, 0x03B6 }, // GREEK CAPITAL LETTER ZETA + { 0x0397, 0x03B7 }, // GREEK CAPITAL LETTER ETA + { 0x0398, 0x03B8 }, // GREEK CAPITAL LETTER THETA + { 0x0399, 0x03B9 }, // GREEK CAPITAL LETTER IOTA + { 0x039A, 0x03BA }, // GREEK CAPITAL LETTER KAPPA + { 0x039B, 0x03BB }, // GREEK 
CAPITAL LETTER LAMDA + { 0x039C, 0x03BC }, // GREEK CAPITAL LETTER MU + { 0x039D, 0x03BD }, // GREEK CAPITAL LETTER NU + { 0x039E, 0x03BE }, // GREEK CAPITAL LETTER XI + { 0x039F, 0x03BF }, // GREEK CAPITAL LETTER OMICRON + { 0x03A0, 0x03C0 }, // GREEK CAPITAL LETTER PI + { 0x03A1, 0x03C1 }, // GREEK CAPITAL LETTER RHO + { 0x03A3, 0x03C3 }, // GREEK CAPITAL LETTER SIGMA + { 0x03A4, 0x03C4 }, // GREEK CAPITAL LETTER TAU + { 0x03A5, 0x03C5 }, // GREEK CAPITAL LETTER UPSILON + { 0x03A6, 0x03C6 }, // GREEK CAPITAL LETTER PHI + { 0x03A7, 0x03C7 }, // GREEK CAPITAL LETTER CHI + { 0x03A8, 0x03C8 }, // GREEK CAPITAL LETTER PSI + { 0x03A9, 0x03C9 }, // GREEK CAPITAL LETTER OMEGA + { 0x03AA, 0x03CA }, // GREEK CAPITAL LETTER IOTA WITH DIALYTIKA + { 0x03AB, 0x03CB }, // GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA + { 0x03CF, 0x03D7 }, // GREEK CAPITAL KAI SYMBOL + { 0x03D8, 0x03D9 }, // GREEK LETTER ARCHAIC KOPPA + { 0x03DA, 0x03DB }, // GREEK LETTER STIGMA + { 0x03DC, 0x03DD }, // GREEK LETTER DIGAMMA + { 0x03DE, 0x03DF }, // GREEK LETTER KOPPA + { 0x03E0, 0x03E1 }, // GREEK LETTER SAMPI + { 0x03E2, 0x03E3 }, // COPTIC CAPITAL LETTER SHEI + { 0x03E4, 0x03E5 }, // COPTIC CAPITAL LETTER FEI + { 0x03E6, 0x03E7 }, // COPTIC CAPITAL LETTER KHEI + { 0x03E8, 0x03E9 }, // COPTIC CAPITAL LETTER HORI + { 0x03EA, 0x03EB }, // COPTIC CAPITAL LETTER GANGIA + { 0x03EC, 0x03ED }, // COPTIC CAPITAL LETTER SHIMA + { 0x03EE, 0x03EF }, // COPTIC CAPITAL LETTER DEI + { 0x03F4, 0x03B8 }, // GREEK CAPITAL THETA SYMBOL + { 0x03F7, 0x03F8 }, // GREEK CAPITAL LETTER SHO + { 0x03F9, 0x03F2 }, // GREEK CAPITAL LUNATE SIGMA SYMBOL + { 0x03FA, 0x03FB }, // GREEK CAPITAL LETTER SAN + { 0x03FD, 0x037B }, // GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL + { 0x03FE, 0x037C }, // GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL + { 0x03FF, 0x037D }, // GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL + { 0x0400, 0x0450 }, // CYRILLIC CAPITAL LETTER IE WITH GRAVE + { 0x0401, 0x0451 }, // CYRILLIC CAPITAL LETTER IO + { 
0x0402, 0x0452 }, // CYRILLIC CAPITAL LETTER DJE + { 0x0403, 0x0453 }, // CYRILLIC CAPITAL LETTER GJE + { 0x0404, 0x0454 }, // CYRILLIC CAPITAL LETTER UKRAINIAN IE + { 0x0405, 0x0455 }, // CYRILLIC CAPITAL LETTER DZE + { 0x0406, 0x0456 }, // CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I + { 0x0407, 0x0457 }, // CYRILLIC CAPITAL LETTER YI + { 0x0408, 0x0458 }, // CYRILLIC CAPITAL LETTER JE + { 0x0409, 0x0459 }, // CYRILLIC CAPITAL LETTER LJE + { 0x040A, 0x045A }, // CYRILLIC CAPITAL LETTER NJE + { 0x040B, 0x045B }, // CYRILLIC CAPITAL LETTER TSHE + { 0x040C, 0x045C }, // CYRILLIC CAPITAL LETTER KJE + { 0x040D, 0x045D }, // CYRILLIC CAPITAL LETTER I WITH GRAVE + { 0x040E, 0x045E }, // CYRILLIC CAPITAL LETTER SHORT U + { 0x040F, 0x045F }, // CYRILLIC CAPITAL LETTER DZHE + { 0x0410, 0x0430 }, // CYRILLIC CAPITAL LETTER A + { 0x0411, 0x0431 }, // CYRILLIC CAPITAL LETTER BE + { 0x0412, 0x0432 }, // CYRILLIC CAPITAL LETTER VE + { 0x0413, 0x0433 }, // CYRILLIC CAPITAL LETTER GHE + { 0x0414, 0x0434 }, // CYRILLIC CAPITAL LETTER DE + { 0x0415, 0x0435 }, // CYRILLIC CAPITAL LETTER IE + { 0x0416, 0x0436 }, // CYRILLIC CAPITAL LETTER ZHE + { 0x0417, 0x0437 }, // CYRILLIC CAPITAL LETTER ZE + { 0x0418, 0x0438 }, // CYRILLIC CAPITAL LETTER I + { 0x0419, 0x0439 }, // CYRILLIC CAPITAL LETTER SHORT I + { 0x041A, 0x043A }, // CYRILLIC CAPITAL LETTER KA + { 0x041B, 0x043B }, // CYRILLIC CAPITAL LETTER EL + { 0x041C, 0x043C }, // CYRILLIC CAPITAL LETTER EM + { 0x041D, 0x043D }, // CYRILLIC CAPITAL LETTER EN + { 0x041E, 0x043E }, // CYRILLIC CAPITAL LETTER O + { 0x041F, 0x043F }, // CYRILLIC CAPITAL LETTER PE + { 0x0420, 0x0440 }, // CYRILLIC CAPITAL LETTER ER + { 0x0421, 0x0441 }, // CYRILLIC CAPITAL LETTER ES + { 0x0422, 0x0442 }, // CYRILLIC CAPITAL LETTER TE + { 0x0423, 0x0443 }, // CYRILLIC CAPITAL LETTER U + { 0x0424, 0x0444 }, // CYRILLIC CAPITAL LETTER EF + { 0x0425, 0x0445 }, // CYRILLIC CAPITAL LETTER HA + { 0x0426, 0x0446 }, // CYRILLIC CAPITAL LETTER TSE + { 0x0427, 0x0447 
}, // CYRILLIC CAPITAL LETTER CHE + { 0x0428, 0x0448 }, // CYRILLIC CAPITAL LETTER SHA + { 0x0429, 0x0449 }, // CYRILLIC CAPITAL LETTER SHCHA + { 0x042A, 0x044A }, // CYRILLIC CAPITAL LETTER HARD SIGN + { 0x042B, 0x044B }, // CYRILLIC CAPITAL LETTER YERU + { 0x042C, 0x044C }, // CYRILLIC CAPITAL LETTER SOFT SIGN + { 0x042D, 0x044D }, // CYRILLIC CAPITAL LETTER E + { 0x042E, 0x044E }, // CYRILLIC CAPITAL LETTER YU + { 0x042F, 0x044F }, // CYRILLIC CAPITAL LETTER YA + { 0x0460, 0x0461 }, // CYRILLIC CAPITAL LETTER OMEGA + { 0x0462, 0x0463 }, // CYRILLIC CAPITAL LETTER YAT + { 0x0464, 0x0465 }, // CYRILLIC CAPITAL LETTER IOTIFIED E + { 0x0466, 0x0467 }, // CYRILLIC CAPITAL LETTER LITTLE YUS + { 0x0468, 0x0469 }, // CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS + { 0x046A, 0x046B }, // CYRILLIC CAPITAL LETTER BIG YUS + { 0x046C, 0x046D }, // CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS + { 0x046E, 0x046F }, // CYRILLIC CAPITAL LETTER KSI + { 0x0470, 0x0471 }, // CYRILLIC CAPITAL LETTER PSI + { 0x0472, 0x0473 }, // CYRILLIC CAPITAL LETTER FITA + { 0x0474, 0x0475 }, // CYRILLIC CAPITAL LETTER IZHITSA + { 0x0476, 0x0477 }, // CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT + { 0x0478, 0x0479 }, // CYRILLIC CAPITAL LETTER UK + { 0x047A, 0x047B }, // CYRILLIC CAPITAL LETTER ROUND OMEGA + { 0x047C, 0x047D }, // CYRILLIC CAPITAL LETTER OMEGA WITH TITLO + { 0x047E, 0x047F }, // CYRILLIC CAPITAL LETTER OT + { 0x0480, 0x0481 }, // CYRILLIC CAPITAL LETTER KOPPA + { 0x048A, 0x048B }, // CYRILLIC CAPITAL LETTER SHORT I WITH TAIL + { 0x048C, 0x048D }, // CYRILLIC CAPITAL LETTER SEMISOFT SIGN + { 0x048E, 0x048F }, // CYRILLIC CAPITAL LETTER ER WITH TICK + { 0x0490, 0x0491 }, // CYRILLIC CAPITAL LETTER GHE WITH UPTURN + { 0x0492, 0x0493 }, // CYRILLIC CAPITAL LETTER GHE WITH STROKE + { 0x0494, 0x0495 }, // CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK + { 0x0496, 0x0497 }, // CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER + { 0x0498, 0x0499 }, // CYRILLIC CAPITAL LETTER ZE WITH 
DESCENDER + { 0x049A, 0x049B }, // CYRILLIC CAPITAL LETTER KA WITH DESCENDER + { 0x049C, 0x049D }, // CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE + { 0x049E, 0x049F }, // CYRILLIC CAPITAL LETTER KA WITH STROKE + { 0x04A0, 0x04A1 }, // CYRILLIC CAPITAL LETTER BASHKIR KA + { 0x04A2, 0x04A3 }, // CYRILLIC CAPITAL LETTER EN WITH DESCENDER + { 0x04A4, 0x04A5 }, // CYRILLIC CAPITAL LIGATURE EN GHE + { 0x04A6, 0x04A7 }, // CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK + { 0x04A8, 0x04A9 }, // CYRILLIC CAPITAL LETTER ABKHASIAN HA + { 0x04AA, 0x04AB }, // CYRILLIC CAPITAL LETTER ES WITH DESCENDER + { 0x04AC, 0x04AD }, // CYRILLIC CAPITAL LETTER TE WITH DESCENDER + { 0x04AE, 0x04AF }, // CYRILLIC CAPITAL LETTER STRAIGHT U + { 0x04B0, 0x04B1 }, // CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE + { 0x04B2, 0x04B3 }, // CYRILLIC CAPITAL LETTER HA WITH DESCENDER + { 0x04B4, 0x04B5 }, // CYRILLIC CAPITAL LIGATURE TE TSE + { 0x04B6, 0x04B7 }, // CYRILLIC CAPITAL LETTER CHE WITH DESCENDER + { 0x04B8, 0x04B9 }, // CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE + { 0x04BA, 0x04BB }, // CYRILLIC CAPITAL LETTER SHHA + { 0x04BC, 0x04BD }, // CYRILLIC CAPITAL LETTER ABKHASIAN CHE + { 0x04BE, 0x04BF }, // CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER + { 0x04C0, 0x04CF }, // CYRILLIC LETTER PALOCHKA + { 0x04C1, 0x04C2 }, // CYRILLIC CAPITAL LETTER ZHE WITH BREVE + { 0x04C3, 0x04C4 }, // CYRILLIC CAPITAL LETTER KA WITH HOOK + { 0x04C5, 0x04C6 }, // CYRILLIC CAPITAL LETTER EL WITH TAIL + { 0x04C7, 0x04C8 }, // CYRILLIC CAPITAL LETTER EN WITH HOOK + { 0x04C9, 0x04CA }, // CYRILLIC CAPITAL LETTER EN WITH TAIL + { 0x04CB, 0x04CC }, // CYRILLIC CAPITAL LETTER KHAKASSIAN CHE + { 0x04CD, 0x04CE }, // CYRILLIC CAPITAL LETTER EM WITH TAIL + { 0x04D0, 0x04D1 }, // CYRILLIC CAPITAL LETTER A WITH BREVE + { 0x04D2, 0x04D3 }, // CYRILLIC CAPITAL LETTER A WITH DIAERESIS + { 0x04D4, 0x04D5 }, // CYRILLIC CAPITAL LIGATURE A IE + { 0x04D6, 0x04D7 }, // CYRILLIC CAPITAL LETTER IE WITH BREVE + { 
0x04D8, 0x04D9 }, // CYRILLIC CAPITAL LETTER SCHWA + { 0x04DA, 0x04DB }, // CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS + { 0x04DC, 0x04DD }, // CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS + { 0x04DE, 0x04DF }, // CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS + { 0x04E0, 0x04E1 }, // CYRILLIC CAPITAL LETTER ABKHASIAN DZE + { 0x04E2, 0x04E3 }, // CYRILLIC CAPITAL LETTER I WITH MACRON + { 0x04E4, 0x04E5 }, // CYRILLIC CAPITAL LETTER I WITH DIAERESIS + { 0x04E6, 0x04E7 }, // CYRILLIC CAPITAL LETTER O WITH DIAERESIS + { 0x04E8, 0x04E9 }, // CYRILLIC CAPITAL LETTER BARRED O + { 0x04EA, 0x04EB }, // CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS + { 0x04EC, 0x04ED }, // CYRILLIC CAPITAL LETTER E WITH DIAERESIS + { 0x04EE, 0x04EF }, // CYRILLIC CAPITAL LETTER U WITH MACRON + { 0x04F0, 0x04F1 }, // CYRILLIC CAPITAL LETTER U WITH DIAERESIS + { 0x04F2, 0x04F3 }, // CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE + { 0x04F4, 0x04F5 }, // CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS + { 0x04F6, 0x04F7 }, // CYRILLIC CAPITAL LETTER GHE WITH DESCENDER + { 0x04F8, 0x04F9 }, // CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS + { 0x04FA, 0x04FB }, // CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK + { 0x04FC, 0x04FD }, // CYRILLIC CAPITAL LETTER HA WITH HOOK + { 0x04FE, 0x04FF }, // CYRILLIC CAPITAL LETTER HA WITH STROKE + { 0x0500, 0x0501 }, // CYRILLIC CAPITAL LETTER KOMI DE + { 0x0502, 0x0503 }, // CYRILLIC CAPITAL LETTER KOMI DJE + { 0x0504, 0x0505 }, // CYRILLIC CAPITAL LETTER KOMI ZJE + { 0x0506, 0x0507 }, // CYRILLIC CAPITAL LETTER KOMI DZJE + { 0x0508, 0x0509 }, // CYRILLIC CAPITAL LETTER KOMI LJE + { 0x050A, 0x050B }, // CYRILLIC CAPITAL LETTER KOMI NJE + { 0x050C, 0x050D }, // CYRILLIC CAPITAL LETTER KOMI SJE + { 0x050E, 0x050F }, // CYRILLIC CAPITAL LETTER KOMI TJE + { 0x0510, 0x0511 }, // CYRILLIC CAPITAL LETTER REVERSED ZE + { 0x0512, 0x0513 }, // CYRILLIC CAPITAL LETTER EL WITH HOOK + { 0x0514, 0x0515 }, // CYRILLIC CAPITAL LETTER LHA + { 0x0516, 0x0517 }, // CYRILLIC CAPITAL 
LETTER RHA + { 0x0518, 0x0519 }, // CYRILLIC CAPITAL LETTER YAE + { 0x051A, 0x051B }, // CYRILLIC CAPITAL LETTER QA + { 0x051C, 0x051D }, // CYRILLIC CAPITAL LETTER WE + { 0x051E, 0x051F }, // CYRILLIC CAPITAL LETTER ALEUT KA + { 0x0520, 0x0521 }, // CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK + { 0x0522, 0x0523 }, // CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK + { 0x0524, 0x0525 }, // CYRILLIC CAPITAL LETTER PE WITH DESCENDER + { 0x0526, 0x0527 }, // CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER + { 0x0531, 0x0561 }, // ARMENIAN CAPITAL LETTER AYB + { 0x0532, 0x0562 }, // ARMENIAN CAPITAL LETTER BEN + { 0x0533, 0x0563 }, // ARMENIAN CAPITAL LETTER GIM + { 0x0534, 0x0564 }, // ARMENIAN CAPITAL LETTER DA + { 0x0535, 0x0565 }, // ARMENIAN CAPITAL LETTER ECH + { 0x0536, 0x0566 }, // ARMENIAN CAPITAL LETTER ZA + { 0x0537, 0x0567 }, // ARMENIAN CAPITAL LETTER EH + { 0x0538, 0x0568 }, // ARMENIAN CAPITAL LETTER ET + { 0x0539, 0x0569 }, // ARMENIAN CAPITAL LETTER TO + { 0x053A, 0x056A }, // ARMENIAN CAPITAL LETTER ZHE + { 0x053B, 0x056B }, // ARMENIAN CAPITAL LETTER INI + { 0x053C, 0x056C }, // ARMENIAN CAPITAL LETTER LIWN + { 0x053D, 0x056D }, // ARMENIAN CAPITAL LETTER XEH + { 0x053E, 0x056E }, // ARMENIAN CAPITAL LETTER CA + { 0x053F, 0x056F }, // ARMENIAN CAPITAL LETTER KEN + { 0x0540, 0x0570 }, // ARMENIAN CAPITAL LETTER HO + { 0x0541, 0x0571 }, // ARMENIAN CAPITAL LETTER JA + { 0x0542, 0x0572 }, // ARMENIAN CAPITAL LETTER GHAD + { 0x0543, 0x0573 }, // ARMENIAN CAPITAL LETTER CHEH + { 0x0544, 0x0574 }, // ARMENIAN CAPITAL LETTER MEN + { 0x0545, 0x0575 }, // ARMENIAN CAPITAL LETTER YI + { 0x0546, 0x0576 }, // ARMENIAN CAPITAL LETTER NOW + { 0x0547, 0x0577 }, // ARMENIAN CAPITAL LETTER SHA + { 0x0548, 0x0578 }, // ARMENIAN CAPITAL LETTER VO + { 0x0549, 0x0579 }, // ARMENIAN CAPITAL LETTER CHA + { 0x054A, 0x057A }, // ARMENIAN CAPITAL LETTER PEH + { 0x054B, 0x057B }, // ARMENIAN CAPITAL LETTER JHEH + { 0x054C, 0x057C }, // ARMENIAN CAPITAL LETTER RA + { 0x054D, 0x057D }, 
// ARMENIAN CAPITAL LETTER SEH + { 0x054E, 0x057E }, // ARMENIAN CAPITAL LETTER VEW + { 0x054F, 0x057F }, // ARMENIAN CAPITAL LETTER TIWN + { 0x0550, 0x0580 }, // ARMENIAN CAPITAL LETTER REH + { 0x0551, 0x0581 }, // ARMENIAN CAPITAL LETTER CO + { 0x0552, 0x0582 }, // ARMENIAN CAPITAL LETTER YIWN + { 0x0553, 0x0583 }, // ARMENIAN CAPITAL LETTER PIWR + { 0x0554, 0x0584 }, // ARMENIAN CAPITAL LETTER KEH + { 0x0555, 0x0585 }, // ARMENIAN CAPITAL LETTER OH + { 0x0556, 0x0586 }, // ARMENIAN CAPITAL LETTER FEH + { 0x10A0, 0x2D00 }, // GEORGIAN CAPITAL LETTER AN + { 0x10A1, 0x2D01 }, // GEORGIAN CAPITAL LETTER BAN + { 0x10A2, 0x2D02 }, // GEORGIAN CAPITAL LETTER GAN + { 0x10A3, 0x2D03 }, // GEORGIAN CAPITAL LETTER DON + { 0x10A4, 0x2D04 }, // GEORGIAN CAPITAL LETTER EN + { 0x10A5, 0x2D05 }, // GEORGIAN CAPITAL LETTER VIN + { 0x10A6, 0x2D06 }, // GEORGIAN CAPITAL LETTER ZEN + { 0x10A7, 0x2D07 }, // GEORGIAN CAPITAL LETTER TAN + { 0x10A8, 0x2D08 }, // GEORGIAN CAPITAL LETTER IN + { 0x10A9, 0x2D09 }, // GEORGIAN CAPITAL LETTER KAN + { 0x10AA, 0x2D0A }, // GEORGIAN CAPITAL LETTER LAS + { 0x10AB, 0x2D0B }, // GEORGIAN CAPITAL LETTER MAN + { 0x10AC, 0x2D0C }, // GEORGIAN CAPITAL LETTER NAR + { 0x10AD, 0x2D0D }, // GEORGIAN CAPITAL LETTER ON + { 0x10AE, 0x2D0E }, // GEORGIAN CAPITAL LETTER PAR + { 0x10AF, 0x2D0F }, // GEORGIAN CAPITAL LETTER ZHAR + { 0x10B0, 0x2D10 }, // GEORGIAN CAPITAL LETTER RAE + { 0x10B1, 0x2D11 }, // GEORGIAN CAPITAL LETTER SAN + { 0x10B2, 0x2D12 }, // GEORGIAN CAPITAL LETTER TAR + { 0x10B3, 0x2D13 }, // GEORGIAN CAPITAL LETTER UN + { 0x10B4, 0x2D14 }, // GEORGIAN CAPITAL LETTER PHAR + { 0x10B5, 0x2D15 }, // GEORGIAN CAPITAL LETTER KHAR + { 0x10B6, 0x2D16 }, // GEORGIAN CAPITAL LETTER GHAN + { 0x10B7, 0x2D17 }, // GEORGIAN CAPITAL LETTER QAR + { 0x10B8, 0x2D18 }, // GEORGIAN CAPITAL LETTER SHIN + { 0x10B9, 0x2D19 }, // GEORGIAN CAPITAL LETTER CHIN + { 0x10BA, 0x2D1A }, // GEORGIAN CAPITAL LETTER CAN + { 0x10BB, 0x2D1B }, // GEORGIAN CAPITAL LETTER JIL + { 
0x10BC, 0x2D1C }, // GEORGIAN CAPITAL LETTER CIL + { 0x10BD, 0x2D1D }, // GEORGIAN CAPITAL LETTER CHAR + { 0x10BE, 0x2D1E }, // GEORGIAN CAPITAL LETTER XAN + { 0x10BF, 0x2D1F }, // GEORGIAN CAPITAL LETTER JHAN + { 0x10C0, 0x2D20 }, // GEORGIAN CAPITAL LETTER HAE + { 0x10C1, 0x2D21 }, // GEORGIAN CAPITAL LETTER HE + { 0x10C2, 0x2D22 }, // GEORGIAN CAPITAL LETTER HIE + { 0x10C3, 0x2D23 }, // GEORGIAN CAPITAL LETTER WE + { 0x10C4, 0x2D24 }, // GEORGIAN CAPITAL LETTER HAR + { 0x10C5, 0x2D25 }, // GEORGIAN CAPITAL LETTER HOE + { 0x1E00, 0x1E01 }, // LATIN CAPITAL LETTER A WITH RING BELOW + { 0x1E02, 0x1E03 }, // LATIN CAPITAL LETTER B WITH DOT ABOVE + { 0x1E04, 0x1E05 }, // LATIN CAPITAL LETTER B WITH DOT BELOW + { 0x1E06, 0x1E07 }, // LATIN CAPITAL LETTER B WITH LINE BELOW + { 0x1E08, 0x1E09 }, // LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE + { 0x1E0A, 0x1E0B }, // LATIN CAPITAL LETTER D WITH DOT ABOVE + { 0x1E0C, 0x1E0D }, // LATIN CAPITAL LETTER D WITH DOT BELOW + { 0x1E0E, 0x1E0F }, // LATIN CAPITAL LETTER D WITH LINE BELOW + { 0x1E10, 0x1E11 }, // LATIN CAPITAL LETTER D WITH CEDILLA + { 0x1E12, 0x1E13 }, // LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW + { 0x1E14, 0x1E15 }, // LATIN CAPITAL LETTER E WITH MACRON AND GRAVE + { 0x1E16, 0x1E17 }, // LATIN CAPITAL LETTER E WITH MACRON AND ACUTE + { 0x1E18, 0x1E19 }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW + { 0x1E1A, 0x1E1B }, // LATIN CAPITAL LETTER E WITH TILDE BELOW + { 0x1E1C, 0x1E1D }, // LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE + { 0x1E1E, 0x1E1F }, // LATIN CAPITAL LETTER F WITH DOT ABOVE + { 0x1E20, 0x1E21 }, // LATIN CAPITAL LETTER G WITH MACRON + { 0x1E22, 0x1E23 }, // LATIN CAPITAL LETTER H WITH DOT ABOVE + { 0x1E24, 0x1E25 }, // LATIN CAPITAL LETTER H WITH DOT BELOW + { 0x1E26, 0x1E27 }, // LATIN CAPITAL LETTER H WITH DIAERESIS + { 0x1E28, 0x1E29 }, // LATIN CAPITAL LETTER H WITH CEDILLA + { 0x1E2A, 0x1E2B }, // LATIN CAPITAL LETTER H WITH BREVE BELOW + { 0x1E2C, 0x1E2D }, // LATIN CAPITAL 
LETTER I WITH TILDE BELOW + { 0x1E2E, 0x1E2F }, // LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE + { 0x1E30, 0x1E31 }, // LATIN CAPITAL LETTER K WITH ACUTE + { 0x1E32, 0x1E33 }, // LATIN CAPITAL LETTER K WITH DOT BELOW + { 0x1E34, 0x1E35 }, // LATIN CAPITAL LETTER K WITH LINE BELOW + { 0x1E36, 0x1E37 }, // LATIN CAPITAL LETTER L WITH DOT BELOW + { 0x1E38, 0x1E39 }, // LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON + { 0x1E3A, 0x1E3B }, // LATIN CAPITAL LETTER L WITH LINE BELOW + { 0x1E3C, 0x1E3D }, // LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW + { 0x1E3E, 0x1E3F }, // LATIN CAPITAL LETTER M WITH ACUTE + { 0x1E40, 0x1E41 }, // LATIN CAPITAL LETTER M WITH DOT ABOVE + { 0x1E42, 0x1E43 }, // LATIN CAPITAL LETTER M WITH DOT BELOW + { 0x1E44, 0x1E45 }, // LATIN CAPITAL LETTER N WITH DOT ABOVE + { 0x1E46, 0x1E47 }, // LATIN CAPITAL LETTER N WITH DOT BELOW + { 0x1E48, 0x1E49 }, // LATIN CAPITAL LETTER N WITH LINE BELOW + { 0x1E4A, 0x1E4B }, // LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW + { 0x1E4C, 0x1E4D }, // LATIN CAPITAL LETTER O WITH TILDE AND ACUTE + { 0x1E4E, 0x1E4F }, // LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS + { 0x1E50, 0x1E51 }, // LATIN CAPITAL LETTER O WITH MACRON AND GRAVE + { 0x1E52, 0x1E53 }, // LATIN CAPITAL LETTER O WITH MACRON AND ACUTE + { 0x1E54, 0x1E55 }, // LATIN CAPITAL LETTER P WITH ACUTE + { 0x1E56, 0x1E57 }, // LATIN CAPITAL LETTER P WITH DOT ABOVE + { 0x1E58, 0x1E59 }, // LATIN CAPITAL LETTER R WITH DOT ABOVE + { 0x1E5A, 0x1E5B }, // LATIN CAPITAL LETTER R WITH DOT BELOW + { 0x1E5C, 0x1E5D }, // LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON + { 0x1E5E, 0x1E5F }, // LATIN CAPITAL LETTER R WITH LINE BELOW + { 0x1E60, 0x1E61 }, // LATIN CAPITAL LETTER S WITH DOT ABOVE + { 0x1E62, 0x1E63 }, // LATIN CAPITAL LETTER S WITH DOT BELOW + { 0x1E64, 0x1E65 }, // LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE + { 0x1E66, 0x1E67 }, // LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE + { 0x1E68, 0x1E69 }, // LATIN CAPITAL LETTER S WITH DOT 
BELOW AND DOT ABOVE + { 0x1E6A, 0x1E6B }, // LATIN CAPITAL LETTER T WITH DOT ABOVE + { 0x1E6C, 0x1E6D }, // LATIN CAPITAL LETTER T WITH DOT BELOW + { 0x1E6E, 0x1E6F }, // LATIN CAPITAL LETTER T WITH LINE BELOW + { 0x1E70, 0x1E71 }, // LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW + { 0x1E72, 0x1E73 }, // LATIN CAPITAL LETTER U WITH DIAERESIS BELOW + { 0x1E74, 0x1E75 }, // LATIN CAPITAL LETTER U WITH TILDE BELOW + { 0x1E76, 0x1E77 }, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW + { 0x1E78, 0x1E79 }, // LATIN CAPITAL LETTER U WITH TILDE AND ACUTE + { 0x1E7A, 0x1E7B }, // LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS + { 0x1E7C, 0x1E7D }, // LATIN CAPITAL LETTER V WITH TILDE + { 0x1E7E, 0x1E7F }, // LATIN CAPITAL LETTER V WITH DOT BELOW + { 0x1E80, 0x1E81 }, // LATIN CAPITAL LETTER W WITH GRAVE + { 0x1E82, 0x1E83 }, // LATIN CAPITAL LETTER W WITH ACUTE + { 0x1E84, 0x1E85 }, // LATIN CAPITAL LETTER W WITH DIAERESIS + { 0x1E86, 0x1E87 }, // LATIN CAPITAL LETTER W WITH DOT ABOVE + { 0x1E88, 0x1E89 }, // LATIN CAPITAL LETTER W WITH DOT BELOW + { 0x1E8A, 0x1E8B }, // LATIN CAPITAL LETTER X WITH DOT ABOVE + { 0x1E8C, 0x1E8D }, // LATIN CAPITAL LETTER X WITH DIAERESIS + { 0x1E8E, 0x1E8F }, // LATIN CAPITAL LETTER Y WITH DOT ABOVE + { 0x1E90, 0x1E91 }, // LATIN CAPITAL LETTER Z WITH CIRCUMFLEX + { 0x1E92, 0x1E93 }, // LATIN CAPITAL LETTER Z WITH DOT BELOW + { 0x1E94, 0x1E95 }, // LATIN CAPITAL LETTER Z WITH LINE BELOW + { 0x1E9E, 0x00DF }, // LATIN CAPITAL LETTER SHARP S + { 0x1EA0, 0x1EA1 }, // LATIN CAPITAL LETTER A WITH DOT BELOW + { 0x1EA2, 0x1EA3 }, // LATIN CAPITAL LETTER A WITH HOOK ABOVE + { 0x1EA4, 0x1EA5 }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE + { 0x1EA6, 0x1EA7 }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE + { 0x1EA8, 0x1EA9 }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE + { 0x1EAA, 0x1EAB }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE + { 0x1EAC, 0x1EAD }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW + { 
0x1EAE, 0x1EAF }, // LATIN CAPITAL LETTER A WITH BREVE AND ACUTE + { 0x1EB0, 0x1EB1 }, // LATIN CAPITAL LETTER A WITH BREVE AND GRAVE + { 0x1EB2, 0x1EB3 }, // LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE + { 0x1EB4, 0x1EB5 }, // LATIN CAPITAL LETTER A WITH BREVE AND TILDE + { 0x1EB6, 0x1EB7 }, // LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW + { 0x1EB8, 0x1EB9 }, // LATIN CAPITAL LETTER E WITH DOT BELOW + { 0x1EBA, 0x1EBB }, // LATIN CAPITAL LETTER E WITH HOOK ABOVE + { 0x1EBC, 0x1EBD }, // LATIN CAPITAL LETTER E WITH TILDE + { 0x1EBE, 0x1EBF }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE + { 0x1EC0, 0x1EC1 }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE + { 0x1EC2, 0x1EC3 }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE + { 0x1EC4, 0x1EC5 }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE + { 0x1EC6, 0x1EC7 }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW + { 0x1EC8, 0x1EC9 }, // LATIN CAPITAL LETTER I WITH HOOK ABOVE + { 0x1ECA, 0x1ECB }, // LATIN CAPITAL LETTER I WITH DOT BELOW + { 0x1ECC, 0x1ECD }, // LATIN CAPITAL LETTER O WITH DOT BELOW + { 0x1ECE, 0x1ECF }, // LATIN CAPITAL LETTER O WITH HOOK ABOVE + { 0x1ED0, 0x1ED1 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE + { 0x1ED2, 0x1ED3 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE + { 0x1ED4, 0x1ED5 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE + { 0x1ED6, 0x1ED7 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE + { 0x1ED8, 0x1ED9 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW + { 0x1EDA, 0x1EDB }, // LATIN CAPITAL LETTER O WITH HORN AND ACUTE + { 0x1EDC, 0x1EDD }, // LATIN CAPITAL LETTER O WITH HORN AND GRAVE + { 0x1EDE, 0x1EDF }, // LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE + { 0x1EE0, 0x1EE1 }, // LATIN CAPITAL LETTER O WITH HORN AND TILDE + { 0x1EE2, 0x1EE3 }, // LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW + { 0x1EE4, 0x1EE5 }, // LATIN CAPITAL LETTER U WITH DOT BELOW + { 0x1EE6, 0x1EE7 }, // LATIN CAPITAL 
LETTER U WITH HOOK ABOVE + { 0x1EE8, 0x1EE9 }, // LATIN CAPITAL LETTER U WITH HORN AND ACUTE + { 0x1EEA, 0x1EEB }, // LATIN CAPITAL LETTER U WITH HORN AND GRAVE + { 0x1EEC, 0x1EED }, // LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE + { 0x1EEE, 0x1EEF }, // LATIN CAPITAL LETTER U WITH HORN AND TILDE + { 0x1EF0, 0x1EF1 }, // LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW + { 0x1EF2, 0x1EF3 }, // LATIN CAPITAL LETTER Y WITH GRAVE + { 0x1EF4, 0x1EF5 }, // LATIN CAPITAL LETTER Y WITH DOT BELOW + { 0x1EF6, 0x1EF7 }, // LATIN CAPITAL LETTER Y WITH HOOK ABOVE + { 0x1EF8, 0x1EF9 }, // LATIN CAPITAL LETTER Y WITH TILDE + { 0x1EFA, 0x1EFB }, // LATIN CAPITAL LETTER MIDDLE-WELSH LL + { 0x1EFC, 0x1EFD }, // LATIN CAPITAL LETTER MIDDLE-WELSH V + { 0x1EFE, 0x1EFF }, // LATIN CAPITAL LETTER Y WITH LOOP + { 0x1F08, 0x1F00 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI + { 0x1F09, 0x1F01 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA + { 0x1F0A, 0x1F02 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA + { 0x1F0B, 0x1F03 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA + { 0x1F0C, 0x1F04 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA + { 0x1F0D, 0x1F05 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA + { 0x1F0E, 0x1F06 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI + { 0x1F0F, 0x1F07 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI + { 0x1F18, 0x1F10 }, // GREEK CAPITAL LETTER EPSILON WITH PSILI + { 0x1F19, 0x1F11 }, // GREEK CAPITAL LETTER EPSILON WITH DASIA + { 0x1F1A, 0x1F12 }, // GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA + { 0x1F1B, 0x1F13 }, // GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA + { 0x1F1C, 0x1F14 }, // GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA + { 0x1F1D, 0x1F15 }, // GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA + { 0x1F28, 0x1F20 }, // GREEK CAPITAL LETTER ETA WITH PSILI + { 0x1F29, 0x1F21 }, // GREEK CAPITAL LETTER ETA WITH DASIA + { 0x1F2A, 0x1F22 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA + { 
0x1F2B, 0x1F23 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA + { 0x1F2C, 0x1F24 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA + { 0x1F2D, 0x1F25 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA + { 0x1F2E, 0x1F26 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI + { 0x1F2F, 0x1F27 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI + { 0x1F38, 0x1F30 }, // GREEK CAPITAL LETTER IOTA WITH PSILI + { 0x1F39, 0x1F31 }, // GREEK CAPITAL LETTER IOTA WITH DASIA + { 0x1F3A, 0x1F32 }, // GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA + { 0x1F3B, 0x1F33 }, // GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA + { 0x1F3C, 0x1F34 }, // GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA + { 0x1F3D, 0x1F35 }, // GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA + { 0x1F3E, 0x1F36 }, // GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI + { 0x1F3F, 0x1F37 }, // GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI + { 0x1F48, 0x1F40 }, // GREEK CAPITAL LETTER OMICRON WITH PSILI + { 0x1F49, 0x1F41 }, // GREEK CAPITAL LETTER OMICRON WITH DASIA + { 0x1F4A, 0x1F42 }, // GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA + { 0x1F4B, 0x1F43 }, // GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA + { 0x1F4C, 0x1F44 }, // GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA + { 0x1F4D, 0x1F45 }, // GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA + { 0x1F59, 0x1F51 }, // GREEK CAPITAL LETTER UPSILON WITH DASIA + { 0x1F5B, 0x1F53 }, // GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA + { 0x1F5D, 0x1F55 }, // GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA + { 0x1F5F, 0x1F57 }, // GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI + { 0x1F68, 0x1F60 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI + { 0x1F69, 0x1F61 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA + { 0x1F6A, 0x1F62 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA + { 0x1F6B, 0x1F63 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA + { 0x1F6C, 0x1F64 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA + { 
0x1F6D, 0x1F65 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA + { 0x1F6E, 0x1F66 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI + { 0x1F6F, 0x1F67 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI + { 0x1F88, 0x1F80 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI + { 0x1F89, 0x1F81 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI + { 0x1F8A, 0x1F82 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI + { 0x1F8B, 0x1F83 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI + { 0x1F8C, 0x1F84 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI + { 0x1F8D, 0x1F85 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI + { 0x1F8E, 0x1F86 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1F8F, 0x1F87 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1F98, 0x1F90 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI + { 0x1F99, 0x1F91 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI + { 0x1F9A, 0x1F92 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI + { 0x1F9B, 0x1F93 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI + { 0x1F9C, 0x1F94 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI + { 0x1F9D, 0x1F95 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI + { 0x1F9E, 0x1F96 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1F9F, 0x1F97 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1FA8, 0x1FA0 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI + { 0x1FA9, 0x1FA1 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI + { 0x1FAA, 0x1FA2 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI + { 0x1FAB, 0x1FA3 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND 
PROSGEGRAMMENI + { 0x1FAC, 0x1FA4 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI + { 0x1FAD, 0x1FA5 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI + { 0x1FAE, 0x1FA6 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1FAF, 0x1FA7 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1FB8, 0x1FB0 }, // GREEK CAPITAL LETTER ALPHA WITH VRACHY + { 0x1FB9, 0x1FB1 }, // GREEK CAPITAL LETTER ALPHA WITH MACRON + { 0x1FBA, 0x1F70 }, // GREEK CAPITAL LETTER ALPHA WITH VARIA + { 0x1FBB, 0x1F71 }, // GREEK CAPITAL LETTER ALPHA WITH OXIA + { 0x1FBC, 0x1FB3 }, // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI + { 0x1FC8, 0x1F72 }, // GREEK CAPITAL LETTER EPSILON WITH VARIA + { 0x1FC9, 0x1F73 }, // GREEK CAPITAL LETTER EPSILON WITH OXIA + { 0x1FCA, 0x1F74 }, // GREEK CAPITAL LETTER ETA WITH VARIA + { 0x1FCB, 0x1F75 }, // GREEK CAPITAL LETTER ETA WITH OXIA + { 0x1FCC, 0x1FC3 }, // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI + { 0x1FD8, 0x1FD0 }, // GREEK CAPITAL LETTER IOTA WITH VRACHY + { 0x1FD9, 0x1FD1 }, // GREEK CAPITAL LETTER IOTA WITH MACRON + { 0x1FDA, 0x1F76 }, // GREEK CAPITAL LETTER IOTA WITH VARIA + { 0x1FDB, 0x1F77 }, // GREEK CAPITAL LETTER IOTA WITH OXIA + { 0x1FE8, 0x1FE0 }, // GREEK CAPITAL LETTER UPSILON WITH VRACHY + { 0x1FE9, 0x1FE1 }, // GREEK CAPITAL LETTER UPSILON WITH MACRON + { 0x1FEA, 0x1F7A }, // GREEK CAPITAL LETTER UPSILON WITH VARIA + { 0x1FEB, 0x1F7B }, // GREEK CAPITAL LETTER UPSILON WITH OXIA + { 0x1FEC, 0x1FE5 }, // GREEK CAPITAL LETTER RHO WITH DASIA + { 0x1FF8, 0x1F78 }, // GREEK CAPITAL LETTER OMICRON WITH VARIA + { 0x1FF9, 0x1F79 }, // GREEK CAPITAL LETTER OMICRON WITH OXIA + { 0x1FFA, 0x1F7C }, // GREEK CAPITAL LETTER OMEGA WITH VARIA + { 0x1FFB, 0x1F7D }, // GREEK CAPITAL LETTER OMEGA WITH OXIA + { 0x1FFC, 0x1FF3 }, // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + { 0x2126, 0x03C9 }, // OHM SIGN + { 0x212A, 0x006B }, // 
KELVIN SIGN + { 0x212B, 0x00E5 }, // ANGSTROM SIGN + { 0x2132, 0x214E }, // TURNED CAPITAL F + { 0x2160, 0x2170 }, // ROMAN NUMERAL ONE + { 0x2161, 0x2171 }, // ROMAN NUMERAL TWO + { 0x2162, 0x2172 }, // ROMAN NUMERAL THREE + { 0x2163, 0x2173 }, // ROMAN NUMERAL FOUR + { 0x2164, 0x2174 }, // ROMAN NUMERAL FIVE + { 0x2165, 0x2175 }, // ROMAN NUMERAL SIX + { 0x2166, 0x2176 }, // ROMAN NUMERAL SEVEN + { 0x2167, 0x2177 }, // ROMAN NUMERAL EIGHT + { 0x2168, 0x2178 }, // ROMAN NUMERAL NINE + { 0x2169, 0x2179 }, // ROMAN NUMERAL TEN + { 0x216A, 0x217A }, // ROMAN NUMERAL ELEVEN + { 0x216B, 0x217B }, // ROMAN NUMERAL TWELVE + { 0x216C, 0x217C }, // ROMAN NUMERAL FIFTY + { 0x216D, 0x217D }, // ROMAN NUMERAL ONE HUNDRED + { 0x216E, 0x217E }, // ROMAN NUMERAL FIVE HUNDRED + { 0x216F, 0x217F }, // ROMAN NUMERAL ONE THOUSAND + { 0x2183, 0x2184 }, // ROMAN NUMERAL REVERSED ONE HUNDRED + { 0x24B6, 0x24D0 }, // CIRCLED LATIN CAPITAL LETTER A + { 0x24B7, 0x24D1 }, // CIRCLED LATIN CAPITAL LETTER B + { 0x24B8, 0x24D2 }, // CIRCLED LATIN CAPITAL LETTER C + { 0x24B9, 0x24D3 }, // CIRCLED LATIN CAPITAL LETTER D + { 0x24BA, 0x24D4 }, // CIRCLED LATIN CAPITAL LETTER E + { 0x24BB, 0x24D5 }, // CIRCLED LATIN CAPITAL LETTER F + { 0x24BC, 0x24D6 }, // CIRCLED LATIN CAPITAL LETTER G + { 0x24BD, 0x24D7 }, // CIRCLED LATIN CAPITAL LETTER H + { 0x24BE, 0x24D8 }, // CIRCLED LATIN CAPITAL LETTER I + { 0x24BF, 0x24D9 }, // CIRCLED LATIN CAPITAL LETTER J + { 0x24C0, 0x24DA }, // CIRCLED LATIN CAPITAL LETTER K + { 0x24C1, 0x24DB }, // CIRCLED LATIN CAPITAL LETTER L + { 0x24C2, 0x24DC }, // CIRCLED LATIN CAPITAL LETTER M + { 0x24C3, 0x24DD }, // CIRCLED LATIN CAPITAL LETTER N + { 0x24C4, 0x24DE }, // CIRCLED LATIN CAPITAL LETTER O + { 0x24C5, 0x24DF }, // CIRCLED LATIN CAPITAL LETTER P + { 0x24C6, 0x24E0 }, // CIRCLED LATIN CAPITAL LETTER Q + { 0x24C7, 0x24E1 }, // CIRCLED LATIN CAPITAL LETTER R + { 0x24C8, 0x24E2 }, // CIRCLED LATIN CAPITAL LETTER S + { 0x24C9, 0x24E3 }, // CIRCLED LATIN CAPITAL 
LETTER T + { 0x24CA, 0x24E4 }, // CIRCLED LATIN CAPITAL LETTER U + { 0x24CB, 0x24E5 }, // CIRCLED LATIN CAPITAL LETTER V + { 0x24CC, 0x24E6 }, // CIRCLED LATIN CAPITAL LETTER W + { 0x24CD, 0x24E7 }, // CIRCLED LATIN CAPITAL LETTER X + { 0x24CE, 0x24E8 }, // CIRCLED LATIN CAPITAL LETTER Y + { 0x24CF, 0x24E9 }, // CIRCLED LATIN CAPITAL LETTER Z + { 0x2C00, 0x2C30 }, // GLAGOLITIC CAPITAL LETTER AZU + { 0x2C01, 0x2C31 }, // GLAGOLITIC CAPITAL LETTER BUKY + { 0x2C02, 0x2C32 }, // GLAGOLITIC CAPITAL LETTER VEDE + { 0x2C03, 0x2C33 }, // GLAGOLITIC CAPITAL LETTER GLAGOLI + { 0x2C04, 0x2C34 }, // GLAGOLITIC CAPITAL LETTER DOBRO + { 0x2C05, 0x2C35 }, // GLAGOLITIC CAPITAL LETTER YESTU + { 0x2C06, 0x2C36 }, // GLAGOLITIC CAPITAL LETTER ZHIVETE + { 0x2C07, 0x2C37 }, // GLAGOLITIC CAPITAL LETTER DZELO + { 0x2C08, 0x2C38 }, // GLAGOLITIC CAPITAL LETTER ZEMLJA + { 0x2C09, 0x2C39 }, // GLAGOLITIC CAPITAL LETTER IZHE + { 0x2C0A, 0x2C3A }, // GLAGOLITIC CAPITAL LETTER INITIAL IZHE + { 0x2C0B, 0x2C3B }, // GLAGOLITIC CAPITAL LETTER I + { 0x2C0C, 0x2C3C }, // GLAGOLITIC CAPITAL LETTER DJERVI + { 0x2C0D, 0x2C3D }, // GLAGOLITIC CAPITAL LETTER KAKO + { 0x2C0E, 0x2C3E }, // GLAGOLITIC CAPITAL LETTER LJUDIJE + { 0x2C0F, 0x2C3F }, // GLAGOLITIC CAPITAL LETTER MYSLITE + { 0x2C10, 0x2C40 }, // GLAGOLITIC CAPITAL LETTER NASHI + { 0x2C11, 0x2C41 }, // GLAGOLITIC CAPITAL LETTER ONU + { 0x2C12, 0x2C42 }, // GLAGOLITIC CAPITAL LETTER POKOJI + { 0x2C13, 0x2C43 }, // GLAGOLITIC CAPITAL LETTER RITSI + { 0x2C14, 0x2C44 }, // GLAGOLITIC CAPITAL LETTER SLOVO + { 0x2C15, 0x2C45 }, // GLAGOLITIC CAPITAL LETTER TVRIDO + { 0x2C16, 0x2C46 }, // GLAGOLITIC CAPITAL LETTER UKU + { 0x2C17, 0x2C47 }, // GLAGOLITIC CAPITAL LETTER FRITU + { 0x2C18, 0x2C48 }, // GLAGOLITIC CAPITAL LETTER HERU + { 0x2C19, 0x2C49 }, // GLAGOLITIC CAPITAL LETTER OTU + { 0x2C1A, 0x2C4A }, // GLAGOLITIC CAPITAL LETTER PE + { 0x2C1B, 0x2C4B }, // GLAGOLITIC CAPITAL LETTER SHTA + { 0x2C1C, 0x2C4C }, // GLAGOLITIC CAPITAL LETTER TSI + { 
0x2C1D, 0x2C4D }, // GLAGOLITIC CAPITAL LETTER CHRIVI + { 0x2C1E, 0x2C4E }, // GLAGOLITIC CAPITAL LETTER SHA + { 0x2C1F, 0x2C4F }, // GLAGOLITIC CAPITAL LETTER YERU + { 0x2C20, 0x2C50 }, // GLAGOLITIC CAPITAL LETTER YERI + { 0x2C21, 0x2C51 }, // GLAGOLITIC CAPITAL LETTER YATI + { 0x2C22, 0x2C52 }, // GLAGOLITIC CAPITAL LETTER SPIDERY HA + { 0x2C23, 0x2C53 }, // GLAGOLITIC CAPITAL LETTER YU + { 0x2C24, 0x2C54 }, // GLAGOLITIC CAPITAL LETTER SMALL YUS + { 0x2C25, 0x2C55 }, // GLAGOLITIC CAPITAL LETTER SMALL YUS WITH TAIL + { 0x2C26, 0x2C56 }, // GLAGOLITIC CAPITAL LETTER YO + { 0x2C27, 0x2C57 }, // GLAGOLITIC CAPITAL LETTER IOTATED SMALL YUS + { 0x2C28, 0x2C58 }, // GLAGOLITIC CAPITAL LETTER BIG YUS + { 0x2C29, 0x2C59 }, // GLAGOLITIC CAPITAL LETTER IOTATED BIG YUS + { 0x2C2A, 0x2C5A }, // GLAGOLITIC CAPITAL LETTER FITA + { 0x2C2B, 0x2C5B }, // GLAGOLITIC CAPITAL LETTER IZHITSA + { 0x2C2C, 0x2C5C }, // GLAGOLITIC CAPITAL LETTER SHTAPIC + { 0x2C2D, 0x2C5D }, // GLAGOLITIC CAPITAL LETTER TROKUTASTI A + { 0x2C2E, 0x2C5E }, // GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE + { 0x2C60, 0x2C61 }, // LATIN CAPITAL LETTER L WITH DOUBLE BAR + { 0x2C62, 0x026B }, // LATIN CAPITAL LETTER L WITH MIDDLE TILDE + { 0x2C63, 0x1D7D }, // LATIN CAPITAL LETTER P WITH STROKE + { 0x2C64, 0x027D }, // LATIN CAPITAL LETTER R WITH TAIL + { 0x2C67, 0x2C68 }, // LATIN CAPITAL LETTER H WITH DESCENDER + { 0x2C69, 0x2C6A }, // LATIN CAPITAL LETTER K WITH DESCENDER + { 0x2C6B, 0x2C6C }, // LATIN CAPITAL LETTER Z WITH DESCENDER + { 0x2C6D, 0x0251 }, // LATIN CAPITAL LETTER ALPHA + { 0x2C6E, 0x0271 }, // LATIN CAPITAL LETTER M WITH HOOK + { 0x2C6F, 0x0250 }, // LATIN CAPITAL LETTER TURNED A + { 0x2C70, 0x0252 }, // LATIN CAPITAL LETTER TURNED ALPHA + { 0x2C72, 0x2C73 }, // LATIN CAPITAL LETTER W WITH HOOK + { 0x2C75, 0x2C76 }, // LATIN CAPITAL LETTER HALF H + { 0x2C7E, 0x023F }, // LATIN CAPITAL LETTER S WITH SWASH TAIL + { 0x2C7F, 0x0240 }, // LATIN CAPITAL LETTER Z WITH SWASH TAIL + { 0x2C80, 0x2C81 
}, // COPTIC CAPITAL LETTER ALFA + { 0x2C82, 0x2C83 }, // COPTIC CAPITAL LETTER VIDA + { 0x2C84, 0x2C85 }, // COPTIC CAPITAL LETTER GAMMA + { 0x2C86, 0x2C87 }, // COPTIC CAPITAL LETTER DALDA + { 0x2C88, 0x2C89 }, // COPTIC CAPITAL LETTER EIE + { 0x2C8A, 0x2C8B }, // COPTIC CAPITAL LETTER SOU + { 0x2C8C, 0x2C8D }, // COPTIC CAPITAL LETTER ZATA + { 0x2C8E, 0x2C8F }, // COPTIC CAPITAL LETTER HATE + { 0x2C90, 0x2C91 }, // COPTIC CAPITAL LETTER THETHE + { 0x2C92, 0x2C93 }, // COPTIC CAPITAL LETTER IAUDA + { 0x2C94, 0x2C95 }, // COPTIC CAPITAL LETTER KAPA + { 0x2C96, 0x2C97 }, // COPTIC CAPITAL LETTER LAULA + { 0x2C98, 0x2C99 }, // COPTIC CAPITAL LETTER MI + { 0x2C9A, 0x2C9B }, // COPTIC CAPITAL LETTER NI + { 0x2C9C, 0x2C9D }, // COPTIC CAPITAL LETTER KSI + { 0x2C9E, 0x2C9F }, // COPTIC CAPITAL LETTER O + { 0x2CA0, 0x2CA1 }, // COPTIC CAPITAL LETTER PI + { 0x2CA2, 0x2CA3 }, // COPTIC CAPITAL LETTER RO + { 0x2CA4, 0x2CA5 }, // COPTIC CAPITAL LETTER SIMA + { 0x2CA6, 0x2CA7 }, // COPTIC CAPITAL LETTER TAU + { 0x2CA8, 0x2CA9 }, // COPTIC CAPITAL LETTER UA + { 0x2CAA, 0x2CAB }, // COPTIC CAPITAL LETTER FI + { 0x2CAC, 0x2CAD }, // COPTIC CAPITAL LETTER KHI + { 0x2CAE, 0x2CAF }, // COPTIC CAPITAL LETTER PSI + { 0x2CB0, 0x2CB1 }, // COPTIC CAPITAL LETTER OOU + { 0x2CB2, 0x2CB3 }, // COPTIC CAPITAL LETTER DIALECT-P ALEF + { 0x2CB4, 0x2CB5 }, // COPTIC CAPITAL LETTER OLD COPTIC AIN + { 0x2CB6, 0x2CB7 }, // COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE + { 0x2CB8, 0x2CB9 }, // COPTIC CAPITAL LETTER DIALECT-P KAPA + { 0x2CBA, 0x2CBB }, // COPTIC CAPITAL LETTER DIALECT-P NI + { 0x2CBC, 0x2CBD }, // COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI + { 0x2CBE, 0x2CBF }, // COPTIC CAPITAL LETTER OLD COPTIC OOU + { 0x2CC0, 0x2CC1 }, // COPTIC CAPITAL LETTER SAMPI + { 0x2CC2, 0x2CC3 }, // COPTIC CAPITAL LETTER CROSSED SHEI + { 0x2CC4, 0x2CC5 }, // COPTIC CAPITAL LETTER OLD COPTIC SHEI + { 0x2CC6, 0x2CC7 }, // COPTIC CAPITAL LETTER OLD COPTIC ESH + { 0x2CC8, 0x2CC9 }, // COPTIC CAPITAL LETTER AKHMIMIC 
KHEI + { 0x2CCA, 0x2CCB }, // COPTIC CAPITAL LETTER DIALECT-P HORI + { 0x2CCC, 0x2CCD }, // COPTIC CAPITAL LETTER OLD COPTIC HORI + { 0x2CCE, 0x2CCF }, // COPTIC CAPITAL LETTER OLD COPTIC HA + { 0x2CD0, 0x2CD1 }, // COPTIC CAPITAL LETTER L-SHAPED HA + { 0x2CD2, 0x2CD3 }, // COPTIC CAPITAL LETTER OLD COPTIC HEI + { 0x2CD4, 0x2CD5 }, // COPTIC CAPITAL LETTER OLD COPTIC HAT + { 0x2CD6, 0x2CD7 }, // COPTIC CAPITAL LETTER OLD COPTIC GANGIA + { 0x2CD8, 0x2CD9 }, // COPTIC CAPITAL LETTER OLD COPTIC DJA + { 0x2CDA, 0x2CDB }, // COPTIC CAPITAL LETTER OLD COPTIC SHIMA + { 0x2CDC, 0x2CDD }, // COPTIC CAPITAL LETTER OLD NUBIAN SHIMA + { 0x2CDE, 0x2CDF }, // COPTIC CAPITAL LETTER OLD NUBIAN NGI + { 0x2CE0, 0x2CE1 }, // COPTIC CAPITAL LETTER OLD NUBIAN NYI + { 0x2CE2, 0x2CE3 }, // COPTIC CAPITAL LETTER OLD NUBIAN WAU + { 0x2CEB, 0x2CEC }, // COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI + { 0x2CED, 0x2CEE }, // COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA + { 0xA640, 0xA641 }, // CYRILLIC CAPITAL LETTER ZEMLYA + { 0xA642, 0xA643 }, // CYRILLIC CAPITAL LETTER DZELO + { 0xA644, 0xA645 }, // CYRILLIC CAPITAL LETTER REVERSED DZE + { 0xA646, 0xA647 }, // CYRILLIC CAPITAL LETTER IOTA + { 0xA648, 0xA649 }, // CYRILLIC CAPITAL LETTER DJERV + { 0xA64A, 0xA64B }, // CYRILLIC CAPITAL LETTER MONOGRAPH UK + { 0xA64C, 0xA64D }, // CYRILLIC CAPITAL LETTER BROAD OMEGA + { 0xA64E, 0xA64F }, // CYRILLIC CAPITAL LETTER NEUTRAL YER + { 0xA650, 0xA651 }, // CYRILLIC CAPITAL LETTER YERU WITH BACK YER + { 0xA652, 0xA653 }, // CYRILLIC CAPITAL LETTER IOTIFIED YAT + { 0xA654, 0xA655 }, // CYRILLIC CAPITAL LETTER REVERSED YU + { 0xA656, 0xA657 }, // CYRILLIC CAPITAL LETTER IOTIFIED A + { 0xA658, 0xA659 }, // CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS + { 0xA65A, 0xA65B }, // CYRILLIC CAPITAL LETTER BLENDED YUS + { 0xA65C, 0xA65D }, // CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS + { 0xA65E, 0xA65F }, // CYRILLIC CAPITAL LETTER YN + { 0xA660, 0xA661 }, // CYRILLIC CAPITAL LETTER REVERSED TSE + { 0xA662, 
0xA663 }, // CYRILLIC CAPITAL LETTER SOFT DE + { 0xA664, 0xA665 }, // CYRILLIC CAPITAL LETTER SOFT EL + { 0xA666, 0xA667 }, // CYRILLIC CAPITAL LETTER SOFT EM + { 0xA668, 0xA669 }, // CYRILLIC CAPITAL LETTER MONOCULAR O + { 0xA66A, 0xA66B }, // CYRILLIC CAPITAL LETTER BINOCULAR O + { 0xA66C, 0xA66D }, // CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O + { 0xA680, 0xA681 }, // CYRILLIC CAPITAL LETTER DWE + { 0xA682, 0xA683 }, // CYRILLIC CAPITAL LETTER DZWE + { 0xA684, 0xA685 }, // CYRILLIC CAPITAL LETTER ZHWE + { 0xA686, 0xA687 }, // CYRILLIC CAPITAL LETTER CCHE + { 0xA688, 0xA689 }, // CYRILLIC CAPITAL LETTER DZZE + { 0xA68A, 0xA68B }, // CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK + { 0xA68C, 0xA68D }, // CYRILLIC CAPITAL LETTER TWE + { 0xA68E, 0xA68F }, // CYRILLIC CAPITAL LETTER TSWE + { 0xA690, 0xA691 }, // CYRILLIC CAPITAL LETTER TSSE + { 0xA692, 0xA693 }, // CYRILLIC CAPITAL LETTER TCHE + { 0xA694, 0xA695 }, // CYRILLIC CAPITAL LETTER HWE + { 0xA696, 0xA697 }, // CYRILLIC CAPITAL LETTER SHWE + { 0xA722, 0xA723 }, // LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF + { 0xA724, 0xA725 }, // LATIN CAPITAL LETTER EGYPTOLOGICAL AIN + { 0xA726, 0xA727 }, // LATIN CAPITAL LETTER HENG + { 0xA728, 0xA729 }, // LATIN CAPITAL LETTER TZ + { 0xA72A, 0xA72B }, // LATIN CAPITAL LETTER TRESILLO + { 0xA72C, 0xA72D }, // LATIN CAPITAL LETTER CUATRILLO + { 0xA72E, 0xA72F }, // LATIN CAPITAL LETTER CUATRILLO WITH COMMA + { 0xA732, 0xA733 }, // LATIN CAPITAL LETTER AA + { 0xA734, 0xA735 }, // LATIN CAPITAL LETTER AO + { 0xA736, 0xA737 }, // LATIN CAPITAL LETTER AU + { 0xA738, 0xA739 }, // LATIN CAPITAL LETTER AV + { 0xA73A, 0xA73B }, // LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR + { 0xA73C, 0xA73D }, // LATIN CAPITAL LETTER AY + { 0xA73E, 0xA73F }, // LATIN CAPITAL LETTER REVERSED C WITH DOT + { 0xA740, 0xA741 }, // LATIN CAPITAL LETTER K WITH STROKE + { 0xA742, 0xA743 }, // LATIN CAPITAL LETTER K WITH DIAGONAL STROKE + { 0xA744, 0xA745 }, // LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL 
STROKE + { 0xA746, 0xA747 }, // LATIN CAPITAL LETTER BROKEN L + { 0xA748, 0xA749 }, // LATIN CAPITAL LETTER L WITH HIGH STROKE + { 0xA74A, 0xA74B }, // LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY + { 0xA74C, 0xA74D }, // LATIN CAPITAL LETTER O WITH LOOP + { 0xA74E, 0xA74F }, // LATIN CAPITAL LETTER OO + { 0xA750, 0xA751 }, // LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER + { 0xA752, 0xA753 }, // LATIN CAPITAL LETTER P WITH FLOURISH + { 0xA754, 0xA755 }, // LATIN CAPITAL LETTER P WITH SQUIRREL TAIL + { 0xA756, 0xA757 }, // LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER + { 0xA758, 0xA759 }, // LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE + { 0xA75A, 0xA75B }, // LATIN CAPITAL LETTER R ROTUNDA + { 0xA75C, 0xA75D }, // LATIN CAPITAL LETTER RUM ROTUNDA + { 0xA75E, 0xA75F }, // LATIN CAPITAL LETTER V WITH DIAGONAL STROKE + { 0xA760, 0xA761 }, // LATIN CAPITAL LETTER VY + { 0xA762, 0xA763 }, // LATIN CAPITAL LETTER VISIGOTHIC Z + { 0xA764, 0xA765 }, // LATIN CAPITAL LETTER THORN WITH STROKE + { 0xA766, 0xA767 }, // LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER + { 0xA768, 0xA769 }, // LATIN CAPITAL LETTER VEND + { 0xA76A, 0xA76B }, // LATIN CAPITAL LETTER ET + { 0xA76C, 0xA76D }, // LATIN CAPITAL LETTER IS + { 0xA76E, 0xA76F }, // LATIN CAPITAL LETTER CON + { 0xA779, 0xA77A }, // LATIN CAPITAL LETTER INSULAR D + { 0xA77B, 0xA77C }, // LATIN CAPITAL LETTER INSULAR F + { 0xA77D, 0x1D79 }, // LATIN CAPITAL LETTER INSULAR G + { 0xA77E, 0xA77F }, // LATIN CAPITAL LETTER TURNED INSULAR G + { 0xA780, 0xA781 }, // LATIN CAPITAL LETTER TURNED L + { 0xA782, 0xA783 }, // LATIN CAPITAL LETTER INSULAR R + { 0xA784, 0xA785 }, // LATIN CAPITAL LETTER INSULAR S + { 0xA786, 0xA787 }, // LATIN CAPITAL LETTER INSULAR T + { 0xA78B, 0xA78C }, // LATIN CAPITAL LETTER SALTILLO + { 0xA78D, 0x0265 }, // LATIN CAPITAL LETTER TURNED H + { 0xA790, 0xA791 }, // LATIN CAPITAL LETTER N WITH DESCENDER + { 0xA7A0, 0xA7A1 }, // LATIN CAPITAL LETTER G WITH OBLIQUE STROKE + { 
0xA7A2, 0xA7A3 },  // LATIN CAPITAL LETTER K WITH OBLIQUE STROKE
+    { 0xA7A4, 0xA7A5 },  // LATIN CAPITAL LETTER N WITH OBLIQUE STROKE
+    { 0xA7A6, 0xA7A7 },  // LATIN CAPITAL LETTER R WITH OBLIQUE STROKE
+    { 0xA7A8, 0xA7A9 },  // LATIN CAPITAL LETTER S WITH OBLIQUE STROKE
+    { 0xFF21, 0xFF41 },  // FULLWIDTH LATIN CAPITAL LETTER A
+    { 0xFF22, 0xFF42 },  // FULLWIDTH LATIN CAPITAL LETTER B
+    { 0xFF23, 0xFF43 },  // FULLWIDTH LATIN CAPITAL LETTER C
+    { 0xFF24, 0xFF44 },  // FULLWIDTH LATIN CAPITAL LETTER D
+    { 0xFF25, 0xFF45 },  // FULLWIDTH LATIN CAPITAL LETTER E
+    { 0xFF26, 0xFF46 },  // FULLWIDTH LATIN CAPITAL LETTER F
+    { 0xFF27, 0xFF47 },  // FULLWIDTH LATIN CAPITAL LETTER G
+    { 0xFF28, 0xFF48 },  // FULLWIDTH LATIN CAPITAL LETTER H
+    { 0xFF29, 0xFF49 },  // FULLWIDTH LATIN CAPITAL LETTER I
+    { 0xFF2A, 0xFF4A },  // FULLWIDTH LATIN CAPITAL LETTER J
+    { 0xFF2B, 0xFF4B },  // FULLWIDTH LATIN CAPITAL LETTER K
+    { 0xFF2C, 0xFF4C },  // FULLWIDTH LATIN CAPITAL LETTER L
+    { 0xFF2D, 0xFF4D },  // FULLWIDTH LATIN CAPITAL LETTER M
+    { 0xFF2E, 0xFF4E },  // FULLWIDTH LATIN CAPITAL LETTER N
+    { 0xFF2F, 0xFF4F },  // FULLWIDTH LATIN CAPITAL LETTER O
+    { 0xFF30, 0xFF50 },  // FULLWIDTH LATIN CAPITAL LETTER P
+    { 0xFF31, 0xFF51 },  // FULLWIDTH LATIN CAPITAL LETTER Q
+    { 0xFF32, 0xFF52 },  // FULLWIDTH LATIN CAPITAL LETTER R
+    { 0xFF33, 0xFF53 },  // FULLWIDTH LATIN CAPITAL LETTER S
+    { 0xFF34, 0xFF54 },  // FULLWIDTH LATIN CAPITAL LETTER T
+    { 0xFF35, 0xFF55 },  // FULLWIDTH LATIN CAPITAL LETTER U
+    { 0xFF36, 0xFF56 },  // FULLWIDTH LATIN CAPITAL LETTER V
+    { 0xFF37, 0xFF57 },  // FULLWIDTH LATIN CAPITAL LETTER W
+    { 0xFF38, 0xFF58 },  // FULLWIDTH LATIN CAPITAL LETTER X
+    { 0xFF39, 0xFF59 },  // FULLWIDTH LATIN CAPITAL LETTER Y
+    { 0xFF3A, 0xFF5A }  // FULLWIDTH LATIN CAPITAL LETTER Z
+};
+
+// Comparator handed to bsearch() by latin_tolower(). `a` points at the searched key (an
+// unsigned short) and `b` at the LatinCapitalSmallPair being probed. The result is negative,
+// zero or positive when the key is below, equal to or above that pair's `capital` field.
+static int compare_pair_capital(const void *a, const void *b) {
+    const unsigned short *const key = static_cast<const unsigned short *>(a);
+    const struct LatinCapitalSmallPair *const pair =
+            static_cast<const struct LatinCapitalSmallPair *>(b);
+    // Both operands are widened to int before subtracting, so the difference keeps its sign.
+    return static_cast<int>(*key) - static_cast<int>(pair->capital);
+}
+
+// Binary-searches SORTED_CHAR_MAP for an entry whose `capital` equals c and returns that
+// entry's `small` value. When no entry matches, c is returned unchanged.
+/* static */ unsigned short CharUtils::latin_tolower(const unsigned short c) {
+    void *const match = bsearch(&c, SORTED_CHAR_MAP, NELEMS(SORTED_CHAR_MAP),
+            sizeof(SORTED_CHAR_MAP[0]), compare_pair_capital);
+    if (!match) {
+        return c;
+    }
+    return static_cast<struct LatinCapitalSmallPair *>(match)->small;
+}
+
+/*
+ * Table mapping most combined Latin, Greek, and Cyrillic characters
+ * to their base characters. If c is in range, CharUtils::BASE_CHARS[c] == c
+ * if c is not a combined character, or the base character if it
+ * is combined.
+ *
+ * Generated with:
+ *  cat UnicodeData.txt | perl -e 'while (<>) { @foo = split(/;/); $foo[5] =~ s/<.*> //; \
+ *    $base[hex($foo[0])] = hex($foo[5]);} \
+ *    for ($i = 0; $i < 0x500; $i += 8) { printf("/" . "* U+%04X *" . "/ ", $i); \
+ *      for ($j = $i; $j < $i + 8; $j++) { \
+ *        printf("0x%04X, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }'
+ */
+/* static */ const unsigned short CharUtils::BASE_CHARS[CharUtils::BASE_CHARS_SIZE] = {
+    /* U+0000 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
+    /* U+0008 */ 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
+    /* U+0010 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
+    /* U+0018 */ 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
+    /* U+0020 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
+    /* U+0028 */ 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
+    /* U+0030 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
+    /* U+0038 */ 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
+    /* U+0040 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+    /* U+0048 */ 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
+    /* U+0050 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+    /* U+0058 */ 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
+    /* U+0060 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065,
0x0066, 0x0067, + /* U+0068 */ 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, + /* U+0070 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + /* U+0078 */ 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, + /* U+0080 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, + /* U+0088 */ 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F, + /* U+0090 */ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, + /* U+0098 */ 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, + /* U+00A0 */ 0x0020, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, + /* U+00A8 */ 0x0020, 0x00A9, 0x0061, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0020, + /* U+00B0 */ 0x00B0, 0x00B1, 0x0032, 0x0033, 0x0020, 0x03BC, 0x00B6, 0x00B7, + /* U+00B8 */ 0x0020, 0x0031, 0x006F, 0x00BB, 0x0031, 0x0031, 0x0033, 0x00BF, + /* U+00C0 */ 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x00C6, 0x0043, + /* U+00C8 */ 0x0045, 0x0045, 0x0045, 0x0045, 0x0049, 0x0049, 0x0049, 0x0049, + /* U+00D0 */ 0x00D0, 0x004E, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x00D7, + /* U+00D8 */ 0x004F, 0x0055, 0x0055, 0x0055, 0x0055, 0x0059, 0x00DE, 0x0073, + // U+00D8: Manually changed from 00D8 to 004F + // TODO: Check if it's really acceptable to consider Ø a diacritical variant of O + // U+00DF: Manually changed from 00DF to 0073 + /* U+00E0 */ 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x00E6, 0x0063, + /* U+00E8 */ 0x0065, 0x0065, 0x0065, 0x0065, 0x0069, 0x0069, 0x0069, 0x0069, + /* U+00F0 */ 0x00F0, 0x006E, 0x006F, 0x006F, 0x006F, 0x006F, 0x006F, 0x00F7, + /* U+00F8 */ 0x006F, 0x0075, 0x0075, 0x0075, 0x0075, 0x0079, 0x00FE, 0x0079, + // U+00F8: Manually changed from 00F8 to 006F + // TODO: Check if it's really acceptable to consider ø a diacritical variant of o + /* U+0100 */ 0x0041, 0x0061, 0x0041, 0x0061, 0x0041, 0x0061, 0x0043, 0x0063, + /* U+0108 */ 0x0043, 0x0063, 0x0043, 0x0063, 0x0043, 0x0063, 0x0044, 0x0064, + /* U+0110 
*/ 0x0110, 0x0111, 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065, + /* U+0118 */ 0x0045, 0x0065, 0x0045, 0x0065, 0x0047, 0x0067, 0x0047, 0x0067, + /* U+0120 */ 0x0047, 0x0067, 0x0047, 0x0067, 0x0048, 0x0068, 0x0126, 0x0127, + /* U+0128 */ 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069, + /* U+0130 */ 0x0049, 0x0131, 0x0049, 0x0069, 0x004A, 0x006A, 0x004B, 0x006B, + /* U+0138 */ 0x0138, 0x004C, 0x006C, 0x004C, 0x006C, 0x004C, 0x006C, 0x004C, + /* U+0140 */ 0x006C, 0x004C, 0x006C, 0x004E, 0x006E, 0x004E, 0x006E, 0x004E, + // U+0141: Manually changed from 0141 to 004C + // U+0142: Manually changed from 0142 to 006C + /* U+0148 */ 0x006E, 0x02BC, 0x014A, 0x014B, 0x004F, 0x006F, 0x004F, 0x006F, + /* U+0150 */ 0x004F, 0x006F, 0x0152, 0x0153, 0x0052, 0x0072, 0x0052, 0x0072, + /* U+0158 */ 0x0052, 0x0072, 0x0053, 0x0073, 0x0053, 0x0073, 0x0053, 0x0073, + /* U+0160 */ 0x0053, 0x0073, 0x0054, 0x0074, 0x0054, 0x0074, 0x0166, 0x0167, + /* U+0168 */ 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, + /* U+0170 */ 0x0055, 0x0075, 0x0055, 0x0075, 0x0057, 0x0077, 0x0059, 0x0079, + /* U+0178 */ 0x0059, 0x005A, 0x007A, 0x005A, 0x007A, 0x005A, 0x007A, 0x0073, + /* U+0180 */ 0x0180, 0x0181, 0x0182, 0x0183, 0x0184, 0x0185, 0x0186, 0x0187, + /* U+0188 */ 0x0188, 0x0189, 0x018A, 0x018B, 0x018C, 0x018D, 0x018E, 0x018F, + /* U+0190 */ 0x0190, 0x0191, 0x0192, 0x0193, 0x0194, 0x0195, 0x0196, 0x0197, + /* U+0198 */ 0x0198, 0x0199, 0x019A, 0x019B, 0x019C, 0x019D, 0x019E, 0x019F, + /* U+01A0 */ 0x004F, 0x006F, 0x01A2, 0x01A3, 0x01A4, 0x01A5, 0x01A6, 0x01A7, + /* U+01A8 */ 0x01A8, 0x01A9, 0x01AA, 0x01AB, 0x01AC, 0x01AD, 0x01AE, 0x0055, + /* U+01B0 */ 0x0075, 0x01B1, 0x01B2, 0x01B3, 0x01B4, 0x01B5, 0x01B6, 0x01B7, + /* U+01B8 */ 0x01B8, 0x01B9, 0x01BA, 0x01BB, 0x01BC, 0x01BD, 0x01BE, 0x01BF, + /* U+01C0 */ 0x01C0, 0x01C1, 0x01C2, 0x01C3, 0x0044, 0x0044, 0x0064, 0x004C, + /* U+01C8 */ 0x004C, 0x006C, 0x004E, 0x004E, 0x006E, 0x0041, 0x0061, 0x0049, + /* U+01D0 */ 
0x0069, 0x004F, 0x006F, 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, + // U+01D5: Manually changed from 00DC to 0055 + // U+01D6: Manually changed from 00FC to 0075 + // U+01D7: Manually changed from 00DC to 0055 + /* U+01D8 */ 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, 0x01DD, 0x0041, 0x0061, + // U+01D8: Manually changed from 00FC to 0075 + // U+01D9: Manually changed from 00DC to 0055 + // U+01DA: Manually changed from 00FC to 0075 + // U+01DB: Manually changed from 00DC to 0055 + // U+01DC: Manually changed from 00FC to 0075 + // U+01DE: Manually changed from 00C4 to 0041 + // U+01DF: Manually changed from 00E4 to 0061 + /* U+01E0 */ 0x0041, 0x0061, 0x00C6, 0x00E6, 0x01E4, 0x01E5, 0x0047, 0x0067, + // U+01E0: Manually changed from 0226 to 0041 + // U+01E1: Manually changed from 0227 to 0061 + /* U+01E8 */ 0x004B, 0x006B, 0x004F, 0x006F, 0x004F, 0x006F, 0x01B7, 0x0292, + // U+01EC: Manually changed from 01EA to 004F + // U+01ED: Manually changed from 01EB to 006F + /* U+01F0 */ 0x006A, 0x0044, 0x0044, 0x0064, 0x0047, 0x0067, 0x01F6, 0x01F7, + /* U+01F8 */ 0x004E, 0x006E, 0x0041, 0x0061, 0x00C6, 0x00E6, 0x004F, 0x006F, + // U+01FA: Manually changed from 00C5 to 0041 + // U+01FB: Manually changed from 00E5 to 0061 + // U+01FE: Manually changed from 00D8 to 004F + // TODO: Check if it's really acceptable to consider Ø a diacritical variant of O + // U+01FF: Manually changed from 00F8 to 006F + // TODO: Check if it's really acceptable to consider ø a diacritical variant of o + /* U+0200 */ 0x0041, 0x0061, 0x0041, 0x0061, 0x0045, 0x0065, 0x0045, 0x0065, + /* U+0208 */ 0x0049, 0x0069, 0x0049, 0x0069, 0x004F, 0x006F, 0x004F, 0x006F, + /* U+0210 */ 0x0052, 0x0072, 0x0052, 0x0072, 0x0055, 0x0075, 0x0055, 0x0075, + /* U+0218 */ 0x0053, 0x0073, 0x0054, 0x0074, 0x021C, 0x021D, 0x0048, 0x0068, + /* U+0220 */ 0x0220, 0x0221, 0x0222, 0x0223, 0x0224, 0x0225, 0x0041, 0x0061, + /* U+0228 */ 0x0045, 0x0065, 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x006F, + // U+022A: Manually changed from 
00D6 to 004F + // U+022B: Manually changed from 00F6 to 006F + // U+022C: Manually changed from 00D5 to 004F + // U+022D: Manually changed from 00F5 to 006F + /* U+0230 */ 0x004F, 0x006F, 0x0059, 0x0079, 0x0234, 0x0235, 0x0236, 0x0237, + // U+0230: Manually changed from 022E to 004F + // U+0231: Manually changed from 022F to 006F + /* U+0238 */ 0x0238, 0x0239, 0x023A, 0x023B, 0x023C, 0x023D, 0x023E, 0x023F, + /* U+0240 */ 0x0240, 0x0241, 0x0242, 0x0243, 0x0244, 0x0245, 0x0246, 0x0247, + /* U+0248 */ 0x0248, 0x0249, 0x024A, 0x024B, 0x024C, 0x024D, 0x024E, 0x024F, + /* U+0250 */ 0x0250, 0x0251, 0x0252, 0x0253, 0x0254, 0x0255, 0x0256, 0x0257, + /* U+0258 */ 0x0258, 0x0259, 0x025A, 0x025B, 0x025C, 0x025D, 0x025E, 0x025F, + /* U+0260 */ 0x0260, 0x0261, 0x0262, 0x0263, 0x0264, 0x0265, 0x0266, 0x0267, + /* U+0268 */ 0x0268, 0x0269, 0x026A, 0x026B, 0x026C, 0x026D, 0x026E, 0x026F, + /* U+0270 */ 0x0270, 0x0271, 0x0272, 0x0273, 0x0274, 0x0275, 0x0276, 0x0277, + /* U+0278 */ 0x0278, 0x0279, 0x027A, 0x027B, 0x027C, 0x027D, 0x027E, 0x027F, + /* U+0280 */ 0x0280, 0x0281, 0x0282, 0x0283, 0x0284, 0x0285, 0x0286, 0x0287, + /* U+0288 */ 0x0288, 0x0289, 0x028A, 0x028B, 0x028C, 0x028D, 0x028E, 0x028F, + /* U+0290 */ 0x0290, 0x0291, 0x0292, 0x0293, 0x0294, 0x0295, 0x0296, 0x0297, + /* U+0298 */ 0x0298, 0x0299, 0x029A, 0x029B, 0x029C, 0x029D, 0x029E, 0x029F, + /* U+02A0 */ 0x02A0, 0x02A1, 0x02A2, 0x02A3, 0x02A4, 0x02A5, 0x02A6, 0x02A7, + /* U+02A8 */ 0x02A8, 0x02A9, 0x02AA, 0x02AB, 0x02AC, 0x02AD, 0x02AE, 0x02AF, + /* U+02B0 */ 0x0068, 0x0266, 0x006A, 0x0072, 0x0279, 0x027B, 0x0281, 0x0077, + /* U+02B8 */ 0x0079, 0x02B9, 0x02BA, 0x02BB, 0x02BC, 0x02BD, 0x02BE, 0x02BF, + /* U+02C0 */ 0x02C0, 0x02C1, 0x02C2, 0x02C3, 0x02C4, 0x02C5, 0x02C6, 0x02C7, + /* U+02C8 */ 0x02C8, 0x02C9, 0x02CA, 0x02CB, 0x02CC, 0x02CD, 0x02CE, 0x02CF, + /* U+02D0 */ 0x02D0, 0x02D1, 0x02D2, 0x02D3, 0x02D4, 0x02D5, 0x02D6, 0x02D7, + /* U+02D8 */ 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x02DE, 0x02DF, + /* 
U+02E0 */ 0x0263, 0x006C, 0x0073, 0x0078, 0x0295, 0x02E5, 0x02E6, 0x02E7, + /* U+02E8 */ 0x02E8, 0x02E9, 0x02EA, 0x02EB, 0x02EC, 0x02ED, 0x02EE, 0x02EF, + /* U+02F0 */ 0x02F0, 0x02F1, 0x02F2, 0x02F3, 0x02F4, 0x02F5, 0x02F6, 0x02F7, + /* U+02F8 */ 0x02F8, 0x02F9, 0x02FA, 0x02FB, 0x02FC, 0x02FD, 0x02FE, 0x02FF, + /* U+0300 */ 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307, + /* U+0308 */ 0x0308, 0x0309, 0x030A, 0x030B, 0x030C, 0x030D, 0x030E, 0x030F, + /* U+0310 */ 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 0x0315, 0x0316, 0x0317, + /* U+0318 */ 0x0318, 0x0319, 0x031A, 0x031B, 0x031C, 0x031D, 0x031E, 0x031F, + /* U+0320 */ 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, + /* U+0328 */ 0x0328, 0x0329, 0x032A, 0x032B, 0x032C, 0x032D, 0x032E, 0x032F, + /* U+0330 */ 0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337, + /* U+0338 */ 0x0338, 0x0339, 0x033A, 0x033B, 0x033C, 0x033D, 0x033E, 0x033F, + /* U+0340 */ 0x0300, 0x0301, 0x0342, 0x0313, 0x0308, 0x0345, 0x0346, 0x0347, + /* U+0348 */ 0x0348, 0x0349, 0x034A, 0x034B, 0x034C, 0x034D, 0x034E, 0x034F, + /* U+0350 */ 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, + /* U+0358 */ 0x0358, 0x0359, 0x035A, 0x035B, 0x035C, 0x035D, 0x035E, 0x035F, + /* U+0360 */ 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, + /* U+0368 */ 0x0368, 0x0369, 0x036A, 0x036B, 0x036C, 0x036D, 0x036E, 0x036F, + /* U+0370 */ 0x0370, 0x0371, 0x0372, 0x0373, 0x02B9, 0x0375, 0x0376, 0x0377, + /* U+0378 */ 0x0378, 0x0379, 0x0020, 0x037B, 0x037C, 0x037D, 0x003B, 0x037F, + /* U+0380 */ 0x0380, 0x0381, 0x0382, 0x0383, 0x0020, 0x00A8, 0x0391, 0x00B7, + /* U+0388 */ 0x0395, 0x0397, 0x0399, 0x038B, 0x039F, 0x038D, 0x03A5, 0x03A9, + /* U+0390 */ 0x03CA, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, + /* U+0398 */ 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F, + /* U+03A0 */ 0x03A0, 0x03A1, 0x03A2, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, + /* U+03A8 */ 0x03A8, 
0x03A9, 0x0399, 0x03A5, 0x03B1, 0x03B5, 0x03B7, 0x03B9, + /* U+03B0 */ 0x03CB, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, + /* U+03B8 */ 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, + /* U+03C0 */ 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, + /* U+03C8 */ 0x03C8, 0x03C9, 0x03B9, 0x03C5, 0x03BF, 0x03C5, 0x03C9, 0x03CF, + /* U+03D0 */ 0x03B2, 0x03B8, 0x03A5, 0x03D2, 0x03D2, 0x03C6, 0x03C0, 0x03D7, + /* U+03D8 */ 0x03D8, 0x03D9, 0x03DA, 0x03DB, 0x03DC, 0x03DD, 0x03DE, 0x03DF, + /* U+03E0 */ 0x03E0, 0x03E1, 0x03E2, 0x03E3, 0x03E4, 0x03E5, 0x03E6, 0x03E7, + /* U+03E8 */ 0x03E8, 0x03E9, 0x03EA, 0x03EB, 0x03EC, 0x03ED, 0x03EE, 0x03EF, + /* U+03F0 */ 0x03BA, 0x03C1, 0x03C2, 0x03F3, 0x0398, 0x03B5, 0x03F6, 0x03F7, + /* U+03F8 */ 0x03F8, 0x03A3, 0x03FA, 0x03FB, 0x03FC, 0x03FD, 0x03FE, 0x03FF, + /* U+0400 */ 0x0415, 0x0415, 0x0402, 0x0413, 0x0404, 0x0405, 0x0406, 0x0406, + /* U+0408 */ 0x0408, 0x0409, 0x040A, 0x040B, 0x041A, 0x0418, 0x0423, 0x040F, + /* U+0410 */ 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, + /* U+0418 */ 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F, + // U+0419: Manually changed from 0418 to 0419 + /* U+0420 */ 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, + /* U+0428 */ 0x0428, 0x0429, 0x042C, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F, + // U+042A: Manually changed from 042A to 042C + /* U+0430 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, + /* U+0438 */ 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + // U+0439: Manually changed from 0438 to 0439 + /* U+0440 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, + /* U+0448 */ 0x0448, 0x0449, 0x044C, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + // U+044A: Manually changed from 044A to 044C + /* U+0450 */ 0x0435, 0x0435, 0x0452, 0x0433, 0x0454, 0x0455, 0x0456, 0x0456, + /* U+0458 */ 0x0458, 0x0459, 0x045A, 0x045B, 0x043A, 0x0438, 0x0443, 0x045F, + /* U+0460 
*/ 0x0460, 0x0461, 0x0462, 0x0463, 0x0464, 0x0465, 0x0466, 0x0467, + /* U+0468 */ 0x0468, 0x0469, 0x046A, 0x046B, 0x046C, 0x046D, 0x046E, 0x046F, + /* U+0470 */ 0x0470, 0x0471, 0x0472, 0x0473, 0x0474, 0x0475, 0x0474, 0x0475, + /* U+0478 */ 0x0478, 0x0479, 0x047A, 0x047B, 0x047C, 0x047D, 0x047E, 0x047F, + /* U+0480 */ 0x0480, 0x0481, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, + /* U+0488 */ 0x0488, 0x0489, 0x048A, 0x048B, 0x048C, 0x048D, 0x048E, 0x048F, + /* U+0490 */ 0x0490, 0x0491, 0x0492, 0x0493, 0x0494, 0x0495, 0x0496, 0x0497, + /* U+0498 */ 0x0498, 0x0499, 0x049A, 0x049B, 0x049C, 0x049D, 0x049E, 0x049F, + /* U+04A0 */ 0x04A0, 0x04A1, 0x04A2, 0x04A3, 0x04A4, 0x04A5, 0x04A6, 0x04A7, + /* U+04A8 */ 0x04A8, 0x04A9, 0x04AA, 0x04AB, 0x04AC, 0x04AD, 0x04AE, 0x04AF, + /* U+04B0 */ 0x04B0, 0x04B1, 0x04B2, 0x04B3, 0x04B4, 0x04B5, 0x04B6, 0x04B7, + /* U+04B8 */ 0x04B8, 0x04B9, 0x04BA, 0x04BB, 0x04BC, 0x04BD, 0x04BE, 0x04BF, + /* U+04C0 */ 0x04C0, 0x0416, 0x0436, 0x04C3, 0x04C4, 0x04C5, 0x04C6, 0x04C7, + /* U+04C8 */ 0x04C8, 0x04C9, 0x04CA, 0x04CB, 0x04CC, 0x04CD, 0x04CE, 0x04CF, + /* U+04D0 */ 0x0410, 0x0430, 0x0410, 0x0430, 0x04D4, 0x04D5, 0x0415, 0x0435, + /* U+04D8 */ 0x04D8, 0x04D9, 0x04D8, 0x04D9, 0x0416, 0x0436, 0x0417, 0x0437, + /* U+04E0 */ 0x04E0, 0x04E1, 0x0418, 0x0438, 0x0418, 0x0438, 0x041E, 0x043E, + /* U+04E8 */ 0x04E8, 0x04E9, 0x04E8, 0x04E9, 0x042D, 0x044D, 0x0423, 0x0443, + /* U+04F0 */ 0x0423, 0x0443, 0x0423, 0x0443, 0x0427, 0x0447, 0x04F6, 0x04F7, + /* U+04F8 */ 0x042B, 0x044B, 0x04FA, 0x04FB, 0x04FC, 0x04FD, 0x04FE, 0x04FF, +}; +} // namespace latinime
diff --git a/src/aosp/utils/char_utils.h b/src/aosp/utils/char_utils.h new file mode 100644 index 0000000..41663c8 --- /dev/null +++ b/src/aosp/utils/char_utils.h
@@ -0,0 +1,103 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_CHAR_UTILS_H +#define LATINIME_CHAR_UTILS_H + +#include <cctype> + +#include "defines.h" + +namespace latinime { + +class CharUtils { + public: + static AK_FORCE_INLINE bool isAsciiUpper(int c) { + // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to + // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...). 
+ return (c >= 'A' && c <= 'Z'); + } + + static AK_FORCE_INLINE int toAsciiLower(int c) { + return c - 'A' + 'a'; + } + + static AK_FORCE_INLINE bool isAscii(int c) { + return isascii(c) != 0; + } + + static AK_FORCE_INLINE int toLowerCase(const int c) { + if (isAsciiUpper(c)) { + return toAsciiLower(c); + } + if (isAscii(c)) { + return c; + } + return static_cast<int>(latin_tolower(static_cast<unsigned short>(c))); + } + + static AK_FORCE_INLINE int toBaseLowerCase(const int c) { + return toLowerCase(toBaseCodePoint(c)); + } + + static AK_FORCE_INLINE bool isIntentionalOmissionCodePoint(const int codePoint) { + // TODO: Do not hardcode here + return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS; + } + + static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) { + int size = 0; + for (; size < arraySize; ++size) { + if (codePoints[size] == '\0') { + break; + } + } + return size; + } + + static AK_FORCE_INLINE int toBaseCodePoint(int c) { + if (c < BASE_CHARS_SIZE) { + return static_cast<int>(BASE_CHARS[c]); + } + return c; + } + + static AK_FORCE_INLINE int getSpaceCount(const int *const codePointBuffer, const int length) { + int spaceCount = 0; + for (int i = 0; i < length; ++i) { + if (codePointBuffer[i] == KEYCODE_SPACE) { + ++spaceCount; + } + } + return spaceCount; + } + + static unsigned short latin_tolower(const unsigned short c); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); + + /** + * Table mapping most combined Latin, Greek, and Cyrillic characters + * to their base characters. If c is in range, BASE_CHARS[c] == c + * if c is not a combined character, or the base character if it + * is combined. + */ + static const int BASE_CHARS_SIZE = 0x0500; + static const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; +}; +} // namespace latinime +#endif // LATINIME_CHAR_UTILS_H
diff --git a/src/aosp/utils/hash_map_compat.h b/src/aosp/utils/hash_map_compat.h new file mode 100644 index 0000000..a1e982b --- /dev/null +++ b/src/aosp/utils/hash_map_compat.h
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LATINIME_HASH_MAP_COMPAT_H
#define LATINIME_HASH_MAP_COMPAT_H

// Portability shim: defines the macro hash_map_compat as the name of the
// pre-C++11 hash_map template offered by the toolchain in use.

// TODO: Use std::unordered_map that has been standardized in C++11

// Header selection: <ext/hash_map> when __APPLE__ is defined, <hash_map> otherwise.
#ifdef __APPLE__
#include <ext/hash_map>
#else // __APPLE__
#include <hash_map>
#endif // __APPLE__

// Namespace selection: stlport:: when __SGI_STL_PORT is defined, __gnu_cxx:: otherwise.
#ifdef __SGI_STL_PORT
#define hash_map_compat stlport::hash_map
#else // __SGI_STL_PORT
#define hash_map_compat __gnu_cxx::hash_map
#endif // __SGI_STL_PORT

#endif // LATINIME_HASH_MAP_COMPAT_H
diff --git a/src/aosp/utils/log_utils.cpp b/src/aosp/utils/log_utils.cpp new file mode 100644 index 0000000..5ab2b28 --- /dev/null +++ b/src/aosp/utils/log_utils.cpp
@@ -0,0 +1,72 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "log_utils.h" + +#include <cstdio> +#include <stdarg.h> + +#include "defines.h" + +namespace latinime { + /* static */ void LogUtils::logToJava(JNIEnv *const env, const char *const format, ...) { + static const char *TAG = "LatinIME:LogUtils"; + const jclass androidUtilLogClass = env->FindClass("android/util/Log"); + if (!androidUtilLogClass) { + // If we can't find the class, we are probably in off-device testing, and + // it's expected. Regardless, logging is not essential to functionality, so + // we should just return. However, FindClass has thrown an exception behind + // our back and there is no way to prevent it from doing that, so we clear + // the exception before we return. + env->ExceptionClear(); + return; + } + const jmethodID logDotIMethodId = env->GetStaticMethodID(androidUtilLogClass, "i", + "(Ljava/lang/String;Ljava/lang/String;)I"); + if (!logDotIMethodId) { + env->ExceptionClear(); + if (androidUtilLogClass) env->DeleteLocalRef(androidUtilLogClass); + return; + } + const jstring javaTag = env->NewStringUTF(TAG); + + static const int DEFAULT_LINE_SIZE = 128; + char fixedSizeCString[DEFAULT_LINE_SIZE]; + va_list argList; + va_start(argList, format); + // Get the necessary size. Add 1 for the 0 terminator. 
+ const int size = vsnprintf(fixedSizeCString, DEFAULT_LINE_SIZE, format, argList) + 1; + va_end(argList); + + jstring javaString; + if (size <= DEFAULT_LINE_SIZE) { + // The buffer was large enough. + javaString = env->NewStringUTF(fixedSizeCString); + } else { + // The buffer was not large enough. + va_start(argList, format); + char variableSizeCString[size]; + vsnprintf(variableSizeCString, size, format, argList); + va_end(argList); + javaString = env->NewStringUTF(variableSizeCString); + } + + env->CallStaticIntMethod(androidUtilLogClass, logDotIMethodId, javaTag, javaString); + if (javaString) env->DeleteLocalRef(javaString); + if (javaTag) env->DeleteLocalRef(javaTag); + if (androidUtilLogClass) env->DeleteLocalRef(androidUtilLogClass); + } +}
diff --git a/src/aosp/utils/log_utils.h b/src/aosp/utils/log_utils.h new file mode 100644 index 0000000..6ac16d9 --- /dev/null +++ b/src/aosp/utils/log_utils.h
@@ -0,0 +1,37 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_LOG_UTILS_H +#define LATINIME_LOG_UTILS_H + +#include "defines.h" +#include "jni.h" + +namespace latinime { + +class LogUtils { + public: + static void logToJava(JNIEnv *const env, const char *const format, ...) +#ifdef __GNUC__ + __attribute__ ((format (printf, 2, 3))) +#endif // __GNUC__ + ; + + private: + DISALLOW_COPY_AND_ASSIGN(LogUtils); +}; +} // namespace latinime +#endif // LATINIME_LOG_UTILS_H
diff --git a/src/demo.cpp b/src/demo.cpp new file mode 100644 index 0000000..37b20cc --- /dev/null +++ b/src/demo.cpp
@@ -0,0 +1,88 @@ +#include <iostream> +#include <fstream> +#include <string> +#include <list> +#include <vector> + +#include <memory> + +#include <suggest/suggest.h> +#include <ncurses.h> + +using namespace suggest; + +int main(int argc, char** argv) { + // read a simple file with keyboard description + // file format: + // keyboard_width keyboard_height + // key_width key_height + // num_keys + // char x y width height + // char x y width height + // ... + std::ifstream keylist("keylist.txt"); + + vec2f keyboard_size; + vec2f common_key_size; + keylist >> keyboard_size.x >> keyboard_size.y; + keylist >> common_key_size.x >> common_key_size.y; + + int num_keys; + keylist >> num_keys; + + if (!keylist.good()) + return -1; + + std::vector<Key> keys(num_keys); + for (int i=0; i<num_keys; ++i) { + std::string codestr; + keylist >> codestr + >> keys[i].rect.pos.x >> keys[i].rect.pos.y + >> keys[i].rect.size.x >> keys[i].rect.size.y; + + if (!keylist.good()) + return -1; + + if (codestr == std::string("SPC")) { + keys[i].code = ' '; + } else { + keys[i].code = codestr.c_str()[0]; + } + } + + // create suggestion engine with key list + std::unique_ptr<SuggestEngine> engine(NewSuggestEngine( + keyboard_size, common_key_size, keys, SuggestParameters("en_US"))); + engine->LoadDictionary("en_US"); + + // do some test touches and receive suggestions + std::unique_ptr<SuggestSession> session(engine->NewSession()); + + initscr(); + std::vector<Touch> touches; + std::string display = "$ "; + while (true) { + char c = getch(); + if (c == ' ') { + display.append(1, c); + touches.clear(); + } else if (c == '\n') { + Suggestion suggestion = *session->GetSuggestions(touches).begin(); + display = display.substr(0, display.rfind(' ')+1); + display += suggestion.word + " "; + touches.clear(); + } else { + display.append(1, c); + touches.push_back(Touch(c, *engine)); + } + move(0, 0); + printw(display.c_str()); + move(1, 0); + for (const Suggestion &suggestion: 
session->GetSuggestions(touches)) { + printw("%s - %d\n", suggestion.word.c_str(), suggestion.frequency); + } + refresh(); + } + endwin(); + return 0; +} \ No newline at end of file
diff --git a/src/suggest.cpp b/src/suggest.cpp new file mode 100644 index 0000000..918b814 --- /dev/null +++ b/src/suggest.cpp
@@ -0,0 +1,264 @@ +// Copyright (c) 2013 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <suggest/suggest.h> + +#include <memory> +#include <fstream> +#include <iostream> + +#include "aosp/suggest/core/layout/proximity_info.h" +#include "aosp/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h" +#include "aosp/suggest/core/dictionary/dictionary.h" +#include "aosp/suggest/core/session/dic_traverse_session.h" +#include "aosp/suggest/core/suggest_options.h" + +namespace { +std::vector<int> FillVector(int size, std::function<int (int)> fillFunc) { + std::vector<int> array(size); + for(int i=0; i<size; ++i) { + array[i] = fillFunc(i); + } + return array; +} +} + +namespace suggest { + +using namespace latinime; + +Touch::Touch(vec2f pos, charcode code) : pos(pos), code(code) {} + + +Touch::Touch(vec2f pos, const SuggestEngine& engine) : pos(pos) { + code = engine.GetKeyAt(pos).code; +} + +Touch::Touch(charcode code, const SuggestEngine& engine) : code(code) { + pos = engine.GetKey(code).rect.center(); +} + + +class SuggestEngineImpl; + +class SuggestSessionImpl : public SuggestSession { + public: + SuggestSessionImpl(SuggestEngineImpl* engine); + virtual const std::list<Suggestion>& GetSuggestions( + const std::vector<Touch> &touches, + std::string previous_word); + + private: + std::unique_ptr<SuggestOptions> options_; + std::unique_ptr<DicTraverseSession> session_; + std::list<Suggestion> suggestions_; + SuggestEngineImpl* engine_; +}; + +class SuggestEngineImpl : public SuggestEngine { + public: + SuggestEngineImpl(vec2f keyboard_size, vec2f common_key_size, + const std::vector<Key> &keys, + const SuggestParameters &p) : parameters_(p), keys_(keys) { + JNIEnv fakeEnv; + int num_keys = keys.size(); + + // build proximity chars grid + std::vector<int> proximityChars(p.grid_cells.x * p.grid_cells.y * MAX_PROXIMITY_CHARS_SIZE, 
0); + vec2f cell_size = div_elem(keyboard_size, p.grid_cells); + vec2f search_box_size = mul(common_key_size, p.search_box_size_factor); + + for (int y=0; y<p.grid_cells.y; ++y) { + for (int x=0; x<p.grid_cells.x; ++x) { + rect2f cell(mul_elem(vec2f(x, y), cell_size), cell_size); + rect2f search_rect = cell.resized(search_box_size); + + int i = ((y * p.grid_cells.x) + x) * MAX_PROXIMITY_CHARS_SIZE; + + for (int k=0; k<num_keys; ++k) { + if (search_rect.intersects(keys[k].rect)) { + proximityChars[i++] = keys[k].code; + } + } + } + } +#ifdef DEBUG + for (int y=0; y<GRID_CELLS_Y; ++y) { + for (int x=0; x<GRID_CELLS_X; ++x) { + AKLOGI("(%d, %d) = %c %c %c %c %c %c %c %c", x, y, + (char)proximityChars[((y * grid_cells.x) + x) * MAX_PROXIMITY_CHARS_SIZE + 0], + (char)proximityChars[((y * grid_cells.x) + x) * MAX_PROXIMITY_CHARS_SIZE + 1], + (char)proximityChars[((y * grid_cells.x) + x) * MAX_PROXIMITY_CHARS_SIZE + 2], + (char)proximityChars[((y * grid_cells.x) + x) * MAX_PROXIMITY_CHARS_SIZE + 3], + (char)proximityChars[((y * grid_cells.x) + x) * MAX_PROXIMITY_CHARS_SIZE + 4], + (char)proximityChars[((y * grid_cells.x) + x) * MAX_PROXIMITY_CHARS_SIZE + 5], + (char)proximityChars[((y * grid_cells.x) + x) * MAX_PROXIMITY_CHARS_SIZE + 6], + (char)proximityChars[((y * grid_cells.x) + x) * MAX_PROXIMITY_CHARS_SIZE + 7]); + } + } +#endif + // build aosp compabtile data structures + std::vector<int> keyXCoordinates = FillVector(num_keys, [&](int i) -> int { return keys[i].rect.pos.x; }); + std::vector<int> keyYCoordinates = FillVector(num_keys, [&](int i) -> int { return keys[i].rect.pos.y; }); + std::vector<int> keyWidths = FillVector(num_keys, [&](int i) -> int { return keys[i].rect.size.x; }); + std::vector<int> keyHeights = FillVector(num_keys, [&](int i) -> int { return keys[i].rect.size.y; }); + std::vector<int> keyCharCodes = FillVector(num_keys, [&](int i) -> int { return keys[i].code; }); + +#ifdef DEBUG + for (int y=0; y<grid_cells.y; ++y) { + for (int x=0; 
x<grid_cells.x; ++x) { + AKLOGI("(%d, %d) = %c", keyXCoordinates[y * 5 + x], keyYCoordinates[y * 5 + x], (char)keyCharCodes[y * 5 + x]); + } + } +#endif + + // initialize aosp dictionary and proximity info + proximity_info_.reset(new ProximityInfo(&fakeEnv, p.locale.c_str(), + keyboard_size.x, keyboard_size.y, + p.grid_cells.x, p.grid_cells.y, + common_key_size.x, common_key_size.y, + &proximityChars, + num_keys, + &keyXCoordinates, &keyYCoordinates, + &keyWidths, &keyHeights, + &keyCharCodes, + NULL, NULL, NULL)); + } + + virtual bool LoadDictionary(std::string locale) { + JNIEnv fakeEnv; + + std::string dict_filename = "/usr/share/libsuggest/" + locale + ".dict"; + std::cout << "Loading: " << dict_filename << std::endl; + std::ifstream stream(dict_filename); + if (!stream.good()) + return false; + stream.seekg(0, stream.end); + int filesize = stream.tellg(); + stream.close(); + std::cout << "Size: " << filesize << std::endl; + + dict_structure_ = + DictionaryStructureWithBufferPolicyFactory::newDictionaryStructureWithBufferPolicy( + dict_filename.c_str(), 0, filesize, false); + + dictionary_.reset(new Dictionary(&fakeEnv, dict_structure_)); + } + + virtual Key GetKeyAt(vec2f pos) const { + for(const Key &key: keys_) { + if (key.rect.contains(pos)) { + return key; + } + } + return Key::InvalidKey; + } + + virtual Key GetKey(charcode code) const { + for(const Key &key: keys_) { + if (key.code == code) { + return key; + } + } + return Key::InvalidKey; + } + + virtual charcode CodeAt(vec2f pos) { + + } + + virtual SuggestSession* NewSession() { + return new SuggestSessionImpl(this); + } + +public: + ProximityInfo* proximity_info(){ return proximity_info_.get(); } + Dictionary* dictionary(){ return dictionary_.get(); } + const SuggestParameters& parameters() { return parameters_; } + private: + std::unique_ptr<ProximityInfo> proximity_info_; + DictionaryStructureWithBufferPolicy *dict_structure_; + std::unique_ptr<Dictionary> dictionary_; + SuggestParameters 
parameters_; + std::vector<Key> keys_; + +}; + +SuggestSessionImpl::SuggestSessionImpl(SuggestEngineImpl* engine) : engine_(engine) { + JNIEnv fakeEnv; + + options_.reset(new SuggestOptions(NULL, 0)); + session_.reset(new DicTraverseSession(&fakeEnv, + engine->parameters().locale.c_str(), true)); + +} + +const std::list<Suggestion>& SuggestSessionImpl::GetSuggestions( + const std::vector<Touch> &touches, + std::string previous_word="") { + suggestions_.clear(); + + std::vector<int> x_coords = FillVector(touches.size(), + [&](int i) { return touches[i].pos.x; }); + std::vector<int> y_coords = FillVector(touches.size(), + [&](int i) { return touches[i].pos.y; }); + std::vector<int> codes = FillVector(touches.size(), + [&](int i) { return touches[i].code; }); + std::vector<int> times = FillVector(touches.size(), + [&](int i) { return i; }); + std::vector<int> pointer_ids(touches.size(), 0); + + + int outWords[1024] = {0}; + int frequencies[1024] = {0}; + int spaceIndices[1024] = {0}; + int outputTypes[1024] = {0}; + int outputCommitFirstWordConfidence[1024] = {0}; + + engine_->dictionary()->getSuggestions( + engine_->proximity_info(), session_.get(), + &x_coords[0], &y_coords[0], ×[0], &pointer_ids[0], &codes[0], + touches.size(), + NULL, 0, + 0, + options_.get(), + outWords, frequencies, spaceIndices, outputTypes, + outputCommitFirstWordConfidence); + + for (int i = 0; i < MAX_RESULTS; ++i) { + Suggestion suggestion; + + for (int j=0; j<MAX_WORD_LENGTH; ++j) { + char code = outWords[i * MAX_WORD_LENGTH + j]; + if (code == 0) + break; + suggestion.word.append(1, (char)outWords[i * MAX_WORD_LENGTH + j]); + } + + if (suggestion.word.size() > 0) { + suggestion.frequency = frequencies[i]; + suggestions_.push_back(suggestion); + } else { + break; + } + } + + return suggestions_; +} + +Key Key::InvalidKey(vec2f(0, 0), vec2f(0, 0), 0); + +SuggestParameters::SuggestParameters(std::string locale) + : grid_cells(10, 10), + search_box_size_factor(1.5), + locale(locale) {} + 
+SuggestEngine* NewSuggestEngine(vec2f keyboard_size, vec2f common_key_size, + const std::vector<Key> &keylist, + const SuggestParameters ¶meters) { + return new SuggestEngineImpl(keyboard_size, common_key_size, + keylist, parameters); +} + +} // namespace suggest \ No newline at end of file