| // Copyright 2010-2011, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "prediction/user_history_predictor.h" |
| |
| #include <time.h> |
| |
| #include <algorithm> |
| #include <climits> |
| #include <string> |
| |
| #include "base/base.h" |
| #include "base/encryptor.h" |
| #include "base/mmap.h" |
| #include "base/init.h" |
| #include "base/thread.h" |
| #include "base/util.h" |
| #include "base/password_manager.h" |
| #include "base/config_file_stream.h" |
| #include "converter/segments.h" |
| #include "prediction/predictor_interface.h" |
| #include "prediction/user_history_predictor.pb.h" |
| #include "rewriter/variants_rewriter.h" |
| #include "storage/lru_cache.h" |
| #include "session/commands.pb.h" |
| #include "session/config_handler.h" |
| #include "session/config.pb.h" |
| #include "usage_stats/usage_stats.h" |
| |
| |
| namespace mozc { |
| |
| namespace { |
| // find suggestion candidates from the most recent 3000 history in LRU. |
| // We don't check all history, since suggestion is called every key event |
| const size_t kMaxSuggestionTrial = 3000; |
| |
| // find suffix matches of history_segments from the most recent 500 histories |
| // in LRU. |
| const size_t kMaxPrevValueTrial = 500; |
| |
| // cache size |
| const size_t kLRUCacheSize = 10000; |
| |
| // don't save key/value that are |
| // longer than kMaxCandidateSize to avoid memory explosion |
| const size_t kMaxStringLength = 256; |
| |
| // salt size for encryption |
| const size_t kSaltSize = 32; |
| |
| // maximum size of next_entries |
| const size_t kMaxNextEntriesSize = 4; |
| |
| // 64Mbyte |
| // Maximum file size for history |
| const size_t kMaxFileSize = 64 * 1024 * 1024; |
| |
| // revert id for user_history_predictor |
| const uint16 kRevertId = 1; |
| |
| // default object pool size for EntryPriorityQueue |
| const size_t kEntryPoolSize = 16; |
| |
| // file name for the history |
| #ifdef OS_WINDOWS |
| const char kFileName[] = "user://history.db"; |
| #else |
| const char kFileName[] = "user://.history.db"; |
| #endif |
| |
| // use '\t' as a key/value delimiter |
| const char kDelimiter[] = "\t"; |
| |
| bool IsPunctuation(const string &value) { |
| // return (value == "。" || value == "." || |
| // value == "、" || value == "," || |
| // value == "?" || value == "?" || |
| // value == "," || value == "."); |
| return (value == "\xE3\x80\x82" || value == "." || |
| value == "\xE3\x80\x81" || value == "," || |
| value == "\xEF\xBC\x9F" || value == "?" || |
| value == "\xEF\xBC\x8C" || value == "\xEF\xBC\x8E"); |
| } |
| |
| // return true if value looks like a content word. |
| // Currently, just checks the script type. |
| bool IsContentWord(const string &value) { |
| return Util::CharsLen(value) > 1 || |
| Util::GetScriptType(value) != Util::UNKNOWN_SCRIPT; |
| } |
| |
| // return true if prev_entry has a next_fp link to entry |
| bool HasBigramEntry(const UserHistoryPredictor::Entry &entry, |
| const UserHistoryPredictor::Entry &prev_entry) { |
| const uint32 fp = UserHistoryPredictor::EntryFingerprint(entry); |
| for (int i = 0; i < prev_entry.next_entries_size(); ++i) { |
| if (fp == prev_entry.next_entries(i).entry_fp()) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| // Returns true if the input first candidate seems to be a privacy sensitive |
| // such like password |
| bool IsPrivacySensitive(const Segments *segments) { |
| // This is a very restricted rule and it will be relaxed. |
| if (segments->conversion_segments_size() != 1) { |
| return false; |
| } |
| const Segment &first_segment = segments->conversion_segment(0); |
| const string &first_candidate_value = first_segment.candidate(0).value; |
| // if key looks like hiragana, the candidate is Katakana to English |
| // transliteration. Don't suppress transliterated candidates. |
| // http://b/issue?id=4394325 |
| return (Util::GetCharacterSet(first_candidate_value) == Util::ASCII && |
| (Util::ContainsScriptType(first_segment.key(), Util::ALPHABET) || |
| Util::GetScriptType(first_segment.key()) == Util::NUMBER)); |
| } |
| } // namespace |
| |
| UserHistoryStorage::UserHistoryStorage(const string &filename) |
| : filename_(filename) {} |
| |
| UserHistoryStorage::~UserHistoryStorage() {} |
| |
| string UserHistoryStorage::filename() const { |
| return filename_; |
| } |
| |
| bool UserHistoryStorage::Load() { |
| string input, salt; |
| |
| // read encrypted message and salt from local file |
| { |
| Mmap<char> mmap; |
| if (!mmap.Open(filename_.c_str(), "r")) { |
| LOG(ERROR) << "cannot open user history file"; |
| return false; |
| } |
| |
| if (mmap.GetFileSize() < kSaltSize) { |
| LOG(ERROR) << "file size is too small"; |
| return false; |
| } |
| |
| if (mmap.GetFileSize() > kMaxFileSize) { |
| LOG(ERROR) << "file size is too big."; |
| return false; |
| } |
| |
| // copy salt |
| char tmp[kSaltSize]; |
| memcpy(tmp, mmap.begin(), kSaltSize); |
| salt.assign(tmp, kSaltSize); |
| |
| // copy body |
| input.assign(mmap.begin() + kSaltSize, |
| mmap.GetFileSize() - kSaltSize); |
| } |
| |
| string password; |
| if (!PasswordManager::GetPassword(&password)) { |
| LOG(ERROR) << "PasswordManager::GetPassword() failed"; |
| return false; |
| } |
| |
| if (password.empty()) { |
| LOG(ERROR) << "password is empty"; |
| return false; |
| } |
| |
| // Decrypt message |
| Encryptor::Key key; |
| if (!key.DeriveFromPassword(password, salt)) { |
| LOG(ERROR) << "Encryptor::Key::DeriveFromPassword failed"; |
| return false; |
| } |
| |
| if (!Encryptor::DecryptString(key, &input)) { |
| LOG(ERROR) << "Encryptor::DecryptString() failed"; |
| return false; |
| } |
| |
| if (!ParseFromString(input)) { |
| LOG(ERROR) << "ParseFromString failed. message looks broken"; |
| return false; |
| } |
| |
| VLOG(1) << "Loaded user histroy, size=" << entries_size(); |
| |
| return true; |
| } |
| |
| bool UserHistoryStorage::Save() const { |
| string salt, output; |
| { |
| if (entries_size() == 0) { |
| LOG(WARNING) << "etries size is 0. Not saved"; |
| return false; |
| } |
| |
| if (!AppendToString(&output)) { |
| LOG(ERROR) << "AppendToString failed"; |
| return false; |
| } |
| } |
| |
| { |
| string password; |
| if (!PasswordManager::GetPassword(&password)) { |
| LOG(ERROR) << "PasswordManager::GetPassword() failed"; |
| return false; |
| } |
| |
| if (password.empty()) { |
| LOG(ERROR) << "password is empty"; |
| return false; |
| } |
| |
| char tmp[kSaltSize]; |
| memset(tmp, '\0', sizeof(tmp)); |
| Util::GetSecureRandomSequence(tmp, sizeof(tmp)); |
| salt.assign(tmp, sizeof(tmp)); |
| |
| Encryptor::Key key; |
| if (!key.DeriveFromPassword(password, salt)) { |
| LOG(ERROR) << "Encryptor::Key::DeriveFromPassword() failed"; |
| return false; |
| } |
| |
| if (!Encryptor::EncryptString(key, &output)) { |
| LOG(ERROR) << "Encryptor::EncryptString() failed"; |
| return false; |
| } |
| } |
| |
| // Even if histoy is empty, save to them into a file to |
| // make the file empty |
| const string tmp_filename = filename_ + ".tmp"; |
| { |
| OutputFileStream ofs(tmp_filename.c_str(), ios::out | ios::binary); |
| if (!ofs) { |
| LOG(ERROR) << "failed to write: " << tmp_filename; |
| return false; |
| } |
| |
| VLOG(1) << "Syncing user history to: " << filename_; |
| ofs.write(salt.data(), salt.size()); |
| ofs.write(output.data(), output.size()); |
| } |
| |
| if (!Util::AtomicRename(tmp_filename, filename_)) { |
| LOG(ERROR) << "AtomicRename failed"; |
| } |
| |
| #ifdef OS_WINDOWS |
| wstring wfilename; |
| Util::UTF8ToWide(filename_.c_str(), &wfilename); |
| if (!::SetFileAttributes(wfilename.c_str(), |
| FILE_ATTRIBUTE_HIDDEN | |
| FILE_ATTRIBUTE_SYSTEM)) { |
| LOG(ERROR) << "Cannot make hidden: " << filename_ |
| << " " << ::GetLastError(); |
| } |
| #endif |
| |
| return true; |
| } |
| |
| UserHistoryPredictor::EntryPriorityQueue::EntryPriorityQueue() |
| : pool_(kEntryPoolSize) {} |
| |
| UserHistoryPredictor::EntryPriorityQueue::~EntryPriorityQueue() {} |
| |
| bool UserHistoryPredictor::EntryPriorityQueue::Push(Entry *entry) { |
| DCHECK(entry); |
| if (!seen_.insert(Util::Fingerprint32(entry->value())).second) { |
| VLOG(2) << "found dups"; |
| return false; |
| } |
| const uint32 score = UserHistoryPredictor::GetScore(*entry); |
| agenda_.push(make_pair(score, entry)); |
| return true; |
| } |
| |
| UserHistoryPredictor::Entry * |
| UserHistoryPredictor::EntryPriorityQueue::Pop() { |
| if (agenda_.empty()) { |
| return NULL; |
| } |
| const QueueElement &element = agenda_.top(); |
| Entry *result = element.second; |
| DCHECK(result); |
| agenda_.pop(); |
| return result; |
| } |
| |
| UserHistoryPredictor::Entry * |
| UserHistoryPredictor::EntryPriorityQueue::NewEntry() { |
| return pool_.Alloc(); |
| } |
| |
| class UserHistoryPredictorSyncer : public Thread { |
| public: |
| enum RequestType { |
| LOAD, |
| SAVE |
| }; |
| |
| UserHistoryPredictorSyncer(UserHistoryPredictor *predictor, |
| RequestType type) |
| : predictor_(predictor), type_(type) { |
| DCHECK(predictor_); |
| } |
| |
| virtual void Run() { |
| switch (type_) { |
| case LOAD: |
| VLOG(1) << "Executing Reload method"; |
| predictor_->Load(); |
| break; |
| case SAVE: |
| VLOG(1) << "Executing Sync method"; |
| predictor_->Save(); |
| break; |
| default: |
| LOG(ERROR) << "Unknown request: " << static_cast<int>(type_); |
| } |
| } |
| |
| virtual ~UserHistoryPredictorSyncer() { |
| Join(); |
| } |
| |
| UserHistoryPredictor *predictor_; |
| RequestType type_; |
| }; |
| |
| UserHistoryPredictor::UserHistoryPredictor() |
| : updated_(false), dic_(new DicCache(UserHistoryPredictor::cache_size())) { |
| AsyncLoad(); // non-blocking |
| // Load() blocking version can be used if any |
| } |
| |
| UserHistoryPredictor::~UserHistoryPredictor() { |
| // In destructor, must call blocking version |
| WaitForSyncer(); |
| Save(); // blocking |
| } |
| |
| string UserHistoryPredictor::GetUserHistoryFileName() { |
| return ConfigFileStream::GetFileName(kFileName); |
| } |
| |
| // return revert id |
| uint16 UserHistoryPredictor::revert_id() { |
| return kRevertId; |
| } |
| |
| void UserHistoryPredictor::WaitForSyncer() { |
| if (syncer_.get() != NULL) { |
| syncer_->Join(); |
| syncer_.reset(NULL); |
| } |
| } |
| |
| bool UserHistoryPredictor::CheckSyncerAndDelete() const { |
| if (syncer_.get() != NULL) { |
| if (syncer_->IsRunning()) { |
| return false; |
| } else { |
| syncer_.reset(NULL); // remove |
| } |
| } |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::Sync() { |
| return AsyncSave(); |
| // return Save(); blocking version |
| } |
| |
| bool UserHistoryPredictor::Reload() { |
| WaitForSyncer(); |
| return AsyncLoad(); |
| } |
| |
| bool UserHistoryPredictor::AsyncLoad() { |
| if (!CheckSyncerAndDelete()) { // now loading/saving |
| return true; |
| } |
| |
| syncer_.reset(new UserHistoryPredictorSyncer( |
| this, |
| UserHistoryPredictorSyncer::LOAD)); |
| syncer_->Start(); |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::AsyncSave() { |
| if (!updated_) { |
| return true; |
| } |
| |
| if (!CheckSyncerAndDelete()) { // now loading/saving |
| return true; |
| } |
| |
| syncer_.reset(new UserHistoryPredictorSyncer( |
| this, |
| UserHistoryPredictorSyncer::SAVE)); |
| syncer_->Start(); |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::Load() { |
| const string filename = GetUserHistoryFileName(); |
| |
| UserHistoryStorage history(filename); |
| if (!history.Load()) { |
| LOG(ERROR) << "UserHistoryStorage::Load() failed"; |
| return false; |
| } |
| |
| for (size_t i = 0; i < history.entries_size(); ++i) { |
| dic_->Insert(EntryFingerprint(history.entries(i)), |
| history.entries(i)); |
| } |
| |
| VLOG(1) << "Loaded user histroy, size=" << history.entries_size(); |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::Save() { |
| if (!updated_) { |
| return true; |
| } |
| |
| if (GET_CONFIG(incognito_mode)) { |
| VLOG(2) << "incognito mode"; |
| return true; |
| } |
| |
| if (!GET_CONFIG(use_history_suggest)) { |
| VLOG(2) << "no history suggest"; |
| return true; |
| } |
| |
| const DicElement *tail = dic_->Tail(); |
| if (tail == NULL) { |
| return true; |
| } |
| |
| const string filename = GetUserHistoryFileName(); |
| |
| UserHistoryStorage history(filename); |
| for (const DicElement *elm = tail; elm != NULL; elm = elm->prev) { |
| history.add_entries()->CopyFrom(elm->value); |
| } |
| |
| // update usage stats here. |
| usage_stats::UsageStats::SetInteger( |
| "UserHistoryPredictorEntrySize", |
| static_cast<int>(history.entries_size())); |
| |
| if (!history.Save()) { |
| LOG(ERROR) << "UserHistoryStorage::Save() failed"; |
| return false; |
| } |
| |
| updated_ = false; |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::ClearAllHistory() { |
| // Wait until syncer finishes |
| WaitForSyncer(); |
| |
| VLOG(1) << "Clearing user prediction"; |
| // renew DicCache as LRUCache tries to reuse the internal value by |
| // using FreeList |
| dic_.reset(new DicCache(UserHistoryPredictor::cache_size())); |
| |
| // insert a dummy event entry. |
| InsertEvent(Entry::CLEAN_ALL_EVENT); |
| |
| updated_ = true; |
| |
| Sync(); |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::ClearUnusedHistory() { |
| // Wait until syncer finishes |
| WaitForSyncer(); |
| |
| VLOG(1) << "Clearing unused prediction"; |
| const DicElement *head = dic_->Head(); |
| if (head == NULL) { |
| VLOG(2) << "dic head is NULL"; |
| return false; |
| } |
| |
| vector<uint32> keys; |
| for (const DicElement *elm = head; elm != NULL; elm = elm->next) { |
| VLOG(3) << elm->key << " " << elm->value.suggestion_freq(); |
| if (elm->value.suggestion_freq() == 0) { |
| keys.push_back(elm->key); |
| } |
| } |
| |
| for (size_t i = 0; i < keys.size(); ++i) { |
| VLOG(2) << "Removing: " << keys[i]; |
| if (!dic_->Erase(keys[i])) { |
| LOG(ERROR) << "cannot erase " << keys[i]; |
| } |
| } |
| |
| // insert a dummy event entry. |
| InsertEvent(Entry::CLEAN_UNUSED_EVENT); |
| |
| updated_ = true; |
| |
| Sync(); |
| |
| VLOG(1) << keys.size() << " removed"; |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::LookupEntry( |
| const string &input_key, |
| const UserHistoryPredictor::Entry *entry, |
| const UserHistoryPredictor::Entry *prev_entry, |
| EntryPriorityQueue *results) const { |
| CHECK(entry); |
| CHECK(results); |
| Entry *result = NULL; |
| |
| const Entry *last_entry = NULL; |
| |
| // last_access_time of the left-closest content word. |
| uint32 left_last_access_time = 0; |
| |
| // last_access_time of the left-most content word. |
| uint32 left_most_last_access_time = 0; |
| |
| // Example: [a|B|c|D] |
| // a,c: functional word |
| // B,D: content word |
| // left_last_access_time: timestamp of D |
| // left_most_last_access_time: timestamp of B |
| |
| // |input_key| is a query user is now typing. |
| // |entry->key()| is a target value saved in the database. |
| const MatchType mtype = GetMatchType(input_key, entry->key()); |
| if (mtype == NO_MATCH) { |
| return false; |
| } else if (mtype == LEFT_EMPTY_MATCH) { // zero-query-suggestion |
| // if |input_key| is empty, the |prev_entry| and |entry| must |
| // have bigram relation. |
| if (prev_entry != NULL && HasBigramEntry(*entry, *prev_entry)) { |
| result = results->NewEntry(); |
| result->Clear(); |
| result->CopyFrom(*entry); |
| last_entry = entry; |
| left_last_access_time = entry->last_access_time(); |
| left_most_last_access_time = IsContentWord(entry->value()) ? |
| left_last_access_time : 0; |
| } else { |
| return false; |
| } |
| } else if (mtype == LEFT_PREFIX_MATCH) { |
| // |input_key| is shorter than |entry->key()| |
| // This scenario is a simple prefix match. |
| // e.g., |input_key|="foo", |entry->key()|="foobar" |
| result = results->NewEntry(); |
| result->Clear(); |
| result->CopyFrom(*entry); |
| last_entry = entry; |
| left_last_access_time = entry->last_access_time(); |
| left_most_last_access_time = IsContentWord(entry->value()) ? |
| left_last_access_time : 0; |
| } else if (mtype == RIGHT_PREFIX_MATCH || mtype == EXACT_MATCH) { |
| // |input_key| is longer than or the same as |entry->key()|. |
| // In this case, recursively traverse "next_entries" until |
| // target entry gets longer than input_key. |
| // e.g., |input_key|="foobar", |entry->key()|="foo" |
| left_last_access_time = entry->last_access_time(); |
| left_most_last_access_time = IsContentWord(entry->value()) ? |
| left_last_access_time : 0; |
| string key = entry->key(); |
| string value = entry->value(); |
| const Entry *current_entry = entry; |
| set<uint64> seen; |
| seen.insert(EntryFingerprint(*current_entry)); |
| // Until target entry gets longer than input_key. |
| while (key.size() <= input_key.size()) { |
| const Entry *latest_entry = NULL; |
| const Entry *left_same_timestamp_entry = NULL; |
| const Entry *left_most_same_timestamp_entry = NULL; |
| for (size_t i = 0; i < current_entry->next_entries_size(); ++i) { |
| const Entry *tmp_entry = dic_->LookupWithoutInsert( |
| current_entry->next_entries(i).entry_fp()); |
| if (tmp_entry == NULL || tmp_entry->key().empty()) { |
| continue; |
| } |
| const MatchType mtype2 = GetMatchType(key + tmp_entry->key(), |
| input_key); |
| if (mtype2 == NO_MATCH || mtype2 == LEFT_EMPTY_MATCH) { |
| continue; |
| } |
| if (latest_entry == NULL || |
| latest_entry->last_access_time() < tmp_entry->last_access_time()) { |
| latest_entry = tmp_entry; |
| } |
| if (tmp_entry->last_access_time() == left_last_access_time) { |
| left_same_timestamp_entry = tmp_entry; |
| } |
| if (tmp_entry->last_access_time() == left_most_last_access_time) { |
| left_most_same_timestamp_entry = tmp_entry; |
| } |
| } |
| |
| // Prefer bigrams which are generated at the same time. |
| // When last_access_time are the same, these two bigrams were |
| // input together. |
| // The preferences: |
| // (1). The current entry's time stamp is equal to that of |
| // left most content word |
| // (2). The current entry's time stamp is equal to that of |
| // left closest content word |
| // (3). The current entry is the latest |
| const Entry *next_entry = left_most_same_timestamp_entry; |
| if (next_entry == NULL) { |
| next_entry = left_same_timestamp_entry; |
| } |
| if (next_entry == NULL) { |
| next_entry = latest_entry; |
| } |
| |
| if (next_entry == NULL || next_entry->key().empty()) { |
| break; |
| } |
| |
| // if duplicate entry is found, don't expand more. |
| // This is because an entry only has one timestamp. |
| // we cannot trust the timestamp if there are duplicate values |
| // in one input. |
| if (!seen.insert(EntryFingerprint(*next_entry)).second) { |
| break; |
| } |
| |
| key += next_entry->key(); |
| value += next_entry->value(); |
| current_entry = next_entry; |
| last_entry = next_entry; |
| |
| |
| // Don't update left_access_time if the current entry is |
| // not a content word. |
| // The time-stamp of non-content-word will be updated frequently. |
| // The time-stamp of the previous candidate is more trustful. |
| // It partially fixes the bug http://b/2843371. |
| const bool is_content_word = IsContentWord(current_entry->value()); |
| |
| if (is_content_word) { |
| left_last_access_time = current_entry->last_access_time(); |
| } |
| |
| // if left_most entry is a functional word (symbols/punctuations), |
| // we don't take it as a canonical candidate. |
| if (left_most_last_access_time == 0 && is_content_word) { |
| left_most_last_access_time = current_entry->last_access_time(); |
| } |
| } |
| |
| if (key.size() < input_key.size()) { |
| VLOG(3) << "Cannot find prefix match even after chain rules"; |
| return false; |
| } |
| |
| result = results->NewEntry(); |
| result->CopyFrom(*entry); |
| result->set_key(key); |
| result->set_value(value); |
| } else { |
| LOG(ERROR) << "Unknown match mode: " << mtype; |
| return false; |
| } |
| |
| DCHECK(result); |
| |
| // if prev entry is not NULL, check whether there is a bigram |
| // from |prev_entry| to |entry|. |
| result->set_bigram_boost(false); |
| |
| if (prev_entry != NULL && HasBigramEntry(*entry, *prev_entry)) { |
| // set bigram_boost flag so that this entry is boosted |
| // against LRU policy. |
| result->set_bigram_boost(true); |
| } |
| |
| results->Push(result); |
| |
| // Expand new entry which was input just after "last_entry" |
| if (last_entry != NULL && |
| Util::CharsLen(result->key()) >= 1 && |
| 2 * Util::CharsLen(input_key) >= Util::CharsLen(result->key())) { |
| const Entry *latest_entry = NULL; |
| const Entry *left_same_timestamp_entry = NULL; |
| const Entry *left_most_same_timestamp_entry = NULL; |
| for (int i = 0; i < last_entry->next_entries_size(); ++i) { |
| const Entry *tmp_entry = dic_->LookupWithoutInsert( |
| last_entry->next_entries(i).entry_fp()); |
| if (tmp_entry == NULL || tmp_entry->key().empty()) { |
| continue; |
| } |
| if (latest_entry == NULL || |
| latest_entry->last_access_time() < tmp_entry->last_access_time()) { |
| latest_entry = tmp_entry; |
| } |
| if (tmp_entry->last_access_time() == left_last_access_time) { |
| left_same_timestamp_entry = tmp_entry; |
| } |
| if (tmp_entry->last_access_time() == left_most_last_access_time) { |
| left_most_same_timestamp_entry = tmp_entry; |
| } |
| } |
| |
| const Entry *next_entry = left_most_same_timestamp_entry; |
| if (next_entry == NULL) { |
| next_entry = left_same_timestamp_entry; |
| } |
| if (next_entry == NULL) { |
| next_entry = latest_entry; |
| } |
| |
| // the new entry was input within 10 seconds. |
| // TODO(taku): This is a simple heuristics. |
| if (next_entry != NULL && !next_entry->key().empty() && |
| abs(static_cast<int32>(next_entry->last_access_time() - |
| last_entry->last_access_time())) <= 10 && |
| IsContentWord(next_entry->value())) { |
| Entry *result2 = results->NewEntry(); |
| result2->Clear(); |
| result2->CopyFrom(*result); |
| *(result2->mutable_value()) += next_entry->value(); |
| *(result2->mutable_key()) += next_entry->key(); |
| results->Push(result2); |
| } |
| } |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::Predict(Segments *segments) const { |
| if (!CheckSyncerAndDelete()) { |
| LOG(WARNING) << "Syncer is running"; |
| return false; |
| } |
| |
| if (GET_CONFIG(incognito_mode)) { |
| VLOG(2) << "incognito mode"; |
| return false; |
| } |
| |
| if (segments->request_type() == Segments::CONVERSION) { |
| VLOG(2) << "request type is CONVERSION"; |
| return false; |
| } |
| |
| if (!GET_CONFIG(use_history_suggest) && |
| segments->request_type() == Segments::SUGGESTION) { |
| VLOG(2) << "no history suggest"; |
| return false; |
| } |
| |
| if (segments->conversion_segments_size() < 1) { |
| VLOG(2) << "segment size < 1"; |
| return false; |
| } |
| |
| const DicElement *head = dic_->Head(); |
| if (head == NULL) { |
| VLOG(2) << "dic head is NULL"; |
| return false; |
| } |
| |
| const string &input_key = segments->conversion_segment(0).key(); |
| const size_t input_key_len = Util::CharsLen(input_key); |
| |
| bool zero_query_suggestion = false; |
| |
| |
| if (input_key_len == 0 && !zero_query_suggestion) { |
| VLOG(2) << "key length is 0"; |
| return false; |
| } |
| |
| if (segments->request_type() == Segments::SUGGESTION && |
| !zero_query_suggestion && |
| IsPunctuation(Util::SubString(input_key, 0, 1))) { |
| VLOG(2) << "input_key starts with punctuations"; |
| return false; |
| } |
| |
| // Do linear search |
| Segment *segment = segments->mutable_conversion_segment(0); |
| |
| if (segment == NULL) { |
| LOG(ERROR) << "segment is NULL"; |
| return false; |
| } |
| |
| const size_t history_segments_size = segments->history_segments_size(); |
| const Entry *prev_entry = NULL; |
| |
| // when threre are non-zero history segments, lookup an entry |
| // from the LRU dictionary, which is correspoinding to the last |
| // history segment. |
| if (history_segments_size > 0) { |
| const Segment &history_segment = |
| segments->history_segment(history_segments_size - 1); |
| // Simply lookup the history_segment. |
| prev_entry = dic_->LookupWithoutInsert( |
| SegmentFingerprint(history_segment)); |
| // When |prev_entry| is NULL or |prev_entry| has no valid next_entries, |
| // do linear-search over the LRU. |
| if ((prev_entry == NULL && history_segment.candidates_size() > 0) || |
| (prev_entry != NULL && prev_entry->next_entries_size() == 0)) { |
| const string &prev_value = prev_entry == NULL ? |
| history_segment.candidate(0).value : prev_entry->value(); |
| int trial = 0; |
| for (const DicElement *elm = head; |
| trial++ < kMaxPrevValueTrial && elm != NULL; elm = elm->next) { |
| const Entry *entry = &(elm->value); |
| // entry->value() equals to the prev_value or |
| // entry->value() is a SUFFIX of prev_value. |
| // length of entry->value() must be >= 2, as single-length |
| // match would be noisy. |
| if (IsValidEntry(*entry) && |
| entry != prev_entry && |
| entry->next_entries_size() > 0 && |
| Util::CharsLen(entry->value()) >= 2 && |
| (entry->value() == prev_value || |
| Util::EndsWith(prev_value, entry->value()))) { |
| prev_entry = entry; |
| break; |
| } |
| } |
| } |
| } |
| |
| if (input_key_len == 0 && prev_entry == NULL) { |
| VLOG(1) << "If input_key_len is 0, prev_entry must be set"; |
| return false; |
| } |
| |
| const size_t max_results_size = |
| 5 * segments->max_prediction_candidates_size(); |
| |
| int trial = 0; |
| EntryPriorityQueue results; |
| |
| for (const DicElement *elm = head; elm != NULL; elm = elm->next) { |
| if (!IsValidEntry(elm->value)) { |
| continue; |
| } |
| |
| if (segments->request_type() == Segments::SUGGESTION && |
| trial++ >= kMaxSuggestionTrial) { |
| VLOG(2) << "too many trials"; |
| break; |
| } |
| |
| // lookup input_key from elm_value and prev_entry. |
| // If a new entry is found, the entry is pushed to the results. |
| if (!LookupEntry(input_key, &(elm->value), prev_entry, &results)) { |
| continue; |
| } |
| |
| // already found enough results. |
| if (results.size() >= max_results_size) { |
| break; |
| } |
| } |
| |
| if (results.size() == 0) { |
| VLOG(2) << "no prefix match candiate is found."; |
| return false; |
| } |
| |
| while (segment->candidates_size() < |
| segments->max_prediction_candidates_size()) { |
| // |results| is a priority queue where the elemtnt |
| // in the queue is sorted by the score defined in GetScore(). |
| const Entry *result_entry = results.Pop(); |
| if (result_entry == NULL) { |
| // Pop() returns NULL when no more valid entry exists. |
| break; |
| } |
| bool is_valid_candidate = false; |
| if (segments->request_type() == Segments::PREDICTION) { |
| is_valid_candidate = true; |
| } else if (segments->request_type() == Segments::SUGGESTION) { |
| // The top result of suggestion should be a VALID suggestion candidate. |
| // i.e., SuggestionTrigerFunc should return true for the first |
| // candidate. |
| // If user types "デスノート" too many times, "デスノート" will be |
| // suggested when user types "で". It is expected, but if user types |
| // "です" after that, showing "デスノート" is annoying. |
| // In this situation, "です" is in the LRU, but SuggestionTrigerFunc |
| // returns false for "です", since it is short. |
| if (IsValidSuggestion(zero_query_suggestion, |
| input_key_len, *result_entry)) { |
| is_valid_candidate = true; |
| } else if (segment->candidates_size() == 0) { |
| VLOG(2) << "candidates size is 0"; |
| return false; |
| } |
| } else { |
| LOG(ERROR) << "Unknown mode"; |
| return false; |
| } |
| |
| if (!is_valid_candidate) { |
| VLOG(2) << "not a valid candidate: " << result_entry->key(); |
| continue; |
| } |
| |
| Segment::Candidate *candidate = segment->push_back_candidate(); |
| DCHECK(candidate); |
| candidate->Init(); |
| candidate->key = result_entry->key(); |
| candidate->content_key = result_entry->key(); |
| candidate->value = result_entry->value(); |
| candidate->content_value = result_entry->value(); |
| candidate->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION; |
| const string &description = result_entry->description(); |
| // If we have stored description, set it exactly. |
| if (!description.empty()) { |
| candidate->description = description; |
| candidate->attributes |= Segment::Candidate::NO_EXTRA_DESCRIPTION; |
| } else { |
| VariantsRewriter::SetDescriptionForPrediction(candidate); |
| } |
| } |
| |
| return (segment->candidates_size() > 0); |
| } |
| |
| void UserHistoryPredictor::InsertNextEntry( |
| const UserHistoryPredictor::NextEntry &next_entry, |
| UserHistoryPredictor::Entry *entry) const { |
| if (next_entry.entry_fp() == 0 || entry == NULL) { |
| return; |
| } |
| |
| NextEntry *target_next_entry = NULL; |
| |
| // If next_entries_size is less than kMaxNextEntriesSize, |
| // we simply allocate a new entry. |
| if (entry->next_entries_size() < max_next_entries_size()) { |
| target_next_entry = entry->add_next_entries(); |
| } else { |
| // Otherwise, find the oldest next_entry. |
| uint32 last_access_time = UINT_MAX; |
| for (int i = 0; i < entry->next_entries_size(); ++i) { |
| // already has the same id |
| if (next_entry.entry_fp() == entry->next_entries(i).entry_fp()) { |
| target_next_entry = entry->mutable_next_entries(i); |
| break; |
| } |
| const Entry *next_entry = dic_->LookupWithoutInsert( |
| entry->next_entries(i).entry_fp()); |
| // reuse the entry if it is already removed from the LRU. |
| if (next_entry == NULL) { |
| target_next_entry = entry->mutable_next_entries(i); |
| break; |
| } |
| // preserve the oldest entry |
| if (target_next_entry == NULL || |
| last_access_time > next_entry->last_access_time()) { |
| target_next_entry = entry->mutable_next_entries(i); |
| last_access_time = next_entry->last_access_time(); |
| } |
| } |
| } |
| |
| if (target_next_entry == NULL) { |
| LOG(ERROR) << "cannot find a room for inserting next fp"; |
| return; |
| } |
| |
| target_next_entry->CopyFrom(next_entry); |
| } |
| |
| bool UserHistoryPredictor::IsValidEntry(const Entry &entry) { |
| return !entry.removed() && entry.entry_type() == Entry::DEFAULT_ENTRY; |
| } |
| |
| void UserHistoryPredictor::InsertEvent(EntryType type) { |
| if (type == Entry::DEFAULT_ENTRY) { |
| return; |
| } |
| |
| const uint32 last_access_time = static_cast<uint32>(time(NULL)); |
| const uint32 dic_key = Fingerprint("", "", type); |
| |
| CHECK(dic_.get()); |
| DicElement *e = dic_->Insert(dic_key); |
| if (e == NULL) { |
| VLOG(2) << "insert failed"; |
| return; |
| } |
| |
| Entry *entry = &(e->value); |
| DCHECK(entry); |
| entry->Clear(); |
| entry->set_entry_type(type); |
| entry->set_last_access_time(last_access_time); |
| } |
| |
| void UserHistoryPredictor::Insert(const string &key, |
| const string &value, |
| const string &description, |
| bool is_suggestion_selected, |
| uint32 next_fp, |
| uint32 last_access_time, |
| Segments *segments) { |
| if (key.empty() || value.empty() || |
| key.size() > kMaxStringLength || |
| value.size() > kMaxStringLength || |
| description.size() > kMaxStringLength) { |
| return; |
| } |
| |
| const uint32 dic_key = Fingerprint(key, value); |
| |
| if (!dic_->HasKey(dic_key)) { |
| // the key is a new key inserted in the last Finish method. |
| // Here we push a new RevertEntry so that the new "key" can be |
| // removed when Revert() method is called. |
| Segments::RevertEntry *revert_entry = segments->push_back_revert_entry(); |
| DCHECK(revert_entry); |
| revert_entry->key = Uint32ToString(dic_key); |
| revert_entry->id = UserHistoryPredictor::revert_id(); |
| revert_entry->revert_entry_type = Segments::RevertEntry::CREATE_ENTRY; |
| } else { |
| // the key is a old key not inserted in the last Finish method |
| // TODO(taku): |
| // add a treatment for UPDATE_ENTRY mode |
| } |
| |
| DicElement *e = dic_->Insert(dic_key); |
| if (e == NULL) { |
| VLOG(2) << "insert failed"; |
| return; |
| } |
| |
| Entry *entry = &(e->value); |
| DCHECK(entry); |
| |
| entry->set_key(key); |
| entry->set_value(value); |
| entry->set_removed(false); |
| |
| if (description.empty()) { |
| entry->clear_description(); |
| } else { |
| entry->set_description(description); |
| } |
| |
| entry->set_last_access_time(last_access_time); |
| if (is_suggestion_selected) { |
| entry->set_suggestion_freq(entry->suggestion_freq() + 1); |
| } else { |
| entry->set_conversion_freq(entry->conversion_freq() + 1); |
| } |
| |
| // Insert next_fp to the entry |
| if (next_fp != 0) { |
| NextEntry next_entry; |
| next_entry.set_entry_fp(next_fp); |
| InsertNextEntry(next_entry, entry); |
| } |
| |
| VLOG(2) << key << " " << value << " has inserted: " |
| << entry->suggestion_freq() << " " |
| << entry->conversion_freq(); |
| |
| // new entry is inserted to the cache |
| updated_ = true; |
| } |
| |
| void UserHistoryPredictor::Finish(Segments *segments) { |
| if (GET_CONFIG(incognito_mode)) { |
| VLOG(2) << "incognito mode"; |
| return; |
| } |
| |
| if (!GET_CONFIG(use_history_suggest)) { |
| VLOG(2) << "no history suggest"; |
| return; |
| } |
| |
| if (!CheckSyncerAndDelete()) { |
| LOG(WARNING) << "Syncer is running"; |
| return; |
| } |
| |
| const bool kInsertSuggestion = true; |
| const bool kInsertConversion = false; |
| |
| const uint32 last_access_time = static_cast<uint32>(time(NULL)); |
| |
| // If user inputs a punctuation just after some long sentence, |
| // we make a new candidate by concatinating the top element in LRU and |
| // the punctuation user input. The top element in LRU is supposed to be |
| // the long sentence user input before. |
| // This is a fix for http://b/issue?id=2216838 |
| if (dic_->Head() != NULL && |
| segments->conversion_segments_size() == 1 && |
| segments->history_segments_size() > 0 && |
| segments->conversion_segment(0).candidates_size() > 0 && |
| segments->history_segment( |
| segments->history_segments_size() - 1).candidates_size() > 0 && |
| Util::CharsLen( |
| segments->conversion_segment(0).candidate(0).value) == 1 && |
| IsPunctuation( |
| segments->conversion_segment(0).candidate(0).value) && |
| dic_->Head()->value.last_access_time() + 5 > last_access_time) { |
| const Entry *entry = &(dic_->Head()->value); |
| DCHECK(entry); |
| const string &last_value = |
| segments->history_segment( |
| segments->history_segments_size() - 1).candidate(0).value; |
| // Check that value in head element of LRU ends with the candidate value |
| // in history segments. |
| if (last_value.size() <= entry->value().size() && |
| entry->value().substr(entry->value().size() - last_value.size(), |
| last_value.size()) == last_value) { |
| const Segment::Candidate &candidate = |
| segments->conversion_segment(0).candidate(0); |
| const string key = entry->key() + candidate.key; |
| const string value = entry->value() + candidate.value; |
| // use the same last_access_time stored in the top element |
| // so that this item can be grouped together. |
| Insert(key, value, entry->description(), kInsertConversion, 0, |
| entry->last_access_time(), segments); |
| } |
| } |
| |
| string all_key, all_value; |
| |
| if (segments->request_type() == Segments::CONVERSION) { |
| const size_t history_segments_size = segments->history_segments_size(); |
| |
| // Check every segment is valid. |
| for (size_t i = history_segments_size; i < segments->segments_size(); ++i) { |
| const Segment &segment = segments->segment(i); |
| if (segment.candidates_size() < 1) { |
| VLOG(2) << "candidates size < 1"; |
| return; |
| } |
| if (segment.segment_type() != Segment::FIXED_VALUE) { |
| VLOG(2) << "segment is not FIXED_VALUE"; |
| return; |
| } |
| const Segment::Candidate &candidate = segment.candidate(0); |
| if (candidate.attributes & |
| Segment::Candidate::NO_SUGGEST_LEARNING) { |
| VLOG(2) << "NO_SUGGEST_LEARNING"; |
| return; |
| } |
| } |
| |
| if (IsPrivacySensitive(segments)) { |
| VLOG(2) << "do not remember privacy sensitive input"; |
| return; |
| } |
| |
| set<uint64> seen; |
| bool this_was_seen = false; |
| for (size_t i = history_segments_size; i < segments->segments_size(); ++i) { |
| const Segment &segment = segments->segment(i); |
| all_key += segment.key(); |
| all_value += segment.candidate(0).value; |
| uint32 next_fp = (i == segments->segments_size() - 1) ? |
| 0 : SegmentFingerprint(segments->segment(i + 1)); |
| // remember the first segment |
| if (i == history_segments_size) { |
| seen.insert(SegmentFingerprint(segment)); |
| } |
| uint32 next_fp_to_set = next_fp; |
| // If two duplicate segments exist, kills the link |
| // TO/FROM the second one to prevent loops. |
| // Only killing "TO" link caused bug #2982886: |
| // after converting "らいおん(もうじゅう)とぞうりむし(びせいぶつ)" |
| // and typing "ぞうりむし", "ゾウリムシ(猛獣" was suggested. |
| if (this_was_seen) { |
| next_fp_to_set = 0; |
| } |
| if (!seen.insert(next_fp).second) { |
| next_fp_to_set = 0; |
| this_was_seen = true; |
| } else { |
| this_was_seen = false; |
| } |
| Insert(segment.key(), segment.candidate(0).value, |
| segment.candidate(0).description, |
| kInsertConversion, next_fp_to_set, |
| last_access_time, segments); |
| } |
| |
| // insert all_key/all_value |
| if (segments->conversion_segments_size() > 1 && |
| !all_key.empty() && !all_value.empty()) { |
| Insert(all_key, all_value, "", |
| kInsertConversion, |
| 0, last_access_time, segments); |
| } |
| |
| } else { |
| // Store user-history from predictoin |
| if (segments->conversion_segments_size() < 1) { |
| return; |
| } |
| const Segment &segment = segments->conversion_segment(0); |
| if (segment.candidates_size() < 1) { |
| return; |
| } |
| |
| all_key = segment.candidate(0).key; |
| all_value = segment.candidate(0).value; |
| const string &description = segment.candidate(0).description; |
| |
| Insert(all_key, all_value, description, |
| kInsertSuggestion, 0, last_access_time, segments); |
| } |
| |
| // make a link from the right most history_segment to |
| // the left most segment or entire user input. |
| if (segments->history_segments_size() > 0 && |
| segments->conversion_segments_size() > 0) { |
| Entry *history_entry = |
| dic_->MutableLookupWithoutInsert( |
| SegmentFingerprint( |
| segments->segment( |
| segments->history_segments_size() - 1))); |
| |
| NextEntry next_entry; |
| if (segments->request_type() == Segments::CONVERSION) { |
| next_entry.set_entry_fp( |
| SegmentFingerprint(segments->conversion_segment(0))); |
| InsertNextEntry(next_entry, history_entry); |
| } |
| |
| // entire user input or SUGGESTION |
| if (segments->request_type() != Segments::CONVERSION || |
| segments->conversion_segments_size() > 1) { |
| next_entry.set_entry_fp(Fingerprint(all_key, all_value)); |
| InsertNextEntry(next_entry, history_entry); |
| } |
| } |
| |
| return; |
| } |
| |
| void UserHistoryPredictor::Revert(Segments *segments) { |
| if (!CheckSyncerAndDelete()) { |
| LOG(WARNING) << "Syncer is running"; |
| return; |
| } |
| |
| for (size_t i = 0; i < segments->revert_entries_size(); ++i) { |
| const Segments::RevertEntry &revert_entry = |
| segments->revert_entry(i); |
| if (revert_entry.id == UserHistoryPredictor::revert_id() && |
| revert_entry.revert_entry_type == Segments::RevertEntry::CREATE_ENTRY) { |
| VLOG(2) << "Erasing the key: " << StringToUint32(revert_entry.key); |
| dic_->Erase(StringToUint32(revert_entry.key)); |
| } |
| } |
| } |
| |
| // type |
| UserHistoryPredictor::MatchType |
| UserHistoryPredictor::GetMatchType(const string &lstr, const string &rstr) { |
| if (lstr.empty() && !rstr.empty()) { |
| return LEFT_EMPTY_MATCH; |
| } |
| |
| const size_t size = min(lstr.size(), rstr.size()); |
| if (size == 0) { |
| return NO_MATCH; |
| } |
| |
| const int result = memcmp(lstr.data(), rstr.data(), size); |
| if (result != 0) { |
| return NO_MATCH; |
| } |
| |
| if (lstr.size() == rstr.size()) { |
| return EXACT_MATCH; |
| } else if (lstr.size() < rstr.size()) { |
| return LEFT_PREFIX_MATCH; |
| } else { |
| return RIGHT_PREFIX_MATCH; |
| } |
| |
| return NO_MATCH; |
| } |
| |
| // static |
| uint32 UserHistoryPredictor::Fingerprint(const string &key, |
| const string &value, |
| EntryType type) { |
| if (type == Entry::DEFAULT_ENTRY) { |
| // Since we have already used the fingerprint function for next entries and |
| // next entries are saved in user's local machine, we are not able |
| // to change the Fingerprint function for the old key/value type. |
| return Util::Fingerprint32(key + kDelimiter + value); |
| } else { |
| uint8 id = static_cast<uint8>(type); |
| return Util::Fingerprint32(reinterpret_cast<char *>(&id), sizeof(id)); |
| } |
| } |
| |
| uint32 UserHistoryPredictor::Fingerprint(const string &key, |
| const string &value) { |
| return Fingerprint(key, value, Entry::DEFAULT_ENTRY); |
| } |
| |
| // static |
| uint32 UserHistoryPredictor::EntryFingerprint( |
| const UserHistoryPredictor::Entry &entry) { |
| return Fingerprint(entry.key(), entry.value()); |
| } |
| |
| // static |
| uint32 UserHistoryPredictor::SegmentFingerprint(const Segment &segment) { |
| if (segment.candidates_size() > 0) { |
| return Fingerprint(segment.candidate(0).key, segment.candidate(0).value); |
| } |
| return 0; |
| } |
| |
| // static |
| string UserHistoryPredictor::Uint32ToString(uint32 fp) { |
| string buf(reinterpret_cast<const char *>(&fp), sizeof(fp)); |
| return buf; |
| } |
| |
| // static |
| uint32 UserHistoryPredictor::StringToUint32(const string &input) { |
| uint32 result = 0; |
| if (input.size() == sizeof(result)) { |
| memcpy(reinterpret_cast<char *>(&result), input.data(), input.size()); |
| } |
| return result; |
| } |
| |
| // static |
| bool UserHistoryPredictor::IsValidSuggestion( |
| bool is_zero_query_suggestion, |
| uint32 prefix_len, |
| const UserHistoryPredictor::Entry &entry) { |
| // when bigram_boost is true, that means that previous user input |
| // and current input have bigram relation. |
| if (entry.bigram_boost()) { |
| return true; |
| } |
| if (is_zero_query_suggestion) { |
| return true; |
| } |
| // Handle suggestion_freq and conversion_freq differently. |
| // conversion_freq is less aggressively affecting to the final decision. |
| const uint32 freq = max(entry.suggestion_freq(), |
| entry.conversion_freq() / 4); |
| |
| // TODO(taku,komatsu): better to make it simpler and easier to be understood. |
| const uint32 base_prefix_len = 3 - min(static_cast<uint32>(2), freq); |
| return (prefix_len >= base_prefix_len); |
| } |
| |
| // static |
| // 1) sort by last_access_time, which is basically the same as LRU policy. |
| // 2) boost shorter candidate, if having the same last_access_time. |
| // 3) add a bigram boost as a special bonus. |
| // TODO(taku): better to take "frequency" into consideration |
| uint32 UserHistoryPredictor::GetScore( |
| const UserHistoryPredictor::Entry &entry) { |
| const uint32 kBigramBoostAsTime = 7 * 24 * 60 * 60; // 1 week. |
| return |
| entry.last_access_time() - Util::CharsLen(entry.value()) + |
| (entry.bigram_boost() ? kBigramBoostAsTime : 0); |
| } |
| |
| // return the size of cache. |
| // staitc |
| uint32 UserHistoryPredictor::cache_size() { |
| return kLRUCacheSize; |
| } |
| |
| // return the size of next entries. |
| // static |
| uint32 UserHistoryPredictor::max_next_entries_size() { |
| return kMaxNextEntriesSize; |
| } |
| |
| } // namespace mozc |