blob: 6f798461650cb1442df8a95350741343aa3753d5 [file] [log] [blame]
// Copyright 2010-2011, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "prediction/user_history_predictor.h"
#include <time.h>
#include <algorithm>
#include <climits>
#include <string>
#include "base/base.h"
#include "base/encryptor.h"
#include "base/mmap.h"
#include "base/init.h"
#include "base/thread.h"
#include "base/util.h"
#include "base/password_manager.h"
#include "base/config_file_stream.h"
#include "converter/segments.h"
#include "prediction/predictor_interface.h"
#include "prediction/user_history_predictor.pb.h"
#include "rewriter/variants_rewriter.h"
#include "storage/lru_cache.h"
#include "session/commands.pb.h"
#include "session/config_handler.h"
#include "session/config.pb.h"
#include "usage_stats/usage_stats.h"
namespace mozc {
namespace {
// find suggestion candidates from the most recent 3000 history in LRU.
// We don't check all history, since suggestion is called every key event
const size_t kMaxSuggestionTrial = 3000;
// find suffix matches of history_segments from the most recent 500 histories
// in LRU.
const size_t kMaxPrevValueTrial = 500;
// cache size
const size_t kLRUCacheSize = 10000;
// don't save key/value that are
// longer than kMaxCandidateSize to avoid memory explosion
const size_t kMaxStringLength = 256;
// salt size for encryption
const size_t kSaltSize = 32;
// maximum size of next_entries
const size_t kMaxNextEntriesSize = 4;
// 64Mbyte
// Maximum file size for history
const size_t kMaxFileSize = 64 * 1024 * 1024;
// revert id for user_history_predictor
const uint16 kRevertId = 1;
// default object pool size for EntryPriorityQueue
const size_t kEntryPoolSize = 16;
// file name for the history
#ifdef OS_WINDOWS
const char kFileName[] = "user://history.db";
#else
const char kFileName[] = "user://.history.db";
#endif
// use '\t' as a key/value delimiter
const char kDelimiter[] = "\t";
bool IsPunctuation(const string &value) {
// return (value == "。" || value == "." ||
// value == "、" || value == "," ||
// value == "?" || value == "?" ||
// value == "," || value == ".");
return (value == "\xE3\x80\x82" || value == "." ||
value == "\xE3\x80\x81" || value == "," ||
value == "\xEF\xBC\x9F" || value == "?" ||
value == "\xEF\xBC\x8C" || value == "\xEF\xBC\x8E");
}
// return true if value looks like a content word.
// Currently, just checks the script type.
bool IsContentWord(const string &value) {
return Util::CharsLen(value) > 1 ||
Util::GetScriptType(value) != Util::UNKNOWN_SCRIPT;
}
// return true if prev_entry has a next_fp link to entry
bool HasBigramEntry(const UserHistoryPredictor::Entry &entry,
const UserHistoryPredictor::Entry &prev_entry) {
const uint32 fp = UserHistoryPredictor::EntryFingerprint(entry);
for (int i = 0; i < prev_entry.next_entries_size(); ++i) {
if (fp == prev_entry.next_entries(i).entry_fp()) {
return true;
}
}
return false;
}
// Returns true if the input first candidate seems to be a privacy sensitive
// such like password
bool IsPrivacySensitive(const Segments *segments) {
// This is a very restricted rule and it will be relaxed.
if (segments->conversion_segments_size() != 1) {
return false;
}
const Segment &first_segment = segments->conversion_segment(0);
const string &first_candidate_value = first_segment.candidate(0).value;
// if key looks like hiragana, the candidate is Katakana to English
// transliteration. Don't suppress transliterated candidates.
// http://b/issue?id=4394325
return (Util::GetCharacterSet(first_candidate_value) == Util::ASCII &&
(Util::ContainsScriptType(first_segment.key(), Util::ALPHABET) ||
Util::GetScriptType(first_segment.key()) == Util::NUMBER));
}
} // namespace
UserHistoryStorage::UserHistoryStorage(const string &filename)
: filename_(filename) {}
UserHistoryStorage::~UserHistoryStorage() {}
string UserHistoryStorage::filename() const {
return filename_;
}
bool UserHistoryStorage::Load() {
string input, salt;
// read encrypted message and salt from local file
{
Mmap<char> mmap;
if (!mmap.Open(filename_.c_str(), "r")) {
LOG(ERROR) << "cannot open user history file";
return false;
}
if (mmap.GetFileSize() < kSaltSize) {
LOG(ERROR) << "file size is too small";
return false;
}
if (mmap.GetFileSize() > kMaxFileSize) {
LOG(ERROR) << "file size is too big.";
return false;
}
// copy salt
char tmp[kSaltSize];
memcpy(tmp, mmap.begin(), kSaltSize);
salt.assign(tmp, kSaltSize);
// copy body
input.assign(mmap.begin() + kSaltSize,
mmap.GetFileSize() - kSaltSize);
}
string password;
if (!PasswordManager::GetPassword(&password)) {
LOG(ERROR) << "PasswordManager::GetPassword() failed";
return false;
}
if (password.empty()) {
LOG(ERROR) << "password is empty";
return false;
}
// Decrypt message
Encryptor::Key key;
if (!key.DeriveFromPassword(password, salt)) {
LOG(ERROR) << "Encryptor::Key::DeriveFromPassword failed";
return false;
}
if (!Encryptor::DecryptString(key, &input)) {
LOG(ERROR) << "Encryptor::DecryptString() failed";
return false;
}
if (!ParseFromString(input)) {
LOG(ERROR) << "ParseFromString failed. message looks broken";
return false;
}
VLOG(1) << "Loaded user histroy, size=" << entries_size();
return true;
}
bool UserHistoryStorage::Save() const {
string salt, output;
{
if (entries_size() == 0) {
LOG(WARNING) << "etries size is 0. Not saved";
return false;
}
if (!AppendToString(&output)) {
LOG(ERROR) << "AppendToString failed";
return false;
}
}
{
string password;
if (!PasswordManager::GetPassword(&password)) {
LOG(ERROR) << "PasswordManager::GetPassword() failed";
return false;
}
if (password.empty()) {
LOG(ERROR) << "password is empty";
return false;
}
char tmp[kSaltSize];
memset(tmp, '\0', sizeof(tmp));
Util::GetSecureRandomSequence(tmp, sizeof(tmp));
salt.assign(tmp, sizeof(tmp));
Encryptor::Key key;
if (!key.DeriveFromPassword(password, salt)) {
LOG(ERROR) << "Encryptor::Key::DeriveFromPassword() failed";
return false;
}
if (!Encryptor::EncryptString(key, &output)) {
LOG(ERROR) << "Encryptor::EncryptString() failed";
return false;
}
}
// Even if histoy is empty, save to them into a file to
// make the file empty
const string tmp_filename = filename_ + ".tmp";
{
OutputFileStream ofs(tmp_filename.c_str(), ios::out | ios::binary);
if (!ofs) {
LOG(ERROR) << "failed to write: " << tmp_filename;
return false;
}
VLOG(1) << "Syncing user history to: " << filename_;
ofs.write(salt.data(), salt.size());
ofs.write(output.data(), output.size());
}
if (!Util::AtomicRename(tmp_filename, filename_)) {
LOG(ERROR) << "AtomicRename failed";
}
#ifdef OS_WINDOWS
wstring wfilename;
Util::UTF8ToWide(filename_.c_str(), &wfilename);
if (!::SetFileAttributes(wfilename.c_str(),
FILE_ATTRIBUTE_HIDDEN |
FILE_ATTRIBUTE_SYSTEM)) {
LOG(ERROR) << "Cannot make hidden: " << filename_
<< " " << ::GetLastError();
}
#endif
return true;
}
UserHistoryPredictor::EntryPriorityQueue::EntryPriorityQueue()
: pool_(kEntryPoolSize) {}
UserHistoryPredictor::EntryPriorityQueue::~EntryPriorityQueue() {}
bool UserHistoryPredictor::EntryPriorityQueue::Push(Entry *entry) {
DCHECK(entry);
if (!seen_.insert(Util::Fingerprint32(entry->value())).second) {
VLOG(2) << "found dups";
return false;
}
const uint32 score = UserHistoryPredictor::GetScore(*entry);
agenda_.push(make_pair(score, entry));
return true;
}
UserHistoryPredictor::Entry *
UserHistoryPredictor::EntryPriorityQueue::Pop() {
if (agenda_.empty()) {
return NULL;
}
const QueueElement &element = agenda_.top();
Entry *result = element.second;
DCHECK(result);
agenda_.pop();
return result;
}
UserHistoryPredictor::Entry *
UserHistoryPredictor::EntryPriorityQueue::NewEntry() {
return pool_.Alloc();
}
class UserHistoryPredictorSyncer : public Thread {
public:
enum RequestType {
LOAD,
SAVE
};
UserHistoryPredictorSyncer(UserHistoryPredictor *predictor,
RequestType type)
: predictor_(predictor), type_(type) {
DCHECK(predictor_);
}
virtual void Run() {
switch (type_) {
case LOAD:
VLOG(1) << "Executing Reload method";
predictor_->Load();
break;
case SAVE:
VLOG(1) << "Executing Sync method";
predictor_->Save();
break;
default:
LOG(ERROR) << "Unknown request: " << static_cast<int>(type_);
}
}
virtual ~UserHistoryPredictorSyncer() {
Join();
}
UserHistoryPredictor *predictor_;
RequestType type_;
};
UserHistoryPredictor::UserHistoryPredictor()
: updated_(false), dic_(new DicCache(UserHistoryPredictor::cache_size())) {
AsyncLoad(); // non-blocking
// Load() blocking version can be used if any
}
UserHistoryPredictor::~UserHistoryPredictor() {
// In destructor, must call blocking version
WaitForSyncer();
Save(); // blocking
}
string UserHistoryPredictor::GetUserHistoryFileName() {
return ConfigFileStream::GetFileName(kFileName);
}
// return revert id
uint16 UserHistoryPredictor::revert_id() {
return kRevertId;
}
void UserHistoryPredictor::WaitForSyncer() {
if (syncer_.get() != NULL) {
syncer_->Join();
syncer_.reset(NULL);
}
}
bool UserHistoryPredictor::CheckSyncerAndDelete() const {
if (syncer_.get() != NULL) {
if (syncer_->IsRunning()) {
return false;
} else {
syncer_.reset(NULL); // remove
}
}
return true;
}
bool UserHistoryPredictor::Sync() {
return AsyncSave();
// return Save(); blocking version
}
bool UserHistoryPredictor::Reload() {
WaitForSyncer();
return AsyncLoad();
}
bool UserHistoryPredictor::AsyncLoad() {
if (!CheckSyncerAndDelete()) { // now loading/saving
return true;
}
syncer_.reset(new UserHistoryPredictorSyncer(
this,
UserHistoryPredictorSyncer::LOAD));
syncer_->Start();
return true;
}
bool UserHistoryPredictor::AsyncSave() {
if (!updated_) {
return true;
}
if (!CheckSyncerAndDelete()) { // now loading/saving
return true;
}
syncer_.reset(new UserHistoryPredictorSyncer(
this,
UserHistoryPredictorSyncer::SAVE));
syncer_->Start();
return true;
}
bool UserHistoryPredictor::Load() {
const string filename = GetUserHistoryFileName();
UserHistoryStorage history(filename);
if (!history.Load()) {
LOG(ERROR) << "UserHistoryStorage::Load() failed";
return false;
}
for (size_t i = 0; i < history.entries_size(); ++i) {
dic_->Insert(EntryFingerprint(history.entries(i)),
history.entries(i));
}
VLOG(1) << "Loaded user histroy, size=" << history.entries_size();
return true;
}
bool UserHistoryPredictor::Save() {
if (!updated_) {
return true;
}
if (GET_CONFIG(incognito_mode)) {
VLOG(2) << "incognito mode";
return true;
}
if (!GET_CONFIG(use_history_suggest)) {
VLOG(2) << "no history suggest";
return true;
}
const DicElement *tail = dic_->Tail();
if (tail == NULL) {
return true;
}
const string filename = GetUserHistoryFileName();
UserHistoryStorage history(filename);
for (const DicElement *elm = tail; elm != NULL; elm = elm->prev) {
history.add_entries()->CopyFrom(elm->value);
}
// update usage stats here.
usage_stats::UsageStats::SetInteger(
"UserHistoryPredictorEntrySize",
static_cast<int>(history.entries_size()));
if (!history.Save()) {
LOG(ERROR) << "UserHistoryStorage::Save() failed";
return false;
}
updated_ = false;
return true;
}
bool UserHistoryPredictor::ClearAllHistory() {
// Wait until syncer finishes
WaitForSyncer();
VLOG(1) << "Clearing user prediction";
// renew DicCache as LRUCache tries to reuse the internal value by
// using FreeList
dic_.reset(new DicCache(UserHistoryPredictor::cache_size()));
// insert a dummy event entry.
InsertEvent(Entry::CLEAN_ALL_EVENT);
updated_ = true;
Sync();
return true;
}
bool UserHistoryPredictor::ClearUnusedHistory() {
// Wait until syncer finishes
WaitForSyncer();
VLOG(1) << "Clearing unused prediction";
const DicElement *head = dic_->Head();
if (head == NULL) {
VLOG(2) << "dic head is NULL";
return false;
}
vector<uint32> keys;
for (const DicElement *elm = head; elm != NULL; elm = elm->next) {
VLOG(3) << elm->key << " " << elm->value.suggestion_freq();
if (elm->value.suggestion_freq() == 0) {
keys.push_back(elm->key);
}
}
for (size_t i = 0; i < keys.size(); ++i) {
VLOG(2) << "Removing: " << keys[i];
if (!dic_->Erase(keys[i])) {
LOG(ERROR) << "cannot erase " << keys[i];
}
}
// insert a dummy event entry.
InsertEvent(Entry::CLEAN_UNUSED_EVENT);
updated_ = true;
Sync();
VLOG(1) << keys.size() << " removed";
return true;
}
bool UserHistoryPredictor::LookupEntry(
const string &input_key,
const UserHistoryPredictor::Entry *entry,
const UserHistoryPredictor::Entry *prev_entry,
EntryPriorityQueue *results) const {
CHECK(entry);
CHECK(results);
Entry *result = NULL;
const Entry *last_entry = NULL;
// last_access_time of the left-closest content word.
uint32 left_last_access_time = 0;
// last_access_time of the left-most content word.
uint32 left_most_last_access_time = 0;
// Example: [a|B|c|D]
// a,c: functional word
// B,D: content word
// left_last_access_time: timestamp of D
// left_most_last_access_time: timestamp of B
// |input_key| is a query user is now typing.
// |entry->key()| is a target value saved in the database.
const MatchType mtype = GetMatchType(input_key, entry->key());
if (mtype == NO_MATCH) {
return false;
} else if (mtype == LEFT_EMPTY_MATCH) { // zero-query-suggestion
// if |input_key| is empty, the |prev_entry| and |entry| must
// have bigram relation.
if (prev_entry != NULL && HasBigramEntry(*entry, *prev_entry)) {
result = results->NewEntry();
result->Clear();
result->CopyFrom(*entry);
last_entry = entry;
left_last_access_time = entry->last_access_time();
left_most_last_access_time = IsContentWord(entry->value()) ?
left_last_access_time : 0;
} else {
return false;
}
} else if (mtype == LEFT_PREFIX_MATCH) {
// |input_key| is shorter than |entry->key()|
// This scenario is a simple prefix match.
// e.g., |input_key|="foo", |entry->key()|="foobar"
result = results->NewEntry();
result->Clear();
result->CopyFrom(*entry);
last_entry = entry;
left_last_access_time = entry->last_access_time();
left_most_last_access_time = IsContentWord(entry->value()) ?
left_last_access_time : 0;
} else if (mtype == RIGHT_PREFIX_MATCH || mtype == EXACT_MATCH) {
// |input_key| is longer than or the same as |entry->key()|.
// In this case, recursively traverse "next_entries" until
// target entry gets longer than input_key.
// e.g., |input_key|="foobar", |entry->key()|="foo"
left_last_access_time = entry->last_access_time();
left_most_last_access_time = IsContentWord(entry->value()) ?
left_last_access_time : 0;
string key = entry->key();
string value = entry->value();
const Entry *current_entry = entry;
set<uint64> seen;
seen.insert(EntryFingerprint(*current_entry));
// Until target entry gets longer than input_key.
while (key.size() <= input_key.size()) {
const Entry *latest_entry = NULL;
const Entry *left_same_timestamp_entry = NULL;
const Entry *left_most_same_timestamp_entry = NULL;
for (size_t i = 0; i < current_entry->next_entries_size(); ++i) {
const Entry *tmp_entry = dic_->LookupWithoutInsert(
current_entry->next_entries(i).entry_fp());
if (tmp_entry == NULL || tmp_entry->key().empty()) {
continue;
}
const MatchType mtype2 = GetMatchType(key + tmp_entry->key(),
input_key);
if (mtype2 == NO_MATCH || mtype2 == LEFT_EMPTY_MATCH) {
continue;
}
if (latest_entry == NULL ||
latest_entry->last_access_time() < tmp_entry->last_access_time()) {
latest_entry = tmp_entry;
}
if (tmp_entry->last_access_time() == left_last_access_time) {
left_same_timestamp_entry = tmp_entry;
}
if (tmp_entry->last_access_time() == left_most_last_access_time) {
left_most_same_timestamp_entry = tmp_entry;
}
}
// Prefer bigrams which are generated at the same time.
// When last_access_time are the same, these two bigrams were
// input together.
// The preferences:
// (1). The current entry's time stamp is equal to that of
// left most content word
// (2). The current entry's time stamp is equal to that of
// left closest content word
// (3). The current entry is the latest
const Entry *next_entry = left_most_same_timestamp_entry;
if (next_entry == NULL) {
next_entry = left_same_timestamp_entry;
}
if (next_entry == NULL) {
next_entry = latest_entry;
}
if (next_entry == NULL || next_entry->key().empty()) {
break;
}
// if duplicate entry is found, don't expand more.
// This is because an entry only has one timestamp.
// we cannot trust the timestamp if there are duplicate values
// in one input.
if (!seen.insert(EntryFingerprint(*next_entry)).second) {
break;
}
key += next_entry->key();
value += next_entry->value();
current_entry = next_entry;
last_entry = next_entry;
// Don't update left_access_time if the current entry is
// not a content word.
// The time-stamp of non-content-word will be updated frequently.
// The time-stamp of the previous candidate is more trustful.
// It partially fixes the bug http://b/2843371.
const bool is_content_word = IsContentWord(current_entry->value());
if (is_content_word) {
left_last_access_time = current_entry->last_access_time();
}
// if left_most entry is a functional word (symbols/punctuations),
// we don't take it as a canonical candidate.
if (left_most_last_access_time == 0 && is_content_word) {
left_most_last_access_time = current_entry->last_access_time();
}
}
if (key.size() < input_key.size()) {
VLOG(3) << "Cannot find prefix match even after chain rules";
return false;
}
result = results->NewEntry();
result->CopyFrom(*entry);
result->set_key(key);
result->set_value(value);
} else {
LOG(ERROR) << "Unknown match mode: " << mtype;
return false;
}
DCHECK(result);
// if prev entry is not NULL, check whether there is a bigram
// from |prev_entry| to |entry|.
result->set_bigram_boost(false);
if (prev_entry != NULL && HasBigramEntry(*entry, *prev_entry)) {
// set bigram_boost flag so that this entry is boosted
// against LRU policy.
result->set_bigram_boost(true);
}
results->Push(result);
// Expand new entry which was input just after "last_entry"
if (last_entry != NULL &&
Util::CharsLen(result->key()) >= 1 &&
2 * Util::CharsLen(input_key) >= Util::CharsLen(result->key())) {
const Entry *latest_entry = NULL;
const Entry *left_same_timestamp_entry = NULL;
const Entry *left_most_same_timestamp_entry = NULL;
for (int i = 0; i < last_entry->next_entries_size(); ++i) {
const Entry *tmp_entry = dic_->LookupWithoutInsert(
last_entry->next_entries(i).entry_fp());
if (tmp_entry == NULL || tmp_entry->key().empty()) {
continue;
}
if (latest_entry == NULL ||
latest_entry->last_access_time() < tmp_entry->last_access_time()) {
latest_entry = tmp_entry;
}
if (tmp_entry->last_access_time() == left_last_access_time) {
left_same_timestamp_entry = tmp_entry;
}
if (tmp_entry->last_access_time() == left_most_last_access_time) {
left_most_same_timestamp_entry = tmp_entry;
}
}
const Entry *next_entry = left_most_same_timestamp_entry;
if (next_entry == NULL) {
next_entry = left_same_timestamp_entry;
}
if (next_entry == NULL) {
next_entry = latest_entry;
}
// the new entry was input within 10 seconds.
// TODO(taku): This is a simple heuristics.
if (next_entry != NULL && !next_entry->key().empty() &&
abs(static_cast<int32>(next_entry->last_access_time() -
last_entry->last_access_time())) <= 10 &&
IsContentWord(next_entry->value())) {
Entry *result2 = results->NewEntry();
result2->Clear();
result2->CopyFrom(*result);
*(result2->mutable_value()) += next_entry->value();
*(result2->mutable_key()) += next_entry->key();
results->Push(result2);
}
}
return true;
}
bool UserHistoryPredictor::Predict(Segments *segments) const {
if (!CheckSyncerAndDelete()) {
LOG(WARNING) << "Syncer is running";
return false;
}
if (GET_CONFIG(incognito_mode)) {
VLOG(2) << "incognito mode";
return false;
}
if (segments->request_type() == Segments::CONVERSION) {
VLOG(2) << "request type is CONVERSION";
return false;
}
if (!GET_CONFIG(use_history_suggest) &&
segments->request_type() == Segments::SUGGESTION) {
VLOG(2) << "no history suggest";
return false;
}
if (segments->conversion_segments_size() < 1) {
VLOG(2) << "segment size < 1";
return false;
}
const DicElement *head = dic_->Head();
if (head == NULL) {
VLOG(2) << "dic head is NULL";
return false;
}
const string &input_key = segments->conversion_segment(0).key();
const size_t input_key_len = Util::CharsLen(input_key);
bool zero_query_suggestion = false;
if (input_key_len == 0 && !zero_query_suggestion) {
VLOG(2) << "key length is 0";
return false;
}
if (segments->request_type() == Segments::SUGGESTION &&
!zero_query_suggestion &&
IsPunctuation(Util::SubString(input_key, 0, 1))) {
VLOG(2) << "input_key starts with punctuations";
return false;
}
// Do linear search
Segment *segment = segments->mutable_conversion_segment(0);
if (segment == NULL) {
LOG(ERROR) << "segment is NULL";
return false;
}
const size_t history_segments_size = segments->history_segments_size();
const Entry *prev_entry = NULL;
// when threre are non-zero history segments, lookup an entry
// from the LRU dictionary, which is correspoinding to the last
// history segment.
if (history_segments_size > 0) {
const Segment &history_segment =
segments->history_segment(history_segments_size - 1);
// Simply lookup the history_segment.
prev_entry = dic_->LookupWithoutInsert(
SegmentFingerprint(history_segment));
// When |prev_entry| is NULL or |prev_entry| has no valid next_entries,
// do linear-search over the LRU.
if ((prev_entry == NULL && history_segment.candidates_size() > 0) ||
(prev_entry != NULL && prev_entry->next_entries_size() == 0)) {
const string &prev_value = prev_entry == NULL ?
history_segment.candidate(0).value : prev_entry->value();
int trial = 0;
for (const DicElement *elm = head;
trial++ < kMaxPrevValueTrial && elm != NULL; elm = elm->next) {
const Entry *entry = &(elm->value);
// entry->value() equals to the prev_value or
// entry->value() is a SUFFIX of prev_value.
// length of entry->value() must be >= 2, as single-length
// match would be noisy.
if (IsValidEntry(*entry) &&
entry != prev_entry &&
entry->next_entries_size() > 0 &&
Util::CharsLen(entry->value()) >= 2 &&
(entry->value() == prev_value ||
Util::EndsWith(prev_value, entry->value()))) {
prev_entry = entry;
break;
}
}
}
}
if (input_key_len == 0 && prev_entry == NULL) {
VLOG(1) << "If input_key_len is 0, prev_entry must be set";
return false;
}
const size_t max_results_size =
5 * segments->max_prediction_candidates_size();
int trial = 0;
EntryPriorityQueue results;
for (const DicElement *elm = head; elm != NULL; elm = elm->next) {
if (!IsValidEntry(elm->value)) {
continue;
}
if (segments->request_type() == Segments::SUGGESTION &&
trial++ >= kMaxSuggestionTrial) {
VLOG(2) << "too many trials";
break;
}
// lookup input_key from elm_value and prev_entry.
// If a new entry is found, the entry is pushed to the results.
if (!LookupEntry(input_key, &(elm->value), prev_entry, &results)) {
continue;
}
// already found enough results.
if (results.size() >= max_results_size) {
break;
}
}
if (results.size() == 0) {
VLOG(2) << "no prefix match candiate is found.";
return false;
}
while (segment->candidates_size() <
segments->max_prediction_candidates_size()) {
// |results| is a priority queue where the elemtnt
// in the queue is sorted by the score defined in GetScore().
const Entry *result_entry = results.Pop();
if (result_entry == NULL) {
// Pop() returns NULL when no more valid entry exists.
break;
}
bool is_valid_candidate = false;
if (segments->request_type() == Segments::PREDICTION) {
is_valid_candidate = true;
} else if (segments->request_type() == Segments::SUGGESTION) {
// The top result of suggestion should be a VALID suggestion candidate.
// i.e., SuggestionTrigerFunc should return true for the first
// candidate.
// If user types "デスノート" too many times, "デスノート" will be
// suggested when user types "で". It is expected, but if user types
// "です" after that, showing "デスノート" is annoying.
// In this situation, "です" is in the LRU, but SuggestionTrigerFunc
// returns false for "です", since it is short.
if (IsValidSuggestion(zero_query_suggestion,
input_key_len, *result_entry)) {
is_valid_candidate = true;
} else if (segment->candidates_size() == 0) {
VLOG(2) << "candidates size is 0";
return false;
}
} else {
LOG(ERROR) << "Unknown mode";
return false;
}
if (!is_valid_candidate) {
VLOG(2) << "not a valid candidate: " << result_entry->key();
continue;
}
Segment::Candidate *candidate = segment->push_back_candidate();
DCHECK(candidate);
candidate->Init();
candidate->key = result_entry->key();
candidate->content_key = result_entry->key();
candidate->value = result_entry->value();
candidate->content_value = result_entry->value();
candidate->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
const string &description = result_entry->description();
// If we have stored description, set it exactly.
if (!description.empty()) {
candidate->description = description;
candidate->attributes |= Segment::Candidate::NO_EXTRA_DESCRIPTION;
} else {
VariantsRewriter::SetDescriptionForPrediction(candidate);
}
}
return (segment->candidates_size() > 0);
}
void UserHistoryPredictor::InsertNextEntry(
const UserHistoryPredictor::NextEntry &next_entry,
UserHistoryPredictor::Entry *entry) const {
if (next_entry.entry_fp() == 0 || entry == NULL) {
return;
}
NextEntry *target_next_entry = NULL;
// If next_entries_size is less than kMaxNextEntriesSize,
// we simply allocate a new entry.
if (entry->next_entries_size() < max_next_entries_size()) {
target_next_entry = entry->add_next_entries();
} else {
// Otherwise, find the oldest next_entry.
uint32 last_access_time = UINT_MAX;
for (int i = 0; i < entry->next_entries_size(); ++i) {
// already has the same id
if (next_entry.entry_fp() == entry->next_entries(i).entry_fp()) {
target_next_entry = entry->mutable_next_entries(i);
break;
}
const Entry *next_entry = dic_->LookupWithoutInsert(
entry->next_entries(i).entry_fp());
// reuse the entry if it is already removed from the LRU.
if (next_entry == NULL) {
target_next_entry = entry->mutable_next_entries(i);
break;
}
// preserve the oldest entry
if (target_next_entry == NULL ||
last_access_time > next_entry->last_access_time()) {
target_next_entry = entry->mutable_next_entries(i);
last_access_time = next_entry->last_access_time();
}
}
}
if (target_next_entry == NULL) {
LOG(ERROR) << "cannot find a room for inserting next fp";
return;
}
target_next_entry->CopyFrom(next_entry);
}
bool UserHistoryPredictor::IsValidEntry(const Entry &entry) {
return !entry.removed() && entry.entry_type() == Entry::DEFAULT_ENTRY;
}
void UserHistoryPredictor::InsertEvent(EntryType type) {
if (type == Entry::DEFAULT_ENTRY) {
return;
}
const uint32 last_access_time = static_cast<uint32>(time(NULL));
const uint32 dic_key = Fingerprint("", "", type);
CHECK(dic_.get());
DicElement *e = dic_->Insert(dic_key);
if (e == NULL) {
VLOG(2) << "insert failed";
return;
}
Entry *entry = &(e->value);
DCHECK(entry);
entry->Clear();
entry->set_entry_type(type);
entry->set_last_access_time(last_access_time);
}
void UserHistoryPredictor::Insert(const string &key,
const string &value,
const string &description,
bool is_suggestion_selected,
uint32 next_fp,
uint32 last_access_time,
Segments *segments) {
if (key.empty() || value.empty() ||
key.size() > kMaxStringLength ||
value.size() > kMaxStringLength ||
description.size() > kMaxStringLength) {
return;
}
const uint32 dic_key = Fingerprint(key, value);
if (!dic_->HasKey(dic_key)) {
// the key is a new key inserted in the last Finish method.
// Here we push a new RevertEntry so that the new "key" can be
// removed when Revert() method is called.
Segments::RevertEntry *revert_entry = segments->push_back_revert_entry();
DCHECK(revert_entry);
revert_entry->key = Uint32ToString(dic_key);
revert_entry->id = UserHistoryPredictor::revert_id();
revert_entry->revert_entry_type = Segments::RevertEntry::CREATE_ENTRY;
} else {
// the key is a old key not inserted in the last Finish method
// TODO(taku):
// add a treatment for UPDATE_ENTRY mode
}
DicElement *e = dic_->Insert(dic_key);
if (e == NULL) {
VLOG(2) << "insert failed";
return;
}
Entry *entry = &(e->value);
DCHECK(entry);
entry->set_key(key);
entry->set_value(value);
entry->set_removed(false);
if (description.empty()) {
entry->clear_description();
} else {
entry->set_description(description);
}
entry->set_last_access_time(last_access_time);
if (is_suggestion_selected) {
entry->set_suggestion_freq(entry->suggestion_freq() + 1);
} else {
entry->set_conversion_freq(entry->conversion_freq() + 1);
}
// Insert next_fp to the entry
if (next_fp != 0) {
NextEntry next_entry;
next_entry.set_entry_fp(next_fp);
InsertNextEntry(next_entry, entry);
}
VLOG(2) << key << " " << value << " has inserted: "
<< entry->suggestion_freq() << " "
<< entry->conversion_freq();
// new entry is inserted to the cache
updated_ = true;
}
void UserHistoryPredictor::Finish(Segments *segments) {
if (GET_CONFIG(incognito_mode)) {
VLOG(2) << "incognito mode";
return;
}
if (!GET_CONFIG(use_history_suggest)) {
VLOG(2) << "no history suggest";
return;
}
if (!CheckSyncerAndDelete()) {
LOG(WARNING) << "Syncer is running";
return;
}
const bool kInsertSuggestion = true;
const bool kInsertConversion = false;
const uint32 last_access_time = static_cast<uint32>(time(NULL));
// If user inputs a punctuation just after some long sentence,
// we make a new candidate by concatinating the top element in LRU and
// the punctuation user input. The top element in LRU is supposed to be
// the long sentence user input before.
// This is a fix for http://b/issue?id=2216838
if (dic_->Head() != NULL &&
segments->conversion_segments_size() == 1 &&
segments->history_segments_size() > 0 &&
segments->conversion_segment(0).candidates_size() > 0 &&
segments->history_segment(
segments->history_segments_size() - 1).candidates_size() > 0 &&
Util::CharsLen(
segments->conversion_segment(0).candidate(0).value) == 1 &&
IsPunctuation(
segments->conversion_segment(0).candidate(0).value) &&
dic_->Head()->value.last_access_time() + 5 > last_access_time) {
const Entry *entry = &(dic_->Head()->value);
DCHECK(entry);
const string &last_value =
segments->history_segment(
segments->history_segments_size() - 1).candidate(0).value;
// Check that value in head element of LRU ends with the candidate value
// in history segments.
if (last_value.size() <= entry->value().size() &&
entry->value().substr(entry->value().size() - last_value.size(),
last_value.size()) == last_value) {
const Segment::Candidate &candidate =
segments->conversion_segment(0).candidate(0);
const string key = entry->key() + candidate.key;
const string value = entry->value() + candidate.value;
// use the same last_access_time stored in the top element
// so that this item can be grouped together.
Insert(key, value, entry->description(), kInsertConversion, 0,
entry->last_access_time(), segments);
}
}
string all_key, all_value;
if (segments->request_type() == Segments::CONVERSION) {
const size_t history_segments_size = segments->history_segments_size();
// Check every segment is valid.
for (size_t i = history_segments_size; i < segments->segments_size(); ++i) {
const Segment &segment = segments->segment(i);
if (segment.candidates_size() < 1) {
VLOG(2) << "candidates size < 1";
return;
}
if (segment.segment_type() != Segment::FIXED_VALUE) {
VLOG(2) << "segment is not FIXED_VALUE";
return;
}
const Segment::Candidate &candidate = segment.candidate(0);
if (candidate.attributes &
Segment::Candidate::NO_SUGGEST_LEARNING) {
VLOG(2) << "NO_SUGGEST_LEARNING";
return;
}
}
if (IsPrivacySensitive(segments)) {
VLOG(2) << "do not remember privacy sensitive input";
return;
}
set<uint64> seen;
bool this_was_seen = false;
for (size_t i = history_segments_size; i < segments->segments_size(); ++i) {
const Segment &segment = segments->segment(i);
all_key += segment.key();
all_value += segment.candidate(0).value;
uint32 next_fp = (i == segments->segments_size() - 1) ?
0 : SegmentFingerprint(segments->segment(i + 1));
// remember the first segment
if (i == history_segments_size) {
seen.insert(SegmentFingerprint(segment));
}
uint32 next_fp_to_set = next_fp;
// If two duplicate segments exist, kills the link
// TO/FROM the second one to prevent loops.
// Only killing "TO" link caused bug #2982886:
// after converting "らいおん(もうじゅう)とぞうりむし(びせいぶつ)"
// and typing "ぞうりむし", "ゾウリムシ(猛獣" was suggested.
if (this_was_seen) {
next_fp_to_set = 0;
}
if (!seen.insert(next_fp).second) {
next_fp_to_set = 0;
this_was_seen = true;
} else {
this_was_seen = false;
}
Insert(segment.key(), segment.candidate(0).value,
segment.candidate(0).description,
kInsertConversion, next_fp_to_set,
last_access_time, segments);
}
// insert all_key/all_value
if (segments->conversion_segments_size() > 1 &&
!all_key.empty() && !all_value.empty()) {
Insert(all_key, all_value, "",
kInsertConversion,
0, last_access_time, segments);
}
} else {
// Store user-history from predictoin
if (segments->conversion_segments_size() < 1) {
return;
}
const Segment &segment = segments->conversion_segment(0);
if (segment.candidates_size() < 1) {
return;
}
all_key = segment.candidate(0).key;
all_value = segment.candidate(0).value;
const string &description = segment.candidate(0).description;
Insert(all_key, all_value, description,
kInsertSuggestion, 0, last_access_time, segments);
}
// make a link from the right most history_segment to
// the left most segment or entire user input.
if (segments->history_segments_size() > 0 &&
segments->conversion_segments_size() > 0) {
Entry *history_entry =
dic_->MutableLookupWithoutInsert(
SegmentFingerprint(
segments->segment(
segments->history_segments_size() - 1)));
NextEntry next_entry;
if (segments->request_type() == Segments::CONVERSION) {
next_entry.set_entry_fp(
SegmentFingerprint(segments->conversion_segment(0)));
InsertNextEntry(next_entry, history_entry);
}
// entire user input or SUGGESTION
if (segments->request_type() != Segments::CONVERSION ||
segments->conversion_segments_size() > 1) {
next_entry.set_entry_fp(Fingerprint(all_key, all_value));
InsertNextEntry(next_entry, history_entry);
}
}
return;
}
void UserHistoryPredictor::Revert(Segments *segments) {
if (!CheckSyncerAndDelete()) {
LOG(WARNING) << "Syncer is running";
return;
}
for (size_t i = 0; i < segments->revert_entries_size(); ++i) {
const Segments::RevertEntry &revert_entry =
segments->revert_entry(i);
if (revert_entry.id == UserHistoryPredictor::revert_id() &&
revert_entry.revert_entry_type == Segments::RevertEntry::CREATE_ENTRY) {
VLOG(2) << "Erasing the key: " << StringToUint32(revert_entry.key);
dic_->Erase(StringToUint32(revert_entry.key));
}
}
}
// type
UserHistoryPredictor::MatchType
UserHistoryPredictor::GetMatchType(const string &lstr, const string &rstr) {
if (lstr.empty() && !rstr.empty()) {
return LEFT_EMPTY_MATCH;
}
const size_t size = min(lstr.size(), rstr.size());
if (size == 0) {
return NO_MATCH;
}
const int result = memcmp(lstr.data(), rstr.data(), size);
if (result != 0) {
return NO_MATCH;
}
if (lstr.size() == rstr.size()) {
return EXACT_MATCH;
} else if (lstr.size() < rstr.size()) {
return LEFT_PREFIX_MATCH;
} else {
return RIGHT_PREFIX_MATCH;
}
return NO_MATCH;
}
// static
uint32 UserHistoryPredictor::Fingerprint(const string &key,
const string &value,
EntryType type) {
if (type == Entry::DEFAULT_ENTRY) {
// Since we have already used the fingerprint function for next entries and
// next entries are saved in user's local machine, we are not able
// to change the Fingerprint function for the old key/value type.
return Util::Fingerprint32(key + kDelimiter + value);
} else {
uint8 id = static_cast<uint8>(type);
return Util::Fingerprint32(reinterpret_cast<char *>(&id), sizeof(id));
}
}
uint32 UserHistoryPredictor::Fingerprint(const string &key,
const string &value) {
return Fingerprint(key, value, Entry::DEFAULT_ENTRY);
}
// static
uint32 UserHistoryPredictor::EntryFingerprint(
const UserHistoryPredictor::Entry &entry) {
return Fingerprint(entry.key(), entry.value());
}
// static
uint32 UserHistoryPredictor::SegmentFingerprint(const Segment &segment) {
if (segment.candidates_size() > 0) {
return Fingerprint(segment.candidate(0).key, segment.candidate(0).value);
}
return 0;
}
// static
string UserHistoryPredictor::Uint32ToString(uint32 fp) {
string buf(reinterpret_cast<const char *>(&fp), sizeof(fp));
return buf;
}
// static
uint32 UserHistoryPredictor::StringToUint32(const string &input) {
uint32 result = 0;
if (input.size() == sizeof(result)) {
memcpy(reinterpret_cast<char *>(&result), input.data(), input.size());
}
return result;
}
// static
bool UserHistoryPredictor::IsValidSuggestion(
bool is_zero_query_suggestion,
uint32 prefix_len,
const UserHistoryPredictor::Entry &entry) {
// when bigram_boost is true, that means that previous user input
// and current input have bigram relation.
if (entry.bigram_boost()) {
return true;
}
if (is_zero_query_suggestion) {
return true;
}
// Handle suggestion_freq and conversion_freq differently.
// conversion_freq is less aggressively affecting to the final decision.
const uint32 freq = max(entry.suggestion_freq(),
entry.conversion_freq() / 4);
// TODO(taku,komatsu): better to make it simpler and easier to be understood.
const uint32 base_prefix_len = 3 - min(static_cast<uint32>(2), freq);
return (prefix_len >= base_prefix_len);
}
// static
// 1) sort by last_access_time, which is basically the same as LRU policy.
// 2) boost shorter candidate, if having the same last_access_time.
// 3) add a bigram boost as a special bonus.
// TODO(taku): better to take "frequency" into consideration
uint32 UserHistoryPredictor::GetScore(
const UserHistoryPredictor::Entry &entry) {
const uint32 kBigramBoostAsTime = 7 * 24 * 60 * 60; // 1 week.
return
entry.last_access_time() - Util::CharsLen(entry.value()) +
(entry.bigram_boost() ? kBigramBoostAsTime : 0);
}
// return the size of cache.
// staitc
uint32 UserHistoryPredictor::cache_size() {
return kLRUCacheSize;
}
// return the size of next entries.
// static
uint32 UserHistoryPredictor::max_next_entries_size() {
return kMaxNextEntriesSize;
}
} // namespace mozc