blob: 58f5921fd77960669cae7dd29cf39368f26c0c1b [file] [log] [blame]
// Copyright 2010-2011, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "rewriter/number_rewriter.h"
#include <stdio.h>
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "base/util.h"
#include "dictionary/pos_matcher.h"
#include "converter/segments.h"
#include "session/commands.pb.h"
#include "session/config_handler.h"
#include "session/config.pb.h"
namespace mozc {
namespace {
void PushBackCandidate(const string &value, const string &desc, uint16 style,
vector<Segment::Candidate> *results) {
bool found = false;
for (vector<Segment::Candidate>::const_iterator it = results->begin();
it != results->end(); ++it) {
if (it->value == value) {
found = true;
break;
}
}
if (!found) {
Segment::Candidate cand;
cand.value = value;
cand.description = desc;
cand.style = style;
results->push_back(cand);
}
}
// Returns true if the POS id |lid| is accepted by either
// POSMatcher::IsNumber() or POSMatcher::IsKanjiNumber().
bool IsNumber(uint16 lid) {
  if (POSMatcher::IsNumber(lid)) {
    return true;
  }
  return POSMatcher::IsKanjiNumber(lid);
}
// Returns true if the rewriter should insert numerical variants into |seg|.
//
// Scans the candidates of |seg| in order and decides on the first candidate
// whose lid satisfies IsNumber():
//  - If its content_value has script type Util::NUMBER, that candidate is
//    used as the base candidate as-is.
//  - Otherwise its content_value is normalized with Util::NormalizeNumbers()
//    and a new Arabic-number candidate is inserted right after it; the
//    inserted candidate becomes the base candidate.
//
// *base_candidate_pos: candidate index of the base candidate. POS information
//                      for numerical variants is copied from the base
//                      candidate.
// *insert_pos: the candidate index from which numerical variants should be
//              inserted. It is not clamped to the number of candidates here;
//              the caller has to do that.
//
// Returns false without touching |seg| when no number candidate exists, when
// normalization fails, when the normalized Arabic number equals the
// half-width form of content_key, or when the candidate is malformed
// (content_value longer than value).
bool GetNumericCandidatePositions(Segment *seg, int *base_candidate_pos,
                                  int *insert_pos) {
  CHECK(base_candidate_pos);
  CHECK(insert_pos);
  for (size_t i = 0; i < seg->candidates_size(); ++i) {
    const Segment::Candidate &c = seg->candidate(i);
    if (!IsNumber(c.lid)) {
      continue;
    }

    if (Util::GetScriptType(c.content_value) == Util::NUMBER) {
      *base_candidate_pos = static_cast<int>(i);
      // +2 as the fullwidth (or halfwidth) variant is on the i + 1 position.
      *insert_pos = static_cast<int>(i + 2);
      return true;
    }

    string kanji_number, arabic_number, half_width_new_content_value;
    Util::FullWidthToHalfWidth(c.content_key, &half_width_new_content_value);
    // Try to get the normalized kanji_number and arabic_number.
    // If that fails, do nothing.
    if (!Util::NormalizeNumbers(c.content_value, true, &kanji_number,
                                &arabic_number) ||
        arabic_number == half_width_new_content_value) {
      return false;
    }

    // The suffix is whatever follows content_value inside value. substr()
    // throws std::out_of_range if content_value is longer than value, so
    // reject such a candidate before modifying the segment.
    if (c.content_value.size() > c.value.size()) {
      LOG(ERROR) << "Invalid content_value/value: "
                 << c.content_value << " / " << c.value;
      return false;
    }
    const string suffix = c.value.substr(c.content_value.size());

    // Insert the Arabic number first.
    Segment::Candidate *arabic_c = seg->insert_candidate(i + 1);
    DCHECK(arabic_c);
    arabic_c->Init();
    arabic_c->value = arabic_number + suffix;
    arabic_c->content_value = arabic_number;
    arabic_c->key = c.key;
    arabic_c->content_key = c.content_key;
    arabic_c->cost = c.cost;
    arabic_c->structure_cost = c.structure_cost;
    arabic_c->lid = c.lid;
    arabic_c->rid = c.rid;

    // If top candidate is Kanji numeric, we want to expand at least
    // 5 candidates here.
    // http://b/issue?id=2872048
    const int kArabicNumericOffset = 5;
    *base_candidate_pos = static_cast<int>(i + 1);
    *insert_pos = static_cast<int>(i + kArabicNumericOffset);
    return true;
  }

  return false;
}
} // namespace
// Constructor. The body is empty: no explicit setup is performed here.
NumberRewriter::NumberRewriter() {}
// Destructor. The body is empty: no explicit cleanup is performed here.
NumberRewriter::~NumberRewriter() {}
// Reports the capability of this rewriter, which is always
// RewriterInterface::CONVERSION.
int NumberRewriter::capability() const {
  const int supported = RewriterInterface::CONVERSION;
  return supported;
}
// Adds numeric variants to the non-history segments of |segments|.
//
// Does nothing and returns false when the use_number_conversion config is
// off. Otherwise, for every segment after the history segments:
//  1. GetNumericCandidatePositions() picks the base candidate and the index
//     at which variants are inserted; segments without one are skipped.
//  2. The base candidate's content_value is converted to half width and must
//     have script type Util::NUMBER; otherwise the segment is skipped.
//  3. Variants are generated by Util::ArabicToWideArabic,
//     ArabicToSeparatedArabic, ArabicToKanji and ArabicToOtherForms, plus
//     ArabicToOtherRadixes when there is exactly one conversion segment.
//     Variants with duplicate values are dropped.
//  4. Each variant is inserted as a new candidate that takes lid, rid, cost,
//     key and content_key from the base candidate and keeps the base
//     candidate's suffix (the part of value after content_value).
//
// Returns true if at least one segment passed steps 1 and 2.
bool NumberRewriter::Rewrite(Segments *segments) const {
  if (!GET_CONFIG(use_number_conversion)) {
    VLOG(2) << "no use_number_conversion";
    return false;
  }

  bool modified = false;
  for (size_t i = segments->history_segments_size();
       i < segments->segments_size(); ++i) {
    Segment *seg = segments->mutable_segment(i);
    DCHECK(seg);
    int base_candidate_pos = 0;
    int insert_pos = 0;
    if (!GetNumericCandidatePositions(seg, &base_candidate_pos, &insert_pos)) {
      continue;
    }

    const Segment::Candidate &base_cand = seg->candidate(base_candidate_pos);
    // The suffix extraction below requires content_value to be no longer
    // than value.
    if (base_cand.content_value.size() > base_cand.value.size()) {
      LOG(ERROR) << "Invalid content_value/value: "
                 << base_cand.content_value << " / " << base_cand.value;
      continue;
    }

    string base_content_value;
    Util::FullWidthToHalfWidth(base_cand.content_value, &base_content_value);
    if (Util::GetScriptType(base_content_value) != Util::NUMBER) {
      LOG(ERROR) << "base_content_value is not number: " << base_content_value;
      continue;
    }

    // insert_pos may point past the end of the candidate list; clamp it.
    insert_pos = min(insert_pos, static_cast<int>(seg->candidates_size()));
    modified = true;

    vector<Util::NumberString> output;
    vector<Segment::Candidate> converted_numbers;
    Util::ArabicToWideArabic(base_content_value, &output);
    Util::ArabicToSeparatedArabic(base_content_value, &output);
    Util::ArabicToKanji(base_content_value, &output);
    Util::ArabicToOtherForms(base_content_value, &output);
    if (segments->conversion_segments_size() == 1) {
      Util::ArabicToOtherRadixes(base_content_value, &output);
    }

    // De-duplicate the generated variants by value. A distinct unsigned
    // index is used so that the outer segment index |i| is not shadowed.
    for (size_t j = 0; j < output.size(); ++j) {
      PushBackCandidate(output[j].value, output[j].description,
                        output[j].style, &converted_numbers);
    }

    const string suffix =
        base_cand.value.substr(base_cand.content_value.size());

    for (vector<Segment::Candidate>::const_iterator iter =
             converted_numbers.begin();
         iter != converted_numbers.end(); ++iter) {
      Segment::Candidate *c = seg->insert_candidate(insert_pos++);
      DCHECK(c);
      // Reset the new candidate before filling it in, as is done for the
      // Arabic candidate in GetNumericCandidatePositions(). Without this,
      // the |= on attributes below would build on whatever state
      // insert_candidate() handed back.
      c->Init();
      c->lid = base_cand.lid;
      c->rid = base_cand.rid;
      c->cost = base_cand.cost;
      c->value = iter->value + suffix;
      c->content_value = iter->value;
      c->key = base_cand.key;
      c->content_key = base_cand.content_key;
      c->style = iter->style;
      c->description = iter->description;
      // Don't want to have FULL_WIDTH form for Hex/Oct/BIN..etc.
      if (c->style == Segment::Candidate::NUMBER_HEX ||
          c->style == Segment::Candidate::NUMBER_OCT ||
          c->style == Segment::Candidate::NUMBER_BIN) {
        c->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
      }
    }
  }

  return modified;
}
} // namespace mozc