blob: 887f554026bc348b96757cc9404fcbfcfa796ed2 [file] [log] [blame]
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "annotator/grammar/dates/cfg-datetime-annotator.h"
#include "annotator/datetime/utils.h"
#include "annotator/grammar/dates/annotations/annotation-options.h"
#include "annotator/grammar/utils.h"
#include "utils/strings/split.h"
#include "utils/tokenizer.h"
#include "utils/utf8/unicodetext.h"
namespace libtextclassifier3::dates {
namespace {
static std::string GetReferenceLocale(const std::string& locales) {
std::vector<StringPiece> split_locales = strings::Split(locales, ',');
if (!split_locales.empty()) {
return split_locales[0].ToString();
}
return "";
}
static void InterpretParseData(const DatetimeParsedData& datetime_parsed_data,
const DateAnnotationOptions& options,
const CalendarLib& calendarlib,
int64* interpreted_time_ms_utc,
DatetimeGranularity* granularity) {
DatetimeGranularity local_granularity =
calendarlib.GetGranularity(datetime_parsed_data);
if (!calendarlib.InterpretParseData(
datetime_parsed_data, options.base_timestamp_millis,
options.reference_timezone, GetReferenceLocale(options.locales),
/*prefer_future_for_unspecified_date=*/true, interpreted_time_ms_utc,
granularity)) {
TC3_LOG(WARNING) << "Failed to extract time in millis and Granularity.";
// Fallingback to DatetimeParsedData's finest granularity
*granularity = local_granularity;
}
}
} // namespace
// Constructs the annotator from its collaborators and scoring constants.
//
// - `unilib` and `tokenizer_options` are handed to BuildTokenizer() to create
//   the tokenizer member.
// - `unilib` and `datetime_rules` are handed to the parser member.
// - `calendar_lib` is dereferenced unconditionally here, so it must not be
//   null.
// - `annotator_target_classification_score` and `annotator_priority_score`
//   are stored as-is and later stamped onto every span produced by Parse().
//
// NOTE(review): whether the pointed-to objects must outlive this annotator
// depends on how the members are declared (not visible here) — confirm
// against the header.
CfgDatetimeAnnotator::CfgDatetimeAnnotator(
const UniLib* unilib, const GrammarTokenizerOptions* tokenizer_options,
const CalendarLib* calendar_lib, const DatetimeRules* datetime_rules,
const float annotator_target_classification_score,
const float annotator_priority_score)
: calendar_lib_(*calendar_lib),
tokenizer_(BuildTokenizer(unilib, tokenizer_options)),
parser_(unilib, datetime_rules),
annotator_target_classification_score_(
annotator_target_classification_score),
annotator_priority_score_(annotator_priority_score) {}
void CfgDatetimeAnnotator::Parse(
const std::string& input, const DateAnnotationOptions& annotation_options,
const std::vector<Locale>& locales,
std::vector<DatetimeParseResultSpan>* results) const {
Parse(UTF8ToUnicodeText(input, /*do_copy=*/false), annotation_options,
locales, results);
}
void CfgDatetimeAnnotator::ProcessDatetimeParseResult(
const DateAnnotationOptions& annotation_options,
const DatetimeParseResult& datetime_parse_result,
std::vector<DatetimeParseResult>* results) const {
DatetimeParsedData datetime_parsed_data;
datetime_parsed_data.AddDatetimeComponents(
datetime_parse_result.datetime_components);
std::vector<DatetimeParsedData> interpretations;
if (annotation_options.generate_alternative_interpretations_when_ambiguous) {
FillInterpretations(datetime_parsed_data,
calendar_lib_.GetGranularity(datetime_parsed_data),
&interpretations);
} else {
interpretations.emplace_back(datetime_parsed_data);
}
for (const DatetimeParsedData& interpretation : interpretations) {
results->emplace_back();
interpretation.GetDatetimeComponents(&results->back().datetime_components);
InterpretParseData(interpretation, annotation_options, calendar_lib_,
&(results->back().time_ms_utc),
&(results->back().granularity));
std::sort(results->back().datetime_components.begin(),
results->back().datetime_components.end(),
[](const DatetimeComponent& a, const DatetimeComponent& b) {
return a.component_type > b.component_type;
});
}
}
// Runs the grammar parser over `input` and appends one annotated span to
// `results` for every span the parser returns.
//
// For each grammar span:
//  - the codepoint span boundaries are copied over;
//  - the priority score is the rule's own score when
//    `annotation_options.use_rule_priority_score` is set, otherwise the
//    annotator-wide priority score given at construction;
//  - the target classification score is always the annotator-wide value;
//  - every grammar parse result is expanded through
//    ProcessDatetimeParseResult() into the span's `data`.
//
// Each output span is constructed directly inside `results` rather than in a
// local that is copied in afterwards, which avoids deep-copying the span's
// `data` vector once per span.
void CfgDatetimeAnnotator::Parse(
    const UnicodeText& input, const DateAnnotationOptions& annotation_options,
    const std::vector<Locale>& locales,
    std::vector<DatetimeParseResultSpan>* results) const {
  // NOTE(review): `input.data()` is handed over as a bare pointer with no
  // explicit length; confirm that parser_.Parse() does not depend on the
  // buffer being NUL-terminated.
  const std::vector<DatetimeParseResultSpan> grammar_spans =
      parser_.Parse(input.data(), tokenizer_.Tokenize(input), locales,
                    annotation_options);

  for (const DatetimeParseResultSpan& grammar_span : grammar_spans) {
    results->emplace_back();
    // Valid until the next append to `results`, i.e. for this iteration.
    DatetimeParseResultSpan& output_span = results->back();

    output_span.span.first = grammar_span.span.first;
    output_span.span.second = grammar_span.span.second;
    output_span.priority_score = annotation_options.use_rule_priority_score
                                     ? grammar_span.priority_score
                                     : annotator_priority_score_;
    output_span.target_classification_score =
        annotator_target_classification_score_;

    for (const DatetimeParseResult& grammar_result : grammar_span.data) {
      ProcessDatetimeParseResult(annotation_options, grammar_result,
                                 &output_span.data);
    }
  }
}
} // namespace libtextclassifier3::dates