blob: 5e87cf241f0f79ac36aa764c5ac0f2e58358b44b [file] [log] [blame]
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_GRAMMAR_DATES_UTILS_DATE_MATCH_H_
#define LIBTEXTCLASSIFIER_ANNOTATOR_GRAMMAR_DATES_UTILS_DATE_MATCH_H_
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <limits>
#include <string>
#include <vector>
#include "annotator/grammar/dates/dates_generated.h"
#include "annotator/grammar/dates/timezone-code_generated.h"
#include "utils/grammar/match.h"
namespace libtextclassifier3 {
namespace dates {
// Sentinel meaning "no value": the Reset() methods in this file store it in
// numeric fields, and the Has*() accessors compare fields against it.
static constexpr int NO_VAL = -1;
// Plain-old-data base for the match structs declared in this file.
struct MatchBase : public grammar::Match {
  // Puts the match back into the "unknown type" state.
  void Reset() {
    type = MatchType::MatchType_UNKNOWN;
  }
};
// Match tagged MatchType_DATETIME_RULE that points at an extraction rule
// parameter.
struct ExtractionMatch : public MatchBase {
  const ExtractionRuleParameter* extraction_rule;

  // Resets the base part, drops the rule pointer and stamps the type tag.
  void Reset() {
    MatchBase::Reset();
    extraction_rule = nullptr;
    type = MatchType::MatchType_DATETIME_RULE;
  }
};
// Match tagged MatchType_TERM_VALUE that points at a term value.
struct TermValueMatch : public MatchBase {
  const TermValue* term_value;

  // Resets the base part, drops the term pointer and stamps the type tag.
  void Reset() {
    MatchBase::Reset();
    term_value = nullptr;
    type = MatchType::MatchType_TERM_VALUE;
  }
};
// Match tagged MatchType_NONTERMINAL that points at a nonterminal value.
// Serves as the base of the more specific match structs in this file.
struct NonterminalMatch : public MatchBase {
  const NonterminalValue* nonterminal;

  // Resets the base part, drops the nonterminal pointer and stamps the type
  // tag.
  void Reset() {
    MatchBase::Reset();
    nonterminal = nullptr;
    type = MatchType::MatchType_NONTERMINAL;
  }
};
// Nonterminal match that carries a single integer value.
struct IntegerMatch : public NonterminalMatch {
  int value;
  // The two fields below apply when the expression is in digits format.
  int8 count_of_digits;
  bool is_zero_prefixed;

  // Resets the base part, marks the value as absent (NO_VAL) and clears the
  // digit bookkeeping.
  void Reset() {
    NonterminalMatch::Reset();
    is_zero_prefixed = false;
    count_of_digits = 0;
    value = NO_VAL;
  }
};
// Integer match tagged MatchType_DIGITS.
struct DigitsMatch : public IntegerMatch {
  // Resets the integer part and stamps the type tag.
  void Reset() {
    IntegerMatch::Reset();
    type = MatchType::MatchType_DIGITS;
  }

  // Accepts every integer.
  static bool IsValid(int x) {
    return true;
  }
};
// Integer match tagged MatchType_YEAR.
struct YearMatch : public IntegerMatch {
  // Resets the integer part and stamps the type tag.
  void Reset() {
    IntegerMatch::Reset();
    type = MatchType::MatchType_YEAR;
  }

  // A year is accepted when it is strictly positive.
  static bool IsValid(int x) {
    return x > 0;
  }
};
// Integer match tagged MatchType_MONTH.
struct MonthMatch : public IntegerMatch {
  // Resets the integer part and stamps the type tag.
  void Reset() {
    IntegerMatch::Reset();
    type = MatchType::MatchType_MONTH;
  }

  // A month is accepted when it lies in [1, 12].
  static bool IsValid(int x) {
    constexpr int kFirstMonth = 1;
    constexpr int kLastMonth = 12;
    return kFirstMonth <= x && x <= kLastMonth;
  }
};
// Integer match tagged MatchType_DAY.
struct DayMatch : public IntegerMatch {
  // Resets the integer part and stamps the type tag.
  void Reset() {
    IntegerMatch::Reset();
    type = MatchType::MatchType_DAY;
  }

  // A day of month is accepted when it lies in [1, 31].
  static bool IsValid(int x) {
    constexpr int kFirstDay = 1;
    constexpr int kLastDay = 31;
    return kFirstDay <= x && x <= kLastDay;
  }
};
// Integer match tagged MatchType_HOUR.
struct HourMatch : public IntegerMatch {
  // Resets the integer part and stamps the type tag.
  void Reset() {
    IntegerMatch::Reset();
    type = MatchType::MatchType_HOUR;
  }

  // An hour is accepted when it lies in [0, 24]; note that 24 is included.
  static bool IsValid(int x) {
    constexpr int kMinHour = 0;
    constexpr int kMaxHour = 24;
    return kMinHour <= x && x <= kMaxHour;
  }
};
// Integer match tagged MatchType_MINUTE.
struct MinuteMatch : public IntegerMatch {
  // Resets the integer part and stamps the type tag.
  void Reset() {
    IntegerMatch::Reset();
    type = MatchType::MatchType_MINUTE;
  }

  // A minute is accepted when it lies in [0, 59].
  static bool IsValid(int x) {
    constexpr int kMinMinute = 0;
    constexpr int kMaxMinute = 59;
    return kMinMinute <= x && x <= kMaxMinute;
  }
};
// Integer match tagged MatchType_SECOND.
struct SecondMatch : public IntegerMatch {
  // Resets the integer part and stamps the type tag.
  void Reset() {
    IntegerMatch::Reset();
    type = MatchType::MatchType_SECOND;
  }

  // A second is accepted when it lies in [0, 60]; note that 60 is included.
  static bool IsValid(int x) {
    constexpr int kMinSecond = 0;
    constexpr int kMaxSecond = 60;
    return kMinSecond <= x && x <= kMaxSecond;
  }
};
// Nonterminal match that carries a single floating-point value.
struct DecimalMatch : public NonterminalMatch {
  double value;
  // Applies when the expression is in digits format.
  int8 count_of_digits;

  // Resets the base part, marks the value as absent (NO_VAL) and zeroes the
  // digit count.
  void Reset() {
    NonterminalMatch::Reset();
    count_of_digits = 0;
    value = NO_VAL;
  }
};
// Decimal match tagged MatchType_FRACTION_SECOND.
struct FractionSecondMatch : public DecimalMatch {
  // Resets the decimal part and stamps the type tag.
  void Reset() {
    DecimalMatch::Reset();
    type = MatchType::MatchType_FRACTION_SECOND;
  }

  // A fraction of a second is accepted when it lies in [0.0, 1.0).
  static bool IsValid(double x) {
    return 0.0 <= x && x < 1.0;
  }
};
// CombinedIntegersMatch<N> covers expressions that pack several (at most N)
// integers into one token with no delimiters between them. The CFG grammar
// works on tokenizer output, so unlike a regular expression it cannot split a
// token into pieces. Example: "1130" holds "11" and "30", i.e. November 30.
template <int N>
struct CombinedIntegersMatch : public NonterminalMatch {
  enum {
    SIZE = N,
  };
  int values[SIZE];
  // The two fields below apply when the expression is in digits format.
  int8 count_of_digits;
  bool is_zero_prefixed;

  // Resets the base part, marks every slot as absent (NO_VAL) and clears the
  // digit bookkeeping.
  void Reset() {
    NonterminalMatch::Reset();
    for (int& slot : values) {
      slot = NO_VAL;
    }
    is_zero_prefixed = false;
    count_of_digits = 0;
  }
};
// Six-slot combined integer match tagged MatchType_COMBINED_DIGITS. The slots
// are addressed through the Index enum: year, month, day, hour, minute,
// second.
struct CombinedDigitsMatch : public CombinedIntegersMatch<6> {
  enum Index {
    INDEX_YEAR = 0,
    INDEX_MONTH = 1,
    INDEX_DAY = 2,
    INDEX_HOUR = 3,
    INDEX_MINUTE = 4,
    INDEX_SECOND = 5,
  };

  // A component is present when its slot differs from NO_VAL.
  bool HasYear() const { return GetYear() != NO_VAL; }
  bool HasMonth() const { return GetMonth() != NO_VAL; }
  bool HasDay() const { return GetDay() != NO_VAL; }
  bool HasHour() const { return GetHour() != NO_VAL; }
  bool HasMinute() const { return GetMinute() != NO_VAL; }
  bool HasSecond() const { return GetSecond() != NO_VAL; }

  // Raw slot accessors; they return NO_VAL for an absent component.
  int GetYear() const { return values[INDEX_YEAR]; }
  int GetMonth() const { return values[INDEX_MONTH]; }
  int GetDay() const { return values[INDEX_DAY]; }
  int GetHour() const { return values[INDEX_HOUR]; }
  int GetMinute() const { return values[INDEX_MINUTE]; }
  int GetSecond() const { return values[INDEX_SECOND]; }

  // Resets all slots through the base and stamps the type tag.
  void Reset() {
    CombinedIntegersMatch<SIZE>::Reset();
    type = MatchType::MatchType_COMBINED_DIGITS;
  }

  // Validates `x` with the IsValid() of the match type that corresponds to
  // slot `i`. Any index outside the Index enum is rejected.
  static bool IsValid(int i, int x) {
    if (i == INDEX_YEAR) return YearMatch::IsValid(x);
    if (i == INDEX_MONTH) return MonthMatch::IsValid(x);
    if (i == INDEX_DAY) return DayMatch::IsValid(x);
    if (i == INDEX_HOUR) return HourMatch::IsValid(x);
    if (i == INDEX_MINUTE) return MinuteMatch::IsValid(x);
    if (i == INDEX_SECOND) return SecondMatch::IsValid(x);
    return false;
  }
};
// Nonterminal match tagged MatchType_TIME_VALUE. It keeps pointers to the
// hour/minute/second/fraction sub-matches next to the numeric values and a few
// formatting flags.
struct TimeValueMatch : public NonterminalMatch {
  const HourMatch* hour_match;
  const MinuteMatch* minute_match;
  const SecondMatch* second_match;
  const FractionSecondMatch* fraction_second_match;
  bool is_hour_zero_prefixed : 1;
  bool is_minute_one_digit : 1;
  bool is_second_one_digit : 1;
  int8 hour;
  int8 minute;
  int8 second;
  double fraction_second;

  // Resets the base part, stamps the type tag, nulls every sub-match pointer,
  // clears the flags and marks all numeric values as absent (NO_VAL).
  void Reset() {
    NonterminalMatch::Reset();
    type = MatchType::MatchType_TIME_VALUE;

    // Numeric values.
    hour = NO_VAL;
    minute = NO_VAL;
    second = NO_VAL;
    fraction_second = NO_VAL;

    // Formatting flags.
    is_hour_zero_prefixed = false;
    is_minute_one_digit = false;
    is_second_one_digit = false;

    // Sub-match pointers.
    hour_match = nullptr;
    minute_match = nullptr;
    second_match = nullptr;
    fraction_second_match = nullptr;
  }
};
// Nonterminal match tagged MatchType_TIME_SPAN, holding a time span spec
// pointer and a time span code.
struct TimeSpanMatch : public NonterminalMatch {
  const TimeSpanSpec* time_span_spec;
  TimespanCode time_span_code;

  // Resets the base part, stamps the type tag, sets the code to
  // TIMESPAN_CODE_NONE and nulls the spec pointer.
  void Reset() {
    NonterminalMatch::Reset();
    type = MatchType::MatchType_TIME_SPAN;
    time_span_code = TimespanCode_TIMESPAN_CODE_NONE;
    time_span_spec = nullptr;
  }
};
// Nonterminal match tagged MatchType_TIME_ZONE_NAME, holding a time zone name
// spec pointer and a time zone code.
struct TimeZoneNameMatch : public NonterminalMatch {
  const TimeZoneNameSpec* time_zone_name_spec;
  TimezoneCode time_zone_code;

  // Resets the base part, stamps the type tag, sets the code to
  // TIMEZONE_CODE_NONE and nulls the spec pointer.
  void Reset() {
    NonterminalMatch::Reset();
    type = MatchType::MatchType_TIME_ZONE_NAME;
    time_zone_code = TimezoneCode_TIMEZONE_CODE_NONE;
    time_zone_name_spec = nullptr;
  }
};
// Nonterminal match tagged MatchType_TIME_ZONE_OFFSET, holding an offset
// parameter pointer and a numeric offset.
struct TimeZoneOffsetMatch : public NonterminalMatch {
  const TimeZoneOffsetParameter* time_zone_offset_param;
  int16 time_zone_offset;

  // Resets the base part, stamps the type tag, zeroes the offset and nulls the
  // parameter pointer.
  void Reset() {
    NonterminalMatch::Reset();
    type = MatchType::MatchType_TIME_ZONE_OFFSET;
    time_zone_offset = 0;
    time_zone_offset_param = nullptr;
  }
};
// Integer match tagged MatchType_DAY_OF_WEEK.
struct DayOfWeekMatch : public IntegerMatch {
  // Resets the integer part and stamps the type tag.
  void Reset() {
    IntegerMatch::Reset();
    type = MatchType::MatchType_DAY_OF_WEEK;
  }

  // A value is accepted when it is strictly above DayOfWeek_DOW_NONE and not
  // above DayOfWeek_MAX.
  static bool IsValid(int x) {
    const bool above_none = DayOfWeek_DOW_NONE < x;
    const bool within_max = x <= DayOfWeek_MAX;
    return above_none && within_max;
  }
};
// Nonterminal match tagged MatchType_TIME_PERIOD that carries one integer.
struct TimePeriodMatch : public NonterminalMatch {
  int value;

  // Resets the base part, marks the value as absent (NO_VAL) and stamps the
  // type tag.
  void Reset() {
    NonterminalMatch::Reset();
    value = NO_VAL;
    type = MatchType::MatchType_TIME_PERIOD;
  }
};
struct RelativeMatch : public NonterminalMatch {
enum {
HAS_NONE = 0,
HAS_YEAR = 1 << 0,
HAS_MONTH = 1 << 1,
HAS_DAY = 1 << 2,
HAS_WEEK = 1 << 3,
HAS_HOUR = 1 << 4,
HAS_MINUTE = 1 << 5,
HAS_SECOND = 1 << 6,
HAS_DAY_OF_WEEK = 1 << 7,
HAS_IS_FUTURE = 1 << 31,
};
uint32 existing;
int year;
int month;
int day;
int week;
int hour;
int minute;
int second;
const NonterminalValue* day_of_week_nonterminal;
int8 day_of_week;
bool is_future_date;
bool HasDay() const { return existing & HAS_DAY; }
bool HasDayFields() const { return existing & (HAS_DAY | HAS_DAY_OF_WEEK); }
bool HasTimeValueFields() const {
return existing & (HAS_HOUR | HAS_MINUTE | HAS_SECOND);
}
bool IsStandaloneRelativeDayOfWeek() const {
return (existing & HAS_DAY_OF_WEEK) && (existing & ~HAS_DAY_OF_WEEK) == 0;
}
void Reset() {
NonterminalMatch::Reset();
type = MatchType::MatchType_RELATIVE_DATE;
existing = HAS_NONE;
year = NO_VAL;
month = NO_VAL;
day = NO_VAL;
week = NO_VAL;
hour = NO_VAL;
minute = NO_VAL;
second = NO_VAL;
day_of_week = NO_VAL;
is_future_date = false;
}
};
// Keeps the final matched result. Unlike the match structs above, this type is
// not necessarily POD.
struct DateMatch {
  // Sub-matches that make up this date match.
  const YearMatch* year_match = nullptr;
  const MonthMatch* month_match = nullptr;
  const DayMatch* day_match = nullptr;
  const DayOfWeekMatch* day_of_week_match = nullptr;
  const TimeValueMatch* time_value_match = nullptr;
  const TimeSpanMatch* time_span_match = nullptr;
  const TimeZoneNameMatch* time_zone_name_match = nullptr;
  const TimeZoneOffsetMatch* time_zone_offset_match = nullptr;
  const RelativeMatch* relative_match = nullptr;
  const CombinedDigitsMatch* combined_digits_match = nullptr;

  // Half-open range [begin, end): the Document position where the date or date
  // range was found.
  int begin = -1;
  int end = -1;

  int priority = 0;
  float annotator_priority_score = 0.0;

  int year = NO_VAL;
  int8 month = NO_VAL;
  int8 day = NO_VAL;
  DayOfWeek day_of_week = DayOfWeek_DOW_NONE;
  BCAD bc_ad = BCAD_BCAD_NONE;
  int8 hour = NO_VAL;
  int8 minute = NO_VAL;
  int8 second = NO_VAL;
  double fraction_second = NO_VAL;
  TimespanCode time_span_code = TimespanCode_TIMESPAN_CODE_NONE;
  int time_zone_code = TimezoneCode_TIMEZONE_CODE_NONE;
  int16 time_zone_offset = std::numeric_limits<int16>::min();

  // Ambiguous-hour description. Every known kind of hour ambiguity forms an
  // arithmetic progression that starts at the .hour field, so two numbers are
  // enough: "ambiguous_hour_count" is how many candidate hours there are and
  // "ambiguous_hour_interval" is the step between two adjacent candidates.
  // Progression values are reduced into [0, 23] (MOD 24). Call
  // GetPossibleHourValues() to obtain the full candidate list.
  uint8 ambiguous_hour_count = 0;
  uint8 ambiguous_hour_interval = 0;

  bool is_inferred = false;

  // Set by PerformRefinements to drop DateMatches that are overlapped,
  // duplicated, etc.
  bool is_removed = false;

  std::string DebugString() const;

  // Presence checks: a field counts as present when it differs from its
  // "none" sentinel.
  bool HasYear() const { return year != NO_VAL; }
  bool HasMonth() const { return month != NO_VAL; }
  bool HasDay() const { return day != NO_VAL; }
  bool HasDayOfWeek() const { return day_of_week != DayOfWeek_DOW_NONE; }
  bool HasBcAd() const { return bc_ad != BCAD_BCAD_NONE; }
  bool HasHour() const { return hour != NO_VAL; }
  bool HasMinute() const { return minute != NO_VAL; }
  bool HasSecond() const { return second != NO_VAL; }
  bool HasFractionSecond() const { return fraction_second != NO_VAL; }
  bool HasTimeSpanCode() const {
    return time_span_code != TimespanCode_TIMESPAN_CODE_NONE;
  }
  bool HasTimeZoneCode() const {
    return time_zone_code != TimezoneCode_TIMEZONE_CODE_NONE;
  }
  bool HasTimeZoneOffset() const {
    return time_zone_offset != std::numeric_limits<int16>::min();
  }
  bool HasRelativeDate() const { return relative_match != nullptr; }

  // The hour is ambiguous once there are at least two candidates.
  bool IsHourAmbiguous() const { return ambiguous_hour_count >= 2; }

  // True when an hour or a minute is present while day of week, day, month and
  // year are all absent.
  bool IsStandaloneTime() const {
    const bool has_clock_part = HasHour() || HasMinute();
    const bool has_calendar_part =
        HasDayOfWeek() || HasDay() || HasMonth() || HasYear();
    return has_clock_part && !has_calendar_part;
  }

  void SetAmbiguousHourProperties(uint8 count, uint8 interval) {
    ambiguous_hour_interval = interval;
    ambiguous_hour_count = count;
  }

  // Writes every possible hour value into "values", clearing the vector
  // first. Without an hour nothing is written; with an unambiguous hour the
  // single value (= .hour) is written. The order of the written values is not
  // guaranteed to be sorted.
  void GetPossibleHourValues(std::vector<int8>* values) const;

  int GetPriority() const { return priority; }
  float GetAnnotatorPriorityScore() const { return annotator_priority_score; }

  // True when the relative match is a standalone relative day of week and this
  // match has no date fields, no time fields and no time span code.
  bool IsStandaloneRelativeDayOfWeek() const {
    if (!HasRelativeDate()) {
      return false;
    }
    if (!relative_match->IsStandaloneRelativeDayOfWeek()) {
      return false;
    }
    return !(HasDateFields() || HasTimeFields() || HasTimeSpanCode());
  }

  // Any of: year, month, day, day of week, BC/AD.
  bool HasDateFields() const {
    if (HasYear() || HasMonth() || HasDay()) {
      return true;
    }
    return HasDayOfWeek() || HasBcAd();
  }

  // Any of: hour, minute, second, fraction of a second.
  bool HasTimeValueFields() const {
    if (HasHour() || HasMinute()) {
      return true;
    }
    return HasSecond() || HasFractionSecond();
  }

  bool HasTimeSpanFields() const { return HasTimeSpanCode(); }

  // Any of: time zone code, time zone offset.
  bool HasTimeZoneFields() const {
    return HasTimeZoneCode() || HasTimeZoneOffset();
  }

  // Any time value, time span or time zone field.
  bool HasTimeFields() const {
    if (HasTimeValueFields()) {
      return true;
    }
    return HasTimeSpanFields() || HasTimeZoneFields();
  }

  bool IsValid() const;

  // Overall relative qualifier of the DateMatch, e.g. "2 year ago" is 'PAST'
  // and "next week" is 'FUTURE'.
  DatetimeComponent::RelativeQualifier GetRelativeQualifier() const;

  // Returns the 'DatetimeComponent' for the given 'ComponentType'.
  Optional<DatetimeComponent> GetDatetimeComponent(
      const DatetimeComponent::ComponentType& component_type) const;

  void FillDatetimeComponents(
      std::vector<DatetimeComponent>* datetime_component) const;
};
// A matched date range, made of the matched "from" date and "to" date.
struct DateRangeMatch {
  int begin = -1;
  int end = -1;
  DateMatch from;
  DateMatch to;

  std::string DebugString() const;

  // The range's priority is the larger of its two endpoint priorities.
  int GetPriority() const {
    const int from_priority = from.GetPriority();
    const int to_priority = to.GetPriority();
    return std::max(from_priority, to_priority);
  }

  // The range's annotator score is the larger of its two endpoint scores.
  float GetAnnotatorPriorityScore() const {
    const float from_score = from.GetAnnotatorPriorityScore();
    const float to_score = to.GetAnnotatorPriorityScore();
    return std::max(from_score, to_score);
  }
};
} // namespace dates
} // namespace libtextclassifier3
#endif // LIBTEXTCLASSIFIER_ANNOTATOR_GRAMMAR_DATES_UTILS_DATE_MATCH_H_