| // Copyright 2020 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_GRAMMAR_DATES_UTILS_DATE_MATCH_H_ |
| #define LIBTEXTCLASSIFIER_ANNOTATOR_GRAMMAR_DATES_UTILS_DATE_MATCH_H_ |
| |
#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <limits>
#include <string>
#include <vector>

#include "annotator/grammar/dates/dates_generated.h"
#include "annotator/grammar/dates/timezone-code_generated.h"
#include "utils/grammar/match.h"
| |
| namespace libtextclassifier3 { |
| namespace dates { |
| |
// Sentinel stored in numeric match fields to mean "no value present".
static constexpr int NO_VAL{-1};
| |
| // POD match data structure. |
| struct MatchBase : public grammar::Match { |
| void Reset() { type = MatchType::MatchType_UNKNOWN; } |
| }; |
| |
| struct ExtractionMatch : public MatchBase { |
| const ExtractionRuleParameter* extraction_rule; |
| |
| void Reset() { |
| MatchBase::Reset(); |
| type = MatchType::MatchType_DATETIME_RULE; |
| extraction_rule = nullptr; |
| } |
| }; |
| |
| struct TermValueMatch : public MatchBase { |
| const TermValue* term_value; |
| |
| void Reset() { |
| MatchBase::Reset(); |
| type = MatchType::MatchType_TERM_VALUE; |
| term_value = nullptr; |
| } |
| }; |
| |
| struct NonterminalMatch : public MatchBase { |
| const NonterminalValue* nonterminal; |
| |
| void Reset() { |
| MatchBase::Reset(); |
| type = MatchType::MatchType_NONTERMINAL; |
| nonterminal = nullptr; |
| } |
| }; |
| |
| struct IntegerMatch : public NonterminalMatch { |
| int value; |
| int8 count_of_digits; // When expression is in digits format. |
| bool is_zero_prefixed; // When expression is in digits format. |
| |
| void Reset() { |
| NonterminalMatch::Reset(); |
| value = NO_VAL; |
| count_of_digits = 0; |
| is_zero_prefixed = false; |
| } |
| }; |
| |
| struct DigitsMatch : public IntegerMatch { |
| void Reset() { |
| IntegerMatch::Reset(); |
| type = MatchType::MatchType_DIGITS; |
| } |
| |
| static bool IsValid(int x) { return true; } |
| }; |
| |
| struct YearMatch : public IntegerMatch { |
| void Reset() { |
| IntegerMatch::Reset(); |
| type = MatchType::MatchType_YEAR; |
| } |
| |
| static bool IsValid(int x) { return x >= 1; } |
| }; |
| |
| struct MonthMatch : public IntegerMatch { |
| void Reset() { |
| IntegerMatch::Reset(); |
| type = MatchType::MatchType_MONTH; |
| } |
| |
| static bool IsValid(int x) { return (x >= 1 && x <= 12); } |
| }; |
| |
| struct DayMatch : public IntegerMatch { |
| void Reset() { |
| IntegerMatch::Reset(); |
| type = MatchType::MatchType_DAY; |
| } |
| |
| static bool IsValid(int x) { return (x >= 1 && x <= 31); } |
| }; |
| |
| struct HourMatch : public IntegerMatch { |
| void Reset() { |
| IntegerMatch::Reset(); |
| type = MatchType::MatchType_HOUR; |
| } |
| |
| static bool IsValid(int x) { return (x >= 0 && x <= 24); } |
| }; |
| |
| struct MinuteMatch : public IntegerMatch { |
| void Reset() { |
| IntegerMatch::Reset(); |
| type = MatchType::MatchType_MINUTE; |
| } |
| |
| static bool IsValid(int x) { return (x >= 0 && x <= 59); } |
| }; |
| |
| struct SecondMatch : public IntegerMatch { |
| void Reset() { |
| IntegerMatch::Reset(); |
| type = MatchType::MatchType_SECOND; |
| } |
| |
| static bool IsValid(int x) { return (x >= 0 && x <= 60); } |
| }; |
| |
| struct DecimalMatch : public NonterminalMatch { |
| double value; |
| int8 count_of_digits; // When expression is in digits format. |
| |
| void Reset() { |
| NonterminalMatch::Reset(); |
| value = NO_VAL; |
| count_of_digits = 0; |
| } |
| }; |
| |
| struct FractionSecondMatch : public DecimalMatch { |
| void Reset() { |
| DecimalMatch::Reset(); |
| type = MatchType::MatchType_FRACTION_SECOND; |
| } |
| |
| static bool IsValid(double x) { return (x >= 0.0 && x < 1.0); } |
| }; |
| |
| // CombinedIntegersMatch<N> is used for expressions containing multiple (up |
| // to N) matches of integers without delimeters between them (because |
| // CFG-grammar is based on tokenizer, it could not split a token into several |
| // pieces like using regular-expression). For example, "1130" contains "11" |
| // and "30" meaning November 30. |
| template <int N> |
| struct CombinedIntegersMatch : public NonterminalMatch { |
| enum { |
| SIZE = N, |
| }; |
| |
| int values[SIZE]; |
| int8 count_of_digits; // When expression is in digits format. |
| bool is_zero_prefixed; // When expression is in digits format. |
| |
| void Reset() { |
| NonterminalMatch::Reset(); |
| for (int i = 0; i < SIZE; ++i) { |
| values[i] = NO_VAL; |
| } |
| count_of_digits = 0; |
| is_zero_prefixed = false; |
| } |
| }; |
| |
| struct CombinedDigitsMatch : public CombinedIntegersMatch<6> { |
| enum Index { |
| INDEX_YEAR = 0, |
| INDEX_MONTH = 1, |
| INDEX_DAY = 2, |
| INDEX_HOUR = 3, |
| INDEX_MINUTE = 4, |
| INDEX_SECOND = 5, |
| }; |
| |
| bool HasYear() const { return values[INDEX_YEAR] != NO_VAL; } |
| bool HasMonth() const { return values[INDEX_MONTH] != NO_VAL; } |
| bool HasDay() const { return values[INDEX_DAY] != NO_VAL; } |
| bool HasHour() const { return values[INDEX_HOUR] != NO_VAL; } |
| bool HasMinute() const { return values[INDEX_MINUTE] != NO_VAL; } |
| bool HasSecond() const { return values[INDEX_SECOND] != NO_VAL; } |
| |
| int GetYear() const { return values[INDEX_YEAR]; } |
| int GetMonth() const { return values[INDEX_MONTH]; } |
| int GetDay() const { return values[INDEX_DAY]; } |
| int GetHour() const { return values[INDEX_HOUR]; } |
| int GetMinute() const { return values[INDEX_MINUTE]; } |
| int GetSecond() const { return values[INDEX_SECOND]; } |
| |
| void Reset() { |
| CombinedIntegersMatch<SIZE>::Reset(); |
| type = MatchType::MatchType_COMBINED_DIGITS; |
| } |
| |
| static bool IsValid(int i, int x) { |
| switch (i) { |
| case INDEX_YEAR: |
| return YearMatch::IsValid(x); |
| case INDEX_MONTH: |
| return MonthMatch::IsValid(x); |
| case INDEX_DAY: |
| return DayMatch::IsValid(x); |
| case INDEX_HOUR: |
| return HourMatch::IsValid(x); |
| case INDEX_MINUTE: |
| return MinuteMatch::IsValid(x); |
| case INDEX_SECOND: |
| return SecondMatch::IsValid(x); |
| default: |
| return false; |
| } |
| } |
| }; |
| |
| struct TimeValueMatch : public NonterminalMatch { |
| const HourMatch* hour_match; |
| const MinuteMatch* minute_match; |
| const SecondMatch* second_match; |
| const FractionSecondMatch* fraction_second_match; |
| |
| bool is_hour_zero_prefixed : 1; |
| bool is_minute_one_digit : 1; |
| bool is_second_one_digit : 1; |
| |
| int8 hour; |
| int8 minute; |
| int8 second; |
| double fraction_second; |
| |
| void Reset() { |
| NonterminalMatch::Reset(); |
| type = MatchType::MatchType_TIME_VALUE; |
| hour_match = nullptr; |
| minute_match = nullptr; |
| second_match = nullptr; |
| fraction_second_match = nullptr; |
| is_hour_zero_prefixed = false; |
| is_minute_one_digit = false; |
| is_second_one_digit = false; |
| hour = NO_VAL; |
| minute = NO_VAL; |
| second = NO_VAL; |
| fraction_second = NO_VAL; |
| } |
| }; |
| |
| struct TimeSpanMatch : public NonterminalMatch { |
| const TimeSpanSpec* time_span_spec; |
| TimespanCode time_span_code; |
| |
| void Reset() { |
| NonterminalMatch::Reset(); |
| type = MatchType::MatchType_TIME_SPAN; |
| time_span_spec = nullptr; |
| time_span_code = TimespanCode_TIMESPAN_CODE_NONE; |
| } |
| }; |
| |
| struct TimeZoneNameMatch : public NonterminalMatch { |
| const TimeZoneNameSpec* time_zone_name_spec; |
| TimezoneCode time_zone_code; |
| |
| void Reset() { |
| NonterminalMatch::Reset(); |
| type = MatchType::MatchType_TIME_ZONE_NAME; |
| time_zone_name_spec = nullptr; |
| time_zone_code = TimezoneCode_TIMEZONE_CODE_NONE; |
| } |
| }; |
| |
| struct TimeZoneOffsetMatch : public NonterminalMatch { |
| const TimeZoneOffsetParameter* time_zone_offset_param; |
| int16 time_zone_offset; |
| |
| void Reset() { |
| NonterminalMatch::Reset(); |
| type = MatchType::MatchType_TIME_ZONE_OFFSET; |
| time_zone_offset_param = nullptr; |
| time_zone_offset = 0; |
| } |
| }; |
| |
| struct DayOfWeekMatch : public IntegerMatch { |
| void Reset() { |
| IntegerMatch::Reset(); |
| type = MatchType::MatchType_DAY_OF_WEEK; |
| } |
| |
| static bool IsValid(int x) { |
| return (x > DayOfWeek_DOW_NONE && x <= DayOfWeek_MAX); |
| } |
| }; |
| |
| struct TimePeriodMatch : public NonterminalMatch { |
| int value; |
| |
| void Reset() { |
| NonterminalMatch::Reset(); |
| type = MatchType::MatchType_TIME_PERIOD; |
| value = NO_VAL; |
| } |
| }; |
| |
| struct RelativeMatch : public NonterminalMatch { |
| enum { |
| HAS_NONE = 0, |
| HAS_YEAR = 1 << 0, |
| HAS_MONTH = 1 << 1, |
| HAS_DAY = 1 << 2, |
| HAS_WEEK = 1 << 3, |
| HAS_HOUR = 1 << 4, |
| HAS_MINUTE = 1 << 5, |
| HAS_SECOND = 1 << 6, |
| HAS_DAY_OF_WEEK = 1 << 7, |
| HAS_IS_FUTURE = 1 << 31, |
| }; |
| uint32 existing; |
| |
| int year; |
| int month; |
| int day; |
| int week; |
| int hour; |
| int minute; |
| int second; |
| const NonterminalValue* day_of_week_nonterminal; |
| int8 day_of_week; |
| bool is_future_date; |
| |
| bool HasDay() const { return existing & HAS_DAY; } |
| |
| bool HasDayFields() const { return existing & (HAS_DAY | HAS_DAY_OF_WEEK); } |
| |
| bool HasTimeValueFields() const { |
| return existing & (HAS_HOUR | HAS_MINUTE | HAS_SECOND); |
| } |
| |
| bool IsStandaloneRelativeDayOfWeek() const { |
| return (existing & HAS_DAY_OF_WEEK) && (existing & ~HAS_DAY_OF_WEEK) == 0; |
| } |
| |
| void Reset() { |
| NonterminalMatch::Reset(); |
| type = MatchType::MatchType_RELATIVE_DATE; |
| existing = HAS_NONE; |
| year = NO_VAL; |
| month = NO_VAL; |
| day = NO_VAL; |
| week = NO_VAL; |
| hour = NO_VAL; |
| minute = NO_VAL; |
| second = NO_VAL; |
| day_of_week = NO_VAL; |
| is_future_date = false; |
| } |
| }; |
| |
| // This is not necessarily POD, it is used to keep the final matched result. |
| struct DateMatch { |
| // Sub-matches in the date match. |
| const YearMatch* year_match = nullptr; |
| const MonthMatch* month_match = nullptr; |
| const DayMatch* day_match = nullptr; |
| const DayOfWeekMatch* day_of_week_match = nullptr; |
| const TimeValueMatch* time_value_match = nullptr; |
| const TimeSpanMatch* time_span_match = nullptr; |
| const TimeZoneNameMatch* time_zone_name_match = nullptr; |
| const TimeZoneOffsetMatch* time_zone_offset_match = nullptr; |
| const RelativeMatch* relative_match = nullptr; |
| const CombinedDigitsMatch* combined_digits_match = nullptr; |
| |
| // [begin, end) indicates the Document position where the date or date range |
| // was found. |
| int begin = -1; |
| int end = -1; |
| int priority = 0; |
| float annotator_priority_score = 0.0; |
| |
| int year = NO_VAL; |
| int8 month = NO_VAL; |
| int8 day = NO_VAL; |
| DayOfWeek day_of_week = DayOfWeek_DOW_NONE; |
| BCAD bc_ad = BCAD_BCAD_NONE; |
| int8 hour = NO_VAL; |
| int8 minute = NO_VAL; |
| int8 second = NO_VAL; |
| double fraction_second = NO_VAL; |
| TimespanCode time_span_code = TimespanCode_TIMESPAN_CODE_NONE; |
| int time_zone_code = TimezoneCode_TIMEZONE_CODE_NONE; |
| int16 time_zone_offset = std::numeric_limits<int16>::min(); |
| |
| // Fields about ambiguous hours. These fields are used to interpret the |
| // possible values of ambiguous hours. Since all kinds of known ambiguities |
| // are in the form of arithmetic progression (starting from .hour field), |
| // we can use "ambiguous_hour_count" to denote the count of ambiguous hours, |
| // and use "ambiguous_hour_interval" to denote the distance between a pair |
| // of adjacent possible hours. Values in the arithmetic progression are |
| // shrunk into [0, 23] (MOD 24). One can use the GetPossibleHourValues() |
| // method for the complete list of possible hours. |
| uint8 ambiguous_hour_count = 0; |
| uint8 ambiguous_hour_interval = 0; |
| |
| bool is_inferred = false; |
| |
| // This field is set in function PerformRefinements to remove some DateMatch |
| // like overlapped, duplicated, etc. |
| bool is_removed = false; |
| |
| std::string DebugString() const; |
| |
| bool HasYear() const { return year != NO_VAL; } |
| bool HasMonth() const { return month != NO_VAL; } |
| bool HasDay() const { return day != NO_VAL; } |
| bool HasDayOfWeek() const { return day_of_week != DayOfWeek_DOW_NONE; } |
| bool HasBcAd() const { return bc_ad != BCAD_BCAD_NONE; } |
| bool HasHour() const { return hour != NO_VAL; } |
| bool HasMinute() const { return minute != NO_VAL; } |
| bool HasSecond() const { return second != NO_VAL; } |
| bool HasFractionSecond() const { return fraction_second != NO_VAL; } |
| bool HasTimeSpanCode() const { |
| return time_span_code != TimespanCode_TIMESPAN_CODE_NONE; |
| } |
| bool HasTimeZoneCode() const { |
| return time_zone_code != TimezoneCode_TIMEZONE_CODE_NONE; |
| } |
| bool HasTimeZoneOffset() const { |
| return time_zone_offset != std::numeric_limits<int16>::min(); |
| } |
| |
| bool HasRelativeDate() const { return relative_match != nullptr; } |
| |
| bool IsHourAmbiguous() const { return ambiguous_hour_count >= 2; } |
| |
| bool IsStandaloneTime() const { |
| return (HasHour() || HasMinute()) && !HasDayOfWeek() && !HasDay() && |
| !HasMonth() && !HasYear(); |
| } |
| |
| void SetAmbiguousHourProperties(uint8 count, uint8 interval) { |
| ambiguous_hour_count = count; |
| ambiguous_hour_interval = interval; |
| } |
| |
| // Outputs all the possible hour values. If current DateMatch does not |
| // contain an hour, nothing will be output. If the hour is not ambiguous, |
| // only one value (= .hour) will be output. This method clears the vector |
| // "values" first, and it is not guaranteed that the values in the vector |
| // are in a sorted order. |
| void GetPossibleHourValues(std::vector<int8>* values) const; |
| |
| int GetPriority() const { return priority; } |
| |
| float GetAnnotatorPriorityScore() const { return annotator_priority_score; } |
| |
| bool IsStandaloneRelativeDayOfWeek() const { |
| return (HasRelativeDate() && |
| relative_match->IsStandaloneRelativeDayOfWeek() && |
| !HasDateFields() && !HasTimeFields() && !HasTimeSpanCode()); |
| } |
| |
| bool HasDateFields() const { |
| return (HasYear() || HasMonth() || HasDay() || HasDayOfWeek() || HasBcAd()); |
| } |
| bool HasTimeValueFields() const { |
| return (HasHour() || HasMinute() || HasSecond() || HasFractionSecond()); |
| } |
| bool HasTimeSpanFields() const { return HasTimeSpanCode(); } |
| bool HasTimeZoneFields() const { |
| return (HasTimeZoneCode() || HasTimeZoneOffset()); |
| } |
| bool HasTimeFields() const { |
| return (HasTimeValueFields() || HasTimeSpanFields() || HasTimeZoneFields()); |
| } |
| |
| bool IsValid() const; |
| |
| // Overall relative qualifier of the DateMatch e.g. 2 year ago is 'PAST' and |
| // next week is 'FUTURE'. |
| DatetimeComponent::RelativeQualifier GetRelativeQualifier() const; |
| |
| // Getter method to get the 'DatetimeComponent' of given 'ComponentType'. |
| Optional<DatetimeComponent> GetDatetimeComponent( |
| const DatetimeComponent::ComponentType& component_type) const; |
| |
| void FillDatetimeComponents( |
| std::vector<DatetimeComponent>* datetime_component) const; |
| }; |
| |
| // Represent a matched date range which includes the from and to matched date. |
| struct DateRangeMatch { |
| int begin = -1; |
| int end = -1; |
| |
| DateMatch from; |
| DateMatch to; |
| |
| std::string DebugString() const; |
| |
| int GetPriority() const { |
| return std::max(from.GetPriority(), to.GetPriority()); |
| } |
| |
| float GetAnnotatorPriorityScore() const { |
| return std::max(from.GetAnnotatorPriorityScore(), |
| to.GetAnnotatorPriorityScore()); |
| } |
| }; |
| |
| } // namespace dates |
| } // namespace libtextclassifier3 |
| |
| #endif // LIBTEXTCLASSIFIER_ANNOTATOR_GRAMMAR_DATES_UTILS_DATE_MATCH_H_ |