| // Copyright 2020 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| #include "annotator/grammar/dates/parser.h" |
| |
| #include "annotator/grammar/dates/extractor.h" |
| #include "annotator/grammar/dates/utils/date-match.h" |
| #include "annotator/grammar/dates/utils/date-utils.h" |
| #include "utils/base/integral_types.h" |
| #include "utils/base/logging.h" |
| #include "utils/base/macros.h" |
| #include "utils/grammar/lexer.h" |
| #include "utils/grammar/matcher.h" |
| #include "utils/grammar/rules_generated.h" |
| #include "utils/grammar/types.h" |
| #include "utils/strings/split.h" |
| #include "utils/strings/stringpiece.h" |
| |
| namespace libtextclassifier3::dates { |
| namespace { |
| |
| // Helper methods to validate individual components from a date match. |
| |
| // Checks the validation requirement of a rule against a match. |
| // For example if the rule asks for `SPELLED_MONTH`, then we check that the |
| // match has the right flag. |
| bool CheckMatchValidationAndFlag( |
| const grammar::Match* match, const ExtractionRuleParameter* rule, |
| const ExtractionRuleParameter_::ExtractionValidation validation, |
| const NonterminalParameter_::Flag flag) { |
| if (rule == nullptr || (rule->validation() & validation) == 0) { |
| // No validation requirement. |
| return true; |
| } |
| const NonterminalParameter* nonterminal_parameter = |
| static_cast<const NonterminalMatch*>(match) |
| ->nonterminal->nonterminal_parameter(); |
| return (nonterminal_parameter != nullptr && |
| (nonterminal_parameter->flag() & flag) != 0); |
| } |
| |
| bool GenerateDate(const ExtractionRuleParameter* rule, |
| const grammar::Match* match, DateMatch* date) { |
| bool is_valid = true; |
| |
| // Post check and assign date components. |
| grammar::Traverse(match, [rule, date, &is_valid](const grammar::Match* node) { |
| switch (node->type) { |
| case MatchType_YEAR: { |
| if (CheckMatchValidationAndFlag( |
| node, rule, |
| ExtractionRuleParameter_::ExtractionValidation_SPELLED_YEAR, |
| NonterminalParameter_::Flag_IS_SPELLED)) { |
| date->year_match = static_cast<const YearMatch*>(node); |
| date->year = date->year_match->value; |
| } else { |
| is_valid = false; |
| } |
| break; |
| } |
| case MatchType_MONTH: { |
| if (CheckMatchValidationAndFlag( |
| node, rule, |
| ExtractionRuleParameter_::ExtractionValidation_SPELLED_MONTH, |
| NonterminalParameter_::Flag_IS_SPELLED)) { |
| date->month_match = static_cast<const MonthMatch*>(node); |
| date->month = date->month_match->value; |
| } else { |
| is_valid = false; |
| } |
| break; |
| } |
| case MatchType_DAY: { |
| if (CheckMatchValidationAndFlag( |
| node, rule, |
| ExtractionRuleParameter_::ExtractionValidation_SPELLED_DAY, |
| NonterminalParameter_::Flag_IS_SPELLED)) { |
| date->day_match = static_cast<const DayMatch*>(node); |
| date->day = date->day_match->value; |
| } else { |
| is_valid = false; |
| } |
| break; |
| } |
| case MatchType_DAY_OF_WEEK: { |
| date->day_of_week_match = static_cast<const DayOfWeekMatch*>(node); |
| date->day_of_week = |
| static_cast<DayOfWeek>(date->day_of_week_match->value); |
| break; |
| } |
| case MatchType_TIME_VALUE: { |
| date->time_value_match = static_cast<const TimeValueMatch*>(node); |
| date->hour = date->time_value_match->hour; |
| date->minute = date->time_value_match->minute; |
| date->second = date->time_value_match->second; |
| date->fraction_second = date->time_value_match->fraction_second; |
| return false; |
| } |
| case MatchType_TIME_SPAN: { |
| date->time_span_match = static_cast<const TimeSpanMatch*>(node); |
| date->time_span_code = date->time_span_match->time_span_code; |
| return false; |
| } |
| case MatchType_TIME_ZONE_NAME: { |
| date->time_zone_name_match = |
| static_cast<const TimeZoneNameMatch*>(node); |
| date->time_zone_code = date->time_zone_name_match->time_zone_code; |
| return false; |
| } |
| case MatchType_TIME_ZONE_OFFSET: { |
| date->time_zone_offset_match = |
| static_cast<const TimeZoneOffsetMatch*>(node); |
| date->time_zone_offset = date->time_zone_offset_match->time_zone_offset; |
| return false; |
| } |
| case MatchType_RELATIVE_DATE: { |
| date->relative_match = static_cast<const RelativeMatch*>(node); |
| return false; |
| } |
| case MatchType_COMBINED_DIGITS: { |
| date->combined_digits_match = |
| static_cast<const CombinedDigitsMatch*>(node); |
| if (date->combined_digits_match->HasYear()) { |
| date->year = date->combined_digits_match->GetYear(); |
| } |
| if (date->combined_digits_match->HasMonth()) { |
| date->month = date->combined_digits_match->GetMonth(); |
| } |
| if (date->combined_digits_match->HasDay()) { |
| date->day = date->combined_digits_match->GetDay(); |
| } |
| if (date->combined_digits_match->HasHour()) { |
| date->hour = date->combined_digits_match->GetHour(); |
| } |
| if (date->combined_digits_match->HasMinute()) { |
| date->minute = date->combined_digits_match->GetMinute(); |
| } |
| if (date->combined_digits_match->HasSecond()) { |
| date->second = date->combined_digits_match->GetSecond(); |
| } |
| return false; |
| } |
| default: |
| // Expand node further. |
| return true; |
| } |
| |
| return false; |
| }); |
| |
| if (is_valid) { |
| date->begin = match->codepoint_span.first; |
| date->end = match->codepoint_span.second; |
| date->priority = rule ? rule->priority_delta() : 0; |
| date->annotator_priority_score = |
| rule ? rule->annotator_priority_score() : 0.0; |
| } |
| return is_valid; |
| } |
| |
| bool GenerateFromOrToDateRange(const grammar::Match* match, DateMatch* date) { |
| return GenerateDate( |
| /*rule=*/( |
| match->type == MatchType_DATETIME |
| ? static_cast<const ExtractionMatch*>(match)->extraction_rule |
| : nullptr), |
| match, date); |
| } |
| |
| bool GenerateDateRange(const grammar::Match* match, const grammar::Match* from, |
| const grammar::Match* to, DateRangeMatch* date_range) { |
| if (!GenerateFromOrToDateRange(from, &date_range->from)) { |
| TC3_LOG(WARNING) << "Failed to generate date for `from`."; |
| return false; |
| } |
| if (!GenerateFromOrToDateRange(to, &date_range->to)) { |
| TC3_LOG(WARNING) << "Failed to generate date for `to`."; |
| return false; |
| } |
| date_range->begin = match->codepoint_span.first; |
| date_range->end = match->codepoint_span.second; |
| return true; |
| } |
| |
| bool NormalizeHour(DateMatch* date) { |
| if (date->time_span_match == nullptr) { |
| // Nothing to do. |
| return true; |
| } |
| return NormalizeHourByTimeSpan(date->time_span_match->time_span_spec, date); |
| } |
| |
| void CheckAndSetAmbiguousHour(DateMatch* date) { |
| if (date->HasHour()) { |
| // Use am-pm ambiguity as default. |
| if (!date->HasTimeSpanCode() && date->hour >= 1 && date->hour <= 12 && |
| !(date->time_value_match != nullptr && |
| date->time_value_match->hour_match != nullptr && |
| date->time_value_match->hour_match->is_zero_prefixed)) { |
| date->SetAmbiguousHourProperties(2, 12); |
| } |
| } |
| } |
| |
| // Normalizes a date candidate. |
| // Returns whether the candidate was successfully normalized. |
| bool NormalizeDate(DateMatch* date) { |
| // Normalize hour. |
| if (!NormalizeHour(date)) { |
| TC3_VLOG(ERROR) << "Hour normalization (according to time-span) failed." |
| << date->DebugString(); |
| return false; |
| } |
| CheckAndSetAmbiguousHour(date); |
| if (!date->IsValid()) { |
| TC3_VLOG(ERROR) << "Fields inside date instance are ill-formed " |
| << date->DebugString(); |
| } |
| return true; |
| } |
| |
| // Copies the field from one DateMatch to another whose field is null. for |
| // example: if the from is "May 1, 8pm", and the to is "9pm", "May 1" will be |
| // copied to "to". Now we only copy fields for date range requirement.fv |
| void CopyFieldsForDateMatch(const DateMatch& from, DateMatch* to) { |
| if (from.time_span_match != nullptr && to->time_span_match == nullptr) { |
| to->time_span_match = from.time_span_match; |
| to->time_span_code = from.time_span_code; |
| } |
| if (from.month_match != nullptr && to->month_match == nullptr) { |
| to->month_match = from.month_match; |
| to->month = from.month; |
| } |
| } |
| |
| // Normalizes a date range candidate. |
| // Returns whether the date range was successfully normalized. |
| bool NormalizeDateRange(DateRangeMatch* date_range) { |
| CopyFieldsForDateMatch(date_range->from, &date_range->to); |
| CopyFieldsForDateMatch(date_range->to, &date_range->from); |
| return (NormalizeDate(&date_range->from) && NormalizeDate(&date_range->to)); |
| } |
| |
| bool CheckDate(const DateMatch& date, const ExtractionRuleParameter* rule) { |
| // It's possible that "time_zone_name_match == NULL" when |
| // "HasTimeZoneCode() == true", or "time_zone_offset_match == NULL" when |
| // "HasTimeZoneOffset() == true" due to inference between endpoints, so we |
| // must check if they really exist before using them. |
| if (date.HasTimeZoneOffset()) { |
| if (date.HasTimeZoneCode()) { |
| if (date.time_zone_name_match != nullptr) { |
| TC3_CHECK(date.time_zone_name_match->time_zone_name_spec != nullptr); |
| const TimeZoneNameSpec* spec = |
| date.time_zone_name_match->time_zone_name_spec; |
| if (!spec->is_utc()) { |
| return false; |
| } |
| if (!spec->is_abbreviation()) { |
| return false; |
| } |
| } |
| } else if (date.time_zone_offset_match != nullptr) { |
| TC3_CHECK(date.time_zone_offset_match->time_zone_offset_param != nullptr); |
| const TimeZoneOffsetParameter* param = |
| date.time_zone_offset_match->time_zone_offset_param; |
| if (param->format() == TimeZoneOffsetParameter_::Format_FORMAT_H || |
| param->format() == TimeZoneOffsetParameter_::Format_FORMAT_HH) { |
| return false; |
| } |
| if (!(rule->validation() & |
| ExtractionRuleParameter_:: |
| ExtractionValidation_ALLOW_UNCONFIDENT_TIME_ZONE)) { |
| if (param->format() == TimeZoneOffsetParameter_::Format_FORMAT_H_MM || |
| param->format() == TimeZoneOffsetParameter_::Format_FORMAT_HH_MM || |
| param->format() == TimeZoneOffsetParameter_::Format_FORMAT_HMM) { |
| return false; |
| } |
| } |
| } |
| } |
| |
| // Case: 1 April could be extracted as year 1, month april. |
| // We simply remove this case. |
| if (!date.HasBcAd() && date.year_match != nullptr && date.year < 1000) { |
| // We allow case like 11/5/01 |
| if (date.HasMonth() && date.HasDay() && |
| date.year_match->count_of_digits == 2) { |
| } else { |
| return false; |
| } |
| } |
| |
| // Ignore the date if the year is larger than 9999 (The maximum number of 4 |
| // digits). |
| if (date.year_match != nullptr && date.year > 9999) { |
| TC3_VLOG(ERROR) << "Year is greater than 9999."; |
| return false; |
| } |
| |
| // Case: spelled may could be month 5, it also used very common as modal |
| // verbs. We ignore spelled may as month. |
| if ((rule->validation() & |
| ExtractionRuleParameter_::ExtractionValidation_SPELLED_MONTH) && |
| date.month == 5 && !date.HasYear() && !date.HasDay()) { |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool CheckContext(const std::vector<UnicodeText::const_iterator>& text, |
| const DateExtractor::Output& output) { |
| const uint32 validation = output.rule->validation(); |
| |
| // Nothing to check if we don't have any validation requirements for the |
| // span boundaries. |
| if ((validation & |
| (ExtractionRuleParameter_::ExtractionValidation_LEFT_BOUND | |
| ExtractionRuleParameter_::ExtractionValidation_RIGHT_BOUND)) == 0) { |
| return true; |
| } |
| |
| const int begin = output.match->codepoint_span.first; |
| const int end = output.match->codepoint_span.second; |
| |
| // So far, we only check that the adjacent character cannot be a separator, |
| // like /, - or . |
| if ((validation & |
| ExtractionRuleParameter_::ExtractionValidation_LEFT_BOUND) != 0) { |
| if (begin > 0 && (*text[begin - 1] == '/' || *text[begin - 1] == '-' || |
| *text[begin - 1] == ':')) { |
| return false; |
| } |
| } |
| if ((validation & |
| ExtractionRuleParameter_::ExtractionValidation_RIGHT_BOUND) != 0) { |
| // Last valid codepoint is at text.size() - 2 as we added the end position |
| // of text for easier span extraction. |
| if (end < text.size() - 1 && |
| (*text[end] == '/' || *text[end] == '-' || *text[end] == ':')) { |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| // Validates a date match. Returns true if the candidate is valid. |
| bool ValidateDate(const std::vector<UnicodeText::const_iterator>& text, |
| const DateExtractor::Output& output, const DateMatch& date) { |
| if (!CheckDate(date, output.rule)) { |
| return false; |
| } |
| if (!CheckContext(text, output)) { |
| return false; |
| } |
| return true; |
| } |
| |
| // Builds matched date instances from the grammar output. |
| std::vector<DateMatch> BuildDateMatches( |
| const std::vector<UnicodeText::const_iterator>& text, |
| const std::vector<DateExtractor::Output>& outputs) { |
| std::vector<DateMatch> result; |
| for (const DateExtractor::Output& output : outputs) { |
| DateMatch date; |
| if (GenerateDate(output.rule, output.match, &date)) { |
| if (!NormalizeDate(&date)) { |
| continue; |
| } |
| if (!ValidateDate(text, output, date)) { |
| continue; |
| } |
| result.push_back(date); |
| } |
| } |
| return result; |
| } |
| |
| // Builds matched date range instances from the grammar output. |
| std::vector<DateRangeMatch> BuildDateRangeMatches( |
| const std::vector<UnicodeText::const_iterator>& text, |
| const std::vector<DateExtractor::RangeOutput>& range_outputs) { |
| std::vector<DateRangeMatch> result; |
| for (const DateExtractor::RangeOutput& range_output : range_outputs) { |
| DateRangeMatch date_range; |
| if (GenerateDateRange(range_output.match, range_output.from, |
| range_output.to, &date_range)) { |
| if (!NormalizeDateRange(&date_range)) { |
| continue; |
| } |
| result.push_back(date_range); |
| } |
| } |
| return result; |
| } |
| |
| template <typename T> |
| void RemoveDeletedMatches(const std::vector<bool>& removed, |
| std::vector<T>* matches) { |
| int input = 0; |
| for (int next = 0; next < matches->size(); ++next) { |
| if (removed[next]) { |
| continue; |
| } |
| if (input != next) { |
| (*matches)[input] = (*matches)[next]; |
| } |
| input++; |
| } |
| matches->resize(input); |
| } |
| |
| // Removes duplicated date or date range instances. |
| // Overlapping date and date ranges are not considered here. |
| template <typename T> |
| void RemoveDuplicatedDates(std::vector<T>* matches) { |
| // Assumption: matches are sorted ascending by (begin, end). |
| std::vector<bool> removed(matches->size(), false); |
| for (int i = 0; i < matches->size(); i++) { |
| if (removed[i]) { |
| continue; |
| } |
| const T& candidate = matches->at(i); |
| for (int j = i + 1; j < matches->size(); j++) { |
| if (removed[j]) { |
| continue; |
| } |
| const T& next = matches->at(j); |
| |
| // Not overlapping. |
| if (next.begin >= candidate.end) { |
| break; |
| } |
| |
| // If matching the same span of text, then check the priority. |
| if (candidate.begin == next.begin && candidate.end == next.end) { |
| if (candidate.GetPriority() < next.GetPriority()) { |
| removed[i] = true; |
| break; |
| } else { |
| removed[j] = true; |
| continue; |
| } |
| } |
| |
| // Checks if `next` is fully covered by fields of `candidate`. |
| if (next.end <= candidate.end) { |
| removed[j] = true; |
| continue; |
| } |
| |
| // Checks whether `candidate`/`next` is a refinement. |
| if (IsRefinement(candidate, next)) { |
| removed[j] = true; |
| continue; |
| } else if (IsRefinement(next, candidate)) { |
| removed[i] = true; |
| break; |
| } |
| } |
| } |
| RemoveDeletedMatches(removed, matches); |
| } |
| |
| // Filters out simple overtriggering simple matches. |
| bool IsBlacklistedDate(const UniLib& unilib, |
| const std::vector<UnicodeText::const_iterator>& text, |
| const DateMatch& match) { |
| const int begin = match.begin; |
| const int end = match.end; |
| if (end - begin != 3) { |
| return false; |
| } |
| |
| std::string text_lower = |
| unilib |
| .ToLowerText( |
| UTF8ToUnicodeText(text[begin].utf8_data(), |
| text[end].utf8_data() - text[begin].utf8_data(), |
| /*do_copy=*/false)) |
| .ToUTF8String(); |
| |
| // "sun" is not a good abbreviation for a standalone day of the week. |
| if (match.IsStandaloneRelativeDayOfWeek() && |
| (text_lower == "sun" || text_lower == "mon")) { |
| return true; |
| } |
| |
| // "mar" is not a good abbreviation for single month. |
| if (match.HasMonth() && text_lower == "mar") { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| // Checks if two date matches are adjacent and mergeable. |
| bool AreDateMatchesAdjacentAndMergeable( |
| const UniLib& unilib, const std::vector<UnicodeText::const_iterator>& text, |
| const std::vector<std::string>& ignored_spans, const DateMatch& prev, |
| const DateMatch& next) { |
| // Check the context between the two matches. |
| if (next.begin <= prev.end) { |
| // The two matches are not adjacent. |
| return false; |
| } |
| UnicodeText span; |
| for (int i = prev.end; i < next.begin; i++) { |
| const char32 codepoint = *text[i]; |
| if (unilib.IsWhitespace(codepoint)) { |
| continue; |
| } |
| span.push_back(unilib.ToLower(codepoint)); |
| } |
| if (span.empty()) { |
| return true; |
| } |
| const std::string span_text = span.ToUTF8String(); |
| bool matched = false; |
| for (const std::string& ignored_span : ignored_spans) { |
| if (span_text == ignored_span) { |
| matched = true; |
| break; |
| } |
| } |
| if (!matched) { |
| return false; |
| } |
| return IsDateMatchMergeable(prev, next); |
| } |
| |
| // Merges adjacent date and date range. |
| // For e.g. Monday, 5-10pm, the date "Monday" and the time range "5-10pm" will |
| // be merged |
| void MergeDateRangeAndDate(const UniLib& unilib, |
| const std::vector<UnicodeText::const_iterator>& text, |
| const std::vector<std::string>& ignored_spans, |
| const std::vector<DateMatch>& dates, |
| std::vector<DateRangeMatch>* date_ranges) { |
| // For each range, check the date before or after the it to see if they could |
| // be merged. Both the range and date array are sorted, so we only need to |
| // scan the date array once. |
| int next_date = 0; |
| for (int i = 0; i < date_ranges->size(); i++) { |
| DateRangeMatch* date_range = &date_ranges->at(i); |
| // So far we only merge time range with a date. |
| if (!date_range->from.HasHour()) { |
| continue; |
| } |
| |
| for (; next_date < dates.size(); next_date++) { |
| const DateMatch& date = dates[next_date]; |
| |
| // If the range is before the date, we check whether `date_range->to` can |
| // be merged with the date. |
| if (date_range->end <= date.begin) { |
| DateMatch merged_date = date; |
| if (AreDateMatchesAdjacentAndMergeable(unilib, text, ignored_spans, |
| date_range->to, date)) { |
| MergeDateMatch(date_range->to, &merged_date, /*update_span=*/true); |
| date_range->to = merged_date; |
| date_range->end = date_range->to.end; |
| MergeDateMatch(date, &date_range->from, /*update_span=*/false); |
| next_date++; |
| |
| // Check the second date after the range to see if it could be merged |
| // further. For example: 10-11pm, Monday, May 15. 10-11pm is merged |
| // with Monday and then we check that it could be merged with May 15 |
| // as well. |
| if (next_date < dates.size()) { |
| DateMatch next_match = dates[next_date]; |
| if (AreDateMatchesAdjacentAndMergeable( |
| unilib, text, ignored_spans, date_range->to, next_match)) { |
| MergeDateMatch(date_range->to, &next_match, /*update_span=*/true); |
| date_range->to = next_match; |
| date_range->end = date_range->to.end; |
| MergeDateMatch(dates[next_date], &date_range->from, |
| /*update_span=*/false); |
| next_date++; |
| } |
| } |
| } |
| // Since the range is before the date, we try to check if the next range |
| // could be merged with the current date. |
| break; |
| } else if (date_range->end > date.end && date_range->begin > date.begin) { |
| // If the range is after the date, we check if `date_range.from` can be |
| // merged with the date. Here is a special case, the date before range |
| // could be partially overlapped. This is because the range.from could |
| // be extracted as year in date. For example: March 3, 10-11pm is |
| // extracted as date March 3, 2010 and the range 10-11pm. In this |
| // case, we simply clear the year from date. |
| DateMatch merged_date = date; |
| if (date.HasYear() && |
| date.year_match->codepoint_span.second > date_range->begin) { |
| merged_date.year_match = nullptr; |
| merged_date.year = NO_VAL; |
| merged_date.end = date.year_match->match_offset; |
| } |
| // Check and merge the range and the date before the range. |
| if (AreDateMatchesAdjacentAndMergeable(unilib, text, ignored_spans, |
| merged_date, date_range->from)) { |
| MergeDateMatch(merged_date, &date_range->from, /*update_span=*/true); |
| date_range->begin = date_range->from.begin; |
| MergeDateMatch(merged_date, &date_range->to, /*update_span=*/false); |
| |
| // Check if the second date before the range can be merged as well. |
| if (next_date > 0) { |
| DateMatch prev_match = dates[next_date - 1]; |
| if (prev_match.end <= date_range->from.begin) { |
| if (AreDateMatchesAdjacentAndMergeable(unilib, text, |
| ignored_spans, prev_match, |
| date_range->from)) { |
| MergeDateMatch(prev_match, &date_range->from, |
| /*update_span=*/true); |
| date_range->begin = date_range->from.begin; |
| MergeDateMatch(prev_match, &date_range->to, |
| /*update_span=*/false); |
| } |
| } |
| } |
| next_date++; |
| break; |
| } else { |
| // Since the date is before the date range, we move to the next date |
| // to check if it could be merged with the current range. |
| continue; |
| } |
| } else { |
| // The date is either fully overlapped by the date range or the date |
| // span end is after the date range. Move to the next date in both |
| // cases. |
| } |
| } |
| } |
| } |
| |
| // Removes the dates which are part of a range. e.g. in "May 1 - 3", the date |
| // "May 1" is fully contained in the range. |
| void RemoveOverlappedDateByRange(const std::vector<DateRangeMatch>& ranges, |
| std::vector<DateMatch>* dates) { |
| int next_date = 0; |
| std::vector<bool> removed(dates->size(), false); |
| for (int i = 0; i < ranges.size(); ++i) { |
| const auto& range = ranges[i]; |
| for (; next_date < dates->size(); ++next_date) { |
| const auto& date = dates->at(next_date); |
| // So far we don't touch the partially overlapped case. |
| if (date.begin >= range.begin && date.end <= range.end) { |
| // Fully contained. |
| removed[next_date] = true; |
| } else if (date.end <= range.begin) { |
| continue; // date is behind range, go to next date |
| } else if (date.begin >= range.end) { |
| break; // range is behind date, go to next range |
| } |
| } |
| } |
| RemoveDeletedMatches(removed, dates); |
| } |
| |
| // Converts candidate dates and date ranges. |
| void FillDateInstances( |
| const UniLib& unilib, const std::vector<UnicodeText::const_iterator>& text, |
| const DateAnnotationOptions& options, std::vector<DateMatch>* date_matches, |
| std::vector<DatetimeParseResultSpan>* datetime_parse_result_spans) { |
| int i = 0; |
| for (int j = 1; j < date_matches->size(); j++) { |
| if (options.merge_adjacent_components && |
| AreDateMatchesAdjacentAndMergeable(unilib, text, options.ignored_spans, |
| date_matches->at(i), |
| date_matches->at(j))) { |
| MergeDateMatch(date_matches->at(i), &date_matches->at(j), true); |
| } else { |
| if (!IsBlacklistedDate(unilib, text, date_matches->at(i))) { |
| DatetimeParseResultSpan datetime_parse_result_span; |
| FillDateInstance(date_matches->at(i), &datetime_parse_result_span); |
| datetime_parse_result_spans->push_back(datetime_parse_result_span); |
| } |
| } |
| i = j; |
| } |
| if (!IsBlacklistedDate(unilib, text, date_matches->at(i))) { |
| DatetimeParseResultSpan datetime_parse_result_span; |
| FillDateInstance(date_matches->at(i), &datetime_parse_result_span); |
| datetime_parse_result_spans->push_back(datetime_parse_result_span); |
| } |
| } |
| |
| void FillDateRangeInstances( |
| const std::vector<DateRangeMatch>& date_range_matches, |
| std::vector<DatetimeParseResultSpan>* datetime_parse_result_spans) { |
| for (const DateRangeMatch& date_range_match : date_range_matches) { |
| DatetimeParseResultSpan datetime_parse_result_span; |
| FillDateRangeInstance(date_range_match, &datetime_parse_result_span); |
| datetime_parse_result_spans->push_back(datetime_parse_result_span); |
| } |
| } |
| |
| // Fills `DatetimeParseResultSpan` from `DateMatch` and `DateRangeMatch` |
| // instances. |
| std::vector<DatetimeParseResultSpan> GetOutputAsAnnotationList( |
| const UniLib& unilib, const DateExtractor& extractor, |
| const std::vector<UnicodeText::const_iterator>& text, |
| const DateAnnotationOptions& options) { |
| std::vector<DatetimeParseResultSpan> datetime_parse_result_spans; |
| std::vector<DateMatch> date_matches = |
| BuildDateMatches(text, extractor.output()); |
| |
| std::sort( |
| date_matches.begin(), date_matches.end(), |
| // Order by increasing begin, and decreasing end (decreasing length). |
| [](const DateMatch& a, const DateMatch& b) { |
| return (a.begin < b.begin || (a.begin == b.begin && a.end > b.end)); |
| }); |
| |
| if (!date_matches.empty()) { |
| RemoveDuplicatedDates(&date_matches); |
| } |
| |
| if (options.enable_date_range) { |
| std::vector<DateRangeMatch> date_range_matches = |
| BuildDateRangeMatches(text, extractor.range_output()); |
| |
| if (!date_range_matches.empty()) { |
| std::sort( |
| date_range_matches.begin(), date_range_matches.end(), |
| // Order by increasing begin, and decreasing end (decreasing length). |
| [](const DateRangeMatch& a, const DateRangeMatch& b) { |
| return (a.begin < b.begin || (a.begin == b.begin && a.end > b.end)); |
| }); |
| RemoveDuplicatedDates(&date_range_matches); |
| } |
| |
| if (!date_matches.empty()) { |
| MergeDateRangeAndDate(unilib, text, options.ignored_spans, date_matches, |
| &date_range_matches); |
| RemoveOverlappedDateByRange(date_range_matches, &date_matches); |
| } |
| FillDateRangeInstances(date_range_matches, &datetime_parse_result_spans); |
| } |
| |
| if (!date_matches.empty()) { |
| FillDateInstances(unilib, text, options, &date_matches, |
| &datetime_parse_result_spans); |
| } |
| return datetime_parse_result_spans; |
| } |
| |
| } // namespace |
| |
| std::vector<DatetimeParseResultSpan> DateParser::Parse( |
| StringPiece text, const std::vector<Token>& tokens, |
| const std::vector<Locale>& locales, |
| const DateAnnotationOptions& options) const { |
| std::vector<UnicodeText::const_iterator> codepoint_offsets; |
| const UnicodeText text_unicode = UTF8ToUnicodeText(text, |
| /*do_copy=*/false); |
| for (auto it = text_unicode.begin(); it != text_unicode.end(); it++) { |
| codepoint_offsets.push_back(it); |
| } |
| codepoint_offsets.push_back(text_unicode.end()); |
| DateExtractor extractor(codepoint_offsets, options, datetime_rules_); |
| // Select locale matching rules. |
| // Only use a shard if locales match or the shard doesn't specify a locale |
| // restriction. |
| std::vector<const grammar::RulesSet_::Rules*> locale_rules = |
| SelectLocaleMatchingShards(datetime_rules_->rules(), rules_locales_, |
| locales); |
| if (locale_rules.empty()) { |
| return {}; |
| } |
| grammar::Matcher matcher(&unilib_, datetime_rules_->rules(), locale_rules, |
| &extractor); |
| lexer_.Process(text_unicode, tokens, /*annotations=*/nullptr, &matcher); |
| return GetOutputAsAnnotationList(unilib_, extractor, codepoint_offsets, |
| options); |
| } |
| |
| } // namespace libtextclassifier3::dates |