| // Copyright 2020 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| // A token based context-free grammar matcher. |
| // |
| // A parser passes token to the matcher: literal terminal strings and token |
| // types. |
| // The parser passes each token along with the [begin, end) position range |
| // in which it occurs. So for an input string "Groundhog February 2, 2007", the |
| // parser would tell the matcher that: |
| // |
| // "Groundhog" occurs at [0, 9) |
| // "February" occurs at [9, 18) |
| // <digits> occurs at [18, 20) |
| // "," occurs at [20, 21) |
| // <digits> occurs at [21, 26) |
| // |
| // Multiple overlapping symbols can be passed. |
| // The only constraint on symbol order is that they have to be passed in |
| // left-to-right order, strictly speaking, their "end" positions must be |
| // nondecreasing. This constraint allows a more efficient matching algorithm. |
| // The "begin" positions can be in any order. |
| |
| #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_ |
| #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_ |
| |
| #include <array> |
| #include <functional> |
| #include <vector> |
| |
| #include "annotator/types.h" |
| #include "utils/base/arena.h" |
| #include "utils/grammar/parsing/chart.h" |
| #include "utils/grammar/parsing/derivation.h" |
| #include "utils/grammar/parsing/parse-tree.h" |
| #include "utils/grammar/rules_generated.h" |
| #include "utils/strings/stringpiece.h" |
| #include "utils/utf8/unilib.h" |
| |
| namespace libtextclassifier3::grammar { |
| |
| class Matcher { |
| public: |
| explicit Matcher(const UniLib* unilib, const RulesSet* rules, |
| const std::vector<const RulesSet_::Rules*> rules_shards, |
| UnsafeArena* arena) |
| : unilib_(*unilib), |
| arena_(arena), |
| last_end_(std::numeric_limits<int>().lowest()), |
| rules_(rules), |
| rules_shards_(rules_shards), |
| pending_items_(nullptr), |
| pending_exclusion_items_(nullptr) { |
| TC3_CHECK_NE(rules, nullptr); |
| } |
| |
| explicit Matcher(const UniLib* unilib, const RulesSet* rules, |
| UnsafeArena* arena) |
| : Matcher(unilib, rules, {}, arena) { |
| rules_shards_.reserve(rules->rules()->size()); |
| rules_shards_.insert(rules_shards_.end(), rules->rules()->begin(), |
| rules->rules()->end()); |
| } |
| |
| // Finish the matching. |
| void Finish(); |
| |
| // Tells the matcher that the given terminal was found occupying position |
| // range [begin, end) in the input. |
| // The matcher may invoke callback functions before returning, if this |
| // terminal triggers any new matches for rules in the grammar. |
| // Calls to AddTerminal() and AddParseTree() must be in left-to-right order, |
| // that is, the sequence of `end` values must be non-decreasing. |
| void AddTerminal(const CodepointSpan codepoint_span, const int match_offset, |
| StringPiece terminal); |
| void AddTerminal(const CodepointIndex begin, const CodepointIndex end, |
| StringPiece terminal) { |
| AddTerminal(CodepointSpan{begin, end}, begin, terminal); |
| } |
| |
| // Adds predefined parse tree. |
| void AddParseTree(ParseTree* parse_tree); |
| |
| const Chart<> chart() const { return chart_; } |
| |
| private: |
| // Process matches from lhs set. |
| void ExecuteLhsSet(const CodepointSpan codepoint_span, const int match_offset, |
| const int whitespace_gap, |
| const std::function<void(ParseTree*)>& initializer_fn, |
| const RulesSet_::LhsSet* lhs_set); |
| |
| // Queues a newly created match item. |
| void QueueForProcessing(ParseTree* item); |
| |
| // Queues a match item for later post checking of the exclusion condition. |
| // For exclusions we need to check that the `item->excluded_nonterminal` |
| // doesn't match the same span. As we cannot know which matches have already |
| // been added, we queue the item for later post checking - once all matches |
| // up to `item->codepoint_span.second` have been added. |
| void QueueForPostCheck(ExclusionNode* item); |
| |
| // Adds pending items to the chart, possibly generating new matches as a |
| // result. |
| void ProcessPendingSet(); |
| |
| // Checks all pending exclusion matches that their exclusion condition is |
| // fulfilled. |
| void ProcessPendingExclusionMatches(); |
| |
| UniLib unilib_; |
| |
| // Memory arena for match allocation. |
| UnsafeArena* arena_; |
| |
| // The end position of the most recent match or terminal, for sanity |
| // checking. |
| int last_end_; |
| |
| // Rules. |
| const RulesSet* rules_; |
| // The active rule shards. |
| std::vector<const RulesSet_::Rules*> rules_shards_; |
| |
| // The set of items pending to be added to the chart as a singly-linked list. |
| ParseTree* pending_items_; |
| |
| // The set of items pending to be post-checked as a singly-linked list. |
| ExclusionNode* pending_exclusion_items_; |
| |
| // The chart data structure: a hashtable containing all matches, indexed by |
| // their end positions. |
| Chart<> chart_; |
| }; |
| |
| } // namespace libtextclassifier3::grammar |
| |
| #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_ |