| // Copyright 2020 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| #pragma GCC diagnostic ignored "-Wc++17-extensions" |
| |
| #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCH_H_ |
| #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCH_H_ |
| |
| #include <functional> |
| #include <vector> |
| |
| #include "annotator/types.h" |
| #include "utils/grammar/types.h" |
| #include "utils/strings/stringpiece.h" |
| |
| namespace libtextclassifier3::grammar { |
| |
| // Represents a single match that was found for a particular nonterminal. |
| // Instances should be created by calling Matcher::AllocateMatch(). |
| // This uses an arena to allocate matches (and subclasses thereof). |
| struct Match { |
| static constexpr int16 kUnknownType = 0; |
| static constexpr int16 kTokenType = -1; |
| static constexpr int16 kDigitsType = -2; |
| static constexpr int16 kBreakType = -3; |
| static constexpr int16 kAssertionMatch = -4; |
| static constexpr int16 kMappingMatch = -5; |
| static constexpr int16 kExclusionMatch = -6; |
| static constexpr int16 kAnnotationMatch = -7; |
| |
| void Init(const Nonterm arg_lhs, const CodepointSpan arg_codepoint_span, |
| const int arg_match_offset, const int arg_type = kUnknownType) { |
| lhs = arg_lhs; |
| codepoint_span = arg_codepoint_span; |
| match_offset = arg_match_offset; |
| type = arg_type; |
| rhs1 = nullptr; |
| rhs2 = nullptr; |
| } |
| |
| void Init(const Match& other) { *this = other; } |
| |
| // For binary rule matches: rhs1 != NULL and rhs2 != NULL |
| // unary rule matches: rhs1 == NULL and rhs2 != NULL |
| // terminal rule matches: rhs1 != NULL and rhs2 == NULL |
| // custom leaves: rhs1 == NULL and rhs2 == NULL |
| bool IsInteriorNode() const { return rhs2 != nullptr; } |
| bool IsLeaf() const { return !rhs2; } |
| |
| bool IsBinaryRule() const { return rhs1 && rhs2; } |
| bool IsUnaryRule() const { return !rhs1 && rhs2; } |
| bool IsTerminalRule() const { return rhs1 && !rhs2; } |
| bool HasLeadingWhitespace() const { |
| return codepoint_span.first != match_offset; |
| } |
| |
| const Match* unary_rule_rhs() const { return rhs2; } |
| |
| // Used in singly-linked queue of matches for processing. |
| Match* next = nullptr; |
| |
| // Nonterminal we found a match for. |
| Nonterm lhs = kUnassignedNonterm; |
| |
| // Type of the match. |
| int16 type = kUnknownType; |
| |
| // The span in codepoints. |
| CodepointSpan codepoint_span = CodepointSpan::kInvalid; |
| |
| // The begin codepoint offset used during matching. |
| // This is usually including any prefix whitespace. |
| int match_offset; |
| |
| union { |
| // The first sub match for binary rules. |
| const Match* rhs1 = nullptr; |
| |
| // The terminal, for terminal rules. |
| const char* terminal; |
| }; |
| // First or second sub-match for interior nodes. |
| const Match* rhs2 = nullptr; |
| }; |
| |
| // Match type to keep track of associated values. |
| struct MappingMatch : public Match { |
| // The associated id or value. |
| int64 id; |
| }; |
| |
| // Match type to keep track of assertions. |
| struct AssertionMatch : public Match { |
| // If true, the assertion is negative and will be valid if the input doesn't |
| // match. |
| bool negative; |
| }; |
| |
| // Match type to define exclusions. |
| struct ExclusionMatch : public Match { |
| // The nonterminal that denotes matches to exclude from a successful match. |
| // So the match is only valid if there is no match of `exclusion_nonterm` |
| // spanning the same text range. |
| Nonterm exclusion_nonterm; |
| }; |
| |
| // Match to represent an annotator annotated span in the grammar. |
| struct AnnotationMatch : public Match { |
| const ClassificationResult* annotation; |
| }; |
| |
| // Utility functions for parse tree traversal. |
| |
| // Does a preorder traversal, calling `node_fn` on each node. |
| // `node_fn` is expected to return whether to continue expanding a node. |
| void Traverse(const Match* root, |
| const std::function<bool(const Match*)>& node_fn); |
| |
| // Does a preorder traversal, calling `pred_fn` and returns the first node |
| // on which `pred_fn` returns true. |
| const Match* SelectFirst(const Match* root, |
| const std::function<bool(const Match*)>& pred_fn); |
| |
| // Does a preorder traversal, selecting all nodes where `pred_fn` returns true. |
| std::vector<const Match*> SelectAll( |
| const Match* root, const std::function<bool(const Match*)>& pred_fn); |
| |
| // Selects all terminals from a parse tree. |
| inline std::vector<const Match*> SelectTerminals(const Match* root) { |
| return SelectAll(root, &Match::IsTerminalRule); |
| } |
| |
| // Selects all leaves from a parse tree. |
| inline std::vector<const Match*> SelectLeaves(const Match* root) { |
| return SelectAll(root, &Match::IsLeaf); |
| } |
| |
| // Retrieves the first child node of a given type. |
| template <typename T> |
| const T* SelectFirstOfType(const Match* root, const int16 type) { |
| return static_cast<const T*>(SelectFirst( |
| root, [type](const Match* node) { return node->type == type; })); |
| } |
| |
| // Retrieves all nodes of a given type. |
| template <typename T> |
| const std::vector<const T*> SelectAllOfType(const Match* root, |
| const int16 type) { |
| std::vector<const T*> result; |
| Traverse(root, [&result, type](const Match* node) { |
| if (node->type == type) { |
| result.push_back(static_cast<const T*>(node)); |
| } |
| return true; |
| }); |
| return result; |
| } |
| |
| } // namespace libtextclassifier3::grammar |
| |
| #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCH_H_ |