| // Copyright 2020 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| // A lexer that (splits) and classifies tokens. |
| // |
| // Any whitespace gets absorbed into the token that follows them in the text. |
| // For example, if the text contains: |
| // |
| // ...hello there world... |
| // | | | |
| // offset=16 39 52 |
| // |
| // then the output will be: |
| // |
| // "hello" [?, 16) |
| // "there" [16, 44) <-- note "16" NOT "39" |
| // "world" [44, ?) <-- note "44" NOT "52" |
| // |
| // This makes it appear to the Matcher as if the tokens are adjacent. |
| |
| #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_ |
| #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_ |
| |
| #include <vector> |
| |
| #include "annotator/types.h" |
| #include "utils/grammar/parsing/parse-tree.h" |
| #include "utils/grammar/types.h" |
| #include "utils/strings/stringpiece.h" |
| #include "utils/utf8/unicodetext.h" |
| #include "utils/utf8/unilib.h" |
| |
| namespace libtextclassifier3::grammar { |
| |
| // A lexical symbol with an identified meaning that represents raw tokens, |
| // token categories or predefined text matches. |
| // It is the unit fed to the grammar matcher. |
| struct Symbol { |
| // The type of the lexical symbol. |
| enum class Type { |
| // A raw token. |
| TYPE_TERM, |
| |
| // A symbol representing a string of digits. |
| TYPE_DIGITS, |
| |
| // Punctuation characters. |
| TYPE_PUNCTUATION, |
| |
| // A predefined parse tree. |
| TYPE_PARSE_TREE |
| }; |
| |
| explicit Symbol() = default; |
| |
| // Constructs a symbol of a given type with an anchor in the text. |
| Symbol(const Type type, const CodepointSpan codepoint_span, |
| const int match_offset, StringPiece lexeme) |
| : type(type), |
| codepoint_span(codepoint_span), |
| match_offset(match_offset), |
| lexeme(lexeme) {} |
| |
| // Constructs a symbol from a pre-defined parse tree. |
| explicit Symbol(ParseTree* parse_tree) |
| : type(Type::TYPE_PARSE_TREE), |
| codepoint_span(parse_tree->codepoint_span), |
| match_offset(parse_tree->match_offset), |
| parse_tree(parse_tree) {} |
| |
| // The type of the symbol. |
| Type type; |
| |
| // The span in the text as codepoint offsets. |
| CodepointSpan codepoint_span; |
| |
| // The match start offset (including preceding whitespace) as codepoint |
| // offset. |
| int match_offset; |
| |
| // The symbol text value. |
| StringPiece lexeme; |
| |
| // The predefined parse tree. |
| ParseTree* parse_tree; |
| }; |
| |
| class Lexer { |
| public: |
| explicit Lexer(const UniLib* unilib) : unilib_(*unilib) {} |
| |
| // Processes a single token. |
| // Splits a token into classified symbols. |
| void AppendTokenSymbols(const StringPiece value, int match_offset, |
| const CodepointSpan codepoint_span, |
| std::vector<Symbol>* symbols) const; |
| |
| private: |
| // Gets the type of a character. |
| Symbol::Type GetSymbolType(const UnicodeText::const_iterator& it) const; |
| |
| const UniLib& unilib_; |
| }; |
| |
| } // namespace libtextclassifier3::grammar |
| |
| #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_ |