| // Copyright 2020 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| #pragma GCC diagnostic ignored "-Wc++17-extensions" |
| |
| // This is a lexer that runs off the tokenizer and outputs the tokens to a |
| // grammar matcher. The tokens it forwards are the same as the ones produced |
| // by the tokenizer, but possibly further split and normalized (downcased). |
| // Examples: |
| // |
| // - single character tokens for punctuation (e.g., AddTerminal("?")) |
| // |
| // - a string of letters (e.g., "Foo" -- it calls AddTerminal() on "foo") |
| // |
| // - a string of digits (e.g., AddTerminal("37")) |
| // |
| // In addition to the terminal tokens above, it also outputs certain |
| // special nonterminals: |
| // |
| // - a <token> nonterminal, which it outputs in addition to the |
| // regular AddTerminal() call for every token |
| // |
| // - a <digits> nonterminal, which it outputs in addition to |
| // the regular AddTerminal() call for each string of digits |
| // |
| // - <N_digits> nonterminals, where N is the length of the string of |
| // digits. By default the maximum N that will be output is 20. This |
| // may be changed at compile time by kMaxNDigitsLength. For instance, |
| // "123" will produce a <3_digits> nonterminal, "1234567" will produce |
| // a <7_digits> nonterminal. |
| // |
| // It does not output any whitespace. Instead, whitespace gets absorbed into |
| // the token that follows it in the text. |
| // For example, if the text contains: |
| // |
| //   ...hello                       there        world... |
| //           |                      |            | |
| //           offset=16              39           52 |
| // |
| // then the output will be: |
| // |
| // "hello" [?, 16) |
| // "there" [16, 44) <-- note "16" NOT "39" |
| // "world" [44, ?) <-- note "44" NOT "52" |
| // |
| // This makes it appear to the Matcher as if the tokens are adjacent -- so |
| // whitespace is simply ignored. |
| // |
| // A minor optimization: We don't bother to output nonterminals if the grammar |
| // rules don't reference them. |
| |
| #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_LEXER_H_ |
| #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_LEXER_H_ |
| |
| #include "annotator/types.h" |
| #include "utils/grammar/matcher.h" |
| #include "utils/grammar/rules_generated.h" |
| #include "utils/grammar/types.h" |
| #include "utils/strings/stringpiece.h" |
| #include "utils/utf8/unicodetext.h" |
| #include "utils/utf8/unilib.h" |
| |
| namespace libtextclassifier3::grammar { |
| |
| class Lexer { |
| public: |
| explicit Lexer(const UniLib* unilib, const RulesSet* rules); |
| |
| // Processes a tokenized text. Classifies the tokens and feeds them to the |
| // matcher. |
| // The provided annotations will be fed to the matcher alongside the tokens. |
| // NOTE: The `annotations` need to outlive any dependent processing. |
| void Process(const UnicodeText& text, const std::vector<Token>& tokens, |
| const std::vector<AnnotatedSpan>* annotations, |
| Matcher* matcher) const; |
| void Process(const UnicodeText& text, |
| const std::vector<Token>::const_iterator& begin, |
| const std::vector<Token>::const_iterator& end, |
| const std::vector<AnnotatedSpan>* annotations, |
| Matcher* matcher) const; |
| |
| private: |
| // A lexical symbol with an identified meaning that represents raw tokens, |
| // token categories or predefined text matches. |
| // It is the unit fed to the grammar matcher. |
| struct Symbol { |
| // The type of the lexical symbol. |
| enum class Type { |
| // A raw token. |
| TYPE_TERM, |
| |
| // A symbol representing a string of digits. |
| TYPE_DIGITS, |
| |
| // Punctuation characters. |
| TYPE_PUNCTUATION, |
| |
| // A predefined match. |
| TYPE_MATCH |
| }; |
| |
| explicit Symbol() = default; |
| |
| // Constructs a symbol of a given type with an anchor in the text. |
| Symbol(const Type type, const CodepointSpan codepoint_span, |
| const int match_offset, StringPiece lexeme) |
| : type(type), |
| codepoint_span(codepoint_span), |
| match_offset(match_offset), |
| lexeme(lexeme) {} |
| |
| // Constructs a symbol from a pre-defined match. |
| explicit Symbol(Match* match) |
| : type(Type::TYPE_MATCH), |
| codepoint_span(match->codepoint_span), |
| match_offset(match->match_offset), |
| match(match) {} |
| |
| // The type of the symbole. |
| Type type; |
| |
| // The span in the text as codepoint offsets. |
| CodepointSpan codepoint_span; |
| |
| // The match start offset (including preceding whitespace) as codepoint |
| // offset. |
| int match_offset; |
| |
| // The symbol text value. |
| StringPiece lexeme; |
| |
| // The predefined match. |
| Match* match; |
| }; |
| |
| // Processes a single token: the token is split and classified into symbols. |
| void ProcessToken(const StringPiece value, const int prev_token_end, |
| const CodepointSpan codepoint_span, |
| std::vector<Symbol>* symbols) const; |
| |
| // Emits a token to the matcher. |
| void Emit(const Symbol& symbol, const RulesSet_::Nonterminals* nonterms, |
| Matcher* matcher) const; |
| |
| // Gets the type of a character. |
| Symbol::Type GetSymbolType(const UnicodeText::const_iterator& it) const; |
| |
| private: |
| struct RegexAnnotator { |
| std::unique_ptr<UniLib::RegexPattern> pattern; |
| Nonterm nonterm; |
| }; |
| |
| // Uncompress and build the defined regex annotators. |
| std::vector<RegexAnnotator> BuildRegexAnnotator(const UniLib& unilib, |
| const RulesSet* rules) const; |
| |
| const UniLib& unilib_; |
| const RulesSet* rules_; |
| std::vector<RegexAnnotator> regex_annotators_; |
| }; |
| |
| } // namespace libtextclassifier3::grammar |
| |
| #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_LEXER_H_ |