utils/grammar/parsing/parser.h - chromiumos/third_party/libtextclassifier - Git at Google

 // Copyright 2020 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //

 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSER_H_
 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSER_H_

 #include <memory>
 #include <vector>

 #include "annotator/types.h"
 #include "utils/base/arena.h"
 #include "utils/grammar/parsing/derivation.h"
 #include "utils/grammar/parsing/lexer.h"
 #include "utils/grammar/parsing/matcher.h"
 #include "utils/grammar/rules_generated.h"
 #include "utils/grammar/text-context.h"
 #include "utils/i18n/locale.h"
 #include "utils/utf8/unilib.h"

 namespace libtextclassifier3::grammar {

 // Syntactic parsing pass.
 // The parser validates and deduplicates candidates produced by the grammar
 // matcher. It augments the parse trees with derivation information for semantic
 // evaluation.
 //
 // The class keeps the UniLib as a reference member and the rule set as a raw
 // pointer member, so the objects they refer to have to stay alive for as long
 // as the parser is used. The only public operation, `Parse`, is a const member
 // function.
 class Parser {
  public:
   // Creates a parser for the grammar described by `rules`.
   // NOTE(review): `unilib` is passed as a pointer but kept as a reference
   // member (`unilib_`), so it presumably must not be null -- confirm against
   // the constructor definition.
   explicit Parser(const UniLib* unilib, const RulesSet* rules);

   // Parses an input text and returns the root rule derivations.
   // NOTE(review): `arena` is supplied by the caller; presumably the parse
   // trees referenced by the returned derivations are allocated from it and
   // must not outlive it -- confirm against the implementation.
   std::vector<Derivation> Parse(const TextContext& input,
                                 UnsafeArena* arena) const;

  private:
   // A regex pattern paired with a nonterminal.
   // NOTE(review): presumably text matching `pattern` is fed to the matcher as
   // `nonterm` -- confirm against the implementation.
   struct RegexAnnotator {
     // The pattern; owned by this annotator.
     std::unique_ptr<UniLib::RegexPattern> pattern;
     // The nonterminal associated with `pattern`.
     Nonterm nonterm;
   };

   // Uncompresses and builds the defined regex annotators.
   // Returns the annotators by value and does not modify the parser.
   // NOTE(review): presumably used to fill `regex_annotators_` -- confirm.
   std::vector<RegexAnnotator> BuildRegexAnnotators() const;

   // Produces symbols for a text input to feed to a matcher.
   // These are symbols for each token from the lexer, existing text
   // annotations and regex annotations.
   // The symbols are sorted with increasing end-positions to satisfy the matcher
   // requirements.
   std::vector<Symbol> SortedSymbolsForInput(const TextContext& input,
                                             UnsafeArena* arena) const;

   // Emits a symbol to the matcher.
   // `matcher` is the only non-const argument besides `arena`; the parser
   // itself is not modified (const member function).
   void EmitSymbol(const Symbol& symbol, UnsafeArena* arena,
                   Matcher* matcher) const;

   // Unicode library; held by reference.
   const UniLib& unilib_;
   // The grammar rules; held as a raw pointer.
   const RulesSet* rules_;
   // Lexer that supplies the token symbols (see `SortedSymbolsForInput`).
   const Lexer lexer_;

   // Pre-defined nonterminals.
   const RulesSet_::Nonterminals* nonterminals_;

   // Pre-parsed locales of the rules.
   const std::vector<std::vector<Locale>> rules_locales_;

   // Regex annotators (pattern plus nonterminal) kept by the parser.
   std::vector<RegexAnnotator> regex_annotators_;
 };

 }  // namespace libtextclassifier3::grammar

 #endif  // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSER_H_
	// Copyright 2020 Google LLC
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//

	#ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSER_H_
	#define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSER_H_

	#include <vector>

	#include "annotator/types.h"
	#include "utils/base/arena.h"
	#include "utils/grammar/parsing/derivation.h"
	#include "utils/grammar/parsing/lexer.h"
	#include "utils/grammar/parsing/matcher.h"
	#include "utils/grammar/rules_generated.h"
	#include "utils/grammar/text-context.h"
	#include "utils/i18n/locale.h"
	#include "utils/utf8/unilib.h"

	namespace libtextclassifier3::grammar {

	// Syntactic parsing pass.
	// The parser validates and deduplicates candidates produced by the grammar
	// matcher. It augments the parse trees with derivation information for semantic
	// evaluation.
	//
	// The class keeps the UniLib as a reference member and the rule set as a raw
	// pointer member, so the objects they refer to have to stay alive for as long
	// as the parser is used. The only public operation, `Parse`, is a const member
	// function.
	class Parser {
	public:
	// Creates a parser for the grammar described by `rules`.
	// NOTE(review): `unilib` is passed as a pointer but kept as a reference
	// member (`unilib_`), so it presumably must not be null -- confirm against
	// the constructor definition.
	explicit Parser(const UniLib* unilib, const RulesSet* rules);

	// Parses an input text and returns the root rule derivations.
	// NOTE(review): `arena` is supplied by the caller; presumably the parse
	// trees referenced by the returned derivations are allocated from it and
	// must not outlive it -- confirm against the implementation.
	std::vector<Derivation> Parse(const TextContext& input,
	UnsafeArena* arena) const;

	private:
	// A regex pattern paired with a nonterminal.
	// NOTE(review): presumably text matching `pattern` is fed to the matcher as
	// `nonterm` -- confirm against the implementation.
	struct RegexAnnotator {
	// The pattern; owned by this annotator.
	std::unique_ptr<UniLib::RegexPattern> pattern;
	// The nonterminal associated with `pattern`.
	Nonterm nonterm;
	};

	// Uncompresses and builds the defined regex annotators.
	// Returns the annotators by value and does not modify the parser.
	// NOTE(review): presumably used to fill `regex_annotators_` -- confirm.
	std::vector<RegexAnnotator> BuildRegexAnnotators() const;

	// Produces symbols for a text input to feed to a matcher.
	// These are symbols for each token from the lexer, existing text
	// annotations and regex annotations.
	// The symbols are sorted with increasing end-positions to satisfy the matcher
	// requirements.
	std::vector<Symbol> SortedSymbolsForInput(const TextContext& input,
	UnsafeArena* arena) const;

	// Emits a symbol to the matcher.
	// `matcher` is the only non-const argument besides `arena`; the parser
	// itself is not modified (const member function).
	void EmitSymbol(const Symbol& symbol, UnsafeArena* arena,
	Matcher* matcher) const;

	// Unicode library; held by reference.
	const UniLib& unilib_;
	// The grammar rules; held as a raw pointer.
	const RulesSet* rules_;
	// Lexer that supplies the token symbols (see `SortedSymbolsForInput`).
	const Lexer lexer_;

	// Pre-defined nonterminals.
	const RulesSet_::Nonterminals* nonterminals_;

	// Pre-parsed locales of the rules.
	const std::vector<std::vector<Locale>> rules_locales_;

	// Regex annotators (pattern plus nonterminal) kept by the parser.
	std::vector<RegexAnnotator> regex_annotators_;
	};

	} // namespace libtextclassifier3::grammar

	#endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSER_H_