| // Copyright 2020 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| #pragma GCC diagnostic ignored "-Wc++17-extensions" |
| |
| // A token matcher based on context-free grammars. |
| // |
// A lexer passes tokens to the matcher: literal terminal strings and token
// types. It passes each token along with the [begin, end) position range in
// which it occurs. So for the input string "Groundhog February 2, 2007", the
// lexer would tell the matcher that:
| // |
| // "Groundhog" occurs at [0, 9) |
| // <space> occurs at [9, 10) |
| // "February" occurs at [10, 18) |
| // <space> occurs at [18, 19) |
| // <string_of_digits> occurs at [19, 20) |
| // "," occurs at [20, 21) |
| // <space> occurs at [21, 22) |
| // <string_of_digits> occurs at [22, 26) |
| // |
| // The lexer passes tokens to the matcher by calling AddTerminal() and |
| // AddMatch() for literal terminals and token types, respectively. |
| // |
| // Although it is unnecessary for this example grammar, a lexer can |
| // output multiple tokens for the same input range. So our lexer could |
| // additionally output: |
| // "2" occurs at [19, 20) // a second token for [19, 20) |
| // "2007" occurs at [22, 26) |
// <syllable> occurs at [0, 6) // overlaps with "Groundhog" at [0, 9)
| // <syllable> occurs at [6, 9) |
// The only constraint on the lexer's output is that it has to pass tokens
// to the matcher in left-to-right order; strictly speaking, their "end"
// positions must be nondecreasing. (This constraint allows a more
| // efficient matching algorithm.) The "begin" positions can be in any |
| // order. |
| // |
| // There are two kinds of supported callbacks: |
| // (1) OUTPUT: Callbacks are the only output mechanism a matcher has. For each |
| // "top-level" rule in your grammar, like the rule for <date> above -- something |
| // you're trying to find instances of -- you use a callback which the matcher |
| // will invoke every time it finds an instance of <date>. |
// (2) FILTERS: Callbacks allow you to put extra conditions on when a grammar
// rule applies. In the example grammar, the rule
| // |
| // <day> ::= <string_of_digits> // must be between 1 and 31 |
| // |
| // should only apply for *some* <string_of_digits> tokens, not others. |
| // By using a filter callback on this rule, you can tell the matcher that |
| // an instance of the rule's RHS is only *sometimes* considered an |
| // instance of its LHS. The filter callback will get invoked whenever |
| // the matcher finds an instance of <string_of_digits>. The callback can |
| // look at the digits and decide whether they represent a number between |
| // 1 and 31. If so, the callback calls Matcher::AddMatch() to tell the |
| // matcher there's a <day> there. If not, the callback simply exits |
| // without calling AddMatch(). |
| // |
| // Technically, a FILTER callback can make any number of calls to |
| // AddMatch() or even AddTerminal(). But the expected usage is to just |
| // make zero or one call to AddMatch(). OUTPUT callbacks are not expected |
| // to call either of these -- output callbacks are invoked merely as a |
| // side-effect, not in order to decide whether a rule applies or not. |
| // |
| // In the above example, you would probably use three callbacks. Filter |
| // callbacks on the rules for <day> and <year> would check the numeric |
| // value of the <string_of_digits>. An output callback on the rule for |
| // <date> would simply increment the counter of dates found on the page. |
| // |
| // Note that callbacks are attached to rules, not to nonterminals. You |
| // could have two alternative rules for <date> and use a different |
| // callback for each one. |
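//
// A minimal sketch of such a delegate for the example grammar (the callback
// ids kDayFilter and kDateOutput, the nonterminal kDayNonterm, and the
// helper IsValidDay() are hypothetical; see callback-delegate.h and match.h
// for the actual interfaces):
//
//   class DateDelegate : public CallbackDelegate {
//    public:
//     void MatchFound(const Match* match, CallbackId callback_id,
//                     int64 callback_param, Matcher* matcher) override {
//       if (callback_id == kDayFilter) {
//         // Filter: only digit strings between 1 and 31 become a <day>.
//         if (IsValidDay(match)) {
//           matcher->AddMatch(matcher->AllocateAndInitMatch<Match>(
//               kDayNonterm, match->codepoint_span, match->match_offset));
//         }
//       } else if (callback_id == kDateOutput) {
//         // Output: just count the dates that were found.
//         ++num_dates_found_;
//       }
//     }
//
//     int num_dates_found_ = 0;
//   };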
| |
| #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_ |
| #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_ |
| |
| #include <array> |
| #include <functional> |
#include <utility>
#include <vector>
| |
| #include "annotator/types.h" |
| #include "utils/base/arena.h" |
| #include "utils/grammar/callback-delegate.h" |
| #include "utils/grammar/match.h" |
| #include "utils/grammar/rules_generated.h" |
| #include "utils/strings/stringpiece.h" |
| #include "utils/utf8/unilib.h" |
| |
| namespace libtextclassifier3::grammar { |
| |
| class Matcher { |
| public: |
  explicit Matcher(const UniLib* unilib, const RulesSet* rules,
                   const std::vector<const RulesSet_::Rules*>& rules_shards,
                   CallbackDelegate* delegate)
| : state_(STATE_DEFAULT), |
| unilib_(*unilib), |
| arena_(kBlocksize), |
| rules_(rules), |
| rules_shards_(rules_shards), |
| delegate_(delegate) { |
| TC3_CHECK(rules_ != nullptr); |
| Reset(); |
| } |
| explicit Matcher(const UniLib* unilib, const RulesSet* rules, |
| CallbackDelegate* delegate) |
| : Matcher(unilib, rules, {}, delegate) { |
| rules_shards_.reserve(rules->rules()->size()); |
| rules_shards_.insert(rules_shards_.end(), rules->rules()->begin(), |
| rules->rules()->end()); |
| } |
| |
| // Resets the matcher. |
| void Reset(); |
| |
  // Finishes the matching.
| void Finish(); |
| |
| // Tells the matcher that the given terminal was found occupying position |
| // range [begin, end) in the input. |
| // The matcher may invoke callback functions before returning, if this |
| // terminal triggers any new matches for rules in the grammar. |
| // Calls to AddTerminal() and AddMatch() must be in left-to-right order, |
| // that is, the sequence of `end` values must be non-decreasing. |
| void AddTerminal(const CodepointSpan codepoint_span, const int match_offset, |
| StringPiece terminal); |
| void AddTerminal(const CodepointIndex begin, const CodepointIndex end, |
| StringPiece terminal) { |
    AddTerminal(CodepointSpan{begin, end}, /*match_offset=*/begin, terminal);
| } |
| |
| // Adds a nonterminal match to the chart. |
| // This can be invoked by the lexer if the lexer needs to add nonterminals to |
| // the chart. |
| void AddMatch(Match* match); |
| |
  // Allocates memory from an arena for a new match.
| // The `size` parameter is there to allow subclassing of the match object |
| // with additional fields. |
| Match* AllocateMatch(const size_t size) { |
| return reinterpret_cast<Match*>(arena_.Alloc(size)); |
| } |
| |
| template <typename T> |
| T* AllocateMatch() { |
| return reinterpret_cast<T*>(arena_.Alloc(sizeof(T))); |
| } |
| |
  template <typename T, typename... Args>
  T* AllocateAndInitMatch(Args&&... args) {
    T* match = AllocateMatch<T>();
    match->Init(std::forward<Args>(args)...);
| return match; |
| } |
| |
| // Returns the current number of bytes allocated for all match objects. |
| size_t ArenaSize() const { return arena_.status().bytes_allocated(); } |
| |
| private: |
| static constexpr int kBlocksize = 16 << 10; |
| |
| // The state of the matcher. |
| enum State { |
| // The matcher is in the default state. |
| STATE_DEFAULT = 0, |
| |
| // The matcher is currently processing queued match items. |
| STATE_PROCESSING = 1, |
| }; |
| State state_; |
| |
  // Processes the matches from an lhs set.
| void ExecuteLhsSet(const CodepointSpan codepoint_span, const int match_offset, |
| const int whitespace_gap, |
| const std::function<void(Match*)>& initializer, |
| const RulesSet_::LhsSet* lhs_set, |
| CallbackDelegate* delegate); |
| |
| // Queues a newly created match item. |
| void QueueForProcessing(Match* item); |
| |
  // Queues a match item for later post-checking of the exclusion condition.
  // For exclusions we need to check that `item->excluded_nonterminal` does
  // not match the same span. As we cannot know which matches will still be
  // added, we queue the item and post-check it only once all matches up to
  // `item->codepoint_span.second` have been added.
| void QueueForPostCheck(ExclusionMatch* item); |
| |
| // Adds pending items to the chart, possibly generating new matches as a |
| // result. |
| void ProcessPendingSet(); |
| |
| // Returns whether the chart contains a match for a given nonterminal. |
| bool ContainsMatch(const Nonterm nonterm, const CodepointSpan& span) const; |
| |
  // Checks for all pending exclusion matches whether their exclusion
  // condition is fulfilled.
| void ProcessPendingExclusionMatches(); |
| |
| UniLib unilib_; |
| |
| // Memory arena for match allocation. |
| UnsafeArena arena_; |
| |
| // The end position of the most recent match or terminal, for sanity |
| // checking. |
| int last_end_; |
| |
| // Rules. |
| const RulesSet* rules_; |
| |
| // The set of items pending to be added to the chart as a singly-linked list. |
| Match* pending_items_; |
| |
| // The set of items pending to be post-checked as a singly-linked list. |
| ExclusionMatch* pending_exclusion_items_; |
| |
| // The chart data structure: a hashtable containing all matches, indexed by |
| // their end positions. |
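  // A match with end position `end` is chained into the bucket
  // `chart_[end & kChartHashTableBitmask]`.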
| static constexpr int kChartHashTableNumBuckets = 1 << 8; |
| static constexpr int kChartHashTableBitmask = kChartHashTableNumBuckets - 1; |
| std::array<Match*, kChartHashTableNumBuckets> chart_; |
| |
| // The active rule shards. |
| std::vector<const RulesSet_::Rules*> rules_shards_; |
| |
| // The callback handler. |
| CallbackDelegate* delegate_; |
| }; |
| |
| } // namespace libtextclassifier3::grammar |
| |
| #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_ |