utils/grammar/rules.fbs - chromiumos/third_party/libtextclassifier - Git at Google

 // Copyright 2020 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //

 include "utils/grammar/next/semantics/expression.fbs";
 include "utils/zlib/buffer.fbs";
 include "utils/i18n/language-tag.fbs";

 // The terminal rules map, stored as a sorted strings table.
 // The sorted terminal strings are represented as offsets into the global
 // strings pool (`RulesSet.terminals`); this allows memory to be shared
 // between localized rules sets.
 namespace libtextclassifier3.grammar.RulesSet_.Rules_;
 table TerminalRulesMap {
   // The offsets into the terminals pool.
   terminal_offsets:[uint];

   // The lhs set associated with a terminal rule.
   // This is an offset into the (deduplicated) global `lhs_set` vector.
   // NOTE(review): presumably parallel to `terminal_offsets`, i.e. entry i
   // belongs to terminal i -- confirm against the code that builds the map.
   lhs_set_index:[uint];

   // Lower and upper bounds on the lengths of the terminal strings; they
   // allow a lookup to be aborted early.
   min_terminal_length:int;

   max_terminal_length:int;
 }

 // One key, value pair entry in the unary rules map (`Rules.unary_rules`),
 // which maps a nonterminal to an lhs set index.
 namespace libtextclassifier3.grammar.RulesSet_.Rules_;
 struct UnaryRulesEntry {
   // The nonterminal this entry is looked up by. The `(key)` attribute makes
   // it the sort key for vectors of this struct.
   key:uint (key);
   // The lhs set index into the (deduplicated) global `lhs_set` vector.
   value:uint;
 }

 // One key, value pair entry in the binary rules hash map.
 // The key is the pair of rhs nonterminals and the value is the index of the
 // lhs set.
 namespace libtextclassifier3.grammar.RulesSet_.Rules_;
 struct BinaryRule {
   // The two rhs nonterminals; together they form the key of the entry.
   rhs_first:uint;

   rhs_second:uint;

   // The lhs set associated with this binary rule.
   // This is an offset into the (deduplicated) global `lhs_set` vector.
   lhs_set_index:uint;
 }

 // One bucket in the binary rule hash map; it contains all entries for a
 // given hash value.
 namespace libtextclassifier3.grammar.RulesSet_.Rules_;
 table BinaryRuleTableBucket {
   // The binary rule entries stored in this bucket.
   rules:[BinaryRule];
 }

 // The rules for a set of locales: the terminal rules maps together with the
 // unary and binary rules maps.
 namespace libtextclassifier3.grammar.RulesSet_;
 table Rules {
   // The locale this rule set applies to (a vector of language tags).
   locale:[LanguageTag];

   // The terminal rules maps.
   // NOTE(review): `lowercase_terminal_rules` presumably holds the terminals
   // that are matched against lowercased input -- confirm against the
   // matcher.
   terminal_rules:Rules_.TerminalRulesMap;
   lowercase_terminal_rules:Rules_.TerminalRulesMap;

   // The unary rules map.
   // This is a map from a nonterminal to an lhs set index into the
   // (deduplicated) global `lhs_set` vector.
   unary_rules:[Rules_.UnaryRulesEntry];

   // The binary rules (hash) map.
   // This is a map from a nonterminal pair to an lhs set index into the
   // (deduplicated) global `lhs_set` vector. Each vector element is one hash
   // bucket.
   binary_rules:[Rules_.BinaryRuleTableBucket];
 }

 // A set of lhs nonterminals associated with a rule match.
 // Most commonly, an entry is just the id of the lhs nonterminal of the rule
 // that is triggered; in this case the `lhs` entry is set to the id of the
 // nonterminal.
 // If a callback needs to be triggered, the entry is instead the (negated)
 // index into the global `lhs` vector (`RulesSet.lhs`), whose element
 // specifies, in addition to the nonterminal, the callback and the parameter
 // to call it with.
 namespace libtextclassifier3.grammar.RulesSet_;
 table LhsSet {
   // Signed so that both encodings described above can be represented.
   lhs:[int];
 }

 // An lhs description that specifies, in addition to the nonterminal, the
 // callback to trigger and its parameter. Elements of this struct are stored
 // in the global `lhs` vector (`RulesSet.lhs`).
 namespace libtextclassifier3.grammar.RulesSet_;
 struct Lhs {
   // The lhs nonterminal.
   nonterminal:uint;

   // The id of the callback to trigger.
   callback_id:uint;

   // A parameter to pass when invoking the callback.
   callback_param:ulong;

   // The maximum amount of whitespace allowed between the two parts.
   // A value of -1 allows for unbounded whitespace (`byte` is a signed 8-bit
   // type, so -1 is representable).
   max_whitespace_gap:byte;
 }

 // One key, value pair entry in the annotation nonterminals map
 // (`Nonterminals.annotation_nt`), which maps annotation/collection names to
 // nonterminal ids.
 namespace libtextclassifier3.grammar.RulesSet_.Nonterminals_;
 table AnnotationNtEntry {
   // The annotation/collection name. The `(key)` attribute makes it the sort
   // key for vectors of this table.
   key:string (key);
   // The id of the nonterminal.
   value:int;
 }

 // Usage of pre-defined nonterminals that the lexer can generate if they are
 // used by the grammar.
 // NOTE(review): none of the id fields declares an explicit default, so an
 // unset field reads as 0; presumably 0 means "not used by the grammar" --
 // confirm against the lexer.
 namespace libtextclassifier3.grammar.RulesSet_;
 table Nonterminals {
   // Id of the nonterminal indicating the start of input.
   start_nt:int;

   // Id of the nonterminal indicating the end of input.
   end_nt:int;

   // Id of the nonterminal indicating a token.
   token_nt:int;

   // Id of the nonterminal indicating a string of digits.
   digits_nt:int;

   // `n_digits_nt[k]` is the id of the nonterminal indicating a string of
   // `k` digits.
   n_digits_nt:[int];

   // Id of the nonterminal indicating a word or token boundary.
   wordbreak_nt:int;

   // Id of the nonterminal indicating an uppercase token.
   uppercase_token_nt:int;

   // Predefined nonterminals for annotations.
   // Maps annotation/collection names to nonterminal ids.
   annotation_nt:[Nonterminals_.AnnotationNtEntry];
 }

 // Callback information; the value type of the entries in the callback map
 // (`CallbackEntry.value`).
 namespace libtextclassifier3.grammar.RulesSet_;
 struct Callback {
   // Whether the callback is a filter.
   is_filter:bool;
 }

 // One key, value pair entry in the callback map (`RulesSet.callback`).
 namespace libtextclassifier3.grammar.RulesSet_;
 struct CallbackEntry {
   // The `(key)` attribute makes this field the sort key for vectors of this
   // struct.
   // NOTE(review): presumably the callback id, as used in
   // `Lhs.callback_id` -- confirm.
   key:uint (key);
   // The information about the callback.
   value:Callback;
 }

 // One key, value pair entry in the nonterminal names map
 // (`DebugInformation.nonterminal_names`).
 namespace libtextclassifier3.grammar.RulesSet_.DebugInformation_;
 table NonterminalNamesEntry {
   // The `(key)` attribute makes this field the sort key for vectors of this
   // table.
   // NOTE(review): presumably the id of the nonterminal -- confirm.
   key:int (key);
   // NOTE(review): presumably the human-readable name of that nonterminal --
   // confirm.
   value:string;
 }

 // Debug information, e.g. for printing parse trees and showing match
 // information.
 namespace libtextclassifier3.grammar.RulesSet_;
 table DebugInformation {
   // The nonterminal names map, stored as a vector of keyed entries.
   nonterminal_names:[DebugInformation_.NonterminalNamesEntry];
 }

 // A regex annotator: a pattern to run and the nonterminal to trigger.
 namespace libtextclassifier3.grammar.RulesSet_;
 table RegexAnnotator {
   // The pattern to run.
   pattern:string;

   // NOTE(review): presumably the same pattern in compressed form, as an
   // alternative to `pattern` -- confirm which of the two takes precedence.
   compressed_pattern:CompressedBuffer;

   // The nonterminal to trigger.
   nonterminal:uint;
 }

 // Context free grammar rules representation.
 // Rules are represented in (mostly) Chomsky Normal Form, where every rule
 // has one of the following forms:
 // * <nonterm> ::= term
 // * <nonterm> ::= <nonterm>
 // * <nonterm> ::= <nonterm> <nonterm>
 // The `terminals`, `unary_rules` and `binary_rules` maps below represent
 // these sets of rules.
 namespace libtextclassifier3.grammar;
 table RulesSet {
   // The rules, one `Rules` table per set of locales.
   rules:[RulesSet_.Rules];
   // The (deduplicated) global lhs sets; rules refer to them by index.
   lhs_set:[RulesSet_.LhsSet];
   // The global lhs vector; `LhsSet.lhs` entries refer to it by (negated)
   // index when a callback needs to be triggered.
   lhs:[RulesSet_.Lhs];

   // Terminals string pool.
   // The strings are zero-byte delimited and offset indexed by
   // `terminal_offsets` in the terminals rules map.
   terminals:string;

   // The pre-defined nonterminals used by the grammar.
   nonterminals:RulesSet_.Nonterminals;
   // The callback map, stored as a vector of keyed entries.
   callback:[RulesSet_.CallbackEntry];
   // Debug information, e.g. for printing parse trees.
   debug_information:RulesSet_.DebugInformation;
   // The regex annotators.
   regex_annotator:[RulesSet_.RegexAnnotator];

   // If true, will compile the regexes only on first use.
   lazy_regex_compilation:bool;

   // The semantic expressions associated with rule matches.
   semantic_expression:[next.SemanticExpression];

   // The schema defining the semantic results, stored as raw bytes.
   // NOTE(review): presumably a serialized schema -- confirm the format.
   semantic_values_schema:[ubyte];
 }
	// Copyright 2020 Google LLC
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//

	include "utils/grammar/next/semantics/expression.fbs";
	include "utils/zlib/buffer.fbs";
	include "utils/i18n/language-tag.fbs";

	// The terminal rules map, stored as a sorted strings table.
	// The sorted terminal strings are represented as offsets into the global
	// strings pool (`RulesSet.terminals`); this allows memory to be shared
	// between localized rules sets.
	namespace libtextclassifier3.grammar.RulesSet_.Rules_;
	table TerminalRulesMap {
	// The offsets into the terminals pool.
	terminal_offsets:[uint];

	// The lhs set associated with a terminal rule.
	// This is an offset into the (deduplicated) global `lhs_set` vector.
	// NOTE(review): presumably parallel to `terminal_offsets`, i.e. entry i
	// belongs to terminal i -- confirm against the code that builds the map.
	lhs_set_index:[uint];

	// Lower and upper bounds on the lengths of the terminal strings; they
	// allow a lookup to be aborted early.
	min_terminal_length:int;

	max_terminal_length:int;
	}

	// One key, value pair entry in the unary rules map (`Rules.unary_rules`),
	// which maps a nonterminal to an lhs set index.
	namespace libtextclassifier3.grammar.RulesSet_.Rules_;
	struct UnaryRulesEntry {
	// The nonterminal this entry is looked up by. The `(key)` attribute makes
	// it the sort key for vectors of this struct.
	key:uint (key);
	// The lhs set index into the (deduplicated) global `lhs_set` vector.
	value:uint;
	}

	// One key, value pair entry in the binary rules hash map.
	// The key is the pair of rhs nonterminals and the value is the index of the
	// lhs set.
	namespace libtextclassifier3.grammar.RulesSet_.Rules_;
	struct BinaryRule {
	// The two rhs nonterminals; together they form the key of the entry.
	rhs_first:uint;

	rhs_second:uint;

	// The lhs set associated with this binary rule.
	// This is an offset into the (deduplicated) global `lhs_set` vector.
	lhs_set_index:uint;
	}

	// One bucket in the binary rule hash map; it contains all entries for a
	// given hash value.
	namespace libtextclassifier3.grammar.RulesSet_.Rules_;
	table BinaryRuleTableBucket {
	// The binary rule entries stored in this bucket.
	rules:[BinaryRule];
	}

	// The rules for a set of locales: the terminal rules maps together with the
	// unary and binary rules maps.
	namespace libtextclassifier3.grammar.RulesSet_;
	table Rules {
	// The locale this rule set applies to (a vector of language tags).
	locale:[LanguageTag];

	// The terminal rules maps.
	// NOTE(review): `lowercase_terminal_rules` presumably holds the terminals
	// that are matched against lowercased input -- confirm against the
	// matcher.
	terminal_rules:Rules_.TerminalRulesMap;
	lowercase_terminal_rules:Rules_.TerminalRulesMap;

	// The unary rules map.
	// This is a map from a nonterminal to an lhs set index into the
	// (deduplicated) global `lhs_set` vector.
	unary_rules:[Rules_.UnaryRulesEntry];

	// The binary rules (hash) map.
	// This is a map from a nonterminal pair to an lhs set index into the
	// (deduplicated) global `lhs_set` vector. Each vector element is one hash
	// bucket.
	binary_rules:[Rules_.BinaryRuleTableBucket];
	}

	// A set of lhs nonterminals associated with a rule match.
	// Most commonly, an entry is just the id of the lhs nonterminal of the rule
	// that is triggered; in this case the `lhs` entry is set to the id of the
	// nonterminal.
	// If a callback needs to be triggered, the entry is instead the (negated)
	// index into the global `lhs` vector (`RulesSet.lhs`), whose element
	// specifies, in addition to the nonterminal, the callback and the parameter
	// to call it with.
	namespace libtextclassifier3.grammar.RulesSet_;
	table LhsSet {
	// Signed so that both encodings described above can be represented.
	lhs:[int];
	}

	// An lhs description that specifies, in addition to the nonterminal, the
	// callback to trigger and its parameter. Elements of this struct are stored
	// in the global `lhs` vector (`RulesSet.lhs`).
	namespace libtextclassifier3.grammar.RulesSet_;
	struct Lhs {
	// The lhs nonterminal.
	nonterminal:uint;

	// The id of the callback to trigger.
	callback_id:uint;

	// A parameter to pass when invoking the callback.
	callback_param:ulong;

	// The maximum amount of whitespace allowed between the two parts.
	// A value of -1 allows for unbounded whitespace (`byte` is a signed 8-bit
	// type, so -1 is representable).
	max_whitespace_gap:byte;
	}

	// One key, value pair entry in the annotation nonterminals map
	// (`Nonterminals.annotation_nt`), which maps annotation/collection names to
	// nonterminal ids.
	namespace libtextclassifier3.grammar.RulesSet_.Nonterminals_;
	table AnnotationNtEntry {
	// The annotation/collection name. The `(key)` attribute makes it the sort
	// key for vectors of this table.
	key:string (key);
	// The id of the nonterminal.
	value:int;
	}

	// Usage of pre-defined nonterminals that the lexer can generate if they are
	// used by the grammar.
	// NOTE(review): none of the id fields declares an explicit default, so an
	// unset field reads as 0; presumably 0 means "not used by the grammar" --
	// confirm against the lexer.
	namespace libtextclassifier3.grammar.RulesSet_;
	table Nonterminals {
	// Id of the nonterminal indicating the start of input.
	start_nt:int;

	// Id of the nonterminal indicating the end of input.
	end_nt:int;

	// Id of the nonterminal indicating a token.
	token_nt:int;

	// Id of the nonterminal indicating a string of digits.
	digits_nt:int;

	// `n_digits_nt[k]` is the id of the nonterminal indicating a string of
	// `k` digits.
	n_digits_nt:[int];

	// Id of the nonterminal indicating a word or token boundary.
	wordbreak_nt:int;

	// Id of the nonterminal indicating an uppercase token.
	uppercase_token_nt:int;

	// Predefined nonterminals for annotations.
	// Maps annotation/collection names to nonterminal ids.
	annotation_nt:[Nonterminals_.AnnotationNtEntry];
	}

	// Callback information; the value type of the entries in the callback map
	// (`CallbackEntry.value`).
	namespace libtextclassifier3.grammar.RulesSet_;
	struct Callback {
	// Whether the callback is a filter.
	is_filter:bool;
	}

	// One key, value pair entry in the callback map (`RulesSet.callback`).
	namespace libtextclassifier3.grammar.RulesSet_;
	struct CallbackEntry {
	// The `(key)` attribute makes this field the sort key for vectors of this
	// struct.
	// NOTE(review): presumably the callback id, as used in
	// `Lhs.callback_id` -- confirm.
	key:uint (key);
	// The information about the callback.
	value:Callback;
	}

	// One key, value pair entry in the nonterminal names map
	// (`DebugInformation.nonterminal_names`).
	namespace libtextclassifier3.grammar.RulesSet_.DebugInformation_;
	table NonterminalNamesEntry {
	// The `(key)` attribute makes this field the sort key for vectors of this
	// table.
	// NOTE(review): presumably the id of the nonterminal -- confirm.
	key:int (key);
	// NOTE(review): presumably the human-readable name of that nonterminal --
	// confirm.
	value:string;
	}

	// Debug information, e.g. for printing parse trees and showing match
	// information.
	namespace libtextclassifier3.grammar.RulesSet_;
	table DebugInformation {
	// The nonterminal names map, stored as a vector of keyed entries.
	nonterminal_names:[DebugInformation_.NonterminalNamesEntry];
	}

	// A regex annotator: a pattern to run and the nonterminal to trigger.
	namespace libtextclassifier3.grammar.RulesSet_;
	table RegexAnnotator {
	// The pattern to run.
	pattern:string;

	// NOTE(review): presumably the same pattern in compressed form, as an
	// alternative to `pattern` -- confirm which of the two takes precedence.
	compressed_pattern:CompressedBuffer;

	// The nonterminal to trigger.
	nonterminal:uint;
	}

	// Context free grammar rules representation.
	// Rules are represented in (mostly) Chomsky Normal Form, where every rule
	// has one of the following forms:
	// * <nonterm> ::= term
	// * <nonterm> ::= <nonterm>
	// * <nonterm> ::= <nonterm> <nonterm>
	// The `terminals`, `unary_rules` and `binary_rules` maps below represent
	// these sets of rules.
	namespace libtextclassifier3.grammar;
	table RulesSet {
	// The rules, one `Rules` table per set of locales.
	rules:[RulesSet_.Rules];
	// The (deduplicated) global lhs sets; rules refer to them by index.
	lhs_set:[RulesSet_.LhsSet];
	// The global lhs vector; `LhsSet.lhs` entries refer to it by (negated)
	// index when a callback needs to be triggered.
	lhs:[RulesSet_.Lhs];

	// Terminals string pool.
	// The strings are zero-byte delimited and offset indexed by
	// `terminal_offsets` in the terminals rules map.
	terminals:string;

	// The pre-defined nonterminals used by the grammar.
	nonterminals:RulesSet_.Nonterminals;
	// The callback map, stored as a vector of keyed entries.
	callback:[RulesSet_.CallbackEntry];
	// Debug information, e.g. for printing parse trees.
	debug_information:RulesSet_.DebugInformation;
	// The regex annotators.
	regex_annotator:[RulesSet_.RegexAnnotator];

	// If true, will compile the regexes only on first use.
	lazy_regex_compilation:bool;

	// The semantic expressions associated with rule matches.
	semantic_expression:[next.SemanticExpression];

	// The schema defining the semantic results, stored as raw bytes.
	// NOTE(review): presumably a serialized schema -- confirm the format.
	semantic_values_schema:[ubyte];
	}