blob: 1fdad84c7a6f663e1ba45c9a5b2100b35ea75eaa [file] [log] [blame]
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#pragma GCC diagnostic ignored "-Wc++17-extensions"
// A token matcher based on context-free grammars.
//
// A lexer passes token to the matcher: literal terminal strings and token
// types. It passes tokens to the matcher by calling AddTerminal() and
// AddMatch() for literal terminals and token types, respectively.
// The lexer passes each token along with the [begin, end) position range
// in which it occurs. So for an input string "Groundhog February 2, 2007", the
// lexer would tell the matcher that:
//
// "Groundhog" occurs at [0, 9)
// <space> occurs at [9, 10)
// "February" occurs at [10, 18)
// <space> occurs at [18, 19)
// <string_of_digits> occurs at [19, 20)
// "," occurs at [20, 21)
// <space> occurs at [21, 22)
// <string_of_digits> occurs at [22, 26)
//
// The lexer passes tokens to the matcher by calling AddTerminal() and
// AddMatch() for literal terminals and token types, respectively.
//
// Although it is unnecessary for this example grammar, a lexer can
// output multiple tokens for the same input range. So our lexer could
// additionally output:
// "2" occurs at [19, 20) // a second token for [19, 20)
// "2007" occurs at [22, 26)
// <syllable> occurs at [0, 6) // overlaps with (Groundhog [0, 9))
// <syllable> occurs at [6, 9)
// The only constraint on the lexer's output is that it has to pass tokens
// to the matcher in left-to-right order, strictly speaking, their "end"
// positions must be nondecreasing. (This constraint allows a more
// efficient matching algorithm.) The "begin" positions can be in any
// order.
//
// There are two kinds of supported callbacks:
// (1) OUTPUT: Callbacks are the only output mechanism a matcher has. For each
// "top-level" rule in your grammar, like the rule for <date> above -- something
// you're trying to find instances of -- you use a callback which the matcher
// will invoke every time it finds an instance of <date>.
// (2) FILTERS:
// Callbacks allow you to put extra conditions on when a grammar rule
// applies. In the example grammar, the rule
//
// <day> ::= <string_of_digits> // must be between 1 and 31
//
// should only apply for *some* <string_of_digits> tokens, not others.
// By using a filter callback on this rule, you can tell the matcher that
// an instance of the rule's RHS is only *sometimes* considered an
// instance of its LHS. The filter callback will get invoked whenever
// the matcher finds an instance of <string_of_digits>. The callback can
// look at the digits and decide whether they represent a number between
// 1 and 31. If so, the callback calls Matcher::AddMatch() to tell the
// matcher there's a <day> there. If not, the callback simply exits
// without calling AddMatch().
//
// Technically, a FILTER callback can make any number of calls to
// AddMatch() or even AddTerminal(). But the expected usage is to just
// make zero or one call to AddMatch(). OUTPUT callbacks are not expected
// to call either of these -- output callbacks are invoked merely as a
// side-effect, not in order to decide whether a rule applies or not.
//
// In the above example, you would probably use three callbacks. Filter
// callbacks on the rules for <day> and <year> would check the numeric
// value of the <string_of_digits>. An output callback on the rule for
// <date> would simply increment the counter of dates found on the page.
//
// Note that callbacks are attached to rules, not to nonterminals. You
// could have two alternative rules for <date> and use a different
// callback for each one.
#ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_
#define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_
#include <array>
#include <functional>
#include <vector>
#include "annotator/types.h"
#include "utils/base/arena.h"
#include "utils/grammar/callback-delegate.h"
#include "utils/grammar/match.h"
#include "utils/grammar/rules_generated.h"
#include "utils/strings/stringpiece.h"
#include "utils/utf8/unilib.h"
namespace libtextclassifier3::grammar {
class Matcher {
public:
explicit Matcher(const UniLib* unilib, const RulesSet* rules,
const std::vector<const RulesSet_::Rules*> rules_shards,
CallbackDelegate* delegate)
: state_(STATE_DEFAULT),
unilib_(*unilib),
arena_(kBlocksize),
rules_(rules),
rules_shards_(rules_shards),
delegate_(delegate) {
TC3_CHECK(rules_ != nullptr);
Reset();
}
explicit Matcher(const UniLib* unilib, const RulesSet* rules,
CallbackDelegate* delegate)
: Matcher(unilib, rules, {}, delegate) {
rules_shards_.reserve(rules->rules()->size());
rules_shards_.insert(rules_shards_.end(), rules->rules()->begin(),
rules->rules()->end());
}
// Resets the matcher.
void Reset();
// Finish the matching.
void Finish();
// Tells the matcher that the given terminal was found occupying position
// range [begin, end) in the input.
// The matcher may invoke callback functions before returning, if this
// terminal triggers any new matches for rules in the grammar.
// Calls to AddTerminal() and AddMatch() must be in left-to-right order,
// that is, the sequence of `end` values must be non-decreasing.
void AddTerminal(const CodepointSpan codepoint_span, const int match_offset,
StringPiece terminal);
void AddTerminal(const CodepointIndex begin, const CodepointIndex end,
StringPiece terminal) {
AddTerminal(CodepointSpan{begin, end}, begin, terminal);
}
// Adds a nonterminal match to the chart.
// This can be invoked by the lexer if the lexer needs to add nonterminals to
// the chart.
void AddMatch(Match* match);
// Allocates memory from an area for a new match.
// The `size` parameter is there to allow subclassing of the match object
// with additional fields.
Match* AllocateMatch(const size_t size) {
return reinterpret_cast<Match*>(arena_.Alloc(size));
}
template <typename T>
T* AllocateMatch() {
return reinterpret_cast<T*>(arena_.Alloc(sizeof(T)));
}
template <typename T, typename... Args>
T* AllocateAndInitMatch(Args... args) {
T* match = AllocateMatch<T>();
match->Init(args...);
return match;
}
// Returns the current number of bytes allocated for all match objects.
size_t ArenaSize() const { return arena_.status().bytes_allocated(); }
private:
static constexpr int kBlocksize = 16 << 10;
// The state of the matcher.
enum State {
// The matcher is in the default state.
STATE_DEFAULT = 0,
// The matcher is currently processing queued match items.
STATE_PROCESSING = 1,
};
State state_;
// Process matches from lhs set.
void ExecuteLhsSet(const CodepointSpan codepoint_span, const int match_offset,
const int whitespace_gap,
const std::function<void(Match*)>& initializer,
const RulesSet_::LhsSet* lhs_set,
CallbackDelegate* delegate);
// Queues a newly created match item.
void QueueForProcessing(Match* item);
// Queues a match item for later post checking of the exclusion condition.
// For exclusions we need to check that the `item->excluded_nonterminal`
// doesn't match the same span. As we cannot know which matches have already
// been added, we queue the item for later post checking - once all matches
// up to `item->codepoint_span.second` have been added.
void QueueForPostCheck(ExclusionMatch* item);
// Adds pending items to the chart, possibly generating new matches as a
// result.
void ProcessPendingSet();
// Returns whether the chart contains a match for a given nonterminal.
bool ContainsMatch(const Nonterm nonterm, const CodepointSpan& span) const;
// Checks all pending exclusion matches that their exclusion condition is
// fulfilled.
void ProcessPendingExclusionMatches();
UniLib unilib_;
// Memory arena for match allocation.
UnsafeArena arena_;
// The end position of the most recent match or terminal, for sanity
// checking.
int last_end_;
// Rules.
const RulesSet* rules_;
// The set of items pending to be added to the chart as a singly-linked list.
Match* pending_items_;
// The set of items pending to be post-checked as a singly-linked list.
ExclusionMatch* pending_exclusion_items_;
// The chart data structure: a hashtable containing all matches, indexed by
// their end positions.
static constexpr int kChartHashTableNumBuckets = 1 << 8;
static constexpr int kChartHashTableBitmask = kChartHashTableNumBuckets - 1;
std::array<Match*, kChartHashTableNumBuckets> chart_;
// The active rule shards.
std::vector<const RulesSet_::Rules*> rules_shards_;
// The callback handler.
CallbackDelegate* delegate_;
};
} // namespace libtextclassifier3::grammar
#endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_