// blob: c390c3e589cf05ef182cb6a788ecd8a735866e34
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "utils/grammar/analyzer.h"
#include "utils/base/status_macros.h"
#include "utils/utf8/unicodetext.h"
namespace libtextclassifier3::grammar {
// Constructs an analyzer that creates its own ICU tokenizer.
//
// `unilib` is handed to both the tokenizer and the parser. `rules_set` is
// handed to the parser and is queried for its semantic values schema; it is
// dereferenced unconditionally here, so it must not be null.
Analyzer::Analyzer(const UniLib* unilib, const RulesSet* rules_set)
    // TODO(smillius): Add tokenizer options to `RulesSet`.
    // The tokenizer is allocated with `new` directly: the empty `{}`
    // arguments could not be deduced through a forwarding factory such as
    // std::make_unique.
    : owned_tokenizer_(new Tokenizer(
          libtextclassifier3::TokenizationType_ICU, unilib,
          /*codepoint_ranges=*/{},
          /*internal_tokenizer_codepoint_ranges=*/{},
          /*split_on_script_change=*/false,
          /*icu_preserve_whitespace_tokens=*/false)),
      // `tokenizer_` points at the instance stored in `owned_tokenizer_`.
      tokenizer_(owned_tokenizer_.get()),
      parser_(unilib, rules_set),
      // Pass nullptr when the rules set has no semantic values schema,
      // otherwise the flatbuffer root of that schema.
      semantic_evaluator_(
          rules_set->semantic_values_schema() == nullptr
              ? nullptr
              : flatbuffers::GetRoot<reflection::Schema>(
                    rules_set->semantic_values_schema()->data())) {}
// Constructs an analyzer that uses a caller-supplied tokenizer.
//
// Only the `tokenizer` pointer is stored; `owned_tokenizer_` is not set by
// this constructor. `rules_set` is dereferenced unconditionally here, so it
// must not be null.
Analyzer::Analyzer(const UniLib* unilib, const RulesSet* rules_set,
                   const Tokenizer* tokenizer)
    : tokenizer_(tokenizer),
      parser_(unilib, rules_set),
      // Pass nullptr when the rules set has no semantic values schema,
      // otherwise the flatbuffer root of that schema.
      semantic_evaluator_(
          rules_set->semantic_values_schema() == nullptr
              ? nullptr
              : flatbuffers::GetRoot<reflection::Schema>(
                    rules_set->semantic_values_schema()->data())) {}
// Parses `input` and evaluates a semantic value for each derivation returned
// by ValidDeduplicatedDerivations.
//
// Returns one EvaluatedDerivation per derivation, in iteration order. The
// evaluation of each derivation goes through TC3_ASSIGN_OR_RETURN, which is
// expected to return the error status from this function as soon as one
// evaluation fails (confirm against utils/base/status_macros.h).
//
// `arena` is forwarded to both the parser and the semantic evaluator.
StatusOr<std::vector<EvaluatedDerivation>> Analyzer::Parse(
    const TextContext& input, UnsafeArena* arena) const {
  std::vector<EvaluatedDerivation> result;
  for (const Derivation& derivation :
       ValidDeduplicatedDerivations(parser_.Parse(input, arena))) {
    TC3_ASSIGN_OR_RETURN(const SemanticValue* value,
                         semantic_evaluator_.Eval(input, derivation, arena));
    // `derivation` is bound by const reference, so it is copied into the
    // result. The previous std::move here was a no-op on a const object and
    // only obscured that a copy takes place.
    result.push_back(EvaluatedDerivation{derivation, value});
  }
  return result;
}
// Convenience overload: builds a TextContext from `text` and `locales` with
// BuildTextContextForInput and forwards it, together with `arena`, to the
// TextContext overload of Parse.
StatusOr<std::vector<EvaluatedDerivation>> Analyzer::Parse(
    const UnicodeText& text, const std::vector<Locale>& locales,
    UnsafeArena* arena) const {
  const TextContext context = BuildTextContextForInput(text, locales);
  return Parse(context, arena);
}
// Assembles the TextContext used for parsing `text`.
//
// The returned context holds:
//   - `text`: a UnicodeText constructed from the input with do_copy=false
//     (presumably a non-owning alias, in which case the caller's `text` must
//     outlive the context -- confirm against UnicodeText),
//   - `tokens`: the result of running `tokenizer_` over that text,
//   - `codepoints`: the text's codepoint iterators followed by `end()`,
//   - `locales`: a copy of `locales`,
//   - `context_span`: [0, number of tokens).
TextContext Analyzer::BuildTextContextForInput(
    const UnicodeText& text, const std::vector<Locale>& locales) const {
  TextContext result;
  result.locales = locales;

  result.text = UnicodeText(text, /*do_copy=*/false);
  result.tokens = tokenizer_->Tokenize(result.text);

  // Append the end iterator after the per-codepoint iterators; presumably so
  // that a one-past-the-last position is addressable as well.
  result.codepoints = result.text.Codepoints();
  result.codepoints.push_back(result.text.end());

  // The context span covers every token.
  result.context_span.first = 0;
  result.context_span.second = result.tokens.size();
  return result;
}
} // namespace libtextclassifier3::grammar