blob: ce808105769bb320ceb8e1c40aab698edcc06860 [file] [log] [blame]
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef RELEVANT_SCRIPT_FEATURE_H_
#define RELEVANT_SCRIPT_FEATURE_H_
#include "feature_extractor.h"
#include "cld_3/protos/sentence.pb.h"
#include "sentence_features.h"
#include "task_context.h"
#include "workspace.h"
namespace chrome_lang_id {
// Given a sentence, generates one FloatFeatureValue for each "relevant" Unicode
// script (see below): each such feature indicates the script and the ratio of
// UTF8 characters in that script, in the given sentence.
//
// What is a relevant script? Recognizing all 100+ Unicode scripts would
// require too much code size and runtime. Instead, we focus only on a few
// scripts that communicate a lot of language information: e.g., the use of
// Hiragana characters almost always indicates Japanese, so Hiragana is a
// "relevant" script for us. The Latin script is used by dozens of languages,
// so Latin is not relevant in this context.
class RelevantScriptFeature : public WholeSentenceFeature {
public:
// Overrides the inherited setup hook; receives the task |context|.
// NOTE(review): the definition is not in this header -- presumably this is
// where the feature requests whatever it needs from the context; confirm
// against the implementation.
void Setup(TaskContext *context) override;
// Overrides the inherited initialization hook; receives the task |context|.
// NOTE(review): presumably called after Setup() and before Evaluate();
// confirm the expected call order against the base class.
void Init(TaskContext *context) override;
// Appends the features computed from the sentence to the feature vector.
//
// |sentence| is the input to extract features from and |result| is the
// output vector that the features are appended to. The method is declared
// const, so it cannot modify this object's non-mutable state.
// NOTE(review): how |workspaces| is used is not visible from this header.
void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
FeatureVector *result) const override;
};
} // namespace chrome_lang_id
#endif // RELEVANT_SCRIPT_FEATURE_H_