| /* Copyright 2016 Google Inc. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #ifndef RELEVANT_SCRIPT_FEATURE_H_ |
| #define RELEVANT_SCRIPT_FEATURE_H_ |
| |
| #include "feature_extractor.h" |
| #include "cld_3/protos/sentence.pb.h" |
| #include "sentence_features.h" |
| #include "task_context.h" |
| #include "workspace.h" |
| |
| namespace chrome_lang_id { |
| |
| // Given a sentence, generates one FloatFeatureValue for each "relevant" Unicode |
| // script (defined below). Each such feature indicates the script and the ratio |
| // of the sentence's UTF8 characters that are in that script. |
| // |
| // What is a relevant script? Recognizing all 100+ Unicode scripts would cost |
| // too much code size and runtime, so we focus only on a few scripts that |
| // communicate a lot of language information. E.g., Hiragana characters almost |
| // always indicate Japanese, so Hiragana is "relevant" for us; Latin is used by |
| // dozens of languages, so Latin is not relevant in this context. |
| // Members are declared without bodies here; each one is marked `override`. |
| class RelevantScriptFeature : public WholeSentenceFeature { |
| public: |
| void Setup(TaskContext *context) override; |
| void Init(TaskContext *context) override; |
| |
| // Appends the features computed from `sentence` to the feature vector `*result`. |
| void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence, |
| FeatureVector *result) const override; |
| }; |
| |
| } // namespace chrome_lang_id |
| |
| #endif // RELEVANT_SCRIPT_FEATURE_H_ |