// blob: 0cb6d559ce44f2a4ea50a14c41ef245ae9f5c06c [file] [log] [blame]
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "relevant_script_feature.h"
#include <ctype.h>
#include <string>
#include "feature_extractor.h"
#include "feature_types.h"
#include "language_identifier_features.h"
#include "script_detector.h"
#include "cld_3/protos/sentence.pb.h"
#include "sentence_features.h"
#include "task_context.h"
#include "utils.h"
#include "workspace.h"
namespace chrome_lang_id {
// Setup hook for the relevant-script feature. This feature has no setup work
// to perform, so |context| is never read or modified here.
void RelevantScriptFeature::Setup(TaskContext *context) {
  // Intentionally empty.
}
// Registers the feature type for this feature: a NumericFeatureType built
// from this feature's name() and kNumRelevantScripts. |context| is not used.
void RelevantScriptFeature::Init(TaskContext *context) {
  // NOTE(review): set_feature_type() presumably takes ownership of the
  // allocated object -- confirm against its declaration.
  auto *const script_feature_type =
      new NumericFeatureType(name(), kNumRelevantScripts);
  set_feature_type(script_feature_type);
}
// Scans the text of |sentence| one UTF-8 character at a time, counts how many
// of the retained characters belong to each script, and adds to |result| one
// value per script that occurs at least once. The weight of a script is its
// count divided by the total number of counted characters.
//
// workspaces: not used by this feature.
// sentence:   supplies the text to scan (via sentence.text()).
// result:     receives one feature value for each script with a non-zero
//             count; nothing is added if no character is counted.
void RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces,
                                     const Sentence &sentence,
                                     FeatureVector *result) const {
  const string &text = sentence.text();

  // We expect kNumRelevantScripts to be small, so we stack-allocate the array
  // of counts.  Still, if that changes, we want to find out.
  static_assert(
      kNumRelevantScripts < 25,
      "switch counts to vector<int>: too big for stack-allocated int[]");

  // counts[s] is the number of characters with script s.
  // Note: {} "value-initializes" the array to zero.
  int counts[kNumRelevantScripts]{};
  int total_count = 0;

  const char *const text_end = text.data() + text.size();

  // Length in bytes of the character at |curr|.  It is computed once per
  // iteration and reused by the loop increment, so that `continue` below still
  // advances past the current character.
  int num_bytes = 0;
  for (const char *curr = text.data(); curr < text_end; curr += num_bytes) {
    num_bytes = utils::OneCharLen(curr);

    // If a partial UTF-8 character is encountered, break out of the loop.
    // Compare against the remaining length instead of forming
    // `curr + num_bytes`, which could point past one-past-the-end of the
    // buffer.
    if (num_bytes > text_end - curr) {
      break;
    }

    // Skip spaces, numbers, punctuation, and all other non-alpha ASCII
    // characters: these characters are used in so many languages, they do not
    // communicate language-related information.
    //
    // The cast to unsigned char is required: passing a negative value (a
    // byte >= 0x80 where plain char is signed) to isalpha() is undefined
    // behavior.
    if ((num_bytes == 1) && !isalpha(static_cast<unsigned char>(*curr))) {
      continue;
    }

    Script script = GetScript(curr, num_bytes);
    CLD3_DCHECK(script >= 0);
    CLD3_DCHECK(script < kNumRelevantScripts);
    counts[static_cast<int>(script)]++;
    total_count++;
  }

  for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) {
    const int count = counts[script_id];
    if (count > 0) {
      // total_count > 0 here: every increment of counts[] above is paired with
      // an increment of total_count, so the division is safe.
      const float weight = static_cast<float>(count) / total_count;

      // NOTE(review): FloatFeatureValue presumably packs (script_id, weight)
      // into the single discrete_value that is added below -- confirm against
      // its definition.
      FloatFeatureValue value(script_id, weight);
      result->add(feature_type(), value.discrete_value);
    }
  }
}
} // namespace chrome_lang_id