// src/language_identifier_features.h - external/github.com/google/cld_3 - Git at Google

 /* Copyright 2016 Google Inc. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 #ifndef LANGUAGE_IDENTIFIER_FEATURES_H_
 #define LANGUAGE_IDENTIFIER_FEATURES_H_

 #include <string>

 #include "feature_extractor.h"
 #include "feature_types.h"
 #include "script_span/generated_ulscript.h"
 #include "cld_3/protos/sentence.pb.h"
 #include "sentence_features.h"
 #include "task_context.h"
 #include "workspace.h"

 namespace chrome_lang_id {

 // Feature type for numeric features. Derives from FeatureType and overrides
 // the value-name and domain-size queries declared below. Only declarations
 // are given in this header; the definitions live elsewhere.
 class NumericFeatureType : public FeatureType {
  public:
   // Initializes numeric feature.
   //   name: name of the feature type.
   //   size: size of the feature domain. NOTE(review): presumably stored in
   //     size_ and reported by GetDomainSize() -- confirm in the definition.
   NumericFeatureType(const string &name, FeatureValue size);

   // Returns the string name of the given numeric feature value.
   string GetFeatureValueName(FeatureValue value) const override;

   // Returns the number of feature values.
   FeatureValue GetDomainSize() const override;

  private:
   // Number of feature values in the domain. NOTE(review): presumably set from
   // the constructor's 'size' argument -- the constructor body is not visible
   // here.
   FeatureValue size_;
 };

 // Class for computing continuous char ngram features.
 // Feature function descriptor parameters:
 //   include_terminators(bool, false):
 //     If 'true', then splits the text based on spaces to get tokens, adds "^"
 //     to the beginning of each token, and adds "$" to the end of each token.
 //   include_spaces(bool, false):
 //     If 'true', then includes char ngrams containing spaces.
 //   use_equal_weight(bool, false):
 //     If 'true', then weighs each unique ngram by 1.0 / (number of unique
 //     ngrams in the input). Otherwise, weighs each unique ngram by (ngram
 //     count) / (total number of ngrams).
 //   id_dim(int, 10000):
 //     The integer id of each char ngram is computed as follows:
 //     Hash32WithDefaultSeed(char ngram) % id_dim.
 //   size(int, 3):
 //     Only ngrams of this size will be extracted.
 //
 // Every data member carries a default member initializer equal to the
 // documented default of its descriptor parameter, so an instance never holds
 // indeterminate values, even if it is inspected before Setup()/Init() run.
 class ContinuousBagOfNgramsFunction : public WholeSentenceFeature {
  public:
   // Initialization hooks overridden from the base class.
   // NOTE(review): presumably Setup() reads the descriptor parameters listed
   // above into the members below and Init() registers the feature type; the
   // definitions are not in this header -- confirm in the .cc file.
   void Setup(TaskContext *context) override;
   void Init(TaskContext *context) override;

   // Appends the features computed from the focus to the feature vector.
   //   workspaces: workspace set passed through by the feature extractor.
   //   sentence: the input whose char ngrams are extracted.
   //   result: output feature vector that the features are appended to.
   void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
                 FeatureVector *result) const override;

  private:
   // Descriptor parameter 'include_terminators'.
   // If 'true', then splits the text based on spaces to get tokens, adds "^" to
   // the beginning of each token, and adds "$" to the end of each token.
   bool include_terminators_ = false;

   // Descriptor parameter 'include_spaces'.
   // If 'true', then includes char ngrams containing spaces.
   bool include_spaces_ = false;

   // Descriptor parameter 'use_equal_weight'.
   // If 'true', then weighs each unique ngram by 1.0 / (number of unique ngrams
   // in the input). Otherwise, weighs each unique ngram by (ngram count) /
   // (total number of ngrams).
   bool use_equal_ngram_weight_ = false;

   // Descriptor parameter 'id_dim'.
   // The integer id of each char ngram is computed as follows:
   // Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_.
   int ngram_id_dimension_ = 10000;

   // Descriptor parameter 'size'.
   // Only ngrams of size ngram_size_ will be extracted.
   int ngram_size_ = 3;
 };

 // Feature that reports the script of a piece of text, using the script
 // recognition code ported from CLD2. Supported scripts are the values of
 // chrome_lang_id::CLD2::ULScript, with one refinement: ULScript_Hani is split
 // into a non-Korean part, emitted as ULScript_Hani, and Korean (Hangul),
 // emitted as NUM_ULSCRIPTS.
 //
 // Preconditions on the input: it must be (1) interchange valid UTF8 and
 // (2) made up of a single chrome_lang_id::CLD2::ULScript.
 class ScriptFeature : public WholeSentenceFeature {
  public:
   // Registers a numeric feature type for this feature.
   void Init(TaskContext *context) override {
     // One value per CLD2 script, plus one extra value because ULScript_Hani is
     // reported as two distinct values (see the class comment).
     const FeatureValue domain_size = chrome_lang_id::CLD2::NUM_ULSCRIPTS + 1;
     set_feature_type(new NumericFeatureType(name(), domain_size));
   }

   // Computes the script feature for 'sentence' and returns its value.
   FeatureValue Compute(const WorkspaceSet &workspaces, const Sentence &sentence,
                        const FeatureVector *result) const override;
 };

 }  // namespace chrome_lang_id

 #endif  // LANGUAGE_IDENTIFIER_FEATURES_H_
	/* Copyright 2016 Google Inc. All Rights Reserved.

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==============================================================================*/

	#ifndef LANGUAGE_IDENTIFIER_FEATURES_H_
	#define LANGUAGE_IDENTIFIER_FEATURES_H_

	#include <string>

	#include "feature_extractor.h"
	#include "feature_types.h"
	#include "script_span/generated_ulscript.h"
	#include "cld_3/protos/sentence.pb.h"
	#include "sentence_features.h"
	#include "task_context.h"
	#include "workspace.h"

	namespace chrome_lang_id {

	// Feature type for numeric features. Derives from FeatureType and overrides
	// the value-name and domain-size queries declared below. Only declarations
	// are given in this header; the definitions live elsewhere.
	class NumericFeatureType : public FeatureType {
	public:
	// Initializes numeric feature.
	// name: name of the feature type.
	// size: size of the feature domain. NOTE(review): presumably stored in
	// size_ and reported by GetDomainSize() -- confirm in the definition.
	NumericFeatureType(const string &name, FeatureValue size);

	// Returns the string name of the given numeric feature value.
	string GetFeatureValueName(FeatureValue value) const override;

	// Returns the number of feature values.
	FeatureValue GetDomainSize() const override;

	private:
	// Number of feature values in the domain. NOTE(review): presumably set from
	// the constructor's 'size' argument -- the constructor body is not visible
	// here.
	FeatureValue size_;
	};

	// Class for computing continuous char ngram features.
	// Feature function descriptor parameters:
	//   include_terminators(bool, false):
	//     If 'true', then splits the text based on spaces to get tokens, adds "^"
	//     to the beginning of each token, and adds "$" to the end of each token.
	//   include_spaces(bool, false):
	//     If 'true', then includes char ngrams containing spaces.
	//   use_equal_weight(bool, false):
	//     If 'true', then weighs each unique ngram by 1.0 / (number of unique
	//     ngrams in the input). Otherwise, weighs each unique ngram by (ngram
	//     count) / (total number of ngrams).
	//   id_dim(int, 10000):
	//     The integer id of each char ngram is computed as follows:
	//     Hash32WithDefaultSeed(char ngram) % id_dim.
	//   size(int, 3):
	//     Only ngrams of this size will be extracted.
	//
	// Every data member carries a default member initializer equal to the
	// documented default of its descriptor parameter, so an instance never holds
	// indeterminate values, even if it is inspected before Setup()/Init() run.
	class ContinuousBagOfNgramsFunction : public WholeSentenceFeature {
	 public:
	  // Initialization hooks overridden from the base class.
	  // NOTE(review): presumably Setup() reads the descriptor parameters listed
	  // above into the members below and Init() registers the feature type; the
	  // definitions are not in this header -- confirm in the .cc file.
	  void Setup(TaskContext *context) override;
	  void Init(TaskContext *context) override;

	  // Appends the features computed from the focus to the feature vector.
	  //   workspaces: workspace set passed through by the feature extractor.
	  //   sentence: the input whose char ngrams are extracted.
	  //   result: output feature vector that the features are appended to.
	  void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
	                FeatureVector *result) const override;

	 private:
	  // Descriptor parameter 'include_terminators'.
	  // If 'true', then splits the text based on spaces to get tokens, adds "^" to
	  // the beginning of each token, and adds "$" to the end of each token.
	  bool include_terminators_ = false;

	  // Descriptor parameter 'include_spaces'.
	  // If 'true', then includes char ngrams containing spaces.
	  bool include_spaces_ = false;

	  // Descriptor parameter 'use_equal_weight'.
	  // If 'true', then weighs each unique ngram by 1.0 / (number of unique ngrams
	  // in the input). Otherwise, weighs each unique ngram by (ngram count) /
	  // (total number of ngrams).
	  bool use_equal_ngram_weight_ = false;

	  // Descriptor parameter 'id_dim'.
	  // The integer id of each char ngram is computed as follows:
	  // Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_.
	  int ngram_id_dimension_ = 10000;

	  // Descriptor parameter 'size'.
	  // Only ngrams of size ngram_size_ will be extracted.
	  int ngram_size_ = 3;
	};

	// Feature that reports the script of a piece of text, using the script
	// recognition code ported from CLD2. Supported scripts are the values of
	// chrome_lang_id::CLD2::ULScript, with one refinement: ULScript_Hani is split
	// into a non-Korean part, emitted as ULScript_Hani, and Korean (Hangul),
	// emitted as NUM_ULSCRIPTS.
	//
	// Preconditions on the input: it must be (1) interchange valid UTF8 and
	// (2) made up of a single chrome_lang_id::CLD2::ULScript.
	class ScriptFeature : public WholeSentenceFeature {
	 public:
	  // Registers a numeric feature type for this feature.
	  void Init(TaskContext *context) override {
	    // One value per CLD2 script, plus one extra value because ULScript_Hani is
	    // reported as two distinct values (see the class comment).
	    const FeatureValue domain_size = chrome_lang_id::CLD2::NUM_ULSCRIPTS + 1;
	    set_feature_type(new NumericFeatureType(name(), domain_size));
	  }

	  // Computes the script feature for 'sentence' and returns its value.
	  FeatureValue Compute(const WorkspaceSet &workspaces, const Sentence &sentence,
	                       const FeatureVector *result) const override;
	};

	} // namespace chrome_lang_id

	#endif // LANGUAGE_IDENTIFIER_FEATURES_H_