// blob: 6e1389043280617c655b1f2fdab92529dd2cc8d8 [file] [log] [blame]
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Common feature types for parser components.
#ifndef FEATURE_TYPES_H_
#define FEATURE_TYPES_H_
#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include "base.h"
namespace chrome_lang_id {
// TODO(djweiss) Clean this up as well.
// Predicate is the integer type used for predicate values. Feature values use
// the same underlying type, so FeatureValue is simply an alias of Predicate.
using Predicate = int64;
using FeatureValue = Predicate;
// Each feature value in a feature vector has a feature type. The feature type
// is used for converting feature type and value pairs to predicate values. The
// feature type can also return names for feature values and calculate the size
// of the feature value domain. The FeatureType class is abstract and must be
// specialized for the concrete feature types.
class FeatureType {
 public:
  // Initializes a feature type with the given name. The constructor is only
  // declared here; its definition lives outside this header.
  explicit FeatureType(const string &name);
  virtual ~FeatureType();

  // Converts a feature value to a human-readable name. Must be implemented by
  // every concrete feature type.
  virtual string GetFeatureValueName(FeatureValue value) const = 0;

  // Returns the size of the feature values domain. Must be implemented by
  // every concrete feature type.
  virtual int64 GetDomainSize() const = 0;

  // Returns the feature type name.
  const string &name() const { return name_; }

  // Accessors for the "base" predicate of this feature type (see base_).
  Predicate base() const { return base_; }
  void set_base(Predicate base) { base_ = base; }

  // Returns true iff this feature is continuous; see FloatFeatureValue.
  bool is_continuous() const { return is_continuous_; }

 private:
  // Feature type name.
  string name_;

  // "Base" feature value: i.e. a "slot" in a global ordering of features.
  // Defaults to 0 so that base() is well defined before set_base() is called,
  // even if the out-of-line constructor does not assign it.
  Predicate base_ = 0;

  // See doc for is_continuous(). There is no setter in this class, so the
  // flag can only be set by the constructor; it defaults to false so that it
  // never holds an indeterminate value.
  bool is_continuous_ = false;
};
// Templated generic resource based feature type. This feature type delegates
// look up of feature value names to an unknown resource class, which is not
// owned. Optionally, this type can also store a mapping of extra values which
// are not in the resource.
//
// Note: this class assumes that Resource->GetFeatureValueName() will return
// successfully for values ONLY in the range [0, Resource->NumValues()). Any
// feature value that is neither in the extra value map nor in the above range
// of Resource is reported with the name "<INVALID>".
template <class Resource>
class ResourceBasedFeatureType : public FeatureType {
 public:
  // Creates a new type with given name, resource object, and a mapping of
  // special values. The values must be greater or equal to
  // resource->NumValues() so as to avoid collisions; this is verified with
  // CHECK at creation (NOTE(review): the constructor definition is not
  // visible in this header -- confirm the check is still performed).
  ResourceBasedFeatureType(const string &name, const Resource *resource,
                           const std::map<FeatureValue, string> &values);

  // Creates a new type with no special values.
  ResourceBasedFeatureType(const string &name, const Resource *resource);

  // Returns the feature name for a given feature value. First checks the
  // extra values map, then falls back to the resource. Returns "<INVALID>"
  // when the value is in neither, or when no resource is attached.
  string GetFeatureValueName(FeatureValue value) const override {
    // Look the value up once and reuse the iterator instead of searching the
    // map a second time to fetch the name.
    const auto extra = values_.find(value);
    if (extra != values_.end()) {
      return extra->second;
    }
    // resource_ defaults to nullptr, so guard the dereference.
    if (resource_ != nullptr && value >= 0 &&
        value < resource_->NumValues()) {
      return resource_->GetFeatureValueName(value);
    }
    return "<INVALID>";
  }

  // Returns the number of possible values for this feature type, i.e. one
  // more than the maximum value recorded in max_value_.
  FeatureValue GetDomainSize() const override { return max_value_ + 1; }

 protected:
  // Shared resource. Not owned.
  const Resource *resource_ = nullptr;

  // Maximum possible value this feature could take. Defaults to -1 so that
  // GetDomainSize() reports an empty domain (0) rather than reading an
  // indeterminate value if a constructor does not assign it.
  FeatureValue max_value_ = -1;

  // Mapping for extra feature values not in the resource.
  std::map<FeatureValue, string> values_;
};
// Feature type that is defined using an explicit map from FeatureValue to
// string values. This can reduce some of the boilerplate when defining
// features that generate enum values. Example usage:
//
//   class BeverageSizeFeature : public FeatureFunction<Beverage> {
//     enum FeatureValue { SMALL, MEDIUM, LARGE };  // values for this feature
//     void Init(TaskContext *context) override {
//       set_feature_type(new EnumFeatureType(
//           "beverage_size",
//           {{SMALL, "SMALL"}, {MEDIUM, "MEDIUM"}, {LARGE, "LARGE"}}));
//     }
//     [...]
//   };
class EnumFeatureType : public FeatureType {
public:
// Creates a feature type called `name` whose value names are given by
// `value_names` (feature value -> name). Only declared here; the definition
// is outside this header.
EnumFeatureType(const string &name,
const std::map<FeatureValue, string> &value_names);
~EnumFeatureType() override;
// Returns the feature name for a given feature value.
string GetFeatureValueName(FeatureValue value) const override;
// Returns the number of possible values for this feature type. This is one
// greater than the largest value in the value_names map.
FeatureValue GetDomainSize() const override;
protected:
// Size of the feature value domain; starts at 0.
// NOTE(review): presumably this is the value returned by GetDomainSize()
// (one greater than the largest key in value_names_) -- the definitions are
// not visible in this header, so confirm against the implementation file.
FeatureValue domain_size_ = 0;
// Names of feature values, keyed by feature value.
std::map<FeatureValue, string> value_names_;
};
} // namespace chrome_lang_id
#endif // FEATURE_TYPES_H_