blob: bdf7f42b9f687a561dc6d1544c8815fb2de50d4b [file] [log] [blame]
// Copyright 2021 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This schema is written in proto2 syntax; fields carry explicit
// optional/repeated labels.
syntax = "proto2";
// Generate code against the lite protobuf runtime.
option optimize_for = LITE_RUNTIME;
// All definitions in this file live in the segmentation_platform.proto package.
package segmentation_platform.proto;
import "components/segmentation_platform/internal/proto/aggregation.proto";
import "components/segmentation_platform/internal/proto/types.proto";
// Metadata version of the current code. It is used to verify whether the
// metadata provided by the server is supported by this version of the code.
// Bump the number for any new feature added to the metadata proto, and record
// the change in the version log below.
//
// Version log:
//   0: UMA features and aggregation in the |features| field.
//   1: UMA features, custom inputs and sql features in the |input_features|
//      field.
//   2: Training data output collection in the |training_outputs| field.
//
// NOTE(review): the log already describes version 2 while METADATA_VERSION is
// still 1 -- confirm whether the bump is intentionally pending.
enum CurrentVersion {
  METADATA_VERSION = 1;
}
// Metadata version numbers attached to a segmentation model.
message VersionInfo {
  // Lowest metadata version that supports this model. Some newer
  // features/fields might not be available before this version. Set on the
  // server and read by the client to verify that the model is valid.
  optional int32 metadata_min_version = 1;

  // Current model metadata version. Set by the client when it sends a model
  // download request to the optimization guide server, so that the server
  // knows the capabilities of the client.
  optional int32 metadata_cur_version = 2;
}
// Time unit referenced by the other messages in this proto.
enum TimeUnit {
  UNKNOWN_TIME_UNIT = 0;
  YEAR = 1;
  MONTH = 2;
  WEEK = 3;
  DAY = 4;
  HOUR = 5;
  MINUTE = 6;
  SECOND = 7;
}
// Describes a feature backed by a histogram or user action signal.
message UMAFeature {
  // Signal type this feature refers to.
  // Note: SignalType::UKM_EVENT is only used for SignalStorageConfig and
  // should not be used as a UMA feature's signal type.
  optional SignalType type = 1;

  // Human readable name of the histogram or user action.
  optional string name = 2;

  // Hash of the histogram name or user action. Must match the result of
  // base::HashMetricName.
  optional fixed64 name_hash = 3;

  // Number of buckets to include in the result. When set to 0, no data is
  // collected; this can be used to start storing data before it should be
  // used. See the documentation for Aggregation for details.
  optional uint64 bucket_count = 4;

  // Required length of the calculated result. See the documentation for
  // Aggregation for details.
  optional uint64 tensor_length = 5;

  // Aggregation type to apply to this particular feature.
  optional Aggregation aggregation = 6;

  // Only set if type == HISTOGRAM_ENUM.
  // A sample only matches when its enum ID equals any one of these values,
  // i.e. the list works like an OR condition: [url, search, ...] or just
  // [url].
  repeated int32 enum_ids = 7;
}
// An input whose values are produced by a custom function, selected through
// |fill_policy|.
message CustomInput {
  // Required. Its meaning depends on how the input is consumed:
  // 1. Used directly as an input tensor field of the model: the number of
  //    columns to fill in the tensor. In this case the value should be float.
  // 2. Used as a bind value for sql features: the number of sql bindings to
  //    fill in the sql query.
  optional int32 tensor_length = 1;

  // Distinguishes the different types of custom inputs. Each policy names a
  // custom function provided by the engine that fills in the input feature to
  // the model.
  enum FillPolicy {
    UNKNOWN_FILL_POLICY = 0;

    // Outputs the time at which the model prediction is needed. Can be used
    // to bind a TIME type param to queries.
    // Output type: Time
    // Output length: 1
    FILL_PREDICTION_TIME = 1;

    // Outputs two timestamps, the beginning and the end of the last x days.
    // Can be used to bind TIME type params to a query within a time interval.
    // Output type: Time
    // Output length: 2
    // Additional arg:
    //   `bucket_count`: Required. Number of buckets to include in the result.
    TIME_RANGE_BEFORE_PREDICTION = 2;

    // Used to determine whether a given page is a product details page and
    // can be price tracked.
    PRICE_TRACKING_HINTS = 3;
  }

  // Fill type of this custom input.
  optional FillPolicy fill_policy = 2;

  // Fallback used when the current chrome version does not support the fill
  // type. If it is not specified and the function is unavailable, the model
  // will not run due to missing input. The number of entries should be equal
  // to |tensor_length|.
  repeated float default_value = 3;

  // Extra arguments, for fill types that need them.
  map<string, string> additional_args = 4;

  // Human readable name of the custom input.
  optional string name = 5;
}
// Describes which signals are stored in the SQL database.
message SignalFilterConfig {
  // A single UKM event that should be stored.
  message UkmEvent {
    // Event hash of the UKM event.
    optional uint64 event_hash = 1;

    // Metric hashes of the event to store in the database. Providing the list
    // of necessary metrics is required.
    // TODO: Support empty metric hash list, the database will store all the
    // metrics for the UKM event.
    repeated uint64 metric_hash_filter = 2;
  }

  // UKM events to store in the database.
  repeated UkmEvent ukm_events = 1;
}
// An input feature computed by running a SQL query.
message SqlFeature {
  // The query; it should select a single float column. It can contain '?'
  // placeholders, whose values are bound using the |bind_values| list.
  // TODO(ssid): Consider expanding this to return multiple input tensor
  // features.
  optional string sql = 1;

  // Signals that the query needs to be present in the storage.
  optional SignalFilterConfig signal_filter = 2;

  // A value to bind into the SQL query.
  message BindValue {
    // Bind field numbers, in the range 0 to n-1 for n question marks in the
    // SQL query.
    repeated int32 bind_field_index = 1;

    // Selects which Bind*() of sql::Statement is called.
    enum ParamType {
      UNKNOWN = 0;
      NULL = 1;
      BOOL = 2;
      INT = 3;
      INT64 = 4;
      DOUBLE = 5;
      STRING = 6;
      TIME = 7;
    }
    optional ParamType param_type = 2;

    // Input whose value is bound to the query. The custom function should
    // return the specified param type. The |tensor_length| should be 0 since
    // these inputs can only be used for SQL bind values.
    // NOTE(review): the comment on CustomInput.tensor_length describes it as
    // the number of sql bindings in this use case -- confirm which is
    // intended.
    optional CustomInput value = 3;
  }
  repeated BindValue bind_values = 3;

  // Human readable name of the ukm event and metric.
  optional string name = 4;
}
// One input to the ML model. Exactly one kind of feature can be set.
message InputFeature {
  oneof Feature {
    // Input feature of UMAFeature type.
    UMAFeature uma_feature = 1;

    // Input feature of custom input type.
    CustomInput custom_input = 2;

    // Input feature computed using a SQL query.
    SqlFeature sql_feature = 3;
  }
}
// A list of training output generators. The ML model pipeline can iterate on
// the different output candidates and select the final output generator.
message TrainingOutputs {
  repeated TrainingOutput outputs = 1;
}
// Generic definition of how the training data output is generated.
// TODO(xingliu): Add more implementation details about how output training data
// is generated.
message TrainingOutput {
  oneof output {
    // The training data output is generated from UMA metrics.
    UMAOutput uma_output = 1;
  }
}
// Information needed to generate the training data output from a particular
// UMA metric.
message UMAOutput {
  // UMA metric from which the training data output is generated.
  optional UMAFeature uma_feature = 1;

  // Duration to trigger a training data collection, unit in TimeUnit. If not
  // specified or 0, the training data will be generated immediately after
  // certain UMA is recorded.
  optional uint64 duration = 2;
}
// Metadata describing the segmentation model of a given segment: how to use
// the model, such as collecting signals, interpreting results etc.
// Next tag: 12
message SegmentationModelMetadata {
  // Version information needed to validate segmentation models.
  optional VersionInfo version_info = 9;

  // DEPRECATED: Use |input_features.uma_feature| instead. An ordered list of
  // required features. Only one of |features| or |input_features| can be used
  // in the config, not both.
  repeated UMAFeature features = 1;

  // An ordered list of required features and custom inputs. Only one of
  // |features| or |input_features| can be used in the config, not both.
  repeated InputFeature input_features = 10;

  // Training data output definitions.
  optional TrainingOutputs training_outputs = 11;

  // Time unit to be used for the rest of this proto.
  optional TimeUnit time_unit = 2;

  // Size of each interval the data should be aggregated over.
  optional uint64 bucket_duration = 3;

  // How long data should be stored for this model.
  optional int64 signal_storage_length = 4;

  // How long data has to have been captured for this model. If the relevant
  // signals have been captured for a shorter amount of time than this, this
  // model can never be selected.
  optional int64 min_signal_collection_length = 5;

  // How long after a valid result has been calculated for this model it is
  // OK to cache the result without recalculating with updated data.
  optional int64 result_time_to_live = 6;

  message DiscreteMapping {
    // Maps the raw continuous result to a discrete and comparable value based
    // on |rank|.
    message Entry {
      // Minimum model result that allows this mapping to be chosen.
      optional float min_result = 1;

      // A feature specific rank.
      optional int64 rank = 2;
    }

    // Discrete mappings, ordered by their |min_result|. To map a model
    // evaluation result to an entry, choose the highest |min_result| that the
    // evaluation result is at or above.
    // E.g. for these mappings: [(0.0, 0), (0.4, 1), (0.7, 2), (0.9, 3)], a
    // result of 0.7 would yield (0.7, 2), and 0.69 would yield (0.4, 1).
    repeated Entry entries = 1;
  }

  // Discrete mappings, keyed by a string.
  map<string, DiscreteMapping> discrete_mappings = 7;

  // Default key to use during the mapping process if no key has been
  // provided.
  optional string default_discrete_mapping = 8;
}