| // Copyright 2021 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| syntax = "proto2"; |
| option optimize_for = LITE_RUNTIME; |
| |
| package segmentation_platform.proto; |
| |
| import "components/segmentation_platform/public/proto/aggregation.proto"; |
| import "components/segmentation_platform/public/proto/output_config.proto"; |
| import "components/segmentation_platform/public/proto/types.proto"; |
| |
// Metadata schema version of the current code. It is used to verify whether
// the metadata provided by the server is supported in the current version of
// the code. Bump METADATA_VERSION for any new feature added to the metadata
// proto, and add an entry describing the change to the log below.
//
// Change log:
//   0: UMA features and aggregation in the |features| field.
//   1: UMA features, custom inputs and sql features in the |input_features|
//      field.
//   2: Training data output collection in the |training_outputs| field.
//   3: Trigger configurations for training data collection.
enum CurrentVersion {
  METADATA_VERSION = 3;
}
| |
// Version details used to validate segmentation models.
message VersionInfo {
  // Lowest model metadata version that is supported. Some newer features or
  // fields might not be available before this version. The server sets this
  // field and the client reads it to verify that the model is valid.
  optional int32 metadata_min_version = 1;

  // Current model metadata version. The client sets this field when sending
  // a model download request to the optimization guide server, so that the
  // server knows the capabilities of the client.
  optional int32 metadata_cur_version = 2;
}
| |
// Identifies the source of a model: client side or server side.
enum ModelSource {
  UNKNOWN_MODEL_SOURCE = 0;
  // Represents server side model.
  SERVER_MODEL_SOURCE = 1;
  // Represents client side model.
  DEFAULT_MODEL_SOURCE = 2;
}
| |
// A feature backed by a UMA histogram or user action.
message UMAFeature {
  // Kind of signal this feature refers to.
  // Note: SignalType::UKM_EVENT is only used for SignalStorageConfig and
  // should not be used as a UMA feature's signal type.
  optional SignalType type = 1;

  // Human readable name of the histogram or user action.
  optional string name = 2;

  // Hash of the histogram name or user action. Must match the result of
  // base::HashMetricName.
  optional fixed64 name_hash = 3;

  // How many buckets to include in the result. If set to 0, no data will be
  // collected; this can be used to start storing data before it should be
  // used. See documentation for Aggregation for details.
  optional uint64 bucket_count = 4;

  // Required length of the calculated result. See documentation for
  // Aggregation for details.
  optional uint64 tensor_length = 5;

  // Aggregation type to use for this particular feature.
  optional Aggregation aggregation = 6;

  // Only set if type == HISTOGRAM_ENUM.
  // A match is only valid when the enum ID matches any of these values, i.e.
  // the list works like an OR condition, e.g.: [url, search, ...] or just
  // [url].
  repeated int32 enum_ids = 7;

  // Only set if aggregation == LATEST_OR_DEFAULT.
  // Values used for the model if the latest value requested is not available
  // in the database. The number of entries should be equal to
  // |tensor_length|.
  repeated float default_values = 8;
}
| |
// An input whose value is produced by a fill function. It is used either
// directly as an input tensor field of the model or as a bind value for sql
// features.
message CustomInput {
  // Required. Its meaning depends on how the input is used:
  // 1. If the input is used directly as the input tensor field to the model,
  //    this is the number of columns to fill in the tensor. In this case the
  //    value should be float.
  // 2. If the input is used as a bind value for sql features, this is the
  //    number of sql bindings to fill in the sql query.
  optional int32 tensor_length = 1;

  // Distinguishes between the different types of custom inputs. The policies
  // are custom functions provided by the engine that fill in the input
  // feature to the model.
  enum FillPolicy {
    UNKNOWN_FILL_POLICY = 0;

    // Produces the time at which model prediction is needed. Can be used to
    // bind a TIME type param to queries.
    // Output type: Time
    // Output length: 1
    FILL_PREDICTION_TIME = 1;

    // Produces two timestamps, the beginning and the end of the last x days.
    // Can be used to bind TIME type params to a query within a time interval.
    // Output type: Time
    // Output length: 2
    // Additional arg:
    //   `bucket_count`: Required. Number of buckets to include in the result.
    TIME_RANGE_BEFORE_PREDICTION = 2;

    // Used to determine whether a given page is a product details page and
    // can be price tracked.
    PRICE_TRACKING_HINTS = 3;

    // The input is used directly to fill the input tensor to the model or to
    // another query.
    // Output type: ProcessedValue
    // Output length: 1
    // Additional arg:
    //   `name`: Optional. The name of the field to be looked up in input
    //   context. If missing then the |name| field is used.
    FILL_FROM_INPUT_CONTEXT = 4;

    // Produces a tensor of length 10 consisting of float values denoting
    // various device counts by type, with different form factor and os type.
    // See `SyncDeviceInfoObserver` for a description of each value.
    // Output type: float
    // Output length: 10
    // Additional arg:
    //   `wait_for_device_info_in_seconds`: Number of seconds to wait for sync
    //   device info before timeout. If 0, then does not wait for sync and
    //   times out immediately if device info is not available.
    // InputContext arg:
    //   `active_days_limit`: Number of days after which the device is
    //   considered not active after last sync. Must be INT.
    FILL_SYNC_DEVICE_INFO = 5;

    // Produces a tensor of length 1 consisting of the device RAM in MB.
    // Output type: float
    // Output length: 1
    FILL_DEVICE_RAM_MB = 6;

    // Produces a tensor of length 1 describing the device OS level.
    // Output type: float
    // Output length: 1
    FILL_DEVICE_OS_VERSION_NUMBER = 7;

    // Produces a tensor of length 1 giving pixels per inch for the current
    // device used by the user.
    // Output type: float
    // Output length: 1
    FILL_DEVICE_PPI = 8;

    // Fills metrics about a given tab. A `tab_id` and a `session_tag` are
    // expected from input_context.
    // Output type: float
    // Output length: `TabSessionSource::kNumInputs`
    FILL_TAB_METRICS = 9;

    // Fills a random number between [0, 1).
    // Output type: float
    // Output length: 1
    FILL_RANDOM = 10;

    // Fills various metrics from the shopping service. Currently only
    // supports shopping bookmark count.
    // Output type: float
    // Output length: 1
    FILL_FROM_SHOPPING_SERVICE = 11;
  }

  // Fill type of this custom input.
  optional FillPolicy fill_policy = 2;

  // Fallback used when the current chrome version does not support this fill
  // type. If this is not specified and the function is unavailable, the model
  // will not run due to missing input. The number of entries should be equal
  // to the |tensor_length|.
  repeated float default_value = 3;

  // Additional arguments, for fill types that need them.
  map<string, string> additional_args = 4;

  // Human readable name of the custom input.
  optional string name = 5;
}
| |
// Configures which signals are stored in the SQL database.
message SignalFilterConfig {
  // Describes a single UKM event that should be stored.
  message UkmEvent {
    // Hash of the UKM event.
    optional uint64 event_hash = 1;
    // Metric hashes of the event to store in the database. Providing the
    // list of necessary metrics is required.
    // TODO: Support empty metric hash list, the database will store all the
    // metrics for the UKM event.
    repeated uint64 metric_hash_filter = 2;
  }
  // UKM events to store in the database.
  repeated UkmEvent ukm_events = 1;
}
| |
// An input feature computed using a SQL query.
message SqlFeature {
  // The query; it should select a single float column. It can contain '?'
  // placeholders, which are bound to values using the |bind_values| list.
  // TODO(ssid): Consider expanding this to return multiple input tensor
  // features.
  optional string sql = 1;

  // Signals needed in the storage for the query.
  optional SignalFilterConfig signal_filter = 2;

  // A value to bind into the SQL query.
  message BindValue {
    // Bind field numbers, in the range of 0 to n-1, for n question marks in
    // the SQL query.
    repeated int32 bind_field_index = 1;

    // Selects which Bind*() of sql::Statement is called.
    enum ParamType {
      UNKNOWN = 0;
      NULL = 1;
      BOOL = 2;
      INT = 3;
      INT64 = 4;
      DOUBLE = 5;
      STRING = 6;
      TIME = 7;
    }
    optional ParamType param_type = 2;

    // Input whose value is bound to the query. The custom function should
    // return the specified param type. The |tensor_length| should be 0 since
    // these inputs can only be used for SQL bind values.
    // NOTE(review): CustomInput.tensor_length is documented as the number of
    // sql bindings when used as a bind value; confirm which rule applies.
    optional CustomInput value = 3;
  }
  repeated BindValue bind_values = 3;

  // Human readable name of the ukm event and metric.
  optional string name = 4;
}
| |
// Wraps one feature used as an input to the ML model. At most one of the
// variants below is set.
message InputFeature {
  oneof Feature {
    // Input feature of UMAFeature type.
    UMAFeature uma_feature = 1;

    // Input feature of custom input type.
    CustomInput custom_input = 2;

    // Input feature computed using a SQL query.
    SqlFeature sql_feature = 3;
  }
}
| |
// Holds a list of training output generators. The ML model pipeline can
// iterate on different output candidates and select the final output
// generator.
message TrainingOutputs {
  repeated TrainingOutput outputs = 1;

  // Configures what triggers the training outputs data collection for the
  // current model.
  message TriggerConfig {
    // How the training outputs are collected.
    enum DecisionType {
      // Considered as PERIODIC type by default.
      UNKNOWN = 0;
      // The on demand scheduler triggers training data collection when the
      // client asks for a model execution with input context.
      ONDEMAND = 1;
      // The periodic scheduler triggers training data collection every day.
      // Currently this period is fixed on the client to 1 day.
      PERIODIC = 2;
    }
    optional DecisionType decision_type = 1;

    message ObservationTrigger {
      oneof trigger {
        // Delay, in seconds, to collect output tensors after input tensors
        // are collected. For example, output labels can be collected one
        // week after input tensors are collected. Set to 0 if output tensors
        // need to be collected in the same time period as input tensors.
        uint64 delay_sec = 1;
        // User action or histogram that triggers a training data output
        // collection. Note: Only the name and type should be used with
        // bucket_duration = 0.
        // NOTE(review): neither UMAOutput nor UMAFeature declares a
        // bucket_duration field in this file; confirm which field is meant.
        // TODO(crbug.com/40239034): Figure out how to include the trigger as
        // one of the outputs automatically.
        UMAOutput uma_trigger = 2;
      }
    }
    // Triggers; whichever is hit first is used to upload the training data.
    repeated ObservationTrigger observation_trigger = 2;

    // Only for PERIODIC trigger. The prediction and observation times can
    // each be exact or flexible.
    //
    // Prediction time:
    //  - Exact: forces the prediction time to be the time at which the
    //    segment selection or classification result was changed. The input
    //    features will be collected till the prediction time.
    //  - Flexible (default): allows the collector to pick any point in the
    //    past as the prediction time, usually the current time. The training
    //    data collection is triggered once a day with a rolling window
    //    whenever Chrome is active. This setting uploads more training data
    //    samples.
    //
    // Observation time:
    //  - Exact (default): used only in the exact prediction case; the
    //    observation starts exactly after the prediction time.
    //  - Flexible: can be used to get the most recent user behavior by
    //    setting the observation time to the time of upload, which could be
    //    later than the end of the observation period.
    optional bool use_exact_prediction_time = 3;
    optional bool use_flexible_observation_time = 4;
  }
  optional TriggerConfig trigger_config = 2;
}
| |
// Generic description of how the training data output is generated.
// TODO(xingliu): Add more implementation details about how output training
// data is generated.
message TrainingOutput {
  oneof output {
    // The training data output is generated from UMA metrics.
    UMAOutput uma_output = 1;
  }
}
| |
// Holds the information needed to generate the training data output from a
// particular UMA metric.
message UMAOutput {
  // UMA metric from which the training data output is generated.
  optional UMAFeature uma_feature = 1;

  // Duration to trigger a training data collection, unit in TimeUnit. If not
  // specified or 0, the training data will be generated immediately after
  // certain UMA is recorded.
  optional uint64 duration = 2;
}
| |
// Metadata about a segmentation model for a given segment. Contains
// information on how to use the model such as collecting signals,
// interpreting results etc.
// Next tag: 18
message SegmentationModelMetadata {
  // Values for obsolete fields.
  reserved 15;

  // The version information needed to validate segmentation models.
  optional VersionInfo version_info = 9;

  // DEPRECATED: Use |input_features.uma_feature| instead. Only one of
  // |features| or |input_features| can be used in the config, not both. An
  // ordered list of required features.
  repeated UMAFeature features = 1;

  // An ordered list of required features and custom inputs. Only one of
  // |features| or |input_features| can be used in the config, not both.
  repeated InputFeature input_features = 10;

  // A list of training data output definitions.
  optional TrainingOutputs training_outputs = 11;

  // The time unit to be used for the rest of this proto.
  optional TimeUnit time_unit = 2;

  // The size of each interval the data should be aggregated over.
  optional uint64 bucket_duration = 3;

  // For how long data should be stored for this model.
  optional int64 signal_storage_length = 4;

  // For how long data must have been captured for this model. If the relevant
  // signals have been captured for a shorter amount of time than this, this
  // model can never be selected.
  optional int64 min_signal_collection_length = 5;

  // Describes how long after a valid result has been calculated for this
  // model it is OK to cache the result without recalculating with updated
  // data.
  optional int64 result_time_to_live = 6;

  // The model always executes with a fixed timestamp. This is used when the
  // model is trained on data from a specific time period, and needs to
  // evaluate on the same date.
  optional int64 fixed_prediction_timestamp = 17;

  message DiscreteMapping {
    // A mapping result from the raw continuous result to a discrete and
    // comparable value based on |rank|.
    message Entry {
      // The minimum result of the model to be allowed to choose this mapping.
      optional float min_result = 1;

      // A feature specific rank.
      optional int64 rank = 2;
    }

    // An ordered (based on their |min_result|) list of discrete mappings.
    // To map a model evaluation result to a DiscreteMapping, choose the
    // highest |min_result| that the evaluation result is at or above.
    // E.g. for these mappings: [(0.0, 0), (0.4, 1), (0.7, 2), (0.9, 3)], a
    // result of 0.7 would yield (0.7, 2), and 0.69 would yield (0.4, 1).
    repeated Entry entries = 1;
  }
  map<string, DiscreteMapping> discrete_mappings = 7;

  // The default key to use during the mapping process if no key has been
  // provided.
  optional string default_discrete_mapping = 8;

  // DEPRECATED: optional int64 output_collection_delay_sec = 12;
  // It held the delay, in seconds, to collect output tensors after input
  // tensors are collected. For example, output labels can be collected one
  // week after input tensors are collected. If not specified, output tensors
  // are collected in the same time period as input tensors.
  // Both the tag and the name stay reserved so that neither can be reused.
  reserved 12;
  reserved "output_collection_delay_sec";

  // Whether the client should upload the input and output tensors through
  // UKM.
  optional bool upload_tensors = 13;

  // Describes the return type of the model score. Used for recording
  // histograms.
  enum OutputDescription {
    UNKNOWN_RETURN_TYPE = 0;
    // Model returns either 0 or 1.
    RETURN_TYPE_HEURISTIC = 1;
    // Model returns an int corresponding to a specific subsegment. Assume
    // between 0 and 100.
    RETURN_TYPE_MULTISEGMENT = 2;
    // Model returns a float between 0 and 1.
    RETURN_TYPE_PROBABILITY = 3;
    // Model returns any integer value.
    RETURN_TYPE_INTEGER = 4;
  }
  // TODO(ritikagup@): Deprecate the field.
  optional OutputDescription return_type = 14;

  // Contains information about the model results. Supplied by the client. It
  // describes what the results should look like and how to interpret them.
  optional OutputConfig output_config = 16;
}