// This schema uses proto2 semantics (explicit `optional` / `repeated` labels).
syntax = "proto2";

// Every definition in this file lives in the `mri` package.
package mri;

// Generate code against the lite runtime: smaller generated code, at the cost
// of descriptors and reflection.
option optimize_for = LITE_RUNTIME;
// The output of the media analytics process. Implicitly tied to the
// MediaPerception dictionary defined in Chromium source at
// src/extensions/common/api/media_perception_private.idl for the
// Chromium mediaPerceptionPrivate API. This .proto needs to be compatible
// with the version used in the binary checked into the Chromebox For
// Meetings private overlay.
//
// This message is packaged by the graph runner when a PerceptionSample
// or array of PerceptionSamples comes out of the graph.
message MediaPerception {
// The timestamp attached when this data originated from the analysis process,
// expressed in milliseconds since Epoch.
optional uint64 timestamp = 1; // In milliseconds since Epoch.
// Video frame perception results: a single FramePerception, or several when
// reporting the results from multiple frames.
repeated FramePerception frame_perception = 2;
// Audio perception results: a single AudioPerception, or several when
// reporting the results from multiple audio frames.
repeated AudioPerception audio_perception = 3;
// Combined audio-visual perception results: a single AudioVisualPerception,
// or several.
repeated AudioVisualPerception audio_visual_perception = 4;
// Stores metadata such as version of media perception features.
optional Metadata metadata = 5;
// Stores metadata such as version of media perception features.
message Metadata {
// Version of the visual experience controller, carried as a string.
// NOTE(review): the expected version format is not specified here — confirm
// with the producer of this field.
optional string visual_experience_controller_version = 1;
// Used to transmit a history of image frames and their associated annotations.
// This is accumulated over time by the graph runner.
message Diagnostics {
// The accumulated samples. Each sample can carry frame, audio and
// audio-visual perception results plus the associated image frame.
repeated PerceptionSample perception_sample = 1;
// Parameters describing a single video stream processed by the analytics
// process (see State.video_stream_param).
message VideoStreamParam {
// Identifies the video stream described by these parameters.
optional string id = 1;
// Frame width in pixels.
optional int32 width = 2;
// Frame height in pixels.
optional int32 height = 3;
// The frame rate at which this video stream would be processed.
// NOTE(review): the unit is presumably frames per second — confirm.
optional float frame_rate = 4;
// Describes the state of the media analytics process, plus the settings that
// accompany a state change.
message State {
// Lifecycle status of the media analytics process.
enum Status {
STATUS_UNSPECIFIED = 0; // Unused required default value for Proto enums.
UNINITIALIZED = 1; // Media analytics working on loading configuration.
STARTED = 2; // Analysis process running but not receiving frames.
RUNNING = 3; // Analysis process running and ingesting frames.
SUSPENDED = 4; // Media analytics process waiting to be started.
RESTARTING = 5; // Media analytics process should be restarted.
STOPPED = 6; // Media analytics process should be stopped.
// Note: RUNNING and SUSPENDED are the only two states which should be sent to
// SetState.
optional Status status = 1;
// Device context so that the media analytics process can better select the
// right video device to open.
optional string device_context = 2;
// A list of video streams processed by the analytics process.
repeated VideoStreamParam video_stream_param = 3;
// Media analytics configuration. It can only be used when setting state to
// RUNNING.
// NOTE(review): the original sentence stopped at "setting state to"; RUNNING
// is inferred from the whiteboard comment below — confirm.
optional string configuration = 4;
// Corners and aspect ratio of the whiteboard in the image frame. Should only
// be set when setting state to running and configuration to whiteboard.
optional Whiteboard whiteboard = 5;
// Media perception features that can be enabled through `features`.
enum Feature {
// UNSET is not a real feature value.
// A list of enabled media perception features.
repeated Feature features = 6;
// A template argument addressed by name whose value is either a string or a
// number.
message NamedTemplateArgument {
// Name of the template argument to set.
optional string name = 1;
// The value to assign; at most one of the alternatives is set.
oneof value {
// String-valued argument.
string str = 2;
// Numeric argument.
double num = 3;
// An optional list of template arguments to be substituted at
// run-time. Each argument present in this list will be set to the
// specified values, others will not be changed. Furthermore, nested
// arguments (dictionaries) are not supported.
// CAUTION: These template args can be used to overwrite the
// Feature flags defined above since feature flags are implemented
// as numeric template args.
repeated NamedTemplateArgument named_template_arguments = 7;
// This is the output of the MediaPerceptionSinkCalculator.
message PerceptionSample {
// Computer vision results for an image frame.
optional FramePerception frame_perception = 1;
// The image frame data associated with the frame perception.
optional ImageFrame image_frame = 2;
// Perception results for an audio frame.
optional AudioPerception audio_perception = 3;
// Perception results based on both audio and video inputs.
optional AudioVisualPerception audio_visual_perception = 4;
// Perception results based on both audio and video inputs.
message AudioVisualPerception {
// A timestamp in microseconds attached when this message was generated.
// NOTE(review): the reference point (epoch vs. process start) is not stated
// here — confirm.
optional uint64 timestamp_us = 1;
// Human presence detection results derived from both audio and video.
optional AudioVisualHumanPresenceDetection
audio_visual_human_presence_detection = 2;
// Detection of human presence based on both audio and video inputs.
message AudioVisualHumanPresenceDetection {
// Probability, in the [0, 1] interval, that a human is present.
optional double human_presence_likelihood = 1;
// Audio perception results for an audio frame.
message AudioPerception {
// A timestamp in microseconds attached when this message was generated.
// NOTE(review): the reference point (epoch vs. process start) is not stated
// here — confirm.
optional uint64 timestamp_us = 1;
// Audio localization results for an audio frame.
optional AudioLocalization audio_localization = 2;
// Human presence detection results for an audio frame.
optional AudioHumanPresenceDetection audio_human_presence_detection = 3;
// Hotword detection results.
optional HotwordDetection hotword_detection = 4;
// Detection of hotword in the audio stream.
message HotwordDetection {
// The kinds of hotword that can be reported (see Hotword.type).
enum Type {
// A single detected hotword, or the portion of one that falls in a given
// audio frame.
message Hotword {
// Unique identifier for the hotword instance. Note that a single hotword
// instance can span more than one audio frame. In that case a single
// hotword instance can be reported in multiple Hotword or HotwordDetection
// messages. Hotword messages associated with the same hotword instance will
// have the same id.
optional uint64 id = 1;
// Indicates the type of this hotword.
optional Type type = 2;
// Id of the audio frame in which the hotword was detected.
optional uint64 frame_id = 3;
// Indicates the start time of this hotword in the audio frame, in
// milliseconds.
optional uint64 start_timestamp_ms = 4;
// Indicates the end time of this hotword in the audio frame, in
// milliseconds.
optional uint64 end_timestamp_ms = 5;
// Indicates a probability in [0, 1] interval that this hotword is present
// in the audio frame.
optional float confidence = 6;
// The hotwords reported by this detection result.
repeated Hotword hotwords = 1;
// An estimate of the direction that the sound is coming from.
message AudioLocalization {
// An angle in radians in the horizontal plane. It roughly points to the peak
// in the probability distribution of azimuth defined below.
optional double azimuth_radians = 1;
// A probability distribution for the current snapshot in time that shows the
// likelihood of a sound source being at a particular azimuth. For example,
// azimuth_scores = [0.1, 0.2, 0.3, 0.4] means that the probability that the
// sound is coming from an azimuth of 0, pi/2, pi, 3*pi/2 is 0.1, 0.2, 0.3 and
// 0.4, respectively.
repeated double azimuth_scores = 2;
// Detection of human presence close to the microphone.
message AudioHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human has caused a sound
// close to the microphone.
optional double human_presence_likelihood = 1;
// Estimate of the noise spectrogram.
optional AudioSpectrogram noise_spectrogram = 2;
// Spectrogram of an audio frame.
optional AudioSpectrogram frame_spectrogram = 3;
// Spectrogram of an audio frame.
message AudioSpectrogram {
// The spectrogram samples.
// NOTE(review): bin layout, ordering and scale of these values are not
// described here — confirm with the producer.
repeated double values = 1;
// This message stores the image frame along with the meta data.
message ImageFrame {
// Encoding of the bytes stored in `pixel_data`.
enum Format {
FORMAT_UNSPECIFIED = 0; // Unused required default value for Proto enums.
RGB = 1; // Raw rgb image.
PNG = 2; // PNG image.
JPEG = 3; // JPEG image.
// Image width. NOTE(review): presumably in pixels — confirm.
optional int32 width = 1;
// Image height. NOTE(review): presumably in pixels — confirm.
optional int32 height = 2;
// How `pixel_data` is encoded.
optional Format format = 3;
// NOTE(review): presumably the length in bytes of `pixel_data` — confirm.
optional int32 data_length = 4;
// The image payload, encoded as indicated by `format`.
optional bytes pixel_data = 5;
// The set of computer vision metadata for an image frame.
message FramePerception {
// Identifier of the frame these results belong to.
optional uint64 frame_id = 1;
// Width of the frame in pixels.
optional uint32 frame_width_in_px = 2;
// Height of the frame in pixels.
optional uint32 frame_height_in_px = 3;
// The timestamp associated with the frame (when it enters the graph).
// NOTE(review): units are not stated for this field — confirm.
optional uint64 timestamp = 4;
// The list of entities detected for this frame.
repeated Entity entity = 5;
// Latency measurement for a list of packet streams in drishti graph.
repeated PacketLatency packet_latency = 6;
// Human presence detection results for a video frame.
optional VideoHumanPresenceDetection video_human_presence_detection = 7;
// The kinds of frame perception that can be run on a frame.
enum PerceptionType {
// Indicates what types of frame perception were run.
repeated PerceptionType perception_types = 8;
// Detection of human presence close to the camera.
message VideoHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human is present in the
// video frame.
optional double human_presence_likelihood = 1;
// Indicates a probability in [0, 1] interval that motion has been detected
// in the video frame.
optional double motion_detected_likelihood = 2;
// Type of lighting conditions.
enum LightCondition {
// No noticeable change occurred.
// Light was switched on in the room.
// Light was switched off in the room.
// Light gradually got dimmer (for example, due to a sunset).
// Light gradually got brighter (for example, due to a sunrise).
// Black frame detected - the current frame contains only noise.
// Indicates lighting condition in the video frame.
optional LightCondition light_condition = 3;
// Indicates a probability in [0, 1] interval that light condition value is
// correct.
optional double light_condition_likelihood = 4;
// An entity detected in a frame (see FramePerception.entity).
message Entity {
// A unique id associated with the detected entity, which can be used to track
// the entity over time.
optional uint32 id = 1;
// The kind of entity that was detected.
enum EntityType {
FACE = 1;
// The kind of this entity.
optional EntityType type = 2;
// Minimum box, which captures entire detected entity.
optional BoundingBox bounding_box = 3;
// A value for the quality of this detection.
// NOTE(review): the value range is not stated here — confirm.
optional float confidence = 4;
// Perpendicular distance (depth) from the camera plane to the entity.
optional Distance depth = 5;
// String label for this entity.
optional string label = 6;
// An axis-aligned box described by its top-left and bottom-right corners.
message BoundingBox {
// The points that define the corners of a bounding box.
optional Point top_left = 1;
optional Point bottom_right = 2;
// Indicates whether or not these coordinates are normalized to values between
// 0 and 1. Defaults to false when unset.
optional bool normalized = 3 [default = false];
// Latency measurement for one packet stream (see
// FramePerception.packet_latency).
message PacketLatency {
// An identifier label for the packet.
optional string label = 1;
// Delay in microseconds with respect to a reference packet.
optional uint64 latency_usec = 2;
// A 2D point in image coordinates, measured from the top left corner of the
// image.
// NOTE(review): units are not stated here; BoundingBox carries a `normalized`
// flag for its points, so values may be either absolute or in [0, 1] —
// confirm per use site.
message Point {
// x represents the horizontal distance from the top left corner of the image
// to the point.
optional float x = 1;
// y represents the vertical distance from the top left corner of the image to
// the point.
optional float y = 2;
// Generic message object to encapsulate a distance magnitude and units.
message Distance {
// The units in which `magnitude` can be expressed.
enum DistanceUnits {
// Units of `magnitude`.
optional DistanceUnits units = 1;
// The distance value, expressed in `units`.
optional float magnitude = 2;
// The parameters for a whiteboard in the image frame.
message Whiteboard {
// The top left corner of the whiteboard in the image frame.
optional Point top_left = 1;
// The top right corner of the whiteboard in the image frame.
optional Point top_right = 2;
// The bottom left corner of the whiteboard in the image frame.
optional Point bottom_left = 3;
// The bottom right corner of the whiteboard in the image frame.
optional Point bottom_right = 4;
// The physical aspect ratio of the whiteboard.
// NOTE(review): whether this is width/height or height/width is not stated
// here — confirm.
optional float aspect_ratio = 5;