| syntax = "proto2"; |
| |
| option optimize_for = LITE_RUNTIME; |
| |
| package mri; |
| |
| // The output of the media analytics process. Implicitly tied to the |
| // MediaPerception dictionary defined in Chromium source at |
| // src/extensions/common/api/media_perception_private.idl for the |
| // Chromium mediaPerceptionPrivate API. This .proto needs to be compatible |
| // with the version used in the binary checked into the Chromebox For |
| // Meetings private overlay. |
| // |
| // This message is packaged by the graph runner when a PerceptionSample |
| // or array of PerceptionSamples comes out of the graph. |
message MediaPerception {
  // The timestamp attached when this data originated from the analysis
  // process, in milliseconds since the Unix epoch.
  optional uint64 timestamp = 1;

  // A single FramePerception message or array of perceptions (if reporting the
  // results from multiple frames).
  repeated FramePerception frame_perception = 2;

  // A single AudioPerception message or array of audio perceptions (if
  // reporting the results from multiple audio frames).
  repeated AudioPerception audio_perception = 3;

  // A single AudioVisualPerception message or array of audio-visual
  // perceptions.
  repeated AudioVisualPerception audio_visual_perception = 4;

  // Stores metadata such as version of media perception features.
  optional Metadata metadata = 5;
}
| |
| // Stores metadata such as version of media perception features. |
// Stores metadata such as version of media perception features.
message Metadata {
  // Version string of the visual experience controller that produced the
  // associated perception results.
  optional string visual_experience_controller_version = 1;
}
| |
| // Used to transmit a history of image frames and their associated annotations. |
| // This is accumulated over time by the graph runner. |
// Used to transmit a history of image frames and their associated annotations.
// This is accumulated over time by the graph runner.
message Diagnostics {
  // The accumulated samples, each pairing perception results with the image
  // frame they were computed from.
  repeated PerceptionSample perception_sample = 1;
}
| |
// Parameters of a single video stream handled by the analytics process.
message VideoStreamParam {
  // Identifies the video stream described by these parameters.
  optional string id = 1;

  // Frame width in pixels.
  optional int32 width = 2;

  // Frame height in pixels.
  optional int32 height = 3;

  // The frame rate at which this video stream would be processed, in frames
  // per second.
  optional float frame_rate = 4;
}
| |
// The state of the media analytics process, together with the configuration
// applied when setting that state (see the SetState note on |status| below).
message State {
  // Lifecycle stage of the media analytics process.
  enum Status {
    STATUS_UNSPECIFIED = 0; // Unused required default value for Proto enums.
    UNINITIALIZED = 1; // Media analytics working on loading configuration.
    STARTED = 2; // Analysis process running but not receiving frames.
    RUNNING = 3; // Analysis process running and ingesting frames.
    SUSPENDED = 4; // Media analytics process waiting to be started.
    RESTARTING = 5; // Media analytics process should be restarted.
    STOPPED = 6; // Media analytics process should be stopped.
  }

  // Note: RUNNING and SUSPENDED are the only two states which should be sent to
  // SetState.
  optional Status status = 1;

  // Device context so that the media analytics process can better select the
  // right video device to open.
  optional string device_context = 2;

  // A list of video streams processed by the analytics process.
  repeated VideoStreamParam video_stream_param = 3;

  // Media analytics configuration. It can only be used when setting state to
  // RUNNING.
  optional string configuration = 4;

  // Corners and aspect ratio of the whiteboard in the image frame. Should only
  // be set when setting state to running and configuration to whiteboard.
  optional Whiteboard whiteboard = 5;

  // Media perception features that can be enabled at run-time.
  enum Feature {
    // UNSET is not a real feature value.
    FEATURE_UNSET = 0;
    FEATURE_AUTOZOOM = 1;
    FEATURE_HOTWORD_DETECTION = 2;
    FEATURE_OCCUPANCY_DETECTION = 3;
    FEATURE_EDGE_EMBEDDINGS = 4;
    FEATURE_SOFTWARE_CROPPING = 5;
  }

  // A list of enabled media perception features.
  repeated Feature features = 6;

  // A name/value pair used to substitute one template argument in the
  // analytics graph at run-time.
  message NamedTemplateArgument {
    // Name of the template argument to set.
    optional string name = 1;
    // The value to substitute; exactly one of the following is set.
    oneof value {
      string str = 2;
      double num = 3;
    }
  }

  // An optional list of template arguments to be substituted at
  // run-time. Each argument present in this list will be set to the
  // specified values, others will not be changed. Furthermore, nested
  // arguments (dictionaries) are not supported.
  // CAUTION: These template args can be used to overwrite the
  // Feature flags defined above since feature flags are implemented
  // as numeric template args.
  repeated NamedTemplateArgument named_template_arguments = 7;
}
| |
| // This is the output of the MediaPerceptionSinkCalculator. |
// This is the output of the MediaPerceptionSinkCalculator.
message PerceptionSample {
  // Computer vision results for a single video frame.
  optional FramePerception frame_perception = 1;

  // The image frame data associated with the frame perception.
  optional ImageFrame image_frame = 2;

  // Perception results for a single audio frame.
  optional AudioPerception audio_perception = 3;

  // Perception results combining audio and video inputs.
  optional AudioVisualPerception audio_visual_perception = 4;

  // Stores metadata such as version of media perception features.
  optional Metadata metadata = 5;
}
| |
| // Perception results based on both audio and video inputs. |
| message AudioVisualPerception { |
| // A timestamp in microseconds attached when this message was generated. |
| optional uint64 timestamp_us = 1; |
| |
| // Human presence detection results. |
| optional AudioVisualHumanPresenceDetection |
| audio_visual_human_presence_detection = 2; |
| } |
| |
| // Detection of human presence based on both audio and video inputs. |
| message AudioVisualHumanPresenceDetection { |
| // Indicates a probability in [0, 1] interval that a human is present. |
| optional double human_presence_likelihood = 1; |
| } |
| |
| // Audio perception results for an audio frame. |
| message AudioPerception { |
| // A timestamp in microseconds attached when this message was generated. |
| optional uint64 timestamp_us = 1; |
| |
| // Audio localization results for an audio frame. |
| optional AudioLocalization audio_localization = 2; |
| |
| // Human presence detection results for an audio frame. |
| optional AudioHumanPresenceDetection audio_human_presence_detection = 3; |
| |
| // Hotword detection results. |
| optional HotwordDetection hotword_detection = 4; |
| } |
| |
| // Detection of hotword in the audio stream. |
| message HotwordDetection { |
| enum Type { |
| UNKNOWN_TYPE = 0; |
| OK_GOOGLE = 1; |
| } |
| |
| message Hotword { |
| // Unique identifier for the hotword instance. Note that a single hotword |
| // instance can span more than one audio frame. In that case a single |
| // hotword instance can be reported in multiple Hotword or HotwordDetection |
| // messages. Hotword messages associated with the same hotword instance will |
| // have the same id. |
| optional uint64 id = 1; |
| |
| // Indicates the type of this hotword. |
| optional Type type = 2; |
| |
| // Id of the audio frame in which the hotword was detected. |
| optional uint64 frame_id = 3; |
| |
| // Indicates the start time of this hotword in the audio frame. |
| optional uint64 start_timestamp_ms = 4; |
| |
| // Indicates the end time of this hotword in the audio frame. |
| optional uint64 end_timestamp_ms = 5; |
| |
| // Indicates a probability in [0, 1] interval that this hotword is present |
| // in the audio frame. |
| optional float confidence = 6; |
| } |
| |
| repeated Hotword hotwords = 1; |
| } |
| |
| // An estimate of the direction that the sound is coming from. |
| message AudioLocalization { |
| // An angle in radians in the horizontal plane. It roughly points to the peak |
| // in the probability distribution of azimuth defined below. |
| optional double azimuth_radians = 1; |
| |
| // A probability distribution for the current snapshot in time that shows the |
| // likelihood of a sound source being at a particular azimuth. For example, |
| // azimuthScores = [0.1, 0.2, 0.3, 0.4] means that the probability that the |
| // sound is coming from an azimuth of 0, pi/2, pi, 3*pi/2 is 0.1, 0.2, 0.3 and |
| // 0.4, respectively. |
| repeated double azimuth_scores = 2; |
| } |
| |
| // Detection of human presence close to the microphone. |
| message AudioHumanPresenceDetection { |
| // Indicates a probability in [0, 1] interval that a human has caused a sound |
| // close to the microphone. |
| optional double human_presence_likelihood = 1; |
| |
| // Estimate of the noise spectrogram. |
| optional AudioSpectrogram noise_spectrogram = 2; |
| |
| // Spectrogram of an audio frame. |
| optional AudioSpectrogram frame_spectrogram = 3; |
| } |
| |
| // Spectrogram of an audio frame. |
| message AudioSpectrogram { |
| repeated double values = 1; |
| } |
| |
| // This message stores the image frame along with the meta data. |
| message ImageFrame { |
| enum Format { |
| FORMAT_UNSPECIFIED = 0; // Unused required default value for Proto enums. |
| RGB = 1; // Raw rgb image. |
| PNG = 2; // PNG image. |
| JPEG = 3; // JPEG image. |
| } |
| optional int32 width = 1; |
| optional int32 height = 2; |
| optional Format format = 3; |
| optional int32 data_length = 4; |
| optional bytes pixel_data = 5; |
| } |
| |
| // The set of computer vision metadata for an image frame. |
| message FramePerception { |
| optional uint64 frame_id = 1; |
| |
| optional uint32 frame_width_in_px = 2; |
| optional uint32 frame_height_in_px = 3; |
| |
| // The timestamp associated with the frame (when it enters the graph). |
| optional uint64 timestamp = 4; |
| |
| // The list of entities detected for this frame. |
| repeated Entity entity = 5; |
| |
| // Latency measurement for a list of packet streams in drishti graph. |
| repeated PacketLatency packet_latency = 6; |
| |
| // Human presence detection results for a video frame. |
| optional VideoHumanPresenceDetection video_human_presence_detection = 7; |
| |
| enum PerceptionType { |
| UNKNOWN_TYPE = 0; |
| FACE_DETECTION = 1; |
| MOTION_DETECTION = 2; |
| PERSON_DETECTION = 3; |
| } |
| |
| // Indicates what types of frame perception were run. |
| repeated PerceptionType perception_types = 8; |
| } |
| |
| // Detection of human presence close to the camera. |
| message VideoHumanPresenceDetection { |
| // Indicates a probability in [0, 1] interval that a human is present in the |
| // video frame. |
| optional double human_presence_likelihood = 1; |
| |
| // Indicates a probability in [0, 1] interval that motion has been detected |
| // in the video frame. |
| optional double motion_detected_likelihood = 2; |
| |
| // Type of lighting conditions. |
| enum LightCondition { |
| UNSPECIFIED = 0; |
| |
| // No noticeable change occurred. |
| NO_CHANGE = 1; |
| |
| // Light was switched on in the room. |
| TURNED_ON = 2; |
| |
| // Light was switched off in the room. |
| TURNED_OFF = 3; |
| |
| // Light gradually got dimmer (for example, due to a sunset). |
| DIMMER = 4; |
| |
| // Light gradually got brighter (for example, due to a sunrise). |
| BRIGHTER = 5; |
| |
| // Black frame detected - the current frame contains only noise. |
| BLACK_FRAME = 6; |
| } |
| |
| // Indicates lighting condition in the video frame. |
| optional LightCondition light_condition = 3; |
| |
| // Indicates a probability in [0, 1] interval that light condition value is |
| // correct. |
| optional double light_condition_likelihood = 4; |
| } |
| |
// A single detected entity (face, person, region, etc.) in a video frame.
message Entity {
  // A unique id associated with the detected entity, which can be used to track
  // the entity over time.
  optional uint32 id = 1;

  // The category of the detected entity.
  enum EntityType {
    UNSPECIFIED = 0;
    FACE = 1;
    PERSON = 2;
    MOTION_REGION = 3;
    LABELED_REGION = 4;
  }

  // The category of this entity.
  optional EntityType type = 2;

  // Minimum box, which captures entire detected entity.
  optional BoundingBox bounding_box = 3;

  // A value for the quality of this detection.
  optional float confidence = 4;

  // Perpendicular distance (depth) from the camera plane to the entity.
  optional Distance depth = 5;

  // String label for this entity.
  optional string label = 6;
}
| |
// An axis-aligned rectangle defined by two corner points.
message BoundingBox {
  // The points that define the corners of a bounding box.
  optional Point top_left = 1;
  optional Point bottom_right = 2;

  // Indicates whether or not these coordinates are normalized to values between
  // 0 and 1.
  optional bool normalized = 3 [default = false];
}
| |
// Latency measurement for one packet stream in the analytics graph.
message PacketLatency {
  // An identifier label for the packet.
  optional string label = 1;

  // Delay in microseconds with respect to a reference packet.
  optional uint64 latency_usec = 2;
}
| |
// A 2D point in image coordinates, measured from the top left corner.
message Point {
  // x represents the horizontal distance from the top left corner of the image
  // to the point.
  optional float x = 1;

  // y represents the vertical distance from the top left corner of the image to
  // the point.
  optional float y = 2;
}
| |
| // Generic message object to encapsulate a distance magnitude and units. |
| message Distance { |
| enum DistanceUnits { |
| UNITS_UNSPECIFIED = 0; |
| METERS = 1; |
| PIXELS = 2; |
| } |
| |
| optional DistanceUnits units = 1; |
| |
| optional float magnitude = 2; |
| } |
| |
| // The parameters for a whiteboard in the image frame. |
| message Whiteboard { |
| // The top left corner of the whiteboard in the image frame. |
| optional Point top_left = 1; |
| |
| // The top right corner of the whiteboard in the image frame. |
| optional Point top_right = 2; |
| |
| // The bottom left corner of the whiteboard in the image frame. |
| optional Point bottom_left = 3; |
| |
| // The bottom right corner of the whiteboard in the image frame. |
| optional Point bottom_right = 4; |
| |
| // The physical aspect ratio of the whiteboard. |
| optional float aspect_ratio = 5; |
| } |