syntax = "proto2";

option optimize_for = LITE_RUNTIME;

package mri;

// The output of the media analytics process. Implicitly tied to the
// MediaPerception dictionary defined in Chromium source at
// src/extensions/common/api/media_perception_private.idl for the
// Chromium mediaPerceptionPrivate API. This .proto needs to be compatible
// with the version used in the binary checked into the Chromebox For
// Meetings private overlay.
//
// This message is packaged by the graph runner when a PerceptionSample
// or array of PerceptionSamples comes out of the graph.
message MediaPerception {
  // The timestamp attached when this data originated from the analysis
  // process.
  optional uint64 timestamp = 1;  // In milliseconds since Epoch.

  // A single FramePerception message or array of perceptions (if reporting
  // the results from multiple frames).
  repeated FramePerception frame_perception = 2;

  // A single AudioPerception message or array of audio perceptions (if
  // reporting the results from multiple audio frames).
  repeated AudioPerception audio_perception = 3;

  // A single AudioVisualPerception message or array of audio-visual
  // perceptions.
  repeated AudioVisualPerception audio_visual_perception = 4;
}
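
// A minimal illustrative example of a MediaPerception message in proto text
// format (all values below are hypothetical, not taken from the upstream
// source): one frame containing a single detected face.
//
//   timestamp: 1537366275000
//   frame_perception {
//     frame_id: 42
//     frame_width_in_px: 1280
//     frame_height_in_px: 720
//     entity {
//       id: 12
//       type: FACE
//       confidence: 0.88
//     }
//   }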

// Used to transmit a history of image frames and their associated
// annotations. This is accumulated over time by the graph runner.
message Diagnostics {
  repeated PerceptionSample perception_sample = 1;
}

message VideoStreamParam {
  // Identifies the video stream described by these parameters.
  optional string id = 1;

  // Frame width in pixels.
  optional int32 width = 2;

  // Frame height in pixels.
  optional int32 height = 3;

  // The frame rate, in frames per second, at which this video stream is
  // processed.
  optional float frame_rate = 4;
}

message State {
  enum Status {
    STATUS_UNSPECIFIED = 0;  // Unused required default value for Proto enums.
    UNINITIALIZED = 1;  // Media analytics working on loading configuration.
    STARTED = 2;  // Analysis process running but not receiving frames.
    RUNNING = 3;  // Analysis process running and ingesting frames.
    SUSPENDED = 4;  // Media analytics process waiting to be started.
    RESTARTING = 5;  // Media analytics process should be restarted.
    STOPPED = 6;  // Media analytics process should be stopped.
  }

  // Note: RUNNING and SUSPENDED are the only two states that should be sent
  // to SetState.
  optional Status status = 1;

  // Device context so that the media analytics process can better select the
  // right video device to open.
  optional string device_context = 2;

  // A list of video streams processed by the analytics process.
  repeated VideoStreamParam video_stream_param = 3;

  // Media analytics configuration. It can only be used when setting state to
  // RUNNING.
  optional string configuration = 4;
}
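
// An illustrative text-format State (hypothetical values, including the
// stream id and device context) that a client might pass to SetState to
// start analysis on a single 1280x720 stream:
//
//   status: RUNNING
//   device_context: "conference-room-device"
//   video_stream_param {
//     id: "camera0"
//     width: 1280
//     height: 720
//     frame_rate: 30.0
//   }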

// This is the output of the MediaPerceptionSinkCalculator.
message PerceptionSample {
  optional FramePerception frame_perception = 1;

  // The image frame data associated with the frame perception.
  optional ImageFrame image_frame = 2;

  optional AudioPerception audio_perception = 3;

  optional AudioVisualPerception audio_visual_perception = 4;
}

// Perception results based on both audio and video inputs.
message AudioVisualPerception {
  // A timestamp in microseconds attached when this message was generated.
  optional uint64 timestamp_us = 1;

  // Human presence detection results.
  optional AudioVisualHumanPresenceDetection
      audio_visual_human_presence_detection = 2;
}

// Detection of human presence based on both audio and video inputs.
message AudioVisualHumanPresenceDetection {
  // Indicates a probability in the [0, 1] interval that a human is present.
  optional double human_presence_likelihood = 1;
}

// Audio perception results for an audio frame.
message AudioPerception {
  // A timestamp in microseconds attached when this message was generated.
  optional uint64 timestamp_us = 1;

  // Audio localization results for an audio frame.
  optional AudioLocalization audio_localization = 2;

  // Human presence detection results for an audio frame.
  optional AudioHumanPresenceDetection audio_human_presence_detection = 3;
}

// An estimate of the direction that the sound is coming from.
message AudioLocalization {
  // An angle in radians in the horizontal plane. It roughly points to the
  // peak in the probability distribution of azimuth defined below.
  optional double azimuth_radians = 1;

  // A probability distribution for the current snapshot in time that shows
  // the likelihood of a sound source being at a particular azimuth. For
  // example, azimuth_scores = [0.1, 0.2, 0.3, 0.4] means that the probability
  // that the sound is coming from an azimuth of 0, pi/2, pi, 3*pi/2 is 0.1,
  // 0.2, 0.3 and 0.4, respectively.
  repeated double azimuth_scores = 2;
}
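
// Note: the example above suggests that N azimuth_scores describe N evenly
// spaced bins over [0, 2*pi), so the i-th score (0-indexed) corresponds to an
// azimuth of 2*pi*i/N radians. This spacing is inferred from the example and
// is not stated explicitly upstream.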

// Detection of human presence close to the microphone.
message AudioHumanPresenceDetection {
  // Indicates a probability in the [0, 1] interval that a human has caused a
  // sound close to the microphone.
  optional double human_presence_likelihood = 1;

  // Estimate of the noise spectrogram.
  optional AudioSpectrogram noise_spectrogram = 2;

  // Spectrogram of an audio frame.
  optional AudioSpectrogram frame_spectrogram = 3;
}

// Spectrogram of an audio frame.
message AudioSpectrogram {
  repeated double values = 1;
}

// This message stores the image frame along with the metadata.
message ImageFrame {
  enum Format {
    FORMAT_UNSPECIFIED = 0;  // Unused required default value for Proto enums.
    RGB = 1;  // Raw RGB image.
    PNG = 2;  // PNG image.
    JPEG = 3;  // JPEG image.
  }
  optional int32 width = 1;   // Frame width in pixels.
  optional int32 height = 2;  // Frame height in pixels.
  optional Format format = 3;
  optional int32 data_length = 4;  // Size of pixel_data in bytes.
  optional bytes pixel_data = 5;
}
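
// An illustrative text-format ImageFrame (hypothetical values) carrying a
// JPEG-encoded 640x360 frame; the actual bytes are elided:
//
//   width: 640
//   height: 360
//   format: JPEG
//   data_length: 24085
//   pixel_data: "..."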

// The set of computer vision metadata for an image frame.
message FramePerception {
  optional uint64 frame_id = 1;

  optional uint32 frame_width_in_px = 2;
  optional uint32 frame_height_in_px = 3;

  // The timestamp associated with the frame (when it enters the graph).
  optional uint64 timestamp = 4;

  // The list of entities detected for this frame.
  repeated Entity entity = 5;

  // Latency measurement for a list of packet streams in the drishti graph.
  repeated PacketLatency packet_latency = 6;

  // Human presence detection results for a video frame.
  optional VideoHumanPresenceDetection video_human_presence_detection = 7;
}

// Detection of human presence close to the camera.
message VideoHumanPresenceDetection {
  // Indicates a probability in the [0, 1] interval that a human is present in
  // the video frame.
  optional double human_presence_likelihood = 1;

  // Indicates a probability in the [0, 1] interval that motion has been
  // detected in the video frame.
  optional double motion_detected_likelihood = 2;

  // Type of lighting conditions.
  enum LightCondition {
    UNSPECIFIED = 0;

    // No noticeable change occurred.
    NO_CHANGE = 1;

    // Light was switched on in the room.
    TURNED_ON = 2;

    // Light was switched off in the room.
    TURNED_OFF = 3;

    // Light gradually got dimmer (for example, due to a sunset).
    DIMMER = 4;

    // Light gradually got brighter (for example, due to a sunrise).
    BRIGHTER = 5;

    // Black frame detected - the current frame contains only noise.
    BLACK_FRAME = 6;
  }

  // Indicates the lighting condition in the video frame.
  optional LightCondition light_condition = 3;

  // Indicates a probability in the [0, 1] interval that the light condition
  // value is correct.
  optional double light_condition_likelihood = 4;
}
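
// An illustrative text-format VideoHumanPresenceDetection (hypothetical
// values): a person is likely present, motion was detected, and the room
// light was probably just switched on:
//
//   human_presence_likelihood: 0.87
//   motion_detected_likelihood: 0.65
//   light_condition: TURNED_ON
//   light_condition_likelihood: 0.9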

message Entity {
  // A unique id associated with the detected entity, which can be used to
  // track the entity over time.
  optional uint32 id = 1;

  enum EntityType {
    UNSPECIFIED = 0;
    FACE = 1;
    PERSON = 2;
    MOTION_REGION = 3;
    LABELED_REGION = 4;
  }

  optional EntityType type = 2;

  // The minimum box that captures the entire detected entity.
  optional BoundingBox bounding_box = 3;

  // A value for the quality of this detection.
  optional float confidence = 4;

  // Perpendicular distance (depth) from the camera plane to the entity.
  optional Distance depth = 5;

  // String label for this entity.
  optional string label = 6;
}

message BoundingBox {
  // The points that define the corners of a bounding box.
  optional Point top_left = 1;
  optional Point bottom_right = 2;

  // Indicates whether or not these coordinates are normalized to values
  // between 0 and 1.
  optional bool normalized = 3 [default = false];
}
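
// An illustrative text-format Entity (hypothetical values) with a normalized
// BoundingBox, i.e. corner coordinates expressed as fractions of the frame
// width and height:
//
//   id: 7
//   type: FACE
//   bounding_box {
//     top_left { x: 0.25 y: 0.10 }
//     bottom_right { x: 0.45 y: 0.50 }
//     normalized: true
//   }
//   confidence: 0.93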

message PacketLatency {
  // An identifier label for the packet.
  optional string label = 1;

  // Delay in microseconds with respect to a reference packet.
  optional uint64 latency_usec = 2;
}

message Point {
  // x represents the horizontal distance from the top left corner of the
  // image to the point.
  optional float x = 1;
  // y represents the vertical distance from the top left corner of the image
  // to the point.
  optional float y = 2;
}

// Generic message object to encapsulate a distance magnitude and units.
message Distance {
  enum DistanceUnits {
    UNITS_UNSPECIFIED = 0;
    METERS = 1;
    PIXELS = 2;
  }

  optional DistanceUnits units = 1;

  optional float magnitude = 2;
}